diff --git a/.github/actions/codeclone/README.md b/.github/actions/codeclone/README.md
index 88dbc69..aa05a22 100644
--- a/.github/actions/codeclone/README.md
+++ b/.github/actions/codeclone/README.md
@@ -73,7 +73,7 @@ jobs:
| Input | Default | Purpose |
|-------------------------|---------------------------------|-------------------------------------------------------------------------------------------------------------------|
-| `python-version` | `3.13` | Python version used to run the action |
+| `python-version` | `3.14` | Python version used to run the action |
| `package-version` | `""` | CodeClone version from PyPI for remote installs; ignored when the action runs from the checked-out CodeClone repo |
| `path` | `.` | Project root to analyze |
| `json-path` | `.cache/codeclone/report.json` | JSON report output path |
diff --git a/.github/actions/codeclone/action.yml b/.github/actions/codeclone/action.yml
index 7cc9975..2d0d1f1 100644
--- a/.github/actions/codeclone/action.yml
+++ b/.github/actions/codeclone/action.yml
@@ -13,7 +13,7 @@ inputs:
python-version:
description: "Python version"
required: false
- default: "3.13"
+ default: "3.14"
package-version:
description: "CodeClone version from PyPI for remote installs (ignored when the action runs from the checked-out CodeClone repo)"
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index f6f9ab7..259556f 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -87,7 +87,7 @@ jobs:
if: env.BENCH_ENABLED == '1' && runner.os == 'macOS'
uses: actions/setup-python@v6.2.0
with:
- python-version: "3.13"
+ python-version: "3.14"
allow-prereleases: true
- name: Set up uv (macOS local benchmark)
@@ -98,7 +98,7 @@ jobs:
- name: Install dependencies (macOS local benchmark)
if: env.BENCH_ENABLED == '1' && runner.os == 'macOS'
- run: uv sync --all-extras --dev
+ run: uv sync --extra dev
- name: Set benchmark output path
if: env.BENCH_ENABLED == '1'
diff --git a/.github/workflows/codeclone.yml b/.github/workflows/codeclone.yml
index d0566e4..23ce340 100644
--- a/.github/workflows/codeclone.yml
+++ b/.github/workflows/codeclone.yml
@@ -26,7 +26,7 @@ jobs:
- name: Run CodeClone
uses: ./.github/actions/codeclone
with:
- python-version: "3.13"
+ python-version: "3.14"
fail-on-new: "true"
fail-health: "60"
sarif: "true"
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index bcec725..85c616a 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -24,7 +24,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v6.2.0
with:
- python-version: "3.13"
+ python-version: "3.14"
allow-prereleases: true
- name: Set up uv
@@ -33,7 +33,7 @@ jobs:
enable-cache: true
- name: Install project dependencies
- run: uv sync --dev
+ run: uv sync --extra dev
- name: Configure GitHub Pages
uses: actions/configure-pages@v5
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
new file mode 100644
index 0000000..73e561c
--- /dev/null
+++ b/.github/workflows/publish.yml
@@ -0,0 +1,116 @@
+name: publish
+run-name: >-
+ publish • ${{ github.event_name }} •
+ ${{ github.event.release.tag_name || inputs.repository || github.ref_name }}
+
+on:
+ release:
+ types: [published]
+ workflow_dispatch:
+ inputs:
+ repository:
+ description: Target package index
+ required: true
+ default: testpypi
+ type: choice
+ options:
+ - testpypi
+ - pypi
+
+permissions:
+ contents: read
+
+concurrency:
+ group: publish-${{ github.event.release.tag_name || github.ref }}
+ cancel-in-progress: false
+
+jobs:
+ build:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v6.0.2
+
+ - name: Set up Python
+ uses: actions/setup-python@v6.2.0
+ with:
+ python-version: "3.14"
+ allow-prereleases: true
+
+ - name: Set up uv
+ uses: astral-sh/setup-uv@v5
+ with:
+ enable-cache: true
+
+ - name: Verify release tag matches project version
+ if: ${{ github.event_name == 'release' }}
+ shell: bash
+ run: |
+ set -euo pipefail
+ project_version="$(python - <<'PY'
+ import pathlib, tomllib
+ payload = tomllib.loads(pathlib.Path("pyproject.toml").read_text(encoding="utf-8"))
+ print(payload["project"]["version"])
+ PY
+ )"
+ release_tag="${{ github.event.release.tag_name }}"
+ normalized_tag="${release_tag#v}"
+ if [ "$normalized_tag" != "$project_version" ]; then
+ echo "release tag $release_tag does not match project version $project_version" >&2
+ exit 1
+ fi
+
+ - name: Build distributions
+ run: uv run --with build python -m build --sdist --wheel
+
+ - name: Validate distributions
+ run: uv run --with twine twine check dist/*
+
+ - name: Upload distributions
+ uses: actions/upload-artifact@v4
+ with:
+ name: python-package-distributions
+ path: dist/
+ if-no-files-found: error
+
+ publish-testpypi:
+ if: ${{ github.event_name == 'workflow_dispatch' && inputs.repository == 'testpypi' }}
+ needs: build
+ runs-on: ubuntu-latest
+ environment: testpypi
+ permissions:
+ contents: read
+ id-token: write
+ steps:
+ - name: Download distributions
+ uses: actions/download-artifact@v5
+ with:
+ name: python-package-distributions
+ path: dist/
+
+ - name: Publish to TestPyPI
+ uses: pypa/gh-action-pypi-publish@release/v1
+ with:
+ repository-url: https://test.pypi.org/legacy/
+
+ publish-pypi:
+ if: >-
+ ${{
+ github.event_name == 'release' ||
+ (github.event_name == 'workflow_dispatch' && inputs.repository == 'pypi')
+ }}
+ needs: build
+ runs-on: ubuntu-latest
+ environment: pypi
+ permissions:
+ contents: read
+ id-token: write
+ steps:
+ - name: Download distributions
+ uses: actions/download-artifact@v5
+ with:
+ name: python-package-distributions
+ path: dist/
+
+ - name: Publish to PyPI
+ uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index d07ce3f..6acb4a4 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -35,7 +35,7 @@ jobs:
enable-cache: true
- name: Install dependencies
- run: uv sync --all-extras --dev
+ run: uv sync --extra dev --extra mcp
- name: Run tests
# Smoke CLI tests intentionally disable subprocess coverage collection
@@ -43,11 +43,11 @@ jobs:
run: uv run pytest --cov=codeclone --cov-report=term-missing --cov-fail-under=99
- name: Verify baseline exists
- if: ${{ matrix.python-version == '3.13' }}
+ if: ${{ matrix.python-version == '3.14' }}
run: test -f codeclone.baseline.json
- name: Check for new clones vs baseline
- if: ${{ matrix.python-version == '3.13' }}
+ if: ${{ matrix.python-version == '3.14' }}
run: uv run codeclone . --ci
lint:
@@ -59,7 +59,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v6.2.0
with:
- python-version: "3.13"
+ python-version: "3.14"
- name: Set up uv
uses: astral-sh/setup-uv@v5
@@ -67,7 +67,7 @@ jobs:
enable-cache: true
- name: Install dependencies
- run: uv sync --all-extras --dev
+ run: uv sync --extra dev --extra mcp
- name: Ruff
run: uv run ruff check .
diff --git a/.gitignore b/.gitignore
index e3ad2eb..71bd32f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -40,3 +40,8 @@ site/
/package-lock.json
extensions/vscode-codeclone/node_modules
/coverage.xml
+/.cgcignore
+/mcp.json
+/scripts/refactor_guard.sh
+/docs/refactoring-spec.md
+/smoke_cli.sh
diff --git a/AGENTS.md b/AGENTS.md
index 16e579d..a645b79 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -137,18 +137,18 @@ uv run pytest -q tests/test_codex_plugin.py
### Versioned constants (single source of truth)
-All schema/version constants live in `codeclone/contracts.py`. **Always read them from code, never copy
+All schema/version constants live in `codeclone/contracts/__init__.py`. **Always read them from code, never copy
from another doc.** Current values (verified at write time):
-| Constant | Source | Current value |
-|-----------------------------------|------------------------------|---------------|
-| `BASELINE_SCHEMA_VERSION` | `codeclone/contracts.py` | `2.1` |
-| `BASELINE_FINGERPRINT_VERSION` | `codeclone/contracts.py` | `1` |
-| `CACHE_VERSION` | `codeclone/contracts.py` | `2.5` |
-| `REPORT_SCHEMA_VERSION` | `codeclone/contracts.py` | `2.8` |
-| `METRICS_BASELINE_SCHEMA_VERSION` | `codeclone/contracts.py` | `1.2` |
+| Constant | Source | Current value |
+|-----------------------------------|-----------------------------------|---------------|
+| `BASELINE_SCHEMA_VERSION` | `codeclone/contracts/__init__.py` | `2.1` |
+| `BASELINE_FINGERPRINT_VERSION` | `codeclone/contracts/__init__.py` | `1` |
+| `CACHE_VERSION` | `codeclone/contracts/__init__.py` | `2.6` |
+| `REPORT_SCHEMA_VERSION` | `codeclone/contracts/__init__.py` | `2.10` |
+| `METRICS_BASELINE_SCHEMA_VERSION` | `codeclone/contracts/__init__.py` | `1.2` |
-When updating any doc that mentions a version, re-read `codeclone/contracts.py` first. Do not derive
+When updating any doc that mentions a version, re-read `codeclone/contracts/__init__.py` first. Do not derive
versions from another document.
### Baseline file structure (canonical)
@@ -162,7 +162,7 @@ versions from another document.
},
"schema_version": "2.1",
"fingerprint_version": "1",
- "python_tag": "cp313",
+ "python_tag": "cp314",
"created_at": "2026-02-08T14:20:15Z",
"payload_sha256": "…"
},
@@ -181,7 +181,7 @@ versions from another document.
- `schema_version` is **baseline schema**, not package version.
- Runtime writes baseline schema `2.1`.
- Runtime accepts baseline schema `1.0` and `2.0`–`2.1` (governed by
- `_BASELINE_SCHEMA_MAX_MINOR_BY_MAJOR` in `codeclone/baseline.py`).
+ `_BASELINE_SCHEMA_MAX_MINOR_BY_MAJOR` in `codeclone/baseline/trust.py`).
- Compatibility is tied to:
- `fingerprint_version`
- `python_tag`
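> A minimal sketch of checking such a compatibility window (illustrative only; the shipped map lives in `codeclone/baseline/trust.py`, and the values below simply restate the accepted range of `1.0` plus `2.0`–`2.1` documented above):

```python
# Illustrative restatement of the documented window: accept baseline schema
# 1.0, and 2.0 through 2.1. Not the shipped implementation.
_BASELINE_SCHEMA_MAX_MINOR_BY_MAJOR = {1: 0, 2: 1}


def is_accepted_baseline_schema(schema_version: str) -> bool:
    major_text, _, minor_text = schema_version.partition(".")
    try:
        major, minor = int(major_text), int(minor_text or "0")
    except ValueError:
        return False
    max_minor = _BASELINE_SCHEMA_MAX_MINOR_BY_MAJOR.get(major)
    return max_minor is not None and 0 <= minor <= max_minor
```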
@@ -367,24 +367,29 @@ Before cutting a release:
Architecture is layered, but grounded in current code (not aspirational diagrams):
-- **CLI / orchestration surface** (`codeclone/cli.py`, `codeclone/_cli_*.py`) parses args, resolves runtime mode,
- coordinates pipeline calls, and prints UX.
-- **Pipeline orchestrator** (`codeclone/pipeline.py`) owns end-to-end flow: bootstrap → discovery → processing →
- analysis → report artifacts → gating.
-- **Core analysis** (`codeclone/extractor.py`, `codeclone/cfg.py`, `codeclone/normalize.py`, `codeclone/blocks.py`,
- `codeclone/grouping.py`, `codeclone/scanner.py`) produces normalized structural facts and clone candidates.
-- **Domain/contracts layer** (`codeclone/models.py`, `codeclone/contracts.py`, `codeclone/errors.py`,
- `codeclone/domain/*.py`) defines typed entities and stable enums/constants used across layers.
-- **Persistence contracts** (`codeclone/baseline.py`, `codeclone/cache.py`, `codeclone/cache_io.py`,
- `codeclone/metrics_baseline.py`) store trusted comparison state and optimization state.
-- **Canonical report + projections** (`codeclone/report/json_contract.py`, `codeclone/report/*.py`) converts analysis
- facts to deterministic, contract-shaped outputs.
-- **HTML/UI rendering** (`codeclone/html_report.py`, `codeclone/_html_report/*`, `codeclone/_html_*.py`,
- `codeclone/templates.py`) renders views from report/meta facts.
+- **CLI entry + orchestration surface** (`codeclone/main.py`, `codeclone/surfaces/cli/*`, `codeclone/ui_messages/*`)
+ owns argument parsing, runtime/config resolution, summaries, report writes, and exit routing.
+- **Config layer** (`codeclone/config/*`) is the single source of truth for option specs, parser construction,
+ `pyproject.toml` loading, and CLI > pyproject > defaults resolution.
+- **Core orchestration** (`codeclone/core/*`) owns bootstrap → discovery → worker processing → project metrics →
+ report/gate integration. It does not own shell UX.
+- **Analysis layer** (`codeclone/analysis/*`, `codeclone/blocks/*`, `codeclone/paths/*`, `codeclone/qualnames/*`)
+ parses source, normalizes AST/CFG facts, extracts units, and prepares deterministic analysis inputs.
+- **Clone/finding derivation layer** (`codeclone/findings/*`, `codeclone/metrics/*`) groups clones and computes
+ structural and quality signals from already-extracted facts.
+- **Domain/contracts layer** (`codeclone/models.py`, `codeclone/contracts/*`, `codeclone/domain/*`) defines typed
+ entities, enums, schema/version constants, and typed exceptions used across layers.
+- **Persistence contracts** (`codeclone/baseline/*`, `codeclone/cache/*`) store trusted comparison state and
+ optimization state. They are contracts, not analysis truth.
+- **Canonical report + projections** (`codeclone/report/document/*`, `codeclone/report/gates/*`,
+ `codeclone/report/renderers/*`, `codeclone/report/*.py`) converts analysis facts into deterministic report payloads
+ and deterministic projections.
+- **HTML/UI rendering** (`codeclone/report/html/*`) renders views from canonical report/meta
+ facts. HTML is render-only.
+- **MCP agent interface** (`codeclone/surfaces/mcp/*`) exposes the same pipeline/report contracts as a deterministic,
+ read-only MCP surface for AI agents and MCP-capable clients.
- **Documentation/publishing surface** (`docs/`, `mkdocs.yml`, `.github/workflows/docs.yml`,
`scripts/build_docs_example_report.py`) publishes contract docs and the live sample report.
-- **MCP agent interface** (`codeclone/mcp_service.py`, `codeclone/mcp_server.py`) exposes the current pipeline as a
- deterministic, read-only MCP server for AI agents and MCP-capable clients.
- **VS Code extension surface** (`extensions/vscode-codeclone/*`) is a native, workspace-only IDE client over
`codeclone-mcp`, with baseline-aware, triage-first, source-first review UX.
- **Claude Desktop bundle surface** (`extensions/claude-desktop-codeclone/*`) is a native `.mcpb` install wrapper for
@@ -409,51 +414,53 @@ Non-negotiable interpretation:
Use this map to route changes to the right owner module.
-- `codeclone/cli.py` — public CLI entry and control-flow coordinator; add orchestration and top-level UX here; do not
- move core analysis logic here.
-- `codeclone/_cli_*.py` — CLI support slices (args, config, runtime, summary, reports, baselines, gating); keep them
- thin and reusable; do not encode domain semantics that belong to pipeline/core/contracts.
-- `codeclone/pipeline.py` — canonical orchestration and data plumbing between scanner/extractor/metrics/report/gating;
- change integration flow here; do not move HTML-only presentation logic here.
-- `codeclone/extractor.py` — AST extraction, CFG fingerprint input preparation, symbol/declaration collection, and
- per-file metrics inputs; change parsing/extraction semantics here; do not couple this module to CLI/report
- rendering/baseline logic.
-- `codeclone/grouping.py` / `codeclone/blocks.py` — clone grouping and block/segment mechanics; normalization-adjacent
- statement hashing lives with `codeclone/normalize.py`; do not mix grouping behavior with CLI/report UX concerns.
-- `codeclone/metrics/` — metric computations and dead-code/dependency/health logic; change metric math and thresholds
- here; do not make metrics depend on renderer/UI concerns.
-- `codeclone/structural_findings.py` — structural finding extraction/normalization policy; keep it report-layer factual
+- `codeclone/main.py` — public CLI entrypoint only. Keep it tiny.
+- `codeclone/surfaces/cli/workflow.py` — top-level CLI orchestration and exit routing. Add CLI control flow here, not
+ in `main.py`.
+- `codeclone/surfaces/cli/*` — CLI support slices (startup, runtime, execution, post-run handling, summaries,
+ reports, changed-scope logic, baseline state, console helpers). Keep them orchestration/UX-focused.
+- `codeclone/config/*` — parser construction, option specs/defaults, pyproject loading, config resolution. Do not
+ duplicate option semantics elsewhere.
+- `codeclone/core/*` — canonical runtime pipeline and payload plumbing. Change integration flow here; do not move shell
+ UX or HTML-only logic here.
+- `codeclone/analysis/*` — AST parsing, CFG/fingerprint preparation, declaration/reference collection, and unit
+ extraction. Change parsing/extraction semantics here; keep it independent from CLI/report/baseline UX.
+- `codeclone/findings/clones/grouping.py` + `codeclone/blocks/*` — clone grouping and block/segment mechanics.
+- `codeclone/findings/structural/detectors.py` — structural finding extraction/normalization policy; keep it factual
and deterministic.
-- `codeclone/suppressions.py` — inline `# codeclone: ignore[...]` parse/bind/index logic; keep it declaration-scoped and
- deterministic.
-- `codeclone/baseline.py` — baseline schema/trust/integrity/compatibility contract; all baseline format changes go here
- with explicit contract process.
-- `codeclone/cache.py` — cache schema/status/profile compatibility and high-level serialization policy; cache remains
- optimization-only.
-- `codeclone/cache_io.py` — IO-layer helpers for the cache: atomic JSON read/write
- (`read_json_document`, `write_json_document_atomically`), canonical JSON (`canonical_json`), and
- HMAC signing/verification (`sign_cache_payload`, `verify_cache_payload_signature`); attribute these
- functions to `cache_io.py`, not `cache.py`.
-- `codeclone/report/json_contract.py` — canonical report schema builder/integrity payload; any JSON contract shape
- change belongs here.
-- `codeclone/report/*.py` (other modules) — deterministic projections/format transforms (
- text/markdown/sarif/derived/findings/suggestions); avoid injecting new analysis heuristics here.
-- `codeclone/mcp_service.py` — typed, in-process MCP service adapter over the current pipeline/report contracts; keep
- it deterministic; allow only session-local in-memory state such as reviewed markers, and never move shell UX or
- `sys.exit` behavior here.
-- `codeclone/mcp_server.py` — optional MCP launcher/server wiring, transport config, and MCP tool/resource
+- `codeclone/metrics/*` — metric computations and dead-code/dependency/health logic; change metric math and thresholds
+ here; do not make metrics depend on renderer/UI concerns.
+- `codeclone/analysis/suppressions.py` — inline `# codeclone: ignore[...]` parse/bind/index logic; keep it
+ declaration-scoped and deterministic.
+- `codeclone/findings/clones/golden_fixtures.py` — golden-fixture clone exclusion policy and suppressed-clone bucket
+ shaping; keep it clone-derivation-only and deterministic.
+- `codeclone/baseline/clone_baseline.py` + `codeclone/baseline/trust.py` — clone baseline schema/trust/integrity/
+ compatibility contract; all clone-baseline format changes go here with explicit contract process.
+- `codeclone/baseline/metrics_baseline.py` + `codeclone/baseline/_metrics_baseline_*` — metrics-baseline schema,
+ validation, payload hashing, and unified-baseline merge logic.
+- `codeclone/cache/store.py`, `codeclone/cache/versioning.py`, `codeclone/cache/integrity.py`,
+ `codeclone/cache/_wire_*`, `codeclone/cache/projection.py` — cache schema/status/profile compatibility, canonical
+ JSON/signing, wire encoding/decoding, and segment projection persistence. Cache remains optimization-only.
+- `codeclone/report/document/*` — canonical report schema builder and integrity payload. Any JSON contract shape change
+ belongs here.
+- `codeclone/report/renderers/*` — deterministic text/markdown/SARIF/JSON projections over the canonical report.
+- `codeclone/report/html/*` — actual HTML assembly, context shaping, tabs, sections, widgets, CSS/JS/escaping, and
+ snippets. Change report layout and interactive HTML UX here, not in report builders.
+- `codeclone/report/gates/*` — metric-gate reason derivation over canonical metrics state.
+- `codeclone/report/*.py` (other modules) — deterministic report support slices such as explainability, suggestions,
+ merge, overview, findings helpers, and source-kind routing.
+- `codeclone/surfaces/mcp/service.py` — typed, in-process MCP service over the current pipeline/report contracts;
+ keep it deterministic and read-only except for session-local in-memory markers.
+- `codeclone/surfaces/mcp/server.py` — optional MCP launcher/server wiring, transport config, and MCP tool/resource
registration; keep dependency loading lazy so base installs/CI do not require MCP runtime packages.
- `tests/test_mcp_service.py`, `tests/test_mcp_server.py` — MCP contract and integration tests; run these when
touching any MCP surface.
-- `codeclone/html_report.py` — public HTML facade/re-export surface; preserve backward-compatible imports here; do not
- grow section/layout logic in this module.
-- `codeclone/_html_report/*` — actual HTML assembly, context shaping, tabs, sections, and overview/navigation behavior;
- change report layout and interactive HTML UX here, not in the facade.
-- `codeclone/_html_*.py` — shared HTML badges, CSS, JS, escaping, snippets, and data-attrs; keep these as render-only
- helpers.
+- `codeclone/contracts/*` — version constants, schema types, exit enum, URLs, and typed exceptions. Treat as contract
+ surface.
- `codeclone/models.py` — shared typed models crossing modules; keep model changes contract-aware.
- `codeclone/domain/*.py` — centralized domain taxonomies/IDs (families, categories, source scopes, risk/severity
levels); use these constants in pipeline/report/UI instead of scattering raw literals.
+- `codeclone/ui_messages/*` — CLI text/marker/help constants and formatter helpers. Keep message policy centralized.
- `docs/`, `mkdocs.yml`, `.github/workflows/docs.yml`, `scripts/build_docs_example_report.py` — docs-site source,
publication workflow, and live sample-report generation; keep published docs aligned with code contracts.
- `extensions/vscode-codeclone/*` — preview VS Code extension surface; keep it baseline-aware, triage-first,
@@ -468,19 +475,22 @@ Use this map to route changes to the right owner module.
Dependency direction is enforceable and partially test-guarded (`tests/test_architecture.py`):
-- `codeclone.report.*` must not import `codeclone.cli`, `codeclone.html_report`, or `codeclone.ui_messages`.
-- `codeclone.extractor` must not import `codeclone.report`, `codeclone.cli`, or `codeclone.baseline`.
-- `codeclone.grouping` must not import `codeclone.cli`, `codeclone.baseline`, or `codeclone.html_report`.
-- `codeclone.baseline` and `codeclone.cache` must not import `codeclone.cli`, `codeclone.ui_messages`, or
- `codeclone.html_report`.
-- `codeclone.models` may import only `codeclone.contracts` and `codeclone.errors` from local modules.
+- `codeclone.report.*` must not import `codeclone.ui_messages`, `codeclone.surfaces.cli`, or HTML consumers outside
+ `codeclone.report.html.*`.
+- `codeclone.baseline` and `codeclone.cache` must not import `codeclone.surfaces.cli`, `codeclone.ui_messages`, or
+ `codeclone.report.html`.
+- `codeclone.core` must not import `codeclone.surfaces.*` or `codeclone.config`.
+- `codeclone.analysis`, `codeclone.findings`, and `codeclone.metrics` must not import `codeclone.surfaces.*`; analysis
+  and findings must also stay independent of config/report-builder wiring.
+- `codeclone.models` may import only `codeclone.contracts` from local modules.
+- `codeclone.domain.*` must remain leaf domain modules.
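> A hedged sketch of how one such direction can be test-guarded (the real assertions live in `tests/test_architecture.py`; the helper and test names below are assumptions):

```python
import ast
from pathlib import Path

# Restates one rule from the list above: core must not import surfaces/config.
BANNED_FOR_CORE = ("codeclone.surfaces", "codeclone.config")


def imported_module_names(source: Path) -> set[str]:
    """Collect absolute module names imported by one source file."""
    tree = ast.parse(source.read_text(encoding="utf-8"))
    names: set[str] = set()
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            names.update(alias.name for alias in node.names)
        elif isinstance(node, ast.ImportFrom) and node.module:
            names.add(node.module)
    return names


def test_core_never_imports_surfaces_or_config() -> None:
    for source in Path("codeclone/core").rglob("*.py"):
        for name in imported_module_names(source):
            assert not name.startswith(BANNED_FOR_CORE), f"{source} imports {name}"
```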
Operational rules:
-- Core/domain code must not depend on HTML/UI.
-- Renderers depend on canonical report payload/model; canonical report code must not depend on renderer/UI.
+- Core/domain code must not depend on HTML/UI or MCP.
+- Renderers depend on canonical report payload/model; canonical report builders must not depend on renderer/UI.
- Metrics/report layers must not recompute or invent core facts in UI.
-- CLI helper modules (`_cli_*`) must orchestrate/format, not own domain semantics.
+- CLI support modules under `codeclone/surfaces/cli/*` must orchestrate/format, not own domain semantics.
- Persistence semantics (baseline/cache trust/integrity) must stay in persistence/domain modules, not in render/UI
layers.
- MCP may depend on pipeline/report/contracts, but core/persistence/report layers must not depend on MCP modules.
@@ -489,7 +499,7 @@ Operational rules:
Inline suppressions are explicit local policy, not analysis truth.
-- Supported syntax is `# codeclone: ignore[rule-id,...]` via `codeclone/suppressions.py`.
+- Supported syntax is `# codeclone: ignore[rule-id,...]` via `codeclone/analysis/suppressions.py`.
- Binding scope is declaration-only (`def`, `async def`, `class`) using:
- leading comment on the line immediately before declaration
- inline comment on the declaration header start line
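> Both accepted placements, as a minimal sketch (the `dead-code` rule id here is illustrative, not a confirmed rule name):

```python
# Leading comment on the line immediately before the declaration:
# codeclone: ignore[dead-code]
def legacy_entry_point():
    ...


# Inline comment on the declaration header start line:
class LegacyAdapter:  # codeclone: ignore[dead-code]
    ...
```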
@@ -510,13 +520,13 @@ If you change a contract-sensitive zone, route docs/tests/approval deliberately.
| Change zone | Must update docs | Must update tests | Explicit approval required when | Contract-change trigger |
|-------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------|
-| Baseline schema/trust/integrity (`codeclone/baseline.py`) | `docs/book/06-baseline.md`, `docs/book/14-compatibility-and-versioning.md`, `docs/book/appendix/b-schema-layouts.md`, `CHANGELOG.md` | `tests/test_baseline.py`, CI/CLI behavior tests (`tests/test_cli_inprocess.py`, `tests/test_cli_unit.py`) | schema/trust semantics, compatibility windows, payload integrity logic change | baseline key layout/status semantics/compat rules change |
-| Cache schema/profile/integrity (`codeclone/cache.py`) | `docs/book/07-cache.md`, `docs/book/appendix/b-schema-layouts.md`, `CHANGELOG.md` | `tests/test_cache.py`, pipeline/CLI cache integration tests | cache schema/status/profile compatibility semantics change | cache payload/version/status semantics change |
-| Canonical report JSON shape (`codeclone/report/json_contract.py`, report projections) | `docs/book/08-report.md` (+ `docs/book/10-html-render.md` if rendering contract impacted), `docs/sarif.md` when SARIF changes, `CHANGELOG.md` | `tests/test_report.py`, `tests/test_report_contract_coverage.py`, `tests/test_report_branch_invariants.py`, relevant report-format tests | finding/meta/summary schema changes | stable JSON fields/meaning/order guarantees change |
-| CLI flags/help/exit behavior (`codeclone/cli.py`, `_cli_*`, `contracts.py`) | `docs/book/09-cli.md`, `docs/book/03-contracts-exit-codes.md`, `README.md`, `CHANGELOG.md` | `tests/test_cli_unit.py`, `tests/test_cli_inprocess.py`, `tests/test_cli_smoke.py` | exit-code semantics, script-facing behavior, flag contracts change | user-visible CLI contract changes |
+| Baseline schema/trust/integrity (`codeclone/baseline/clone_baseline.py`, `codeclone/baseline/trust.py`) | `docs/book/06-baseline.md`, `docs/book/14-compatibility-and-versioning.md`, `docs/book/appendix/b-schema-layouts.md`, `CHANGELOG.md` | `tests/test_baseline.py`, CI/CLI behavior tests (`tests/test_cli_inprocess.py`, `tests/test_cli_unit.py`) | schema/trust semantics, compatibility windows, payload integrity logic change | baseline key layout/status semantics/compat rules change |
+| Cache schema/profile/integrity (`codeclone/cache/store.py`, `codeclone/cache/versioning.py`, `codeclone/cache/integrity.py`) | `docs/book/07-cache.md`, `docs/book/appendix/b-schema-layouts.md`, `CHANGELOG.md` | `tests/test_cache.py`, pipeline/CLI cache integration tests | cache schema/status/profile compatibility semantics change | cache payload/version/status semantics change |
+| Canonical report JSON shape (`codeclone/report/document/*`, report projections) | `docs/book/08-report.md` (+ `docs/book/10-html-render.md` if rendering contract impacted), `docs/sarif.md` when SARIF changes, `CHANGELOG.md` | `tests/test_report.py`, `tests/test_report_contract_coverage.py`, `tests/test_report_branch_invariants.py`, relevant report-format tests | finding/meta/summary schema changes | stable JSON fields/meaning/order guarantees change |
+| CLI flags/help/exit behavior (`codeclone/main.py`, `codeclone/surfaces/cli/*`, `codeclone/config/*`, `codeclone/contracts/*`) | `docs/book/09-cli.md`, `docs/book/03-contracts-exit-codes.md`, `README.md`, `CHANGELOG.md` | `tests/test_cli_unit.py`, `tests/test_cli_inprocess.py`, `tests/test_cli_smoke.py` | exit-code semantics, script-facing behavior, flag contracts change | user-visible CLI contract changes |
| Fingerprint-adjacent analysis (`extractor/cfg/normalize/grouping`) | `docs/book/05-core-pipeline.md`, `docs/cfg.md`, `docs/book/14-compatibility-and-versioning.md`, `CHANGELOG.md` | `tests/test_fingerprint.py`, `tests/test_extractor.py`, `tests/test_cfg.py`, golden tests (`tests/test_detector_golden.py`, `tests/test_golden_v2.py`) | always (see Section 1.6) | clone identity / NEW-vs-KNOWN / fingerprint inputs change |
-| Suppression semantics/reporting (`suppressions`, extractor dead-code wiring, report/UI counters) | `docs/book/19-inline-suppressions.md`, `docs/book/16-dead-code-contract.md`, `docs/book/08-report.md`, and interface docs if surfaced (`09-cli`, `10-html-render`) | `tests/test_suppressions.py`, `tests/test_extractor.py`, `tests/test_metrics_modules.py`, `tests/test_pipeline_metrics.py`, report/html/cli tests | declaration scope semantics, rule effect, or contract-visible counters/fields change | suppression changes alter active finding output or contract-visible report payload |
-| MCP interface (`codeclone/mcp_service.py`, `codeclone/mcp_server.py`, packaging extra/launcher) | `README.md`, `docs/book/20-mcp-interface.md`, `docs/mcp.md`, `docs/book/01-architecture-map.md`, `docs/book/14-compatibility-and-versioning.md`, `CHANGELOG.md` | `tests/test_mcp_service.py`, `tests/test_mcp_server.py`, plus CLI/package tests if launcher/install semantics change | tool/resource shapes, read-only semantics, optional-dependency packaging behavior change | public MCP tool names, resource URIs, launcher/install behavior, or response semantics change |
+| Suppression semantics/reporting (`codeclone/analysis/suppressions.py`, extractor dead-code wiring, report/UI counters) | `docs/book/19-inline-suppressions.md`, `docs/book/16-dead-code-contract.md`, `docs/book/08-report.md`, and interface docs if surfaced (`09-cli`, `10-html-render`) | `tests/test_suppressions.py`, `tests/test_extractor.py`, `tests/test_metrics_modules.py`, `tests/test_pipeline_metrics.py`, report/html/cli tests | declaration scope semantics, rule effect, or contract-visible counters/fields change | suppression changes alter active finding output or contract-visible report payload |
+| MCP interface (`codeclone/surfaces/mcp/*`, packaging extra/launcher) | `README.md`, `docs/book/20-mcp-interface.md`, `docs/mcp.md`, `docs/book/01-architecture-map.md`, `docs/book/14-compatibility-and-versioning.md`, `CHANGELOG.md` | `tests/test_mcp_service.py`, `tests/test_mcp_server.py`, plus CLI/package tests if launcher/install semantics change | tool/resource shapes, read-only semantics, optional-dependency packaging behavior change | public MCP tool names, resource URIs, launcher/install behavior, or response semantics change |
| VS Code extension surface (`extensions/vscode-codeclone/*`) | `README.md`, `docs/book/21-vscode-extension.md`, `docs/vscode-extension.md`, `docs/book/01-architecture-map.md`, `docs/README.md`, `CHANGELOG.md` | `node --check extensions/vscode-codeclone/src/support.js`, `node --check extensions/vscode-codeclone/src/mcpClient.js`, `node --check extensions/vscode-codeclone/src/extension.js`, `node --test extensions/vscode-codeclone/test/*.test.js`, plus local extension-host smoke and package smoke when surface/manifest/assets change | command/view UX, trust/runtime model, source-first review flow, or packaging metadata change | documented commands/views/setup/trust behavior, packaged assets, or publish metadata change |
| Claude Desktop bundle surface (`extensions/claude-desktop-codeclone/*`) | `docs/book/22-claude-desktop-bundle.md`, `docs/claude-desktop-bundle.md`, `docs/mcp.md`, `docs/book/01-architecture-map.md`, `docs/README.md`, `CHANGELOG.md` | `node --check extensions/claude-desktop-codeclone/server/index.js`, `node --check extensions/claude-desktop-codeclone/src/launcher.js`, `node --check extensions/claude-desktop-codeclone/scripts/build-mcpb.mjs`, `node --test extensions/claude-desktop-codeclone/test/*.test.js`, plus `.mcpb` build smoke | bundle install/runtime model, launcher UX, local-stdio constraints, or bundle metadata change | documented Claude Desktop install/setup/runtime behavior or packaged bundle semantics change |
| Codex plugin surface (`plugins/codeclone/*`, `.agents/plugins/marketplace.json`) | `docs/book/23-codex-plugin.md`, `docs/codex-plugin.md`, `docs/mcp.md`, `docs/book/01-architecture-map.md`, `docs/README.md`, `CHANGELOG.md` | `python3 -m json.tool plugins/codeclone/.codex-plugin/plugin.json`, `python3 -m json.tool plugins/codeclone/.mcp.json`, `python3 -m json.tool .agents/plugins/marketplace.json`, `tests/test_codex_plugin.py` | plugin discovery/runtime model, bundled MCP config, bundled skill behavior, or plugin metadata change | documented Codex plugin install/discovery/runtime behavior or plugin manifest/marketplace semantics change |
@@ -564,8 +574,9 @@ Policy:
### Internal implementation surfaces
-- Local helpers and formatting utilities (`_html_*`, many private `_as_*` normalizers, local transformers).
-- Internal orchestration decomposition inside `_cli_*` modules.
+- Local helpers and formatting utilities (`codeclone/report/html/widgets/*`,
+ `codeclone/report/html/primitives/*`, many private `_as_*` normalizers, local transformers).
+- Internal orchestration decomposition inside `codeclone/surfaces/cli/*`.
- Private utility refactors that do not change public payloads, exit semantics, ordering, or trust rules.
If classification is ambiguous, treat it as contract-sensitive and add tests/docs before merging.
@@ -660,7 +671,7 @@ These rules exist because of real incidents in this repo. They are non-negotiabl
- Every doc claim about code (schema version, module path, function name, MCP tool count, exit code,
CLI flag) must be verified against the **current** code before writing or editing.
-- Always read version constants from `codeclone/contracts.py` (see Section 4 table), never from
+- Always read version constants from `codeclone/contracts/__init__.py` (see Section 4 table), never from
another doc.
- When updating a file that mentions schema versions, verify **every** version reference in that
file — not only the one you came to change.
@@ -678,10 +689,11 @@ These rules exist because of real incidents in this repo. They are non-negotiabl
### Shared helpers
-- HTML/UI helpers (`_html_badges.py`, `_html_css.py`, `_html_js.py`, `_html_escape.py`,
- `_html_report/_glossary.py`) are imported, not duplicated locally inside `_html_report/_sections/*`.
+- HTML/UI helpers (`codeclone/report/html/widgets/*`, `codeclone/report/html/primitives/*`,
+ `codeclone/report/html/assets/*`) are imported, not duplicated locally inside
+ `codeclone/report/html/sections/*`.
If you need a helper that doesn't exist, add it to the shared module.
-- Glossary terms used in stat-card labels live in `codeclone/_html_report/_glossary.py`. Adding a
+- Glossary terms used in stat-card labels live in `codeclone/report/html/widgets/glossary.py`. Adding a
new label without a glossary entry is a contract gap.
### Conflict avoidance
@@ -699,7 +711,7 @@ These rules exist because of real incidents in this repo. They are non-negotiabl
- A task that touches MCP is not complete until
`pytest tests/test_mcp_service.py tests/test_mcp_server.py -x -q` is green.
- A task that touches docs schema/version claims is not complete until you have grep'd the whole
- file for *all* version-shaped strings and verified each against `codeclone/contracts.py`.
+ file for *all* version-shaped strings and verified each against `codeclone/contracts/__init__.py`.
---
diff --git a/CHANGELOG.md b/CHANGELOG.md
index db54f6e..24dc77c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,52 @@
# Changelog
+## [2.0.0b6] - 2026-04-28
+
+The global package refactor lands here: the entire runtime moves onto the
+canonical module layout and legacy shims are removed for good. On top of that,
+dependency-depth scoring is replaced with an adaptive project-relative model,
+and the report/cache contracts advance to surface the new depth profile and the
+report-only `security_surfaces` layer.
+
+### Package layout and contracts
+
+- Move the runtime fully onto the canonical package layout: `main` + `surfaces/cli`, `surfaces/mcp`, `core`, `analysis`,
+ `baseline`, `cache`, `contracts`, `report/document`, `report/renderers`, and `report/html`.
+- Remove remaining legacy root shims and stale compatibility modules in favor of direct canonical imports.
+- Remove stale deleted-file cache entries and trim post-refactor import tails that were inflating dependency depth and
+ clone pressure.
+- Bump report schema to `2.10` and cache schema to `2.6` for additive dependency depth profile fields and
+ `security_surfaces` facts; keep clone baseline schema `2.1` and metrics-baseline schema `1.2` unchanged.
+- Preserve deterministic contracts and read-only MCP semantics across the new layout.
+
+### Dependency depth scoring
+
+- Replace the old fixed dependency-depth penalty (`max_depth > 8`) with an adaptive internal-graph profile based on
+ `avg_depth`, `p95_depth`, and `max_depth`.
+- Keep dependency cycles as the hard signal; treat acyclic depth as adaptive pressure relative to the project's own
+ dependency profile.
+- Limit dependency-depth scoring to the internal module graph, excluding external imports such as `typing` or
+  `argparse`.
+- Surface the dependency depth profile in the canonical report, HTML Dependencies tab, and CLI/CI summaries.
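> To make the adaptive profile concrete, a rough sketch under stated assumptions: the internal import graph as an adjacency dict, cycles already extracted so the graph is a DAG, and a simple nearest-rank `p95`. None of this is the shipped implementation:

```python
from functools import cache

# Hypothetical internal import graph (module -> internal modules it imports).
INTERNAL_IMPORTS: dict[str, list[str]] = {
    "codeclone.main": ["codeclone.surfaces.cli"],
    "codeclone.surfaces.cli": ["codeclone.core"],
    "codeclone.core": ["codeclone.analysis"],
    "codeclone.analysis": [],
}


@cache
def depth(module: str) -> int:
    # Longest chain of internal imports below this module; external imports
    # such as `typing` never enter the graph, and cycles stay a separate
    # hard signal.
    children = INTERNAL_IMPORTS.get(module, [])
    return 0 if not children else 1 + max(depth(child) for child in children)


depths = sorted(depth(name) for name in INTERNAL_IMPORTS)
avg_depth = sum(depths) / len(depths)
p95_depth = depths[min(len(depths) - 1, round(0.95 * (len(depths) - 1)))]
max_depth = depths[-1]
print(avg_depth, p95_depth, max_depth)  # 1.5 3 3
```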
+
+### Security surfaces
+
+- Add `metrics.families.security_surfaces`: a report-only exact inventory of security-relevant capability surfaces and
+ trust-boundary code.
+- Surface compact `security_surfaces` facts in canonical report JSON, CLI Metrics, HTML Quality, text/markdown
+ projections, and MCP summaries / `metrics_detail`.
+- Keep the layer honest: no vulnerability claims, no score impact, no gates, no SARIF security findings, and no baseline
+ truth.
+
+### Tooling, docs, and UX
+
+- Refresh AGENTS, docs/book, and changelog content for the b6 package layout and report schema `2.10`.
+- Tighten preview client metadata and install guidance for VS Code, Claude Desktop, and Codex.
+- Replace the Codex plugin shell snippet with a repo-local shell-free launcher, and parallelize VS Code post-run MCP
+ artifact hydration.
+- Add a quiet one-time VS Code extension hint in interactive VS Code terminals, tracked per CodeClone version next to
+ the resolved project cache path.
+
## [2.0.0b5] - 2026-04-16
Expands the canonical contract with adoption, API-surface, and coverage-join layers; clarifies run interpretation
@@ -21,7 +68,8 @@ across MCP/HTML/clients; tightens MCP launcher/runtime behavior.
`--fail-on-docstring-regression`, `--fail-on-api-break`, `--fail-on-untested-hotspots`, `--coverage-min`.
- Surface adoption/API/coverage-join in MCP, CLI Metrics, report payloads, and HTML (Overview + Quality subtab).
- Preserve embedded metrics and optional `api_surface` in unified baselines.
-- Cache `2.5`: make analysis-profile compatibility API-surface-aware; invalidate stale non-API warm caches; preserve parameter order; align warm/cold API diffs.
+- Cache `2.5`: make analysis-profile compatibility API-surface-aware; invalidate stale non-API warm caches; preserve
+ parameter order; align warm/cold API diffs.
### MCP, HTML, and client interpretation
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index af4bb11..cd105f6 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -49,7 +49,7 @@ When reporting issues related to clone detection, include:
- minimal reproducible code snippets (preferred over screenshots);
- the CodeClone version;
-- the Python version (`python_tag`, e.g. `cp313`);
+- the Python version (`python_tag`, e.g. `cp314`);
- whether the issue is primarily:
- AST-related,
- CFG-related,
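> For completeness, `python_tag` follows the CPython interpreter-tag convention (`cp` plus major and minor version); a one-line sketch for finding yours (assuming CPython):

```python
import sys

# Prints "cp314" on CPython 3.14, matching the tag recorded in baseline metadata.
print(f"cp{sys.version_info.major}{sys.version_info.minor}")
```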
diff --git a/LICENSE-MIT b/LICENSE-MIT
new file mode 100644
index 0000000..9f3d32f
--- /dev/null
+++ b/LICENSE-MIT
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2026 Denis Rozhnovskiy
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/LICENSE-docs b/LICENSE-docs
deleted file mode 100644
index 66b3e88..0000000
--- a/LICENSE-docs
+++ /dev/null
@@ -1,25 +0,0 @@
-MIT License
-
-Copyright (c) 2024 Denis Rozhnovskiy
-
-This license applies to documentation in this repository, including the
-`docs/` tree and Markdown documentation files, unless a file states
-otherwise.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this documentation and associated files (the "Documentation"), to deal
-in the Documentation without restriction, including without limitation the
-rights to use, copy, modify, merge, publish, distribute, sublicense,
-and/or sell copies of the Documentation, and to permit persons to whom the
-Documentation is furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Documentation.
-
-THE DOCUMENTATION IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE DOCUMENTATION OR THE USE OR OTHER DEALINGS IN
-THE DOCUMENTATION.
diff --git a/LICENSES.md b/LICENSES.md
new file mode 100644
index 0000000..ed0a973
--- /dev/null
+++ b/LICENSES.md
@@ -0,0 +1,22 @@
+# License Scope
+
+CodeClone uses a dual-license layout in this repository.
+
+## Default mapping
+
+- Source code and other implementation files are licensed under
+ [MPL-2.0](LICENSE).
+- Documentation content, including the `docs/` tree and published docs-site
+ content, is licensed under [MIT](LICENSE-MIT).
+
+## File-level overrides
+
+If a file or bundled third-party artifact includes its own license notice, that
+file-level notice takes precedence over this default mapping.
+
+## Notes
+
+- Keep [LICENSE](LICENSE) and [LICENSE-MIT](LICENSE-MIT) as canonical license
+ texts for tooling and GitHub license detection.
+- Use this file to describe scope, not to redefine the underlying license
+ texts.
diff --git a/README.md b/README.md
index 0284773..047d97e 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,18 @@
-
-
-
+
+
+
@@ -16,7 +26,7 @@
-
+
@@ -25,7 +35,7 @@
CodeClone provides deterministic structural code quality analysis for Python.
It detects architectural duplication, computes quality metrics, and enforces CI gates — all with **baseline-aware
governance** that separates **known** technical debt from **new** regressions.
-An optional MCP interface exposes the same canonical analysis pipeline to AI agents and IDEs.
+A triage-first MCP control surface exposes the same canonical pipeline to AI agents and IDEs.
Docs: [orenlab.github.io/codeclone](https://orenlab.github.io/codeclone/) ·
Live sample report:
@@ -42,13 +52,13 @@ Live sample report:
- **Clone detection** — function (CFG fingerprint), block (statement windows), and segment (report-only) clones
- **Structural findings** — duplicated branch families, clone guard/exit divergence, and clone-cohort drift
-- **Quality metrics** — cyclomatic complexity, coupling (CBO), cohesion (LCOM4), dependency cycles, dead code,
- health score, and overloaded-module profiling
+- **Quality metrics** — cyclomatic complexity, coupling (CBO), cohesion (LCOM4), dependency cycles, adaptive depth
+ profile, dead code, health score, and overloaded-module profiling
- **Adoption & API** — type/docstring annotation coverage, public API surface inventory and baseline diff
- **Coverage Join** — fuse external Cobertura XML into the current run to surface coverage hotspots and scope gaps
- **Baseline governance** — separates accepted **legacy** debt from **new regressions**; CI fails only on what changed
- **Reports** — interactive HTML, JSON, Markdown, SARIF, and text from one canonical report
-- **MCP server** — optional read-only surface for AI agents and IDEs
+- **MCP control surface** — triage-first agent and IDE interface over the same canonical pipeline; read-only by contract
- **IDE & agent clients** — VS Code extension, Claude Desktop bundle, and Codex plugin over the same MCP contract
- **CI-first** — deterministic output, stable ordering, exit code contract, pre-commit support
- **Fast** — incremental caching, parallel processing, warm-run optimization
@@ -169,16 +179,21 @@ repos:
types: [ python ]
```
-## MCP Server
+## MCP Control Surface
-Optional read-only MCP server for AI agents and IDE clients.
-Never mutates source, baselines, or repo state.
+Triage-first MCP server for AI agents and IDE clients, built on the same canonical pipeline as the CLI. Read-only by
+contract: never mutates source, baselines, or repo state.
```bash
-uv tool install --pre "codeclone[mcp]" # or: uv pip install --pre "codeclone[mcp]"
+uv tool install --pre "codeclone[mcp]"
+# or
+uv pip install --pre "codeclone[mcp]"
-codeclone-mcp --transport stdio # local (Claude Code, Codex, Copilot, Gemini CLI)
-codeclone-mcp --transport streamable-http # remote / HTTP-only clients
+# local stdio clients
+codeclone-mcp --transport stdio
+
+# remote / HTTP-only clients
+codeclone-mcp --transport streamable-http
```
[MCP usage guide](https://orenlab.github.io/codeclone/mcp/) ·
@@ -192,7 +207,7 @@ codeclone-mcp --transport streamable-http # remote / HTTP-only clients
| **Claude Desktop bundle** | [`extensions/claude-desktop-codeclone/`](https://github.com/orenlab/codeclone/tree/main/extensions/claude-desktop-codeclone) | Local `.mcpb` install with pre-loaded instructions |
| **Codex plugin** | [`plugins/codeclone/`](https://github.com/orenlab/codeclone/tree/main/plugins/codeclone) | Native discovery, two skills, and MCP definition |
-All three are thin wrappers over the same `codeclone-mcp` contract — no second analysis engine.
+All three are native clients over the same `codeclone-mcp` contract — no second analysis engine.
[VS Code extension docs](https://orenlab.github.io/codeclone/book/21-vscode-extension/) ·
[Claude Desktop docs](https://orenlab.github.io/codeclone/book/22-claude-desktop-bundle/) ·
@@ -268,13 +283,13 @@ Report contract: [Report contract](https://orenlab.github.io/codeclone/book/08-r
[HTML render](https://orenlab.github.io/codeclone/book/10-html-render/)
-Canonical JSON report shape (v2.8)
+Canonical JSON report shape (v2.10)
```json
{
- "report_schema_version": "2.8",
+ "report_schema_version": "2.10",
"meta": {
- "codeclone_version": "2.0.0b5",
+ "codeclone_version": "2.0.0b6",
"project_name": "...",
"scan_root": ".",
"report_mode": "full",
@@ -341,15 +356,27 @@ Report contract: [Report contract](https://orenlab.github.io/codeclone/book/08-r
"metrics": {
"summary": {
"...": "...",
- "coverage_adoption": { "...": "..." },
- "coverage_join": { "...": "..." },
- "api_surface": { "...": "..." }
+ "coverage_adoption": {
+ "...": "..."
+ },
+ "coverage_join": {
+ "...": "..."
+ },
+ "api_surface": {
+ "...": "..."
+ }
},
"families": {
"...": "...",
- "coverage_adoption": { "...": "..." },
- "coverage_join": { "...": "..." },
- "api_surface": { "...": "..." }
+ "coverage_adoption": {
+ "...": "..."
+ },
+ "coverage_join": {
+ "...": "..."
+ },
+ "api_surface": {
+ "...": "..."
+ }
}
},
"derived": {
@@ -455,8 +482,8 @@ in [Benchmarking contract](https://orenlab.github.io/codeclone/book/18-benchmark
## License
-- **Code:** MPL-2.0
-- **Documentation:** MIT
+- **Code:** MPL-2.0 (`LICENSE`)
+- **Documentation and docs-site content:** MIT (`LICENSE-MIT`)
Versions released before this change remain under their original license terms.
@@ -465,4 +492,4 @@ Versions released before this change remain under their original license terms.
- **Docs:**
- **Issues:**
- **PyPI:**
-- **Licenses:** [MPL-2.0](LICENSE) · [MIT docs](LICENSE-docs)
+- **Licenses:** [MPL-2.0](https://github.com/orenlab/codeclone/blob/main/LICENSE) · [MIT docs](https://github.com/orenlab/codeclone/blob/main/LICENSE-MIT) · [Scope map](https://github.com/orenlab/codeclone/blob/main/LICENSES.md)
diff --git a/SECURITY.md b/SECURITY.md
index 333de2d..72cf77a 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -10,7 +10,7 @@ The following versions currently receive security updates:
| Version | Supported |
|---------|-----------|
| 2.0.x | Yes |
-| 1.4.x | Yes |
+| 1.4.x | No |
| 1.3.x | No |
| 1.2.x | No |
| 1.1.x | No |
diff --git a/benchmarks/Dockerfile b/benchmarks/Dockerfile
index 8768aad..c747fc4 100644
--- a/benchmarks/Dockerfile
+++ b/benchmarks/Dockerfile
@@ -1,6 +1,6 @@
# syntax=docker/dockerfile:1.7
-FROM python:3.13.2-slim-bookworm
+FROM python:3.14.3-slim-bookworm
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
diff --git a/benchmarks/baselines/reference-cp313.json b/benchmarks/baselines/reference-cp313.json
new file mode 100644
index 0000000..f8089ce
--- /dev/null
+++ b/benchmarks/baselines/reference-cp313.json
@@ -0,0 +1,120 @@
+{
+ "benchmark_schema_version": "1.0",
+ "tool": {
+ "name": "codeclone",
+ "version": "2.0.0b5",
+ "python_tag": "cp313"
+ },
+ "config": {
+ "target": "",
+ "runs": 3,
+ "warmups": 1,
+ "python_executable": ""
+ },
+ "environment": {
+ "platform": "macOS-15.7.5-arm64-arm-64bit-Mach-O",
+ "machine": "arm64",
+ "python_version": "3.13.12",
+ "python_implementation": "CPython",
+ "python_tag": "cp313",
+ "cpu_count": 10,
+ "cpu_affinity_count": null,
+ "container_detected": false,
+ "cgroup_cpu_max": null,
+ "cgroup_memory_max": null,
+ "timestamp_utc": "2026-04-17T13:45:19Z"
+ },
+ "scenarios": [
+ {
+ "name": "cold_full",
+ "mode": "cold",
+ "extra_args": [],
+ "warmups": 1,
+ "runs": 3,
+ "deterministic": true,
+ "digest": "0e4366a01ad8a0db646c9a984e92fa913166119541e296387d963c5ae4301bc9",
+ "timings_seconds": [
+ 1.0560423749998336,
+ 1.0907688339998458,
+ 1.0867978750002294
+ ],
+ "stats_seconds": {
+ "min": 1.0560423749998336,
+ "max": 1.0907688339998458,
+ "mean": 1.0778696946666362,
+ "median": 1.0867978750002294,
+ "p95": 1.090371738099884,
+ "stdev": 0.015519150357364984
+ },
+ "inventory_sample": {
+ "found": 180,
+ "analyzed": 180,
+ "cached": 0,
+ "skipped": 0
+ }
+ },
+ {
+ "name": "warm_full",
+ "mode": "warm",
+ "extra_args": [],
+ "warmups": 1,
+ "runs": 3,
+ "deterministic": true,
+ "digest": "55ea63867ffdd599784d10cd0c86d15ba0944e128d62d4d6cb8e68ce8779ea2e",
+ "timings_seconds": [
+ 0.2863777919997119,
+ 0.2806324170001062,
+ 0.27757904200007033
+ ],
+ "stats_seconds": {
+ "min": 0.27757904200007033,
+ "max": 0.2863777919997119,
+ "mean": 0.28152975033329614,
+ "median": 0.2806324170001062,
+ "p95": 0.2858032544997513,
+ "stdev": 0.003647684719762983
+ },
+ "inventory_sample": {
+ "found": 180,
+ "analyzed": 0,
+ "cached": 180,
+ "skipped": 0
+ }
+ },
+ {
+ "name": "warm_clones_only",
+ "mode": "warm",
+ "extra_args": [
+ "--skip-metrics"
+ ],
+ "warmups": 1,
+ "runs": 3,
+ "deterministic": true,
+ "digest": "8e2fbaf49e9f577b89348aa54fc8f7d6866c9c8213ff1e69d831edc2f663d907",
+ "timings_seconds": [
+ 0.2363325830001486,
+ 0.22605108300012944,
+ 0.21571508300030473
+ ],
+ "stats_seconds": {
+ "min": 0.21571508300030473,
+ "max": 0.2363325830001486,
+ "mean": 0.2260329163335276,
+ "median": 0.22605108300012944,
+ "p95": 0.23530443300014667,
+ "stdev": 0.00841706893091738
+ },
+ "inventory_sample": {
+ "found": 180,
+ "analyzed": 0,
+ "cached": 180,
+ "skipped": 0
+ }
+ }
+ ],
+ "comparisons": {
+ "warm_full_speedup_vs_cold_full": 3.8726740360854963,
+ "warm_clones_only_speedup_vs_warm_full": 1.2414557509549535
+ },
+ "generated_at_utc": "2026-04-17T13:45:19Z"
+}
diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py
index b77c96b..af04cf8 100755
--- a/benchmarks/run_benchmark.py
+++ b/benchmarks/run_benchmark.py
@@ -15,6 +15,7 @@
import subprocess
import sys
import time
+from collections.abc import Mapping, Sequence
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
@@ -25,6 +26,7 @@
from codeclone.baseline import current_python_tag
BENCHMARK_SCHEMA_VERSION = "1.0"
+BENCHMARK_CLI_MODULE = "codeclone.main"
BENCHMARK_NEUTRAL_ARGS: tuple[str, ...] = (
"--no-fail-on-new",
"--no-fail-on-new-metrics",
@@ -48,6 +50,8 @@
"-1",
"--min-docstring-coverage",
"-1",
+ "--no-api-surface",
+ "--no-update-metrics-baseline",
)
@@ -161,7 +165,7 @@ def _run_cli_once(
cmd = [
python_executable,
"-m",
- "codeclone.cli",
+ BENCHMARK_CLI_MODULE,
str(target),
*BENCHMARK_NEUTRAL_ARGS,
"--json",
@@ -240,6 +244,14 @@ def _validate_inventory_sample(
)
+def _print_bulleted_lines(header: str, lines: Sequence[str]) -> None:
+ if not lines:
+ return
+ print(header)
+ for line in lines:
+ print(f"- {line}")
+
+
def _scenario_result(
*,
scenario: Scenario,
@@ -393,6 +405,67 @@ def _median_for(name: str) -> float | None:
return comparisons
+def _load_benchmark_payload(path: Path) -> dict[str, object]:
+ payload_obj: object = json.loads(path.read_text(encoding="utf-8"))
+ if not isinstance(payload_obj, dict):
+ raise RuntimeError(f"benchmark payload is not an object: {path}")
+ return payload_obj
+
+
+def _scenario_medians(payload: Mapping[str, object]) -> dict[str, float]:
+ scenarios_obj = payload.get("scenarios")
+ if not isinstance(scenarios_obj, list):
+ raise RuntimeError("benchmark payload is missing a scenarios list")
+
+ medians: dict[str, float] = {}
+ for item in scenarios_obj:
+ if not isinstance(item, dict):
+ raise RuntimeError("benchmark scenario entry is not an object")
+ name = item.get("name")
+ stats = item.get("stats_seconds")
+ if not isinstance(name, str) or not isinstance(stats, dict):
+ raise RuntimeError("benchmark scenario entry is missing name/stats_seconds")
+ median = stats.get("median")
+ if not isinstance(median, (int, float)):
+ raise RuntimeError(f"benchmark scenario {name} is missing median timing")
+ medians[name] = float(median)
+ return medians
+
+
+def _timing_regressions(
+ *,
+ current_payload: Mapping[str, object],
+ baseline_payload: Mapping[str, object],
+ max_regression_pct: float,
+) -> list[str]:
+ current_medians = _scenario_medians(current_payload)
+ baseline_medians = _scenario_medians(baseline_payload)
+
+ missing = sorted(set(baseline_medians) - set(current_medians))
+ if missing:
+ raise RuntimeError(
+ "benchmark payload is missing baseline scenario(s): " + ", ".join(missing)
+ )
+
+ regressions: list[str] = []
+ for name, baseline_median in sorted(baseline_medians.items()):
+ if baseline_median <= 0:
+ raise RuntimeError(
+ f"baseline scenario {name} has non-positive median: {baseline_median}"
+ )
+ current_median = current_medians[name]
+ allowed_median = baseline_median * (1.0 + (max_regression_pct / 100.0))
+ if current_median <= allowed_median:
+ continue
+ regression_pct = ((current_median - baseline_median) / baseline_median) * 100.0
+ regressions.append(
+ f"{name}: median {current_median:.4f}s exceeds baseline "
+ f"{baseline_median:.4f}s by {regression_pct:.2f}% "
+ f"(allowed {max_regression_pct:.2f}%)"
+ )
+ return regressions
+
+
def _parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description=(
@@ -440,6 +513,18 @@ def _parse_args() -> argparse.Namespace:
default=sys.executable,
help="Python executable used to invoke codeclone CLI",
)
+ parser.add_argument(
+ "--baseline",
+ type=Path,
+ default=None,
+ help="Existing benchmark JSON used for per-scenario median regression checks.",
+ )
+ parser.add_argument(
+ "--max-regression-pct",
+ type=float,
+ default=5.0,
+ help="Allowed per-scenario median slowdown versus --baseline.",
+ )
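+ # Hypothetical usage: append `--baseline .cache/benchmarks/prev.json
+ # --max-regression-pct 5` to a normal run to gate against a prior report.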
return parser.parse_args()
@@ -449,6 +534,8 @@ def main() -> int:
raise SystemExit("--runs must be > 0")
if args.warmups < 0:
raise SystemExit("--warmups must be >= 0")
+ if args.max_regression_pct < 0:
+ raise SystemExit("--max-regression-pct must be >= 0")
target = args.target.resolve()
if not target.exists():
raise SystemExit(f"target does not exist: {target}")
@@ -501,6 +588,22 @@ def main() -> int:
.replace("+00:00", "Z"),
}
+ regressions: list[str] = []
+ baseline_path = args.baseline.resolve() if args.baseline is not None else None
+ if baseline_path is not None:
+ baseline_payload = _load_benchmark_payload(baseline_path)
+ regressions = _timing_regressions(
+ current_payload=payload,
+ baseline_payload=baseline_payload,
+ max_regression_pct=args.max_regression_pct,
+ )
+ payload["baseline_comparison"] = {
+ "baseline_path": str(baseline_path),
+ "max_regression_pct": args.max_regression_pct,
+ "status": "regression" if regressions else "ok",
+ "regressions": regressions,
+ }
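+ # Emitted fragment (values illustrative):
+ # "baseline_comparison": {"baseline_path": "...", "max_regression_pct": 5.0,
+ # "status": "ok", "regressions": []}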
+
args.output.parent.mkdir(parents=True, exist_ok=True)
tmp_output = args.output.with_suffix(args.output.suffix + ".tmp")
rendered = json.dumps(payload, ensure_ascii=False, indent=2)
@@ -522,12 +625,19 @@ def main() -> int:
f"p95={p95_s:.4f}s stdev={stdev_s:.4f}s "
f"digest={scenario['digest']}"
)
- if comparisons:
- print("ratios:")
- for name, value in sorted(comparisons.items()):
- print(f"- {name}={value:.3f}x")
+ _print_bulleted_lines(
+ "ratios:",
+ [f"{name}={value:.3f}x" for name, value in sorted(comparisons.items())],
+ )
+ if baseline_path is not None:
+ print(f"baseline={baseline_path}")
+ print(f"max_regression_pct={args.max_regression_pct:.2f}")
+ if regressions:
+ _print_bulleted_lines("regressions:", regressions)
+ else:
+ print("baseline_status=ok")
print(f"output={args.output}")
- return 0
+ return 1 if regressions else 0
if __name__ == "__main__":
diff --git a/benchmarks/run_docker_benchmark.sh b/benchmarks/run_docker_benchmark.sh
index 7a11fe7..c828a80 100755
--- a/benchmarks/run_docker_benchmark.sh
+++ b/benchmarks/run_docker_benchmark.sh
@@ -2,7 +2,7 @@
set -euo pipefail
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
-IMAGE_TAG="${IMAGE_TAG:-codeclone-benchmark:2.0.0b5}"
+IMAGE_TAG="${IMAGE_TAG:-codeclone-benchmark:local}"
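+# Override example (hypothetical tag): IMAGE_TAG=codeclone-benchmark:dev ./benchmarks/run_docker_benchmark.sh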
OUT_DIR="${OUT_DIR:-$ROOT_DIR/.cache/benchmarks}"
OUTPUT_BASENAME="${OUTPUT_BASENAME:-codeclone-benchmark.json}"
CPUSET="${CPUSET:-0}"
diff --git a/codeclone.baseline.json b/codeclone.baseline.json
index b4656f8..c6b5719 100644
--- a/codeclone.baseline.json
+++ b/codeclone.baseline.json
@@ -2,14 +2,14 @@
"meta": {
"generator": {
"name": "codeclone",
- "version": "2.0.0b5"
+ "version": "2.0.0b6"
},
"schema_version": "2.1",
"fingerprint_version": "1",
- "python_tag": "cp313",
- "created_at": "2026-04-13T13:10:37Z",
- "payload_sha256": "07a383c1d0974593c83ac30430aec9b99d89fe50f640a9b3b433658e0bd029e8",
- "metrics_payload_sha256": "122ee5d2d3dc2d4e9553b1d440c0314515dcb60cc79ada264b13c39c6ba18e04"
+ "python_tag": "cp314",
+ "created_at": "2026-04-24T14:37:27Z",
+ "payload_sha256": "a2e5e3ac672ddbc7ba95c3a9608257727a01480ef343bc6a70c168fc9355e99a",
+ "metrics_payload_sha256": "26ebd9e502bb4d98d97da593532395de140b2c64b03d85ab91e681f9025fedff"
},
"clones": {
"functions": [],
@@ -18,14 +18,18 @@
"metrics": {
"max_complexity": 20,
"high_risk_functions": [],
- "max_coupling": 10,
+ "max_coupling": 9,
"high_coupling_classes": [],
"max_cohesion": 3,
"low_cohesion_classes": [],
"dependency_cycles": [],
- "dependency_max_depth": 11,
+ "dependency_max_depth": 16,
"dead_code_items": [],
- "health_score": 89,
- "health_grade": "B"
+ "health_score": 90,
+ "health_grade": "A",
+ "typing_param_permille": 1000,
+ "typing_return_permille": 999,
+ "docstring_permille": 39,
+ "typing_any_count": 10
}
}
diff --git a/codeclone/_cli_args.py b/codeclone/_cli_args.py
deleted file mode 100644
index 7ad4c95..0000000
--- a/codeclone/_cli_args.py
+++ /dev/null
@@ -1,456 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# SPDX-License-Identifier: MPL-2.0
-# Copyright (c) 2026 Den Rozhnovskiy
-
-from __future__ import annotations
-
-import argparse
-import sys
-from typing import NoReturn
-
-from . import ui_messages as ui
-from .contracts import (
- DEFAULT_COHESION_THRESHOLD,
- DEFAULT_COMPLEXITY_THRESHOLD,
- DEFAULT_COUPLING_THRESHOLD,
- DEFAULT_HEALTH_THRESHOLD,
- ExitCode,
- cli_help_epilog,
-)
-
-DEFAULT_ROOT = "."
-DEFAULT_MIN_LOC = 10
-DEFAULT_MIN_STMT = 6
-DEFAULT_BLOCK_MIN_LOC = 20
-DEFAULT_BLOCK_MIN_STMT = 8
-DEFAULT_SEGMENT_MIN_LOC = 20
-DEFAULT_SEGMENT_MIN_STMT = 10
-DEFAULT_PROCESSES = 4
-DEFAULT_MAX_CACHE_SIZE_MB = 50
-DEFAULT_MAX_BASELINE_SIZE_MB = 5
-
-DEFAULT_BASELINE_PATH = "codeclone.baseline.json"
-DEFAULT_HTML_REPORT_PATH = ".cache/codeclone/report.html"
-DEFAULT_JSON_REPORT_PATH = ".cache/codeclone/report.json"
-DEFAULT_MARKDOWN_REPORT_PATH = ".cache/codeclone/report.md"
-DEFAULT_SARIF_REPORT_PATH = ".cache/codeclone/report.sarif"
-DEFAULT_TEXT_REPORT_PATH = ".cache/codeclone/report.txt"
-
-
-class _ArgumentParser(argparse.ArgumentParser):
- def error(self, message: str) -> NoReturn:
- self.print_usage(sys.stderr)
- self.exit(
- int(ExitCode.CONTRACT_ERROR),
- f"CONTRACT ERROR: {message}\n",
- )
-
-
-class _HelpFormatter(argparse.RawTextHelpFormatter):
- """Product-oriented help formatter extension point."""
-
-
-def _add_optional_path_argument(
- group: argparse._ArgumentGroup,
- *,
- flag: str,
- dest: str,
- help_text: str,
- default: str | None = None,
- const: str | None = None,
- metavar: str = "FILE",
-) -> None:
- group.add_argument(
- flag,
- dest=dest,
- nargs="?",
- metavar=metavar,
- default=default,
- const=const,
- help=help_text,
- )
-
-
-def _add_bool_optional_argument(
- group: argparse._ArgumentGroup,
- *,
- flag: str,
- help_text: str,
- default: bool = False,
-) -> None:
- group.add_argument(
- flag,
- action=argparse.BooleanOptionalAction,
- default=default,
- help=help_text,
- )
-
-
-def build_parser(version: str) -> _ArgumentParser:
- ap = _ArgumentParser(
- prog="codeclone",
- description="Structural code quality analysis for Python.",
- add_help=False,
- formatter_class=_HelpFormatter,
- epilog=cli_help_epilog(),
- )
-
- target_group = ap.add_argument_group("Target")
- target_group.add_argument(
- "root",
- nargs="?",
- default=DEFAULT_ROOT,
- help=ui.HELP_ROOT,
- )
-
- analysis_group = ap.add_argument_group("Analysis")
- analysis_group.add_argument(
- "--min-loc",
- type=int,
- default=DEFAULT_MIN_LOC,
- help=ui.HELP_MIN_LOC,
- )
- analysis_group.add_argument(
- "--min-stmt",
- type=int,
- default=DEFAULT_MIN_STMT,
- help=ui.HELP_MIN_STMT,
- )
- # Block/segment thresholds are advanced tuning: configurable via
- # pyproject.toml only (no CLI flags). Defaults live on the namespace
- # so apply_pyproject_config_overrides can override them.
- ap.set_defaults(
- block_min_loc=DEFAULT_BLOCK_MIN_LOC,
- block_min_stmt=DEFAULT_BLOCK_MIN_STMT,
- segment_min_loc=DEFAULT_SEGMENT_MIN_LOC,
- segment_min_stmt=DEFAULT_SEGMENT_MIN_STMT,
- golden_fixture_paths=(),
- )
- analysis_group.add_argument(
- "--processes",
- type=int,
- default=DEFAULT_PROCESSES,
- help=ui.HELP_PROCESSES,
- )
- _add_bool_optional_argument(
- analysis_group,
- flag="--changed-only",
- help_text=ui.HELP_CHANGED_ONLY,
- )
- analysis_group.add_argument(
- "--diff-against",
- default=None,
- metavar="GIT_REF",
- help=ui.HELP_DIFF_AGAINST,
- )
- analysis_group.add_argument(
- "--paths-from-git-diff",
- default=None,
- metavar="GIT_REF",
- help=ui.HELP_PATHS_FROM_GIT_DIFF,
- )
- _add_optional_path_argument(
- analysis_group,
- flag="--cache-path",
- dest="cache_path",
- default=None,
- const=None,
- help_text=ui.HELP_CACHE_PATH,
- )
- _add_optional_path_argument(
- analysis_group,
- flag="--cache-dir",
- dest="cache_path",
- default=None,
- const=None,
- help_text=ui.HELP_CACHE_DIR_LEGACY,
- )
- analysis_group.add_argument(
- "--max-cache-size-mb",
- type=int,
- default=DEFAULT_MAX_CACHE_SIZE_MB,
- metavar="MB",
- help=ui.HELP_MAX_CACHE_SIZE_MB,
- )
-
- baselines_ci_group = ap.add_argument_group("Baselines and CI")
- _add_optional_path_argument(
- baselines_ci_group,
- flag="--baseline",
- dest="baseline",
- default=DEFAULT_BASELINE_PATH,
- const=DEFAULT_BASELINE_PATH,
- help_text=ui.HELP_BASELINE,
- )
- baselines_ci_group.add_argument(
- "--max-baseline-size-mb",
- type=int,
- default=DEFAULT_MAX_BASELINE_SIZE_MB,
- metavar="MB",
- help=ui.HELP_MAX_BASELINE_SIZE_MB,
- )
- _add_bool_optional_argument(
- baselines_ci_group,
- flag="--update-baseline",
- help_text=ui.HELP_UPDATE_BASELINE,
- )
- _add_optional_path_argument(
- baselines_ci_group,
- flag="--metrics-baseline",
- dest="metrics_baseline",
- default=DEFAULT_BASELINE_PATH,
- const=DEFAULT_BASELINE_PATH,
- help_text=ui.HELP_METRICS_BASELINE,
- )
- _add_bool_optional_argument(
- baselines_ci_group,
- flag="--update-metrics-baseline",
- help_text=ui.HELP_UPDATE_METRICS_BASELINE,
- )
- _add_bool_optional_argument(
- baselines_ci_group,
- flag="--ci",
- help_text=ui.HELP_CI,
- )
- _add_bool_optional_argument(
- baselines_ci_group,
- flag="--api-surface",
- help_text=ui.HELP_API_SURFACE,
- )
- baselines_ci_group.add_argument(
- "--coverage",
- dest="coverage_xml",
- metavar="FILE",
- default=None,
- help=ui.HELP_COVERAGE,
- )
-
- quality_group = ap.add_argument_group("Quality gates")
- _add_bool_optional_argument(
- quality_group,
- flag="--fail-on-new",
- help_text=ui.HELP_FAIL_ON_NEW,
- )
- _add_bool_optional_argument(
- quality_group,
- flag="--fail-on-new-metrics",
- help_text=ui.HELP_FAIL_ON_NEW_METRICS,
- )
- quality_group.add_argument(
- "--fail-threshold",
- type=int,
- default=-1,
- metavar="MAX_CLONES",
- help=ui.HELP_FAIL_THRESHOLD,
- )
- quality_group.add_argument(
- "--fail-complexity",
- type=int,
- nargs="?",
- const=DEFAULT_COMPLEXITY_THRESHOLD,
- default=-1,
- metavar="CC_MAX",
- help=ui.HELP_FAIL_COMPLEXITY,
- )
- quality_group.add_argument(
- "--fail-coupling",
- type=int,
- nargs="?",
- const=DEFAULT_COUPLING_THRESHOLD,
- default=-1,
- metavar="CBO_MAX",
- help=ui.HELP_FAIL_COUPLING,
- )
- quality_group.add_argument(
- "--fail-cohesion",
- type=int,
- nargs="?",
- const=DEFAULT_COHESION_THRESHOLD,
- default=-1,
- metavar="LCOM4_MAX",
- help=ui.HELP_FAIL_COHESION,
- )
- _add_bool_optional_argument(
- quality_group,
- flag="--fail-cycles",
- help_text=ui.HELP_FAIL_CYCLES,
- )
- _add_bool_optional_argument(
- quality_group,
- flag="--fail-dead-code",
- help_text=ui.HELP_FAIL_DEAD_CODE,
- )
- quality_group.add_argument(
- "--fail-health",
- type=int,
- nargs="?",
- const=DEFAULT_HEALTH_THRESHOLD,
- default=-1,
- metavar="SCORE_MIN",
- help=ui.HELP_FAIL_HEALTH,
- )
- _add_bool_optional_argument(
- quality_group,
- flag="--fail-on-typing-regression",
- help_text=ui.HELP_FAIL_ON_TYPING_REGRESSION,
- )
- _add_bool_optional_argument(
- quality_group,
- flag="--fail-on-docstring-regression",
- help_text=ui.HELP_FAIL_ON_DOCSTRING_REGRESSION,
- )
- _add_bool_optional_argument(
- quality_group,
- flag="--fail-on-api-break",
- help_text=ui.HELP_FAIL_ON_API_BREAK,
- )
- _add_bool_optional_argument(
- quality_group,
- flag="--fail-on-untested-hotspots",
- help_text=ui.HELP_FAIL_ON_UNTESTED_HOTSPOTS,
- )
- quality_group.add_argument(
- "--min-typing-coverage",
- type=int,
- default=-1,
- metavar="PERCENT",
- help=ui.HELP_MIN_TYPING_COVERAGE,
- )
- quality_group.add_argument(
- "--min-docstring-coverage",
- type=int,
- default=-1,
- metavar="PERCENT",
- help=ui.HELP_MIN_DOCSTRING_COVERAGE,
- )
- quality_group.add_argument(
- "--coverage-min",
- type=int,
- default=50,
- metavar="PERCENT",
- help=ui.HELP_COVERAGE_MIN,
- )
-
- stages_group = ap.add_argument_group("Analysis stages")
- _add_bool_optional_argument(
- stages_group,
- flag="--skip-metrics",
- help_text=ui.HELP_SKIP_METRICS,
- )
- _add_bool_optional_argument(
- stages_group,
- flag="--skip-dead-code",
- help_text=ui.HELP_SKIP_DEAD_CODE,
- )
- _add_bool_optional_argument(
- stages_group,
- flag="--skip-dependencies",
- help_text=ui.HELP_SKIP_DEPENDENCIES,
- )
-
- reporting_group = ap.add_argument_group("Reporting")
- _add_optional_path_argument(
- reporting_group,
- flag="--html",
- dest="html_out",
- const=DEFAULT_HTML_REPORT_PATH,
- help_text=ui.HELP_HTML,
- )
- _add_optional_path_argument(
- reporting_group,
- flag="--json",
- dest="json_out",
- const=DEFAULT_JSON_REPORT_PATH,
- help_text=ui.HELP_JSON,
- )
- _add_optional_path_argument(
- reporting_group,
- flag="--md",
- dest="md_out",
- const=DEFAULT_MARKDOWN_REPORT_PATH,
- help_text=ui.HELP_MD,
- )
- _add_optional_path_argument(
- reporting_group,
- flag="--sarif",
- dest="sarif_out",
- const=DEFAULT_SARIF_REPORT_PATH,
- help_text=ui.HELP_SARIF,
- )
- _add_optional_path_argument(
- reporting_group,
- flag="--text",
- dest="text_out",
- const=DEFAULT_TEXT_REPORT_PATH,
- help_text=ui.HELP_TEXT,
- )
- _add_bool_optional_argument(
- reporting_group,
- flag="--timestamped-report-paths",
- help_text=ui.HELP_TIMESTAMPED_REPORT_PATHS,
- )
-
- ui_group = ap.add_argument_group("Output and UI")
- _add_bool_optional_argument(
- ui_group,
- flag="--open-html-report",
- help_text=ui.HELP_OPEN_HTML_REPORT,
- )
- ui_group.add_argument(
- "--no-progress",
- dest="no_progress",
- action="store_true",
- help=ui.HELP_NO_PROGRESS,
- )
- ui_group.add_argument(
- "--progress",
- dest="no_progress",
- action="store_false",
- help=ui.HELP_PROGRESS,
- )
- ui_group.add_argument(
- "--no-color",
- dest="no_color",
- action="store_true",
- help=ui.HELP_NO_COLOR,
- )
- ui_group.add_argument(
- "--color",
- dest="no_color",
- action="store_false",
- help=ui.HELP_COLOR,
- )
- ui_group.set_defaults(no_progress=False, no_color=False)
- _add_bool_optional_argument(
- ui_group,
- flag="--quiet",
- help_text=ui.HELP_QUIET,
- )
- _add_bool_optional_argument(
- ui_group,
- flag="--verbose",
- help_text=ui.HELP_VERBOSE,
- )
- _add_bool_optional_argument(
- ui_group,
- flag="--debug",
- help_text=ui.HELP_DEBUG,
- )
-
- general_group = ap.add_argument_group("General")
- general_group.add_argument(
- "-h",
- "--help",
- action="help",
- help="Show this help message and exit.",
- )
- general_group.add_argument(
- "--version",
- action="version",
- version=ui.version_output(version),
- help=ui.HELP_VERSION,
- )
-
- return ap
diff --git a/codeclone/_cli_config.py b/codeclone/_cli_config.py
deleted file mode 100644
index b17ba43..0000000
--- a/codeclone/_cli_config.py
+++ /dev/null
@@ -1,303 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# SPDX-License-Identifier: MPL-2.0
-# Copyright (c) 2026 Den Rozhnovskiy
-
-from __future__ import annotations
-
-import importlib
-import sys
-from dataclasses import dataclass
-from pathlib import Path
-from typing import TYPE_CHECKING, Final
-
-from .golden_fixtures import (
- GoldenFixturePatternError,
- normalize_golden_fixture_patterns,
-)
-
-if TYPE_CHECKING:
- import argparse
- from collections.abc import Mapping, Sequence
-
-
-class ConfigValidationError(ValueError):
- """Raised when pyproject.toml contains invalid CodeClone configuration."""
-
-
-@dataclass(frozen=True, slots=True)
-class _ConfigKeySpec:
- expected_type: type[object]
- allow_none: bool = False
- expected_name: str | None = None
-
-
-_CONFIG_KEY_SPECS: Final[dict[str, _ConfigKeySpec]] = {
- "min_loc": _ConfigKeySpec(int),
- "min_stmt": _ConfigKeySpec(int),
- "block_min_loc": _ConfigKeySpec(int),
- "block_min_stmt": _ConfigKeySpec(int),
- "segment_min_loc": _ConfigKeySpec(int),
- "segment_min_stmt": _ConfigKeySpec(int),
- "processes": _ConfigKeySpec(int),
- "cache_path": _ConfigKeySpec(str, allow_none=True),
- "max_cache_size_mb": _ConfigKeySpec(int),
- "baseline": _ConfigKeySpec(str),
- "max_baseline_size_mb": _ConfigKeySpec(int),
- "update_baseline": _ConfigKeySpec(bool),
- "fail_on_new": _ConfigKeySpec(bool),
- "fail_threshold": _ConfigKeySpec(int),
- "ci": _ConfigKeySpec(bool),
- "fail_complexity": _ConfigKeySpec(int),
- "fail_coupling": _ConfigKeySpec(int),
- "fail_cohesion": _ConfigKeySpec(int),
- "fail_cycles": _ConfigKeySpec(bool),
- "fail_dead_code": _ConfigKeySpec(bool),
- "fail_health": _ConfigKeySpec(int),
- "fail_on_new_metrics": _ConfigKeySpec(bool),
- "api_surface": _ConfigKeySpec(bool),
- "coverage_xml": _ConfigKeySpec(str, allow_none=True),
- "fail_on_typing_regression": _ConfigKeySpec(bool),
- "fail_on_docstring_regression": _ConfigKeySpec(bool),
- "fail_on_api_break": _ConfigKeySpec(bool),
- "fail_on_untested_hotspots": _ConfigKeySpec(bool),
- "min_typing_coverage": _ConfigKeySpec(int),
- "min_docstring_coverage": _ConfigKeySpec(int),
- "coverage_min": _ConfigKeySpec(int),
- "update_metrics_baseline": _ConfigKeySpec(bool),
- "metrics_baseline": _ConfigKeySpec(str),
- "skip_metrics": _ConfigKeySpec(bool),
- "skip_dead_code": _ConfigKeySpec(bool),
- "skip_dependencies": _ConfigKeySpec(bool),
- "golden_fixture_paths": _ConfigKeySpec(list, expected_name="list[str]"),
- "html_out": _ConfigKeySpec(str, allow_none=True),
- "json_out": _ConfigKeySpec(str, allow_none=True),
- "md_out": _ConfigKeySpec(str, allow_none=True),
- "sarif_out": _ConfigKeySpec(str, allow_none=True),
- "text_out": _ConfigKeySpec(str, allow_none=True),
- "no_progress": _ConfigKeySpec(bool),
- "no_color": _ConfigKeySpec(bool),
- "quiet": _ConfigKeySpec(bool),
- "verbose": _ConfigKeySpec(bool),
- "debug": _ConfigKeySpec(bool),
-}
-_PATH_CONFIG_KEYS: Final[frozenset[str]] = frozenset(
- {
- "cache_path",
- "baseline",
- "metrics_baseline",
- "coverage_xml",
- "html_out",
- "json_out",
- "md_out",
- "sarif_out",
- "text_out",
- }
-)
-
-
-def collect_explicit_cli_dests(
- parser: argparse.ArgumentParser,
- *,
- argv: Sequence[str],
-) -> set[str]:
- option_to_dest: dict[str, str] = {}
- for action in parser._actions:
- for option in action.option_strings:
- option_to_dest[option] = action.dest
-
- explicit: set[str] = set()
- for token in argv:
- if token == "--":
- break
- if not token.startswith("-"):
- continue
- option = token.split("=", maxsplit=1)[0]
- dest = option_to_dest.get(option)
- if dest is not None:
- explicit.add(dest)
- return explicit
-
-
-def load_pyproject_config(root_path: Path) -> dict[str, object]:
- config_path = root_path / "pyproject.toml"
- if not config_path.exists():
- return {}
-
- payload: object
- try:
- payload = _load_toml(config_path)
- except OSError as exc:
- raise ConfigValidationError(
- f"Cannot read pyproject.toml at {config_path}: {exc}"
- ) from exc
- except ValueError as exc:
- raise ConfigValidationError(f"Invalid TOML in {config_path}: {exc}") from exc
-
- if not isinstance(payload, dict):
- raise ConfigValidationError(
- f"Invalid pyproject payload at {config_path}: root must be object"
- )
-
- tool_obj = payload.get("tool")
- if tool_obj is None:
- return {}
- if not isinstance(tool_obj, dict):
- raise ConfigValidationError(
- f"Invalid pyproject payload at {config_path}: 'tool' must be object"
- )
-
- codeclone_obj = tool_obj.get("codeclone")
- if codeclone_obj is None:
- return {}
- if not isinstance(codeclone_obj, dict):
- raise ConfigValidationError(
- "Invalid pyproject payload at "
- f"{config_path}: 'tool.codeclone' must be object"
- )
-
- unknown = sorted(set(codeclone_obj.keys()) - set(_CONFIG_KEY_SPECS))
- if unknown:
- raise ConfigValidationError(
- "Unknown key(s) in tool.codeclone: " + ", ".join(unknown)
- )
-
- validated: dict[str, object] = {}
- for key in sorted(codeclone_obj.keys()):
- value = _validate_config_value(
- key=key,
- value=codeclone_obj[key],
- )
- validated[key] = _normalize_path_config_value(
- key=key,
- value=value,
- root_path=root_path,
- )
- return validated
-
-
-def apply_pyproject_config_overrides(
- *,
- args: argparse.Namespace,
- config_values: Mapping[str, object],
- explicit_cli_dests: set[str],
-) -> None:
- for key, value in config_values.items():
- if key in explicit_cli_dests:
- continue
- setattr(args, key, value)
-
-
-def _validate_config_value(*, key: str, value: object) -> object:
- spec = _CONFIG_KEY_SPECS[key]
- if value is None:
- if spec.allow_none:
- return None
- raise ConfigValidationError(
- "Invalid value type for tool.codeclone."
- f"{key}: expected {spec.expected_name or spec.expected_type.__name__}"
- )
-
- expected_type = spec.expected_type
- if expected_type is bool:
- return _validated_config_instance(
- key=key,
- value=value,
- expected_type=bool,
- expected_name="bool",
- )
-
- if expected_type is int:
- return _validated_config_instance(
- key=key,
- value=value,
- expected_type=int,
- expected_name="int",
- reject_bool=True,
- )
-
- if expected_type is str:
- return _validated_config_instance(
- key=key,
- value=value,
- expected_type=str,
- expected_name="str",
- )
- if expected_type is list:
- return _validated_string_list(key=key, value=value)
-
- raise ConfigValidationError(f"Unsupported config key spec for tool.codeclone.{key}")
-
-
-def _validated_config_instance(
- *,
- key: str,
- value: object,
- expected_type: type[object],
- expected_name: str,
- reject_bool: bool = False,
-) -> object:
- if isinstance(value, expected_type) and (
- not reject_bool or not isinstance(value, bool)
- ):
- return value
- raise ConfigValidationError(
- f"Invalid value type for tool.codeclone.{key}: expected {expected_name}"
- )
-
-
-def _validated_string_list(*, key: str, value: object) -> tuple[str, ...]:
- if not isinstance(value, list):
- raise ConfigValidationError(
- f"Invalid value type for tool.codeclone.{key}: expected list[str]"
- )
- if not all(isinstance(item, str) for item in value):
- raise ConfigValidationError(
- f"Invalid value type for tool.codeclone.{key}: expected list[str]"
- )
- try:
- return normalize_golden_fixture_patterns(value)
- except GoldenFixturePatternError as exc:
- raise ConfigValidationError(str(exc)) from exc
-
-
-def _load_toml(path: Path) -> object:
- if sys.version_info >= (3, 11):
- import tomllib
-
- with path.open("rb") as config_file:
- return tomllib.load(config_file)
- else:
- try:
- tomli_module = importlib.import_module("tomli")
- except ModuleNotFoundError as exc:
- raise ConfigValidationError(
- "Python 3.10 requires dependency 'tomli' to read pyproject.toml."
- ) from exc
-
- load_fn = getattr(tomli_module, "load", None)
- if not callable(load_fn):
- raise ConfigValidationError(
- "Invalid 'tomli' module: missing callable 'load'."
- )
-
- with path.open("rb") as config_file:
- return load_fn(config_file)
-
-
-def _normalize_path_config_value(
- *,
- key: str,
- value: object,
- root_path: Path,
-) -> object:
- if key not in _PATH_CONFIG_KEYS:
- return value
- if not isinstance(value, str):
- return value
-
- path = Path(value).expanduser()
- if path.is_absolute():
- return str(path)
- return str(root_path / path)
diff --git a/codeclone/_cli_paths.py b/codeclone/_cli_paths.py
deleted file mode 100644
index 3577dc0..0000000
--- a/codeclone/_cli_paths.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# SPDX-License-Identifier: MPL-2.0
-# Copyright (c) 2026 Den Rozhnovskiy
-
-from __future__ import annotations
-
-import sys
-from pathlib import Path
-from typing import TYPE_CHECKING, Protocol
-
-from .contracts import ExitCode
-from .ui_messages import fmt_contract_error
-
-if TYPE_CHECKING:
- from collections.abc import Callable
-
-
-class _Printer(Protocol):
- def print(self, *objects: object, **kwargs: object) -> None: ...
-
-
-def _validate_output_path(
- path: str,
- *,
- expected_suffix: str,
- label: str,
- console: _Printer,
- invalid_message: Callable[..., str],
- invalid_path_message: Callable[..., str],
-) -> Path:
- out = Path(path).expanduser()
- if out.suffix.lower() != expected_suffix:
- console.print(
- fmt_contract_error(
- invalid_message(label=label, path=out, expected_suffix=expected_suffix)
- )
- )
- sys.exit(ExitCode.CONTRACT_ERROR)
- try:
- return out.resolve()
- except OSError as e:
- console.print(
- fmt_contract_error(invalid_path_message(label=label, path=out, error=e))
- )
- sys.exit(ExitCode.CONTRACT_ERROR)
diff --git a/codeclone/_cli_reports.py b/codeclone/_cli_reports.py
deleted file mode 100644
index 126879c..0000000
--- a/codeclone/_cli_reports.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# SPDX-License-Identifier: MPL-2.0
-# Copyright (c) 2026 Den Rozhnovskiy
-
-from __future__ import annotations
-
-import sys
-import webbrowser
-from pathlib import Path
-from typing import Protocol
-
-from . import ui_messages as ui
-from .contracts import ExitCode
-
-__all__ = ["write_report_outputs"]
-
-
-class _PrinterLike(Protocol):
- def print(self, *objects: object, **kwargs: object) -> None: ...
-
-
-class _QuietArgs(Protocol):
- quiet: bool
-
-
-def _path_attr(obj: object, name: str) -> Path | None:
- value = getattr(obj, name, None)
- return value if isinstance(value, Path) else None
-
-
-def _text_attr(obj: object, name: str) -> str | None:
- value = getattr(obj, name, None)
- return value if isinstance(value, str) else None
-
-
-def _write_report_output(
- *,
- out: Path,
- content: str,
- label: str,
- console: _PrinterLike,
-) -> None:
- try:
- out.parent.mkdir(parents=True, exist_ok=True)
- out.write_text(content, "utf-8")
- except OSError as exc:
- console.print(
- ui.fmt_contract_error(
- ui.fmt_report_write_failed(label=label, path=out, error=exc)
- )
- )
- sys.exit(ExitCode.CONTRACT_ERROR)
-
-
-def _open_html_report_in_browser(*, path: Path) -> None:
- if not webbrowser.open_new_tab(path.as_uri()):
- raise OSError("no browser handler available")
-
-
-def write_report_outputs(
- *,
- args: _QuietArgs,
- output_paths: object,
- report_artifacts: object,
- console: _PrinterLike,
- open_html_report: bool = False,
-) -> str | None:
- html_report_path: str | None = None
- saved_reports: list[tuple[str, Path]] = []
- html_path = _path_attr(output_paths, "html")
- json_path = _path_attr(output_paths, "json")
- md_path = _path_attr(output_paths, "md")
- sarif_path = _path_attr(output_paths, "sarif")
- text_path = _path_attr(output_paths, "text")
- html_report = _text_attr(report_artifacts, "html")
- json_report = _text_attr(report_artifacts, "json")
- md_report = _text_attr(report_artifacts, "md")
- sarif_report = _text_attr(report_artifacts, "sarif")
- text_report = _text_attr(report_artifacts, "text")
-
- if html_path and html_report is not None:
- out = html_path
- _write_report_output(
- out=out,
- content=html_report,
- label="HTML",
- console=console,
- )
- html_report_path = str(out)
- saved_reports.append(("HTML", out))
-
- if json_path and json_report is not None:
- out = json_path
- _write_report_output(
- out=out,
- content=json_report,
- label="JSON",
- console=console,
- )
- saved_reports.append(("JSON", out))
-
- if md_path and md_report is not None:
- out = md_path
- _write_report_output(
- out=out,
- content=md_report,
- label="Markdown",
- console=console,
- )
- saved_reports.append(("Markdown", out))
-
- if sarif_path and sarif_report is not None:
- out = sarif_path
- _write_report_output(
- out=out,
- content=sarif_report,
- label="SARIF",
- console=console,
- )
- saved_reports.append(("SARIF", out))
-
- if text_path and text_report is not None:
- out = text_path
- _write_report_output(
- out=out,
- content=text_report,
- label="text",
- console=console,
- )
- saved_reports.append(("Text", out))
-
- if saved_reports and not args.quiet:
- cwd = Path.cwd()
- console.print()
- for label, path in saved_reports:
- try:
- display = path.relative_to(cwd)
- except ValueError:
- display = path
- console.print(f" [bold]{label} report saved:[/bold] [dim]{display}[/dim]")
-
- if open_html_report and html_path is not None:
- try:
- _open_html_report_in_browser(path=html_path)
- except Exception as exc:
- console.print(ui.fmt_html_report_open_failed(path=html_path, error=exc))
-
- return html_report_path
diff --git a/codeclone/_cli_runtime.py b/codeclone/_cli_runtime.py
deleted file mode 100644
index 28ca869..0000000
--- a/codeclone/_cli_runtime.py
+++ /dev/null
@@ -1,222 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# SPDX-License-Identifier: MPL-2.0
-# Copyright (c) 2026 Den Rozhnovskiy
-
-from __future__ import annotations
-
-import sys
-from pathlib import Path
-from typing import Protocol
-
-from . import ui_messages as ui
-from .cache import CacheStatus
-from .contracts import ExitCode
-
-__all__ = [
- "configure_metrics_mode",
- "metrics_computed",
- "print_failed_files",
- "resolve_cache_path",
- "resolve_cache_status",
- "validate_numeric_args",
-]
-
-
-class _RuntimeArgs(Protocol):
- cache_path: str | None
- coverage_xml: str | None
- max_baseline_size_mb: int
- max_cache_size_mb: int
- fail_threshold: int
- fail_complexity: int
- fail_coupling: int
- fail_cohesion: int
- fail_health: int
- fail_on_new_metrics: bool
- fail_on_typing_regression: bool
- fail_on_docstring_regression: bool
- fail_on_api_break: bool
- fail_on_untested_hotspots: bool
- min_typing_coverage: int
- min_docstring_coverage: int
- coverage_min: int
- api_surface: bool
- update_metrics_baseline: bool
- skip_metrics: bool
- fail_cycles: bool
- fail_dead_code: bool
- skip_dead_code: bool
- skip_dependencies: bool
-
-
-class _PrinterLike(Protocol):
- def print(self, *objects: object, **kwargs: object) -> None: ...
-
-
-class _CacheLike(Protocol):
- @property
- def load_status(self) -> CacheStatus | str | None: ...
-
- @property
- def load_warning(self) -> str | None: ...
-
- @property
- def cache_schema_version(self) -> str | None: ...
-
-
-def validate_numeric_args(args: _RuntimeArgs) -> bool:
- return bool(
- not (
- args.max_baseline_size_mb < 0
- or args.max_cache_size_mb < 0
- or args.fail_threshold < -1
- or args.fail_complexity < -1
- or args.fail_coupling < -1
- or args.fail_cohesion < -1
- or args.fail_health < -1
- or args.min_typing_coverage < -1
- or args.min_typing_coverage > 100
- or args.min_docstring_coverage < -1
- or args.min_docstring_coverage > 100
- or args.coverage_min < 0
- or args.coverage_min > 100
- )
- )
-
-
-def _metrics_flags_requested(args: _RuntimeArgs) -> bool:
- return bool(
- args.fail_complexity >= 0
- or args.fail_coupling >= 0
- or args.fail_cohesion >= 0
- or args.fail_cycles
- or args.fail_dead_code
- or args.fail_health >= 0
- or args.fail_on_new_metrics
- or args.fail_on_typing_regression
- or args.fail_on_docstring_regression
- or args.fail_on_api_break
- or args.fail_on_untested_hotspots
- or args.min_typing_coverage >= 0
- or args.min_docstring_coverage >= 0
- or args.api_surface
- or args.update_metrics_baseline
- or bool(getattr(args, "coverage_xml", None))
- )
-
-
-def configure_metrics_mode(
- *,
- args: _RuntimeArgs,
- metrics_baseline_exists: bool,
- console: _PrinterLike,
-) -> None:
- metrics_flags_requested = _metrics_flags_requested(args)
-
- if args.skip_metrics and metrics_flags_requested:
- console.print(
- ui.fmt_contract_error(
- "--skip-metrics cannot be used together with metrics gating/update "
- "flags."
- )
- )
- sys.exit(ExitCode.CONTRACT_ERROR)
-
- if (
- not args.skip_metrics
- and not metrics_flags_requested
- and not metrics_baseline_exists
- ):
- args.skip_metrics = True
-
- if args.skip_metrics:
- args.skip_dead_code = True
- args.skip_dependencies = True
- return
-
- if args.fail_dead_code:
- args.skip_dead_code = False
- if args.fail_cycles:
- args.skip_dependencies = False
- if bool(getattr(args, "fail_on_api_break", False)):
- args.api_surface = True
-
-
-def resolve_cache_path(
- *,
- root_path: Path,
- args: _RuntimeArgs,
- from_args: bool,
- legacy_cache_path: Path,
- console: _PrinterLike,
-) -> Path:
- if from_args and args.cache_path:
- return Path(args.cache_path).expanduser()
-
- cache_path = root_path / ".cache" / "codeclone" / "cache.json"
- if legacy_cache_path.exists():
- try:
- legacy_resolved = legacy_cache_path.resolve()
- except OSError:
- legacy_resolved = legacy_cache_path
- if legacy_resolved != cache_path:
- console.print(
- ui.fmt_legacy_cache_warning(
- legacy_path=legacy_resolved,
- new_path=cache_path,
- )
- )
- return cache_path
-
-
-def metrics_computed(args: _RuntimeArgs) -> tuple[str, ...]:
- if args.skip_metrics:
- return ()
-
- computed = ["complexity", "coupling", "cohesion", "health"]
- if not args.skip_dependencies:
- computed.append("dependencies")
- if not args.skip_dead_code:
- computed.append("dead_code")
- computed.append("coverage_adoption")
- if bool(getattr(args, "api_surface", False)):
- computed.append("api_surface")
- if bool(getattr(args, "coverage_xml", None)):
- computed.append("coverage_join")
- return tuple(computed)
-
-
-def resolve_cache_status(cache: _CacheLike) -> tuple[CacheStatus, str | None]:
- raw_cache_status = getattr(cache, "load_status", None)
- load_warning = getattr(cache, "load_warning", None)
- if isinstance(raw_cache_status, CacheStatus):
- cache_status = raw_cache_status
- elif isinstance(raw_cache_status, str):
- try:
- cache_status = CacheStatus(raw_cache_status)
- except ValueError:
- cache_status = (
- CacheStatus.OK if load_warning is None else CacheStatus.INVALID_TYPE
- )
- else:
- cache_status = (
- CacheStatus.OK if load_warning is None else CacheStatus.INVALID_TYPE
- )
-
- raw_cache_schema_version = getattr(cache, "cache_schema_version", None)
- cache_schema_version = (
- raw_cache_schema_version if isinstance(raw_cache_schema_version, str) else None
- )
- return cache_status, cache_schema_version
-
-
-def print_failed_files(*, failed_files: tuple[str, ...], console: _PrinterLike) -> None:
- if not failed_files:
- return
- console.print(ui.fmt_failed_files_header(len(failed_files)))
- for failure in failed_files[:10]:
- console.print(f" • {failure}")
- if len(failed_files) > 10:
- console.print(f" ... and {len(failed_files) - 10} more")
diff --git a/codeclone/analysis/__init__.py b/codeclone/analysis/__init__.py
new file mode 100644
index 0000000..a521754
--- /dev/null
+++ b/codeclone/analysis/__init__.py
@@ -0,0 +1,22 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+
+from __future__ import annotations
+
+from .cfg import CFG, CFGBuilder
+from .fingerprint import bucket_loc, sha1
+from .normalizer import AstNormalizer, NormalizationConfig, stmt_hashes
+from .units import extract_units_and_stats_from_source
+
+__all__ = [
+ "CFG",
+ "AstNormalizer",
+ "CFGBuilder",
+ "NormalizationConfig",
+ "bucket_loc",
+ "extract_units_and_stats_from_source",
+ "sha1",
+ "stmt_hashes",
+]
diff --git a/codeclone/analysis/_module_walk.py b/codeclone/analysis/_module_walk.py
new file mode 100644
index 0000000..bba111e
--- /dev/null
+++ b/codeclone/analysis/_module_walk.py
@@ -0,0 +1,553 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+import ast
+import tokenize
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Literal, NamedTuple
+
+from .. import qualnames as _qualnames
+from ..models import DeadCandidate, ModuleDep
+from .class_metrics import _node_line_span
+from .parser import (
+ _build_declaration_token_index,
+ _declaration_end_line,
+ _DeclarationTokenIndexKey,
+ _source_tokens,
+)
+from .suppressions import (
+ DeclarationTarget,
+ bind_suppressions_to_declarations,
+ build_suppression_index,
+ extract_suppression_directives,
+ suppression_target_key,
+)
+
+if TYPE_CHECKING:
+ from collections.abc import Mapping
+
+ from .suppressions import SuppressionTargetKey
+
+
+_NamedDeclarationNode = _qualnames.FunctionNode | ast.ClassDef
+_PROTOCOL_MODULE_NAMES = frozenset({"typing", "typing_extensions"})
+
+
+def _resolve_import_target(
+ module_name: str,
+ import_node: ast.ImportFrom,
+) -> str:
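+ # e.g. in module "pkg.sub.mod", `from ..util import x` (level=2) resolves
+ # to "pkg.util", and a bare `from .. import x` resolves to "pkg".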
+ if import_node.level <= 0:
+ return import_node.module or ""
+
+ parent_parts = module_name.split(".")
+ keep = max(0, len(parent_parts) - import_node.level)
+ prefix = parent_parts[:keep]
+ if import_node.module:
+ return ".".join([*prefix, import_node.module])
+ return ".".join(prefix)
+
+
+@dataclass(slots=True)
+class _ModuleWalkState:
+ import_names: set[str] = field(default_factory=set)
+ deps: list[ModuleDep] = field(default_factory=list)
+ referenced_names: set[str] = field(default_factory=set)
+ imported_symbol_bindings: dict[str, set[str]] = field(default_factory=dict)
+ imported_module_aliases: dict[str, str] = field(default_factory=dict)
+ name_nodes: list[ast.Name] = field(default_factory=list)
+ attr_nodes: list[ast.Attribute] = field(default_factory=list)
+ protocol_symbol_aliases: set[str] = field(default_factory=lambda: {"Protocol"})
+ protocol_module_aliases: set[str] = field(
+ default_factory=lambda: set(_PROTOCOL_MODULE_NAMES)
+ )
+
+
+def _append_module_dep(
+ *,
+ module_name: str,
+ target: str,
+ import_type: Literal["import", "from_import"],
+ line: int,
+ state: _ModuleWalkState,
+) -> None:
+ state.deps.append(
+ ModuleDep(
+ source=module_name,
+ target=target,
+ import_type=import_type,
+ line=line,
+ )
+ )
+
+
+def _collect_import_node(
+ *,
+ node: ast.Import,
+ module_name: str,
+ state: _ModuleWalkState,
+ collect_referenced_names: bool,
+) -> None:
+ line = int(getattr(node, "lineno", 0))
+ for alias in node.names:
+ alias_name = alias.asname or alias.name.split(".", 1)[0]
+ state.import_names.add(alias_name)
+ _append_module_dep(
+ module_name=module_name,
+ target=alias.name,
+ import_type="import",
+ line=line,
+ state=state,
+ )
+ if collect_referenced_names:
+ state.imported_module_aliases[alias_name] = alias.name
+ if alias.name in _PROTOCOL_MODULE_NAMES:
+ state.protocol_module_aliases.add(alias_name)
+
+
+def _dotted_expr_name(expr: ast.expr) -> str | None:
+ if isinstance(expr, ast.Name):
+ return expr.id
+ if isinstance(expr, ast.Attribute):
+ prefix = _dotted_expr_name(expr.value)
+ if prefix is None:
+ return None
+ return f"{prefix}.{expr.attr}"
+ return None
+
+
+def _collect_import_from_node(
+ *,
+ node: ast.ImportFrom,
+ module_name: str,
+ state: _ModuleWalkState,
+ collect_referenced_names: bool,
+) -> None:
+ target = _resolve_import_target(module_name, node)
+ if target:
+ state.import_names.add(target.split(".", 1)[0])
+ _append_module_dep(
+ module_name=module_name,
+ target=target,
+ import_type="from_import",
+ line=int(getattr(node, "lineno", 0)),
+ state=state,
+ )
+
+ if node.module in _PROTOCOL_MODULE_NAMES:
+ for alias in node.names:
+ if alias.name == "Protocol":
+ state.protocol_symbol_aliases.add(alias.asname or alias.name)
+
+ if not collect_referenced_names or not target:
+ return
+
+ for alias in node.names:
+ if alias.name == "*":
+ continue
+ alias_name = alias.asname or alias.name
+ state.imported_symbol_bindings.setdefault(alias_name, set()).add(
+ f"{target}:{alias.name}"
+ )
+
+
+def _collect_load_reference_node(
+ *,
+ node: ast.AST,
+ state: _ModuleWalkState,
+) -> None:
+ if isinstance(node, ast.Name) and isinstance(node.ctx, ast.Load):
+ state.referenced_names.add(node.id)
+ state.name_nodes.append(node)
+ return
+ if isinstance(node, ast.Attribute) and isinstance(node.ctx, ast.Load):
+ state.referenced_names.add(node.attr)
+ state.attr_nodes.append(node)
+
+
+def _is_protocol_class(
+ class_node: ast.ClassDef,
+ *,
+ protocol_symbol_aliases: frozenset[str],
+ protocol_module_aliases: frozenset[str],
+) -> bool:
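+ # e.g. `class Reader(Protocol)` matches via a symbol alias; with
+ # `import typing as t`, `class Reader(t.Protocol)` matches via the module alias.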
+ for base in class_node.bases:
+ base_name = _dotted_expr_name(base)
+ if base_name is None:
+ continue
+ if base_name in protocol_symbol_aliases:
+ return True
+ if "." in base_name and base_name.rsplit(".", 1)[-1] == "Protocol":
+ module_alias = base_name.rsplit(".", 1)[0]
+ if module_alias in protocol_module_aliases:
+ return True
+ return False
+
+
+def _is_non_runtime_candidate(node: _qualnames.FunctionNode) -> bool:
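+ # e.g. `@overload` stubs and `@abc.abstractmethod` declarations are not
+ # ordinary runtime call targets, so they are excluded from dead-code candidates.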
+ for decorator in node.decorator_list:
+ name = _dotted_expr_name(decorator)
+ if name is None:
+ continue
+ terminal = name.rsplit(".", 1)[-1]
+ if terminal in {"overload", "abstractmethod"}:
+ return True
+ return False
+
+
+def _dead_candidate_kind(local_name: str) -> Literal["function", "method"]:
+ return "method" if "." in local_name else "function"
+
+
+def _should_skip_dead_candidate(
+ local_name: str,
+ node: _qualnames.FunctionNode,
+ *,
+ protocol_class_qualnames: set[str],
+) -> bool:
+ if _is_non_runtime_candidate(node):
+ return True
+ if "." not in local_name:
+ return False
+ owner_qualname = local_name.rsplit(".", 1)[0]
+ return owner_qualname in protocol_class_qualnames
+
+
+def _build_dead_candidate(
+ *,
+ module_name: str,
+ local_name: str,
+ node: _NamedDeclarationNode,
+ filepath: str,
+ kind: Literal["class", "function", "method"],
+ suppression_index: Mapping[SuppressionTargetKey, tuple[str, ...]],
+ start_line: int,
+ end_line: int,
+) -> DeadCandidate:
+ qualname = f"{module_name}:{local_name}"
+ return DeadCandidate(
+ qualname=qualname,
+ local_name=node.name,
+ filepath=filepath,
+ start_line=start_line,
+ end_line=end_line,
+ kind=kind,
+ suppressed_rules=suppression_index.get(
+ suppression_target_key(
+ filepath=filepath,
+ qualname=qualname,
+ start_line=start_line,
+ end_line=end_line,
+ kind=kind,
+ ),
+ (),
+ ),
+ )
+
+
+def _dead_candidate_for_unit(
+ *,
+ module_name: str,
+ local_name: str,
+ node: _qualnames.FunctionNode,
+ filepath: str,
+ suppression_index: Mapping[SuppressionTargetKey, tuple[str, ...]],
+ protocol_class_qualnames: set[str],
+) -> DeadCandidate | None:
+ span = _node_line_span(node)
+ if span is None:
+ return None
+ if _should_skip_dead_candidate(
+ local_name,
+ node,
+ protocol_class_qualnames=protocol_class_qualnames,
+ ):
+ return None
+ start, end = span
+ return _build_dead_candidate(
+ module_name=module_name,
+ local_name=local_name,
+ node=node,
+ filepath=filepath,
+ kind=_dead_candidate_kind(local_name),
+ suppression_index=suppression_index,
+ start_line=start,
+ end_line=end,
+ )
+
+
+def _resolve_referenced_qualnames(
+ *,
+ module_name: str,
+ collector: _qualnames.QualnameCollector,
+ state: _ModuleWalkState,
+) -> frozenset[str]:
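+ # e.g. `from pkg.mod import helper` plus a `helper()` call resolves to
+ # "pkg.mod:helper"; `Config.load(...)` on a local top-level class resolves to
+ # "<module>:Config.load" when that method is defined in this module.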
+ top_level_class_by_name = {
+ class_qualname: class_qualname
+ for class_qualname, _class_node in collector.class_nodes
+ if "." not in class_qualname
+ }
+ local_method_qualnames = frozenset(
+ f"{module_name}:{local_name}"
+ for local_name, _node in collector.units
+ if "." in local_name
+ )
+
+ resolved: set[str] = set()
+ for name_node in state.name_nodes:
+ for qualname in state.imported_symbol_bindings.get(name_node.id, ()):
+ resolved.add(qualname)
+
+ for attr_node in state.attr_nodes:
+ base = attr_node.value
+ if isinstance(base, ast.Name):
+ imported_module = state.imported_module_aliases.get(base.id)
+ if imported_module is not None:
+ resolved.add(f"{imported_module}:{attr_node.attr}")
+ else:
+ class_qualname = top_level_class_by_name.get(base.id)
+ if class_qualname is not None:
+ local_method_qualname = (
+ f"{module_name}:{class_qualname}.{attr_node.attr}"
+ )
+ if local_method_qualname in local_method_qualnames:
+ resolved.add(local_method_qualname)
+
+ return frozenset(resolved)
+
+
+class _ModuleWalkResult(NamedTuple):
+ import_names: frozenset[str]
+ module_deps: tuple[ModuleDep, ...]
+ referenced_names: frozenset[str]
+ referenced_qualnames: frozenset[str]
+ protocol_symbol_aliases: frozenset[str]
+ protocol_module_aliases: frozenset[str]
+
+
+def _collect_module_walk_data(
+ *,
+ tree: ast.AST,
+ module_name: str,
+ collector: _qualnames.QualnameCollector,
+ collect_referenced_names: bool,
+) -> _ModuleWalkResult:
+ """Single ast.walk that collects imports, deps, names, qualnames & protocol aliases.
+
+ Reduces the hot path to one tree walk plus one local qualname resolution phase.
+ """
+ state = _ModuleWalkState()
+ for node in ast.walk(tree):
+ if isinstance(node, ast.Import):
+ _collect_import_node(
+ node=node,
+ module_name=module_name,
+ state=state,
+ collect_referenced_names=collect_referenced_names,
+ )
+ elif isinstance(node, ast.ImportFrom):
+ _collect_import_from_node(
+ node=node,
+ module_name=module_name,
+ state=state,
+ collect_referenced_names=collect_referenced_names,
+ )
+ elif collect_referenced_names:
+ _collect_load_reference_node(node=node, state=state)
+
+ deps_sorted = tuple(
+ sorted(
+ state.deps,
+ key=lambda dep: (dep.source, dep.target, dep.import_type, dep.line),
+ )
+ )
+ resolved = (
+ _resolve_referenced_qualnames(
+ module_name=module_name,
+ collector=collector,
+ state=state,
+ )
+ if collect_referenced_names
+ else frozenset()
+ )
+
+ return _ModuleWalkResult(
+ import_names=frozenset(state.import_names),
+ module_deps=deps_sorted,
+ referenced_names=frozenset(state.referenced_names),
+ referenced_qualnames=resolved,
+ protocol_symbol_aliases=frozenset(state.protocol_symbol_aliases),
+ protocol_module_aliases=frozenset(state.protocol_module_aliases),
+ )
+
+
+def _collect_dead_candidates(
+ *,
+ filepath: str,
+ module_name: str,
+ collector: _qualnames.QualnameCollector,
+ protocol_symbol_aliases: frozenset[str] = frozenset({"Protocol"}),
+ protocol_module_aliases: frozenset[str] = _PROTOCOL_MODULE_NAMES,
+ suppression_rules_by_target: Mapping[SuppressionTargetKey, tuple[str, ...]]
+ | None = None,
+) -> tuple[DeadCandidate, ...]:
+ protocol_class_qualnames = {
+ class_qualname
+ for class_qualname, class_node in collector.class_nodes
+ if _is_protocol_class(
+ class_node,
+ protocol_symbol_aliases=protocol_symbol_aliases,
+ protocol_module_aliases=protocol_module_aliases,
+ )
+ }
+
+ candidates: list[DeadCandidate] = []
+ suppression_index = (
+ suppression_rules_by_target if suppression_rules_by_target is not None else {}
+ )
+ for local_name, node in collector.units:
+ candidate = _dead_candidate_for_unit(
+ module_name=module_name,
+ local_name=local_name,
+ node=node,
+ filepath=filepath,
+ suppression_index=suppression_index,
+ protocol_class_qualnames=protocol_class_qualnames,
+ )
+ if candidate is not None:
+ candidates.append(candidate)
+
+ for class_qualname, class_node in collector.class_nodes:
+ span = _node_line_span(class_node)
+ if span is not None:
+ start, end = span
+ candidates.append(
+ _build_dead_candidate(
+ module_name=module_name,
+ local_name=class_qualname,
+ node=class_node,
+ filepath=filepath,
+ kind="class",
+ suppression_index=suppression_index,
+ start_line=start,
+ end_line=end,
+ )
+ )
+
+ return tuple(
+ sorted(
+ candidates,
+ key=lambda item: (
+ item.filepath,
+ item.start_line,
+ item.end_line,
+ item.qualname,
+ ),
+ )
+ )
+
+
+def _collect_declaration_targets(
+ *,
+ filepath: str,
+ module_name: str,
+ collector: _qualnames.QualnameCollector,
+ source_tokens: tuple[tokenize.TokenInfo, ...] = (),
+ source_token_index: Mapping[_DeclarationTokenIndexKey, int] | None = None,
+ include_inline_lines: bool = False,
+) -> tuple[DeclarationTarget, ...]:
+ declarations: list[DeclarationTarget] = []
+ declaration_specs: list[
+ tuple[str, ast.AST, Literal["function", "method", "class"]]
+ ] = [
+ (
+ local_name,
+ node,
+ "method" if "." in local_name else "function",
+ )
+ for local_name, node in collector.units
+ ]
+ declaration_specs.extend(
+ (class_qualname, class_node, "class")
+ for class_qualname, class_node in collector.class_nodes
+ )
+
+ for qualname_suffix, node, kind in declaration_specs:
+ start = int(getattr(node, "lineno", 0))
+ end = int(getattr(node, "end_lineno", 0))
+ if start > 0 and end > 0:
+ declaration_end_line = (
+ _declaration_end_line(
+ node,
+ source_tokens=source_tokens,
+ source_token_index=source_token_index,
+ )
+ if include_inline_lines
+ else None
+ )
+ declarations.append(
+ DeclarationTarget(
+ filepath=filepath,
+ qualname=f"{module_name}:{qualname_suffix}",
+ start_line=start,
+ end_line=end,
+ kind=kind,
+ declaration_end_line=declaration_end_line,
+ )
+ )
+
+ return tuple(
+ sorted(
+ declarations,
+ key=lambda item: (
+ item.filepath,
+ item.start_line,
+ item.end_line,
+ item.qualname,
+ item.kind,
+ ),
+ )
+ )
+
+
+def _build_suppression_index_for_source(
+ *,
+ source: str,
+ filepath: str,
+ module_name: str,
+ collector: _qualnames.QualnameCollector,
+) -> Mapping[SuppressionTargetKey, tuple[str, ...]]:
+ suppression_directives = extract_suppression_directives(source)
+ if not suppression_directives:
+ return {}
+
+ needs_inline_binding = any(
+ directive.binding == "inline" for directive in suppression_directives
+ )
+ source_tokens: tuple[tokenize.TokenInfo, ...] = ()
+ source_token_index: Mapping[_DeclarationTokenIndexKey, int] | None = None
+ if needs_inline_binding:
+ source_tokens = _source_tokens(source)
+ if source_tokens:
+ source_token_index = _build_declaration_token_index(source_tokens)
+
+ declaration_targets = _collect_declaration_targets(
+ filepath=filepath,
+ module_name=module_name,
+ collector=collector,
+ source_tokens=source_tokens,
+ source_token_index=source_token_index,
+ include_inline_lines=needs_inline_binding,
+ )
+ suppression_bindings = bind_suppressions_to_declarations(
+ directives=suppression_directives,
+ declarations=declaration_targets,
+ )
+ return build_suppression_index(suppression_bindings)
diff --git a/codeclone/cfg.py b/codeclone/analysis/cfg.py
similarity index 92%
rename from codeclone/cfg.py
rename to codeclone/analysis/cfg.py
index f10811f..5da1933 100644
--- a/codeclone/cfg.py
+++ b/codeclone/analysis/cfg.py
@@ -8,10 +8,10 @@
import ast
from dataclasses import dataclass
-from typing import TYPE_CHECKING, Protocol, cast
+from typing import TYPE_CHECKING
+from ..meta_markers import CFG_META_PREFIX
from .cfg_model import CFG, Block
-from .meta_markers import CFG_META_PREFIX
if TYPE_CHECKING:
from collections.abc import Iterable
@@ -21,13 +21,6 @@
TryStar = getattr(ast, "TryStar", ast.Try)
-class _TryLike(Protocol):
- body: list[ast.stmt]
- handlers: list[ast.ExceptHandler]
- orelse: list[ast.stmt]
- finalbody: list[ast.stmt]
-
-
@dataclass(slots=True)
class _LoopContext:
continue_target: Block
@@ -105,9 +98,19 @@ def _visit(self, stmt: ast.stmt) -> None:
self._visit_for(stmt) # Structure is identical to For
case ast.Try():
- self._visit_try(cast("_TryLike", stmt))
+ self._visit_try(
+ body=stmt.body,
+ handlers=stmt.handlers,
+ orelse=stmt.orelse,
+ finalbody=stmt.finalbody,
+ )
case _ if TryStar is not None and isinstance(stmt, TryStar):
- self._visit_try(cast("_TryLike", cast("object", stmt)))
+ self._visit_try(
+ body=stmt.body,
+ handlers=stmt.handlers,
+ orelse=stmt.orelse,
+ finalbody=stmt.finalbody,
+ )
case ast.With() | ast.AsyncWith():
self._visit_with(stmt)
@@ -261,18 +264,25 @@ def _visit_with(self, stmt: ast.With | ast.AsyncWith) -> None:
self.current = after_block
- def _visit_try(self, stmt: _TryLike) -> None:
+ def _visit_try(
+ self,
+ *,
+ body: list[ast.stmt],
+ handlers: list[ast.ExceptHandler],
+ orelse: list[ast.stmt],
+ finalbody: list[ast.stmt],
+ ) -> None:
try_entry = self.cfg.create_block()
self.current.add_successor(try_entry)
self.current = try_entry
- handler_test_blocks = [self.cfg.create_block() for _ in stmt.handlers]
- handler_body_blocks = [self.cfg.create_block() for _ in stmt.handlers]
- else_block = self.cfg.create_block() if stmt.orelse else None
+ handler_test_blocks = [self.cfg.create_block() for _ in handlers]
+ handler_body_blocks = [self.cfg.create_block() for _ in handlers]
+ else_block = self.cfg.create_block() if orelse else None
final_block = self.cfg.create_block()
for idx, (handler, test_block, body_block) in enumerate(
- zip(stmt.handlers, handler_test_blocks, handler_body_blocks, strict=True)
+ zip(handlers, handler_test_blocks, handler_body_blocks, strict=True)
):
test_block.statements.append(_meta_expr(f"TRY_HANDLER_INDEX:{idx}"))
if handler.type is not None:
@@ -290,7 +300,7 @@ def _visit_try(self, stmt: _TryLike) -> None:
# Process each statement in try body
# Link only statements that can raise to exception handlers
- for stmt_node in stmt.body:
+ for stmt_node in body:
if self.current.is_terminated:
break
@@ -307,7 +317,7 @@ def _visit_try(self, stmt: _TryLike) -> None:
self.current.add_successor(final_block)
# Process handlers
- for handler, body_block in zip(stmt.handlers, handler_body_blocks, strict=True):
+ for handler, body_block in zip(handlers, handler_body_blocks, strict=True):
self.current = body_block
self._visit_statements(handler.body)
if not self.current.is_terminated:
@@ -316,14 +326,14 @@ def _visit_try(self, stmt: _TryLike) -> None:
# Process else
if else_block:
self.current = else_block
- self._visit_statements(stmt.orelse)
+ self._visit_statements(orelse)
if not self.current.is_terminated:
self.current.add_successor(final_block)
# Process finally
self.current = final_block
- if stmt.finalbody:
- self._visit_statements(stmt.finalbody)
+ if finalbody:
+ self._visit_statements(finalbody)
def _visit_match(self, stmt: ast.Match) -> None:
self.current.statements.append(ast.Expr(value=stmt.subject))
diff --git a/codeclone/cfg_model.py b/codeclone/analysis/cfg_model.py
similarity index 100%
rename from codeclone/cfg_model.py
rename to codeclone/analysis/cfg_model.py
diff --git a/codeclone/analysis/class_metrics.py b/codeclone/analysis/class_metrics.py
new file mode 100644
index 0000000..d343ec7
--- /dev/null
+++ b/codeclone/analysis/class_metrics.py
@@ -0,0 +1,55 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+import ast
+
+from ..metrics.cohesion import cohesion_risk, compute_lcom4
+from ..metrics.coupling import compute_cbo, coupling_risk
+from ..models import ClassMetrics
+
+
+def _node_line_span(node: ast.AST) -> tuple[int, int] | None:
+ start = int(getattr(node, "lineno", 0))
+ end = int(getattr(node, "end_lineno", 0))
+ if start <= 0 or end <= 0:
+ return None
+ return start, end
+
+
+def _class_metrics_for_node(
+ *,
+ module_name: str,
+ class_qualname: str,
+ class_node: ast.ClassDef,
+ filepath: str,
+ module_import_names: set[str],
+ module_class_names: set[str],
+) -> ClassMetrics | None:
+ span = _node_line_span(class_node)
+ if span is None:
+ return None
+ start, end = span
+ cbo, coupled_classes = compute_cbo(
+ class_node,
+ module_import_names=module_import_names,
+ module_class_names=module_class_names,
+ )
+ lcom4, method_count, instance_var_count = compute_lcom4(class_node)
+ return ClassMetrics(
+ qualname=f"{module_name}:{class_qualname}",
+ filepath=filepath,
+ start_line=start,
+ end_line=end,
+ cbo=cbo,
+ lcom4=lcom4,
+ method_count=method_count,
+ instance_var_count=instance_var_count,
+ risk_coupling=coupling_risk(cbo),
+ risk_cohesion=cohesion_risk(lcom4),
+ coupled_classes=coupled_classes,
+ )
diff --git a/codeclone/analysis/fingerprint.py b/codeclone/analysis/fingerprint.py
new file mode 100644
index 0000000..dff7dbc
--- /dev/null
+++ b/codeclone/analysis/fingerprint.py
@@ -0,0 +1,81 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+import hashlib
+
+from .. import qualnames as _qualnames
+from ..metrics.complexity import cyclomatic_complexity
+from .cfg import CFGBuilder
+from .normalizer import (
+ AstNormalizer,
+ NormalizationConfig,
+ normalized_ast_dump_from_list,
+)
+
+
+def sha1(s: str) -> str:
+ return hashlib.sha1(s.encode("utf-8")).hexdigest()
+
+
+def bucket_loc(loc: int) -> str:
+    # Coarse LOC buckets keep wildly different function sizes out of the same group.
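+    # e.g. bucket_loc(7) -> "0-19", bucket_loc(64) -> "50-99", bucket_loc(250) -> "100+"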
+ if loc < 20:
+ return "0-19"
+ if loc < 50:
+ return "20-49"
+ if loc < 100:
+ return "50-99"
+ return "100+"
+
+
+def _cfg_fingerprint_and_complexity(
+ node: _qualnames.FunctionNode,
+ cfg: NormalizationConfig,
+ qualname: str,
+) -> tuple[str, int]:
+ """
+ Generate a structural fingerprint for a function using CFG analysis.
+
+ The fingerprint is computed by:
+ 1. Building a Control Flow Graph (CFG) from the function
+ 2. Normalizing each CFG block's statements (variable names, constants, etc.)
+ 3. Creating a canonical representation of the CFG structure
+ 4. Hashing the representation with SHA-1
+
+ Functions with identical control flow and normalized statements will
+ produce the same fingerprint, even if they differ in variable names,
+ constants, or type annotations.
+
+ Args:
+ node: Function AST node to fingerprint
+ cfg: Normalization configuration (what to ignore)
+ qualname: Qualified name for logging/debugging
+
+ Returns:
+ 40-character hex SHA-1 hash of the normalized CFG
+ """
+ builder = CFGBuilder()
+ graph = builder.build(qualname, node)
+ cfg_normalizer = AstNormalizer(cfg)
+
+    # Serialize blocks in deterministic block-id order so the fingerprint is stable.
+ parts: list[str] = []
+ for block in sorted(graph.blocks, key=lambda b: b.id):
+ succ_ids = ",".join(
+ str(s.id) for s in sorted(block.successors, key=lambda s: s.id)
+ )
+ block_dump = normalized_ast_dump_from_list(
+ block.statements,
+ cfg,
+ normalizer=cfg_normalizer,
+ )
+ parts.append(f"BLOCK[{block.id}]:{block_dump}|SUCCESSORS:{succ_ids}")
+ return sha1("|".join(parts)), cyclomatic_complexity(graph)
+
+
+_CFG_FINGERPRINT_AND_COMPLEXITY_IMPL = _cfg_fingerprint_and_complexity
diff --git a/codeclone/normalize.py b/codeclone/analysis/normalizer.py
similarity index 95%
rename from codeclone/normalize.py
rename to codeclone/analysis/normalizer.py
index 31f39e8..19e44b1 100644
--- a/codeclone/normalize.py
+++ b/codeclone/analysis/normalizer.py
@@ -11,9 +11,9 @@
import hashlib
from ast import AST
from dataclasses import dataclass
-from typing import TYPE_CHECKING, cast
+from typing import TYPE_CHECKING
-from .meta_markers import CFG_META_PREFIX
+from ..meta_markers import CFG_META_PREFIX
if TYPE_CHECKING:
from collections.abc import Sequence
@@ -92,11 +92,16 @@ def visit_Constant(self, node: ast.Constant) -> ast.Constant:
node.value = "_CONST_"
return node
+ def _visit_expr(self, node: ast.expr) -> ast.expr:
+ visited = self.visit(node)
+ assert isinstance(visited, ast.expr)
+ return visited
+
def visit_Call(self, node: ast.Call) -> ast.Call:
node.func = self._visit_call_target(node.func)
- node.args = [cast("ast.expr", self.visit(arg)) for arg in node.args]
+ node.args = [self._visit_expr(arg) for arg in node.args]
for kw in node.keywords:
- kw.value = cast("ast.expr", self.visit(kw.value))
+ kw.value = self._visit_expr(kw.value)
return node
def _visit_call_target(self, node: ast.expr) -> ast.expr:
@@ -108,9 +113,9 @@ def _visit_call_target(self, node: ast.expr) -> ast.expr:
if isinstance(value, (ast.Name, ast.Attribute)):
node.value = self._visit_call_target(value)
else:
- node.value = cast("ast.expr", self.visit(value))
+ node.value = self._visit_expr(value)
return node
- return cast("ast.expr", self.visit(node))
+ return self._visit_expr(node)
def visit_AugAssign(self, node: ast.AugAssign) -> AST:
# Normalize x += 1 to x = x + 1
diff --git a/codeclone/analysis/parser.py b/codeclone/analysis/parser.py
new file mode 100644
index 0000000..f8bbbb5
--- /dev/null
+++ b/codeclone/analysis/parser.py
@@ -0,0 +1,219 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+import ast
+import io
+import math
+import os
+import signal
+import tokenize
+from contextlib import contextmanager
+from typing import TYPE_CHECKING
+
+from ..contracts.errors import ParseError
+
+if TYPE_CHECKING:
+ from collections.abc import Iterator, Mapping
+
+PARSE_TIMEOUT_SECONDS = 5
+
+
+class _ParseTimeoutError(Exception):
+ pass
+
+
+_DeclarationTokenIndexKey = tuple[int, int, str]
+_DECLARATION_TOKEN_STRINGS = frozenset({"def", "async", "class"})
+
+
+def _consumed_cpu_seconds(resource_module: object) -> float:
+ """Return consumed CPU seconds for the current process."""
+ try:
+ usage = resource_module.getrusage( # type: ignore[attr-defined]
+ resource_module.RUSAGE_SELF # type: ignore[attr-defined]
+ )
+ return float(usage.ru_utime) + float(usage.ru_stime)
+ except Exception:
+ return 0.0
+
+
+@contextmanager
+def _parse_limits(timeout_s: int) -> Iterator[None]:
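+    # Two independent guards: a SIGALRM wall-clock timer, plus an RLIMIT_CPU
+    # soft limit as a backstop for CPU-bound pathological inputs (POSIX only).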
+ if os.name != "posix" or timeout_s <= 0:
+ yield
+ return
+
+ old_handler = signal.getsignal(signal.SIGALRM)
+
+ def _timeout_handler(_signum: int, _frame: object) -> None:
+ raise _ParseTimeoutError("AST parsing timeout")
+
+ old_limits: tuple[int, int] | None = None
+ try:
+ signal.signal(signal.SIGALRM, _timeout_handler)
+ signal.setitimer(signal.ITIMER_REAL, timeout_s)
+
+ try:
+ import resource
+
+ old_limits = resource.getrlimit(resource.RLIMIT_CPU)
+ soft, hard = old_limits
+ consumed_cpu_s = _consumed_cpu_seconds(resource)
+ desired_soft = max(1, timeout_s + math.ceil(consumed_cpu_s))
+ if soft == resource.RLIM_INFINITY:
+ candidate_soft = desired_soft
+ else:
+ # Never reduce finite soft limits and avoid immediate SIGXCPU
+ # when the process already consumed more CPU than timeout_s.
+ candidate_soft = max(soft, desired_soft)
+ if hard == resource.RLIM_INFINITY:
+ new_soft = candidate_soft
+ else:
+ new_soft = min(max(1, hard), candidate_soft)
+            # Never lower the hard limit: raising it back may be disallowed for
+ # unprivileged processes and can lead to process termination later.
+ resource.setrlimit(resource.RLIMIT_CPU, (new_soft, hard))
+ except Exception:
+            # If resource is unavailable or the limit cannot be set, rely on the alarm only.
+ pass
+
+ yield
+ finally:
+ signal.setitimer(signal.ITIMER_REAL, 0)
+ signal.signal(signal.SIGALRM, old_handler)
+ if old_limits is not None:
+ try:
+ import resource
+
+ resource.setrlimit(resource.RLIMIT_CPU, old_limits)
+ except Exception:
+ pass
+
+
+_PARSE_LIMITS_IMPL = _parse_limits
+
+
+def _parse_with_limits(source: str, timeout_s: int) -> ast.AST:
+ try:
+ with _parse_limits(timeout_s):
+ return ast.parse(source)
+ except _ParseTimeoutError as e:
+ raise ParseError(str(e)) from e
+
+
+_PARSE_WITH_LIMITS_IMPL = _parse_with_limits
+
+
+def _source_tokens(source: str) -> tuple[tokenize.TokenInfo, ...]:
+ try:
+ return tuple(tokenize.generate_tokens(io.StringIO(source).readline))
+ except tokenize.TokenError:
+ return ()
+
+
+_SOURCE_TOKENS_IMPL = _source_tokens
+
+
+def _declaration_token_name(node: ast.AST) -> str:
+ if isinstance(node, ast.ClassDef):
+ return "class"
+ if isinstance(node, ast.AsyncFunctionDef):
+ return "async"
+ return "def"
+
+
+def _declaration_token_index(
+ *,
+ source_tokens: tuple[tokenize.TokenInfo, ...],
+ start_line: int,
+ start_col: int,
+ declaration_token: str,
+ source_token_index: Mapping[_DeclarationTokenIndexKey, int] | None = None,
+) -> int | None:
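+    # Prefer the prebuilt (line, col, token) index for O(1) lookups; fall back
+    # to a linear token scan when no index is supplied.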
+ if source_token_index is not None:
+ return source_token_index.get((start_line, start_col, declaration_token))
+ for idx, token in enumerate(source_tokens):
+ if token.start != (start_line, start_col):
+ continue
+ if token.type == tokenize.NAME and token.string == declaration_token:
+ return idx
+ return None
+
+
+def _build_declaration_token_index(
+ source_tokens: tuple[tokenize.TokenInfo, ...],
+) -> Mapping[_DeclarationTokenIndexKey, int]:
+ indexed: dict[_DeclarationTokenIndexKey, int] = {}
+ for idx, token in enumerate(source_tokens):
+ if token.type == tokenize.NAME and token.string in _DECLARATION_TOKEN_STRINGS:
+ indexed[(token.start[0], token.start[1], token.string)] = idx
+ return indexed
+
+
+def _scan_declaration_colon_line(
+ *,
+ source_tokens: tuple[tokenize.TokenInfo, ...],
+ start_index: int,
+) -> int | None:
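+    # Bracket nesting skips colons inside annotations and defaults, e.g. for
+    #     def f(
+    #         x: dict[str, int] = {},
+    #     ) -> None:
+    # only the final header colon at nesting 0 is reported.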
+ nesting = 0
+ for token in source_tokens[start_index + 1 :]:
+ if token.type == tokenize.OP:
+ if token.string in "([{":
+ nesting += 1
+ continue
+ if token.string in ")]}":
+ if nesting > 0:
+ nesting -= 1
+ continue
+ if token.string == ":" and nesting == 0:
+ return token.start[0]
+ if token.type == tokenize.NEWLINE and nesting == 0:
+ return None
+ return None
+
+
+def _fallback_declaration_end_line(node: ast.AST, *, start_line: int) -> int:
+ body = getattr(node, "body", None)
+ if not isinstance(body, list) or not body:
+ return start_line
+
+ first_body_line = int(getattr(body[0], "lineno", 0))
+ if first_body_line <= 0 or first_body_line == start_line:
+ return start_line
+ return max(start_line, first_body_line - 1)
+
+
+def _declaration_end_line(
+ node: ast.AST,
+ *,
+ source_tokens: tuple[tokenize.TokenInfo, ...],
+ source_token_index: Mapping[_DeclarationTokenIndexKey, int] | None = None,
+) -> int:
+ start_line = int(getattr(node, "lineno", 0))
+ start_col = int(getattr(node, "col_offset", 0))
+ if start_line <= 0:
+ return 0
+
+ declaration_token = _declaration_token_name(node)
+ start_index = _declaration_token_index(
+ source_tokens=source_tokens,
+ start_line=start_line,
+ start_col=start_col,
+ declaration_token=declaration_token,
+ source_token_index=source_token_index,
+ )
+ if start_index is None:
+ return _fallback_declaration_end_line(node, start_line=start_line)
+
+ colon_line = _scan_declaration_colon_line(
+ source_tokens=source_tokens,
+ start_index=start_index,
+ )
+ if colon_line is not None:
+ return colon_line
+ return _fallback_declaration_end_line(node, start_line=start_line)
diff --git a/codeclone/analysis/security_surfaces.py b/codeclone/analysis/security_surfaces.py
new file mode 100644
index 0000000..0827fab
--- /dev/null
+++ b/codeclone/analysis/security_surfaces.py
@@ -0,0 +1,476 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+import ast
+from dataclasses import dataclass
+
+from ..models import (
+ SecuritySurface,
+ SecuritySurfaceCategory,
+ SecuritySurfaceClassificationMode,
+ SecuritySurfaceEvidenceKind,
+ SecuritySurfaceLocationScope,
+)
+
+
+@dataclass(frozen=True, slots=True)
+class _ImportRule:
+ module_prefix: str
+ category: SecuritySurfaceCategory
+ capability: str
+
+
+@dataclass(frozen=True, slots=True)
+class _CallRule:
+ symbol: str
+ category: SecuritySurfaceCategory
+ capability: str
+ prefix_match: bool = False
+
+
+_BUILTIN_RULES: dict[str, tuple[SecuritySurfaceCategory, str]] = {
+ "__import__": ("dynamic_loading", "builtin_import"),
+ "compile": ("dynamic_execution", "dynamic_compile"),
+ "eval": ("dynamic_execution", "dynamic_eval"),
+ "exec": ("dynamic_execution", "dynamic_exec"),
+}
+
+_IMPORT_RULES: tuple[_ImportRule, ...] = (
+ _ImportRule("aiohttp", "network_boundary", "aiohttp_import"),
+ _ImportRule("asyncpg", "database_boundary", "asyncpg_import"),
+ _ImportRule("authlib", "identity_token", "authlib_import"),
+ _ImportRule("bcrypt", "identity_token", "bcrypt_import"),
+ _ImportRule("cloudpickle", "deserialization", "cloudpickle_import"),
+ _ImportRule("cryptography", "crypto_transport", "cryptography_import"),
+ _ImportRule("dill", "deserialization", "dill_import"),
+ _ImportRule("django.http", "network_boundary", "django_http_import"),
+ _ImportRule("fastapi", "network_boundary", "fastapi_import"),
+ _ImportRule("flask", "network_boundary", "flask_import"),
+ _ImportRule("grpc", "network_boundary", "grpc_import"),
+ _ImportRule("hmac", "crypto_transport", "hmac_import"),
+ _ImportRule("http.server", "network_boundary", "http_server_import"),
+ _ImportRule("httpx", "network_boundary", "httpx_import"),
+ _ImportRule("importlib", "dynamic_loading", "importlib_import"),
+ _ImportRule("itsdangerous", "identity_token", "itsdangerous_import"),
+ _ImportRule("jsonpickle", "deserialization", "jsonpickle_import"),
+ _ImportRule("jwt", "identity_token", "jwt_import"),
+ _ImportRule("marshal", "deserialization", "marshal_import"),
+ _ImportRule("OpenSSL", "crypto_transport", "openssl_import"),
+ _ImportRule("passlib", "identity_token", "passlib_import"),
+ _ImportRule("pickle", "deserialization", "pickle_import"),
+ _ImportRule("psycopg", "database_boundary", "psycopg_import"),
+ _ImportRule("psycopg2", "database_boundary", "psycopg2_import"),
+ _ImportRule("pymysql", "database_boundary", "pymysql_import"),
+ _ImportRule("redis", "database_boundary", "redis_import"),
+ _ImportRule("requests", "network_boundary", "requests_import"),
+ _ImportRule("ruamel.yaml", "deserialization", "ruamel_yaml_import"),
+ _ImportRule("runpy", "dynamic_loading", "runpy_import"),
+ _ImportRule("secrets", "crypto_transport", "secrets_import"),
+ _ImportRule("shelve", "deserialization", "shelve_import"),
+ _ImportRule("socket", "network_boundary", "socket_import"),
+ _ImportRule("sqlalchemy", "database_boundary", "sqlalchemy_import"),
+ _ImportRule("sqlite3", "database_boundary", "sqlite3_import"),
+ _ImportRule("ssl", "crypto_transport", "ssl_import"),
+ _ImportRule("subprocess", "process_boundary", "subprocess_import"),
+ _ImportRule("tarfile", "archive_extraction", "tarfile_import"),
+ _ImportRule("websockets", "network_boundary", "websockets_import"),
+ _ImportRule("urllib", "network_boundary", "urllib_import"),
+ _ImportRule("yaml", "deserialization", "yaml_import"),
+ _ImportRule("zipfile", "archive_extraction", "zipfile_import"),
+)
+
+_CALL_RULES: tuple[_CallRule, ...] = (
+ _CallRule(
+ "asyncio.create_subprocess_exec", "process_boundary", "asyncio_subprocess_exec"
+ ),
+ _CallRule(
+ "asyncio.create_subprocess_shell",
+ "process_boundary",
+ "asyncio_subprocess_shell",
+ ),
+ _CallRule("cloudpickle.load", "deserialization", "cloudpickle_load"),
+ _CallRule("cloudpickle.loads", "deserialization", "cloudpickle_loads"),
+ _CallRule("dill.load", "deserialization", "dill_load"),
+ _CallRule("dill.loads", "deserialization", "dill_loads"),
+ _CallRule("importlib.import_module", "dynamic_loading", "import_module"),
+ _CallRule(
+ "importlib.util.spec_from_file_location",
+ "dynamic_loading",
+ "import_spec_from_file",
+ ),
+ _CallRule("jsonpickle.decode", "deserialization", "jsonpickle_decode"),
+ _CallRule("marshal.load", "deserialization", "marshal_load"),
+ _CallRule("marshal.loads", "deserialization", "marshal_loads"),
+ _CallRule("os.chmod", "filesystem_mutation", "os_chmod"),
+ _CallRule("os.chown", "filesystem_mutation", "os_chown"),
+ _CallRule("os.makedirs", "filesystem_mutation", "os_makedirs"),
+ _CallRule("os.remove", "filesystem_mutation", "os_remove"),
+ _CallRule("os.rename", "filesystem_mutation", "os_rename"),
+ _CallRule("os.replace", "filesystem_mutation", "os_replace"),
+ _CallRule("os.rmdir", "filesystem_mutation", "os_rmdir"),
+ _CallRule("os.spawn", "process_boundary", "os_spawn", prefix_match=True),
+ _CallRule("os.system", "process_boundary", "os_system"),
+ _CallRule("os.unlink", "filesystem_mutation", "os_unlink"),
+ _CallRule("pathlib.Path.chmod", "filesystem_mutation", "pathlib_chmod"),
+ _CallRule("pathlib.Path.mkdir", "filesystem_mutation", "pathlib_mkdir"),
+ _CallRule("pathlib.Path.open", "filesystem_mutation", "pathlib_open_write"),
+ _CallRule("pathlib.Path.rename", "filesystem_mutation", "pathlib_rename"),
+ _CallRule("pathlib.Path.replace", "filesystem_mutation", "pathlib_replace"),
+ _CallRule("pathlib.Path.rmdir", "filesystem_mutation", "pathlib_rmdir"),
+ _CallRule("pathlib.Path.touch", "filesystem_mutation", "pathlib_touch"),
+ _CallRule("pathlib.Path.unlink", "filesystem_mutation", "pathlib_unlink"),
+ _CallRule("pathlib.Path.write_bytes", "filesystem_mutation", "pathlib_write_bytes"),
+ _CallRule("pathlib.Path.write_text", "filesystem_mutation", "pathlib_write_text"),
+ _CallRule("pickle.load", "deserialization", "pickle_load"),
+ _CallRule("pickle.loads", "deserialization", "pickle_loads"),
+ _CallRule("pty.spawn", "process_boundary", "pty_spawn"),
+ _CallRule("runpy.run_module", "dynamic_loading", "run_module"),
+ _CallRule("runpy.run_path", "dynamic_loading", "run_path"),
+ _CallRule("shutil.move", "filesystem_mutation", "shutil_move"),
+ _CallRule("shutil.rmtree", "filesystem_mutation", "shutil_rmtree"),
+ _CallRule("shutil.unpack_archive", "archive_extraction", "unpack_archive"),
+ _CallRule("subprocess.call", "process_boundary", "subprocess_call"),
+ _CallRule("subprocess.check_call", "process_boundary", "subprocess_check_call"),
+ _CallRule("subprocess.check_output", "process_boundary", "subprocess_check_output"),
+ _CallRule("subprocess.Popen", "process_boundary", "subprocess_popen"),
+ _CallRule("subprocess.run", "process_boundary", "subprocess_run"),
+ _CallRule(
+ "tarfile.open.extract", "archive_extraction", "tar_extract", prefix_match=True
+ ),
+ _CallRule("tempfile.mkdtemp", "filesystem_mutation", "tempfile_mkdtemp"),
+ _CallRule(
+ "tempfile.NamedTemporaryFile",
+ "filesystem_mutation",
+ "tempfile_named_temporary_file",
+ ),
+ _CallRule("yaml.load", "deserialization", "yaml_load"),
+ _CallRule("yaml.unsafe_load", "deserialization", "yaml_unsafe_load"),
+ _CallRule(
+ "zipfile.ZipFile.extract",
+ "archive_extraction",
+ "zip_extract",
+ prefix_match=True,
+ ),
+)
+
+
+def _node_start_line(node: ast.AST) -> int | None:
+ line = getattr(node, "lineno", None)
+ if isinstance(line, int) and line > 0:
+ return line
+ return None
+
+
+def _node_end_line(node: ast.AST) -> int:
+ start_line = _node_start_line(node)
+ if start_line is None:
+ return 0
+ end_line = getattr(node, "end_lineno", None)
+ return (
+ end_line if isinstance(end_line, int) and end_line >= start_line else start_line
+ )
+
+
+def _is_type_checking_guard(test: ast.AST) -> bool:
+ match test:
+ case ast.Name(id="TYPE_CHECKING"):
+ return True
+ case ast.Attribute(value=ast.Name(id="typing"), attr="TYPE_CHECKING"):
+ return True
+ case _:
+ return False
+
+
+def _matches_import_prefix(imported_name: str, module_prefix: str) -> bool:
+ return imported_name == module_prefix or imported_name.startswith(
+ module_prefix + "."
+ )
+
+
+def _matches_call_rule(symbol: str, rule: _CallRule) -> bool:
+ return symbol == rule.symbol or (
+ rule.prefix_match and symbol.startswith(rule.symbol)
+ )
+
+
+class _SecuritySurfaceVisitor(ast.NodeVisitor):
+ __slots__ = (
+ "_aliases",
+ "_callable_depth",
+ "_class_depth",
+ "_filepath",
+ "_module_name",
+ "_scope_stack",
+ "_seen",
+ "items",
+ )
+
+ def __init__(self, *, module_name: str, filepath: str) -> None:
+ self._aliases: dict[str, str] = {}
+ self._module_name = module_name
+ self._filepath = filepath
+ self._scope_stack: list[str] = []
+ self._callable_depth = 0
+ self._class_depth = 0
+        self._seen: set[tuple[str, str, str, int, int, str, str, str]] = set()
+ self.items: list[SecuritySurface] = []
+
+ def _current_scope(self) -> tuple[str, SecuritySurfaceLocationScope]:
+ if not self._scope_stack:
+ return self._module_name, "module"
+ return (
+ f"{self._module_name}:{'.'.join(self._scope_stack)}",
+ "callable" if self._callable_depth > 0 else "class",
+ )
+
+ def _emit(
+ self,
+ *,
+ category: SecuritySurfaceCategory,
+ capability: str,
+ node: ast.AST,
+ classification_mode: SecuritySurfaceClassificationMode,
+ evidence_kind: SecuritySurfaceEvidenceKind,
+ evidence_symbol: str,
+ ) -> None:
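+        # The dedup key mirrors the distinguishing SecuritySurface fields, so
+        # revisiting the same node never emits a duplicate surface.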
+ start_line = _node_start_line(node)
+ if start_line is None:
+ return
+ qualname, location_scope = self._current_scope()
+ key = (
+ category,
+ capability,
+ qualname,
+ start_line,
+ _node_end_line(node),
+ classification_mode,
+ evidence_kind,
+ evidence_symbol,
+ )
+ if key in self._seen:
+ return
+ self._seen.add(key)
+ self.items.append(
+ SecuritySurface(
+ category=category,
+ capability=capability,
+ module=self._module_name,
+ filepath=self._filepath,
+ qualname=qualname,
+ start_line=start_line,
+ end_line=_node_end_line(node),
+ location_scope=location_scope,
+ classification_mode=classification_mode,
+ evidence_kind=evidence_kind,
+ evidence_symbol=evidence_symbol,
+ )
+ )
+
+ def _register_import_alias(self, *, bound_name: str, imported_name: str) -> None:
+ clean_bound = bound_name.strip()
+ clean_imported = imported_name.strip()
+ if clean_bound and clean_imported:
+ self._aliases[clean_bound] = clean_imported
+
+ def _emit_import_matches(self, *, imported_name: str, node: ast.AST) -> None:
+ for rule in _IMPORT_RULES:
+ if _matches_import_prefix(imported_name, rule.module_prefix):
+ self._emit(
+ category=rule.category,
+ capability=rule.capability,
+ node=node,
+ classification_mode="exact_import",
+ evidence_kind="import",
+ evidence_symbol=imported_name,
+ )
+
+ def _resolve_expr_symbol(self, node: ast.AST) -> str | None:
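+        # Resolve dotted call targets through recorded import aliases, e.g.
+        # after `import subprocess as sp`, `sp.run` resolves to "subprocess.run".
+        # Unknown roots resolve to None and are ignored by the call rules.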
+ match node:
+ case ast.Name(id=name):
+ resolved = self._aliases.get(name)
+ if resolved is not None:
+ return resolved
+ if name in _BUILTIN_RULES or name == "open":
+ return name
+ return None
+ case ast.Attribute(value=value, attr=attr):
+ parent = self._resolve_expr_symbol(value)
+ if parent is None:
+ return None
+ return f"{parent}.{attr}"
+ case ast.Call(func=func):
+ return self._resolve_expr_symbol(func)
+ case _:
+ return None
+
+ def _mode_from_open_call(self, node: ast.Call) -> str | None:
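+        # Return the literal mode string only when it implies writing, e.g.
+        # open(p, "w") or open(p, mode="a+b"); reads and non-literal modes
+        # yield None and are not flagged.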
+ mode_arg: ast.AST | None = None
+ if len(node.args) >= 2:
+ mode_arg = node.args[1]
+ else:
+ for keyword in node.keywords:
+ if keyword.arg == "mode":
+ mode_arg = keyword.value
+ break
+ if not isinstance(mode_arg, ast.Constant) or not isinstance(
+ mode_arg.value, str
+ ):
+ return None
+ mode = mode_arg.value
+ if any(marker in mode for marker in ("w", "a", "x", "+")):
+ return mode
+ return None
+
+ def _emit_call_matches(self, node: ast.Call) -> None:
+ symbol = self._resolve_expr_symbol(node.func)
+ if symbol is None:
+ return
+ if symbol in _BUILTIN_RULES:
+ category, capability = _BUILTIN_RULES[symbol]
+ self._emit(
+ category=category,
+ capability=capability,
+ node=node,
+ classification_mode="exact_builtin",
+ evidence_kind="builtin",
+ evidence_symbol=symbol,
+ )
+ if symbol in {"open", "pathlib.Path.open"}:
+ mode = self._mode_from_open_call(node)
+ if mode is not None:
+ capability = (
+ "pathlib_open_write"
+ if symbol == "pathlib.Path.open"
+ else "builtin_open_write"
+ )
+ self._emit(
+ category="filesystem_mutation",
+ capability=capability,
+ node=node,
+ classification_mode="exact_call",
+ evidence_kind="call",
+ evidence_symbol=f"{symbol}[mode={mode}]",
+ )
+ for rule in _CALL_RULES:
+ if _matches_call_rule(symbol, rule):
+ self._emit(
+ category=rule.category,
+ capability=rule.capability,
+ node=node,
+ classification_mode="exact_call",
+ evidence_kind="call",
+ evidence_symbol=symbol,
+ )
+
+ def visit_If(self, node: ast.If) -> None:
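+        # `if TYPE_CHECKING:` bodies never execute at runtime, so their
+        # imports are skipped; only the else branch is scanned for surfaces.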
+ if _is_type_checking_guard(node.test):
+ for child in node.orelse:
+ self.visit(child)
+ return
+ self.generic_visit(node)
+
+ def visit_Import(self, node: ast.Import) -> None:
+ for alias in node.names:
+ full_name = alias.name.strip()
+ if not full_name:
+ continue
+ bound_name = alias.asname or full_name.split(".", maxsplit=1)[0]
+ self._register_import_alias(
+ bound_name=bound_name,
+ imported_name=full_name if alias.asname else bound_name,
+ )
+ self._emit_import_matches(imported_name=full_name, node=node)
+
+ def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
+ if (
+ node.level != 0
+ or not isinstance(node.module, str)
+ or not node.module.strip()
+ ):
+ return
+ module_name = node.module.strip()
+ for alias in node.names:
+ if alias.name == "*":
+ continue
+ full_name = f"{module_name}.{alias.name}"
+ self._register_import_alias(
+ bound_name=alias.asname or alias.name,
+ imported_name=full_name,
+ )
+ self._emit_import_matches(imported_name=full_name, node=node)
+
+ def _visit_scoped_node(
+ self,
+ node: ast.AST,
+ *,
+ scope_name: str,
+ is_callable: bool,
+ ) -> None:
+ self._scope_stack.append(scope_name)
+ if is_callable:
+ self._callable_depth += 1
+ else:
+ self._class_depth += 1
+ self.generic_visit(node)
+ if is_callable:
+ self._callable_depth -= 1
+ else:
+ self._class_depth -= 1
+ self._scope_stack.pop()
+
+ def visit_ClassDef(self, node: ast.ClassDef) -> None:
+ self._visit_scoped_node(node, scope_name=node.name, is_callable=False)
+
+ def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
+ self._visit_scoped_node(node, scope_name=node.name, is_callable=True)
+
+ def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
+ self._visit_scoped_node(node, scope_name=node.name, is_callable=True)
+
+ def visit_Call(self, node: ast.Call) -> None:
+ self._emit_call_matches(node)
+ self.generic_visit(node)
+
+
+def collect_security_surfaces(
+ *,
+ tree: ast.Module,
+ module_name: str,
+ filepath: str,
+) -> tuple[SecuritySurface, ...]:
+ visitor = _SecuritySurfaceVisitor(module_name=module_name, filepath=filepath)
+ visitor.visit(tree)
+ return tuple(
+ sorted(
+ visitor.items,
+ key=lambda item: (
+ item.filepath,
+ item.start_line,
+ item.end_line,
+ item.qualname,
+ item.category,
+ item.capability,
+ item.evidence_symbol,
+ item.classification_mode,
+ ),
+ )
+ )
+
+
+__all__ = ["collect_security_surfaces"]
diff --git a/codeclone/suppressions.py b/codeclone/analysis/suppressions.py
similarity index 96%
rename from codeclone/suppressions.py
rename to codeclone/analysis/suppressions.py
index 0b45987..b717f6b 100644
--- a/codeclone/suppressions.py
+++ b/codeclone/analysis/suppressions.py
@@ -31,7 +31,7 @@
SuppressionTargetKey = tuple[str, str, int, int, DeclarationKind]
_SUPPRESSION_DIRECTIVE_PATTERN: Final[re.Pattern[str]] = re.compile(
- r"^\s*#\s*codeclone\s*:\s*ignore\s*\[(?P[^\]]+)\]\s*$"
+ r"^\s*#\s*codeclone\s*:\s*ignore\s*\[(?P[^]]+)]\s*$"
)
_RULE_ID_PATTERN: Final[re.Pattern[str]] = re.compile(r"^[a-z0-9][a-z0-9-]*$")
@@ -174,7 +174,7 @@ def _declaration_inline_lines(target: DeclarationTarget) -> tuple[int, ...]:
end_line = target.declaration_end_line or target.start_line
if end_line <= 0 or end_line == target.start_line:
return (target.start_line,)
- return (target.start_line, end_line)
+ return target.start_line, end_line
def _bound_inline_rules(
@@ -250,7 +250,7 @@ def suppression_target_key(
end_line: int,
kind: DeclarationKind,
) -> SuppressionTargetKey:
- return (filepath, qualname, start_line, end_line, kind)
+ return filepath, qualname, start_line, end_line, kind
def build_suppression_index(
@@ -265,6 +265,5 @@ def build_suppression_index(
end_line=binding.end_line,
kind=binding.kind,
)
- existing = index.get(key, ())
- index[key] = _merge_rules(existing, binding.rules)
+ index[key] = _merge_rules(index.get(key, ()), binding.rules)
return index
diff --git a/codeclone/analysis/units.py b/codeclone/analysis/units.py
new file mode 100644
index 0000000..88d9b4b
--- /dev/null
+++ b/codeclone/analysis/units.py
@@ -0,0 +1,323 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+import ast
+from hashlib import sha1 as _sha1
+
+from .. import qualnames as _qualnames
+from ..blocks import extract_blocks, extract_segments
+from ..contracts import (
+ DEFAULT_BLOCK_MIN_LOC,
+ DEFAULT_BLOCK_MIN_STMT,
+ DEFAULT_SEGMENT_MIN_LOC,
+ DEFAULT_SEGMENT_MIN_STMT,
+)
+from ..contracts.errors import ParseError
+from ..findings.structural.detectors import scan_function_structure
+from ..metrics.adoption import collect_module_adoption
+from ..metrics.api_surface import collect_module_api_surface
+from ..metrics.complexity import risk_level
+from ..models import (
+ BlockUnit,
+ ClassMetrics,
+ FileMetrics,
+ SegmentUnit,
+ SourceStats,
+ StructuralFindingGroup,
+ Unit,
+)
+from ..paths import is_test_filepath
+from ._module_walk import (
+ _build_suppression_index_for_source,
+ _collect_dead_candidates,
+ _collect_module_walk_data,
+)
+from .class_metrics import _class_metrics_for_node, _node_line_span
+from .fingerprint import _cfg_fingerprint_and_complexity, bucket_loc
+from .normalizer import NormalizationConfig, stmt_hashes
+from .parser import PARSE_TIMEOUT_SECONDS, _parse_with_limits
+from .security_surfaces import collect_security_surfaces
+
+__all__ = ["extract_units_and_stats_from_source"]
+
+
+def _stmt_count(node: ast.AST) -> int:
+ body = getattr(node, "body", None)
+ return len(body) if isinstance(body, list) else 0
+
+
+_STMT_COUNT_IMPL = _stmt_count
+
+
+def _raw_source_hash_for_range(
+ source_lines: list[str],
+ start_line: int,
+ end_line: int,
+) -> str:
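+    # All whitespace is stripped before hashing, so two ranges that differ
+    # only in formatting produce the same raw hash.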
+ window = "".join(source_lines[start_line - 1 : end_line]).strip()
+ no_space = "".join(window.split())
+ return _sha1(no_space.encode("utf-8")).hexdigest()
+
+
+def _eligible_unit_shape(
+ node: _qualnames.FunctionNode,
+ *,
+ min_loc: int,
+ min_stmt: int,
+) -> tuple[int, int, int, int] | None:
+ span = _node_line_span(node)
+ if span is None:
+ return None
+ start, end = span
+ if end < start:
+ return None
+ loc = end - start + 1
+ stmt_count = _stmt_count(node)
+ if loc < min_loc or stmt_count < min_stmt:
+ return None
+ return start, end, loc, stmt_count
+
+
+def extract_units_and_stats_from_source(
+ source: str,
+ filepath: str,
+ module_name: str,
+ cfg: NormalizationConfig,
+ min_loc: int,
+ min_stmt: int,
+ *,
+ block_min_loc: int = DEFAULT_BLOCK_MIN_LOC,
+ block_min_stmt: int = DEFAULT_BLOCK_MIN_STMT,
+ segment_min_loc: int = DEFAULT_SEGMENT_MIN_LOC,
+ segment_min_stmt: int = DEFAULT_SEGMENT_MIN_STMT,
+ collect_structural_findings: bool = True,
+ collect_api_surface: bool = False,
+ api_include_private_modules: bool = False,
+) -> tuple[
+ list[Unit],
+ list[BlockUnit],
+ list[SegmentUnit],
+ SourceStats,
+ FileMetrics,
+ list[StructuralFindingGroup],
+]:
+ try:
+ tree = _parse_with_limits(source, PARSE_TIMEOUT_SECONDS)
+ except SyntaxError as e:
+ raise ParseError(f"Failed to parse {filepath}: {e}") from e
+ if not isinstance(tree, ast.Module):
+ raise ParseError(f"Failed to parse {filepath}: expected module AST root")
+
+ collector = _qualnames.QualnameCollector()
+ collector.visit(tree)
+ source_lines = source.splitlines()
+ source_line_count = len(source_lines)
+
+ is_test_file = is_test_filepath(filepath)
+
+ # Single-pass AST walk replaces 3 separate functions / 4 walks.
+ _walk = _collect_module_walk_data(
+ tree=tree,
+ module_name=module_name,
+ collector=collector,
+ collect_referenced_names=not is_test_file,
+ )
+ import_names = _walk.import_names
+ module_deps = _walk.module_deps
+ referenced_names = _walk.referenced_names
+ referenced_qualnames = _walk.referenced_qualnames
+ protocol_symbol_aliases = _walk.protocol_symbol_aliases
+ protocol_module_aliases = _walk.protocol_module_aliases
+
+ suppression_index = _build_suppression_index_for_source(
+ source=source,
+ filepath=filepath,
+ module_name=module_name,
+ collector=collector,
+ )
+ class_names = frozenset(class_node.name for _, class_node in collector.class_nodes)
+ module_import_names = set(import_names)
+ module_class_names = set(class_names)
+ class_metrics: list[ClassMetrics] = []
+
+ units: list[Unit] = []
+ block_units: list[BlockUnit] = []
+ segment_units: list[SegmentUnit] = []
+ structural_findings: list[StructuralFindingGroup] = []
+
+ for local_name, node in collector.units:
+ unit_shape = _eligible_unit_shape(
+ node,
+ min_loc=min_loc,
+ min_stmt=min_stmt,
+ )
+ if unit_shape is None:
+ continue
+ start, end, loc, stmt_count = unit_shape
+
+ qualname = f"{module_name}:{local_name}"
+ fingerprint, complexity = _cfg_fingerprint_and_complexity(node, cfg, qualname)
+ structure_facts = scan_function_structure(
+ node,
+ filepath,
+ qualname,
+ collect_findings=collect_structural_findings,
+ )
+ depth = structure_facts.nesting_depth
+ risk = risk_level(complexity)
+ raw_hash = _raw_source_hash_for_range(source_lines, start, end)
+
+ units.append(
+ Unit(
+ qualname=qualname,
+ filepath=filepath,
+ start_line=start,
+ end_line=end,
+ loc=loc,
+ stmt_count=stmt_count,
+ fingerprint=fingerprint,
+ loc_bucket=bucket_loc(loc),
+ cyclomatic_complexity=complexity,
+ nesting_depth=depth,
+ risk=risk,
+ raw_hash=raw_hash,
+ entry_guard_count=structure_facts.entry_guard_count,
+ entry_guard_terminal_profile=(
+ structure_facts.entry_guard_terminal_profile
+ ),
+ entry_guard_has_side_effect_before=(
+ structure_facts.entry_guard_has_side_effect_before
+ ),
+ terminal_kind=structure_facts.terminal_kind,
+ try_finally_profile=structure_facts.try_finally_profile,
+ side_effect_order_profile=structure_facts.side_effect_order_profile,
+ )
+ )
+
+ needs_blocks = (
+ not local_name.endswith("__init__")
+ and loc >= block_min_loc
+ and stmt_count >= block_min_stmt
+ )
+ needs_segments = loc >= segment_min_loc and stmt_count >= segment_min_stmt
+
+ if needs_blocks or needs_segments:
+ body = getattr(node, "body", None)
+ hashes: list[str] | None = None
+ if isinstance(body, list):
+ hashes = stmt_hashes(body, cfg)
+
+ if needs_blocks:
+ block_units.extend(
+ extract_blocks(
+ node,
+ filepath=filepath,
+ qualname=qualname,
+ cfg=cfg,
+ block_size=4,
+ max_blocks=15,
+ precomputed_hashes=hashes,
+ )
+ )
+
+ if needs_segments:
+ segment_units.extend(
+ extract_segments(
+ node,
+ filepath=filepath,
+ qualname=qualname,
+ cfg=cfg,
+ window_size=6,
+ max_segments=60,
+ precomputed_hashes=hashes,
+ )
+ )
+
+ if collect_structural_findings:
+ structural_findings.extend(structure_facts.structural_findings)
+
+ for class_qualname, class_node in collector.class_nodes:
+ class_metric = _class_metrics_for_node(
+ module_name=module_name,
+ class_qualname=class_qualname,
+ class_node=class_node,
+ filepath=filepath,
+ module_import_names=module_import_names,
+ module_class_names=module_class_names,
+ )
+ if class_metric is not None:
+ class_metrics.append(class_metric)
+
+ dead_candidates = _collect_dead_candidates(
+ filepath=filepath,
+ module_name=module_name,
+ collector=collector,
+ protocol_symbol_aliases=protocol_symbol_aliases,
+ protocol_module_aliases=protocol_module_aliases,
+ suppression_rules_by_target=suppression_index,
+ )
+
+ sorted_class_metrics = tuple(
+ sorted(
+ class_metrics,
+ key=lambda item: (
+ item.filepath,
+ item.start_line,
+ item.end_line,
+ item.qualname,
+ ),
+ )
+ )
+ typing_coverage, docstring_coverage = collect_module_adoption(
+ tree=tree,
+ module_name=module_name,
+ filepath=filepath,
+ collector=collector,
+ imported_names=import_names,
+ )
+ api_surface = None
+ if collect_api_surface:
+ api_surface = collect_module_api_surface(
+ tree=tree,
+ module_name=module_name,
+ filepath=filepath,
+ collector=collector,
+ imported_names=import_names,
+ include_private_modules=api_include_private_modules,
+ )
+ security_surfaces = collect_security_surfaces(
+ tree=tree,
+ module_name=module_name,
+ filepath=filepath,
+ )
+
+ return (
+ units,
+ block_units,
+ segment_units,
+ SourceStats(
+ lines=source_line_count,
+ functions=collector.function_count,
+ methods=collector.method_count,
+ classes=collector.class_count,
+ ),
+ FileMetrics(
+ class_metrics=sorted_class_metrics,
+ module_deps=module_deps,
+ dead_candidates=dead_candidates,
+ referenced_names=referenced_names,
+ import_names=import_names,
+ class_names=class_names,
+ security_surfaces=security_surfaces,
+ referenced_qualnames=referenced_qualnames,
+ typing_coverage=typing_coverage,
+ docstring_coverage=docstring_coverage,
+ api_surface=api_surface,
+ ),
+ structural_findings,
+ )
diff --git a/codeclone/baseline/__init__.py b/codeclone/baseline/__init__.py
new file mode 100644
index 0000000..88891d4
--- /dev/null
+++ b/codeclone/baseline/__init__.py
@@ -0,0 +1,25 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from .clone_baseline import Baseline
+from .trust import (
+ BASELINE_GENERATOR,
+ BASELINE_UNTRUSTED_STATUSES,
+ MAX_BASELINE_SIZE_BYTES,
+ BaselineStatus,
+ coerce_baseline_status,
+ current_python_tag,
+)
+
+__all__ = [
+ "BASELINE_GENERATOR",
+ "BASELINE_UNTRUSTED_STATUSES",
+ "MAX_BASELINE_SIZE_BYTES",
+ "Baseline",
+ "BaselineStatus",
+ "coerce_baseline_status",
+ "current_python_tag",
+]
diff --git a/codeclone/baseline/_metrics_baseline_contract.py b/codeclone/baseline/_metrics_baseline_contract.py
new file mode 100644
index 0000000..6ba78a1
--- /dev/null
+++ b/codeclone/baseline/_metrics_baseline_contract.py
@@ -0,0 +1,100 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from enum import Enum
+from typing import Final
+
+METRICS_BASELINE_GENERATOR: Final = "codeclone"
+MAX_METRICS_BASELINE_SIZE_BYTES: Final = 5 * 1024 * 1024
+
+
+class MetricsBaselineStatus(str, Enum):
+ OK = "ok"
+ MISSING = "missing"
+ TOO_LARGE = "too_large"
+ INVALID_JSON = "invalid_json"
+ INVALID_TYPE = "invalid_type"
+ MISSING_FIELDS = "missing_fields"
+ MISMATCH_SCHEMA_VERSION = "mismatch_schema_version"
+ MISMATCH_PYTHON_VERSION = "mismatch_python_version"
+ GENERATOR_MISMATCH = "generator_mismatch"
+ INTEGRITY_MISSING = "integrity_missing"
+ INTEGRITY_FAILED = "integrity_failed"
+
+
+METRICS_BASELINE_UNTRUSTED_STATUSES: Final[frozenset[MetricsBaselineStatus]] = (
+ frozenset(
+ {
+ MetricsBaselineStatus.MISSING,
+ MetricsBaselineStatus.TOO_LARGE,
+ MetricsBaselineStatus.INVALID_JSON,
+ MetricsBaselineStatus.INVALID_TYPE,
+ MetricsBaselineStatus.MISSING_FIELDS,
+ MetricsBaselineStatus.MISMATCH_SCHEMA_VERSION,
+ MetricsBaselineStatus.MISMATCH_PYTHON_VERSION,
+ MetricsBaselineStatus.GENERATOR_MISMATCH,
+ MetricsBaselineStatus.INTEGRITY_MISSING,
+ MetricsBaselineStatus.INTEGRITY_FAILED,
+ }
+ )
+)
+
+_TOP_LEVEL_REQUIRED_KEYS = frozenset({"meta", "metrics"})
+_TOP_LEVEL_ALLOWED_KEYS = _TOP_LEVEL_REQUIRED_KEYS | frozenset(
+ {"clones", "api_surface"}
+)
+_META_REQUIRED_KEYS = frozenset(
+ {"generator", "schema_version", "python_tag", "created_at", "payload_sha256"}
+)
+_METRICS_REQUIRED_KEYS = frozenset(
+ {
+ "max_complexity",
+ "high_risk_functions",
+ "max_coupling",
+ "high_coupling_classes",
+ "max_cohesion",
+ "low_cohesion_classes",
+ "dependency_cycles",
+ "dependency_max_depth",
+ "dead_code_items",
+ "health_score",
+ "health_grade",
+ }
+)
+_METRICS_OPTIONAL_KEYS = frozenset(
+ {
+ "typing_param_permille",
+ "typing_return_permille",
+ "docstring_permille",
+ "typing_any_count",
+ }
+)
+_METRICS_PAYLOAD_SHA256_KEY = "metrics_payload_sha256"
+_API_SURFACE_PAYLOAD_SHA256_KEY = "api_surface_payload_sha256"
+
+
+def coerce_metrics_baseline_status(
+ raw_status: str | MetricsBaselineStatus | None,
+) -> MetricsBaselineStatus:
+ if isinstance(raw_status, MetricsBaselineStatus):
+ return raw_status
+ if isinstance(raw_status, str):
+ try:
+ return MetricsBaselineStatus(raw_status)
+ except ValueError:
+ return MetricsBaselineStatus.INVALID_TYPE
+ return MetricsBaselineStatus.INVALID_TYPE
+
+
+__all__ = [
+ "MAX_METRICS_BASELINE_SIZE_BYTES",
+ "METRICS_BASELINE_GENERATOR",
+ "METRICS_BASELINE_UNTRUSTED_STATUSES",
+ "MetricsBaselineStatus",
+ "coerce_metrics_baseline_status",
+]
diff --git a/codeclone/baseline/_metrics_baseline_payload.py b/codeclone/baseline/_metrics_baseline_payload.py
new file mode 100644
index 0000000..4f24864
--- /dev/null
+++ b/codeclone/baseline/_metrics_baseline_payload.py
@@ -0,0 +1,243 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+import hashlib
+from pathlib import Path
+
+import orjson
+
+from ..cache.projection import wire_filepath_from_runtime
+from ..models import ApiSurfaceSnapshot, MetricsSnapshot, ProjectMetrics
+from ._metrics_baseline_contract import _API_SURFACE_PAYLOAD_SHA256_KEY
+
+
+def snapshot_from_project_metrics(project_metrics: ProjectMetrics) -> MetricsSnapshot:
+ return MetricsSnapshot(
+ max_complexity=int(project_metrics.complexity_max),
+ high_risk_functions=tuple(sorted(set(project_metrics.high_risk_functions))),
+ max_coupling=int(project_metrics.coupling_max),
+ high_coupling_classes=tuple(sorted(set(project_metrics.high_risk_classes))),
+ max_cohesion=int(project_metrics.cohesion_max),
+ low_cohesion_classes=tuple(sorted(set(project_metrics.low_cohesion_classes))),
+ dependency_cycles=tuple(
+ sorted({tuple(cycle) for cycle in project_metrics.dependency_cycles})
+ ),
+ dependency_max_depth=int(project_metrics.dependency_max_depth),
+ dead_code_items=tuple(
+ sorted({item.qualname for item in project_metrics.dead_code})
+ ),
+ health_score=int(project_metrics.health.total),
+ health_grade=project_metrics.health.grade,
+ typing_param_permille=_permille(
+ project_metrics.typing_param_annotated,
+ project_metrics.typing_param_total,
+ ),
+ typing_return_permille=_permille(
+ project_metrics.typing_return_annotated,
+ project_metrics.typing_return_total,
+ ),
+ docstring_permille=_permille(
+ project_metrics.docstring_public_documented,
+ project_metrics.docstring_public_total,
+ ),
+ typing_any_count=int(project_metrics.typing_any_count),
+ )
+
+
+def _permille(numerator: int, denominator: int) -> int:
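+    # Rounded parts-per-thousand, e.g. _permille(1, 3) == 333; a zero or
+    # negative denominator yields 0 instead of raising.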
+ if denominator <= 0:
+ return 0
+ return round((1000.0 * float(numerator)) / float(denominator))
+
+
+def _canonical_json(payload: object) -> str:
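+    # Sorted keys make the serialization deterministic, so the SHA-256
+    # digests derived from it are stable across runs.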
+ return orjson.dumps(payload, option=orjson.OPT_SORT_KEYS).decode("utf-8")
+
+
+def _snapshot_payload(
+ snapshot: MetricsSnapshot,
+ *,
+ include_adoption: bool = True,
+) -> dict[str, object]:
+ payload: dict[str, object] = {
+ "max_complexity": int(snapshot.max_complexity),
+ "high_risk_functions": list(snapshot.high_risk_functions),
+ "max_coupling": int(snapshot.max_coupling),
+ "high_coupling_classes": list(snapshot.high_coupling_classes),
+ "max_cohesion": int(snapshot.max_cohesion),
+ "low_cohesion_classes": list(snapshot.low_cohesion_classes),
+ "dependency_cycles": [list(cycle) for cycle in snapshot.dependency_cycles],
+ "dependency_max_depth": int(snapshot.dependency_max_depth),
+ "dead_code_items": list(snapshot.dead_code_items),
+ "health_score": int(snapshot.health_score),
+ "health_grade": snapshot.health_grade,
+ }
+ if include_adoption:
+ payload.update(
+ {
+ "typing_param_permille": int(snapshot.typing_param_permille),
+ "typing_return_permille": int(snapshot.typing_return_permille),
+ "docstring_permille": int(snapshot.docstring_permille),
+ "typing_any_count": int(snapshot.typing_any_count),
+ }
+ )
+ return payload
+
+
+def _compute_payload_sha256(
+ snapshot: MetricsSnapshot,
+ *,
+ include_adoption: bool = True,
+) -> str:
+ canonical = _canonical_json(
+ _snapshot_payload(snapshot, include_adoption=include_adoption)
+ )
+ return hashlib.sha256(canonical.encode("utf-8")).hexdigest()
+
+
+def _has_coverage_adoption_snapshot(metrics_obj: dict[str, object]) -> bool:
+ return all(
+ key in metrics_obj
+ for key in (
+ "typing_param_permille",
+ "typing_return_permille",
+ "docstring_permille",
+ )
+ )
+
+
+def _api_surface_snapshot_payload(
+ snapshot: ApiSurfaceSnapshot,
+ *,
+ root: Path | None = None,
+ legacy_qualname: bool = False,
+) -> dict[str, object]:
+ return {
+ "modules": [
+ {
+ "module": module.module,
+ "filepath": wire_filepath_from_runtime(module.filepath, root=root),
+ "all_declared": list(module.all_declared or ()),
+ "symbols": [
+ {
+ ("qualname" if legacy_qualname else "local_name"): (
+ symbol.qualname
+ if legacy_qualname
+ else _local_name_from_qualname(
+ module=module.module,
+ qualname=symbol.qualname,
+ )
+ ),
+ "kind": symbol.kind,
+ "start_line": symbol.start_line,
+ "end_line": symbol.end_line,
+ "params": [
+ {
+ "name": param.name,
+ "kind": param.kind,
+ "has_default": param.has_default,
+ "annotation_hash": param.annotation_hash,
+ }
+ for param in symbol.params
+ ],
+ "returns_hash": symbol.returns_hash,
+ "exported_via": symbol.exported_via,
+ }
+ for symbol in sorted(
+ module.symbols,
+ key=lambda item: item.qualname,
+ )
+ ],
+ }
+ for module in sorted(
+ snapshot.modules,
+ key=lambda item: (item.filepath, item.module),
+ )
+ ]
+ }
+
+
+def _compute_api_surface_payload_sha256(
+ snapshot: ApiSurfaceSnapshot,
+ *,
+ root: Path | None = None,
+) -> str:
+ canonical = _canonical_json(_api_surface_snapshot_payload(snapshot, root=root))
+ return hashlib.sha256(canonical.encode("utf-8")).hexdigest()
+
+
+def _compute_legacy_api_surface_payload_sha256(
+ snapshot: ApiSurfaceSnapshot,
+ *,
+ root: Path | None = None,
+) -> str:
+ canonical = _canonical_json(
+ _api_surface_snapshot_payload(snapshot, root=root, legacy_qualname=True)
+ )
+ return hashlib.sha256(canonical.encode("utf-8")).hexdigest()
+
+
+def _compose_api_surface_qualname(*, module: str, local_name: str) -> str:
+ return f"{module}:{local_name}"
+
+
+def _local_name_from_qualname(*, module: str, qualname: str) -> str:
+ prefix = f"{module}:"
+ if qualname.startswith(prefix):
+ return qualname[len(prefix) :]
+ return qualname
+
+
+def _build_payload(
+ *,
+ snapshot: MetricsSnapshot,
+ schema_version: str,
+ python_tag: str,
+ generator_name: str,
+ generator_version: str,
+ created_at: str,
+ include_adoption: bool = True,
+ api_surface_snapshot: ApiSurfaceSnapshot | None = None,
+ api_surface_root: Path | None = None,
+) -> dict[str, object]:
+ payload_sha256 = _compute_payload_sha256(
+ snapshot,
+ include_adoption=include_adoption,
+ )
+ meta: dict[str, object] = {
+ "generator": {
+ "name": generator_name,
+ "version": generator_version,
+ },
+ "schema_version": schema_version,
+ "python_tag": python_tag,
+ "created_at": created_at,
+ "payload_sha256": payload_sha256,
+ }
+ payload: dict[str, object] = {
+ "meta": meta,
+ "metrics": _snapshot_payload(
+ snapshot,
+ include_adoption=include_adoption,
+ ),
+ }
+ if api_surface_snapshot is not None:
+ meta[_API_SURFACE_PAYLOAD_SHA256_KEY] = _compute_api_surface_payload_sha256(
+ api_surface_snapshot,
+ root=api_surface_root,
+ )
+ payload["api_surface"] = _api_surface_snapshot_payload(
+ api_surface_snapshot,
+ root=api_surface_root,
+ )
+ return payload
+
+
+__all__ = [
+ "snapshot_from_project_metrics",
+]
diff --git a/codeclone/baseline/_metrics_baseline_validation.py b/codeclone/baseline/_metrics_baseline_validation.py
new file mode 100644
index 0000000..21f15a9
--- /dev/null
+++ b/codeclone/baseline/_metrics_baseline_validation.py
@@ -0,0 +1,648 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from json import JSONDecodeError
+from pathlib import Path
+from typing import Literal
+
+from ..cache.projection import runtime_filepath_from_wire
+from ..contracts import BASELINE_SCHEMA_VERSION
+from ..contracts.errors import BaselineValidationError
+from ..models import (
+ ApiParamSpec,
+ ApiSurfaceSnapshot,
+ MetricsSnapshot,
+ ModuleApiSurface,
+ PublicSymbol,
+)
+from ..utils.json_io import read_json_object as _read_json_object
+from ..utils.json_io import (
+ write_json_document_atomically as _write_json_document_atomically,
+)
+from ..utils.schema_validation import validate_top_level_structure
+from ._metrics_baseline_contract import (
+ _METRICS_PAYLOAD_SHA256_KEY,
+ _TOP_LEVEL_ALLOWED_KEYS,
+ _TOP_LEVEL_REQUIRED_KEYS,
+ MetricsBaselineStatus,
+)
+from ._metrics_baseline_payload import _compose_api_surface_qualname
+
+_HEALTH_GRADES = {"A", "B", "C", "D", "F"}
+_API_PARAM_KINDS = {"pos_only", "pos_or_kw", "vararg", "kw_only", "kwarg"}
+_PUBLIC_SYMBOL_KINDS = {"function", "class", "method", "constant"}
+_EXPORTED_VIA_KINDS = {"all", "name"}
+
+
+def _is_compatible_metrics_schema(
+ *,
+ baseline_version: str | None,
+ expected_version: str,
+) -> bool:
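+    # Same-major, not-newer-minor compatibility: baseline "2.1" is accepted
+    # against expected "2.3", while "3.0" is not. Versions that are not
+    # plain "X.Y" fall back to exact string equality.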
+ if baseline_version is None:
+ return False
+ baseline_major_minor = _parse_major_minor(baseline_version)
+ expected_major_minor = _parse_major_minor(expected_version)
+ if baseline_major_minor is None or expected_major_minor is None:
+ return baseline_version == expected_version
+ baseline_major, baseline_minor = baseline_major_minor
+ expected_major, expected_minor = expected_major_minor
+ return baseline_major == expected_major and baseline_minor <= expected_minor
+
+
+def _parse_major_minor(version: str) -> tuple[int, int] | None:
+ parts = version.split(".")
+ if len(parts) != 2 or not all(part.isdigit() for part in parts):
+ return None
+ return int(parts[0]), int(parts[1])
+
+
+def _atomic_write_json(path: Path, payload: dict[str, object]) -> None:
+ _write_json_document_atomically(
+ path,
+ payload,
+ indent=True,
+ trailing_newline=True,
+ )
+
+
+def _load_json_object(path: Path) -> dict[str, object]:
+ try:
+ return _read_json_object(path)
+ except OSError as e:
+ raise BaselineValidationError(
+ f"Cannot read metrics baseline file at {path}: {e}",
+ status=MetricsBaselineStatus.INVALID_JSON,
+ ) from e
+ except JSONDecodeError as e:
+ raise BaselineValidationError(
+ f"Corrupted metrics baseline file at {path}: {e}",
+ status=MetricsBaselineStatus.INVALID_JSON,
+ ) from e
+ except TypeError:
+ raise BaselineValidationError(
+ f"Metrics baseline payload must be an object at {path}",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ ) from None
+
+
+def _validate_top_level_structure(payload: dict[str, object], *, path: Path) -> None:
+ validate_top_level_structure(
+ payload,
+ path=path,
+ required_keys=_TOP_LEVEL_REQUIRED_KEYS,
+ allowed_keys=_TOP_LEVEL_ALLOWED_KEYS,
+ schema_label="metrics baseline",
+ missing_status=MetricsBaselineStatus.MISSING_FIELDS,
+ extra_status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+
+
+def _validate_required_keys(
+ payload: dict[str, object],
+ required: frozenset[str],
+ *,
+ path: Path,
+) -> None:
+ missing = required - set(payload.keys())
+ if missing:
+ raise BaselineValidationError(
+ "Invalid metrics baseline schema at "
+ f"{path}: missing required fields: {', '.join(sorted(missing))}",
+ status=MetricsBaselineStatus.MISSING_FIELDS,
+ )
+
+
+def _validate_exact_keys(
+ payload: dict[str, object],
+ required: frozenset[str],
+ *,
+ path: Path,
+) -> None:
+ extra = set(payload.keys()) - set(required)
+ if extra:
+ raise BaselineValidationError(
+ "Invalid metrics baseline schema at "
+ f"{path}: unexpected fields: {', '.join(sorted(extra))}",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+
+
+def _require_str(payload: dict[str, object], key: str, *, path: Path) -> str:
+ value = payload.get(key)
+ if isinstance(value, str):
+ return value
+ raise BaselineValidationError(
+ f"Invalid metrics baseline schema at {path}: {key!r} must be str",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+
+
+def _extract_metrics_payload_sha256(
+ payload: dict[str, object],
+ *,
+ path: Path,
+) -> str:
+ direct = payload.get(_METRICS_PAYLOAD_SHA256_KEY)
+ if isinstance(direct, str):
+ return direct
+ return _require_str(payload, "payload_sha256", path=path)
+
+
+def _extract_optional_payload_sha256(
+ payload: dict[str, object],
+ *,
+ key: str,
+) -> str | None:
+ value = payload.get(key)
+ return value if isinstance(value, str) else None
+
+
+def _require_int(payload: dict[str, object], key: str, *, path: Path) -> int:
+ value = payload.get(key)
+ if isinstance(value, bool):
+ raise BaselineValidationError(
+ f"Invalid metrics baseline schema at {path}: {key!r} must be int",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+ if isinstance(value, int):
+ return value
+ raise BaselineValidationError(
+ f"Invalid metrics baseline schema at {path}: {key!r} must be int",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+
+
+def _optional_require_str(
+ payload: dict[str, object],
+ key: str,
+ *,
+ path: Path,
+) -> str | None:
+ value = payload.get(key)
+ if value is None:
+ return None
+ if isinstance(value, str):
+ return value
+ raise BaselineValidationError(
+ f"Invalid metrics baseline schema at {path}: {key!r} must be str",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+
+
+def _require_str_list(
+ payload: dict[str, object],
+ key: str,
+ *,
+ path: Path,
+) -> list[str]:
+ value = payload.get(key)
+ if not isinstance(value, list):
+ raise BaselineValidationError(
+ f"Invalid metrics baseline schema at {path}: {key!r} must be list[str]",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+ if not all(isinstance(item, str) for item in value):
+ raise BaselineValidationError(
+ f"Invalid metrics baseline schema at {path}: {key!r} must be list[str]",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+ return value
+
+
+def _parse_cycles(
+ payload: dict[str, object],
+ *,
+ key: str,
+ path: Path,
+) -> tuple[tuple[str, ...], ...]:
+ value = payload.get(key)
+ if not isinstance(value, list):
+ raise BaselineValidationError(
+ f"Invalid metrics baseline schema at {path}: {key!r} must be list",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+
+ cycles: list[tuple[str, ...]] = []
+ for cycle in value:
+ if not isinstance(cycle, list):
+ raise BaselineValidationError(
+ "Invalid metrics baseline schema at "
+ f"{path}: {key!r} cycle item must be list[str]",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+ if not all(isinstance(item, str) for item in cycle):
+ raise BaselineValidationError(
+ "Invalid metrics baseline schema at "
+ f"{path}: {key!r} cycle item must be list[str]",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+ cycles.append(tuple(cycle))
+ return tuple(sorted(set(cycles)))
+
+
+def _parse_generator(
+ meta: dict[str, object],
+ *,
+ path: Path,
+) -> tuple[str, str | None]:
+ generator = meta.get("generator")
+ if isinstance(generator, str):
+ version_value = meta.get("generator_version")
+ if version_value is None:
+ version_value = meta.get("codeclone_version")
+ if version_value is None:
+ return generator, None
+ if not isinstance(version_value, str):
+ raise BaselineValidationError(
+ "Invalid metrics baseline schema at "
+ f"{path}: generator_version must be str",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+ return generator, version_value
+
+ if isinstance(generator, dict):
+ allowed_keys = {"name", "version"}
+ extra = set(generator.keys()) - allowed_keys
+ if extra:
+ raise BaselineValidationError(
+ f"Invalid metrics baseline schema at {path}: "
+ f"unexpected generator keys: {', '.join(sorted(extra))}",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+ name = generator.get("name")
+ version = generator.get("version")
+ if not isinstance(name, str):
+ raise BaselineValidationError(
+ "Invalid metrics baseline schema at "
+ f"{path}: generator.name must be str",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+ if version is not None and not isinstance(version, str):
+ raise BaselineValidationError(
+ "Invalid metrics baseline schema at "
+ f"{path}: generator.version must be str",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+ return name, version if isinstance(version, str) else None
+
+ raise BaselineValidationError(
+ f"Invalid metrics baseline schema at {path}: generator must be object or str",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+
+
+def _require_embedded_clone_baseline_payload(
+ payload: dict[str, object],
+ *,
+ path: Path,
+) -> tuple[dict[str, object], dict[str, object]]:
+ meta_obj = payload.get("meta")
+ clones_obj = payload.get("clones")
+ if not isinstance(meta_obj, dict):
+ raise BaselineValidationError(
+ f"Invalid baseline schema at {path}: 'meta' must be object",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+ if not isinstance(clones_obj, dict):
+ raise BaselineValidationError(
+ f"Invalid baseline schema at {path}: 'clones' must be object",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+ _require_str(meta_obj, "payload_sha256", path=path)
+ _require_str(meta_obj, "python_tag", path=path)
+ _require_str(meta_obj, "created_at", path=path)
+ functions = clones_obj.get("functions")
+ blocks = clones_obj.get("blocks")
+ if not isinstance(functions, list) or not all(
+ isinstance(item, str) for item in functions
+ ):
+ raise BaselineValidationError(
+ f"Invalid baseline schema at {path}: 'clones.functions' must be list[str]",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+ if not isinstance(blocks, list) or not all(
+ isinstance(item, str) for item in blocks
+ ):
+ raise BaselineValidationError(
+ f"Invalid baseline schema at {path}: 'clones.blocks' must be list[str]",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+ return meta_obj, clones_obj
+
+
+def _resolve_embedded_schema_version(meta: dict[str, object], *, path: Path) -> str:
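+    # Embedded metrics reuse the clone baseline's v2+ schema_version as-is;
+    # older values are rewritten to the current BASELINE_SCHEMA_VERSION.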
+ raw_version = _require_str(meta, "schema_version", path=path)
+ parts = raw_version.split(".")
+ if len(parts) not in {2, 3} or not all(part.isdigit() for part in parts):
+ raise BaselineValidationError(
+ "Invalid baseline schema at "
+ f"{path}: 'schema_version' must be semver string",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+ major = int(parts[0])
+ if major >= 2:
+ return raw_version
+ return BASELINE_SCHEMA_VERSION
+
+
+def _parse_snapshot(
+ payload: dict[str, object],
+ *,
+ path: Path,
+) -> MetricsSnapshot:
+    grade = _require_health_grade(
+        _require_str(payload, "health_grade", path=path),
+        path=path,
+    )
+
+ return MetricsSnapshot(
+ max_complexity=_require_int(payload, "max_complexity", path=path),
+ high_risk_functions=tuple(
+ sorted(set(_require_str_list(payload, "high_risk_functions", path=path)))
+ ),
+ max_coupling=_require_int(payload, "max_coupling", path=path),
+ high_coupling_classes=tuple(
+ sorted(set(_require_str_list(payload, "high_coupling_classes", path=path)))
+ ),
+ max_cohesion=_require_int(payload, "max_cohesion", path=path),
+ low_cohesion_classes=tuple(
+ sorted(set(_require_str_list(payload, "low_cohesion_classes", path=path)))
+ ),
+ dependency_cycles=_parse_cycles(payload, key="dependency_cycles", path=path),
+ dependency_max_depth=_require_int(payload, "dependency_max_depth", path=path),
+ dead_code_items=tuple(
+ sorted(set(_require_str_list(payload, "dead_code_items", path=path)))
+ ),
+ health_score=_require_int(payload, "health_score", path=path),
+        health_grade=grade,
+ typing_param_permille=_optional_int(
+ payload,
+ "typing_param_permille",
+ path=path,
+ ),
+ typing_return_permille=_optional_int(
+ payload,
+ "typing_return_permille",
+ path=path,
+ ),
+ docstring_permille=_optional_int(payload, "docstring_permille", path=path),
+ typing_any_count=_optional_int(payload, "typing_any_count", path=path),
+ )
+
+
+def _optional_int(payload: dict[str, object], key: str, *, path: Path) -> int:
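+    # Absent optional counters default to 0 (older baselines may omit them).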
+ value = payload.get(key)
+ if value is None:
+ return 0
+ return _require_int(payload, key, path=path)
+
+
+def _require_health_grade(
+ value: str,
+ *,
+ path: Path,
+) -> Literal["A", "B", "C", "D", "F"]:
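+    # The equality chain returns literal constants, letting type checkers
+    # verify the Literal return type without a cast; the same pattern is used
+    # by the other kind validators below.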
+ if value == "A":
+ return "A"
+ if value == "B":
+ return "B"
+ if value == "C":
+ return "C"
+ if value == "D":
+ return "D"
+ if value == "F":
+ return "F"
+ raise BaselineValidationError(
+ f"Invalid metrics baseline schema at {path}: "
+ "'health_grade' must be one of A/B/C/D/F",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+
+
+def _require_api_param_kind(
+ value: str,
+ *,
+ path: Path,
+) -> Literal["pos_only", "pos_or_kw", "vararg", "kw_only", "kwarg"]:
+ if value == "pos_only":
+ return "pos_only"
+ if value == "pos_or_kw":
+ return "pos_or_kw"
+ if value == "vararg":
+ return "vararg"
+ if value == "kw_only":
+ return "kw_only"
+ if value == "kwarg":
+ return "kwarg"
+ raise BaselineValidationError(
+ f"Invalid metrics baseline schema at {path}: api param 'kind' is invalid",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+
+
+def _require_public_symbol_kind(
+ value: str,
+ *,
+ path: Path,
+) -> Literal["function", "class", "method", "constant"]:
+ if value == "function":
+ return "function"
+ if value == "class":
+ return "class"
+ if value == "method":
+ return "method"
+ if value == "constant":
+ return "constant"
+ raise BaselineValidationError(
+ f"Invalid metrics baseline schema at {path}: public symbol 'kind' is invalid",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+
+
+def _require_exported_via(
+ value: str,
+ *,
+ path: Path,
+) -> Literal["all", "name"]:
+ if value == "all":
+ return "all"
+ if value == "name":
+ return "name"
+ raise BaselineValidationError(
+ f"Invalid metrics baseline schema at {path}: "
+ "public symbol 'exported_via' is invalid",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+
+
+def _parse_api_surface_snapshot(
+ payload: object,
+ *,
+ path: Path,
+ root: Path | None = None,
+) -> ApiSurfaceSnapshot | None:
+ if payload is None:
+ return None
+ if not isinstance(payload, dict):
+ raise BaselineValidationError(
+ f"Invalid metrics baseline schema at {path}: 'api_surface' must be object",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+ raw_modules = payload.get("modules", [])
+ if not isinstance(raw_modules, list):
+ raise BaselineValidationError(
+ f"Invalid metrics baseline schema at {path}: "
+ "'api_surface.modules' must be list",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+ modules: list[ModuleApiSurface] = []
+ for raw_module in raw_modules:
+ if not isinstance(raw_module, dict):
+ raise BaselineValidationError(
+ f"Invalid metrics baseline schema at {path}: "
+ "api surface module must be object",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+ module = _require_str(raw_module, "module", path=path)
+ wire_filepath = _require_str(raw_module, "filepath", path=path)
+ filepath = runtime_filepath_from_wire(wire_filepath, root=root)
+ all_declared = _require_str_list_or_none(raw_module, "all_declared", path=path)
+ raw_symbols = raw_module.get("symbols", [])
+ if not isinstance(raw_symbols, list):
+ raise BaselineValidationError(
+ f"Invalid metrics baseline schema at {path}: "
+ "api surface symbols must be list",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+ symbols: list[PublicSymbol] = []
+ for raw_symbol in raw_symbols:
+ if not isinstance(raw_symbol, dict):
+ raise BaselineValidationError(
+ f"Invalid metrics baseline schema at {path}: "
+ "api surface symbol must be object",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+ local_name = _optional_require_str(raw_symbol, "local_name", path=path)
+ legacy_qualname = _optional_require_str(raw_symbol, "qualname", path=path)
+ if local_name is None and legacy_qualname is None:
+ raise BaselineValidationError(
+ f"Invalid metrics baseline schema at {path}: "
+ "api surface symbol requires 'local_name' or 'qualname'",
+ status=MetricsBaselineStatus.MISSING_FIELDS,
+ )
+ qualname = (
+ legacy_qualname
+ if local_name is None
+ else _compose_api_surface_qualname(
+ module=module,
+ local_name=local_name,
+ )
+ )
+ kind = _require_str(raw_symbol, "kind", path=path)
+ exported_via = _require_str(raw_symbol, "exported_via", path=path)
+ params_raw = raw_symbol.get("params", [])
+ if not isinstance(params_raw, list):
+ raise BaselineValidationError(
+ f"Invalid metrics baseline schema at {path}: "
+ "api surface params must be list",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+ params: list[ApiParamSpec] = []
+ for raw_param in params_raw:
+ if not isinstance(raw_param, dict):
+ raise BaselineValidationError(
+ f"Invalid metrics baseline schema at {path}: "
+ "api param must be object",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+ name = _require_str(raw_param, "name", path=path)
+ param_kind = _require_str(raw_param, "kind", path=path)
+ has_default = raw_param.get("has_default")
+ annotation_hash = _optional_require_str(
+ raw_param,
+ "annotation_hash",
+ path=path,
+ )
+ if not isinstance(has_default, bool):
+ raise BaselineValidationError(
+ f"Invalid metrics baseline schema at {path}: "
+ "api param 'has_default' must be bool",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+ params.append(
+ ApiParamSpec(
+ name=name,
+ kind=_require_api_param_kind(param_kind, path=path),
+ has_default=has_default,
+ annotation_hash=annotation_hash or "",
+ )
+ )
+ symbols.append(
+ PublicSymbol(
+ qualname=qualname or "",
+ kind=_require_public_symbol_kind(kind, path=path),
+ start_line=_require_int(raw_symbol, "start_line", path=path),
+ end_line=_require_int(raw_symbol, "end_line", path=path),
+ params=tuple(params),
+ returns_hash=_optional_require_str(
+ raw_symbol,
+ "returns_hash",
+ path=path,
+ )
+ or "",
+ exported_via=_require_exported_via(exported_via, path=path),
+ )
+ )
+ modules.append(
+ ModuleApiSurface(
+ module=module,
+ filepath=filepath,
+ symbols=tuple(sorted(symbols, key=lambda item: item.qualname)),
+ all_declared=tuple(all_declared) if all_declared is not None else None,
+ )
+ )
+ return ApiSurfaceSnapshot(
+ modules=tuple(sorted(modules, key=lambda item: (item.filepath, item.module)))
+ )
+
+
+def _require_str_list_or_none(
+ payload: dict[str, object],
+ key: str,
+ *,
+ path: Path,
+) -> list[str] | None:
+ value = payload.get(key)
+ if value is None:
+ return None
+ return _require_str_list(payload, key, path=path)
+
+
+__all__ = [
+ "_atomic_write_json",
+ "_extract_metrics_payload_sha256",
+ "_is_compatible_metrics_schema",
+ "_load_json_object",
+ "_optional_require_str",
+ "_parse_api_surface_snapshot",
+ "_parse_cycles",
+ "_parse_generator",
+ "_parse_snapshot",
+ "_require_embedded_clone_baseline_payload",
+ "_require_int",
+ "_require_str",
+ "_require_str_list",
+ "_resolve_embedded_schema_version",
+ "_validate_exact_keys",
+ "_validate_required_keys",
+ "_validate_top_level_structure",
+]
diff --git a/codeclone/baseline.py b/codeclone/baseline/clone_baseline.py
similarity index 53%
rename from codeclone/baseline.py
rename to codeclone/baseline/clone_baseline.py
index c16c08c..7422232 100644
--- a/codeclone/baseline.py
+++ b/codeclone/baseline/clone_baseline.py
@@ -6,83 +6,26 @@
from __future__ import annotations
-import hashlib
import hmac
import re
-import sys
-from datetime import datetime, timezone
-from enum import Enum
-from json import JSONDecodeError
from pathlib import Path
-from typing import TYPE_CHECKING, Any, Final
+from typing import TYPE_CHECKING
-import orjson
-
-from . import __version__
-from ._json_io import read_json_object as _read_json_object
-from ._json_io import write_json_document_atomically as _write_json_document_atomically
-from ._schema_validation import validate_top_level_structure
-from .contracts import (
+from .. import __version__
+from ..contracts import (
BASELINE_FINGERPRINT_VERSION,
BASELINE_SCHEMA_VERSION,
)
-from .errors import BaselineValidationError
-
-if TYPE_CHECKING:
- from collections.abc import Collection, Mapping
-
-# Any: baseline JSON parsing/serialization boundary. Values are validated
-# and narrowed before entering compatibility/integrity checks.
-
-BASELINE_GENERATOR = "codeclone"
-_BASELINE_SCHEMA_MAX_MINOR_BY_MAJOR = {1: 0, 2: 1}
-MAX_BASELINE_SIZE_BYTES = 5 * 1024 * 1024
-
-
-class BaselineStatus(str, Enum):
- OK = "ok"
- MISSING = "missing"
- TOO_LARGE = "too_large"
- INVALID_JSON = "invalid_json"
- INVALID_TYPE = "invalid_type"
- MISSING_FIELDS = "missing_fields"
- MISMATCH_SCHEMA_VERSION = "mismatch_schema_version"
- MISMATCH_FINGERPRINT_VERSION = "mismatch_fingerprint_version"
- MISMATCH_PYTHON_VERSION = "mismatch_python_version"
- GENERATOR_MISMATCH = "generator_mismatch"
- INTEGRITY_MISSING = "integrity_missing"
- INTEGRITY_FAILED = "integrity_failed"
-
-
-BASELINE_UNTRUSTED_STATUSES: Final[frozenset[BaselineStatus]] = frozenset(
- {
- BaselineStatus.MISSING,
- BaselineStatus.TOO_LARGE,
- BaselineStatus.INVALID_JSON,
- BaselineStatus.INVALID_TYPE,
- BaselineStatus.MISSING_FIELDS,
- BaselineStatus.MISMATCH_SCHEMA_VERSION,
- BaselineStatus.MISMATCH_FINGERPRINT_VERSION,
- BaselineStatus.MISMATCH_PYTHON_VERSION,
- BaselineStatus.GENERATOR_MISMATCH,
- BaselineStatus.INTEGRITY_MISSING,
- BaselineStatus.INTEGRITY_FAILED,
- }
+from ..contracts.errors import BaselineValidationError
+from ..utils.json_io import (
+ write_json_document_atomically as _write_json_document_atomically,
)
+from ..utils.schema_validation import validate_top_level_structure
+from . import trust as _trust
+from .diff import diff_clone_groups
-
-def coerce_baseline_status(
- raw_status: str | BaselineStatus | None,
-) -> BaselineStatus:
- if isinstance(raw_status, BaselineStatus):
- return raw_status
- if isinstance(raw_status, str):
- try:
- return BaselineStatus(raw_status)
- except ValueError:
- return BaselineStatus.INVALID_TYPE
- return BaselineStatus.INVALID_TYPE
-
+if TYPE_CHECKING:
+ from collections.abc import Mapping
_TOP_LEVEL_REQUIRED_KEYS = {"meta", "clones"}
_TOP_LEVEL_OPTIONAL_KEYS = {"metrics", "api_surface"}
@@ -98,7 +41,6 @@ def coerce_baseline_status(
_CLONES_REQUIRED_KEYS = {"functions", "blocks"}
_FUNCTION_ID_RE = re.compile(r"^[0-9a-f]{40}\|(?:\d+-\d+|\d+\+)$")
_BLOCK_ID_RE = re.compile(r"^[0-9a-f]{40}\|[0-9a-f]{40}\|[0-9a-f]{40}\|[0-9a-f]{40}$")
-_UTC_ISO8601_Z_RE = re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$")
class Baseline:
@@ -131,44 +73,44 @@ def load(
self,
*,
max_size_bytes: int | None = None,
- preloaded_payload: dict[str, Any] | None = None,
+ preloaded_payload: dict[str, object] | None = None,
) -> None:
try:
exists = self.path.exists()
except OSError as e:
raise BaselineValidationError(
f"Cannot stat baseline file at {self.path}: {e}",
- status=BaselineStatus.INVALID_TYPE,
+ status=_trust.BaselineStatus.INVALID_TYPE,
) from e
if not exists:
return
size_limit = (
- MAX_BASELINE_SIZE_BYTES if max_size_bytes is None else max_size_bytes
+ _trust.MAX_BASELINE_SIZE_BYTES if max_size_bytes is None else max_size_bytes
)
- size = _safe_stat_size(self.path)
+ size = _trust._safe_stat_size(self.path)
if size > size_limit:
raise BaselineValidationError(
"Baseline file is too large "
f"({size} bytes, max {size_limit} bytes) at {self.path}. "
"Increase --max-baseline-size-mb or regenerate baseline.",
- status=BaselineStatus.TOO_LARGE,
+ status=_trust.BaselineStatus.TOO_LARGE,
)
if preloaded_payload is None:
- payload = _load_json_object(self.path)
+ payload = _trust._load_json_object(self.path)
else:
if not isinstance(preloaded_payload, dict):
raise BaselineValidationError(
f"Baseline payload must be an object at {self.path}",
- status=BaselineStatus.INVALID_TYPE,
+ status=_trust.BaselineStatus.INVALID_TYPE,
)
payload = preloaded_payload
if _is_legacy_baseline_payload(payload):
raise BaselineValidationError(
"Baseline format is legacy (<=1.3.x) and must be regenerated. "
"Please run --update-baseline.",
- status=BaselineStatus.MISSING_FIELDS,
+ status=_trust.BaselineStatus.MISSING_FIELDS,
)
_validate_top_level_structure(payload, path=self.path)
@@ -178,21 +120,28 @@ def load(
if not isinstance(meta_obj, dict):
raise BaselineValidationError(
f"Invalid baseline schema at {self.path}: 'meta' must be object",
- status=BaselineStatus.INVALID_TYPE,
+ status=_trust.BaselineStatus.INVALID_TYPE,
)
if not isinstance(clones_obj, dict):
raise BaselineValidationError(
f"Invalid baseline schema at {self.path}: 'clones' must be object",
- status=BaselineStatus.INVALID_TYPE,
+ status=_trust.BaselineStatus.INVALID_TYPE,
)
_validate_required_keys(meta_obj, _META_REQUIRED_KEYS, path=self.path)
_validate_required_keys(clones_obj, _CLONES_REQUIRED_KEYS, path=self.path)
_validate_exact_clone_keys(clones_obj, path=self.path)
- generator, generator_version = _parse_generator_meta(meta_obj, path=self.path)
- schema_version = _require_semver_str(meta_obj, "schema_version", path=self.path)
- schema_major, _, _ = _parse_semver(
+ generator, generator_version = _trust._parse_generator_meta(
+ meta_obj,
+ path=self.path,
+ )
+ schema_version = _trust._require_semver_str(
+ meta_obj,
+ "schema_version",
+ path=self.path,
+ )
+ schema_major, _, _ = _trust._parse_semver(
schema_version,
key="schema_version",
path=self.path,
@@ -201,22 +150,28 @@ def load(
raise BaselineValidationError(
f"Invalid baseline schema at {self.path}: "
"top-level 'metrics' requires baseline schema >= 2.0.",
- status=BaselineStatus.MISMATCH_SCHEMA_VERSION,
+ status=_trust.BaselineStatus.MISMATCH_SCHEMA_VERSION,
)
- fingerprint_version = _require_str(
- meta_obj, "fingerprint_version", path=self.path
+ fingerprint_version = _trust._require_str(
+ meta_obj,
+ "fingerprint_version",
+ path=self.path,
)
- python_tag = _require_python_tag(meta_obj, "python_tag", path=self.path)
- created_at = _require_utc_iso8601_z(meta_obj, "created_at", path=self.path)
- payload_sha256 = _require_str(meta_obj, "payload_sha256", path=self.path)
+ python_tag = _trust._require_python_tag(meta_obj, "python_tag", path=self.path)
+ created_at = _trust._require_utc_iso8601_z(
+ meta_obj,
+ "created_at",
+ path=self.path,
+ )
+ payload_sha256 = _trust._require_str(meta_obj, "payload_sha256", path=self.path)
- function_ids = _require_sorted_unique_ids(
+ function_ids = _trust._require_sorted_unique_ids(
clones_obj,
"functions",
pattern=_FUNCTION_ID_RE,
path=self.path,
)
- block_ids = _require_sorted_unique_ids(
+ block_ids = _trust._require_sorted_unique_ids(
clones_obj,
"blocks",
pattern=_BLOCK_ID_RE,
@@ -298,60 +253,63 @@ def save(self) -> None:
self.payload_sha256 = payload_sha256
def verify_compatibility(self, *, current_python_tag: str) -> None:
- if self.generator != BASELINE_GENERATOR:
+ if self.generator != _trust.BASELINE_GENERATOR:
raise BaselineValidationError(
"Baseline generator mismatch: expected 'codeclone'.",
- status=BaselineStatus.GENERATOR_MISMATCH,
+ status=_trust.BaselineStatus.GENERATOR_MISMATCH,
)
if self.schema_version is None:
raise BaselineValidationError(
"Baseline schema version is missing.",
- status=BaselineStatus.MISSING_FIELDS,
+ status=_trust.BaselineStatus.MISSING_FIELDS,
)
if self.fingerprint_version is None:
raise BaselineValidationError(
"Baseline fingerprint version is missing.",
- status=BaselineStatus.MISSING_FIELDS,
+ status=_trust.BaselineStatus.MISSING_FIELDS,
)
if self.python_tag is None:
raise BaselineValidationError(
"Baseline python_tag is missing.",
- status=BaselineStatus.MISSING_FIELDS,
+ status=_trust.BaselineStatus.MISSING_FIELDS,
)
- schema_major, schema_minor, _ = _parse_semver(
- self.schema_version, key="schema_version", path=self.path
+ schema_major, schema_minor, _ = _trust._parse_semver(
+ self.schema_version,
+ key="schema_version",
+ path=self.path,
)
- max_minor = _BASELINE_SCHEMA_MAX_MINOR_BY_MAJOR.get(schema_major)
+ max_minor = _trust._BASELINE_SCHEMA_MAX_MINOR_BY_MAJOR.get(schema_major)
if max_minor is None:
supported = ",".join(
- str(major) for major in sorted(_BASELINE_SCHEMA_MAX_MINOR_BY_MAJOR)
+ str(major)
+ for major in sorted(_trust._BASELINE_SCHEMA_MAX_MINOR_BY_MAJOR)
)
raise BaselineValidationError(
"Baseline schema version mismatch: "
f"baseline={self.schema_version}, "
f"supported_majors={supported}.",
- status=BaselineStatus.MISMATCH_SCHEMA_VERSION,
+ status=_trust.BaselineStatus.MISMATCH_SCHEMA_VERSION,
)
if schema_minor > max_minor:
raise BaselineValidationError(
"Baseline schema version is newer than supported: "
f"baseline={self.schema_version}, "
f"max={schema_major}.{max_minor}.",
- status=BaselineStatus.MISMATCH_SCHEMA_VERSION,
+ status=_trust.BaselineStatus.MISMATCH_SCHEMA_VERSION,
)
if self.fingerprint_version != BASELINE_FINGERPRINT_VERSION:
raise BaselineValidationError(
"Baseline fingerprint version mismatch: "
f"baseline={self.fingerprint_version}, "
f"expected={BASELINE_FINGERPRINT_VERSION}.",
- status=BaselineStatus.MISMATCH_FINGERPRINT_VERSION,
+ status=_trust.BaselineStatus.MISMATCH_FINGERPRINT_VERSION,
)
if self.python_tag != current_python_tag:
raise BaselineValidationError(
"Baseline python tag mismatch: "
f"baseline={self.python_tag}, current={current_python_tag}.",
- status=BaselineStatus.MISMATCH_PYTHON_VERSION,
+ status=_trust.BaselineStatus.MISMATCH_PYTHON_VERSION,
)
self.verify_integrity()
@@ -359,36 +317,36 @@ def verify_integrity(self) -> None:
if not isinstance(self.payload_sha256, str):
raise BaselineValidationError(
"Baseline integrity payload hash is missing.",
- status=BaselineStatus.INTEGRITY_MISSING,
+ status=_trust.BaselineStatus.INTEGRITY_MISSING,
)
if len(self.payload_sha256) != 64:
raise BaselineValidationError(
"Baseline integrity payload hash is missing.",
- status=BaselineStatus.INTEGRITY_MISSING,
+ status=_trust.BaselineStatus.INTEGRITY_MISSING,
)
try:
int(self.payload_sha256, 16)
except ValueError as e:
raise BaselineValidationError(
"Baseline integrity payload hash is missing.",
- status=BaselineStatus.INTEGRITY_MISSING,
+ status=_trust.BaselineStatus.INTEGRITY_MISSING,
) from e
if self.schema_version is None:
raise BaselineValidationError(
"Baseline schema version is missing for integrity validation.",
- status=BaselineStatus.MISSING_FIELDS,
+ status=_trust.BaselineStatus.MISSING_FIELDS,
)
if self.fingerprint_version is None:
raise BaselineValidationError(
"Baseline fingerprint version is missing for integrity validation.",
- status=BaselineStatus.MISSING_FIELDS,
+ status=_trust.BaselineStatus.MISSING_FIELDS,
)
if self.python_tag is None:
raise BaselineValidationError(
"Baseline python_tag is missing for integrity validation.",
- status=BaselineStatus.MISSING_FIELDS,
+ status=_trust.BaselineStatus.MISSING_FIELDS,
)
- expected = _compute_payload_sha256(
+ expected = _trust._compute_payload_sha256(
functions=self.functions,
blocks=self.blocks,
fingerprint_version=self.fingerprint_version,
@@ -397,7 +355,7 @@ def verify_integrity(self) -> None:
if not hmac.compare_digest(self.payload_sha256, expected):
raise BaselineValidationError(
"Baseline integrity check failed: payload_sha256 mismatch.",
- status=BaselineStatus.INTEGRITY_FAILED,
+ status=_trust.BaselineStatus.INTEGRITY_FAILED,
)
@staticmethod
@@ -413,24 +371,27 @@ def from_groups(
baseline = Baseline(path)
baseline.functions = set(func_groups.keys())
baseline.blocks = set(block_groups.keys())
- baseline.generator = BASELINE_GENERATOR
+ baseline.generator = _trust.BASELINE_GENERATOR
baseline.schema_version = schema_version or BASELINE_SCHEMA_VERSION
baseline.fingerprint_version = (
fingerprint_version or BASELINE_FINGERPRINT_VERSION
)
- baseline.python_tag = python_tag or current_python_tag()
+ baseline.python_tag = python_tag or _trust.current_python_tag()
baseline.generator_version = generator_version or __version__
return baseline
def diff(
self, func_groups: Mapping[str, object], block_groups: Mapping[str, object]
) -> tuple[set[str], set[str]]:
- new_funcs = set(func_groups.keys()) - self.functions
- new_blocks = set(block_groups.keys()) - self.blocks
- return new_funcs, new_blocks
+ return diff_clone_groups(
+ known_functions=self.functions,
+ known_blocks=self.blocks,
+ func_groups=func_groups,
+ block_groups=block_groups,
+ )
-def _atomic_write_json(path: Path, payload: dict[str, Any]) -> None:
+def _atomic_write_json(path: Path, payload: dict[str, object]) -> None:
_write_json_document_atomically(
path,
payload,
@@ -439,80 +400,55 @@ def _atomic_write_json(path: Path, payload: dict[str, Any]) -> None:
)
-def _safe_stat_size(path: Path) -> int:
- try:
- return path.stat().st_size
- except OSError as e:
- raise BaselineValidationError(
- f"Cannot stat baseline file at {path}: {e}",
- status=BaselineStatus.INVALID_TYPE,
- ) from e
-
-
-def _load_json_object(path: Path) -> dict[str, Any]:
- try:
- return _read_json_object(path)
- except OSError as e:
- raise BaselineValidationError(
- f"Cannot read baseline file at {path}: {e}",
- status=BaselineStatus.INVALID_JSON,
- ) from e
- except JSONDecodeError as e:
- raise BaselineValidationError(
- f"Corrupted baseline file at {path}: {e}",
- status=BaselineStatus.INVALID_JSON,
- ) from e
- except TypeError:
- raise BaselineValidationError(
- f"Baseline payload must be an object at {path}",
- status=BaselineStatus.INVALID_TYPE,
- ) from None
-
-
-def _validate_top_level_structure(payload: dict[str, Any], *, path: Path) -> None:
+def _validate_top_level_structure(payload: dict[str, object], *, path: Path) -> None:
validate_top_level_structure(
payload,
path=path,
required_keys=_TOP_LEVEL_REQUIRED_KEYS,
allowed_keys=_TOP_LEVEL_ALLOWED_KEYS,
schema_label="baseline",
- missing_status=BaselineStatus.MISSING_FIELDS,
- extra_status=BaselineStatus.INVALID_TYPE,
+ missing_status=_trust.BaselineStatus.MISSING_FIELDS,
+ extra_status=_trust.BaselineStatus.INVALID_TYPE,
)
def _validate_required_keys(
- obj: dict[str, Any], required: set[str], *, path: Path
+ obj: dict[str, object], required: set[str], *, path: Path
) -> None:
missing = required - set(obj.keys())
if missing:
raise BaselineValidationError(
f"Invalid baseline schema at {path}: missing required fields: "
f"{', '.join(sorted(missing))}",
- status=BaselineStatus.MISSING_FIELDS,
+ status=_trust.BaselineStatus.MISSING_FIELDS,
)
-def _validate_exact_clone_keys(clones: dict[str, Any], *, path: Path) -> None:
+def _validate_exact_clone_keys(clones: dict[str, object], *, path: Path) -> None:
keys = set(clones.keys())
extra = keys - _CLONES_REQUIRED_KEYS
if extra:
raise BaselineValidationError(
f"Invalid baseline schema at {path}: unexpected clone keys: "
f"{', '.join(sorted(extra))}",
- status=BaselineStatus.INVALID_TYPE,
+ status=_trust.BaselineStatus.INVALID_TYPE,
)
-def _is_legacy_baseline_payload(payload: dict[str, Any]) -> bool:
+def _is_legacy_baseline_payload(payload: dict[str, object]) -> bool:
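+    # Baselines <=1.3.x stored clone ids at the top level instead of under "clones".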
return "functions" in payload and "blocks" in payload
def _preserve_embedded_metrics(
path: Path,
-) -> tuple[dict[str, Any] | None, str | None, dict[str, Any] | None, str | None]:
+) -> tuple[
+ dict[str, object] | None,
+ str | None,
+ dict[str, object] | None,
+ str | None,
+]:
try:
- payload = _load_json_object(path)
+ payload = _trust._load_json_object(path)
except BaselineValidationError:
return None, None, None, None
metrics_obj = payload.get("metrics")
@@ -545,45 +481,6 @@ def _preserve_embedded_metrics(
)
-def _parse_generator_meta(
- meta_obj: dict[str, Any], *, path: Path
-) -> tuple[str, str | None]:
- raw_generator = meta_obj.get("generator")
-
- if isinstance(raw_generator, str):
- generator_version = _optional_str(meta_obj, "generator_version", path=path)
- if generator_version is None:
- # Legacy alias for baselines produced before generator_version rename.
- generator_version = _optional_str(meta_obj, "codeclone_version", path=path)
- return raw_generator, generator_version
-
- if isinstance(raw_generator, dict):
- allowed_keys = {"name", "version"}
- extra = set(raw_generator.keys()) - allowed_keys
- if extra:
- raise BaselineValidationError(
- f"Invalid baseline schema at {path}: unexpected generator keys: "
- f"{', '.join(sorted(extra))}",
- status=BaselineStatus.INVALID_TYPE,
- )
- generator_name = _require_str(raw_generator, "name", path=path)
- generator_version = _optional_str(raw_generator, "version", path=path)
-
- if generator_version is None:
- generator_version = _optional_str(meta_obj, "generator_version", path=path)
- if generator_version is None:
- generator_version = _optional_str(
- meta_obj, "codeclone_version", path=path
- )
-
- return generator_name, generator_version
-
- raise BaselineValidationError(
- f"Invalid baseline schema at {path}: 'generator' must be string or object",
- status=BaselineStatus.INVALID_TYPE,
- )
-
-
def _baseline_payload(
*,
functions: set[str],
@@ -594,17 +491,17 @@ def _baseline_payload(
python_tag: str | None,
generator_version: str | None,
created_at: str | None,
-) -> dict[str, Any]:
- resolved_generator = generator or BASELINE_GENERATOR
+) -> dict[str, object]:
+ resolved_generator = generator or _trust.BASELINE_GENERATOR
resolved_schema = schema_version or BASELINE_SCHEMA_VERSION
resolved_fingerprint = fingerprint_version or BASELINE_FINGERPRINT_VERSION
- resolved_python_tag = python_tag or current_python_tag()
+ resolved_python_tag = python_tag or _trust.current_python_tag()
resolved_generator_version = generator_version or __version__
- resolved_created_at = created_at or _utc_now_z()
+ resolved_created_at = created_at or _trust._utc_now_z()
sorted_functions = sorted(functions)
sorted_blocks = sorted(blocks)
- payload_sha256 = _compute_payload_sha256(
+ payload_sha256 = _trust._compute_payload_sha256(
functions=sorted_functions,
blocks=sorted_blocks,
fingerprint_version=resolved_fingerprint,
@@ -630,138 +527,11 @@ def _baseline_payload(
}
-def _compute_payload_sha256(
- *,
- functions: Collection[str],
- blocks: Collection[str],
- fingerprint_version: str,
- python_tag: str,
-) -> str:
- canonical = {
- "blocks": sorted(blocks),
- "fingerprint_version": fingerprint_version,
- "functions": sorted(functions),
- "python_tag": python_tag,
- }
- serialized = orjson.dumps(canonical, option=orjson.OPT_SORT_KEYS)
- return hashlib.sha256(serialized).hexdigest()
-
-
-def current_python_tag() -> str:
- """Return the interpreter compatibility tag as an immutable string."""
- impl = sys.implementation.name
- major, minor = sys.version_info[:2]
- prefix = "cp" if impl == "cpython" else impl[:2]
- return f"{prefix}{major}{minor}"
-
-
-def _utc_now_z() -> str:
- return (
- datetime.now(timezone.utc).replace(microsecond=0).strftime("%Y-%m-%dT%H:%M:%SZ")
- )
-
-
-def _require_str(obj: dict[str, Any], key: str, *, path: Path) -> str:
- value = obj.get(key)
- if not isinstance(value, str):
- raise BaselineValidationError(
- f"Invalid baseline schema at {path}: '{key}' must be string",
- status=BaselineStatus.INVALID_TYPE,
- )
- return value
-
-
-def _optional_str(obj: dict[str, Any], key: str, *, path: Path) -> str | None:
- value = obj.get(key)
- if value is None:
- return None
- if not isinstance(value, str):
- raise BaselineValidationError(
- f"Invalid baseline schema at {path}: '{key}' must be string",
- status=BaselineStatus.INVALID_TYPE,
- )
- return value
-
-
-def _require_semver_str(obj: dict[str, Any], key: str, *, path: Path) -> str:
- value = _require_str(obj, key, path=path)
- _parse_semver(value, key=key, path=path)
- return value
-
-
-def _parse_semver(value: str, *, key: str, path: Path) -> tuple[int, int, int]:
- parts = value.split(".")
- if len(parts) not in {2, 3} or not all(part.isdigit() for part in parts):
- raise BaselineValidationError(
- f"Invalid baseline schema at {path}: '{key}' must be semver string",
- status=BaselineStatus.INVALID_TYPE,
- )
- if len(parts) == 2:
- major, minor = int(parts[0]), int(parts[1])
- patch = 0
- else:
- major, minor, patch = int(parts[0]), int(parts[1]), int(parts[2])
- return major, minor, patch
-
-
-def _require_python_tag(obj: dict[str, Any], key: str, *, path: Path) -> str:
- value = _require_str(obj, key, path=path)
- if not re.fullmatch(r"[a-z]{2}\d{2,3}", value):
- raise BaselineValidationError(
- f"Invalid baseline schema at {path}: '{key}' must look like 'cp313'",
- status=BaselineStatus.INVALID_TYPE,
- )
- return value
-
-
-def _require_utc_iso8601_z(obj: dict[str, Any], key: str, *, path: Path) -> str:
- value = _require_str(obj, key, path=path)
- if not _UTC_ISO8601_Z_RE.fullmatch(value):
- raise BaselineValidationError(
- f"Invalid baseline schema at {path}: '{key}' must be UTC ISO-8601 with Z",
- status=BaselineStatus.INVALID_TYPE,
- )
- try:
- datetime(
- int(value[0:4]),
- int(value[5:7]),
- int(value[8:10]),
- int(value[11:13]),
- int(value[14:16]),
- int(value[17:19]),
- tzinfo=timezone.utc,
- )
- except ValueError as e:
- raise BaselineValidationError(
- f"Invalid baseline schema at {path}: '{key}' must be UTC ISO-8601 with Z",
- status=BaselineStatus.INVALID_TYPE,
- ) from e
- return value
-
-
-def _require_sorted_unique_ids(
- obj: dict[str, Any], key: str, *, pattern: re.Pattern[str], path: Path
-) -> list[str]:
- value = obj.get(key)
- if not isinstance(value, list):
- raise BaselineValidationError(
- f"Invalid baseline schema at {path}: '{key}' must be list[str]",
- status=BaselineStatus.INVALID_TYPE,
- )
- if not all(isinstance(item, str) for item in value):
- raise BaselineValidationError(
- f"Invalid baseline schema at {path}: '{key}' must be list[str]",
- status=BaselineStatus.INVALID_TYPE,
- )
- values = list(value)
- if values != sorted(values) or len(values) != len(set(values)):
- raise BaselineValidationError(
- f"Invalid baseline schema at {path}: '{key}' must be sorted and unique",
- status=BaselineStatus.INVALID_TYPE,
- )
- if not all(pattern.fullmatch(item) for item in values):
- raise BaselineValidationError(
- f"Invalid baseline schema at {path}: '{key}' has invalid id format",
- status=BaselineStatus.INVALID_TYPE,
- )
- return values
+__all__ = [
+ "_BLOCK_ID_RE",
+ "_FUNCTION_ID_RE",
+ "Baseline",
+ "_atomic_write_json",
+ "_baseline_payload",
+ "_preserve_embedded_metrics",
+]
diff --git a/codeclone/baseline/diff.py b/codeclone/baseline/diff.py
new file mode 100644
index 0000000..8c6ca2c
--- /dev/null
+++ b/codeclone/baseline/diff.py
@@ -0,0 +1,111 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from ..metrics.api_surface import compare_api_surfaces
+from ..models import (
+    ApiBreakingChange,
+    ApiSurfaceSnapshot,
+    MetricsDiff,
+    MetricsSnapshot,
+)
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping, Set
+
+
+def diff_clone_groups(
+ *,
+ known_functions: Set[str],
+ known_blocks: Set[str],
+ func_groups: Mapping[str, object],
+ block_groups: Mapping[str, object],
+) -> tuple[set[str], set[str]]:
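+    # Plain set difference: anything not already recorded in the baseline is new.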
+ new_funcs = set(func_groups.keys()) - known_functions
+ new_blocks = set(block_groups.keys()) - known_blocks
+ return new_funcs, new_blocks
+
+
+def diff_metrics(
+ *,
+ baseline_snapshot: MetricsSnapshot | None,
+ current_snapshot: MetricsSnapshot,
+ baseline_api_surface: ApiSurfaceSnapshot | None,
+ current_api_surface: ApiSurfaceSnapshot | None,
+) -> MetricsDiff:
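+    # A missing baseline snapshot falls back to an all-zero, grade-F snapshot,
+    # so every current finding is reported as new and health_delta equals the
+    # current health score.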
+ snapshot = baseline_snapshot or MetricsSnapshot(
+ max_complexity=0,
+ high_risk_functions=(),
+ max_coupling=0,
+ high_coupling_classes=(),
+ max_cohesion=0,
+ low_cohesion_classes=(),
+ dependency_cycles=(),
+ dependency_max_depth=0,
+ dead_code_items=(),
+ health_score=0,
+ health_grade="F",
+ typing_param_permille=0,
+ typing_return_permille=0,
+ docstring_permille=0,
+ typing_any_count=0,
+ )
+
+ new_high_risk_functions = tuple(
+ sorted(
+ set(current_snapshot.high_risk_functions)
+ - set(snapshot.high_risk_functions)
+ )
+ )
+ new_high_coupling_classes = tuple(
+ sorted(
+ set(current_snapshot.high_coupling_classes)
+ - set(snapshot.high_coupling_classes)
+ )
+ )
+ new_cycles = tuple(
+ sorted(
+ set(current_snapshot.dependency_cycles) - set(snapshot.dependency_cycles)
+ )
+ )
+ new_dead_code = tuple(
+ sorted(set(current_snapshot.dead_code_items) - set(snapshot.dead_code_items))
+ )
+
+ if baseline_api_surface is None:
+ added_api_symbols: tuple[str, ...] = ()
+ api_breaking_changes: tuple[ApiBreakingChange, ...] = ()
+ else:
+ added_api_symbols, api_breaking_changes = compare_api_surfaces(
+ baseline=baseline_api_surface,
+ current=current_api_surface,
+ strict_types=False,
+ )
+
+ return MetricsDiff(
+ new_high_risk_functions=new_high_risk_functions,
+ new_high_coupling_classes=new_high_coupling_classes,
+ new_cycles=new_cycles,
+ new_dead_code=new_dead_code,
+ health_delta=current_snapshot.health_score - snapshot.health_score,
+ typing_param_permille_delta=(
+ current_snapshot.typing_param_permille - snapshot.typing_param_permille
+ ),
+ typing_return_permille_delta=(
+ current_snapshot.typing_return_permille - snapshot.typing_return_permille
+ ),
+ docstring_permille_delta=(
+ current_snapshot.docstring_permille - snapshot.docstring_permille
+ ),
+ new_api_symbols=added_api_symbols,
+ new_api_breaking_changes=api_breaking_changes,
+ )
+
+
+__all__ = ["diff_clone_groups", "diff_metrics"]
diff --git a/codeclone/baseline/metrics_baseline.py b/codeclone/baseline/metrics_baseline.py
new file mode 100644
index 0000000..2aecf3c
--- /dev/null
+++ b/codeclone/baseline/metrics_baseline.py
@@ -0,0 +1,497 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+import hmac
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from json import JSONDecodeError
+from pathlib import Path
+
+import orjson
+
+from .. import __version__
+from ..contracts import BASELINE_SCHEMA_VERSION, METRICS_BASELINE_SCHEMA_VERSION
+from ..contracts.errors import BaselineValidationError
+from ..models import ApiSurfaceSnapshot, MetricsDiff, MetricsSnapshot, ProjectMetrics
+from ._metrics_baseline_contract import (
+ _API_SURFACE_PAYLOAD_SHA256_KEY,
+ _META_REQUIRED_KEYS,
+ _METRICS_OPTIONAL_KEYS,
+ _METRICS_PAYLOAD_SHA256_KEY,
+ _METRICS_REQUIRED_KEYS,
+ MAX_METRICS_BASELINE_SIZE_BYTES,
+ METRICS_BASELINE_GENERATOR,
+ METRICS_BASELINE_UNTRUSTED_STATUSES,
+ MetricsBaselineStatus,
+ coerce_metrics_baseline_status,
+)
+from ._metrics_baseline_payload import (
+ _build_payload,
+ _compute_api_surface_payload_sha256,
+ _compute_legacy_api_surface_payload_sha256,
+ _compute_payload_sha256,
+ _has_coverage_adoption_snapshot,
+ snapshot_from_project_metrics,
+)
+from ._metrics_baseline_validation import (
+ _atomic_write_json,
+ _extract_metrics_payload_sha256,
+ _extract_optional_payload_sha256,
+ _is_compatible_metrics_schema,
+ _load_json_object,
+ _optional_require_str,
+ _parse_api_surface_snapshot,
+ _parse_generator,
+ _parse_snapshot,
+ _require_embedded_clone_baseline_payload,
+ _require_str,
+ _resolve_embedded_schema_version,
+ _validate_exact_keys,
+ _validate_required_keys,
+ _validate_top_level_structure,
+)
+from .diff import diff_metrics
+from .trust import current_python_tag
+
+
+@dataclass(frozen=True, slots=True)
+class MetricsBaselineSectionProbe:
+ has_metrics_section: bool
+ payload: dict[str, object] | None
+
+
+def _now_utc_z() -> str:
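+    # Second-resolution UTC timestamp with a trailing "Z",
+    # e.g. "2026-01-02T03:04:05Z".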
+ return (
+ datetime.now(timezone.utc)
+ .replace(microsecond=0)
+ .isoformat()
+ .replace("+00:00", "Z")
+ )
+
+
+def probe_metrics_baseline_section(path: Path) -> MetricsBaselineSectionProbe:
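+    # A file that exists but cannot be read or parsed is reported as
+    # has_metrics_section=True with payload=None, so callers can tell it
+    # apart from a missing file.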
+ if not path.exists():
+ return MetricsBaselineSectionProbe(
+ has_metrics_section=False,
+ payload=None,
+ )
+ try:
+ raw_payload = orjson.loads(path.read_text("utf-8"))
+ except (OSError, JSONDecodeError):
+ return MetricsBaselineSectionProbe(
+ has_metrics_section=True,
+ payload=None,
+ )
+ if not isinstance(raw_payload, dict):
+ return MetricsBaselineSectionProbe(
+ has_metrics_section=True,
+ payload=None,
+ )
+ payload = dict(raw_payload)
+ return MetricsBaselineSectionProbe(
+ has_metrics_section=("metrics" in payload),
+ payload=payload,
+ )
+
+
+class MetricsBaseline:
+ __slots__ = (
+ "api_surface_payload_sha256",
+ "api_surface_snapshot",
+ "created_at",
+ "generator_name",
+ "generator_version",
+ "has_coverage_adoption_snapshot",
+ "is_embedded_in_clone_baseline",
+ "path",
+ "payload_sha256",
+ "python_tag",
+ "schema_version",
+ "snapshot",
+ )
+
+ def __init__(self, path: str | Path) -> None:
+ self.path = Path(path)
+ self.generator_name: str | None = None
+ self.generator_version: str | None = None
+ self.schema_version: str | None = None
+ self.python_tag: str | None = None
+ self.created_at: str | None = None
+ self.payload_sha256: str | None = None
+ self.snapshot: MetricsSnapshot | None = None
+ self.has_coverage_adoption_snapshot = False
+ self.api_surface_payload_sha256: str | None = None
+ self.api_surface_snapshot: ApiSurfaceSnapshot | None = None
+ self.is_embedded_in_clone_baseline = False
+
+ def load(
+ self,
+ *,
+ max_size_bytes: int | None = None,
+ preloaded_payload: dict[str, object] | None = None,
+ ) -> None:
+ try:
+ exists = self.path.exists()
+ except OSError as e:
+ raise BaselineValidationError(
+ f"Cannot stat metrics baseline file at {self.path}: {e}",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ ) from e
+ if not exists:
+ return
+
+ size_limit = (
+ MAX_METRICS_BASELINE_SIZE_BYTES
+ if max_size_bytes is None
+ else max_size_bytes
+ )
+ try:
+ file_size = self.path.stat().st_size
+ except OSError as e:
+ raise BaselineValidationError(
+ f"Cannot stat metrics baseline file at {self.path}: {e}",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ ) from e
+ if file_size > size_limit:
+ raise BaselineValidationError(
+ "Metrics baseline file is too large "
+ f"({file_size} bytes, max {size_limit} bytes) at {self.path}.",
+ status=MetricsBaselineStatus.TOO_LARGE,
+ )
+
+ if preloaded_payload is None:
+ payload = _load_json_object(self.path)
+ else:
+ if not isinstance(preloaded_payload, dict):
+ raise BaselineValidationError(
+ f"Metrics baseline payload must be an object at {self.path}",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+ payload = preloaded_payload
+
+ _validate_top_level_structure(payload, path=self.path)
+ self.is_embedded_in_clone_baseline = "clones" in payload
+
+ meta_obj = payload.get("meta")
+ metrics_obj = payload.get("metrics")
+ if not isinstance(meta_obj, dict):
+ raise BaselineValidationError(
+ f"Invalid metrics baseline schema at {self.path}: "
+ "'meta' must be object",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+ if not isinstance(metrics_obj, dict):
+ raise BaselineValidationError(
+ f"Invalid metrics baseline schema at {self.path}: "
+ "'metrics' must be object",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+
+ _validate_required_keys(meta_obj, _META_REQUIRED_KEYS, path=self.path)
+ _validate_required_keys(metrics_obj, _METRICS_REQUIRED_KEYS, path=self.path)
+ _validate_exact_keys(
+ metrics_obj,
+ _METRICS_REQUIRED_KEYS | _METRICS_OPTIONAL_KEYS,
+ path=self.path,
+ )
+
+ generator_name, generator_version = _parse_generator(meta_obj, path=self.path)
+ self.generator_name = generator_name
+ self.generator_version = generator_version
+ self.schema_version = _require_str(meta_obj, "schema_version", path=self.path)
+ self.python_tag = _require_str(meta_obj, "python_tag", path=self.path)
+ self.created_at = _require_str(meta_obj, "created_at", path=self.path)
+ self.payload_sha256 = _extract_metrics_payload_sha256(
+ meta_obj,
+ path=self.path,
+ )
+ self.api_surface_payload_sha256 = _extract_optional_payload_sha256(
+ meta_obj,
+ key=_API_SURFACE_PAYLOAD_SHA256_KEY,
+ )
+ self.snapshot = _parse_snapshot(metrics_obj, path=self.path)
+ self.has_coverage_adoption_snapshot = _has_coverage_adoption_snapshot(
+ metrics_obj
+ )
+ self.api_surface_snapshot = _parse_api_surface_snapshot(
+ payload.get("api_surface"),
+ path=self.path,
+ root=self.path.parent,
+ )
+
+ def save(self) -> None:
+ if self.snapshot is None:
+ raise BaselineValidationError(
+ "Metrics baseline snapshot is missing.",
+ status=MetricsBaselineStatus.MISSING_FIELDS,
+ )
+
+ payload = _build_payload(
+ snapshot=self.snapshot,
+ schema_version=self.schema_version or METRICS_BASELINE_SCHEMA_VERSION,
+ python_tag=self.python_tag or current_python_tag(),
+ generator_name=self.generator_name or METRICS_BASELINE_GENERATOR,
+ generator_version=self.generator_version or __version__,
+ created_at=self.created_at or _now_utc_z(),
+ include_adoption=self.has_coverage_adoption_snapshot,
+ api_surface_snapshot=self.api_surface_snapshot,
+ api_surface_root=self.path.parent,
+ )
+ payload_meta = payload.get("meta")
+ if not isinstance(payload_meta, dict):
+ raise BaselineValidationError(
+ f"Invalid metrics baseline schema at {self.path}: "
+ "'meta' must be object",
+ status=MetricsBaselineStatus.INVALID_TYPE,
+ )
+ payload_metrics_hash = _require_str(
+ payload_meta,
+ "payload_sha256",
+ path=self.path,
+ )
+ payload_api_surface_hash = _optional_require_str(
+ payload_meta,
+ _API_SURFACE_PAYLOAD_SHA256_KEY,
+ path=self.path,
+ )
+
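+        # If the target file already holds a clone baseline, merge the metrics
+        # section into it rather than overwriting the clone data.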
+ existing: dict[str, object] | None = None
+ try:
+ if self.path.exists():
+ loaded = _load_json_object(self.path)
+ if "clones" in loaded:
+ existing = loaded
+ except BaselineValidationError as e:
+ raise BaselineValidationError(
+ f"Cannot read existing baseline file at {self.path}: {e}",
+ status=MetricsBaselineStatus.INVALID_JSON,
+ ) from e
+
+ if existing is not None:
+ existing_meta, clones_obj = _require_embedded_clone_baseline_payload(
+ existing,
+ path=self.path,
+ )
+ merged_schema_version = _resolve_embedded_schema_version(
+ existing_meta,
+ path=self.path,
+ )
+ merged_meta = dict(existing_meta)
+ merged_meta["schema_version"] = merged_schema_version
+ merged_meta[_METRICS_PAYLOAD_SHA256_KEY] = payload_metrics_hash
+ if payload_api_surface_hash is None:
+ merged_meta.pop(_API_SURFACE_PAYLOAD_SHA256_KEY, None)
+ else:
+ merged_meta[_API_SURFACE_PAYLOAD_SHA256_KEY] = payload_api_surface_hash
+ merged_payload: dict[str, object] = {
+ "meta": merged_meta,
+ "clones": clones_obj,
+ "metrics": payload["metrics"],
+ }
+ api_surface_payload = payload.get("api_surface")
+ if api_surface_payload is not None:
+ merged_payload["api_surface"] = api_surface_payload
+ self.path.parent.mkdir(parents=True, exist_ok=True)
+ _atomic_write_json(self.path, merged_payload)
+ self.is_embedded_in_clone_baseline = True
+ self.schema_version = merged_schema_version
+ self.python_tag = _require_str(merged_meta, "python_tag", path=self.path)
+ self.created_at = _require_str(merged_meta, "created_at", path=self.path)
+ self.payload_sha256 = _require_str(
+ merged_meta,
+ _METRICS_PAYLOAD_SHA256_KEY,
+ path=self.path,
+ )
+ self.api_surface_payload_sha256 = _optional_require_str(
+ merged_meta,
+ _API_SURFACE_PAYLOAD_SHA256_KEY,
+ path=self.path,
+ )
+ self.generator_name, self.generator_version = _parse_generator(
+ merged_meta,
+ path=self.path,
+ )
+ return
+
+ self.path.parent.mkdir(parents=True, exist_ok=True)
+ _atomic_write_json(self.path, payload)
+ self.is_embedded_in_clone_baseline = False
+ self.schema_version = _require_str(
+ payload_meta,
+ "schema_version",
+ path=self.path,
+ )
+ self.python_tag = _require_str(
+ payload_meta,
+ "python_tag",
+ path=self.path,
+ )
+ self.created_at = _require_str(
+ payload_meta,
+ "created_at",
+ path=self.path,
+ )
+ self.payload_sha256 = payload_metrics_hash
+ self.api_surface_payload_sha256 = payload_api_surface_hash
+
+ def verify_compatibility(self, *, runtime_python_tag: str) -> None:
+ if self.generator_name != METRICS_BASELINE_GENERATOR:
+ raise BaselineValidationError(
+ "Metrics baseline generator mismatch: expected 'codeclone'.",
+ status=MetricsBaselineStatus.GENERATOR_MISMATCH,
+ )
+ expected_schema = (
+ BASELINE_SCHEMA_VERSION
+ if self.is_embedded_in_clone_baseline
+ else METRICS_BASELINE_SCHEMA_VERSION
+ )
+ if not _is_compatible_metrics_schema(
+ baseline_version=self.schema_version,
+ expected_version=expected_schema,
+ ):
+ raise BaselineValidationError(
+ "Metrics baseline schema version mismatch: "
+ f"baseline={self.schema_version}, "
+ f"expected={expected_schema}.",
+ status=MetricsBaselineStatus.MISMATCH_SCHEMA_VERSION,
+ )
+ if self.python_tag != runtime_python_tag:
+ raise BaselineValidationError(
+ "Metrics baseline python tag mismatch: "
+ f"baseline={self.python_tag}, current={runtime_python_tag}.",
+ status=MetricsBaselineStatus.MISMATCH_PYTHON_VERSION,
+ )
+ self.verify_integrity()
+
+ def verify_integrity(self) -> None:
+ if self.snapshot is None:
+ raise BaselineValidationError(
+ "Metrics baseline snapshot is missing.",
+ status=MetricsBaselineStatus.MISSING_FIELDS,
+ )
+ if not isinstance(self.payload_sha256, str) or len(self.payload_sha256) != 64:
+ raise BaselineValidationError(
+ "Metrics baseline integrity payload hash is missing.",
+ status=MetricsBaselineStatus.INTEGRITY_MISSING,
+ )
+
+ expected = _compute_payload_sha256(
+ self.snapshot,
+ include_adoption=self.has_coverage_adoption_snapshot,
+ )
+ if not hmac.compare_digest(self.payload_sha256, expected):
+ raise BaselineValidationError(
+ "Metrics baseline integrity check failed: payload_sha256 mismatch.",
+ status=MetricsBaselineStatus.INTEGRITY_FAILED,
+ )
+
+ if self.api_surface_snapshot is None:
+ return
+ if (
+ not isinstance(self.api_surface_payload_sha256, str)
+ or len(self.api_surface_payload_sha256) != 64
+ ):
+ raise BaselineValidationError(
+ "Metrics baseline API surface integrity payload hash is missing.",
+ status=MetricsBaselineStatus.INTEGRITY_MISSING,
+ )
+
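+        # Accept the current digest plus three legacy layouts (absolute-path
+        # and/or qualname-based hashing) so older baselines keep verifying.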
+ expected_api = _compute_api_surface_payload_sha256(
+ self.api_surface_snapshot,
+ root=self.path.parent,
+ )
+ legacy_absolute_expected_api = _compute_api_surface_payload_sha256(
+ self.api_surface_snapshot
+ )
+ legacy_expected_api = _compute_legacy_api_surface_payload_sha256(
+ self.api_surface_snapshot,
+ root=self.path.parent,
+ )
+ legacy_absolute_qualname_expected_api = (
+ _compute_legacy_api_surface_payload_sha256(self.api_surface_snapshot)
+ )
+ if not (
+ hmac.compare_digest(self.api_surface_payload_sha256, expected_api)
+ or hmac.compare_digest(
+ self.api_surface_payload_sha256,
+ legacy_absolute_expected_api,
+ )
+ or hmac.compare_digest(
+ self.api_surface_payload_sha256,
+ legacy_expected_api,
+ )
+ or hmac.compare_digest(
+ self.api_surface_payload_sha256,
+ legacy_absolute_qualname_expected_api,
+ )
+ ):
+ raise BaselineValidationError(
+ "Metrics baseline integrity check failed: "
+ "api_surface payload_sha256 mismatch.",
+ status=MetricsBaselineStatus.INTEGRITY_FAILED,
+ )
+
+ @staticmethod
+ def from_project_metrics(
+ *,
+ project_metrics: ProjectMetrics,
+ path: str | Path,
+ schema_version: str | None = None,
+ python_tag: str | None = None,
+ generator_version: str | None = None,
+ include_adoption: bool = True,
+ include_api_surface: bool = True,
+ ) -> MetricsBaseline:
+ baseline = MetricsBaseline(path)
+ baseline.generator_name = METRICS_BASELINE_GENERATOR
+ baseline.generator_version = generator_version or __version__
+ baseline.schema_version = schema_version or METRICS_BASELINE_SCHEMA_VERSION
+ baseline.python_tag = python_tag or current_python_tag()
+ baseline.created_at = _now_utc_z()
+ baseline.snapshot = snapshot_from_project_metrics(project_metrics)
+ baseline.payload_sha256 = _compute_payload_sha256(
+ baseline.snapshot,
+ include_adoption=include_adoption,
+ )
+ baseline.has_coverage_adoption_snapshot = include_adoption
+ baseline.api_surface_snapshot = (
+ project_metrics.api_surface if include_api_surface else None
+ )
+ baseline.api_surface_payload_sha256 = (
+ _compute_api_surface_payload_sha256(
+ baseline.api_surface_snapshot,
+ root=baseline.path.parent,
+ )
+ if baseline.api_surface_snapshot is not None
+ else None
+ )
+ return baseline
+
+ def diff(self, current: ProjectMetrics) -> MetricsDiff:
+ return diff_metrics(
+ baseline_snapshot=self.snapshot,
+ current_snapshot=snapshot_from_project_metrics(current),
+ baseline_api_surface=self.api_surface_snapshot,
+ current_api_surface=current.api_surface,
+ )
+
+
+__all__ = [
+ "BASELINE_SCHEMA_VERSION",
+ "MAX_METRICS_BASELINE_SIZE_BYTES",
+ "METRICS_BASELINE_GENERATOR",
+ "METRICS_BASELINE_SCHEMA_VERSION",
+ "METRICS_BASELINE_UNTRUSTED_STATUSES",
+ "MetricsBaseline",
+ "MetricsBaselineSectionProbe",
+ "MetricsBaselineStatus",
+ "coerce_metrics_baseline_status",
+ "current_python_tag",
+ "probe_metrics_baseline_section",
+ "snapshot_from_project_metrics",
+]
diff --git a/codeclone/baseline/trust.py b/codeclone/baseline/trust.py
new file mode 100644
index 0000000..fa8179c
--- /dev/null
+++ b/codeclone/baseline/trust.py
@@ -0,0 +1,303 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+import hashlib
+import re
+import sys
+from datetime import datetime, timezone
+from enum import Enum
+from json import JSONDecodeError
+from pathlib import Path
+from typing import TYPE_CHECKING, Final
+
+import orjson
+
+from ..contracts import DEFAULT_MAX_BASELINE_SIZE_MB
+from ..contracts.errors import BaselineValidationError
+from ..utils.json_io import read_json_object as _read_json_object
+
+if TYPE_CHECKING:
+ from collections.abc import Collection
+
+BASELINE_GENERATOR = "codeclone"
+_BASELINE_SCHEMA_MAX_MINOR_BY_MAJOR = {1: 0, 2: 1}
+MAX_BASELINE_SIZE_BYTES = DEFAULT_MAX_BASELINE_SIZE_MB * 1024 * 1024
+_UTC_ISO8601_Z_RE = re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$")
+
+
+class BaselineStatus(str, Enum):
+ OK = "ok"
+ MISSING = "missing"
+ TOO_LARGE = "too_large"
+ INVALID_JSON = "invalid_json"
+ INVALID_TYPE = "invalid_type"
+ MISSING_FIELDS = "missing_fields"
+ MISMATCH_SCHEMA_VERSION = "mismatch_schema_version"
+ MISMATCH_FINGERPRINT_VERSION = "mismatch_fingerprint_version"
+ MISMATCH_PYTHON_VERSION = "mismatch_python_version"
+ GENERATOR_MISMATCH = "generator_mismatch"
+ INTEGRITY_MISSING = "integrity_missing"
+ INTEGRITY_FAILED = "integrity_failed"
+
+
+BASELINE_UNTRUSTED_STATUSES: Final[frozenset[BaselineStatus]] = frozenset(
+ {
+ BaselineStatus.MISSING,
+ BaselineStatus.TOO_LARGE,
+ BaselineStatus.INVALID_JSON,
+ BaselineStatus.INVALID_TYPE,
+ BaselineStatus.MISSING_FIELDS,
+ BaselineStatus.MISMATCH_SCHEMA_VERSION,
+ BaselineStatus.MISMATCH_FINGERPRINT_VERSION,
+ BaselineStatus.MISMATCH_PYTHON_VERSION,
+ BaselineStatus.GENERATOR_MISMATCH,
+ BaselineStatus.INTEGRITY_MISSING,
+ BaselineStatus.INTEGRITY_FAILED,
+ }
+)
+
+
+def coerce_baseline_status(
+ raw_status: str | BaselineStatus | None,
+) -> BaselineStatus:
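+    # Unknown strings and None collapse to INVALID_TYPE instead of raising,
+    # e.g. coerce_baseline_status("bogus") is BaselineStatus.INVALID_TYPE.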
+ if isinstance(raw_status, BaselineStatus):
+ return raw_status
+ if isinstance(raw_status, str):
+ try:
+ return BaselineStatus(raw_status)
+ except ValueError:
+ return BaselineStatus.INVALID_TYPE
+ return BaselineStatus.INVALID_TYPE
+
+
+def _safe_stat_size(path: Path) -> int:
+ try:
+ return path.stat().st_size
+ except OSError as e:
+ raise BaselineValidationError(
+ f"Cannot stat baseline file at {path}: {e}",
+ status=BaselineStatus.INVALID_TYPE,
+ ) from e
+
+
+def _load_json_object(path: Path) -> dict[str, object]:
+ try:
+ return _read_json_object(path)
+ except OSError as e:
+ raise BaselineValidationError(
+ f"Cannot read baseline file at {path}: {e}",
+ status=BaselineStatus.INVALID_JSON,
+ ) from e
+ except JSONDecodeError as e:
+ raise BaselineValidationError(
+ f"Corrupted baseline file at {path}: {e}",
+ status=BaselineStatus.INVALID_JSON,
+ ) from e
+ except TypeError:
+ raise BaselineValidationError(
+ f"Baseline payload must be an object at {path}",
+ status=BaselineStatus.INVALID_TYPE,
+ ) from None
+
+
+def _parse_generator_meta(
+ meta_obj: dict[str, object], *, path: Path
+) -> tuple[str, str | None]:
+ raw_generator = meta_obj.get("generator")
+
+ if isinstance(raw_generator, str):
+ generator_version = _optional_str(meta_obj, "generator_version", path=path)
+ if generator_version is None:
+ generator_version = _optional_str(meta_obj, "codeclone_version", path=path)
+ return raw_generator, generator_version
+
+ if isinstance(raw_generator, dict):
+ allowed_keys = {"name", "version"}
+ extra = set(raw_generator.keys()) - allowed_keys
+ if extra:
+ raise BaselineValidationError(
+ f"Invalid baseline schema at {path}: unexpected generator keys: "
+ f"{', '.join(sorted(extra))}",
+ status=BaselineStatus.INVALID_TYPE,
+ )
+ generator_name = _require_str(raw_generator, "name", path=path)
+ generator_version = _optional_str(raw_generator, "version", path=path)
+
+ if generator_version is None:
+ generator_version = _optional_str(meta_obj, "generator_version", path=path)
+ if generator_version is None:
+ generator_version = _optional_str(
+ meta_obj, "codeclone_version", path=path
+ )
+
+ return generator_name, generator_version
+
+ raise BaselineValidationError(
+ f"Invalid baseline schema at {path}: 'generator' must be string or object",
+ status=BaselineStatus.INVALID_TYPE,
+ )
+
+
+def _compute_payload_sha256(
+ *,
+ functions: Collection[str],
+ blocks: Collection[str],
+ fingerprint_version: str,
+ python_tag: str,
+) -> str:
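+    # Keys and id lists are sorted before hashing so the digest does not
+    # depend on insertion order.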
+ canonical = {
+ "blocks": sorted(blocks),
+ "fingerprint_version": fingerprint_version,
+ "functions": sorted(functions),
+ "python_tag": python_tag,
+ }
+ serialized = orjson.dumps(canonical, option=orjson.OPT_SORT_KEYS)
+ return hashlib.sha256(serialized).hexdigest()
+
+
+def current_python_tag() -> str:
+ """Return the interpreter compatibility tag as an immutable string."""
+ impl = sys.implementation.name
+ major, minor = sys.version_info[:2]
+ prefix = "cp" if impl == "cpython" else impl[:2]
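+    # e.g. CPython 3.14 -> "cp314"; PyPy 3.11 -> "py311".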
+ return f"{prefix}{major}{minor}"
+
+
+def _utc_now_z() -> str:
+ return (
+ datetime.now(timezone.utc).replace(microsecond=0).strftime("%Y-%m-%dT%H:%M:%SZ")
+ )
+
+
+def _require_str(obj: dict[str, object], key: str, *, path: Path) -> str:
+ value = obj.get(key)
+ if not isinstance(value, str):
+ raise BaselineValidationError(
+ f"Invalid baseline schema at {path}: '{key}' must be string",
+ status=BaselineStatus.INVALID_TYPE,
+ )
+ return value
+
+
+def _optional_str(obj: dict[str, object], key: str, *, path: Path) -> str | None:
+ value = obj.get(key)
+ if value is None:
+ return None
+ if not isinstance(value, str):
+ raise BaselineValidationError(
+ f"Invalid baseline schema at {path}: '{key}' must be string",
+ status=BaselineStatus.INVALID_TYPE,
+ )
+ return value
+
+
+def _require_semver_str(obj: dict[str, object], key: str, *, path: Path) -> str:
+ value = _require_str(obj, key, path=path)
+ _parse_semver(value, key=key, path=path)
+ return value
+
+
+def _parse_semver(value: str, *, key: str, path: Path) -> tuple[int, int, int]:
+ parts = value.split(".")
+ if len(parts) not in {2, 3} or not all(part.isdigit() for part in parts):
+ raise BaselineValidationError(
+ f"Invalid baseline schema at {path}: '{key}' must be semver string",
+ status=BaselineStatus.INVALID_TYPE,
+ )
+ if len(parts) == 2:
+ major, minor = int(parts[0]), int(parts[1])
+ patch = 0
+ else:
+ major, minor, patch = int(parts[0]), int(parts[1]), int(parts[2])
+ return major, minor, patch
+
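+# Accepts MAJOR.MINOR or MAJOR.MINOR.PATCH, e.g. "1.2" -> (1, 2, 0) and
+# "1.2.3" -> (1, 2, 3); "v1.2" and "1.2.3.4" are rejected.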
+
+def _require_python_tag(obj: dict[str, object], key: str, *, path: Path) -> str:
+ value = _require_str(obj, key, path=path)
+ if not re.fullmatch(r"[a-z]{2}\d{2,3}", value):
+ raise BaselineValidationError(
+ f"Invalid baseline schema at {path}: '{key}' must look like 'cp313'",
+ status=BaselineStatus.INVALID_TYPE,
+ )
+ return value
+
+
+def _require_utc_iso8601_z(obj: dict[str, object], key: str, *, path: Path) -> str:
+ value = _require_str(obj, key, path=path)
+ if not _UTC_ISO8601_Z_RE.fullmatch(value):
+ raise BaselineValidationError(
+ f"Invalid baseline schema at {path}: '{key}' must be UTC ISO-8601 with Z",
+ status=BaselineStatus.INVALID_TYPE,
+ )
+ try:
+ datetime(
+ int(value[0:4]),
+ int(value[5:7]),
+ int(value[8:10]),
+ int(value[11:13]),
+ int(value[14:16]),
+ int(value[17:19]),
+ tzinfo=timezone.utc,
+ )
+ except ValueError as e:
+ raise BaselineValidationError(
+ f"Invalid baseline schema at {path}: '{key}' must be UTC ISO-8601 with Z",
+ status=BaselineStatus.INVALID_TYPE,
+ ) from e
+ return value
+
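+# Valid example: "2026-01-02T03:04:05Z". The regex pins the exact shape;
+# the datetime(...) round-trip then rejects impossible dates such as
+# "2026-02-30T00:00:00Z".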
+
+def _require_sorted_unique_ids(
+ obj: dict[str, object], key: str, *, pattern: re.Pattern[str], path: Path
+) -> list[str]:
+ value = obj.get(key)
+ if not isinstance(value, list):
+ raise BaselineValidationError(
+ f"Invalid baseline schema at {path}: '{key}' must be list[str]",
+ status=BaselineStatus.INVALID_TYPE,
+ )
+ if not all(isinstance(item, str) for item in value):
+ raise BaselineValidationError(
+ f"Invalid baseline schema at {path}: '{key}' must be list[str]",
+ status=BaselineStatus.INVALID_TYPE,
+ )
+ values = list(value)
+ if values != sorted(values) or len(values) != len(set(values)):
+ raise BaselineValidationError(
+ f"Invalid baseline schema at {path}: '{key}' must be sorted and unique",
+ status=BaselineStatus.INVALID_TYPE,
+ )
+ if not all(pattern.fullmatch(item) for item in values):
+ raise BaselineValidationError(
+ f"Invalid baseline schema at {path}: '{key}' has invalid id format",
+ status=BaselineStatus.INVALID_TYPE,
+ )
+ return values
+
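+# e.g. ["a", "b"] passes (assuming both ids match `pattern`), while
+# ["b", "a"] (unsorted) and ["a", "a"] (duplicate) are rejected.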
+
+__all__ = [
+ "BASELINE_GENERATOR",
+ "BASELINE_UNTRUSTED_STATUSES",
+ "MAX_BASELINE_SIZE_BYTES",
+ "_BASELINE_SCHEMA_MAX_MINOR_BY_MAJOR",
+ "BaselineStatus",
+ "_compute_payload_sha256",
+ "_load_json_object",
+ "_optional_str",
+ "_parse_generator_meta",
+ "_parse_semver",
+ "_require_python_tag",
+ "_require_semver_str",
+ "_require_sorted_unique_ids",
+ "_require_str",
+ "_require_utc_iso8601_z",
+ "_safe_stat_size",
+ "_utc_now_z",
+ "coerce_baseline_status",
+ "current_python_tag",
+]
diff --git a/codeclone/blocks.py b/codeclone/blocks/__init__.py
similarity index 95%
rename from codeclone/blocks.py
rename to codeclone/blocks/__init__.py
index 9089ff1..d998021 100644
--- a/codeclone/blocks.py
+++ b/codeclone/blocks/__init__.py
@@ -8,15 +8,15 @@
from typing import TYPE_CHECKING
-from .fingerprint import sha1
-from .models import BlockUnit, SegmentUnit
-from .normalize import stmt_hashes
+from ..analysis.fingerprint import sha1
+from ..analysis.normalizer import stmt_hashes
+from ..models import BlockUnit, SegmentUnit
if TYPE_CHECKING:
import ast
from collections.abc import Sequence
- from .normalize import NormalizationConfig
+ from ..analysis.normalizer import NormalizationConfig
__all__ = ["BlockUnit", "SegmentUnit", "extract_blocks", "extract_segments"]
diff --git a/codeclone/cache.py b/codeclone/cache.py
deleted file mode 100644
index 282cf66..0000000
--- a/codeclone/cache.py
+++ /dev/null
@@ -1,2803 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# SPDX-License-Identifier: MPL-2.0
-# Copyright (c) 2026 Den Rozhnovskiy
-
-from __future__ import annotations
-
-import os
-from collections.abc import Collection
-from enum import Enum
-from json import JSONDecodeError
-from pathlib import Path
-from typing import TYPE_CHECKING, Literal, TypedDict, TypeGuard, TypeVar, cast
-
-from .baseline import current_python_tag
-from .cache_io import (
- as_int_or_none as _cache_as_int,
-)
-from .cache_io import (
- as_object_list as _cache_as_list,
-)
-from .cache_io import (
- as_str_dict as _cache_as_str_dict,
-)
-from .cache_io import (
- as_str_or_none as _cache_as_str,
-)
-from .cache_io import (
- read_json_document,
- sign_cache_payload,
- verify_cache_payload_signature,
- write_json_document_atomically,
-)
-from .cache_paths import runtime_filepath_from_wire, wire_filepath_from_runtime
-from .cache_segments import (
- SegmentReportProjection as _SegmentReportProjection,
-)
-from .cache_segments import (
- build_segment_report_projection as _build_segment_report_projection,
-)
-from .cache_segments import (
- decode_segment_report_projection,
- encode_segment_report_projection,
-)
-from .contracts import BASELINE_FINGERPRINT_VERSION, CACHE_VERSION
-from .errors import CacheError
-from .models import (
- BlockGroupItem,
- BlockUnit,
- ClassMetrics,
- DeadCandidate,
- FileMetrics,
- FunctionGroupItem,
- ModuleApiSurface,
- ModuleDep,
- ModuleDocstringCoverage,
- ModuleTypingCoverage,
- SegmentGroupItem,
- SegmentUnit,
- StructuralFindingGroup,
- StructuralFindingOccurrence,
- Unit,
-)
-from .structural_findings import normalize_structural_finding_group
-
-if TYPE_CHECKING:
- from collections.abc import Callable, Mapping, Sequence
-
-SegmentReportProjection = _SegmentReportProjection
-build_segment_report_projection = _build_segment_report_projection
-_as_str = _cache_as_str
-_as_int = _cache_as_int
-_as_list = _cache_as_list
-_as_str_dict = _cache_as_str_dict
-
-MAX_CACHE_SIZE_BYTES = 50 * 1024 * 1024
-LEGACY_CACHE_SECRET_FILENAME = ".cache_secret"
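-# Presumed column order, mirroring _unit_dict_from_model: entry_guard_count,
-# entry_guard_terminal_profile, entry_guard_has_side_effect_before,
-# terminal_kind, try_finally_profile, side_effect_order_profile.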
-_DEFAULT_WIRE_UNIT_FLOW_PROFILES = (
- 0,
- "none",
- False,
- "fallthrough",
- "none",
- "none",
-)
-
-
-class CacheStatus(str, Enum):
- OK = "ok"
- MISSING = "missing"
- TOO_LARGE = "too_large"
- UNREADABLE = "unreadable"
- INVALID_JSON = "invalid_json"
- INVALID_TYPE = "invalid_type"
- VERSION_MISMATCH = "version_mismatch"
- PYTHON_TAG_MISMATCH = "python_tag_mismatch"
- FINGERPRINT_MISMATCH = "mismatch_fingerprint_version"
- ANALYSIS_PROFILE_MISMATCH = "analysis_profile_mismatch"
- INTEGRITY_FAILED = "integrity_failed"
-
-
-class FileStat(TypedDict):
- mtime_ns: int
- size: int
-
-
-class SourceStatsDict(TypedDict):
- lines: int
- functions: int
- methods: int
- classes: int
-
-
-UnitDict = FunctionGroupItem
-BlockDict = BlockGroupItem
-SegmentDict = SegmentGroupItem
-
-
-class ClassMetricsDictBase(TypedDict):
- qualname: str
- filepath: str
- start_line: int
- end_line: int
- cbo: int
- lcom4: int
- method_count: int
- instance_var_count: int
- risk_coupling: str
- risk_cohesion: str
-
-
-class ClassMetricsDict(ClassMetricsDictBase, total=False):
- coupled_classes: list[str]
-
-
-class ModuleDepDict(TypedDict):
- source: str
- target: str
- import_type: str
- line: int
-
-
-class DeadCandidateDictBase(TypedDict):
- qualname: str
- local_name: str
- filepath: str
- start_line: int
- end_line: int
- kind: str
-
-
-class DeadCandidateDict(DeadCandidateDictBase, total=False):
- suppressed_rules: list[str]
-
-
-class ModuleTypingCoverageDict(TypedDict):
- module: str
- filepath: str
- callable_count: int
- params_total: int
- params_annotated: int
- returns_total: int
- returns_annotated: int
- any_annotation_count: int
-
-
-class ModuleDocstringCoverageDict(TypedDict):
- module: str
- filepath: str
- public_symbol_total: int
- public_symbol_documented: int
-
-
-class ApiParamSpecDict(TypedDict):
- name: str
- kind: str
- has_default: bool
- annotation_hash: str
-
-
-class PublicSymbolDict(TypedDict):
- qualname: str
- kind: str
- start_line: int
- end_line: int
- params: list[ApiParamSpecDict]
- returns_hash: str
- exported_via: str
-
-
-class ModuleApiSurfaceDict(TypedDict):
- module: str
- filepath: str
- all_declared: list[str]
- symbols: list[PublicSymbolDict]
-
-
-class StructuralFindingOccurrenceDict(TypedDict):
- qualname: str
- start: int
- end: int
-
-
-class StructuralFindingGroupDict(TypedDict):
- finding_kind: str
- finding_key: str
- signature: dict[str, str]
- items: list[StructuralFindingOccurrenceDict]
-
-
-class CacheEntryBase(TypedDict):
- stat: FileStat
- units: list[UnitDict]
- blocks: list[BlockDict]
- segments: list[SegmentDict]
-
-
-class CacheEntry(CacheEntryBase, total=False):
- source_stats: SourceStatsDict
- class_metrics: list[ClassMetricsDict]
- module_deps: list[ModuleDepDict]
- dead_candidates: list[DeadCandidateDict]
- referenced_names: list[str]
- referenced_qualnames: list[str]
- import_names: list[str]
- class_names: list[str]
- typing_coverage: ModuleTypingCoverageDict
- docstring_coverage: ModuleDocstringCoverageDict
- api_surface: ModuleApiSurfaceDict
- structural_findings: list[StructuralFindingGroupDict]
-
-
-class AnalysisProfile(TypedDict):
- min_loc: int
- min_stmt: int
- block_min_loc: int
- block_min_stmt: int
- segment_min_loc: int
- segment_min_stmt: int
- collect_api_surface: bool
-
-
-class CacheData(TypedDict):
- version: str
- python_tag: str
- fingerprint_version: str
- analysis_profile: AnalysisProfile
- files: dict[str, CacheEntry]
-
-
-def _normalize_cached_structural_group(
- group: StructuralFindingGroupDict,
- *,
- filepath: str,
-) -> StructuralFindingGroupDict | None:
- signature = dict(group["signature"])
- finding_kind = group["finding_kind"]
- finding_key = group["finding_key"]
- normalized = normalize_structural_finding_group(
- StructuralFindingGroup(
- finding_kind=finding_kind,
- finding_key=finding_key,
- signature=signature,
- items=tuple(
- StructuralFindingOccurrence(
- finding_kind=finding_kind,
- finding_key=finding_key,
- file_path=filepath,
- qualname=item["qualname"],
- start=item["start"],
- end=item["end"],
- signature=signature,
- )
- for item in group["items"]
- ),
- )
- )
- if normalized is None:
- return None
- return StructuralFindingGroupDict(
- finding_kind=normalized.finding_kind,
- finding_key=normalized.finding_key,
- signature=dict(normalized.signature),
- items=[
- StructuralFindingOccurrenceDict(
- qualname=item.qualname,
- start=item.start,
- end=item.end,
- )
- for item in normalized.items
- ],
- )
-
-
-def _normalize_cached_structural_groups(
- groups: Sequence[StructuralFindingGroupDict],
- *,
- filepath: str,
-) -> list[StructuralFindingGroupDict]:
- normalized = [
- candidate
- for candidate in (
- _normalize_cached_structural_group(group, filepath=filepath)
- for group in groups
- )
- if candidate is not None
- ]
- normalized.sort(key=lambda group: (-len(group["items"]), group["finding_key"]))
- return normalized
-
-
-_DecodedItemT = TypeVar("_DecodedItemT")
-_ValidatedItemT = TypeVar("_ValidatedItemT")
-
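-# On-disk wire format, as produced by Cache.save and checked by
-# Cache._load_and_validate:
-#   {"v": CACHE_VERSION, "sig": sign_cache_payload(payload),
-#    "payload": {"py", "fp", "ap", "files", optionally "sr"}}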
-
-class Cache:
- __slots__ = (
- "_canonical_runtime_paths",
- "_dirty",
- "analysis_profile",
- "cache_schema_version",
- "data",
- "fingerprint_version",
- "legacy_secret_warning",
- "load_status",
- "load_warning",
- "max_size_bytes",
- "path",
- "root",
- "segment_report_projection",
- )
-
- _CACHE_VERSION = CACHE_VERSION
-
- def __init__(
- self,
- path: str | Path,
- *,
- root: str | Path | None = None,
- max_size_bytes: int | None = None,
- min_loc: int = 10,
- min_stmt: int = 6,
- block_min_loc: int = 20,
- block_min_stmt: int = 8,
- segment_min_loc: int = 20,
- segment_min_stmt: int = 10,
- collect_api_surface: bool = False,
- ):
- self.path = Path(path)
- self.root = _resolve_root(root)
- self.fingerprint_version = BASELINE_FINGERPRINT_VERSION
- self.analysis_profile: AnalysisProfile = {
- "min_loc": min_loc,
- "min_stmt": min_stmt,
- "block_min_loc": block_min_loc,
- "block_min_stmt": block_min_stmt,
- "segment_min_loc": segment_min_loc,
- "segment_min_stmt": segment_min_stmt,
- "collect_api_surface": collect_api_surface,
- }
- self.data: CacheData = _empty_cache_data(
- version=self._CACHE_VERSION,
- python_tag=current_python_tag(),
- fingerprint_version=self.fingerprint_version,
- analysis_profile=self.analysis_profile,
- )
- self._canonical_runtime_paths: set[str] = set()
- self.legacy_secret_warning = self._detect_legacy_secret_warning()
- self.cache_schema_version: str | None = None
- self.load_status = CacheStatus.MISSING
- self.load_warning: str | None = self.legacy_secret_warning
- self.max_size_bytes = (
- MAX_CACHE_SIZE_BYTES if max_size_bytes is None else max_size_bytes
- )
- self.segment_report_projection: SegmentReportProjection | None = None
- self._dirty: bool = True # new cache is dirty until loaded from disk
-
- def _detect_legacy_secret_warning(self) -> str | None:
- secret_path = self.path.parent / LEGACY_CACHE_SECRET_FILENAME
- try:
- if secret_path.exists():
- return (
- f"Legacy cache secret file detected at {secret_path}; "
- "delete this obsolete file."
- )
- except OSError as e:
- return f"Legacy cache secret check failed: {e}"
- return None
-
- def _set_load_warning(self, message: str | None) -> None:
- warning = message
- if warning is None:
- warning = self.legacy_secret_warning
- elif self.legacy_secret_warning:
- warning = f"{warning}\n{self.legacy_secret_warning}"
- self.load_warning = warning
-
- def _ignore_cache(
- self,
- message: str,
- *,
- status: CacheStatus,
- schema_version: str | None = None,
- ) -> None:
- self._set_load_warning(message)
- self.load_status = status
- self.cache_schema_version = schema_version
- self.data = _empty_cache_data(
- version=self._CACHE_VERSION,
- python_tag=current_python_tag(),
- fingerprint_version=self.fingerprint_version,
- analysis_profile=self.analysis_profile,
- )
- self._canonical_runtime_paths = set()
- self.segment_report_projection = None
-
- def _reject_cache_load(
- self,
- message: str,
- *,
- status: CacheStatus,
- schema_version: str | None = None,
- ) -> CacheData | None:
- self._ignore_cache(
- message,
- status=status,
- schema_version=schema_version,
- )
- return None
-
- def _reject_invalid_cache_format(
- self,
- *,
- schema_version: str | None = None,
- ) -> CacheData | None:
- return self._reject_cache_load(
- "Cache format invalid; ignoring cache.",
- status=CacheStatus.INVALID_TYPE,
- schema_version=schema_version,
- )
-
- def _reject_version_mismatch(self, version: str) -> CacheData | None:
- return self._reject_cache_load(
- f"Cache version mismatch (found {version}); ignoring cache.",
- status=CacheStatus.VERSION_MISMATCH,
- schema_version=version,
- )
-
- def load(self) -> None:
- try:
- exists = self.path.exists()
- except OSError as e:
- self._ignore_cache(
- f"Cache unreadable; ignoring cache: {e}",
- status=CacheStatus.UNREADABLE,
- )
- return
-
- if not exists:
- self._set_load_warning(None)
- self.load_status = CacheStatus.MISSING
- self.cache_schema_version = None
- self._canonical_runtime_paths = set()
- self.segment_report_projection = None
- return
-
- try:
- size = self.path.stat().st_size
- if size > self.max_size_bytes:
- self._ignore_cache(
- "Cache file too large "
- f"({size} bytes, max {self.max_size_bytes}); ignoring cache.",
- status=CacheStatus.TOO_LARGE,
- )
- return
-
- raw_obj = read_json_document(self.path)
- parsed = self._load_and_validate(raw_obj)
- if parsed is None:
- return
- self.data = parsed
- self._canonical_runtime_paths = set(parsed["files"].keys())
- self.load_status = CacheStatus.OK
- self._set_load_warning(None)
- self._dirty = False # freshly loaded — nothing to persist
-
- except OSError as e:
- self._ignore_cache(
- f"Cache unreadable; ignoring cache: {e}",
- status=CacheStatus.UNREADABLE,
- )
- except JSONDecodeError:
- self._ignore_cache(
- "Cache corrupted; ignoring cache.",
- status=CacheStatus.INVALID_JSON,
- )
-
- def _load_and_validate(self, raw_obj: object) -> CacheData | None:
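- # Validation order: document shape -> schema version -> signature ->
- # python tag -> fingerprint version -> analysis profile -> file entries.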
- raw = _as_str_dict(raw_obj)
- if raw is None:
- return self._reject_invalid_cache_format()
-
- # Legacy cache format: top-level {version, files, _signature}.
- legacy_version = _as_str(raw.get("version"))
- if legacy_version is not None:
- return self._reject_version_mismatch(legacy_version)
-
- version = _as_str(raw.get("v"))
- if version is None:
- return self._reject_invalid_cache_format()
-
- if version != self._CACHE_VERSION:
- return self._reject_version_mismatch(version)
-
- sig = _as_str(raw.get("sig"))
- payload_obj = raw.get("payload")
- payload = _as_str_dict(payload_obj)
- if sig is None or payload is None:
- return self._reject_invalid_cache_format(schema_version=version)
-
- if not verify_cache_payload_signature(payload, sig):
- return self._reject_cache_load(
- "Cache signature mismatch; ignoring cache.",
- status=CacheStatus.INTEGRITY_FAILED,
- schema_version=version,
- )
-
- runtime_tag = current_python_tag()
- py_tag = _as_str(payload.get("py"))
- if py_tag is None:
- return self._reject_invalid_cache_format(schema_version=version)
-
- if py_tag != runtime_tag:
- return self._reject_cache_load(
- "Cache python tag mismatch "
- f"(found {py_tag}, expected {runtime_tag}); ignoring cache.",
- status=CacheStatus.PYTHON_TAG_MISMATCH,
- schema_version=version,
- )
-
- fp_version = _as_str(payload.get("fp"))
- if fp_version is None:
- return self._reject_invalid_cache_format(schema_version=version)
-
- if fp_version != self.fingerprint_version:
- return self._reject_cache_load(
- "Cache fingerprint version mismatch "
- f"(found {fp_version}, expected {self.fingerprint_version}); "
- "ignoring cache.",
- status=CacheStatus.FINGERPRINT_MISMATCH,
- schema_version=version,
- )
-
- analysis_profile = _as_analysis_profile(payload.get("ap"))
- if analysis_profile is None:
- return self._reject_invalid_cache_format(schema_version=version)
-
- if analysis_profile != self.analysis_profile:
- return self._reject_cache_load(
- "Cache analysis profile mismatch "
- f"(found min_loc={analysis_profile['min_loc']}, "
- f"min_stmt={analysis_profile['min_stmt']}, "
- "collect_api_surface="
- f"{str(analysis_profile['collect_api_surface']).lower()}; "
- f"expected min_loc={self.analysis_profile['min_loc']}, "
- f"min_stmt={self.analysis_profile['min_stmt']}, "
- "collect_api_surface="
- f"{str(self.analysis_profile['collect_api_surface']).lower()}); "
- "ignoring cache.",
- status=CacheStatus.ANALYSIS_PROFILE_MISMATCH,
- schema_version=version,
- )
-
- files_obj = payload.get("files")
- files_dict = _as_str_dict(files_obj)
- if files_dict is None:
- return self._reject_invalid_cache_format(schema_version=version)
-
- parsed_files: dict[str, CacheEntry] = {}
- for wire_path, file_entry_obj in files_dict.items():
- runtime_path = runtime_filepath_from_wire(wire_path, root=self.root)
- parsed_entry = self._decode_entry(file_entry_obj, runtime_path)
- if parsed_entry is None:
- return self._reject_invalid_cache_format(schema_version=version)
- parsed_files[runtime_path] = _canonicalize_cache_entry(parsed_entry)
- self.segment_report_projection = decode_segment_report_projection(
- payload.get("sr"),
- root=self.root,
- )
-
- self.cache_schema_version = version
- return CacheData(
- version=self._CACHE_VERSION,
- python_tag=runtime_tag,
- fingerprint_version=self.fingerprint_version,
- analysis_profile=self.analysis_profile,
- files=parsed_files,
- )
-
- def save(self) -> None:
- if not self._dirty:
- return
- try:
- wire_files: dict[str, object] = {}
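- # Emit entries sorted by their wire path so the saved document is
- # deterministic for a given cache state.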
- wire_map = {
- rp: wire_filepath_from_runtime(rp, root=self.root)
- for rp in self.data["files"]
- }
- for runtime_path in sorted(self.data["files"], key=wire_map.__getitem__):
- entry = self.get_file_entry(runtime_path)
- if entry is None:
- continue
- wire_files[wire_map[runtime_path]] = self._encode_entry(entry)
-
- payload: dict[str, object] = {
- "py": current_python_tag(),
- "fp": self.fingerprint_version,
- "ap": self.analysis_profile,
- "files": wire_files,
- }
- segment_projection = encode_segment_report_projection(
- self.segment_report_projection,
- root=self.root,
- )
- if segment_projection is not None:
- payload["sr"] = segment_projection
- signed_doc = {
- "v": self._CACHE_VERSION,
- "payload": payload,
- "sig": sign_cache_payload(payload),
- }
- write_json_document_atomically(self.path, signed_doc)
- self._dirty = False
-
- self.data["version"] = self._CACHE_VERSION
- self.data["python_tag"] = current_python_tag()
- self.data["fingerprint_version"] = self.fingerprint_version
- self.data["analysis_profile"] = self.analysis_profile
-
- except OSError as e:
- raise CacheError(f"Failed to save cache: {e}") from e
-
- @staticmethod
- def _decode_entry(value: object, filepath: str) -> CacheEntry | None:
- return _decode_wire_file_entry(value, filepath)
-
- @staticmethod
- def _encode_entry(entry: CacheEntry) -> dict[str, object]:
- return _encode_wire_file_entry(entry)
-
- def _store_canonical_file_entry(
- self,
- *,
- runtime_path: str,
- canonical_entry: CacheEntry,
- ) -> CacheEntry:
- previous_entry = self.data["files"].get(runtime_path)
- was_canonical = runtime_path in self._canonical_runtime_paths
- self.data["files"][runtime_path] = canonical_entry
- self._canonical_runtime_paths.add(runtime_path)
- if not was_canonical or previous_entry != canonical_entry:
- self._dirty = True
- return canonical_entry
-
- def get_file_entry(self, filepath: str) -> CacheEntry | None:
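- # Try the caller's runtime path first, then fall back to the canonical
- # path produced by a wire round-trip (covers alternate spellings).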
- runtime_lookup_key = filepath
- entry_obj = self.data["files"].get(runtime_lookup_key)
- if entry_obj is None:
- wire_key = wire_filepath_from_runtime(filepath, root=self.root)
- runtime_lookup_key = runtime_filepath_from_wire(wire_key, root=self.root)
- entry_obj = self.data["files"].get(runtime_lookup_key)
-
- if entry_obj is None:
- return None
-
- if runtime_lookup_key in self._canonical_runtime_paths:
- if _is_canonical_cache_entry(entry_obj):
- return entry_obj
- self._canonical_runtime_paths.discard(runtime_lookup_key)
-
- if not isinstance(entry_obj, dict):
- return None
- entry = entry_obj
-
- required = {"stat", "units", "blocks", "segments"}
- if not required.issubset(entry.keys()):
- return None
-
- stat = _as_file_stat_dict(entry.get("stat"))
- units = _as_typed_unit_list(entry.get("units"))
- blocks = _as_typed_block_list(entry.get("blocks"))
- segments = _as_typed_segment_list(entry.get("segments"))
- if stat is None or units is None or blocks is None or segments is None:
- return None
-
- optional_sections = _decode_optional_cache_sections(entry)
- if optional_sections is None:
- return None
- (
- class_metrics_raw,
- module_deps_raw,
- dead_candidates_raw,
- referenced_names_raw,
- referenced_qualnames_raw,
- import_names_raw,
- class_names_raw,
- typing_coverage_raw,
- docstring_coverage_raw,
- api_surface_raw,
- source_stats,
- structural_findings,
- ) = optional_sections
-
- entry_to_canonicalize: CacheEntry = _attach_optional_cache_sections(
- CacheEntry(
- stat=stat,
- units=units,
- blocks=blocks,
- segments=segments,
- class_metrics=class_metrics_raw,
- module_deps=module_deps_raw,
- dead_candidates=dead_candidates_raw,
- referenced_names=referenced_names_raw,
- referenced_qualnames=referenced_qualnames_raw,
- import_names=import_names_raw,
- class_names=class_names_raw,
- ),
- typing_coverage=typing_coverage_raw,
- docstring_coverage=docstring_coverage_raw,
- api_surface=api_surface_raw,
- source_stats=source_stats,
- structural_findings=structural_findings,
- )
- canonical_entry = _canonicalize_cache_entry(entry_to_canonicalize)
- return self._store_canonical_file_entry(
- runtime_path=runtime_lookup_key,
- canonical_entry=canonical_entry,
- )
-
- def put_file_entry(
- self,
- filepath: str,
- stat_sig: FileStat,
- units: list[Unit],
- blocks: list[BlockUnit],
- segments: list[SegmentUnit],
- *,
- source_stats: SourceStatsDict | None = None,
- file_metrics: FileMetrics | None = None,
- structural_findings: list[StructuralFindingGroup] | None = None,
- ) -> None:
- runtime_path = runtime_filepath_from_wire(
- wire_filepath_from_runtime(filepath, root=self.root),
- root=self.root,
- )
-
- unit_rows = [_unit_dict_from_model(unit, runtime_path) for unit in units]
- block_rows = [_block_dict_from_model(block, runtime_path) for block in blocks]
- segment_rows = [
- _segment_dict_from_model(segment, runtime_path) for segment in segments
- ]
-
- (
- class_metrics_rows,
- module_dep_rows,
- dead_candidate_rows,
- referenced_names,
- referenced_qualnames,
- import_names,
- class_names,
- typing_coverage,
- docstring_coverage,
- api_surface,
- ) = _new_optional_metrics_payload()
- if file_metrics is not None:
- class_metrics_rows = [
- _class_metrics_dict_from_model(metric, runtime_path)
- for metric in file_metrics.class_metrics
- ]
- module_dep_rows = [
- _module_dep_dict_from_model(dep) for dep in file_metrics.module_deps
- ]
- dead_candidate_rows = [
- _dead_candidate_dict_from_model(candidate, runtime_path)
- for candidate in file_metrics.dead_candidates
- ]
- referenced_names = sorted(set(file_metrics.referenced_names))
- referenced_qualnames = sorted(set(file_metrics.referenced_qualnames))
- import_names = sorted(set(file_metrics.import_names))
- class_names = sorted(set(file_metrics.class_names))
- typing_coverage = _typing_coverage_dict_from_model(
- file_metrics.typing_coverage,
- filepath=runtime_path,
- )
- docstring_coverage = _docstring_coverage_dict_from_model(
- file_metrics.docstring_coverage,
- filepath=runtime_path,
- )
- api_surface = _api_surface_dict_from_model(
- file_metrics.api_surface,
- filepath=runtime_path,
- )
-
- source_stats_payload = source_stats or SourceStatsDict(
- lines=0,
- functions=0,
- methods=0,
- classes=0,
- )
- entry_dict = CacheEntry(
- stat=stat_sig,
- source_stats=source_stats_payload,
- units=unit_rows,
- blocks=block_rows,
- segments=segment_rows,
- class_metrics=class_metrics_rows,
- module_deps=module_dep_rows,
- dead_candidates=dead_candidate_rows,
- referenced_names=referenced_names,
- referenced_qualnames=referenced_qualnames,
- import_names=import_names,
- class_names=class_names,
- )
- if typing_coverage is not None:
- entry_dict["typing_coverage"] = typing_coverage
- if docstring_coverage is not None:
- entry_dict["docstring_coverage"] = docstring_coverage
- if api_surface is not None:
- entry_dict["api_surface"] = api_surface
- if structural_findings is not None:
- entry_dict["structural_findings"] = _normalize_cached_structural_groups(
- [
- _structural_group_dict_from_model(group)
- for group in structural_findings
- ],
- filepath=runtime_path,
- )
- canonical_entry = _canonicalize_cache_entry(entry_dict)
- self._store_canonical_file_entry(
- runtime_path=runtime_path,
- canonical_entry=canonical_entry,
- )
-
-
-def file_stat_signature(path: str) -> FileStat:
- st = os.stat(path)
- return FileStat(
- mtime_ns=st.st_mtime_ns,
- size=st.st_size,
- )
-
-
-def _empty_cache_data(
- *,
- version: str,
- python_tag: str,
- fingerprint_version: str,
- analysis_profile: AnalysisProfile,
-) -> CacheData:
- return CacheData(
- version=version,
- python_tag=python_tag,
- fingerprint_version=fingerprint_version,
- analysis_profile=analysis_profile,
- files={},
- )
-
-
-def _as_risk_literal(value: object) -> Literal["low", "medium", "high"] | None:
- match value:
- case "low":
- return "low"
- case "medium":
- return "medium"
- case "high":
- return "high"
- case _:
- return None
-
-
-def _new_optional_metrics_payload() -> tuple[
- list[ClassMetricsDict],
- list[ModuleDepDict],
- list[DeadCandidateDict],
- list[str],
- list[str],
- list[str],
- list[str],
- ModuleTypingCoverageDict | None,
- ModuleDocstringCoverageDict | None,
- ModuleApiSurfaceDict | None,
-]:
- return [], [], [], [], [], [], [], None, None, None
-
-
-def _unit_dict_from_model(unit: Unit, filepath: str) -> UnitDict:
- return FunctionGroupItem(
- qualname=unit.qualname,
- filepath=filepath,
- start_line=unit.start_line,
- end_line=unit.end_line,
- loc=unit.loc,
- stmt_count=unit.stmt_count,
- fingerprint=unit.fingerprint,
- loc_bucket=unit.loc_bucket,
- cyclomatic_complexity=unit.cyclomatic_complexity,
- nesting_depth=unit.nesting_depth,
- risk=unit.risk,
- raw_hash=unit.raw_hash,
- entry_guard_count=unit.entry_guard_count,
- entry_guard_terminal_profile=unit.entry_guard_terminal_profile,
- entry_guard_has_side_effect_before=unit.entry_guard_has_side_effect_before,
- terminal_kind=unit.terminal_kind,
- try_finally_profile=unit.try_finally_profile,
- side_effect_order_profile=unit.side_effect_order_profile,
- )
-
-
-def _block_dict_from_model(block: BlockUnit, filepath: str) -> BlockDict:
- return BlockGroupItem(
- block_hash=block.block_hash,
- filepath=filepath,
- qualname=block.qualname,
- start_line=block.start_line,
- end_line=block.end_line,
- size=block.size,
- )
-
-
-def _segment_dict_from_model(segment: SegmentUnit, filepath: str) -> SegmentDict:
- return SegmentGroupItem(
- segment_hash=segment.segment_hash,
- segment_sig=segment.segment_sig,
- filepath=filepath,
- qualname=segment.qualname,
- start_line=segment.start_line,
- end_line=segment.end_line,
- size=segment.size,
- )
-
-
-def _typing_coverage_dict_from_model(
- coverage: ModuleTypingCoverage | None,
- *,
- filepath: str,
-) -> ModuleTypingCoverageDict | None:
- if coverage is None:
- return None
- return ModuleTypingCoverageDict(
- module=coverage.module,
- filepath=filepath,
- callable_count=coverage.callable_count,
- params_total=coverage.params_total,
- params_annotated=coverage.params_annotated,
- returns_total=coverage.returns_total,
- returns_annotated=coverage.returns_annotated,
- any_annotation_count=coverage.any_annotation_count,
- )
-
-
-def _docstring_coverage_dict_from_model(
- coverage: ModuleDocstringCoverage | None,
- *,
- filepath: str,
-) -> ModuleDocstringCoverageDict | None:
- if coverage is None:
- return None
- return ModuleDocstringCoverageDict(
- module=coverage.module,
- filepath=filepath,
- public_symbol_total=coverage.public_symbol_total,
- public_symbol_documented=coverage.public_symbol_documented,
- )
-
-
-def _api_surface_dict_from_model(
- surface: ModuleApiSurface | None,
- *,
- filepath: str,
-) -> ModuleApiSurfaceDict | None:
- if surface is None:
- return None
- return ModuleApiSurfaceDict(
- module=surface.module,
- filepath=filepath,
- all_declared=list(surface.all_declared or ()),
- symbols=[
- PublicSymbolDict(
- qualname=symbol.qualname,
- kind=symbol.kind,
- start_line=symbol.start_line,
- end_line=symbol.end_line,
- params=[
- ApiParamSpecDict(
- name=param.name,
- kind=param.kind,
- has_default=param.has_default,
- annotation_hash=param.annotation_hash,
- )
- for param in symbol.params
- ],
- returns_hash=symbol.returns_hash,
- exported_via=symbol.exported_via,
- )
- for symbol in surface.symbols
- ],
- )
-
-
-def _class_metrics_dict_from_model(
- metric: ClassMetrics,
- filepath: str,
-) -> ClassMetricsDict:
- return ClassMetricsDict(
- qualname=metric.qualname,
- filepath=filepath,
- start_line=metric.start_line,
- end_line=metric.end_line,
- cbo=metric.cbo,
- lcom4=metric.lcom4,
- method_count=metric.method_count,
- instance_var_count=metric.instance_var_count,
- risk_coupling=metric.risk_coupling,
- risk_cohesion=metric.risk_cohesion,
- coupled_classes=sorted(set(metric.coupled_classes)),
- )
-
-
-def _module_dep_dict_from_model(dep: ModuleDep) -> ModuleDepDict:
- return ModuleDepDict(
- source=dep.source,
- target=dep.target,
- import_type=dep.import_type,
- line=dep.line,
- )
-
-
-def _dead_candidate_dict_from_model(
- candidate: DeadCandidate,
- filepath: str,
-) -> DeadCandidateDict:
- result = DeadCandidateDict(
- qualname=candidate.qualname,
- local_name=candidate.local_name,
- filepath=filepath,
- start_line=candidate.start_line,
- end_line=candidate.end_line,
- kind=candidate.kind,
- )
- if candidate.suppressed_rules:
- result["suppressed_rules"] = sorted(set(candidate.suppressed_rules))
- return result
-
-
-def _structural_occurrence_dict_from_model(
- occurrence: StructuralFindingOccurrence,
-) -> StructuralFindingOccurrenceDict:
- return StructuralFindingOccurrenceDict(
- qualname=occurrence.qualname,
- start=occurrence.start,
- end=occurrence.end,
- )
-
-
-def _structural_group_dict_from_model(
- group: StructuralFindingGroup,
-) -> StructuralFindingGroupDict:
- return StructuralFindingGroupDict(
- finding_kind=group.finding_kind,
- finding_key=group.finding_key,
- signature=dict(group.signature),
- items=[
- _structural_occurrence_dict_from_model(occurrence)
- for occurrence in group.items
- ],
- )
-
-
-def _as_file_stat_dict(value: object) -> FileStat | None:
- if not _is_file_stat_dict(value):
- return None
- obj = cast("Mapping[str, object]", value)
- mtime_ns = obj.get("mtime_ns")
- size = obj.get("size")
- if not isinstance(mtime_ns, int) or not isinstance(size, int):
- return None
- return FileStat(mtime_ns=mtime_ns, size=size)
-
-
-def _as_source_stats_dict(value: object) -> SourceStatsDict | None:
- if not _is_source_stats_dict(value):
- return None
- obj = cast("Mapping[str, object]", value)
- lines = obj.get("lines")
- functions = obj.get("functions")
- methods = obj.get("methods")
- classes = obj.get("classes")
- assert isinstance(lines, int)
- assert isinstance(functions, int)
- assert isinstance(methods, int)
- assert isinstance(classes, int)
- return SourceStatsDict(
- lines=lines,
- functions=functions,
- methods=methods,
- classes=classes,
- )
-
-
-def _as_typed_list(
- value: object,
- *,
- predicate: Callable[[object], bool],
-) -> list[_ValidatedItemT] | None:
- if not isinstance(value, list):
- return None
- if not all(predicate(item) for item in value):
- return None
- return cast("list[_ValidatedItemT]", value)
-
-
-def _as_typed_unit_list(value: object) -> list[UnitDict] | None:
- return _as_typed_list(value, predicate=_is_unit_dict)
-
-
-def _as_typed_block_list(value: object) -> list[BlockDict] | None:
- return _as_typed_list(value, predicate=_is_block_dict)
-
-
-def _as_typed_segment_list(value: object) -> list[SegmentDict] | None:
- return _as_typed_list(value, predicate=_is_segment_dict)
-
-
-def _as_typed_class_metrics_list(value: object) -> list[ClassMetricsDict] | None:
- return _as_typed_list(value, predicate=_is_class_metrics_dict)
-
-
-def _as_typed_dead_candidates_list(
- value: object,
-) -> list[DeadCandidateDict] | None:
- return _as_typed_list(value, predicate=_is_dead_candidate_dict)
-
-
-def _as_typed_module_deps_list(value: object) -> list[ModuleDepDict] | None:
- return _as_typed_list(value, predicate=_is_module_dep_dict)
-
-
-def _as_typed_string_list(value: object) -> list[str] | None:
- return _as_typed_list(value, predicate=lambda item: isinstance(item, str))
-
-
-def _as_module_typing_coverage_dict(
- value: object,
-) -> ModuleTypingCoverageDict | None:
- if not _is_module_typing_coverage_dict(value):
- return None
- return cast("ModuleTypingCoverageDict", value)
-
-
-def _as_module_docstring_coverage_dict(
- value: object,
-) -> ModuleDocstringCoverageDict | None:
- if not _is_module_docstring_coverage_dict(value):
- return None
- return cast("ModuleDocstringCoverageDict", value)
-
-
-def _as_module_api_surface_dict(value: object) -> ModuleApiSurfaceDict | None:
- if not _is_module_api_surface_dict(value):
- return None
- return cast("ModuleApiSurfaceDict", value)
-
-
-def _normalized_optional_string_list(value: object) -> list[str] | None:
- items = _as_typed_string_list(value)
- if not items:
- return None
- return sorted(set(items))
-
-
-def _is_canonical_cache_entry(value: object) -> TypeGuard[CacheEntry]:
- return isinstance(value, dict) and _has_cache_entry_container_shape(value)
-
-
-def _has_cache_entry_container_shape(entry: Mapping[str, object]) -> bool:
- required = {"stat", "units", "blocks", "segments"}
- if not required.issubset(entry.keys()):
- return False
- if not isinstance(entry.get("stat"), dict):
- return False
- if not isinstance(entry.get("units"), list):
- return False
- if not isinstance(entry.get("blocks"), list):
- return False
- if not isinstance(entry.get("segments"), list):
- return False
- source_stats = entry.get("source_stats")
- if source_stats is not None and not _is_source_stats_dict(source_stats):
- return False
- optional_list_keys = (
- "class_metrics",
- "module_deps",
- "dead_candidates",
- "referenced_names",
- "referenced_qualnames",
- "import_names",
- "class_names",
- "structural_findings",
- )
- if not all(isinstance(entry.get(key, []), list) for key in optional_list_keys):
- return False
- typing_coverage = entry.get("typing_coverage")
- if typing_coverage is not None and not _is_module_typing_coverage_dict(
- typing_coverage
- ):
- return False
- docstring_coverage = entry.get("docstring_coverage")
- if docstring_coverage is not None and not _is_module_docstring_coverage_dict(
- docstring_coverage
- ):
- return False
- api_surface = entry.get("api_surface")
- return api_surface is None or _is_module_api_surface_dict(api_surface)
-
-
-def _decode_optional_cache_sections(
- entry: Mapping[str, object],
-) -> (
- tuple[
- list[ClassMetricsDict],
- list[ModuleDepDict],
- list[DeadCandidateDict],
- list[str],
- list[str],
- list[str],
- list[str],
- ModuleTypingCoverageDict | None,
- ModuleDocstringCoverageDict | None,
- ModuleApiSurfaceDict | None,
- SourceStatsDict | None,
- list[StructuralFindingGroupDict] | None,
- ]
- | None
-):
- class_metrics_raw = _as_typed_class_metrics_list(entry.get("class_metrics", []))
- module_deps_raw = _as_typed_module_deps_list(entry.get("module_deps", []))
- dead_candidates_raw = _as_typed_dead_candidates_list(
- entry.get("dead_candidates", [])
- )
- referenced_names_raw = _as_typed_string_list(entry.get("referenced_names", []))
- referenced_qualnames_raw = _as_typed_string_list(
- entry.get("referenced_qualnames", [])
- )
- import_names_raw = _as_typed_string_list(entry.get("import_names", []))
- class_names_raw = _as_typed_string_list(entry.get("class_names", []))
- if (
- class_metrics_raw is None
- or module_deps_raw is None
- or dead_candidates_raw is None
- or referenced_names_raw is None
- or referenced_qualnames_raw is None
- or import_names_raw is None
- or class_names_raw is None
- ):
- return None
- typing_coverage_raw = _as_module_typing_coverage_dict(entry.get("typing_coverage"))
- docstring_coverage_raw = _as_module_docstring_coverage_dict(
- entry.get("docstring_coverage")
- )
- api_surface_raw = _as_module_api_surface_dict(entry.get("api_surface"))
- source_stats = _as_source_stats_dict(entry.get("source_stats"))
- structural_findings = entry.get("structural_findings")
- typed_structural_findings = (
- structural_findings if isinstance(structural_findings, list) else None
- )
- return (
- class_metrics_raw,
- module_deps_raw,
- dead_candidates_raw,
- referenced_names_raw,
- referenced_qualnames_raw,
- import_names_raw,
- class_names_raw,
- typing_coverage_raw,
- docstring_coverage_raw,
- api_surface_raw,
- source_stats,
- typed_structural_findings,
- )
-
-
-def _attach_optional_cache_sections(
- entry: CacheEntry,
- *,
- typing_coverage: ModuleTypingCoverageDict | None = None,
- docstring_coverage: ModuleDocstringCoverageDict | None = None,
- api_surface: ModuleApiSurfaceDict | None = None,
- source_stats: SourceStatsDict | None = None,
- structural_findings: list[StructuralFindingGroupDict] | None = None,
-) -> CacheEntry:
- if typing_coverage is not None:
- entry["typing_coverage"] = typing_coverage
- if docstring_coverage is not None:
- entry["docstring_coverage"] = docstring_coverage
- if api_surface is not None:
- entry["api_surface"] = api_surface
- if source_stats is not None:
- entry["source_stats"] = source_stats
- if structural_findings is not None:
- entry["structural_findings"] = structural_findings
- return entry
-
-
-def _canonicalize_cache_entry(entry: CacheEntry) -> CacheEntry:
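- # Sort and de-duplicate every section so entries compare stably in
- # _store_canonical_file_entry and the saved JSON is deterministic.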
- class_metrics_sorted = sorted(
- entry["class_metrics"],
- key=lambda item: (
- item["start_line"],
- item["end_line"],
- item["qualname"],
- ),
- )
- for metric in class_metrics_sorted:
- coupled_classes = metric.get("coupled_classes", [])
- if coupled_classes:
- metric["coupled_classes"] = sorted(set(coupled_classes))
-
- module_deps_sorted = sorted(
- entry["module_deps"],
- key=lambda item: (
- item["source"],
- item["target"],
- item["import_type"],
- item["line"],
- ),
- )
- dead_candidates_normalized: list[DeadCandidateDict] = []
- for candidate in entry["dead_candidates"]:
- suppressed_rules = candidate.get("suppressed_rules", [])
- normalized_candidate = DeadCandidateDict(
- qualname=candidate["qualname"],
- local_name=candidate["local_name"],
- filepath=candidate["filepath"],
- start_line=candidate["start_line"],
- end_line=candidate["end_line"],
- kind=candidate["kind"],
- )
- if _is_string_list(suppressed_rules):
- normalized_rules = sorted(set(suppressed_rules))
- if normalized_rules:
- normalized_candidate["suppressed_rules"] = normalized_rules
- dead_candidates_normalized.append(normalized_candidate)
-
- dead_candidates_sorted = sorted(
- dead_candidates_normalized,
- key=lambda item: (
- item["start_line"],
- item["end_line"],
- item["qualname"],
- item["local_name"],
- item["kind"],
- tuple(item.get("suppressed_rules", [])),
- ),
- )
-
- result: CacheEntry = {
- "stat": entry["stat"],
- "units": entry["units"],
- "blocks": entry["blocks"],
- "segments": entry["segments"],
- "class_metrics": class_metrics_sorted,
- "module_deps": module_deps_sorted,
- "dead_candidates": dead_candidates_sorted,
- "referenced_names": sorted(set(entry["referenced_names"])),
- "referenced_qualnames": sorted(set(entry.get("referenced_qualnames", []))),
- "import_names": sorted(set(entry["import_names"])),
- "class_names": sorted(set(entry["class_names"])),
- }
- typing_coverage = entry.get("typing_coverage")
- if typing_coverage is not None:
- result["typing_coverage"] = ModuleTypingCoverageDict(
- module=typing_coverage["module"],
- filepath=typing_coverage["filepath"],
- callable_count=typing_coverage["callable_count"],
- params_total=typing_coverage["params_total"],
- params_annotated=typing_coverage["params_annotated"],
- returns_total=typing_coverage["returns_total"],
- returns_annotated=typing_coverage["returns_annotated"],
- any_annotation_count=typing_coverage["any_annotation_count"],
- )
- docstring_coverage = entry.get("docstring_coverage")
- if docstring_coverage is not None:
- result["docstring_coverage"] = ModuleDocstringCoverageDict(
- module=docstring_coverage["module"],
- filepath=docstring_coverage["filepath"],
- public_symbol_total=docstring_coverage["public_symbol_total"],
- public_symbol_documented=docstring_coverage["public_symbol_documented"],
- )
- api_surface = entry.get("api_surface")
- if api_surface is not None:
- symbols = sorted(
- api_surface["symbols"],
- key=lambda item: (
- item["qualname"],
- item["kind"],
- item["start_line"],
- item["end_line"],
- ),
- )
- normalized_symbols = [
- PublicSymbolDict(
- qualname=symbol["qualname"],
- kind=symbol["kind"],
- start_line=symbol["start_line"],
- end_line=symbol["end_line"],
- params=[
- ApiParamSpecDict(
- name=param["name"],
- kind=param["kind"],
- has_default=param["has_default"],
- annotation_hash=param["annotation_hash"],
- )
- for param in symbol.get("params", [])
- ],
- returns_hash=symbol.get("returns_hash", ""),
- exported_via=symbol.get("exported_via", "name"),
- )
- for symbol in symbols
- ]
- result["api_surface"] = ModuleApiSurfaceDict(
- module=api_surface["module"],
- filepath=api_surface["filepath"],
- all_declared=sorted(set(api_surface.get("all_declared", []))),
- symbols=normalized_symbols,
- )
- sf = entry.get("structural_findings")
- if sf is not None:
- result["structural_findings"] = sf
- source_stats = entry.get("source_stats")
- if source_stats is not None:
- result["source_stats"] = source_stats
- return result
-
-
-def _decode_wire_qualname_span(
- row: list[object],
-) -> tuple[str, int, int] | None:
- qualname = _as_str(row[0])
- start_line = _as_int(row[1])
- end_line = _as_int(row[2])
- if qualname is None or start_line is None or end_line is None:
- return None
- return qualname, start_line, end_line
-
-
-def _decode_wire_qualname_span_size(
- row: list[object],
-) -> tuple[str, int, int, int] | None:
- qualname_span = _decode_wire_qualname_span(row)
- if qualname_span is None:
- return None
- size = _as_int(row[3])
- if size is None:
- return None
- qualname, start_line, end_line = qualname_span
- return qualname, start_line, end_line, size
-
-
-def _as_analysis_profile(value: object) -> AnalysisProfile | None:
- obj = _as_str_dict(value)
- if obj is None:
- return None
-
- _REQUIRED = {
- "min_loc",
- "min_stmt",
- "block_min_loc",
- "block_min_stmt",
- "segment_min_loc",
- "segment_min_stmt",
- }
- if set(obj.keys()) < _REQUIRED:
- return None
-
- min_loc = _as_int(obj.get("min_loc"))
- min_stmt = _as_int(obj.get("min_stmt"))
- block_min_loc = _as_int(obj.get("block_min_loc"))
- block_min_stmt = _as_int(obj.get("block_min_stmt"))
- segment_min_loc = _as_int(obj.get("segment_min_loc"))
- segment_min_stmt = _as_int(obj.get("segment_min_stmt"))
- collect_api_surface_raw = obj.get("collect_api_surface", False)
- collect_api_surface = (
- collect_api_surface_raw if isinstance(collect_api_surface_raw, bool) else None
- )
- if (
- min_loc is None
- or min_stmt is None
- or block_min_loc is None
- or block_min_stmt is None
- or segment_min_loc is None
- or segment_min_stmt is None
- or collect_api_surface is None
- ):
- return None
-
- return AnalysisProfile(
- min_loc=min_loc,
- min_stmt=min_stmt,
- block_min_loc=block_min_loc,
- block_min_stmt=block_min_stmt,
- segment_min_loc=segment_min_loc,
- segment_min_stmt=segment_min_stmt,
- collect_api_surface=collect_api_surface,
- )
-
-
-def _decode_wire_stat(obj: dict[str, object]) -> FileStat | None:
- stat_list = _as_list(obj.get("st"))
- if stat_list is None or len(stat_list) != 2:
- return None
- mtime_ns = _as_int(stat_list[0])
- size = _as_int(stat_list[1])
- if mtime_ns is None or size is None:
- return None
- return FileStat(mtime_ns=mtime_ns, size=size)
-
-
-def _decode_optional_wire_source_stats(
- *,
- obj: dict[str, object],
-) -> SourceStatsDict | None:
- row = _decode_optional_wire_row(obj=obj, key="ss", expected_len=4)
- if row is None:
- return None
- counts = _decode_wire_int_fields(row, 0, 1, 2, 3)
- if counts is None:
- return None
- lines, functions, methods, classes = counts
- if any(value < 0 for value in counts):
- return None
- return SourceStatsDict(
- lines=lines,
- functions=functions,
- methods=methods,
- classes=classes,
- )
-
-
-def _decode_optional_wire_items(
- *,
- obj: dict[str, object],
- key: str,
- decode_item: Callable[[object], _DecodedItemT | None],
-) -> list[_DecodedItemT] | None:
- raw_items = obj.get(key)
- if raw_items is None:
- return []
- wire_items = _as_list(raw_items)
- if wire_items is None:
- return None
- decoded_items: list[_DecodedItemT] = []
- for wire_item in wire_items:
- decoded = decode_item(wire_item)
- if decoded is None:
- return None
- decoded_items.append(decoded)
- return decoded_items
-
-
-def _decode_optional_wire_items_for_filepath(
- *,
- obj: dict[str, object],
- key: str,
- filepath: str,
- decode_item: Callable[[object, str], _DecodedItemT | None],
-) -> list[_DecodedItemT] | None:
- raw_items = obj.get(key)
- if raw_items is None:
- return []
- wire_items = _as_list(raw_items)
- if wire_items is None:
- return None
- decoded_items: list[_DecodedItemT] = []
- for wire_item in wire_items:
- decoded = decode_item(wire_item, filepath)
- if decoded is None:
- return None
- decoded_items.append(decoded)
- return decoded_items
-
-
-def _decode_optional_wire_row(
- *,
- obj: dict[str, object],
- key: str,
- expected_len: int,
-) -> list[object] | None:
- raw = obj.get(key)
- if raw is None:
- return None
- row = _as_list(raw)
- if row is None or len(row) != expected_len:
- return None
- return row
-
-
-def _decode_optional_wire_names(
- *,
- obj: dict[str, object],
- key: str,
-) -> list[str] | None:
- raw_names = obj.get(key)
- if raw_names is None:
- return []
- names = _as_list(raw_names)
- if names is None or not all(isinstance(name, str) for name in names):
- return None
- return [str(name) for name in names]
-
-
-def _decode_optional_wire_coupled_classes(
- *,
- obj: dict[str, object],
- key: str,
-) -> dict[str, list[str]] | None:
- raw = obj.get(key)
- if raw is None:
- return {}
-
- rows = _as_list(raw)
- if rows is None:
- return None
-
- decoded: dict[str, list[str]] = {}
- for wire_row in rows:
- row = _as_list(wire_row)
- if row is None or len(row) != 2:
- return None
- qualname = _as_str(row[0])
- names = _as_list(row[1])
- if qualname is None or names is None:
- return None
- if not all(isinstance(name, str) for name in names):
- return None
- decoded[qualname] = sorted({str(name) for name in names if str(name)})
-
- return decoded
-
-
-def _decode_wire_file_entry(value: object, filepath: str) -> CacheEntry | None:
- obj = _as_str_dict(value)
- if obj is None:
- return None
-
- stat = _decode_wire_stat(obj)
- if stat is None:
- return None
- source_stats = _decode_optional_wire_source_stats(obj=obj)
- file_sections = _decode_wire_file_sections(obj=obj, filepath=filepath)
- if file_sections is None:
- return None
- (
- units,
- blocks,
- segments,
- class_metrics,
- module_deps,
- dead_candidates,
- ) = file_sections
- name_sections = _decode_wire_name_sections(obj=obj)
- if name_sections is None:
- return None
- (
- referenced_names,
- referenced_qualnames,
- import_names,
- class_names,
- ) = name_sections
- typing_coverage = _decode_optional_wire_typing_coverage(obj=obj, filepath=filepath)
- docstring_coverage = _decode_optional_wire_docstring_coverage(
- obj=obj,
- filepath=filepath,
- )
- api_surface = _decode_optional_wire_api_surface(obj=obj, filepath=filepath)
- coupled_classes_map = _decode_optional_wire_coupled_classes(obj=obj, key="cc")
- if coupled_classes_map is None:
- return None
-
- for metric in class_metrics:
- names = coupled_classes_map.get(metric["qualname"], [])
- if names:
- metric["coupled_classes"] = names
-
- has_structural_findings = "sf" in obj
- structural_findings = _decode_wire_structural_findings_optional(obj)
- if structural_findings is None:
- return None
-
- return _attach_optional_cache_sections(
- CacheEntry(
- stat=stat,
- units=units,
- blocks=blocks,
- segments=segments,
- class_metrics=class_metrics,
- module_deps=module_deps,
- dead_candidates=dead_candidates,
- referenced_names=referenced_names,
- referenced_qualnames=referenced_qualnames,
- import_names=import_names,
- class_names=class_names,
- ),
- typing_coverage=typing_coverage,
- docstring_coverage=docstring_coverage,
- api_surface=api_surface,
- source_stats=source_stats,
- structural_findings=(
- _normalize_cached_structural_groups(structural_findings, filepath=filepath)
- if has_structural_findings
- else None
- ),
- )
-
-
-def _decode_wire_file_sections(
- *,
- obj: dict[str, object],
- filepath: str,
-) -> (
- tuple[
- list[UnitDict],
- list[BlockDict],
- list[SegmentDict],
- list[ClassMetricsDict],
- list[ModuleDepDict],
- list[DeadCandidateDict],
- ]
- | None
-):
- units = _decode_optional_wire_items_for_filepath(
- obj=obj,
- key="u",
- filepath=filepath,
- decode_item=_decode_wire_unit,
- )
- blocks = _decode_optional_wire_items_for_filepath(
- obj=obj,
- key="b",
- filepath=filepath,
- decode_item=_decode_wire_block,
- )
- segments = _decode_optional_wire_items_for_filepath(
- obj=obj,
- key="s",
- filepath=filepath,
- decode_item=_decode_wire_segment,
- )
- class_metrics = _decode_optional_wire_items_for_filepath(
- obj=obj,
- key="cm",
- filepath=filepath,
- decode_item=_decode_wire_class_metric,
- )
- module_deps = _decode_optional_wire_items(
- obj=obj,
- key="md",
- decode_item=_decode_wire_module_dep,
- )
- dead_candidates = _decode_optional_wire_items_for_filepath(
- obj=obj,
- key="dc",
- filepath=filepath,
- decode_item=_decode_wire_dead_candidate,
- )
- if (
- units is None
- or blocks is None
- or segments is None
- or class_metrics is None
- or module_deps is None
- or dead_candidates is None
- ):
- return None
- return (
- units,
- blocks,
- segments,
- class_metrics,
- module_deps,
- dead_candidates,
- )
-
-
-def _decode_wire_name_sections(
- *,
- obj: dict[str, object],
-) -> tuple[list[str], list[str], list[str], list[str]] | None:
- referenced_names = _decode_optional_wire_names(obj=obj, key="rn")
- referenced_qualnames = _decode_optional_wire_names(obj=obj, key="rq")
- import_names = _decode_optional_wire_names(obj=obj, key="in")
- class_names = _decode_optional_wire_names(obj=obj, key="cn")
- if (
- referenced_names is None
- or referenced_qualnames is None
- or import_names is None
- or class_names is None
- ):
- return None
- return (
- referenced_names,
- referenced_qualnames,
- import_names,
- class_names,
- )
-
-
-def _decode_optional_wire_typing_coverage(
- *,
- obj: dict[str, object],
- filepath: str,
-) -> ModuleTypingCoverageDict | None:
- module_and_ints = _decode_optional_wire_module_ints(
- obj=obj,
- key="tc",
- expected_len=7,
- int_indexes=(1, 2, 3, 4, 5, 6),
- )
- if module_and_ints is None:
- return None
- module, ints = module_and_ints
- (
- callable_count,
- params_total,
- params_annotated,
- returns_total,
- returns_annotated,
- any_annotation_count,
- ) = ints
- return ModuleTypingCoverageDict(
- module=module,
- filepath=filepath,
- callable_count=callable_count,
- params_total=params_total,
- params_annotated=params_annotated,
- returns_total=returns_total,
- returns_annotated=returns_annotated,
- any_annotation_count=any_annotation_count,
- )
-
-
-def _decode_optional_wire_docstring_coverage(
- *,
- obj: dict[str, object],
- filepath: str,
-) -> ModuleDocstringCoverageDict | None:
- module_and_counts = _decode_optional_wire_module_ints(
- obj=obj,
- key="dg",
- expected_len=3,
- int_indexes=(1, 2),
- )
- if module_and_counts is None:
- return None
- module, counts = module_and_counts
- public_symbol_total, public_symbol_documented = counts
- return ModuleDocstringCoverageDict(
- module=module,
- filepath=filepath,
- public_symbol_total=public_symbol_total,
- public_symbol_documented=public_symbol_documented,
- )
-
-
-def _decode_optional_wire_api_surface(
- *,
- obj: dict[str, object],
- filepath: str,
-) -> ModuleApiSurfaceDict | None:
- row = _decode_optional_wire_row(obj=obj, key="as", expected_len=3)
- if row is None:
- return None
- module = _as_str(row[0])
- all_declared = _decode_optional_wire_names(obj={"ad": row[1]}, key="ad")
- symbols_raw = _as_list(row[2])
- if module is None or all_declared is None or symbols_raw is None:
- return None
- symbols: list[PublicSymbolDict] = []
- for symbol_raw in symbols_raw:
- decoded_symbol = _decode_wire_api_surface_symbol(symbol_raw)
- if decoded_symbol is None:
- return None
- symbols.append(decoded_symbol)
- return ModuleApiSurfaceDict(
- module=module,
- filepath=filepath,
- all_declared=sorted(set(all_declared)),
- symbols=symbols,
- )
-
-
-def _decode_optional_wire_module_ints(
- *,
- obj: dict[str, object],
- key: str,
- expected_len: int,
- int_indexes: tuple[int, ...],
-) -> tuple[str, tuple[int, ...]] | None:
- row = _decode_optional_wire_row(obj=obj, key=key, expected_len=expected_len)
- if row is None:
- return None
- module = _as_str(row[0])
- ints = _decode_wire_int_fields(row, *int_indexes)
- if module is None or ints is None:
- return None
- return module, ints
-
-
-def _decode_wire_api_surface_symbol(
- value: object,
-) -> PublicSymbolDict | None:
- symbol_row = _decode_wire_row(value, valid_lengths={7})
- if symbol_row is None:
- return None
- str_fields = _decode_wire_str_fields(symbol_row, 0, 1, 4, 5)
- int_fields = _decode_wire_int_fields(symbol_row, 2, 3)
- params_raw = _as_list(symbol_row[6])
- if str_fields is None or int_fields is None or params_raw is None:
- return None
- qualname, kind, exported_via, returns_hash = str_fields
- start_line, end_line = int_fields
- params: list[ApiParamSpecDict] = []
- for param_raw in params_raw:
- decoded_param = _decode_wire_api_param_spec(param_raw)
- if decoded_param is None:
- return None
- params.append(decoded_param)
- return PublicSymbolDict(
- qualname=qualname,
- kind=kind,
- start_line=start_line,
- end_line=end_line,
- params=params,
- returns_hash=returns_hash,
- exported_via=exported_via,
- )
-
-
-def _decode_wire_api_param_spec(
- value: object,
-) -> ApiParamSpecDict | None:
- param_row = _decode_wire_row(value, valid_lengths={4})
- if param_row is None:
- return None
- str_fields = _decode_wire_str_fields(param_row, 0, 1, 3)
- int_fields = _decode_wire_int_fields(param_row, 2)
- if str_fields is None or int_fields is None:
- return None
- name, param_kind, annotation_hash = str_fields
- (has_default_raw,) = int_fields
- return ApiParamSpecDict(
- name=name,
- kind=param_kind,
- has_default=bool(has_default_raw),
- annotation_hash=annotation_hash,
- )
-
-
-def _decode_wire_structural_findings_optional(
- obj: dict[str, object],
-) -> list[StructuralFindingGroupDict] | None:
- """Decode optional 'sf' wire key. Returns [] if absent, None on invalid format."""
- raw = obj.get("sf")
- if raw is None:
- return []
- groups_raw = _as_list(raw)
- if groups_raw is None:
- return None
- groups: list[StructuralFindingGroupDict] = []
- for group_raw in groups_raw:
- group = _decode_wire_structural_group(group_raw)
- if group is None:
- return None
- groups.append(group)
- return groups
-
-
-def _decode_wire_row(
- value: object,
- *,
- valid_lengths: Collection[int],
-) -> list[object] | None:
- row = _as_list(value)
- if row is None or len(row) not in valid_lengths:
- return None
- return row
-
-
-def _decode_wire_named_span(
- value: object,
- *,
- valid_lengths: Collection[int],
-) -> tuple[list[object], str, int, int] | None:
- row = _decode_wire_row(value, valid_lengths=valid_lengths)
- if row is None:
- return None
- span = _decode_wire_qualname_span(row)
- if span is None:
- return None
- qualname, start_line, end_line = span
- return row, qualname, start_line, end_line
-
-
-def _decode_wire_named_sized_span(
- value: object,
- *,
- valid_lengths: Collection[int],
-) -> tuple[list[object], str, int, int, int] | None:
- row = _decode_wire_row(value, valid_lengths=valid_lengths)
- if row is None:
- return None
- span = _decode_wire_qualname_span_size(row)
- if span is None:
- return None
- qualname, start_line, end_line, size = span
- return row, qualname, start_line, end_line, size
-
-
-def _decode_wire_int_fields(
- row: list[object],
- *indexes: int,
-) -> tuple[int, ...] | None:
- values: list[int] = []
- for index in indexes:
- value = _as_int(row[index])
- if value is None:
- return None
- values.append(value)
- return tuple(values)
-
-
-def _decode_wire_str_fields(
- row: list[object],
- *indexes: int,
-) -> tuple[str, ...] | None:
- values: list[str] = []
- for index in indexes:
- value = _as_str(row[index])
- if value is None:
- return None
- values.append(value)
- return tuple(values)
-
-
-def _decode_wire_unit_core_fields(
- row: list[object],
-) -> tuple[int, int, str, str, int, int, Literal["low", "medium", "high"], str] | None:
- int_fields = _decode_wire_int_fields(row, 3, 4, 7, 8)
- str_fields = _decode_wire_str_fields(row, 5, 6, 10)
- risk = _as_risk_literal(row[9])
- if int_fields is None or str_fields is None or risk is None:
- return None
- loc, stmt_count, cyclomatic_complexity, nesting_depth = int_fields
- fingerprint, loc_bucket, raw_hash = str_fields
- return (
- loc,
- stmt_count,
- fingerprint,
- loc_bucket,
- cyclomatic_complexity,
- nesting_depth,
- risk,
- raw_hash,
- )
-
-
-def _decode_wire_unit_flow_profiles(
- row: list[object],
-) -> tuple[int, str, bool, str, str, str] | None:
- if len(row) != 17:
- return _DEFAULT_WIRE_UNIT_FLOW_PROFILES
-
- parsed_entry_guard_count = _as_int(row[11])
- parsed_entry_guard_terminal_profile = _as_str(row[12])
- parsed_entry_guard_has_side_effect_before = _as_int(row[13])
- parsed_terminal_kind = _as_str(row[14])
- parsed_try_finally_profile = _as_str(row[15])
- parsed_side_effect_order_profile = _as_str(row[16])
- if (
- parsed_entry_guard_count is None
- or parsed_entry_guard_terminal_profile is None
- or parsed_entry_guard_has_side_effect_before is None
- or parsed_terminal_kind is None
- or parsed_try_finally_profile is None
- or parsed_side_effect_order_profile is None
- ):
- return None
- return (
- max(0, parsed_entry_guard_count),
- parsed_entry_guard_terminal_profile or "none",
- parsed_entry_guard_has_side_effect_before != 0,
- parsed_terminal_kind or "fallthrough",
- parsed_try_finally_profile or "none",
- parsed_side_effect_order_profile or "none",
- )
-
-
-def _decode_wire_class_metric_fields(
- row: list[object],
-) -> tuple[int, int, int, int, str, str] | None:
- int_fields = _decode_wire_int_fields(row, 3, 4, 5, 6)
- str_fields = _decode_wire_str_fields(row, 7, 8)
- if int_fields is None or str_fields is None:
- return None
- cbo, lcom4, method_count, instance_var_count = int_fields
- risk_coupling, risk_cohesion = str_fields
- return (
- cbo,
- lcom4,
- method_count,
- instance_var_count,
- risk_coupling,
- risk_cohesion,
- )
-
-
-def _decode_wire_structural_group(value: object) -> StructuralFindingGroupDict | None:
- group_row = _decode_wire_row(value, valid_lengths={4})
- if group_row is None:
- return None
- str_fields = _decode_wire_str_fields(group_row, 0, 1)
- items_raw = _as_list(group_row[3])
- signature = _decode_wire_structural_signature(group_row[2])
- if str_fields is None or items_raw is None or signature is None:
- return None
- finding_kind, finding_key = str_fields
- items: list[StructuralFindingOccurrenceDict] = []
- for item_raw in items_raw:
- item = _decode_wire_structural_occurrence(item_raw)
- if item is None:
- return None
- items.append(item)
- return StructuralFindingGroupDict(
- finding_kind=finding_kind,
- finding_key=finding_key,
- signature=signature,
- items=items,
- )
-
-
-def _decode_wire_structural_signature(value: object) -> dict[str, str] | None:
- sig_raw = _as_list(value)
- if sig_raw is None:
- return None
- signature: dict[str, str] = {}
- for pair in sig_raw:
- pair_list = _as_list(pair)
- if pair_list is None or len(pair_list) != 2:
- return None
- key = _as_str(pair_list[0])
- val = _as_str(pair_list[1])
- if key is None or val is None:
- return None
- signature[key] = val
- return signature
-
-
-def _decode_wire_structural_occurrence(
- value: object,
-) -> StructuralFindingOccurrenceDict | None:
- item_list = _as_list(value)
- if item_list is None or len(item_list) != 3:
- return None
- qualname = _as_str(item_list[0])
- start = _as_int(item_list[1])
- end = _as_int(item_list[2])
- if qualname is None or start is None or end is None:
- return None
- return StructuralFindingOccurrenceDict(
- qualname=qualname,
- start=start,
- end=end,
- )
-
-
-def _decode_wire_unit(value: object, filepath: str) -> UnitDict | None:
- decoded = _decode_wire_named_span(value, valid_lengths={11, 17})
- if decoded is None:
- return None
- row, qualname, start_line, end_line = decoded
- core_fields = _decode_wire_unit_core_fields(row)
- flow_profiles = _decode_wire_unit_flow_profiles(row)
- if core_fields is None or flow_profiles is None:
- return None
- (
- loc,
- stmt_count,
- fingerprint,
- loc_bucket,
- cyclomatic_complexity,
- nesting_depth,
- risk,
- raw_hash,
- ) = core_fields
- (
- entry_guard_count,
- entry_guard_terminal_profile,
- entry_guard_has_side_effect_before,
- terminal_kind,
- try_finally_profile,
- side_effect_order_profile,
- ) = flow_profiles
- return FunctionGroupItem(
- qualname=qualname,
- filepath=filepath,
- start_line=start_line,
- end_line=end_line,
- loc=loc,
- stmt_count=stmt_count,
- fingerprint=fingerprint,
- loc_bucket=loc_bucket,
- cyclomatic_complexity=cyclomatic_complexity,
- nesting_depth=nesting_depth,
- risk=risk,
- raw_hash=raw_hash,
- entry_guard_count=entry_guard_count,
- entry_guard_terminal_profile=entry_guard_terminal_profile,
- entry_guard_has_side_effect_before=entry_guard_has_side_effect_before,
- terminal_kind=terminal_kind,
- try_finally_profile=try_finally_profile,
- side_effect_order_profile=side_effect_order_profile,
- )
-
-
-def _decode_wire_block(value: object, filepath: str) -> BlockDict | None:
- decoded = _decode_wire_named_sized_span(value, valid_lengths={5})
- if decoded is None:
- return None
- row, qualname, start_line, end_line, size = decoded
- block_hash = _as_str(row[4])
- if block_hash is None:
- return None
-
- return BlockGroupItem(
- block_hash=block_hash,
- filepath=filepath,
- qualname=qualname,
- start_line=start_line,
- end_line=end_line,
- size=size,
- )
-
-
-def _decode_wire_segment(value: object, filepath: str) -> SegmentDict | None:
- decoded = _decode_wire_named_sized_span(value, valid_lengths={6})
- if decoded is None:
- return None
- row, qualname, start_line, end_line, size = decoded
- segment_hash = _as_str(row[4])
- segment_sig = _as_str(row[5])
- if segment_hash is None or segment_sig is None:
- return None
-
- return SegmentGroupItem(
- segment_hash=segment_hash,
- segment_sig=segment_sig,
- filepath=filepath,
- qualname=qualname,
- start_line=start_line,
- end_line=end_line,
- size=size,
- )
-
-
-def _decode_wire_class_metric(
- value: object,
- filepath: str,
-) -> ClassMetricsDict | None:
- decoded = _decode_wire_named_span(value, valid_lengths={9})
- if decoded is None:
- return None
- row, qualname, start_line, end_line = decoded
- metric_fields = _decode_wire_class_metric_fields(row)
- if metric_fields is None:
- return None
- cbo, lcom4, method_count, instance_var_count, risk_coupling, risk_cohesion = (
- metric_fields
- )
- return ClassMetricsDict(
- qualname=qualname,
- filepath=filepath,
- start_line=start_line,
- end_line=end_line,
- cbo=cbo,
- lcom4=lcom4,
- method_count=method_count,
- instance_var_count=instance_var_count,
- risk_coupling=risk_coupling,
- risk_cohesion=risk_cohesion,
- )
-
-
-def _decode_wire_module_dep(value: object) -> ModuleDepDict | None:
- row = _as_list(value)
- if row is None or len(row) != 4:
- return None
- source = _as_str(row[0])
- target = _as_str(row[1])
- import_type = _as_str(row[2])
- line = _as_int(row[3])
- if source is None or target is None or import_type is None or line is None:
- return None
- return ModuleDepDict(
- source=source,
- target=target,
- import_type=import_type,
- line=line,
- )
-
-
-def _decode_wire_dead_candidate(
- value: object,
- filepath: str,
-) -> DeadCandidateDict | None:
- row = _decode_wire_row(value, valid_lengths={5, 6})
- if row is None:
- return None
- str_fields = _decode_wire_str_fields(row, 0, 1, 4)
- int_fields = _decode_wire_int_fields(row, 2, 3)
- suppressed_rules: list[str] | None = []
- if len(row) == 6:
- raw_rules = _as_list(row[5])
- if raw_rules is None or not all(isinstance(rule, str) for rule in raw_rules):
- return None
- suppressed_rules = sorted({str(rule) for rule in raw_rules if str(rule)})
- if str_fields is None or int_fields is None:
- return None
- qualname, local_name, kind = str_fields
- start_line, end_line = int_fields
- decoded = DeadCandidateDict(
- qualname=qualname,
- local_name=local_name,
- filepath=filepath,
- start_line=start_line,
- end_line=end_line,
- kind=kind,
- )
- if suppressed_rules:
- decoded["suppressed_rules"] = suppressed_rules
- return decoded
-
-
-def _encode_wire_file_entry(entry: CacheEntry) -> dict[str, object]:
- wire: dict[str, object] = {
- "st": [entry["stat"]["mtime_ns"], entry["stat"]["size"]],
- }
- source_stats = entry.get("source_stats")
- if source_stats is not None:
- wire["ss"] = [
- source_stats["lines"],
- source_stats["functions"],
- source_stats["methods"],
- source_stats["classes"],
- ]
-
- units = sorted(
- entry["units"],
- key=lambda unit: (
- unit["qualname"],
- unit["start_line"],
- unit["end_line"],
- unit["fingerprint"],
- ),
- )
- if units:
- wire["u"] = [
- [
- unit["qualname"],
- unit["start_line"],
- unit["end_line"],
- unit["loc"],
- unit["stmt_count"],
- unit["fingerprint"],
- unit["loc_bucket"],
- unit.get("cyclomatic_complexity", 1),
- unit.get("nesting_depth", 0),
- unit.get("risk", "low"),
- unit.get("raw_hash", ""),
- unit.get("entry_guard_count", 0),
- unit.get("entry_guard_terminal_profile", "none"),
- 1 if unit.get("entry_guard_has_side_effect_before", False) else 0,
- unit.get("terminal_kind", "fallthrough"),
- unit.get("try_finally_profile", "none"),
- unit.get("side_effect_order_profile", "none"),
- ]
- for unit in units
- ]
-
- blocks = sorted(
- entry["blocks"],
- key=lambda block: (
- block["qualname"],
- block["start_line"],
- block["end_line"],
- block["block_hash"],
- ),
- )
- if blocks:
- wire["b"] = [
- [
- block["qualname"],
- block["start_line"],
- block["end_line"],
- block["size"],
- block["block_hash"],
- ]
- for block in blocks
- ]
-
- segments = sorted(
- entry["segments"],
- key=lambda segment: (
- segment["qualname"],
- segment["start_line"],
- segment["end_line"],
- segment["segment_hash"],
- ),
- )
- if segments:
- wire["s"] = [
- [
- segment["qualname"],
- segment["start_line"],
- segment["end_line"],
- segment["size"],
- segment["segment_hash"],
- segment["segment_sig"],
- ]
- for segment in segments
- ]
-
- class_metrics = sorted(
- entry["class_metrics"],
- key=lambda metric: (
- metric["start_line"],
- metric["end_line"],
- metric["qualname"],
- ),
- )
- if class_metrics:
- coupled_classes_rows: list[list[object]] = []
-
- def _append_coupled_classes_row(metric: ClassMetricsDict) -> None:
- coupled_classes = _normalized_optional_string_list(
- metric.get("coupled_classes", [])
- )
- if coupled_classes:
- coupled_classes_rows.append([metric["qualname"], coupled_classes])
-
- wire["cm"] = [
- [
- metric["qualname"],
- metric["start_line"],
- metric["end_line"],
- metric["cbo"],
- metric["lcom4"],
- metric["method_count"],
- metric["instance_var_count"],
- metric["risk_coupling"],
- metric["risk_cohesion"],
- ]
- for metric in class_metrics
- ]
- for metric in class_metrics:
- _append_coupled_classes_row(metric)
- if coupled_classes_rows:
- wire["cc"] = coupled_classes_rows
-
- module_deps = sorted(
- entry["module_deps"],
- key=lambda dep: (dep["source"], dep["target"], dep["import_type"], dep["line"]),
- )
- if module_deps:
- wire["md"] = [
- [
- dep["source"],
- dep["target"],
- dep["import_type"],
- dep["line"],
- ]
- for dep in module_deps
- ]
-
- dead_candidates = sorted(
- entry["dead_candidates"],
- key=lambda candidate: (
- candidate["start_line"],
- candidate["end_line"],
- candidate["qualname"],
- candidate["local_name"],
- candidate["kind"],
- ),
- )
- if dead_candidates:
- # Dead candidates are stored inside a per-file cache entry, so the
- # filepath is implicit and does not need to be repeated in every row.
- encoded_dead_candidates: list[list[object]] = []
- for candidate in dead_candidates:
- encoded = [
- candidate["qualname"],
- candidate["local_name"],
- candidate["start_line"],
- candidate["end_line"],
- candidate["kind"],
- ]
- suppressed_rules = candidate.get("suppressed_rules", [])
- normalized_rules = _normalized_optional_string_list(suppressed_rules)
- if normalized_rules:
- encoded.append(normalized_rules)
- encoded_dead_candidates.append(encoded)
- wire["dc"] = encoded_dead_candidates
-
- if entry["referenced_names"]:
- wire["rn"] = sorted(set(entry["referenced_names"]))
- if entry.get("referenced_qualnames"):
- wire["rq"] = sorted(set(entry["referenced_qualnames"]))
- if entry["import_names"]:
- wire["in"] = sorted(set(entry["import_names"]))
- if entry["class_names"]:
- wire["cn"] = sorted(set(entry["class_names"]))
- typing_coverage = entry.get("typing_coverage")
- if typing_coverage is not None:
- wire["tc"] = [
- typing_coverage["module"],
- typing_coverage["callable_count"],
- typing_coverage["params_total"],
- typing_coverage["params_annotated"],
- typing_coverage["returns_total"],
- typing_coverage["returns_annotated"],
- typing_coverage["any_annotation_count"],
- ]
- docstring_coverage = entry.get("docstring_coverage")
- if docstring_coverage is not None:
- wire["dg"] = [
- docstring_coverage["module"],
- docstring_coverage["public_symbol_total"],
- docstring_coverage["public_symbol_documented"],
- ]
- api_surface = entry.get("api_surface")
- if api_surface is not None:
- wire["as"] = [
- api_surface["module"],
- sorted(set(api_surface.get("all_declared", []))),
- [
- [
- symbol["qualname"],
- symbol["kind"],
- symbol["start_line"],
- symbol["end_line"],
- symbol.get("exported_via", "name"),
- symbol.get("returns_hash", ""),
- [
- [
- param["name"],
- param["kind"],
- 1 if param["has_default"] else 0,
- param.get("annotation_hash", ""),
- ]
- for param in symbol.get("params", [])
- ],
- ]
- for symbol in api_surface["symbols"]
- ],
- ]
-
- if "structural_findings" in entry:
- sf = entry.get("structural_findings", [])
- wire["sf"] = [
- [
- group["finding_kind"],
- group["finding_key"],
- sorted(group["signature"].items()),
- [
- [item["qualname"], item["start"], item["end"]]
- for item in group["items"]
- ],
- ]
- for group in sf
- ]
-
- return wire
-
-
-def _resolve_root(root: str | Path | None) -> Path | None:
- if root is None:
- return None
- try:
- return Path(root).resolve(strict=False)
- except OSError:
- return None
-
-
-def _is_file_stat_dict(value: object) -> bool:
- if not isinstance(value, dict):
- return False
- return isinstance(value.get("mtime_ns"), int) and isinstance(value.get("size"), int)
-
-
-def _is_source_stats_dict(value: object) -> bool:
- if not isinstance(value, dict):
- return False
- lines = value.get("lines")
- functions = value.get("functions")
- methods = value.get("methods")
- classes = value.get("classes")
- return (
- isinstance(lines, int)
- and lines >= 0
- and isinstance(functions, int)
- and functions >= 0
- and isinstance(methods, int)
- and methods >= 0
- and isinstance(classes, int)
- and classes >= 0
- )
-
-
-def _is_unit_dict(value: object) -> bool:
- if not isinstance(value, dict):
- return False
- string_keys = ("qualname", "filepath", "fingerprint", "loc_bucket")
- int_keys = ("start_line", "end_line", "loc", "stmt_count")
- if not _has_typed_fields(value, string_keys=string_keys, int_keys=int_keys):
- return False
- cyclomatic_complexity = value.get("cyclomatic_complexity", 1)
- nesting_depth = value.get("nesting_depth", 0)
- risk = value.get("risk", "low")
- raw_hash = value.get("raw_hash", "")
- return (
- isinstance(cyclomatic_complexity, int)
- and isinstance(nesting_depth, int)
- and isinstance(risk, str)
- and risk in {"low", "medium", "high"}
- and isinstance(raw_hash, str)
- and isinstance(value.get("entry_guard_count", 0), int)
- and isinstance(value.get("entry_guard_terminal_profile", "none"), str)
- and isinstance(value.get("entry_guard_has_side_effect_before", False), bool)
- and isinstance(value.get("terminal_kind", "fallthrough"), str)
- and isinstance(value.get("try_finally_profile", "none"), str)
- and isinstance(value.get("side_effect_order_profile", "none"), str)
- )
-
-
-def _is_block_dict(value: object) -> bool:
- if not isinstance(value, dict):
- return False
- string_keys = ("block_hash", "filepath", "qualname")
- int_keys = ("start_line", "end_line", "size")
- return _has_typed_fields(value, string_keys=string_keys, int_keys=int_keys)
-
-
-def _is_segment_dict(value: object) -> bool:
- if not isinstance(value, dict):
- return False
- string_keys = ("segment_hash", "segment_sig", "filepath", "qualname")
- int_keys = ("start_line", "end_line", "size")
- return _has_typed_fields(value, string_keys=string_keys, int_keys=int_keys)
-
-
-def _is_module_typing_coverage_dict(value: object) -> bool:
- if not isinstance(value, dict):
- return False
- string_keys = ("module", "filepath")
- int_keys = (
- "callable_count",
- "params_total",
- "params_annotated",
- "returns_total",
- "returns_annotated",
- "any_annotation_count",
- )
- return _has_typed_fields(value, string_keys=string_keys, int_keys=int_keys)
-
-
-def _is_module_docstring_coverage_dict(value: object) -> bool:
- if not isinstance(value, dict):
- return False
- string_keys = ("module", "filepath")
- int_keys = ("public_symbol_total", "public_symbol_documented")
- return _has_typed_fields(value, string_keys=string_keys, int_keys=int_keys)
-
-
-def _is_api_param_spec_dict(value: object) -> bool:
- if not isinstance(value, dict):
- return False
- return (
- isinstance(value.get("name"), str)
- and isinstance(value.get("kind"), str)
- and isinstance(value.get("has_default"), bool)
- and isinstance(value.get("annotation_hash", ""), str)
- )
-
-
-def _is_public_symbol_dict(value: object) -> bool:
- if not isinstance(value, dict):
- return False
- if not _has_typed_fields(
- value,
- string_keys=("qualname", "kind", "exported_via"),
- int_keys=("start_line", "end_line"),
- ):
- return False
- params = value.get("params", [])
- return (
- isinstance(value.get("returns_hash", ""), str)
- and isinstance(
- params,
- list,
- )
- and all(_is_api_param_spec_dict(item) for item in params)
- )
-
-
-def _is_module_api_surface_dict(value: object) -> bool:
- if not isinstance(value, dict):
- return False
- all_declared = value.get("all_declared", [])
- symbols = value.get("symbols", [])
- return (
- isinstance(value.get("module"), str)
- and isinstance(value.get("filepath"), str)
- and _is_string_list(all_declared)
- and isinstance(symbols, list)
- and all(_is_public_symbol_dict(item) for item in symbols)
- )
-
-
-def _is_class_metrics_dict(value: object) -> bool:
- if not isinstance(value, dict):
- return False
- if not _has_typed_fields(
- value,
- string_keys=(
- "qualname",
- "filepath",
- "risk_coupling",
- "risk_cohesion",
- ),
- int_keys=(
- "start_line",
- "end_line",
- "cbo",
- "lcom4",
- "method_count",
- "instance_var_count",
- ),
- ):
- return False
-
- coupled_classes = value.get("coupled_classes")
- if coupled_classes is None:
- return True
- return _is_string_list(coupled_classes)
-
-
-def _is_module_dep_dict(value: object) -> bool:
- if not isinstance(value, dict):
- return False
- return _has_typed_fields(
- value,
- string_keys=("source", "target", "import_type"),
- int_keys=("line",),
- )
-
-
-def _is_dead_candidate_dict(value: object) -> bool:
- if not isinstance(value, dict):
- return False
- if not _has_typed_fields(
- value,
- string_keys=("qualname", "local_name", "filepath", "kind"),
- int_keys=("start_line", "end_line"),
- ):
- return False
- suppressed_rules = value.get("suppressed_rules")
- if suppressed_rules is None:
- return True
- return _is_string_list(suppressed_rules)
-
-
-def _is_string_list(value: object) -> bool:
- return isinstance(value, list) and all(isinstance(item, str) for item in value)
-
-
-def _has_typed_fields(
- value: Mapping[str, object],
- *,
- string_keys: Sequence[str],
- int_keys: Sequence[str],
-) -> bool:
- return all(isinstance(value.get(key), str) for key in string_keys) and all(
- isinstance(value.get(key), int) for key in int_keys
- )
diff --git a/codeclone/_html_report/_sections/__init__.py b/codeclone/cache/__init__.py
similarity index 100%
rename from codeclone/_html_report/_sections/__init__.py
rename to codeclone/cache/__init__.py
diff --git a/codeclone/cache/_canonicalize.py b/codeclone/cache/_canonicalize.py
new file mode 100644
index 0000000..b3d903e
--- /dev/null
+++ b/codeclone/cache/_canonicalize.py
@@ -0,0 +1,457 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
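+"""Coercion and canonicalization helpers for cached entry data."""
+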
+from __future__ import annotations
+
+from collections.abc import Callable, Mapping
+from typing import TypeGuard, TypeVar
+
+from ._validators import (
+ _is_block_dict,
+ _is_class_metrics_dict,
+ _is_dead_candidate_dict,
+ _is_file_stat_dict,
+ _is_module_api_surface_dict,
+ _is_module_dep_dict,
+ _is_module_docstring_coverage_dict,
+ _is_module_typing_coverage_dict,
+ _is_security_surface_dict,
+ _is_segment_dict,
+ _is_source_stats_dict,
+ _is_string_list,
+ _is_unit_dict,
+)
+from .entries import (
+ ApiParamSpecDict,
+ BlockDict,
+ CacheEntry,
+ ClassMetricsDict,
+ DeadCandidateDict,
+ FileStat,
+ ModuleApiSurfaceDict,
+ ModuleDepDict,
+ ModuleDocstringCoverageDict,
+ ModuleTypingCoverageDict,
+ PublicSymbolDict,
+ SecuritySurfaceDict,
+ SegmentDict,
+ SourceStatsDict,
+ StructuralFindingGroupDict,
+ UnitDict,
+)
+
+_ValidatedItemT = TypeVar("_ValidatedItemT")
+
+
+def _is_str_item(value: object) -> TypeGuard[str]:
+ return isinstance(value, str)
+
+
+def _as_file_stat_dict(value: object) -> FileStat | None:
+    if not _is_file_stat_dict(value):
+        return None
+    # The TypeGuard above already guarantees both fields are ints.
+    return FileStat(mtime_ns=value["mtime_ns"], size=value["size"])
+
+
+def _as_source_stats_dict(value: object) -> SourceStatsDict | None:
+ if not _is_source_stats_dict(value):
+ return None
+ return SourceStatsDict(
+ lines=value["lines"],
+ functions=value["functions"],
+ methods=value["methods"],
+ classes=value["classes"],
+ )
+
+
+def _as_typed_list(
+ value: object,
+ *,
+ predicate: Callable[[object], TypeGuard[_ValidatedItemT]],
+) -> list[_ValidatedItemT] | None:
+ if not isinstance(value, list):
+ return None
+ items: list[_ValidatedItemT] = []
+ for item in value:
+ if not predicate(item):
+ return None
+ items.append(item)
+ return items
+
+
+def _as_typed_unit_list(value: object) -> list[UnitDict] | None:
+ return _as_typed_list(value, predicate=_is_unit_dict)
+
+
+def _as_typed_block_list(value: object) -> list[BlockDict] | None:
+ return _as_typed_list(value, predicate=_is_block_dict)
+
+
+def _as_typed_segment_list(value: object) -> list[SegmentDict] | None:
+ return _as_typed_list(value, predicate=_is_segment_dict)
+
+
+def _as_typed_class_metrics_list(value: object) -> list[ClassMetricsDict] | None:
+ return _as_typed_list(value, predicate=_is_class_metrics_dict)
+
+
+def _as_typed_dead_candidates_list(
+ value: object,
+) -> list[DeadCandidateDict] | None:
+ return _as_typed_list(value, predicate=_is_dead_candidate_dict)
+
+
+def _as_typed_module_deps_list(value: object) -> list[ModuleDepDict] | None:
+ return _as_typed_list(value, predicate=_is_module_dep_dict)
+
+
+def _as_typed_security_surfaces_list(value: object) -> list[SecuritySurfaceDict] | None:
+ return _as_typed_list(value, predicate=_is_security_surface_dict)
+
+
+def _as_typed_string_list(value: object) -> list[str] | None:
+ return _as_typed_list(value, predicate=_is_str_item)
+
+
+def _as_module_typing_coverage_dict(
+ value: object,
+) -> ModuleTypingCoverageDict | None:
+ if not _is_module_typing_coverage_dict(value):
+ return None
+ return value
+
+
+def _as_module_docstring_coverage_dict(
+ value: object,
+) -> ModuleDocstringCoverageDict | None:
+ if not _is_module_docstring_coverage_dict(value):
+ return None
+ return value
+
+
+def _as_module_api_surface_dict(value: object) -> ModuleApiSurfaceDict | None:
+ if not _is_module_api_surface_dict(value):
+ return None
+ return value
+
+
+def _normalized_optional_string_list(value: object) -> list[str] | None:
+ items = _as_typed_string_list(value)
+ if not items:
+ return None
+ return sorted(set(items))
+
+
+def _is_canonical_cache_entry(value: object) -> TypeGuard[CacheEntry]:
+ return isinstance(value, dict) and _has_cache_entry_container_shape(value)
+
+
+def _has_cache_entry_container_shape(entry: Mapping[str, object]) -> bool:
+ required = {"stat", "units", "blocks", "segments"}
+ if not required.issubset(entry.keys()):
+ return False
+ if not isinstance(entry.get("stat"), dict):
+ return False
+ if not isinstance(entry.get("units"), list):
+ return False
+ if not isinstance(entry.get("blocks"), list):
+ return False
+ if not isinstance(entry.get("segments"), list):
+ return False
+ source_stats = entry.get("source_stats")
+ if source_stats is not None and not _is_source_stats_dict(source_stats):
+ return False
+ optional_list_keys = (
+ "class_metrics",
+ "module_deps",
+ "dead_candidates",
+ "referenced_names",
+ "referenced_qualnames",
+ "import_names",
+ "class_names",
+ "security_surfaces",
+ "structural_findings",
+ )
+ if not all(isinstance(entry.get(key, []), list) for key in optional_list_keys):
+ return False
+ typing_coverage = entry.get("typing_coverage")
+ if typing_coverage is not None and not _is_module_typing_coverage_dict(
+ typing_coverage
+ ):
+ return False
+ docstring_coverage = entry.get("docstring_coverage")
+ if docstring_coverage is not None and not _is_module_docstring_coverage_dict(
+ docstring_coverage
+ ):
+ return False
+ api_surface = entry.get("api_surface")
+ return api_surface is None or _is_module_api_surface_dict(api_surface)
+
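+# Container-shape sketch: the smallest value accepted by the check above is
+#     {"stat": {...}, "units": [], "blocks": [], "segments": []}
+# (element-level content is validated separately by the item validators).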
+
+def _decode_optional_cache_sections(
+ entry: Mapping[str, object],
+) -> (
+ tuple[
+ list[ClassMetricsDict],
+ list[ModuleDepDict],
+ list[DeadCandidateDict],
+ list[str],
+ list[str],
+ list[str],
+ list[str],
+ list[SecuritySurfaceDict],
+ ModuleTypingCoverageDict | None,
+ ModuleDocstringCoverageDict | None,
+ ModuleApiSurfaceDict | None,
+ SourceStatsDict | None,
+ list[StructuralFindingGroupDict] | None,
+ ]
+ | None
+):
+ class_metrics_raw = _as_typed_class_metrics_list(entry.get("class_metrics", []))
+ module_deps_raw = _as_typed_module_deps_list(entry.get("module_deps", []))
+ dead_candidates_raw = _as_typed_dead_candidates_list(
+ entry.get("dead_candidates", [])
+ )
+ referenced_names_raw = _as_typed_string_list(entry.get("referenced_names", []))
+ referenced_qualnames_raw = _as_typed_string_list(
+ entry.get("referenced_qualnames", [])
+ )
+ import_names_raw = _as_typed_string_list(entry.get("import_names", []))
+ class_names_raw = _as_typed_string_list(entry.get("class_names", []))
+ security_surfaces_raw = _as_typed_security_surfaces_list(
+ entry.get("security_surfaces", [])
+ )
+ if (
+ class_metrics_raw is None
+ or module_deps_raw is None
+ or dead_candidates_raw is None
+ or referenced_names_raw is None
+ or referenced_qualnames_raw is None
+ or import_names_raw is None
+ or class_names_raw is None
+ or security_surfaces_raw is None
+ ):
+ return None
+ typing_coverage_raw = _as_module_typing_coverage_dict(entry.get("typing_coverage"))
+ docstring_coverage_raw = _as_module_docstring_coverage_dict(
+ entry.get("docstring_coverage")
+ )
+ api_surface_raw = _as_module_api_surface_dict(entry.get("api_surface"))
+ source_stats = _as_source_stats_dict(entry.get("source_stats"))
+ structural_findings = entry.get("structural_findings")
+ typed_structural_findings = (
+ structural_findings if isinstance(structural_findings, list) else None
+ )
+ return (
+ class_metrics_raw,
+ module_deps_raw,
+ dead_candidates_raw,
+ referenced_names_raw,
+ referenced_qualnames_raw,
+ import_names_raw,
+ class_names_raw,
+ security_surfaces_raw,
+ typing_coverage_raw,
+ docstring_coverage_raw,
+ api_surface_raw,
+ source_stats,
+ typed_structural_findings,
+ )
+
+
+def _attach_optional_cache_sections(
+ entry: CacheEntry,
+ *,
+ typing_coverage: ModuleTypingCoverageDict | None = None,
+ docstring_coverage: ModuleDocstringCoverageDict | None = None,
+ api_surface: ModuleApiSurfaceDict | None = None,
+ security_surfaces: list[SecuritySurfaceDict] | None = None,
+ source_stats: SourceStatsDict | None = None,
+ structural_findings: list[StructuralFindingGroupDict] | None = None,
+) -> CacheEntry:
+ if typing_coverage is not None:
+ entry["typing_coverage"] = typing_coverage
+ if docstring_coverage is not None:
+ entry["docstring_coverage"] = docstring_coverage
+ if api_surface is not None:
+ entry["api_surface"] = api_surface
+ if security_surfaces is not None:
+ entry["security_surfaces"] = security_surfaces
+ if source_stats is not None:
+ entry["source_stats"] = source_stats
+ if structural_findings is not None:
+ entry["structural_findings"] = structural_findings
+ return entry
+
+
+def _canonicalize_cache_entry(entry: CacheEntry) -> CacheEntry:
+ class_metrics_sorted = sorted(
+ entry["class_metrics"],
+ key=lambda item: (
+ item["start_line"],
+ item["end_line"],
+ item["qualname"],
+ ),
+ )
+ for metric in class_metrics_sorted:
+ coupled_classes = metric.get("coupled_classes", [])
+ if coupled_classes:
+ metric["coupled_classes"] = sorted(set(coupled_classes))
+
+ module_deps_sorted = sorted(
+ entry["module_deps"],
+ key=lambda item: (
+ item["source"],
+ item["target"],
+ item["import_type"],
+ item["line"],
+ ),
+ )
+ dead_candidates_normalized: list[DeadCandidateDict] = []
+ for candidate in entry["dead_candidates"]:
+ suppressed_rules = candidate.get("suppressed_rules", [])
+ normalized_candidate = DeadCandidateDict(
+ qualname=candidate["qualname"],
+ local_name=candidate["local_name"],
+ filepath=candidate["filepath"],
+ start_line=candidate["start_line"],
+ end_line=candidate["end_line"],
+ kind=candidate["kind"],
+ )
+ if _is_string_list(suppressed_rules):
+ normalized_rules = sorted(set(suppressed_rules))
+ if normalized_rules:
+ normalized_candidate["suppressed_rules"] = normalized_rules
+ dead_candidates_normalized.append(normalized_candidate)
+
+ dead_candidates_sorted = sorted(
+ dead_candidates_normalized,
+ key=lambda item: (
+ item["start_line"],
+ item["end_line"],
+ item["qualname"],
+ item["local_name"],
+ item["kind"],
+ tuple(item.get("suppressed_rules", [])),
+ ),
+ )
+
+ result: CacheEntry = {
+ "stat": entry["stat"],
+ "units": entry["units"],
+ "blocks": entry["blocks"],
+ "segments": entry["segments"],
+ "class_metrics": class_metrics_sorted,
+ "module_deps": module_deps_sorted,
+ "dead_candidates": dead_candidates_sorted,
+ "referenced_names": sorted(set(entry["referenced_names"])),
+ "referenced_qualnames": sorted(set(entry.get("referenced_qualnames", []))),
+ "import_names": sorted(set(entry["import_names"])),
+ "class_names": sorted(set(entry["class_names"])),
+ "security_surfaces": sorted(
+ entry.get("security_surfaces", []),
+ key=lambda item: (
+ item["start_line"],
+ item["end_line"],
+ item["qualname"],
+ item["category"],
+ item["capability"],
+ item["evidence_symbol"],
+ ),
+ ),
+ }
+ typing_coverage = entry.get("typing_coverage")
+ if typing_coverage is not None:
+ result["typing_coverage"] = ModuleTypingCoverageDict(
+ module=typing_coverage["module"],
+ filepath=typing_coverage["filepath"],
+ callable_count=typing_coverage["callable_count"],
+ params_total=typing_coverage["params_total"],
+ params_annotated=typing_coverage["params_annotated"],
+ returns_total=typing_coverage["returns_total"],
+ returns_annotated=typing_coverage["returns_annotated"],
+ any_annotation_count=typing_coverage["any_annotation_count"],
+ )
+ docstring_coverage = entry.get("docstring_coverage")
+ if docstring_coverage is not None:
+ result["docstring_coverage"] = ModuleDocstringCoverageDict(
+ module=docstring_coverage["module"],
+ filepath=docstring_coverage["filepath"],
+ public_symbol_total=docstring_coverage["public_symbol_total"],
+ public_symbol_documented=docstring_coverage["public_symbol_documented"],
+ )
+ api_surface = entry.get("api_surface")
+ if api_surface is not None:
+ symbols = sorted(
+ api_surface["symbols"],
+ key=lambda item: (
+ item["qualname"],
+ item["kind"],
+ item["start_line"],
+ item["end_line"],
+ ),
+ )
+ normalized_symbols = [
+ PublicSymbolDict(
+ qualname=symbol["qualname"],
+ kind=symbol["kind"],
+ start_line=symbol["start_line"],
+ end_line=symbol["end_line"],
+ params=[
+ ApiParamSpecDict(
+ name=param["name"],
+ kind=param["kind"],
+ has_default=param["has_default"],
+ annotation_hash=param["annotation_hash"],
+ )
+ for param in symbol.get("params", [])
+ ],
+ returns_hash=symbol.get("returns_hash", ""),
+ exported_via=symbol.get("exported_via", "name"),
+ )
+ for symbol in symbols
+ ]
+ result["api_surface"] = ModuleApiSurfaceDict(
+ module=api_surface["module"],
+ filepath=api_surface["filepath"],
+ all_declared=sorted(set(api_surface.get("all_declared", []))),
+ symbols=normalized_symbols,
+ )
+ structural_findings = entry.get("structural_findings")
+ if structural_findings is not None:
+ result["structural_findings"] = structural_findings
+ source_stats = entry.get("source_stats")
+ if source_stats is not None:
+ result["source_stats"] = source_stats
+ return result
+
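+# Normalization sketch: class_metrics, module_deps, and dead_candidates get a
+# deterministic sort order, and name lists are dedup'd and sorted, e.g.:
+#     entry["referenced_names"] = ["b", "a", "a"]
+#     _canonicalize_cache_entry(entry)["referenced_names"] == ["a", "b"]
+# units, blocks, and segments are passed through unchanged here.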
+
+__all__ = [
+ "_as_file_stat_dict",
+ "_as_module_api_surface_dict",
+ "_as_module_docstring_coverage_dict",
+ "_as_module_typing_coverage_dict",
+ "_as_source_stats_dict",
+ "_as_typed_block_list",
+ "_as_typed_class_metrics_list",
+ "_as_typed_dead_candidates_list",
+ "_as_typed_module_deps_list",
+ "_as_typed_security_surfaces_list",
+ "_as_typed_segment_list",
+ "_as_typed_string_list",
+ "_as_typed_unit_list",
+ "_attach_optional_cache_sections",
+ "_canonicalize_cache_entry",
+ "_decode_optional_cache_sections",
+ "_has_cache_entry_container_shape",
+ "_is_canonical_cache_entry",
+ "_normalized_optional_string_list",
+]
diff --git a/codeclone/cache/_validators.py b/codeclone/cache/_validators.py
new file mode 100644
index 0000000..c289720
--- /dev/null
+++ b/codeclone/cache/_validators.py
@@ -0,0 +1,271 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
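+"""TypeGuard validators for the typed dictionaries stored in cache entries."""
+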
+from __future__ import annotations
+
+from collections.abc import Mapping, Sequence
+from typing import TypeGuard
+
+from .entries import (
+ ApiParamSpecDict,
+ BlockDict,
+ ClassMetricsDict,
+ DeadCandidateDict,
+ FileStat,
+ ModuleApiSurfaceDict,
+ ModuleDepDict,
+ ModuleDocstringCoverageDict,
+ ModuleTypingCoverageDict,
+ PublicSymbolDict,
+ SecuritySurfaceDict,
+ SegmentDict,
+ SourceStatsDict,
+ UnitDict,
+)
+
+
+def _is_file_stat_dict(value: object) -> TypeGuard[FileStat]:
+ if not isinstance(value, dict):
+ return False
+ return isinstance(value.get("mtime_ns"), int) and isinstance(value.get("size"), int)
+
+
+def _is_source_stats_dict(value: object) -> TypeGuard[SourceStatsDict]:
+ if not isinstance(value, dict):
+ return False
+ lines = value.get("lines")
+ functions = value.get("functions")
+ methods = value.get("methods")
+ classes = value.get("classes")
+ return (
+ isinstance(lines, int)
+ and lines >= 0
+ and isinstance(functions, int)
+ and functions >= 0
+ and isinstance(methods, int)
+ and methods >= 0
+ and isinstance(classes, int)
+ and classes >= 0
+ )
+
+
+def _is_unit_dict(value: object) -> TypeGuard[UnitDict]:
+ if not isinstance(value, dict):
+ return False
+ string_keys = ("qualname", "filepath", "fingerprint", "loc_bucket")
+ int_keys = ("start_line", "end_line", "loc", "stmt_count")
+ if not _has_typed_fields(value, string_keys=string_keys, int_keys=int_keys):
+ return False
+ cyclomatic_complexity = value.get("cyclomatic_complexity", 1)
+ nesting_depth = value.get("nesting_depth", 0)
+ risk = value.get("risk", "low")
+ raw_hash = value.get("raw_hash", "")
+ return (
+ isinstance(cyclomatic_complexity, int)
+ and isinstance(nesting_depth, int)
+ and isinstance(risk, str)
+ and risk in {"low", "medium", "high"}
+ and isinstance(raw_hash, str)
+ and isinstance(value.get("entry_guard_count", 0), int)
+ and isinstance(value.get("entry_guard_terminal_profile", "none"), str)
+ and isinstance(value.get("entry_guard_has_side_effect_before", False), bool)
+ and isinstance(value.get("terminal_kind", "fallthrough"), str)
+ and isinstance(value.get("try_finally_profile", "none"), str)
+ and isinstance(value.get("side_effect_order_profile", "none"), str)
+ )
+
+
+def _is_block_dict(value: object) -> TypeGuard[BlockDict]:
+ if not isinstance(value, dict):
+ return False
+ string_keys = ("block_hash", "filepath", "qualname")
+ int_keys = ("start_line", "end_line", "size")
+ return _has_typed_fields(value, string_keys=string_keys, int_keys=int_keys)
+
+
+def _is_segment_dict(value: object) -> TypeGuard[SegmentDict]:
+ if not isinstance(value, dict):
+ return False
+ string_keys = ("segment_hash", "segment_sig", "filepath", "qualname")
+ int_keys = ("start_line", "end_line", "size")
+ return _has_typed_fields(value, string_keys=string_keys, int_keys=int_keys)
+
+
+def _is_module_typing_coverage_dict(
+ value: object,
+) -> TypeGuard[ModuleTypingCoverageDict]:
+ if not isinstance(value, dict):
+ return False
+ string_keys = ("module", "filepath")
+ int_keys = (
+ "callable_count",
+ "params_total",
+ "params_annotated",
+ "returns_total",
+ "returns_annotated",
+ "any_annotation_count",
+ )
+ return _has_typed_fields(value, string_keys=string_keys, int_keys=int_keys)
+
+
+def _is_module_docstring_coverage_dict(
+ value: object,
+) -> TypeGuard[ModuleDocstringCoverageDict]:
+ if not isinstance(value, dict):
+ return False
+ string_keys = ("module", "filepath")
+ int_keys = ("public_symbol_total", "public_symbol_documented")
+ return _has_typed_fields(value, string_keys=string_keys, int_keys=int_keys)
+
+
+def _is_api_param_spec_dict(value: object) -> TypeGuard[ApiParamSpecDict]:
+ if not isinstance(value, dict):
+ return False
+ return (
+ isinstance(value.get("name"), str)
+ and isinstance(value.get("kind"), str)
+ and isinstance(value.get("has_default"), bool)
+ and isinstance(value.get("annotation_hash", ""), str)
+ )
+
+
+def _is_public_symbol_dict(value: object) -> TypeGuard[PublicSymbolDict]:
+ if not isinstance(value, dict):
+ return False
+ if not _has_typed_fields(
+ value,
+ string_keys=("qualname", "kind", "exported_via"),
+ int_keys=("start_line", "end_line"),
+ ):
+ return False
+ params = value.get("params", [])
+ return (
+ isinstance(value.get("returns_hash", ""), str)
+ and isinstance(params, list)
+ and all(_is_api_param_spec_dict(item) for item in params)
+ )
+
+
+def _is_module_api_surface_dict(value: object) -> TypeGuard[ModuleApiSurfaceDict]:
+ if not isinstance(value, dict):
+ return False
+ all_declared = value.get("all_declared", [])
+ symbols = value.get("symbols", [])
+ return (
+ isinstance(value.get("module"), str)
+ and isinstance(value.get("filepath"), str)
+ and _is_string_list(all_declared)
+ and isinstance(symbols, list)
+ and all(_is_public_symbol_dict(item) for item in symbols)
+ )
+
+
+def _is_class_metrics_dict(value: object) -> TypeGuard[ClassMetricsDict]:
+ if not isinstance(value, dict):
+ return False
+ if not _has_typed_fields(
+ value,
+ string_keys=(
+ "qualname",
+ "filepath",
+ "risk_coupling",
+ "risk_cohesion",
+ ),
+ int_keys=(
+ "start_line",
+ "end_line",
+ "cbo",
+ "lcom4",
+ "method_count",
+ "instance_var_count",
+ ),
+ ):
+ return False
+
+ coupled_classes = value.get("coupled_classes")
+ if coupled_classes is None:
+ return True
+ return _is_string_list(coupled_classes)
+
+
+def _is_module_dep_dict(value: object) -> TypeGuard[ModuleDepDict]:
+ if not isinstance(value, dict):
+ return False
+ return _has_typed_fields(
+ value,
+ string_keys=("source", "target", "import_type"),
+ int_keys=("line",),
+ )
+
+
+def _is_dead_candidate_dict(value: object) -> TypeGuard[DeadCandidateDict]:
+ if not isinstance(value, dict):
+ return False
+ if not _has_typed_fields(
+ value,
+ string_keys=("qualname", "local_name", "filepath", "kind"),
+ int_keys=("start_line", "end_line"),
+ ):
+ return False
+ suppressed_rules = value.get("suppressed_rules")
+ if suppressed_rules is None:
+ return True
+ return _is_string_list(suppressed_rules)
+
+
+def _is_security_surface_dict(value: object) -> TypeGuard[SecuritySurfaceDict]:
+ if not isinstance(value, dict):
+ return False
+ return _has_typed_fields(
+ value,
+ string_keys=(
+ "category",
+ "capability",
+ "module",
+ "filepath",
+ "qualname",
+ "location_scope",
+ "classification_mode",
+ "evidence_kind",
+ "evidence_symbol",
+ ),
+ int_keys=("start_line", "end_line"),
+ )
+
+
+def _is_string_list(value: object) -> TypeGuard[list[str]]:
+ return isinstance(value, list) and all(isinstance(item, str) for item in value)
+
+
+def _has_typed_fields(
+ value: Mapping[str, object],
+ *,
+ string_keys: Sequence[str],
+ int_keys: Sequence[str],
+) -> bool:
+ return all(isinstance(value.get(key), str) for key in string_keys) and all(
+ isinstance(value.get(key), int) for key in int_keys
+ )
+
+
+__all__ = [
+ "_has_typed_fields",
+ "_is_api_param_spec_dict",
+ "_is_block_dict",
+ "_is_class_metrics_dict",
+ "_is_dead_candidate_dict",
+ "_is_file_stat_dict",
+ "_is_module_api_surface_dict",
+ "_is_module_dep_dict",
+ "_is_module_docstring_coverage_dict",
+ "_is_module_typing_coverage_dict",
+ "_is_public_symbol_dict",
+ "_is_security_surface_dict",
+ "_is_segment_dict",
+ "_is_source_stats_dict",
+ "_is_string_list",
+ "_is_unit_dict",
+]
diff --git a/codeclone/cache/_wire_decode.py b/codeclone/cache/_wire_decode.py
new file mode 100644
index 0000000..55d5aeb
--- /dev/null
+++ b/codeclone/cache/_wire_decode.py
@@ -0,0 +1,762 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
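+"""Decode the compact wire representation of per-file cache entries."""
+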
+from __future__ import annotations
+
+from ..models import BlockGroupItem, FunctionGroupItem, SegmentGroupItem
+from ._canonicalize import _attach_optional_cache_sections
+from ._wire_helpers import (
+ _decode_optional_wire_coupled_classes,
+ _decode_optional_wire_items,
+ _decode_optional_wire_items_for_filepath,
+ _decode_optional_wire_names,
+ _decode_optional_wire_row,
+ _decode_wire_class_metric_fields,
+ _decode_wire_int_fields,
+ _decode_wire_named_sized_span,
+ _decode_wire_named_span,
+ _decode_wire_qualname_span,
+ _decode_wire_qualname_span_size,
+ _decode_wire_row,
+ _decode_wire_str_fields,
+ _decode_wire_unit_core_fields,
+ _decode_wire_unit_flow_profiles,
+)
+from .entries import (
+ ApiParamSpecDict,
+ BlockDict,
+ CacheEntry,
+ ClassMetricsDict,
+ DeadCandidateDict,
+ FileStat,
+ ModuleApiSurfaceDict,
+ ModuleDepDict,
+ ModuleDocstringCoverageDict,
+ ModuleTypingCoverageDict,
+ PublicSymbolDict,
+ SecuritySurfaceDict,
+ SegmentDict,
+ SourceStatsDict,
+ StructuralFindingGroupDict,
+ StructuralFindingOccurrenceDict,
+ UnitDict,
+ _as_security_surface_category,
+ _as_security_surface_classification_mode,
+ _as_security_surface_evidence_kind,
+ _as_security_surface_location_scope,
+ _normalize_cached_structural_groups,
+)
+from .integrity import (
+ as_int_or_none as _as_int,
+)
+from .integrity import (
+ as_object_list as _as_list,
+)
+from .integrity import (
+ as_str_dict as _as_str_dict,
+)
+from .integrity import (
+ as_str_or_none as _as_str,
+)
+
+
+def _decode_wire_stat(obj: dict[str, object]) -> FileStat | None:
+ stat_list = _as_list(obj.get("st"))
+ if stat_list is None or len(stat_list) != 2:
+ return None
+ mtime_ns = _as_int(stat_list[0])
+ size = _as_int(stat_list[1])
+ if mtime_ns is None or size is None:
+ return None
+ return FileStat(mtime_ns=mtime_ns, size=size)
+
+
+def _decode_optional_wire_source_stats(
+ *,
+ obj: dict[str, object],
+) -> SourceStatsDict | None:
+ row = _decode_optional_wire_row(obj=obj, key="ss", expected_len=4)
+ if row is None:
+ return None
+    counts = _decode_wire_int_fields(row, 0, 1, 2, 3)
+    if counts is None or any(value < 0 for value in counts):
+        return None
+    lines, functions, methods, classes = counts
+ return SourceStatsDict(
+ lines=lines,
+ functions=functions,
+ methods=methods,
+ classes=classes,
+ )
+
+
+def _decode_wire_file_entry(value: object, filepath: str) -> CacheEntry | None:
+ obj = _as_str_dict(value)
+ if obj is None:
+ return None
+
+ stat = _decode_wire_stat(obj)
+ if stat is None:
+ return None
+ source_stats = _decode_optional_wire_source_stats(obj=obj)
+ file_sections = _decode_wire_file_sections(obj=obj, filepath=filepath)
+ if file_sections is None:
+ return None
+ (
+ units,
+ blocks,
+ segments,
+ class_metrics,
+ module_deps,
+ dead_candidates,
+ ) = file_sections
+ name_sections = _decode_wire_name_sections(obj=obj)
+ if name_sections is None:
+ return None
+ (
+ referenced_names,
+ referenced_qualnames,
+ import_names,
+ class_names,
+ ) = name_sections
+ typing_coverage = _decode_optional_wire_typing_coverage(obj=obj, filepath=filepath)
+ docstring_coverage = _decode_optional_wire_docstring_coverage(
+ obj=obj,
+ filepath=filepath,
+ )
+ api_surface = _decode_optional_wire_api_surface(obj=obj, filepath=filepath)
+ security_surfaces = _decode_optional_wire_security_surfaces(
+ obj=obj,
+ filepath=filepath,
+ )
+    coupled_classes_map = _decode_optional_wire_coupled_classes(obj=obj, key="cc")
+    if coupled_classes_map is None or security_surfaces is None:
+        return None
+
+ for metric in class_metrics:
+ names = coupled_classes_map.get(metric["qualname"], [])
+ if names:
+ metric["coupled_classes"] = names
+
+ has_structural_findings = "sf" in obj
+ structural_findings = _decode_wire_structural_findings_optional(obj)
+ if structural_findings is None:
+ return None
+
+ return _attach_optional_cache_sections(
+ CacheEntry(
+ stat=stat,
+ units=units,
+ blocks=blocks,
+ segments=segments,
+ class_metrics=class_metrics,
+ module_deps=module_deps,
+ dead_candidates=dead_candidates,
+ referenced_names=referenced_names,
+ referenced_qualnames=referenced_qualnames,
+ import_names=import_names,
+ class_names=class_names,
+ ),
+ typing_coverage=typing_coverage,
+ docstring_coverage=docstring_coverage,
+ api_surface=api_surface,
+ security_surfaces=security_surfaces,
+ source_stats=source_stats,
+ structural_findings=(
+ _normalize_cached_structural_groups(structural_findings, filepath=filepath)
+ if has_structural_findings
+ else None
+ ),
+ )
+
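+# Wire-shape sketch (illustrative, minimal; field meanings inferred from the
+# decoders in this module):
+#     {
+#         "st": [1700000000000000000, 2048],  # required: [mtime_ns, size]
+#         "ss": [120, 3, 2, 1],  # optional: [lines, functions, methods, classes]
+#         "rn": ["os", "re"],  # optional: referenced names
+#     }
+# A malformed core section (e.g. "st", "u", "rn") fails the whole entry with
+# None; malformed optional per-module sections (e.g. "ss", "tc") are dropped.
+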
+
+def _decode_wire_file_sections(
+ *,
+ obj: dict[str, object],
+ filepath: str,
+) -> (
+ tuple[
+ list[UnitDict],
+ list[BlockDict],
+ list[SegmentDict],
+ list[ClassMetricsDict],
+ list[ModuleDepDict],
+ list[DeadCandidateDict],
+ ]
+ | None
+):
+ units = _decode_optional_wire_items_for_filepath(
+ obj=obj,
+ key="u",
+ filepath=filepath,
+ decode_item=_decode_wire_unit,
+ )
+ blocks = _decode_optional_wire_items_for_filepath(
+ obj=obj,
+ key="b",
+ filepath=filepath,
+ decode_item=_decode_wire_block,
+ )
+ segments = _decode_optional_wire_items_for_filepath(
+ obj=obj,
+ key="s",
+ filepath=filepath,
+ decode_item=_decode_wire_segment,
+ )
+ class_metrics = _decode_optional_wire_items_for_filepath(
+ obj=obj,
+ key="cm",
+ filepath=filepath,
+ decode_item=_decode_wire_class_metric,
+ )
+ module_deps = _decode_optional_wire_items(
+ obj=obj,
+ key="md",
+ decode_item=_decode_wire_module_dep,
+ )
+ dead_candidates = _decode_optional_wire_items_for_filepath(
+ obj=obj,
+ key="dc",
+ filepath=filepath,
+ decode_item=_decode_wire_dead_candidate,
+ )
+ if (
+ units is None
+ or blocks is None
+ or segments is None
+ or class_metrics is None
+ or module_deps is None
+ or dead_candidates is None
+ ):
+ return None
+ return (
+ units,
+ blocks,
+ segments,
+ class_metrics,
+ module_deps,
+ dead_candidates,
+ )
+
+
+def _decode_wire_name_sections(
+ *,
+ obj: dict[str, object],
+) -> tuple[list[str], list[str], list[str], list[str]] | None:
+ referenced_names = _decode_optional_wire_names(obj=obj, key="rn")
+ referenced_qualnames = _decode_optional_wire_names(obj=obj, key="rq")
+ import_names = _decode_optional_wire_names(obj=obj, key="in")
+ class_names = _decode_optional_wire_names(obj=obj, key="cn")
+ if (
+ referenced_names is None
+ or referenced_qualnames is None
+ or import_names is None
+ or class_names is None
+ ):
+ return None
+ return (
+ referenced_names,
+ referenced_qualnames,
+ import_names,
+ class_names,
+ )
+
+
+def _decode_optional_wire_typing_coverage(
+ *,
+ obj: dict[str, object],
+ filepath: str,
+) -> ModuleTypingCoverageDict | None:
+ module_and_ints = _decode_optional_wire_module_ints(
+ obj=obj,
+ key="tc",
+ expected_len=7,
+ int_indexes=(1, 2, 3, 4, 5, 6),
+ )
+ if module_and_ints is None:
+ return None
+ module, ints = module_and_ints
+ (
+ callable_count,
+ params_total,
+ params_annotated,
+ returns_total,
+ returns_annotated,
+ any_annotation_count,
+ ) = ints
+ return ModuleTypingCoverageDict(
+ module=module,
+ filepath=filepath,
+ callable_count=callable_count,
+ params_total=params_total,
+ params_annotated=params_annotated,
+ returns_total=returns_total,
+ returns_annotated=returns_annotated,
+ any_annotation_count=any_annotation_count,
+ )
+
+
+def _decode_optional_wire_docstring_coverage(
+ *,
+ obj: dict[str, object],
+ filepath: str,
+) -> ModuleDocstringCoverageDict | None:
+ module_and_counts = _decode_optional_wire_module_ints(
+ obj=obj,
+ key="dg",
+ expected_len=3,
+ int_indexes=(1, 2),
+ )
+ if module_and_counts is None:
+ return None
+ module, counts = module_and_counts
+ public_symbol_total, public_symbol_documented = counts
+ return ModuleDocstringCoverageDict(
+ module=module,
+ filepath=filepath,
+ public_symbol_total=public_symbol_total,
+ public_symbol_documented=public_symbol_documented,
+ )
+
+
+def _decode_optional_wire_api_surface(
+ *,
+ obj: dict[str, object],
+ filepath: str,
+) -> ModuleApiSurfaceDict | None:
+ row = _decode_optional_wire_row(obj=obj, key="as", expected_len=3)
+ if row is None:
+ return None
+ module = _as_str(row[0])
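+    # Reuse the shared names decoder by wrapping the raw value in a one-key dict.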
+ all_declared = _decode_optional_wire_names(obj={"ad": row[1]}, key="ad")
+ symbols_raw = _as_list(row[2])
+ if module is None or all_declared is None or symbols_raw is None:
+ return None
+ symbols: list[PublicSymbolDict] = []
+ for symbol_raw in symbols_raw:
+ decoded_symbol = _decode_wire_api_surface_symbol(symbol_raw)
+ if decoded_symbol is None:
+ return None
+ symbols.append(decoded_symbol)
+ return ModuleApiSurfaceDict(
+ module=module,
+ filepath=filepath,
+ all_declared=sorted(set(all_declared)),
+ symbols=symbols,
+ )
+
+
+def _decode_optional_wire_security_surfaces(
+    *,
+    obj: dict[str, object],
+    filepath: str,
+) -> list[SecuritySurfaceDict] | None:
+    return _decode_optional_wire_items_for_filepath(
+        obj=obj,
+        key="sc",
+        filepath=filepath,
+        decode_item=_decode_wire_security_surface,
+    )
+
+
+def _decode_wire_security_surface(
+ row_raw: object,
+ filepath: str,
+) -> SecuritySurfaceDict | None:
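+    # Row layout (10 fields): [category, capability, module, qualname,
+    # start_line, end_line, location_scope, classification_mode,
+    # evidence_kind, evidence_symbol]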
+ row = _decode_wire_row(row_raw, valid_lengths={10})
+ if row is None:
+ return None
+ category = _as_security_surface_category(_as_str(row[0]))
+ capability = _as_str(row[1])
+ module = _as_str(row[2])
+ qualname = _as_str(row[3])
+ lines = _decode_wire_int_fields(row, 4, 5)
+ location_scope = _as_security_surface_location_scope(_as_str(row[6]))
+ classification_mode = _as_security_surface_classification_mode(_as_str(row[7]))
+ evidence_kind = _as_security_surface_evidence_kind(_as_str(row[8]))
+ evidence_symbol = _as_str(row[9])
+ if (
+ category is None
+ or capability is None
+ or module is None
+ or qualname is None
+ or lines is None
+ or location_scope is None
+ or classification_mode is None
+ or evidence_kind is None
+ or evidence_symbol is None
+ ):
+ return None
+ start_line, end_line = lines
+ return SecuritySurfaceDict(
+ category=category,
+ capability=capability,
+ module=module,
+ filepath=filepath,
+ qualname=qualname,
+ start_line=start_line,
+ end_line=end_line,
+ location_scope=location_scope,
+ classification_mode=classification_mode,
+ evidence_kind=evidence_kind,
+ evidence_symbol=evidence_symbol,
+ )
+
+
+def _decode_optional_wire_module_ints(
+ *,
+ obj: dict[str, object],
+ key: str,
+ expected_len: int,
+ int_indexes: tuple[int, ...],
+) -> tuple[str, tuple[int, ...]] | None:
+ row = _decode_optional_wire_row(obj=obj, key=key, expected_len=expected_len)
+ if row is None:
+ return None
+ module = _as_str(row[0])
+ ints = _decode_wire_int_fields(row, *int_indexes)
+ if module is None or ints is None:
+ return None
+ return module, ints
+
+
+def _decode_wire_api_surface_symbol(
+ value: object,
+) -> PublicSymbolDict | None:
+ symbol_row = _decode_wire_row(value, valid_lengths={7})
+ if symbol_row is None:
+ return None
+ str_fields = _decode_wire_str_fields(symbol_row, 0, 1, 4, 5)
+ int_fields = _decode_wire_int_fields(symbol_row, 2, 3)
+ params_raw = _as_list(symbol_row[6])
+ if str_fields is None or int_fields is None or params_raw is None:
+ return None
+ qualname, kind, exported_via, returns_hash = str_fields
+ start_line, end_line = int_fields
+ params: list[ApiParamSpecDict] = []
+ for param_raw in params_raw:
+ decoded_param = _decode_wire_api_param_spec(param_raw)
+ if decoded_param is None:
+ return None
+ params.append(decoded_param)
+ return PublicSymbolDict(
+ qualname=qualname,
+ kind=kind,
+ start_line=start_line,
+ end_line=end_line,
+ params=params,
+ returns_hash=returns_hash,
+ exported_via=exported_via,
+ )
+
+
+def _decode_wire_api_param_spec(
+ value: object,
+) -> ApiParamSpecDict | None:
+ param_row = _decode_wire_row(value, valid_lengths={4})
+ if param_row is None:
+ return None
+ str_fields = _decode_wire_str_fields(param_row, 0, 1, 3)
+ int_fields = _decode_wire_int_fields(param_row, 2)
+ if str_fields is None or int_fields is None:
+ return None
+ name, param_kind, annotation_hash = str_fields
+ (has_default_raw,) = int_fields
+ return ApiParamSpecDict(
+ name=name,
+ kind=param_kind,
+ has_default=bool(has_default_raw),
+ annotation_hash=annotation_hash,
+ )
+
+
+def _decode_wire_structural_findings_optional(
+ obj: dict[str, object],
+) -> list[StructuralFindingGroupDict] | None:
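+    """Decode optional 'sf' wire key. Returns [] if absent, None on invalid format."""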
+ raw = obj.get("sf")
+ if raw is None:
+ return []
+ groups_raw = _as_list(raw)
+ if groups_raw is None:
+ return None
+ groups: list[StructuralFindingGroupDict] = []
+ for group_raw in groups_raw:
+ group = _decode_wire_structural_group(group_raw)
+ if group is None:
+ return None
+ groups.append(group)
+ return groups
+
+
+def _decode_wire_structural_group(value: object) -> StructuralFindingGroupDict | None:
+ group_row = _decode_wire_row(value, valid_lengths={4})
+ if group_row is None:
+ return None
+ str_fields = _decode_wire_str_fields(group_row, 0, 1)
+ items_raw = _as_list(group_row[3])
+ signature = _decode_wire_structural_signature(group_row[2])
+ if str_fields is None or items_raw is None or signature is None:
+ return None
+ finding_kind, finding_key = str_fields
+ items: list[StructuralFindingOccurrenceDict] = []
+ for item_raw in items_raw:
+ item = _decode_wire_structural_occurrence(item_raw)
+ if item is None:
+ return None
+ items.append(item)
+ return StructuralFindingGroupDict(
+ finding_kind=finding_kind,
+ finding_key=finding_key,
+ signature=signature,
+ items=items,
+ )
+
+
+def _decode_wire_structural_signature(value: object) -> dict[str, str] | None:
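+    # The encoder stores the signature as sorted [key, value] pairs; rebuild
+    # the mapping and reject anything that is not a pair of strings.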
+ sig_raw = _as_list(value)
+ if sig_raw is None:
+ return None
+ signature: dict[str, str] = {}
+ for pair in sig_raw:
+ pair_list = _as_list(pair)
+ if pair_list is None or len(pair_list) != 2:
+ return None
+ key = _as_str(pair_list[0])
+ val = _as_str(pair_list[1])
+ if key is None or val is None:
+ return None
+ signature[key] = val
+ return signature
+
+
+def _decode_wire_structural_occurrence(
+ value: object,
+) -> StructuralFindingOccurrenceDict | None:
+ item_list = _as_list(value)
+ if item_list is None or len(item_list) != 3:
+ return None
+ qualname = _as_str(item_list[0])
+ start = _as_int(item_list[1])
+ end = _as_int(item_list[2])
+ if qualname is None or start is None or end is None:
+ return None
+ return StructuralFindingOccurrenceDict(
+ qualname=qualname,
+ start=start,
+ end=end,
+ )
+
+
+def _decode_wire_unit(value: object, filepath: str) -> UnitDict | None:
+ decoded = _decode_wire_named_span(value, valid_lengths={11, 17})
+ if decoded is None:
+ return None
+ row, qualname, start_line, end_line = decoded
+ core_fields = _decode_wire_unit_core_fields(row)
+ flow_profiles = _decode_wire_unit_flow_profiles(row)
+ if core_fields is None or flow_profiles is None:
+ return None
+ (
+ loc,
+ stmt_count,
+ fingerprint,
+ loc_bucket,
+ cyclomatic_complexity,
+ nesting_depth,
+ risk,
+ raw_hash,
+ ) = core_fields
+ (
+ entry_guard_count,
+ entry_guard_terminal_profile,
+ entry_guard_has_side_effect_before,
+ terminal_kind,
+ try_finally_profile,
+ side_effect_order_profile,
+ ) = flow_profiles
+ return FunctionGroupItem(
+ qualname=qualname,
+ filepath=filepath,
+ start_line=start_line,
+ end_line=end_line,
+ loc=loc,
+ stmt_count=stmt_count,
+ fingerprint=fingerprint,
+ loc_bucket=loc_bucket,
+ cyclomatic_complexity=cyclomatic_complexity,
+ nesting_depth=nesting_depth,
+ risk=risk,
+ raw_hash=raw_hash,
+ entry_guard_count=entry_guard_count,
+ entry_guard_terminal_profile=entry_guard_terminal_profile,
+ entry_guard_has_side_effect_before=entry_guard_has_side_effect_before,
+ terminal_kind=terminal_kind,
+ try_finally_profile=try_finally_profile,
+ side_effect_order_profile=side_effect_order_profile,
+ )
+
+
+def _decode_wire_block(value: object, filepath: str) -> BlockDict | None:
+ decoded = _decode_wire_named_sized_span(value, valid_lengths={5})
+ if decoded is None:
+ return None
+ row, qualname, start_line, end_line, size = decoded
+ block_hash = _as_str(row[4])
+ if block_hash is None:
+ return None
+
+ return BlockGroupItem(
+ block_hash=block_hash,
+ filepath=filepath,
+ qualname=qualname,
+ start_line=start_line,
+ end_line=end_line,
+ size=size,
+ )
+
+
+def _decode_wire_segment(value: object, filepath: str) -> SegmentDict | None:
+ decoded = _decode_wire_named_sized_span(value, valid_lengths={6})
+ if decoded is None:
+ return None
+ row, qualname, start_line, end_line, size = decoded
+ segment_hash = _as_str(row[4])
+ segment_sig = _as_str(row[5])
+ if segment_hash is None or segment_sig is None:
+ return None
+
+ return SegmentGroupItem(
+ segment_hash=segment_hash,
+ segment_sig=segment_sig,
+ filepath=filepath,
+ qualname=qualname,
+ start_line=start_line,
+ end_line=end_line,
+ size=size,
+ )
+
+
+def _decode_wire_class_metric(
+ value: object,
+ filepath: str,
+) -> ClassMetricsDict | None:
+ decoded = _decode_wire_named_span(value, valid_lengths={9})
+ if decoded is None:
+ return None
+ row, qualname, start_line, end_line = decoded
+ metric_fields = _decode_wire_class_metric_fields(row)
+ if metric_fields is None:
+ return None
+ cbo, lcom4, method_count, instance_var_count, risk_coupling, risk_cohesion = (
+ metric_fields
+ )
+ return ClassMetricsDict(
+ qualname=qualname,
+ filepath=filepath,
+ start_line=start_line,
+ end_line=end_line,
+ cbo=cbo,
+ lcom4=lcom4,
+ method_count=method_count,
+ instance_var_count=instance_var_count,
+ risk_coupling=risk_coupling,
+ risk_cohesion=risk_cohesion,
+ )
+
+
+def _decode_wire_module_dep(value: object) -> ModuleDepDict | None:
+ row = _as_list(value)
+ if row is None or len(row) != 4:
+ return None
+ source = _as_str(row[0])
+ target = _as_str(row[1])
+ import_type = _as_str(row[2])
+ line = _as_int(row[3])
+ if source is None or target is None or import_type is None or line is None:
+ return None
+ return ModuleDepDict(
+ source=source,
+ target=target,
+ import_type=import_type,
+ line=line,
+ )
+
+
+def _decode_wire_dead_candidate(
+ value: object,
+ filepath: str,
+) -> DeadCandidateDict | None:
+ row = _decode_wire_row(value, valid_lengths={5, 6})
+ if row is None:
+ return None
+ str_fields = _decode_wire_str_fields(row, 0, 1, 4)
+ int_fields = _decode_wire_int_fields(row, 2, 3)
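+    # An optional sixth slot carries the suppressed-rule names.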
+    suppressed_rules: list[str] = []
+ if len(row) == 6:
+ raw_rules = _as_list(row[5])
+ if raw_rules is None or not all(isinstance(rule, str) for rule in raw_rules):
+ return None
+ suppressed_rules = sorted({str(rule) for rule in raw_rules if str(rule)})
+ if str_fields is None or int_fields is None:
+ return None
+ qualname, local_name, kind = str_fields
+ start_line, end_line = int_fields
+ decoded = DeadCandidateDict(
+ qualname=qualname,
+ local_name=local_name,
+ filepath=filepath,
+ start_line=start_line,
+ end_line=end_line,
+ kind=kind,
+ )
+ if suppressed_rules:
+ decoded["suppressed_rules"] = suppressed_rules
+ return decoded
+
+
+__all__ = [
+ "_decode_optional_wire_api_surface",
+ "_decode_optional_wire_coupled_classes",
+ "_decode_optional_wire_docstring_coverage",
+ "_decode_optional_wire_items",
+ "_decode_optional_wire_items_for_filepath",
+ "_decode_optional_wire_module_ints",
+ "_decode_optional_wire_names",
+ "_decode_optional_wire_row",
+ "_decode_optional_wire_source_stats",
+ "_decode_optional_wire_typing_coverage",
+ "_decode_wire_api_param_spec",
+ "_decode_wire_api_surface_symbol",
+ "_decode_wire_block",
+ "_decode_wire_class_metric",
+ "_decode_wire_class_metric_fields",
+ "_decode_wire_dead_candidate",
+ "_decode_wire_file_entry",
+ "_decode_wire_file_sections",
+ "_decode_wire_int_fields",
+ "_decode_wire_module_dep",
+ "_decode_wire_name_sections",
+ "_decode_wire_named_sized_span",
+ "_decode_wire_named_span",
+ "_decode_wire_qualname_span",
+ "_decode_wire_qualname_span_size",
+ "_decode_wire_row",
+ "_decode_wire_segment",
+ "_decode_wire_stat",
+ "_decode_wire_str_fields",
+ "_decode_wire_structural_findings_optional",
+ "_decode_wire_structural_group",
+ "_decode_wire_structural_occurrence",
+ "_decode_wire_structural_signature",
+ "_decode_wire_unit",
+ "_decode_wire_unit_core_fields",
+ "_decode_wire_unit_flow_profiles",
+]
diff --git a/codeclone/cache/_wire_encode.py b/codeclone/cache/_wire_encode.py
new file mode 100644
index 0000000..f7ea38a
--- /dev/null
+++ b/codeclone/cache/_wire_encode.py
@@ -0,0 +1,320 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from ._canonicalize import _normalized_optional_string_list
+from .entries import CacheEntry, ClassMetricsDict
+
+
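+# Compact wire keys emitted by the encoders in this module:
+#   st=stat  ss=source_stats  u=units  b=blocks  s=segments  cm=class_metrics
+#   cc=coupled_classes  md=module_deps  dc=dead_candidates  rn=referenced_names
+#   rq=referenced_qualnames  in=import_names  cn=class_names
+#   sc=security_surfaces  tc=typing_coverage  dg=docstring_coverage
+#   as=api_surface  sf=structural_findings
+
+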
+def _encode_source_stats(entry: CacheEntry, wire: dict[str, object]) -> None:
+ source_stats = entry.get("source_stats")
+ if source_stats is not None:
+ wire["ss"] = [
+ source_stats["lines"],
+ source_stats["functions"],
+ source_stats["methods"],
+ source_stats["classes"],
+ ]
+
+
+def _encode_units(entry: CacheEntry, wire: dict[str, object]) -> None:
+ units = sorted(
+ entry["units"],
+ key=lambda unit: (
+ unit["qualname"],
+ unit["start_line"],
+ unit["end_line"],
+ unit["fingerprint"],
+ ),
+ )
+ if units:
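+        # "u" row layout (17 slots; the decoder also accepts legacy 11-slot rows):
+        #   [0] qualname, [1] start_line, [2] end_line, [3] loc, [4] stmt_count,
+        #   [5] fingerprint, [6] loc_bucket, [7] cyclomatic_complexity,
+        #   [8] nesting_depth, [9] risk, [10] raw_hash, [11] entry_guard_count,
+        #   [12] entry_guard_terminal_profile,
+        #   [13] entry_guard_has_side_effect_before (0/1), [14] terminal_kind,
+        #   [15] try_finally_profile, [16] side_effect_order_profile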
+ wire["u"] = [
+ [
+ unit["qualname"],
+ unit["start_line"],
+ unit["end_line"],
+ unit["loc"],
+ unit["stmt_count"],
+ unit["fingerprint"],
+ unit["loc_bucket"],
+ unit.get("cyclomatic_complexity", 1),
+ unit.get("nesting_depth", 0),
+ unit.get("risk", "low"),
+ unit.get("raw_hash", ""),
+ unit.get("entry_guard_count", 0),
+ unit.get("entry_guard_terminal_profile", "none"),
+ 1 if unit.get("entry_guard_has_side_effect_before", False) else 0,
+ unit.get("terminal_kind", "fallthrough"),
+ unit.get("try_finally_profile", "none"),
+ unit.get("side_effect_order_profile", "none"),
+ ]
+ for unit in units
+ ]
+
+
+def _encode_blocks(entry: CacheEntry, wire: dict[str, object]) -> None:
+ blocks = sorted(
+ entry["blocks"],
+ key=lambda block: (
+ block["qualname"],
+ block["start_line"],
+ block["end_line"],
+ block["block_hash"],
+ ),
+ )
+ if blocks:
+ wire["b"] = [
+ [
+ block["qualname"],
+ block["start_line"],
+ block["end_line"],
+ block["size"],
+ block["block_hash"],
+ ]
+ for block in blocks
+ ]
+
+
+def _encode_segments(entry: CacheEntry, wire: dict[str, object]) -> None:
+ segments = sorted(
+ entry["segments"],
+ key=lambda segment: (
+ segment["qualname"],
+ segment["start_line"],
+ segment["end_line"],
+ segment["segment_hash"],
+ ),
+ )
+ if segments:
+ wire["s"] = [
+ [
+ segment["qualname"],
+ segment["start_line"],
+ segment["end_line"],
+ segment["size"],
+ segment["segment_hash"],
+ segment["segment_sig"],
+ ]
+ for segment in segments
+ ]
+
+
+def _append_coupled_classes_row(
+ metric: ClassMetricsDict,
+ *,
+ rows: list[list[object]],
+) -> None:
+ coupled_classes = _normalized_optional_string_list(
+ metric.get("coupled_classes", [])
+ )
+ if coupled_classes:
+ rows.append([metric["qualname"], coupled_classes])
+
+
+def _encode_class_metrics(entry: CacheEntry, wire: dict[str, object]) -> None:
+ class_metrics = sorted(
+ entry["class_metrics"],
+ key=lambda metric: (
+ metric["start_line"],
+ metric["end_line"],
+ metric["qualname"],
+ ),
+ )
+ if class_metrics:
+ coupled_classes_rows: list[list[object]] = []
+ wire["cm"] = [
+ [
+ metric["qualname"],
+ metric["start_line"],
+ metric["end_line"],
+ metric["cbo"],
+ metric["lcom4"],
+ metric["method_count"],
+ metric["instance_var_count"],
+ metric["risk_coupling"],
+ metric["risk_cohesion"],
+ ]
+ for metric in class_metrics
+ ]
+ for metric in class_metrics:
+ _append_coupled_classes_row(metric, rows=coupled_classes_rows)
+ if coupled_classes_rows:
+ wire["cc"] = coupled_classes_rows
+
+
+def _encode_module_deps(entry: CacheEntry, wire: dict[str, object]) -> None:
+ module_deps = sorted(
+ entry["module_deps"],
+ key=lambda dep: (dep["source"], dep["target"], dep["import_type"], dep["line"]),
+ )
+ if module_deps:
+ wire["md"] = [
+ [
+ dep["source"],
+ dep["target"],
+ dep["import_type"],
+ dep["line"],
+ ]
+ for dep in module_deps
+ ]
+
+
+def _encode_dead_candidates(entry: CacheEntry, wire: dict[str, object]) -> None:
+ dead_candidates = sorted(
+ entry["dead_candidates"],
+ key=lambda candidate: (
+ candidate["start_line"],
+ candidate["end_line"],
+ candidate["qualname"],
+ candidate["local_name"],
+ candidate["kind"],
+ ),
+ )
+ if dead_candidates:
+ encoded_dead_candidates: list[list[object]] = []
+ for candidate in dead_candidates:
+ encoded = [
+ candidate["qualname"],
+ candidate["local_name"],
+ candidate["start_line"],
+ candidate["end_line"],
+ candidate["kind"],
+ ]
+ suppressed_rules = candidate.get("suppressed_rules", [])
+ normalized_rules = _normalized_optional_string_list(suppressed_rules)
+ if normalized_rules:
+ encoded.append(normalized_rules)
+ encoded_dead_candidates.append(encoded)
+ wire["dc"] = encoded_dead_candidates
+
+
+def _encode_name_lists(entry: CacheEntry, wire: dict[str, object]) -> None:
+ if entry["referenced_names"]:
+ wire["rn"] = sorted(set(entry["referenced_names"]))
+ if entry.get("referenced_qualnames"):
+ wire["rq"] = sorted(set(entry["referenced_qualnames"]))
+ if entry["import_names"]:
+ wire["in"] = sorted(set(entry["import_names"]))
+ if entry["class_names"]:
+ wire["cn"] = sorted(set(entry["class_names"]))
+
+
+def _encode_security_surfaces(entry: CacheEntry, wire: dict[str, object]) -> None:
+ security_surfaces = sorted(
+ entry.get("security_surfaces", []),
+ key=lambda item: (
+ item["start_line"],
+ item["end_line"],
+ item["qualname"],
+ item["category"],
+ item["capability"],
+ item["evidence_symbol"],
+ ),
+ )
+ if security_surfaces:
+ wire["sc"] = [
+ [
+ item["category"],
+ item["capability"],
+ item["module"],
+ item["qualname"],
+ item["start_line"],
+ item["end_line"],
+ item["location_scope"],
+ item["classification_mode"],
+ item["evidence_kind"],
+ item["evidence_symbol"],
+ ]
+ for item in security_surfaces
+ ]
+
+
+def _encode_optional_metrics_sections(
+ entry: CacheEntry, wire: dict[str, object]
+) -> None:
+ typing_coverage = entry.get("typing_coverage")
+ if typing_coverage is not None:
+ wire["tc"] = [
+ typing_coverage["module"],
+ typing_coverage["callable_count"],
+ typing_coverage["params_total"],
+ typing_coverage["params_annotated"],
+ typing_coverage["returns_total"],
+ typing_coverage["returns_annotated"],
+ typing_coverage["any_annotation_count"],
+ ]
+ docstring_coverage = entry.get("docstring_coverage")
+ if docstring_coverage is not None:
+ wire["dg"] = [
+ docstring_coverage["module"],
+ docstring_coverage["public_symbol_total"],
+ docstring_coverage["public_symbol_documented"],
+ ]
+ api_surface = entry.get("api_surface")
+ if api_surface is not None:
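+        # "as" layout: [module, sorted all_declared, symbol rows]; a symbol row
+        # is [qualname, kind, start_line, end_line, exported_via, returns_hash,
+        # params], and each param row is [name, kind, has_default (0/1),
+        # annotation_hash].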
+ wire["as"] = [
+ api_surface["module"],
+ sorted(set(api_surface.get("all_declared", []))),
+ [
+ [
+ symbol["qualname"],
+ symbol["kind"],
+ symbol["start_line"],
+ symbol["end_line"],
+ symbol.get("exported_via", "name"),
+ symbol.get("returns_hash", ""),
+ [
+ [
+ param["name"],
+ param["kind"],
+ 1 if param["has_default"] else 0,
+ param.get("annotation_hash", ""),
+ ]
+ for param in symbol.get("params", [])
+ ],
+ ]
+ for symbol in api_surface["symbols"]
+ ],
+ ]
+
+
+def _encode_structural_findings(entry: CacheEntry, wire: dict[str, object]) -> None:
+ if "structural_findings" in entry:
+ structural_findings = entry.get("structural_findings", [])
+ wire["sf"] = [
+ [
+ group["finding_kind"],
+ group["finding_key"],
+ sorted(group["signature"].items()),
+ [
+ [item["qualname"], item["start"], item["end"]]
+ for item in group["items"]
+ ],
+ ]
+ for group in structural_findings
+ ]
+
+
+def _encode_wire_file_entry(entry: CacheEntry) -> dict[str, object]:
+ wire: dict[str, object] = {
+ "st": [entry["stat"]["mtime_ns"], entry["stat"]["size"]],
+ }
+ _encode_source_stats(entry, wire)
+ _encode_units(entry, wire)
+ _encode_blocks(entry, wire)
+ _encode_segments(entry, wire)
+ _encode_class_metrics(entry, wire)
+ _encode_module_deps(entry, wire)
+ _encode_dead_candidates(entry, wire)
+ _encode_name_lists(entry, wire)
+ _encode_security_surfaces(entry, wire)
+ _encode_optional_metrics_sections(entry, wire)
+ _encode_structural_findings(entry, wire)
+ return wire
+
+
+__all__ = ["_encode_wire_file_entry"]
diff --git a/codeclone/cache/_wire_helpers.py b/codeclone/cache/_wire_helpers.py
new file mode 100644
index 0000000..3e987f7
--- /dev/null
+++ b/codeclone/cache/_wire_helpers.py
@@ -0,0 +1,307 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from collections.abc import Callable, Collection
+from typing import Literal, TypeVar
+
+from .entries import _as_risk_literal
+from .integrity import (
+ as_int_or_none as _as_int,
+)
+from .integrity import (
+ as_object_list as _as_list,
+)
+from .integrity import (
+ as_str_or_none as _as_str,
+)
+from .versioning import _DEFAULT_WIRE_UNIT_FLOW_PROFILES
+
+_DecodedItemT = TypeVar("_DecodedItemT")
+
+
+def _decode_wire_qualname_span(
+ row: list[object],
+) -> tuple[str, int, int] | None:
+ qualname = _as_str(row[0])
+ start_line = _as_int(row[1])
+ end_line = _as_int(row[2])
+ if qualname is None or start_line is None or end_line is None:
+ return None
+ return qualname, start_line, end_line
+
+
+def _decode_wire_qualname_span_size(
+ row: list[object],
+) -> tuple[str, int, int, int] | None:
+ qualname_span = _decode_wire_qualname_span(row)
+ if qualname_span is None:
+ return None
+ size = _as_int(row[3])
+ if size is None:
+ return None
+ qualname, start_line, end_line = qualname_span
+ return qualname, start_line, end_line, size
+
+
+def _decode_optional_wire_items(
+ *,
+ obj: dict[str, object],
+ key: str,
+ decode_item: Callable[[object], _DecodedItemT | None],
+) -> list[_DecodedItemT] | None:
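+    # Tri-state result: a missing key means the section is absent and decodes
+    # to []; a present but malformed section returns None so the caller can
+    # reject the whole entry.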
+ raw_items = obj.get(key)
+ if raw_items is None:
+ return []
+ wire_items = _as_list(raw_items)
+ if wire_items is None:
+ return None
+ decoded_items: list[_DecodedItemT] = []
+ for wire_item in wire_items:
+ decoded = decode_item(wire_item)
+ if decoded is None:
+ return None
+ decoded_items.append(decoded)
+ return decoded_items
+
+
+def _decode_optional_wire_items_for_filepath(
+ *,
+ obj: dict[str, object],
+ key: str,
+ filepath: str,
+ decode_item: Callable[[object, str], _DecodedItemT | None],
+) -> list[_DecodedItemT] | None:
+ raw_items = obj.get(key)
+ if raw_items is None:
+ return []
+ wire_items = _as_list(raw_items)
+ if wire_items is None:
+ return None
+ decoded_items: list[_DecodedItemT] = []
+ for wire_item in wire_items:
+ decoded = decode_item(wire_item, filepath)
+ if decoded is None:
+ return None
+ decoded_items.append(decoded)
+ return decoded_items
+
+
+def _decode_optional_wire_row(
+ *,
+ obj: dict[str, object],
+ key: str,
+ expected_len: int,
+) -> list[object] | None:
+ raw = obj.get(key)
+ if raw is None:
+ return None
+ row = _as_list(raw)
+ if row is None or len(row) != expected_len:
+ return None
+ return row
+
+
+def _decode_optional_wire_names(
+ *,
+ obj: dict[str, object],
+ key: str,
+) -> list[str] | None:
+ raw_names = obj.get(key)
+ if raw_names is None:
+ return []
+ names = _as_list(raw_names)
+ if names is None or not all(isinstance(name, str) for name in names):
+ return None
+ return [str(name) for name in names]
+
+
+def _decode_optional_wire_coupled_classes(
+ *,
+ obj: dict[str, object],
+ key: str,
+) -> dict[str, list[str]] | None:
+ raw = obj.get(key)
+ if raw is None:
+ return {}
+
+ rows = _as_list(raw)
+ if rows is None:
+ return None
+
+ decoded: dict[str, list[str]] = {}
+ for wire_row in rows:
+ row = _as_list(wire_row)
+ if row is None or len(row) != 2:
+ return None
+ qualname = _as_str(row[0])
+ names = _as_list(row[1])
+ if qualname is None or names is None:
+ return None
+ if not all(isinstance(name, str) for name in names):
+ return None
+ decoded[qualname] = sorted({str(name) for name in names if str(name)})
+
+ return decoded
+
+
+def _decode_wire_row(
+ value: object,
+ *,
+ valid_lengths: Collection[int],
+) -> list[object] | None:
+ row = _as_list(value)
+ if row is None or len(row) not in valid_lengths:
+ return None
+ return row
+
+
+def _decode_wire_named_span(
+ value: object,
+ *,
+ valid_lengths: Collection[int],
+) -> tuple[list[object], str, int, int] | None:
+ row = _decode_wire_row(value, valid_lengths=valid_lengths)
+ if row is None:
+ return None
+ span = _decode_wire_qualname_span(row)
+ if span is None:
+ return None
+ qualname, start_line, end_line = span
+ return row, qualname, start_line, end_line
+
+
+def _decode_wire_named_sized_span(
+ value: object,
+ *,
+ valid_lengths: Collection[int],
+) -> tuple[list[object], str, int, int, int] | None:
+ row = _decode_wire_row(value, valid_lengths=valid_lengths)
+ if row is None:
+ return None
+ span = _decode_wire_qualname_span_size(row)
+ if span is None:
+ return None
+ qualname, start_line, end_line, size = span
+ return row, qualname, start_line, end_line, size
+
+
+def _decode_wire_int_fields(
+ row: list[object],
+ *indexes: int,
+) -> tuple[int, ...] | None:
+ values: list[int] = []
+ for index in indexes:
+ value = _as_int(row[index])
+ if value is None:
+ return None
+ values.append(value)
+ return tuple(values)
+
+
+def _decode_wire_str_fields(
+ row: list[object],
+ *indexes: int,
+) -> tuple[str, ...] | None:
+ values: list[str] = []
+ for index in indexes:
+ value = _as_str(row[index])
+ if value is None:
+ return None
+ values.append(value)
+ return tuple(values)
+
+
+def _decode_wire_unit_core_fields(
+ row: list[object],
+) -> tuple[int, int, str, str, int, int, Literal["low", "medium", "high"], str] | None:
+ int_fields = _decode_wire_int_fields(row, 3, 4, 7, 8)
+ str_fields = _decode_wire_str_fields(row, 5, 6, 10)
+ risk = _as_risk_literal(row[9])
+ if int_fields is None or str_fields is None or risk is None:
+ return None
+ loc, stmt_count, cyclomatic_complexity, nesting_depth = int_fields
+ fingerprint, loc_bucket, raw_hash = str_fields
+ return (
+ loc,
+ stmt_count,
+ fingerprint,
+ loc_bucket,
+ cyclomatic_complexity,
+ nesting_depth,
+ risk,
+ raw_hash,
+ )
+
+
+def _decode_wire_unit_flow_profiles(
+ row: list[object],
+) -> tuple[int, str, bool, str, str, str] | None:
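+    # Legacy 11-slot rows predate the flow-profile fields; fall back to the
+    # schema defaults instead of rejecting the cached entry.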
+ if len(row) != 17:
+ return _DEFAULT_WIRE_UNIT_FLOW_PROFILES
+
+ parsed_entry_guard_count = _as_int(row[11])
+ parsed_entry_guard_terminal_profile = _as_str(row[12])
+ parsed_entry_guard_has_side_effect_before = _as_int(row[13])
+ parsed_terminal_kind = _as_str(row[14])
+ parsed_try_finally_profile = _as_str(row[15])
+ parsed_side_effect_order_profile = _as_str(row[16])
+ if (
+ parsed_entry_guard_count is None
+ or parsed_entry_guard_terminal_profile is None
+ or parsed_entry_guard_has_side_effect_before is None
+ or parsed_terminal_kind is None
+ or parsed_try_finally_profile is None
+ or parsed_side_effect_order_profile is None
+ ):
+ return None
+ return (
+ max(0, parsed_entry_guard_count),
+ parsed_entry_guard_terminal_profile or "none",
+ parsed_entry_guard_has_side_effect_before != 0,
+ parsed_terminal_kind or "fallthrough",
+ parsed_try_finally_profile or "none",
+ parsed_side_effect_order_profile or "none",
+ )
+
+
+def _decode_wire_class_metric_fields(
+ row: list[object],
+) -> tuple[int, int, int, int, str, str] | None:
+ int_fields = _decode_wire_int_fields(row, 3, 4, 5, 6)
+ str_fields = _decode_wire_str_fields(row, 7, 8)
+ if int_fields is None or str_fields is None:
+ return None
+ cbo, lcom4, method_count, instance_var_count = int_fields
+ risk_coupling, risk_cohesion = str_fields
+ return (
+ cbo,
+ lcom4,
+ method_count,
+ instance_var_count,
+ risk_coupling,
+ risk_cohesion,
+ )
+
+
+__all__ = [
+ "_decode_optional_wire_coupled_classes",
+ "_decode_optional_wire_items",
+ "_decode_optional_wire_items_for_filepath",
+ "_decode_optional_wire_names",
+ "_decode_optional_wire_row",
+ "_decode_wire_class_metric_fields",
+ "_decode_wire_int_fields",
+ "_decode_wire_named_sized_span",
+ "_decode_wire_named_span",
+ "_decode_wire_qualname_span",
+ "_decode_wire_qualname_span_size",
+ "_decode_wire_row",
+ "_decode_wire_str_fields",
+ "_decode_wire_unit_core_fields",
+ "_decode_wire_unit_flow_profiles",
+]
diff --git a/codeclone/cache/entries.py b/codeclone/cache/entries.py
new file mode 100644
index 0000000..9dcd1ee
--- /dev/null
+++ b/codeclone/cache/entries.py
@@ -0,0 +1,559 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+from typing import Literal, TypedDict
+
+from ..findings.structural.detectors import normalize_structural_finding_group
+from ..models import (
+ BlockGroupItem,
+ BlockUnit,
+ ClassMetrics,
+ DeadCandidate,
+ FunctionGroupItem,
+ ModuleApiSurface,
+ ModuleDep,
+ ModuleDocstringCoverage,
+ ModuleTypingCoverage,
+ SecuritySurface,
+ SegmentGroupItem,
+ SegmentUnit,
+ StructuralFindingGroup,
+ StructuralFindingOccurrence,
+ Unit,
+)
+
+
+class FileStat(TypedDict):
+ mtime_ns: int
+ size: int
+
+
+class SourceStatsDict(TypedDict):
+ lines: int
+ functions: int
+ methods: int
+ classes: int
+
+
+UnitDict = FunctionGroupItem
+BlockDict = BlockGroupItem
+SegmentDict = SegmentGroupItem
+
+
+class ClassMetricsDictBase(TypedDict):
+ qualname: str
+ filepath: str
+ start_line: int
+ end_line: int
+ cbo: int
+ lcom4: int
+ method_count: int
+ instance_var_count: int
+ risk_coupling: str
+ risk_cohesion: str
+
+
+class ClassMetricsDict(ClassMetricsDictBase, total=False):
+ coupled_classes: list[str]
+
+
+class ModuleDepDict(TypedDict):
+ source: str
+ target: str
+ import_type: str
+ line: int
+
+
+class DeadCandidateDictBase(TypedDict):
+ qualname: str
+ local_name: str
+ filepath: str
+ start_line: int
+ end_line: int
+ kind: str
+
+
+class DeadCandidateDict(DeadCandidateDictBase, total=False):
+ suppressed_rules: list[str]
+
+
+class SecuritySurfaceDict(TypedDict):
+    category: str
+    capability: str
+    module: str
+    filepath: str
+    qualname: str
+    start_line: int
+    end_line: int
+    location_scope: str
+    classification_mode: str
+    evidence_kind: str
+    evidence_symbol: str
+
+
+class ModuleTypingCoverageDict(TypedDict):
+ module: str
+ filepath: str
+ callable_count: int
+ params_total: int
+ params_annotated: int
+ returns_total: int
+ returns_annotated: int
+ any_annotation_count: int
+
+
+class ModuleDocstringCoverageDict(TypedDict):
+ module: str
+ filepath: str
+ public_symbol_total: int
+ public_symbol_documented: int
+
+
+class ApiParamSpecDict(TypedDict):
+ name: str
+ kind: str
+ has_default: bool
+ annotation_hash: str
+
+
+class PublicSymbolDict(TypedDict):
+ qualname: str
+ kind: str
+ start_line: int
+ end_line: int
+ params: list[ApiParamSpecDict]
+ returns_hash: str
+ exported_via: str
+
+
+class ModuleApiSurfaceDict(TypedDict):
+ module: str
+ filepath: str
+ all_declared: list[str]
+ symbols: list[PublicSymbolDict]
+
+
+class StructuralFindingOccurrenceDict(TypedDict):
+ qualname: str
+ start: int
+ end: int
+
+
+class StructuralFindingGroupDict(TypedDict):
+ finding_kind: str
+ finding_key: str
+ signature: dict[str, str]
+ items: list[StructuralFindingOccurrenceDict]
+
+
+class _FileEntryBase(TypedDict):
+ stat: FileStat
+ units: list[UnitDict]
+ blocks: list[BlockDict]
+ segments: list[SegmentDict]
+
+
+class _FileEntryV26(_FileEntryBase, total=False):
+ source_stats: SourceStatsDict
+ class_metrics: list[ClassMetricsDict]
+ module_deps: list[ModuleDepDict]
+ dead_candidates: list[DeadCandidateDict]
+ referenced_names: list[str]
+ referenced_qualnames: list[str]
+ import_names: list[str]
+ class_names: list[str]
+ security_surfaces: list[SecuritySurfaceDict]
+ typing_coverage: ModuleTypingCoverageDict
+ docstring_coverage: ModuleDocstringCoverageDict
+ api_surface: ModuleApiSurfaceDict
+ structural_findings: list[StructuralFindingGroupDict]
+
+
+CacheEntryBase = _FileEntryBase
+CacheEntry = _FileEntryV26
+
+
+def _normalize_cached_structural_group(
+ group: StructuralFindingGroupDict,
+ *,
+ filepath: str,
+) -> StructuralFindingGroupDict | None:
+ signature = dict(group["signature"])
+ finding_kind = group["finding_kind"]
+ finding_key = group["finding_key"]
+ normalized = normalize_structural_finding_group(
+ StructuralFindingGroup(
+ finding_kind=finding_kind,
+ finding_key=finding_key,
+ signature=signature,
+ items=tuple(
+ StructuralFindingOccurrence(
+ finding_kind=finding_kind,
+ finding_key=finding_key,
+ file_path=filepath,
+ qualname=item["qualname"],
+ start=item["start"],
+ end=item["end"],
+ signature=signature,
+ )
+ for item in group["items"]
+ ),
+ )
+ )
+ if normalized is None:
+ return None
+ return StructuralFindingGroupDict(
+ finding_kind=normalized.finding_kind,
+ finding_key=normalized.finding_key,
+ signature=dict(normalized.signature),
+ items=[
+ StructuralFindingOccurrenceDict(
+ qualname=item.qualname,
+ start=item.start,
+ end=item.end,
+ )
+ for item in normalized.items
+ ],
+ )
+
+
+def _normalize_cached_structural_groups(
+ groups: Sequence[StructuralFindingGroupDict],
+ *,
+ filepath: str,
+) -> list[StructuralFindingGroupDict]:
+ normalized = [
+ candidate
+ for candidate in (
+ _normalize_cached_structural_group(group, filepath=filepath)
+ for group in groups
+ )
+ if candidate is not None
+ ]
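+    # Largest groups first; ties break on the finding key for stable output.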
+ normalized.sort(key=lambda group: (-len(group["items"]), group["finding_key"]))
+ return normalized
+
+
+def _as_risk_literal(value: object) -> Literal["low", "medium", "high"] | None:
+ match value:
+ case "low":
+ return "low"
+ case "medium":
+ return "medium"
+ case "high":
+ return "high"
+ case _:
+ return None
+
+
+def _as_security_surface_category(value: object) -> str | None:
+ match value:
+ case (
+ "archive_extraction"
+ | "crypto_transport"
+ | "database_boundary"
+ | "deserialization"
+ | "dynamic_execution"
+ | "dynamic_loading"
+ | "filesystem_mutation"
+ | "identity_token"
+ | "network_boundary"
+ | "process_boundary"
+ ):
+ return value
+ case _:
+ return None
+
+
+def _as_security_surface_location_scope(value: object) -> str | None:
+ match value:
+ case "module" | "class" | "callable":
+ return value
+ case _:
+ return None
+
+
+def _as_security_surface_classification_mode(value: object) -> str | None:
+ match value:
+ case "exact_builtin" | "exact_call" | "exact_import":
+ return value
+ case _:
+ return None
+
+
+def _as_security_surface_evidence_kind(value: object) -> str | None:
+ match value:
+ case "builtin" | "call" | "import":
+ return value
+ case _:
+ return None
+
+
+def _new_optional_metrics_payload() -> tuple[
+ list[ClassMetricsDict],
+ list[ModuleDepDict],
+ list[DeadCandidateDict],
+ list[str],
+ list[str],
+ list[str],
+ list[str],
+ list[SecuritySurfaceDict],
+ ModuleTypingCoverageDict | None,
+ ModuleDocstringCoverageDict | None,
+ ModuleApiSurfaceDict | None,
+]:
+ return [], [], [], [], [], [], [], [], None, None, None
+
+
+def _unit_dict_from_model(unit: Unit, filepath: str) -> UnitDict:
+ return FunctionGroupItem(
+ qualname=unit.qualname,
+ filepath=filepath,
+ start_line=unit.start_line,
+ end_line=unit.end_line,
+ loc=unit.loc,
+ stmt_count=unit.stmt_count,
+ fingerprint=unit.fingerprint,
+ loc_bucket=unit.loc_bucket,
+ cyclomatic_complexity=unit.cyclomatic_complexity,
+ nesting_depth=unit.nesting_depth,
+ risk=unit.risk,
+ raw_hash=unit.raw_hash,
+ entry_guard_count=unit.entry_guard_count,
+ entry_guard_terminal_profile=unit.entry_guard_terminal_profile,
+ entry_guard_has_side_effect_before=unit.entry_guard_has_side_effect_before,
+ terminal_kind=unit.terminal_kind,
+ try_finally_profile=unit.try_finally_profile,
+ side_effect_order_profile=unit.side_effect_order_profile,
+ )
+
+
+def _block_dict_from_model(block: BlockUnit, filepath: str) -> BlockDict:
+ return BlockGroupItem(
+ block_hash=block.block_hash,
+ filepath=filepath,
+ qualname=block.qualname,
+ start_line=block.start_line,
+ end_line=block.end_line,
+ size=block.size,
+ )
+
+
+def _segment_dict_from_model(segment: SegmentUnit, filepath: str) -> SegmentDict:
+ return SegmentGroupItem(
+ segment_hash=segment.segment_hash,
+ segment_sig=segment.segment_sig,
+ filepath=filepath,
+ qualname=segment.qualname,
+ start_line=segment.start_line,
+ end_line=segment.end_line,
+ size=segment.size,
+ )
+
+
+def _typing_coverage_dict_from_model(
+ coverage: ModuleTypingCoverage | None,
+ *,
+ filepath: str,
+) -> ModuleTypingCoverageDict | None:
+ if coverage is None:
+ return None
+ return ModuleTypingCoverageDict(
+ module=coverage.module,
+ filepath=filepath,
+ callable_count=coverage.callable_count,
+ params_total=coverage.params_total,
+ params_annotated=coverage.params_annotated,
+ returns_total=coverage.returns_total,
+ returns_annotated=coverage.returns_annotated,
+ any_annotation_count=coverage.any_annotation_count,
+ )
+
+
+def _docstring_coverage_dict_from_model(
+ coverage: ModuleDocstringCoverage | None,
+ *,
+ filepath: str,
+) -> ModuleDocstringCoverageDict | None:
+ if coverage is None:
+ return None
+ return ModuleDocstringCoverageDict(
+ module=coverage.module,
+ filepath=filepath,
+ public_symbol_total=coverage.public_symbol_total,
+ public_symbol_documented=coverage.public_symbol_documented,
+ )
+
+
+def _api_surface_dict_from_model(
+ surface: ModuleApiSurface | None,
+ *,
+ filepath: str,
+) -> ModuleApiSurfaceDict | None:
+ if surface is None:
+ return None
+ return ModuleApiSurfaceDict(
+ module=surface.module,
+ filepath=filepath,
+ all_declared=list(surface.all_declared or ()),
+ symbols=[
+ PublicSymbolDict(
+ qualname=symbol.qualname,
+ kind=symbol.kind,
+ start_line=symbol.start_line,
+ end_line=symbol.end_line,
+ params=[
+ ApiParamSpecDict(
+ name=param.name,
+ kind=param.kind,
+ has_default=param.has_default,
+ annotation_hash=param.annotation_hash,
+ )
+ for param in symbol.params
+ ],
+ returns_hash=symbol.returns_hash,
+ exported_via=symbol.exported_via,
+ )
+ for symbol in surface.symbols
+ ],
+ )
+
+
+def _class_metrics_dict_from_model(
+ metric: ClassMetrics,
+ filepath: str,
+) -> ClassMetricsDict:
+ return ClassMetricsDict(
+ qualname=metric.qualname,
+ filepath=filepath,
+ start_line=metric.start_line,
+ end_line=metric.end_line,
+ cbo=metric.cbo,
+ lcom4=metric.lcom4,
+ method_count=metric.method_count,
+ instance_var_count=metric.instance_var_count,
+ risk_coupling=metric.risk_coupling,
+ risk_cohesion=metric.risk_cohesion,
+ coupled_classes=sorted(set(metric.coupled_classes)),
+ )
+
+
+def _module_dep_dict_from_model(dep: ModuleDep) -> ModuleDepDict:
+ return ModuleDepDict(
+ source=dep.source,
+ target=dep.target,
+ import_type=dep.import_type,
+ line=dep.line,
+ )
+
+
+def _dead_candidate_dict_from_model(
+ candidate: DeadCandidate,
+ filepath: str,
+) -> DeadCandidateDict:
+ result = DeadCandidateDict(
+ qualname=candidate.qualname,
+ local_name=candidate.local_name,
+ filepath=filepath,
+ start_line=candidate.start_line,
+ end_line=candidate.end_line,
+ kind=candidate.kind,
+ )
+ if candidate.suppressed_rules:
+ result["suppressed_rules"] = sorted(set(candidate.suppressed_rules))
+ return result
+
+
+def _security_surface_dict_from_model(
+ surface: SecuritySurface,
+ filepath: str,
+) -> SecuritySurfaceDict:
+ return SecuritySurfaceDict(
+ category=surface.category,
+ capability=surface.capability,
+ module=surface.module,
+ filepath=filepath,
+ qualname=surface.qualname,
+ start_line=surface.start_line,
+ end_line=surface.end_line,
+ location_scope=surface.location_scope,
+ classification_mode=surface.classification_mode,
+ evidence_kind=surface.evidence_kind,
+ evidence_symbol=surface.evidence_symbol,
+ )
+
+
+def _structural_occurrence_dict_from_model(
+ occurrence: StructuralFindingOccurrence,
+) -> StructuralFindingOccurrenceDict:
+ return StructuralFindingOccurrenceDict(
+ qualname=occurrence.qualname,
+ start=occurrence.start,
+ end=occurrence.end,
+ )
+
+
+def _structural_group_dict_from_model(
+ group: StructuralFindingGroup,
+) -> StructuralFindingGroupDict:
+ return StructuralFindingGroupDict(
+ finding_kind=group.finding_kind,
+ finding_key=group.finding_key,
+ signature=dict(group.signature),
+ items=[
+ _structural_occurrence_dict_from_model(occurrence)
+ for occurrence in group.items
+ ],
+ )
+
+
+__all__ = [
+ "ApiParamSpecDict",
+ "BlockDict",
+ "CacheEntry",
+ "CacheEntryBase",
+ "ClassMetricsDict",
+ "DeadCandidateDict",
+ "FileStat",
+ "ModuleApiSurfaceDict",
+ "ModuleDepDict",
+ "ModuleDocstringCoverageDict",
+ "ModuleTypingCoverageDict",
+ "PublicSymbolDict",
+ "SecuritySurfaceDict",
+ "SegmentDict",
+ "SourceStatsDict",
+ "StructuralFindingGroupDict",
+ "StructuralFindingOccurrenceDict",
+ "UnitDict",
+ "_api_surface_dict_from_model",
+ "_as_risk_literal",
+ "_as_security_surface_category",
+ "_as_security_surface_classification_mode",
+ "_as_security_surface_evidence_kind",
+ "_as_security_surface_location_scope",
+ "_block_dict_from_model",
+ "_class_metrics_dict_from_model",
+ "_dead_candidate_dict_from_model",
+ "_docstring_coverage_dict_from_model",
+ "_module_dep_dict_from_model",
+ "_new_optional_metrics_payload",
+ "_normalize_cached_structural_group",
+ "_normalize_cached_structural_groups",
+ "_security_surface_dict_from_model",
+ "_segment_dict_from_model",
+ "_structural_group_dict_from_model",
+ "_structural_occurrence_dict_from_model",
+ "_typing_coverage_dict_from_model",
+ "_unit_dict_from_model",
+]
diff --git a/codeclone/cache_io.py b/codeclone/cache/integrity.py
similarity index 80%
rename from codeclone/cache_io.py
rename to codeclone/cache/integrity.py
index c077cc8..12086b1 100644
--- a/codeclone/cache_io.py
+++ b/codeclone/cache/integrity.py
@@ -11,13 +11,9 @@
from collections.abc import Mapping
from pathlib import Path
-from ._json_io import (
- json_text as _json_text,
-)
-from ._json_io import (
- read_json_document as _read_json_document,
-)
-from ._json_io import (
+from ..utils.json_io import json_text as _json_text
+from ..utils.json_io import read_json_document as _read_json_document
+from ..utils.json_io import (
write_json_document_atomically as _write_json_document_atomically,
)
@@ -64,3 +60,16 @@ def read_json_document(path: Path) -> object:
def write_json_document_atomically(path: Path, document: object) -> None:
_write_json_document_atomically(path, document, sort_keys=True)
+
+
+__all__ = [
+ "as_int_or_none",
+ "as_object_list",
+ "as_str_dict",
+ "as_str_or_none",
+ "canonical_json",
+ "read_json_document",
+ "sign_cache_payload",
+ "verify_cache_payload_signature",
+ "write_json_document_atomically",
+]
diff --git a/codeclone/cache_segments.py b/codeclone/cache/projection.py
similarity index 82%
rename from codeclone/cache_segments.py
rename to codeclone/cache/projection.py
index a771e51..49b2db3 100644
--- a/codeclone/cache_segments.py
+++ b/codeclone/cache/projection.py
@@ -10,18 +10,57 @@
from pathlib import Path
from typing import TypedDict
-from .cache_io import (
+from ..models import SegmentGroupItem
+from .integrity import (
as_int_or_none,
as_object_list,
as_str_dict,
as_str_or_none,
)
-from .cache_paths import runtime_filepath_from_wire, wire_filepath_from_runtime
-from .models import SegmentGroupItem
SegmentDict = SegmentGroupItem
+def wire_filepath_from_runtime(
+ runtime_filepath: str,
+ *,
+ root: Path | None,
+) -> str:
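+    # Prefer a root-relative POSIX path; retry with both sides resolved, and
+    # keep the original spelling for files that live outside the root.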
+ runtime_path = Path(runtime_filepath)
+ if root is None:
+ return runtime_path.as_posix()
+
+ try:
+ relative = runtime_path.relative_to(root)
+ return relative.as_posix()
+ except ValueError:
+ pass
+
+ try:
+ relative = runtime_path.resolve().relative_to(root.resolve())
+ return relative.as_posix()
+    except (OSError, ValueError):
+        return runtime_path.as_posix()
+
+
+def runtime_filepath_from_wire(
+ wire_filepath: str,
+ *,
+ root: Path | None,
+) -> str:
+ wire_path = Path(wire_filepath)
+ if root is None or wire_path.is_absolute():
+ return str(wire_path)
+
+ combined = root / wire_path
+ try:
+ return str(combined.resolve(strict=False))
+ except OSError:
+ return str(combined)
+
+
class SegmentReportProjection(TypedDict):
digest: str
suppressed: int
@@ -182,3 +221,14 @@ def encode_segment_report_projection(
"s": max(0, int(projection["suppressed"])),
"g": groups_rows,
}
+
+
+__all__ = [
+ "SegmentDict",
+ "SegmentReportProjection",
+ "build_segment_report_projection",
+ "decode_segment_report_projection",
+ "encode_segment_report_projection",
+ "runtime_filepath_from_wire",
+ "wire_filepath_from_runtime",
+]
diff --git a/codeclone/cache/store.py b/codeclone/cache/store.py
new file mode 100644
index 0000000..6ac9884
--- /dev/null
+++ b/codeclone/cache/store.py
@@ -0,0 +1,681 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+import os
+from collections.abc import Collection
+from json import JSONDecodeError
+from pathlib import Path
+from typing import Protocol
+
+from ..baseline.trust import current_python_tag
+from ..contracts import (
+ BASELINE_FINGERPRINT_VERSION,
+ CACHE_VERSION,
+ DEFAULT_BLOCK_MIN_LOC,
+ DEFAULT_BLOCK_MIN_STMT,
+ DEFAULT_MIN_LOC,
+ DEFAULT_MIN_STMT,
+ DEFAULT_SEGMENT_MIN_LOC,
+ DEFAULT_SEGMENT_MIN_STMT,
+)
+from ..contracts.errors import CacheError
+from ..models import BlockUnit, FileMetrics, SegmentUnit, StructuralFindingGroup, Unit
+from ._canonicalize import (
+ _as_file_stat_dict,
+ _as_typed_block_list,
+ _as_typed_segment_list,
+ _as_typed_unit_list,
+ _attach_optional_cache_sections,
+ _canonicalize_cache_entry,
+ _decode_optional_cache_sections,
+ _is_canonical_cache_entry,
+)
+from ._wire_decode import _decode_wire_file_entry
+from ._wire_encode import _encode_wire_file_entry
+from .entries import (
+ CacheEntry,
+ FileStat,
+ SourceStatsDict,
+ _api_surface_dict_from_model,
+ _block_dict_from_model,
+ _class_metrics_dict_from_model,
+ _dead_candidate_dict_from_model,
+ _docstring_coverage_dict_from_model,
+ _module_dep_dict_from_model,
+ _new_optional_metrics_payload,
+ _normalize_cached_structural_groups,
+ _security_surface_dict_from_model,
+ _segment_dict_from_model,
+ _structural_group_dict_from_model,
+ _typing_coverage_dict_from_model,
+ _unit_dict_from_model,
+)
+from .integrity import (
+ as_str_dict as _as_str_dict,
+)
+from .integrity import (
+ as_str_or_none as _as_str,
+)
+from .integrity import (
+ read_json_document,
+ sign_cache_payload,
+ verify_cache_payload_signature,
+ write_json_document_atomically,
+)
+from .projection import (
+ SegmentReportProjection,
+ decode_segment_report_projection,
+ encode_segment_report_projection,
+ runtime_filepath_from_wire,
+ wire_filepath_from_runtime,
+)
+from .versioning import (
+ LEGACY_CACHE_SECRET_FILENAME,
+ MAX_CACHE_SIZE_BYTES,
+ AnalysisProfile,
+ CacheData,
+ CacheStatus,
+ _as_analysis_profile,
+ _empty_cache_data,
+ _resolve_root,
+)
+
+
+class _CacheStatusLike(Protocol):
+ @property
+ def load_status(self) -> CacheStatus | str | None: ...
+
+ @property
+ def load_warning(self) -> str | None: ...
+
+ @property
+ def cache_schema_version(self) -> str | None: ...
+
+
+def resolve_cache_status(cache: _CacheStatusLike) -> tuple[CacheStatus, str | None]:
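+    # Accept a CacheStatus enum, its string value, or an object missing the
+    # attribute entirely; unrecognized values degrade to OK when no warning
+    # was recorded, or INVALID_TYPE when one was.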
+ raw_cache_status = getattr(cache, "load_status", None)
+ load_warning = getattr(cache, "load_warning", None)
+ if isinstance(raw_cache_status, CacheStatus):
+ cache_status = raw_cache_status
+ elif isinstance(raw_cache_status, str):
+ try:
+ cache_status = CacheStatus(raw_cache_status)
+ except ValueError:
+ cache_status = (
+ CacheStatus.OK if load_warning is None else CacheStatus.INVALID_TYPE
+ )
+ else:
+ cache_status = (
+ CacheStatus.OK if load_warning is None else CacheStatus.INVALID_TYPE
+ )
+
+ raw_cache_schema_version = getattr(cache, "cache_schema_version", None)
+ cache_schema_version = (
+ raw_cache_schema_version if isinstance(raw_cache_schema_version, str) else None
+ )
+ return cache_status, cache_schema_version
+
+
+class Cache:
+ __slots__ = (
+ "_canonical_runtime_paths",
+ "_dirty",
+ "analysis_profile",
+ "cache_schema_version",
+ "data",
+ "fingerprint_version",
+ "legacy_secret_warning",
+ "load_status",
+ "load_warning",
+ "max_size_bytes",
+ "path",
+ "root",
+ "segment_report_projection",
+ )
+
+ _CACHE_VERSION = CACHE_VERSION
+
+ def __init__(
+ self,
+ path: str | Path,
+ *,
+ root: str | Path | None = None,
+ max_size_bytes: int | None = None,
+ min_loc: int = DEFAULT_MIN_LOC,
+ min_stmt: int = DEFAULT_MIN_STMT,
+ block_min_loc: int = DEFAULT_BLOCK_MIN_LOC,
+ block_min_stmt: int = DEFAULT_BLOCK_MIN_STMT,
+ segment_min_loc: int = DEFAULT_SEGMENT_MIN_LOC,
+ segment_min_stmt: int = DEFAULT_SEGMENT_MIN_STMT,
+ collect_api_surface: bool = False,
+ ):
+ self.path = Path(path)
+ self.root = _resolve_root(root)
+ self.fingerprint_version = BASELINE_FINGERPRINT_VERSION
+ self.analysis_profile: AnalysisProfile = {
+ "min_loc": min_loc,
+ "min_stmt": min_stmt,
+ "block_min_loc": block_min_loc,
+ "block_min_stmt": block_min_stmt,
+ "segment_min_loc": segment_min_loc,
+ "segment_min_stmt": segment_min_stmt,
+ "collect_api_surface": collect_api_surface,
+ }
+ self.data: CacheData = _empty_cache_data(
+ version=self._CACHE_VERSION,
+ python_tag=current_python_tag(),
+ fingerprint_version=self.fingerprint_version,
+ analysis_profile=self.analysis_profile,
+ )
+ self._canonical_runtime_paths: set[str] = set()
+ self.legacy_secret_warning = self._detect_legacy_secret_warning()
+ self.cache_schema_version: str | None = None
+ self.load_status = CacheStatus.MISSING
+ self.load_warning: str | None = self.legacy_secret_warning
+ self.max_size_bytes = (
+ MAX_CACHE_SIZE_BYTES if max_size_bytes is None else max_size_bytes
+ )
+ self.segment_report_projection: SegmentReportProjection | None = None
+ self._dirty: bool = True
+
+ def _detect_legacy_secret_warning(self) -> str | None:
+ secret_path = self.path.parent / LEGACY_CACHE_SECRET_FILENAME
+ try:
+ if secret_path.exists():
+ return (
+ f"Legacy cache secret file detected at {secret_path}; "
+ "delete this obsolete file."
+ )
+ except OSError as exc:
+ return f"Legacy cache secret check failed: {exc}"
+ return None
+
+ def _set_load_warning(self, message: str | None) -> None:
+ warning = message
+ if warning is None:
+ warning = self.legacy_secret_warning
+ elif self.legacy_secret_warning:
+ warning = f"{warning}\n{self.legacy_secret_warning}"
+ self.load_warning = warning
+
+ def _ignore_cache(
+ self,
+ message: str,
+ *,
+ status: CacheStatus,
+ schema_version: str | None = None,
+ ) -> None:
+ self._set_load_warning(message)
+ self.load_status = status
+ self.cache_schema_version = schema_version
+ self.data = _empty_cache_data(
+ version=self._CACHE_VERSION,
+ python_tag=current_python_tag(),
+ fingerprint_version=self.fingerprint_version,
+ analysis_profile=self.analysis_profile,
+ )
+ self._canonical_runtime_paths = set()
+ self.segment_report_projection = None
+
+ def _reject_cache_load(
+ self,
+ message: str,
+ *,
+ status: CacheStatus,
+ schema_version: str | None = None,
+ ) -> CacheData | None:
+ self._ignore_cache(
+ message,
+ status=status,
+ schema_version=schema_version,
+ )
+ return None
+
+ def _reject_invalid_cache_format(
+ self,
+ *,
+ schema_version: str | None = None,
+ ) -> CacheData | None:
+ return self._reject_cache_load(
+ "Cache format invalid; ignoring cache.",
+ status=CacheStatus.INVALID_TYPE,
+ schema_version=schema_version,
+ )
+
+ def _reject_version_mismatch(self, version: str) -> CacheData | None:
+ return self._reject_cache_load(
+ f"Cache version mismatch (found {version}); ignoring cache.",
+ status=CacheStatus.VERSION_MISMATCH,
+ schema_version=version,
+ )
+
+ def load(self) -> None:
+ try:
+ exists = self.path.exists()
+ except OSError as exc:
+ self._ignore_cache(
+ f"Cache unreadable; ignoring cache: {exc}",
+ status=CacheStatus.UNREADABLE,
+ )
+ return
+
+ if not exists:
+ self._set_load_warning(None)
+ self.load_status = CacheStatus.MISSING
+ self.cache_schema_version = None
+ self._canonical_runtime_paths = set()
+ self.segment_report_projection = None
+ return
+
+ try:
+ size = self.path.stat().st_size
+ if size > self.max_size_bytes:
+ self._ignore_cache(
+ "Cache file too large "
+ f"({size} bytes, max {self.max_size_bytes}); ignoring cache.",
+ status=CacheStatus.TOO_LARGE,
+ )
+ return
+
+ raw_obj = read_json_document(self.path)
+ parsed = self._load_and_validate(raw_obj)
+ if parsed is None:
+ return
+ self.data = parsed
+ self._canonical_runtime_paths = set(parsed["files"].keys())
+ self.load_status = CacheStatus.OK
+ self._set_load_warning(None)
+ self._dirty = False
+ except OSError as exc:
+ self._ignore_cache(
+ f"Cache unreadable; ignoring cache: {exc}",
+ status=CacheStatus.UNREADABLE,
+ )
+ except JSONDecodeError:
+ self._ignore_cache(
+ "Cache corrupted; ignoring cache.",
+ status=CacheStatus.INVALID_JSON,
+ )
+
+ def _load_and_validate(self, raw_obj: object) -> CacheData | None:
+ raw = _as_str_dict(raw_obj)
+ if raw is None:
+ return self._reject_invalid_cache_format()
+
+ legacy_version = _as_str(raw.get("version"))
+ if legacy_version is not None:
+ return self._reject_version_mismatch(legacy_version)
+
+ version = _as_str(raw.get("v"))
+ if version is None:
+ return self._reject_invalid_cache_format()
+
+ if version != self._CACHE_VERSION:
+ return self._reject_version_mismatch(version)
+
+ sig = _as_str(raw.get("sig"))
+ payload = _as_str_dict(raw.get("payload"))
+ if sig is None or payload is None:
+ return self._reject_invalid_cache_format(schema_version=version)
+
+ if not verify_cache_payload_signature(payload, sig):
+ return self._reject_cache_load(
+ "Cache signature mismatch; ignoring cache.",
+ status=CacheStatus.INTEGRITY_FAILED,
+ schema_version=version,
+ )
+
+ runtime_tag = current_python_tag()
+ py_tag = _as_str(payload.get("py"))
+ if py_tag is None:
+ return self._reject_invalid_cache_format(schema_version=version)
+
+ if py_tag != runtime_tag:
+ return self._reject_cache_load(
+ "Cache python tag mismatch "
+ f"(found {py_tag}, expected {runtime_tag}); ignoring cache.",
+ status=CacheStatus.PYTHON_TAG_MISMATCH,
+ schema_version=version,
+ )
+
+ fp_version = _as_str(payload.get("fp"))
+ if fp_version is None:
+ return self._reject_invalid_cache_format(schema_version=version)
+
+ if fp_version != self.fingerprint_version:
+ return self._reject_cache_load(
+ "Cache fingerprint version mismatch "
+ f"(found {fp_version}, expected {self.fingerprint_version}); "
+ "ignoring cache.",
+ status=CacheStatus.FINGERPRINT_MISMATCH,
+ schema_version=version,
+ )
+
+ analysis_profile = _as_analysis_profile(payload.get("ap"))
+ if analysis_profile is None:
+ return self._reject_invalid_cache_format(schema_version=version)
+
+ if analysis_profile != self.analysis_profile:
+ return self._reject_cache_load(
+ "Cache analysis profile mismatch "
+ f"(found min_loc={analysis_profile['min_loc']}, "
+ f"min_stmt={analysis_profile['min_stmt']}, "
+ "collect_api_surface="
+ f"{str(analysis_profile['collect_api_surface']).lower()}; "
+ f"expected min_loc={self.analysis_profile['min_loc']}, "
+ f"min_stmt={self.analysis_profile['min_stmt']}, "
+ "collect_api_surface="
+ f"{str(self.analysis_profile['collect_api_surface']).lower()}); "
+ "ignoring cache.",
+ status=CacheStatus.ANALYSIS_PROFILE_MISMATCH,
+ schema_version=version,
+ )
+
+ files_dict = _as_str_dict(payload.get("files"))
+ if files_dict is None:
+ return self._reject_invalid_cache_format(schema_version=version)
+
+ parsed_files: dict[str, CacheEntry] = {}
+ for wire_path, file_entry_obj in files_dict.items():
+ runtime_path = runtime_filepath_from_wire(wire_path, root=self.root)
+ parsed_entry = self._decode_entry(file_entry_obj, runtime_path)
+ if parsed_entry is None:
+ return self._reject_invalid_cache_format(schema_version=version)
+ parsed_files[runtime_path] = _canonicalize_cache_entry(parsed_entry)
+ self.segment_report_projection = decode_segment_report_projection(
+ payload.get("sr"),
+ root=self.root,
+ )
+
+ self.cache_schema_version = version
+ return CacheData(
+ version=self._CACHE_VERSION,
+ python_tag=runtime_tag,
+ fingerprint_version=self.fingerprint_version,
+ analysis_profile=self.analysis_profile,
+ files=parsed_files,
+ )
+
+ def save(self) -> None:
+ if not self._dirty:
+ return
+ try:
+ wire_files: dict[str, object] = {}
+ wire_map = {
+ runtime_path: wire_filepath_from_runtime(runtime_path, root=self.root)
+ for runtime_path in self.data["files"]
+ }
+ for runtime_path in sorted(self.data["files"], key=wire_map.__getitem__):
+ entry = self.get_file_entry(runtime_path)
+ if entry is None:
+ continue
+ wire_files[wire_map[runtime_path]] = self._encode_entry(entry)
+
+ payload: dict[str, object] = {
+ "py": current_python_tag(),
+ "fp": self.fingerprint_version,
+ "ap": self.analysis_profile,
+ "files": wire_files,
+ }
+ segment_projection = encode_segment_report_projection(
+ self.segment_report_projection,
+ root=self.root,
+ )
+ if segment_projection is not None:
+ payload["sr"] = segment_projection
+ signed_doc = {
+ "v": self._CACHE_VERSION,
+ "payload": payload,
+ "sig": sign_cache_payload(payload),
+ }
+ write_json_document_atomically(self.path, signed_doc)
+ self._dirty = False
+
+ self.data["version"] = self._CACHE_VERSION
+ self.data["python_tag"] = current_python_tag()
+ self.data["fingerprint_version"] = self.fingerprint_version
+ self.data["analysis_profile"] = self.analysis_profile
+ except OSError as exc:
+ raise CacheError(f"Failed to save cache: {exc}") from exc
+
+ @staticmethod
+ def _decode_entry(value: object, filepath: str) -> CacheEntry | None:
+ return _decode_wire_file_entry(value, filepath)
+
+ @staticmethod
+ def _encode_entry(entry: CacheEntry) -> dict[str, object]:
+ return _encode_wire_file_entry(entry)
+
+ def _store_canonical_file_entry(
+ self,
+ *,
+ runtime_path: str,
+ canonical_entry: CacheEntry,
+ ) -> CacheEntry:
+ previous_entry = self.data["files"].get(runtime_path)
+ was_canonical = runtime_path in self._canonical_runtime_paths
+ self.data["files"][runtime_path] = canonical_entry
+ self._canonical_runtime_paths.add(runtime_path)
+ if not was_canonical or previous_entry != canonical_entry:
+ self._dirty = True
+ return canonical_entry
+
+ def get_file_entry(self, filepath: str) -> CacheEntry | None:
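+        # Try the caller's spelling first; on a miss, normalize the path via a
+        # wire round-trip so relative and absolute forms share one cache key.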
+ runtime_lookup_key = filepath
+ entry_obj = self.data["files"].get(runtime_lookup_key)
+ if entry_obj is None:
+ wire_key = wire_filepath_from_runtime(filepath, root=self.root)
+ runtime_lookup_key = runtime_filepath_from_wire(wire_key, root=self.root)
+ entry_obj = self.data["files"].get(runtime_lookup_key)
+
+ if entry_obj is None:
+ return None
+
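+        # Entries already proven canonical are returned as-is; anything else
+        # is re-validated field by field and canonicalized below.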
+ if runtime_lookup_key in self._canonical_runtime_paths:
+ if _is_canonical_cache_entry(entry_obj):
+ return entry_obj
+ self._canonical_runtime_paths.discard(runtime_lookup_key)
+
+ if not isinstance(entry_obj, dict):
+ return None
+
+ stat = _as_file_stat_dict(entry_obj.get("stat"))
+ units = _as_typed_unit_list(entry_obj.get("units"))
+ blocks = _as_typed_block_list(entry_obj.get("blocks"))
+ segments = _as_typed_segment_list(entry_obj.get("segments"))
+ if stat is None or units is None or blocks is None or segments is None:
+ return None
+
+ optional_sections = _decode_optional_cache_sections(entry_obj)
+ if optional_sections is None:
+ return None
+ (
+ class_metrics_raw,
+ module_deps_raw,
+ dead_candidates_raw,
+ referenced_names_raw,
+ referenced_qualnames_raw,
+ import_names_raw,
+ class_names_raw,
+ security_surfaces_raw,
+ typing_coverage_raw,
+ docstring_coverage_raw,
+ api_surface_raw,
+ source_stats,
+ structural_findings,
+ ) = optional_sections
+
+ entry_to_canonicalize: CacheEntry = _attach_optional_cache_sections(
+ CacheEntry(
+ stat=stat,
+ units=units,
+ blocks=blocks,
+ segments=segments,
+ class_metrics=class_metrics_raw,
+ module_deps=module_deps_raw,
+ dead_candidates=dead_candidates_raw,
+ referenced_names=referenced_names_raw,
+ referenced_qualnames=referenced_qualnames_raw,
+ import_names=import_names_raw,
+ class_names=class_names_raw,
+ security_surfaces=security_surfaces_raw,
+ ),
+ typing_coverage=typing_coverage_raw,
+ docstring_coverage=docstring_coverage_raw,
+ api_surface=api_surface_raw,
+ security_surfaces=security_surfaces_raw,
+ source_stats=source_stats,
+ structural_findings=structural_findings,
+ )
+ canonical_entry = _canonicalize_cache_entry(entry_to_canonicalize)
+ return self._store_canonical_file_entry(
+ runtime_path=runtime_lookup_key,
+ canonical_entry=canonical_entry,
+ )
+
+ def put_file_entry(
+ self,
+ filepath: str,
+ stat_sig: FileStat,
+ units: list[Unit],
+ blocks: list[BlockUnit],
+ segments: list[SegmentUnit],
+ *,
+ source_stats: SourceStatsDict | None = None,
+ file_metrics: FileMetrics | None = None,
+ structural_findings: list[StructuralFindingGroup] | None = None,
+ ) -> None:
+ runtime_path = runtime_filepath_from_wire(
+ wire_filepath_from_runtime(filepath, root=self.root),
+ root=self.root,
+ )
+
+ unit_rows = [_unit_dict_from_model(unit, runtime_path) for unit in units]
+ block_rows = [_block_dict_from_model(block, runtime_path) for block in blocks]
+ segment_rows = [
+ _segment_dict_from_model(segment, runtime_path) for segment in segments
+ ]
+
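+        # Optional metric sections start empty and are filled only when
+        # file_metrics was computed for this file.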
+ (
+ class_metrics_rows,
+ module_dep_rows,
+ dead_candidate_rows,
+ referenced_names,
+ referenced_qualnames,
+ import_names,
+ class_names,
+ security_surfaces,
+ typing_coverage,
+ docstring_coverage,
+ api_surface,
+ ) = _new_optional_metrics_payload()
+ if file_metrics is not None:
+ class_metrics_rows = [
+ _class_metrics_dict_from_model(metric, runtime_path)
+ for metric in file_metrics.class_metrics
+ ]
+ module_dep_rows = [
+ _module_dep_dict_from_model(dep) for dep in file_metrics.module_deps
+ ]
+ dead_candidate_rows = [
+ _dead_candidate_dict_from_model(candidate, runtime_path)
+ for candidate in file_metrics.dead_candidates
+ ]
+ referenced_names = sorted(set(file_metrics.referenced_names))
+ referenced_qualnames = sorted(set(file_metrics.referenced_qualnames))
+ import_names = sorted(set(file_metrics.import_names))
+ class_names = sorted(set(file_metrics.class_names))
+ security_surfaces = [
+ _security_surface_dict_from_model(surface, runtime_path)
+ for surface in file_metrics.security_surfaces
+ ]
+ typing_coverage = _typing_coverage_dict_from_model(
+ file_metrics.typing_coverage,
+ filepath=runtime_path,
+ )
+ docstring_coverage = _docstring_coverage_dict_from_model(
+ file_metrics.docstring_coverage,
+ filepath=runtime_path,
+ )
+ api_surface = _api_surface_dict_from_model(
+ file_metrics.api_surface,
+ filepath=runtime_path,
+ )
+
+ source_stats_payload = source_stats or SourceStatsDict(
+ lines=0,
+ functions=0,
+ methods=0,
+ classes=0,
+ )
+ entry_dict = CacheEntry(
+ stat=stat_sig,
+ source_stats=source_stats_payload,
+ units=unit_rows,
+ blocks=block_rows,
+ segments=segment_rows,
+ class_metrics=class_metrics_rows,
+ module_deps=module_dep_rows,
+ dead_candidates=dead_candidate_rows,
+ referenced_names=referenced_names,
+ referenced_qualnames=referenced_qualnames,
+ import_names=import_names,
+ class_names=class_names,
+ security_surfaces=security_surfaces,
+ )
+ if typing_coverage is not None:
+ entry_dict["typing_coverage"] = typing_coverage
+ if docstring_coverage is not None:
+ entry_dict["docstring_coverage"] = docstring_coverage
+ if api_surface is not None:
+ entry_dict["api_surface"] = api_surface
+ if structural_findings is not None:
+ entry_dict["structural_findings"] = _normalize_cached_structural_groups(
+ [
+ _structural_group_dict_from_model(group)
+ for group in structural_findings
+ ],
+ filepath=runtime_path,
+ )
+ canonical_entry = _canonicalize_cache_entry(entry_dict)
+ self._store_canonical_file_entry(
+ runtime_path=runtime_path,
+ canonical_entry=canonical_entry,
+ )
+
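+    # Drop entries for files that no longer exist; returns the removed count.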
+ def prune_file_entries(self, existing_filepaths: Collection[str]) -> int:
+ keep_runtime_paths = {
+ runtime_filepath_from_wire(
+ wire_filepath_from_runtime(filepath, root=self.root),
+ root=self.root,
+ )
+ for filepath in existing_filepaths
+ }
+ stale_runtime_paths = sorted(
+ runtime_path
+ for runtime_path in self.data["files"]
+ if runtime_path not in keep_runtime_paths
+ )
+ if not stale_runtime_paths:
+ return 0
+ for runtime_path in stale_runtime_paths:
+ self.data["files"].pop(runtime_path, None)
+ self._canonical_runtime_paths.discard(runtime_path)
+ self._dirty = True
+ return len(stale_runtime_paths)
+
+
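+# A file's (mtime_ns, size) pair serves as its cheap change signature for
+# cache validation.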
+def file_stat_signature(path: str) -> FileStat:
+ stat_result = os.stat(path)
+ return FileStat(
+ mtime_ns=stat_result.st_mtime_ns,
+ size=stat_result.st_size,
+ )
+
+
+__all__ = ["Cache", "file_stat_signature"]
diff --git a/codeclone/cache/versioning.py b/codeclone/cache/versioning.py
new file mode 100644
index 0000000..2081242
--- /dev/null
+++ b/codeclone/cache/versioning.py
@@ -0,0 +1,136 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from enum import Enum
+from pathlib import Path
+from typing import TypedDict
+
+from ..contracts import CACHE_VERSION, DEFAULT_MAX_CACHE_SIZE_MB
+from ..contracts.schemas import AnalysisProfile
+from .entries import CacheEntry
+from .integrity import as_int_or_none, as_str_dict
+
+MAX_CACHE_SIZE_BYTES = DEFAULT_MAX_CACHE_SIZE_MB * 1024 * 1024
+LEGACY_CACHE_SECRET_FILENAME = ".cache_secret"
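+# Default wire-format values for a unit's flow-profile fields.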
+_DEFAULT_WIRE_UNIT_FLOW_PROFILES = (
+ 0,
+ "none",
+ False,
+ "fallthrough",
+ "none",
+ "none",
+)
+
+
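+# Why a cache load succeeded or was rejected; the string values are surfaced
+# in diagnostics and report metadata.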
+class CacheStatus(str, Enum):
+ OK = "ok"
+ MISSING = "missing"
+ TOO_LARGE = "too_large"
+ UNREADABLE = "unreadable"
+ INVALID_JSON = "invalid_json"
+ INVALID_TYPE = "invalid_type"
+ VERSION_MISMATCH = "version_mismatch"
+ PYTHON_TAG_MISMATCH = "python_tag_mismatch"
+    FINGERPRINT_MISMATCH = "fingerprint_version_mismatch"
+ ANALYSIS_PROFILE_MISMATCH = "analysis_profile_mismatch"
+ INTEGRITY_FAILED = "integrity_failed"
+
+
+class CacheData(TypedDict):
+ version: str
+ python_tag: str
+ fingerprint_version: str
+ analysis_profile: AnalysisProfile
+ files: dict[str, CacheEntry]
+
+
+def _empty_cache_data(
+ *,
+ version: str = CACHE_VERSION,
+ python_tag: str,
+ fingerprint_version: str,
+ analysis_profile: AnalysisProfile,
+) -> CacheData:
+ return CacheData(
+ version=version,
+ python_tag=python_tag,
+ fingerprint_version=fingerprint_version,
+ analysis_profile=analysis_profile,
+ files={},
+ )
+
+
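+# Decode an untyped mapping into an AnalysisProfile; any missing or mistyped
+# field yields None so callers can treat the cache as a profile mismatch.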
+def _as_analysis_profile(value: object) -> AnalysisProfile | None:
+ obj = as_str_dict(value)
+ if obj is None:
+ return None
+
+ required = {
+ "min_loc",
+ "min_stmt",
+ "block_min_loc",
+ "block_min_stmt",
+ "segment_min_loc",
+ "segment_min_stmt",
+ }
+    if not required <= set(obj.keys()):
+ return None
+
+ min_loc = as_int_or_none(obj.get("min_loc"))
+ min_stmt = as_int_or_none(obj.get("min_stmt"))
+ block_min_loc = as_int_or_none(obj.get("block_min_loc"))
+ block_min_stmt = as_int_or_none(obj.get("block_min_stmt"))
+ segment_min_loc = as_int_or_none(obj.get("segment_min_loc"))
+ segment_min_stmt = as_int_or_none(obj.get("segment_min_stmt"))
+ collect_api_surface_raw = obj.get("collect_api_surface", False)
+ collect_api_surface = (
+ collect_api_surface_raw if isinstance(collect_api_surface_raw, bool) else None
+ )
+ if (
+ min_loc is None
+ or min_stmt is None
+ or block_min_loc is None
+ or block_min_stmt is None
+ or segment_min_loc is None
+ or segment_min_stmt is None
+ or collect_api_surface is None
+ ):
+ return None
+
+ return AnalysisProfile(
+ min_loc=min_loc,
+ min_stmt=min_stmt,
+ block_min_loc=block_min_loc,
+ block_min_stmt=block_min_stmt,
+ segment_min_loc=segment_min_loc,
+ segment_min_stmt=segment_min_stmt,
+ collect_api_surface=collect_api_surface,
+ )
+
+
+def _resolve_root(root: str | Path | None) -> Path | None:
+ if root is None:
+ return None
+ try:
+ return Path(root).resolve(strict=False)
+ except OSError:
+ return None
+
+
+__all__ = [
+ "CACHE_VERSION",
+ "LEGACY_CACHE_SECRET_FILENAME",
+ "MAX_CACHE_SIZE_BYTES",
+ "_DEFAULT_WIRE_UNIT_FLOW_PROFILES",
+ "AnalysisProfile",
+ "CacheData",
+ "CacheStatus",
+ "_as_analysis_profile",
+ "_empty_cache_data",
+ "_resolve_root",
+]
diff --git a/codeclone/cache_paths.py b/codeclone/cache_paths.py
deleted file mode 100644
index 8de7c63..0000000
--- a/codeclone/cache_paths.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# SPDX-License-Identifier: MPL-2.0
-# Copyright (c) 2026 Den Rozhnovskiy
-
-from __future__ import annotations
-
-from pathlib import Path
-
-
-def wire_filepath_from_runtime(
- runtime_filepath: str,
- *,
- root: Path | None,
-) -> str:
- runtime_path = Path(runtime_filepath)
- if root is None:
- return runtime_path.as_posix()
-
- try:
- relative = runtime_path.relative_to(root)
- return relative.as_posix()
- except ValueError:
- pass
-
- try:
- relative = runtime_path.resolve().relative_to(root.resolve())
- return relative.as_posix()
- except OSError:
- return runtime_path.as_posix()
- except ValueError:
- return runtime_path.as_posix()
-
-
-def runtime_filepath_from_wire(
- wire_filepath: str,
- *,
- root: Path | None,
-) -> str:
- wire_path = Path(wire_filepath)
- if root is None or wire_path.is_absolute():
- return str(wire_path)
-
- combined = root / wire_path
- try:
- return str(combined.resolve(strict=False))
- except OSError:
- return str(combined)
diff --git a/codeclone/cli.py b/codeclone/cli.py
deleted file mode 100644
index 09ac8c5..0000000
--- a/codeclone/cli.py
+++ /dev/null
@@ -1,1741 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# SPDX-License-Identifier: MPL-2.0
-# Copyright (c) 2026 Den Rozhnovskiy
-
-from __future__ import annotations
-
-import os
-import subprocess
-import sys
-import time
-from collections.abc import Mapping, Sequence
-from dataclasses import dataclass
-from pathlib import Path
-from typing import TYPE_CHECKING, Literal, Protocol, cast
-
-from . import __version__, _coerce
-from . import ui_messages as ui
-from ._cli_args import build_parser
-from ._cli_baselines import (
- CloneBaselineState as _CloneBaselineStateImpl,
-)
-from ._cli_baselines import (
- MetricsBaselineSectionProbe as _MetricsBaselineSectionProbeImpl,
-)
-from ._cli_baselines import (
- MetricsBaselineState as _MetricsBaselineStateImpl,
-)
-from ._cli_baselines import (
- probe_metrics_baseline_section as _probe_metrics_baseline_section_impl,
-)
-from ._cli_baselines import (
- resolve_clone_baseline_state as _resolve_clone_baseline_state_impl,
-)
-from ._cli_baselines import (
- resolve_metrics_baseline_state as _resolve_metrics_baseline_state_impl,
-)
-from ._cli_config import (
- ConfigValidationError,
- apply_pyproject_config_overrides,
- collect_explicit_cli_dests,
- load_pyproject_config,
-)
-from ._cli_gating import (
- parse_metric_reason_entry as _parse_metric_reason_entry_impl,
-)
-from ._cli_gating import (
- print_gating_failure_block as _print_gating_failure_block_impl,
-)
-from ._cli_paths import _validate_output_path
-from ._cli_reports import (
- write_report_outputs as _write_report_outputs_impl,
-)
-from ._cli_rich import (
- PlainConsole as _PlainConsole,
-)
-from ._cli_rich import (
- make_console as _make_rich_console,
-)
-from ._cli_rich import (
- make_plain_console as _make_plain_console_impl,
-)
-from ._cli_rich import (
- print_banner as _print_banner_impl,
-)
-from ._cli_rich import (
- rich_progress_symbols as _rich_progress_symbols_impl,
-)
-from ._cli_runtime import (
- configure_metrics_mode as _configure_metrics_mode_impl,
-)
-from ._cli_runtime import (
- metrics_computed as _metrics_computed_impl,
-)
-from ._cli_runtime import (
- print_failed_files as _print_failed_files_impl,
-)
-from ._cli_runtime import (
- resolve_cache_path as _resolve_cache_path_impl,
-)
-from ._cli_runtime import (
- resolve_cache_status as _resolve_cache_status_impl,
-)
-from ._cli_runtime import (
- validate_numeric_args as _validate_numeric_args_impl,
-)
-from ._cli_summary import (
- ChangedScopeSnapshot,
- MetricsSnapshot,
- _print_changed_scope,
- _print_metrics,
- _print_summary,
-)
-from ._git_diff import validate_git_diff_ref
-from .baseline import Baseline
-from .cache import Cache, CacheStatus, build_segment_report_projection
-from .contracts import (
- DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD,
- DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD,
- DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD,
- ISSUES_URL,
- ExitCode,
-)
-from .errors import CacheError
-
-if TYPE_CHECKING:
- from argparse import Namespace
- from collections.abc import Callable, Mapping, Sequence
- from types import ModuleType
-
- from rich.console import Console as RichConsole
- from rich.progress import BarColumn as RichBarColumn
- from rich.progress import Progress as RichProgress
- from rich.progress import SpinnerColumn as RichSpinnerColumn
- from rich.progress import TextColumn as RichTextColumn
- from rich.progress import TimeElapsedColumn as RichTimeElapsedColumn
-
- from ._cli_baselines import _BaselineArgs as _BaselineArgsLike
- from ._cli_gating import _GatingArgs as _GatingArgsLike
- from ._cli_reports import _QuietArgs as _QuietArgsLike
- from ._cli_runtime import _RuntimeArgs as _RuntimeArgsLike
- from .models import MetricsDiff
- from .normalize import NormalizationConfig
- from .pipeline import (
- AnalysisResult,
- BootstrapResult,
- DiscoveryResult,
- GatingResult,
- ReportArtifacts,
- )
- from .pipeline import (
- OutputPaths as PipelineOutputPaths,
- )
- from .pipeline import (
- ProcessingResult as PipelineProcessingResult,
- )
-
-MAX_FILE_SIZE = 10 * 1024 * 1024
-__all__ = [
- "MAX_FILE_SIZE",
- "ExitCode",
- "ProcessingResult",
- "analyze",
- "bootstrap",
- "discover",
- "gate",
- "main",
- "process",
- "process_file",
- "report",
-]
-
-# Lazy singleton for pipeline module — deferred import to keep CLI startup fast.
-# Tests monkeypatch this via _pipeline_module() to inject mocks.
-_PIPELINE_MODULE: ModuleType | None = None
-
-
-def _pipeline_module() -> ModuleType:
- global _PIPELINE_MODULE
- if _PIPELINE_MODULE is None:
- from . import pipeline as _pipeline
-
- _PIPELINE_MODULE = _pipeline
- return _PIPELINE_MODULE
-
-
-@dataclass(frozen=True, slots=True)
-class OutputPaths:
- html: Path | None = None
- json: Path | None = None
- text: Path | None = None
- md: Path | None = None
- sarif: Path | None = None
-
-
-@dataclass(frozen=True, slots=True)
-class ProcessingResult:
- filepath: str
- success: bool
- error: str | None = None
- units: list[object] | None = None
- blocks: list[object] | None = None
- segments: list[object] | None = None
- lines: int = 0
- functions: int = 0
- methods: int = 0
- classes: int = 0
- stat: Mapping[str, int] | None = None
- error_kind: str | None = None
- file_metrics: object | None = None
- structural_findings: list[object] | None = None
-
-
-@dataclass(frozen=True, slots=True)
-class ChangedCloneGate:
- changed_paths: tuple[str, ...]
- new_func: frozenset[str]
- new_block: frozenset[str]
- total_clone_groups: int
- findings_total: int
- findings_new: int
- findings_known: int
-
-
-_as_mapping = _coerce.as_mapping
-_as_int = _coerce.as_int
-_as_sequence = _coerce.as_sequence
-
-
-def _validate_changed_scope_args(*, args: Namespace) -> str | None:
- if args.diff_against and args.paths_from_git_diff:
- console.print(
- ui.fmt_contract_error(
- "Use --diff-against or --paths-from-git-diff, not both."
- )
- )
- sys.exit(ExitCode.CONTRACT_ERROR)
- if args.paths_from_git_diff:
- args.changed_only = True
- return str(args.paths_from_git_diff)
- if args.diff_against and not args.changed_only:
- console.print(ui.fmt_contract_error("--diff-against requires --changed-only."))
- sys.exit(ExitCode.CONTRACT_ERROR)
- if args.changed_only and not args.diff_against:
- console.print(
- ui.fmt_contract_error(
- "--changed-only requires --diff-against or --paths-from-git-diff."
- )
- )
- sys.exit(ExitCode.CONTRACT_ERROR)
- return str(args.diff_against) if args.diff_against else None
-
-
-def _normalize_changed_paths(
- *,
- root_path: Path,
- paths: Sequence[str],
-) -> tuple[str, ...]:
- normalized: set[str] = set()
- for raw_path in paths:
- candidate = raw_path.strip()
- if not candidate:
- continue
- candidate_path = Path(candidate)
- try:
- absolute_path = (
- candidate_path.resolve()
- if candidate_path.is_absolute()
- else (root_path / candidate_path).resolve()
- )
- except OSError as exc:
- console.print(
- ui.fmt_contract_error(
- f"Unable to resolve changed path '{candidate}': {exc}"
- )
- )
- sys.exit(ExitCode.CONTRACT_ERROR)
- try:
- relative_path = absolute_path.relative_to(root_path)
- except ValueError:
- console.print(
- ui.fmt_contract_error(
- f"Changed path '{candidate}' is outside the scan root."
- )
- )
- sys.exit(ExitCode.CONTRACT_ERROR)
- cleaned = str(relative_path).replace("\\", "/").strip("/")
- if cleaned:
- normalized.add(cleaned)
- return tuple(sorted(normalized))
-
-
-def _git_diff_changed_paths(*, root_path: Path, git_diff_ref: str) -> tuple[str, ...]:
- try:
- validated_ref = validate_git_diff_ref(git_diff_ref)
- except ValueError as exc:
- console.print(ui.fmt_contract_error(str(exc)))
- sys.exit(ExitCode.CONTRACT_ERROR)
- try:
- completed = subprocess.run(
- ["git", "diff", "--name-only", validated_ref, "--"],
- cwd=str(root_path),
- check=True,
- capture_output=True,
- text=True,
- timeout=30,
- )
- except (
- FileNotFoundError,
- subprocess.CalledProcessError,
- subprocess.TimeoutExpired,
- ) as exc:
- console.print(
- ui.fmt_contract_error(
- "Unable to resolve changed files from git diff ref "
- f"'{validated_ref}': {exc}"
- )
- )
- sys.exit(ExitCode.CONTRACT_ERROR)
- lines = [line.strip() for line in completed.stdout.splitlines() if line.strip()]
- return _normalize_changed_paths(root_path=root_path, paths=lines)
-
-
-def _path_matches(relative_path: str, changed_paths: Sequence[str]) -> bool:
- return any(
- relative_path == candidate or relative_path.startswith(candidate + "/")
- for candidate in changed_paths
- )
-
-
-def _flatten_report_findings(
- report_document: Mapping[str, object],
-) -> list[dict[str, object]]:
- findings = _as_mapping(report_document.get("findings"))
- groups = _as_mapping(findings.get("groups"))
- clone_groups = _as_mapping(groups.get("clones"))
- return [
- *[
- dict(_as_mapping(item))
- for item in _as_sequence(clone_groups.get("functions"))
- ],
- *[dict(_as_mapping(item)) for item in _as_sequence(clone_groups.get("blocks"))],
- *[
- dict(_as_mapping(item))
- for item in _as_sequence(clone_groups.get("segments"))
- ],
- *[
- dict(_as_mapping(item))
- for item in _as_sequence(
- _as_mapping(groups.get("structural")).get("groups")
- )
- ],
- *[
- dict(_as_mapping(item))
- for item in _as_sequence(_as_mapping(groups.get("dead_code")).get("groups"))
- ],
- *[
- dict(_as_mapping(item))
- for item in _as_sequence(_as_mapping(groups.get("design")).get("groups"))
- ],
- ]
-
-
-def _finding_touches_changed_paths(
- finding: Mapping[str, object],
- *,
- changed_paths: Sequence[str],
-) -> bool:
- for item in _as_sequence(finding.get("items")):
- relative_path = str(_as_mapping(item).get("relative_path", "")).strip()
- if relative_path and _path_matches(relative_path, changed_paths):
- return True
- return False
-
-
-def _changed_clone_gate_from_report(
- report_document: Mapping[str, object],
- *,
- changed_paths: Sequence[str],
-) -> ChangedCloneGate:
- findings = [
- finding
- for finding in _flatten_report_findings(report_document)
- if _finding_touches_changed_paths(finding, changed_paths=changed_paths)
- ]
- clone_findings = [
- finding
- for finding in findings
- if str(finding.get("family", "")).strip() == "clone"
- and str(finding.get("category", "")).strip() in {"function", "block"}
- ]
- new_func = frozenset(
- str(finding.get("id", ""))
- for finding in clone_findings
- if str(finding.get("category", "")).strip() == "function"
- and str(finding.get("novelty", "")).strip() == "new"
- )
- new_block = frozenset(
- str(finding.get("id", ""))
- for finding in clone_findings
- if str(finding.get("category", "")).strip() == "block"
- and str(finding.get("novelty", "")).strip() == "new"
- )
- findings_new = sum(
- 1 for finding in findings if str(finding.get("novelty", "")).strip() == "new"
- )
- findings_known = sum(
- 1 for finding in findings if str(finding.get("novelty", "")).strip() == "known"
- )
- return ChangedCloneGate(
- changed_paths=tuple(changed_paths),
- new_func=new_func,
- new_block=new_block,
- total_clone_groups=len(clone_findings),
- findings_total=len(findings),
- findings_new=findings_new,
- findings_known=findings_known,
- )
-
-
-def process_file(
- filepath: str,
- root: str,
- cfg: NormalizationConfig,
- min_loc: int,
- min_stmt: int,
- collect_structural_findings: bool = True,
-) -> ProcessingResult:
- pipeline_mod = _pipeline_module()
- result = pipeline_mod.process_file(
- filepath,
- root,
- cfg,
- min_loc,
- min_stmt,
- collect_structural_findings,
- )
- return cast("ProcessingResult", result)
-
-
-def bootstrap(
- *,
- args: Namespace,
- root: Path,
- output_paths: PipelineOutputPaths | OutputPaths,
- cache_path: Path,
-) -> BootstrapResult:
- return cast(
- "BootstrapResult",
- _pipeline_module().bootstrap(
- args=args,
- root=root,
- output_paths=output_paths,
- cache_path=cache_path,
- ),
- )
-
-
-def discover(*, boot: BootstrapResult, cache: Cache) -> DiscoveryResult:
- return cast("DiscoveryResult", _pipeline_module().discover(boot=boot, cache=cache))
-
-
-def process(
- *,
- boot: BootstrapResult,
- discovery: DiscoveryResult,
- cache: Cache,
- on_advance: Callable[[], None] | None = None,
- on_worker_error: Callable[[str], None] | None = None,
- on_parallel_fallback: Callable[[Exception], None] | None = None,
-) -> PipelineProcessingResult:
- return cast(
- "PipelineProcessingResult",
- _pipeline_module().process(
- boot=boot,
- discovery=discovery,
- cache=cache,
- on_advance=on_advance,
- on_worker_error=on_worker_error,
- on_parallel_fallback=on_parallel_fallback,
- ),
- )
-
-
-def analyze(
- *,
- boot: BootstrapResult,
- discovery: DiscoveryResult,
- processing: PipelineProcessingResult,
-) -> AnalysisResult:
- return cast(
- "AnalysisResult",
- _pipeline_module().analyze(
- boot=boot,
- discovery=discovery,
- processing=processing,
- ),
- )
-
-
-def report(
- *,
- boot: BootstrapResult,
- discovery: DiscoveryResult,
- processing: PipelineProcessingResult,
- analysis: AnalysisResult,
- report_meta: Mapping[str, object],
- new_func: set[str],
- new_block: set[str],
- html_builder: Callable[..., str] | None = None,
- metrics_diff: MetricsDiff | None = None,
- coverage_adoption_diff_available: bool = False,
- api_surface_diff_available: bool = False,
- include_report_document: bool = False,
-) -> ReportArtifacts:
- return cast(
- "ReportArtifacts",
- _pipeline_module().report(
- boot=boot,
- discovery=discovery,
- processing=processing,
- analysis=analysis,
- report_meta=report_meta,
- new_func=new_func,
- new_block=new_block,
- html_builder=html_builder,
- metrics_diff=metrics_diff,
- coverage_adoption_diff_available=coverage_adoption_diff_available,
- api_surface_diff_available=api_surface_diff_available,
- include_report_document=include_report_document,
- ),
- )
-
-
-def gate(
- *,
- boot: BootstrapResult,
- analysis: AnalysisResult,
- new_func: set[str],
- new_block: set[str],
- metrics_diff: MetricsDiff | None,
-) -> GatingResult:
- return cast(
- "GatingResult",
- _pipeline_module().gate(
- boot=boot,
- analysis=analysis,
- new_func=new_func,
- new_block=new_block,
- metrics_diff=metrics_diff,
- ),
- )
-
-
-class _PrinterLike(Protocol):
- def print(self, *objects: object, **kwargs: object) -> None: ...
-
-
-LEGACY_CACHE_PATH = Path("~/.cache/codeclone/cache.json").expanduser()
-ReportPathOrigin = Literal["default", "explicit"]
-
-
-def _rich_progress_symbols() -> tuple[
- type[RichProgress],
- type[RichSpinnerColumn],
- type[RichTextColumn],
- type[RichBarColumn],
- type[RichTimeElapsedColumn],
-]:
- return _rich_progress_symbols_impl()
-
-
-def _make_console(*, no_color: bool) -> RichConsole:
- return _make_rich_console(
- no_color=no_color,
- width=ui.CLI_LAYOUT_MAX_WIDTH,
- )
-
-
-def _print_verbose_clone_hashes(
- console: _PrinterLike,
- *,
- label: str,
- clone_hashes: set[str],
-) -> None:
- if not clone_hashes:
- return
- console.print(f"\n {label}:")
- for clone_hash in sorted(clone_hashes):
- console.print(f" - {clone_hash}")
-
-
-def _make_plain_console() -> _PlainConsole:
- return _make_plain_console_impl()
-
-
-console: RichConsole | _PlainConsole = _make_plain_console()
-
-
-def _parse_metric_reason_entry(reason: str) -> tuple[str, str]:
- return _parse_metric_reason_entry_impl(reason)
-
-
-def _print_gating_failure_block(
- *,
- code: str,
- entries: Sequence[tuple[str, object]],
- args: Namespace,
-) -> None:
- _print_gating_failure_block_impl(
- console=cast("_PrinterLike", console),
- code=code,
- entries=list(entries),
- args=cast("_GatingArgsLike", cast(object, args)),
- )
-
-
-def build_html_report(*args: object, **kwargs: object) -> str:
- # Lazy import avoids pulling HTML renderer in non-HTML CLI runs.
- from .html_report import build_html_report as _build_html_report
-
- html_builder: Callable[..., str] = _build_html_report
- return html_builder(*args, **kwargs)
-
-
-_CloneBaselineState = _CloneBaselineStateImpl
-_MetricsBaselineState = _MetricsBaselineStateImpl
-_MetricsBaselineSectionProbe = _MetricsBaselineSectionProbeImpl
-
-
-def print_banner(*, root: Path | None = None) -> None:
- _print_banner_impl(
- console=cast("_PrinterLike", console),
- banner_title=ui.banner_title(__version__),
- project_name=(root.name if root is not None else None),
- root_display=(str(root) if root is not None else None),
- )
-
-
-def _is_debug_enabled(
- *,
- argv: Sequence[str] | None = None,
- environ: Mapping[str, str] | None = None,
-) -> bool:
- args = list(sys.argv[1:] if argv is None else argv)
- debug_from_flag = any(arg == "--debug" for arg in args)
- env = os.environ if environ is None else environ
- debug_from_env = env.get("CODECLONE_DEBUG") == "1"
- return debug_from_flag or debug_from_env
-
-
-def _report_path_origins(argv: Sequence[str]) -> dict[str, ReportPathOrigin | None]:
- origins: dict[str, ReportPathOrigin | None] = {
- "html": None,
- "json": None,
- "md": None,
- "sarif": None,
- "text": None,
- }
- flag_to_field = {
- "--html": "html",
- "--json": "json",
- "--md": "md",
- "--sarif": "sarif",
- "--text": "text",
- }
- index = 0
- while index < len(argv):
- token = argv[index]
- if token == "--":
- break
- if "=" in token:
- flag, _value = token.split("=", maxsplit=1)
- field_name = flag_to_field.get(flag)
- if field_name is not None:
- origins[field_name] = "explicit"
- index += 1
- continue
- field_name = flag_to_field.get(token)
- if field_name is None:
- index += 1
- continue
- next_token = argv[index + 1] if index + 1 < len(argv) else None
- if next_token is None or next_token.startswith("-"):
- origins[field_name] = "default"
- index += 1
- continue
- origins[field_name] = "explicit"
- index += 2
- return origins
-
-
-def _report_path_timestamp_slug(report_generated_at_utc: str) -> str:
- return report_generated_at_utc.replace("-", "").replace(":", "")
-
-
-def _timestamped_report_path(path: Path, *, report_generated_at_utc: str) -> Path:
- suffix = path.suffix
- stem = path.name[: -len(suffix)] if suffix else path.name
- return path.with_name(
- f"{stem}-{_report_path_timestamp_slug(report_generated_at_utc)}{suffix}"
- )
-
-
-def _resolve_output_paths(
- args: Namespace,
- *,
- report_path_origins: Mapping[str, ReportPathOrigin | None],
- report_generated_at_utc: str,
-) -> OutputPaths:
- printer = cast("_PrinterLike", console)
- resolved: dict[str, Path | None] = {
- "html": None,
- "json": None,
- "md": None,
- "sarif": None,
- "text": None,
- }
- output_specs = (
- ("html", "html_out", ".html", "HTML"),
- ("json", "json_out", ".json", "JSON"),
- ("md", "md_out", ".md", "Markdown"),
- ("sarif", "sarif_out", ".sarif", "SARIF"),
- ("text", "text_out", ".txt", "text"),
- )
-
- for field_name, arg_name, expected_suffix, label in output_specs:
- raw_value = getattr(args, arg_name, None)
- if not raw_value:
- continue
- path = _validate_output_path(
- raw_value,
- expected_suffix=expected_suffix,
- label=label,
- console=printer,
- invalid_message=ui.fmt_invalid_output_extension,
- invalid_path_message=ui.fmt_invalid_output_path,
- )
- if (
- args.timestamped_report_paths
- and report_path_origins.get(field_name) == "default"
- ):
- path = _timestamped_report_path(
- path,
- report_generated_at_utc=report_generated_at_utc,
- )
- resolved[field_name] = path
-
- return OutputPaths(
- html=resolved["html"],
- json=resolved["json"],
- text=resolved["text"],
- md=resolved["md"],
- sarif=resolved["sarif"],
- )
-
-
-def _validate_report_ui_flags(*, args: Namespace, output_paths: OutputPaths) -> None:
- if args.open_html_report and output_paths.html is None:
- console.print(ui.fmt_contract_error(ui.ERR_OPEN_HTML_REPORT_REQUIRES_HTML))
- sys.exit(ExitCode.CONTRACT_ERROR)
-
- if args.timestamped_report_paths and not any(
- (
- output_paths.html,
- output_paths.json,
- output_paths.md,
- output_paths.sarif,
- output_paths.text,
- )
- ):
- console.print(
- ui.fmt_contract_error(ui.ERR_TIMESTAMPED_REPORT_PATHS_REQUIRES_REPORT)
- )
- sys.exit(ExitCode.CONTRACT_ERROR)
-
-
-def _resolve_cache_path(*, root_path: Path, args: Namespace, from_args: bool) -> Path:
- return _resolve_cache_path_impl(
- root_path=root_path,
- args=cast("_RuntimeArgsLike", cast(object, args)),
- from_args=from_args,
- legacy_cache_path=LEGACY_CACHE_PATH,
- console=cast("_PrinterLike", console),
- )
-
-
-def _validate_numeric_args(args: Namespace) -> bool:
- return _validate_numeric_args_impl(cast("_RuntimeArgsLike", cast(object, args)))
-
-
-def _configure_metrics_mode(*, args: Namespace, metrics_baseline_exists: bool) -> None:
- _configure_metrics_mode_impl(
- args=cast("_RuntimeArgsLike", cast(object, args)),
- metrics_baseline_exists=metrics_baseline_exists,
- console=cast("_PrinterLike", console),
- )
-
-
-def _print_failed_files(failed_files: Sequence[str]) -> None:
- _print_failed_files_impl(
- failed_files=tuple(failed_files),
- console=cast("_PrinterLike", console),
- )
-
-
-def _metrics_computed(args: Namespace) -> tuple[str, ...]:
- return _metrics_computed_impl(cast("_RuntimeArgsLike", cast(object, args)))
-
-
-def _probe_metrics_baseline_section(path: Path) -> _MetricsBaselineSectionProbe:
- return _probe_metrics_baseline_section_impl(path)
-
-
-def _resolve_clone_baseline_state(
- *,
- args: Namespace,
- baseline_path: Path,
- baseline_exists: bool,
- analysis: AnalysisResult,
- shared_baseline_payload: dict[str, object] | None = None,
-) -> _CloneBaselineState:
- return _resolve_clone_baseline_state_impl(
- args=cast("_BaselineArgsLike", cast(object, args)),
- baseline_path=baseline_path,
- baseline_exists=baseline_exists,
- func_groups=analysis.func_groups,
- block_groups=analysis.block_groups,
- codeclone_version=__version__,
- console=cast("_PrinterLike", console),
- shared_baseline_payload=shared_baseline_payload,
- )
-
-
-def _resolve_metrics_baseline_state(
- *,
- args: Namespace,
- metrics_baseline_path: Path,
- metrics_baseline_exists: bool,
- baseline_updated_path: Path | None,
- analysis: AnalysisResult,
- shared_baseline_payload: dict[str, object] | None = None,
-) -> _MetricsBaselineState:
- return _resolve_metrics_baseline_state_impl(
- args=cast("_BaselineArgsLike", cast(object, args)),
- metrics_baseline_path=metrics_baseline_path,
- metrics_baseline_exists=metrics_baseline_exists,
- baseline_updated_path=baseline_updated_path,
- project_metrics=analysis.project_metrics,
- console=cast("_PrinterLike", console),
- shared_baseline_payload=shared_baseline_payload,
- )
-
-
-def _resolve_cache_status(cache: Cache) -> tuple[CacheStatus, str | None]:
- return _resolve_cache_status_impl(cache)
-
-
-def _cache_update_segment_projection(cache: Cache, analysis: AnalysisResult) -> None:
- if not hasattr(cache, "segment_report_projection"):
- return
- new_projection = build_segment_report_projection(
- digest=analysis.segment_groups_raw_digest,
- suppressed=analysis.suppressed_segment_groups,
- groups=analysis.segment_groups,
- )
- if new_projection != cache.segment_report_projection:
- cache.segment_report_projection = new_projection
- cache._dirty = True
-
-
-def _run_analysis_stages(
- *,
- args: Namespace,
- boot: BootstrapResult,
- cache: Cache,
-) -> tuple[DiscoveryResult, PipelineProcessingResult, AnalysisResult]:
- def _require_rich_console(
- value: RichConsole | _PlainConsole,
- ) -> RichConsole:
- if isinstance(value, _PlainConsole):
- raise RuntimeError("Rich console is required when progress UI is enabled.")
- return value
-
- use_status = not args.quiet and not args.no_progress
- try:
- if use_status:
- with console.status(ui.STATUS_DISCOVERING, spinner="dots"):
- discovery_result = discover(boot=boot, cache=cache)
- else:
- discovery_result = discover(boot=boot, cache=cache)
- except OSError as exc:
- console.print(ui.fmt_contract_error(ui.ERR_SCAN_FAILED.format(error=exc)))
- sys.exit(ExitCode.CONTRACT_ERROR)
-
- for warning in discovery_result.skipped_warnings:
- console.print(f"[warning]{warning}[/warning]")
-
- total_files = len(discovery_result.files_to_process)
- if total_files > 0 and not args.quiet and args.no_progress:
- console.print(ui.fmt_processing_changed(total_files))
-
- if total_files > 0 and not args.no_progress:
- (
- progress_cls,
- spinner_column_cls,
- text_column_cls,
- bar_column_cls,
- time_elapsed_column_cls,
- ) = _rich_progress_symbols()
-
- with progress_cls(
- spinner_column_cls(),
- text_column_cls("[progress.description]{task.description}"),
- bar_column_cls(),
- text_column_cls("[progress.percentage]{task.percentage:>3.0f}%"),
- time_elapsed_column_cls(),
- console=_require_rich_console(console),
- ) as progress_ui:
- task_id = progress_ui.add_task(
- f"Analyzing {total_files} files...",
- total=total_files,
- )
- processing_result = process(
- boot=boot,
- discovery=discovery_result,
- cache=cache,
- on_advance=lambda: progress_ui.advance(task_id),
- on_worker_error=lambda reason: console.print(
- ui.fmt_worker_failed(reason)
- ),
- on_parallel_fallback=lambda exc: console.print(
- ui.fmt_parallel_fallback(exc)
- ),
- )
- else:
- processing_result = process(
- boot=boot,
- discovery=discovery_result,
- cache=cache,
- on_worker_error=(
- (lambda reason: console.print(ui.fmt_batch_item_failed(reason)))
- if args.no_progress
- else (lambda reason: console.print(ui.fmt_worker_failed(reason)))
- ),
- on_parallel_fallback=lambda exc: console.print(
- ui.fmt_parallel_fallback(exc)
- ),
- )
-
- _print_failed_files(processing_result.failed_files)
- # Keep unreadable-source diagnostics visible in normal mode even if
- # failed_files was filtered/empty due upstream transport differences.
- if not processing_result.failed_files and processing_result.source_read_failures:
- _print_failed_files(processing_result.source_read_failures)
-
- if use_status:
- with console.status(ui.STATUS_GROUPING, spinner="dots"):
- analysis_result = analyze(
- boot=boot,
- discovery=discovery_result,
- processing=processing_result,
- )
- _cache_update_segment_projection(cache, analysis_result)
- try:
- cache.save()
- except CacheError as exc:
- console.print(ui.fmt_cache_save_failed(exc))
- else:
- analysis_result = analyze(
- boot=boot,
- discovery=discovery_result,
- processing=processing_result,
- )
- _cache_update_segment_projection(cache, analysis_result)
- try:
- cache.save()
- except CacheError as exc:
- console.print(ui.fmt_cache_save_failed(exc))
-
- coverage_join = getattr(analysis_result, "coverage_join", None)
- if (
- coverage_join is not None
- and coverage_join.status != "ok"
- and coverage_join.invalid_reason
- ):
- console.print(ui.fmt_coverage_join_ignored(coverage_join.invalid_reason))
-
- return discovery_result, processing_result, analysis_result
-
-
-def _write_report_outputs(
- *,
- args: Namespace,
- output_paths: OutputPaths,
- report_artifacts: ReportArtifacts,
- open_html_report: bool = False,
-) -> str | None:
- return _write_report_outputs_impl(
- args=cast("_QuietArgsLike", cast(object, args)),
- output_paths=output_paths,
- report_artifacts=report_artifacts,
- console=cast("_PrinterLike", console),
- open_html_report=open_html_report,
- )
-
-
-def _enforce_gating(
- *,
- args: Namespace,
- boot: BootstrapResult,
- analysis: AnalysisResult,
- processing: PipelineProcessingResult,
- source_read_contract_failure: bool,
- baseline_failure_code: ExitCode | None,
- metrics_baseline_failure_code: ExitCode | None,
- new_func: set[str],
- new_block: set[str],
- metrics_diff: MetricsDiff | None,
- html_report_path: str | None,
- clone_threshold_total: int | None = None,
-) -> None:
- if source_read_contract_failure:
- console.print(
- ui.fmt_contract_error(
- ui.fmt_unreadable_source_in_gating(
- count=len(processing.source_read_failures)
- )
- )
- )
- for failure in processing.source_read_failures[:10]:
- console.print(f" • {failure}")
- if len(processing.source_read_failures) > 10:
- console.print(f" ... and {len(processing.source_read_failures) - 10} more")
- sys.exit(ExitCode.CONTRACT_ERROR)
-
- if baseline_failure_code is not None:
- console.print(ui.fmt_contract_error(ui.ERR_BASELINE_GATING_REQUIRES_TRUSTED))
- sys.exit(baseline_failure_code)
-
- if metrics_baseline_failure_code is not None:
- console.print(
- ui.fmt_contract_error(
- "Metrics baseline is untrusted or missing for requested metrics gating."
- )
- )
- sys.exit(metrics_baseline_failure_code)
-
- if bool(getattr(args, "fail_on_untested_hotspots", False)):
- if analysis.coverage_join is None:
- console.print(
- ui.fmt_contract_error(
- "--fail-on-untested-hotspots requires --coverage."
- )
- )
- sys.exit(ExitCode.CONTRACT_ERROR)
- if analysis.coverage_join.status != "ok":
- detail = analysis.coverage_join.invalid_reason or "invalid coverage input"
- console.print(
- ui.fmt_contract_error(
- "Coverage gating requires a valid Cobertura XML input.\n"
- f"Reason: {detail}"
- )
- )
- sys.exit(ExitCode.CONTRACT_ERROR)
-
- gate_result = gate(
- boot=boot,
- analysis=analysis,
- new_func=new_func,
- new_block=new_block,
- metrics_diff=metrics_diff,
- )
- if clone_threshold_total is not None:
- reasons = [
- reason
- for reason in gate_result.reasons
- if not reason.startswith("clone:threshold:")
- ]
- if 0 <= args.fail_threshold < clone_threshold_total:
- reasons.append(
- f"clone:threshold:{clone_threshold_total}:{args.fail_threshold}"
- )
- gate_result = cast(
- "GatingResult",
- _pipeline_module().GatingResult(
- exit_code=(
- int(ExitCode.GATING_FAILURE) if reasons else int(ExitCode.SUCCESS)
- ),
- reasons=tuple(reasons),
- ),
- )
-
- metric_reasons = [
- reason[len("metric:") :]
- for reason in gate_result.reasons
- if reason.startswith("metric:")
- ]
- if metric_reasons:
- _print_gating_failure_block(
- code="metrics",
- entries=[_parse_metric_reason_entry(reason) for reason in metric_reasons],
- args=args,
- )
- sys.exit(ExitCode.GATING_FAILURE)
-
- if "clone:new" in gate_result.reasons:
- default_report = Path(".cache/codeclone/report.html")
- resolved_html_report_path = html_report_path
- if resolved_html_report_path is None and default_report.exists():
- resolved_html_report_path = str(default_report)
-
- clone_entries: list[tuple[str, object]] = [
- ("new_function_clone_groups", len(new_func)),
- ("new_block_clone_groups", len(new_block)),
- ]
- if resolved_html_report_path:
- clone_entries.append(("report", resolved_html_report_path))
- clone_entries.append(("accept", "codeclone . --update-baseline"))
- _print_gating_failure_block(
- code="new-clones",
- entries=clone_entries,
- args=args,
- )
-
- if args.verbose:
- _print_verbose_clone_hashes(
- cast("_PrinterLike", console),
- label="Function clone hashes",
- clone_hashes=new_func,
- )
- _print_verbose_clone_hashes(
- cast("_PrinterLike", console),
- label="Block clone hashes",
- clone_hashes=new_block,
- )
-
- sys.exit(ExitCode.GATING_FAILURE)
-
- threshold_reason = next(
- (
- reason
- for reason in gate_result.reasons
- if reason.startswith("clone:threshold:")
- ),
- None,
- )
- if threshold_reason is not None:
- _, _, total_raw, threshold_raw = threshold_reason.split(":", maxsplit=3)
- total = int(total_raw)
- threshold = int(threshold_raw)
- _print_gating_failure_block(
- code="threshold",
- entries=(
- ("clone_groups_total", total),
- ("clone_groups_limit", threshold),
- ),
- args=args,
- )
- sys.exit(ExitCode.GATING_FAILURE)
-
-
-def _main_impl() -> None:
- global console
-
- run_started_at = time.monotonic()
- from ._cli_meta import _build_report_meta, _current_report_timestamp_utc
-
- analysis_started_at_utc = _current_report_timestamp_utc()
- ap = build_parser(__version__)
-
- def _resolve_runtime_path_arg(
- *,
- root_path: Path,
- raw_path: str,
- from_cli: bool,
- ) -> Path:
- candidate_path = Path(raw_path).expanduser()
- if from_cli or candidate_path.is_absolute():
- return candidate_path.resolve()
- return (root_path / candidate_path).resolve()
-
- def _prepare_run_inputs() -> tuple[
- Namespace,
- Path,
- Path,
- bool,
- Path,
- bool,
- OutputPaths,
- Path,
- dict[str, object] | None,
- tuple[str, ...],
- str,
- str,
- ]:
- global console
- raw_argv = tuple(sys.argv[1:])
- explicit_cli_dests = collect_explicit_cli_dests(ap, argv=raw_argv)
- report_path_origins = _report_path_origins(raw_argv)
- report_generated_at_utc = _current_report_timestamp_utc()
- cache_path_from_args = any(
- arg in {"--cache-dir", "--cache-path"}
- or arg.startswith(("--cache-dir=", "--cache-path="))
- for arg in sys.argv
- )
- baseline_path_from_args = any(
- arg == "--baseline" or arg.startswith("--baseline=") for arg in sys.argv
- )
- metrics_path_from_args = any(
- arg == "--metrics-baseline" or arg.startswith("--metrics-baseline=")
- for arg in sys.argv
- )
- args = ap.parse_args()
-
- try:
- root_path = Path(args.root).resolve()
- if not root_path.exists():
- console.print(
- ui.fmt_contract_error(ui.ERR_ROOT_NOT_FOUND.format(path=root_path))
- )
- sys.exit(ExitCode.CONTRACT_ERROR)
- except OSError as exc:
- console.print(
- ui.fmt_contract_error(ui.ERR_INVALID_ROOT_PATH.format(error=exc))
- )
- sys.exit(ExitCode.CONTRACT_ERROR)
-
- try:
- pyproject_config = load_pyproject_config(root_path)
- except ConfigValidationError as exc:
- console.print(ui.fmt_contract_error(str(exc)))
- sys.exit(ExitCode.CONTRACT_ERROR)
- apply_pyproject_config_overrides(
- args=args,
- config_values=pyproject_config,
- explicit_cli_dests=explicit_cli_dests,
- )
- git_diff_ref = _validate_changed_scope_args(args=args)
- changed_paths = (
- _git_diff_changed_paths(root_path=root_path, git_diff_ref=git_diff_ref)
- if git_diff_ref is not None
- else ()
- )
- if args.debug:
- os.environ["CODECLONE_DEBUG"] = "1"
-
- if args.ci:
- args.fail_on_new = True
- args.no_color = True
- args.quiet = True
-
- console = (
- _make_plain_console()
- if args.quiet
- else _make_console(no_color=args.no_color)
- )
-
- if not _validate_numeric_args(args):
- console.print(
- ui.fmt_contract_error(
- "Size limits must be non-negative integers (MB), "
- "threshold flags must be >= 0 or -1, and coverage thresholds "
- "must be between 0 and 100."
- )
- )
- sys.exit(ExitCode.CONTRACT_ERROR)
-
- baseline_arg_path = Path(args.baseline).expanduser()
- try:
- baseline_path = _resolve_runtime_path_arg(
- root_path=root_path,
- raw_path=args.baseline,
- from_cli=baseline_path_from_args,
- )
- baseline_exists = baseline_path.exists()
- except OSError as exc:
- console.print(
- ui.fmt_contract_error(
- ui.fmt_invalid_baseline_path(path=baseline_arg_path, error=exc)
- )
- )
- sys.exit(ExitCode.CONTRACT_ERROR)
-
- shared_baseline_payload: dict[str, object] | None = None
- default_metrics_baseline = ap.get_default("metrics_baseline")
- metrics_path_overridden = metrics_path_from_args or (
- args.metrics_baseline != default_metrics_baseline
- )
- metrics_baseline_arg_path = Path(
- args.metrics_baseline if metrics_path_overridden else args.baseline
- ).expanduser()
- try:
- metrics_baseline_path = _resolve_runtime_path_arg(
- root_path=root_path,
- raw_path=(
- args.metrics_baseline if metrics_path_overridden else args.baseline
- ),
- from_cli=metrics_path_from_args,
- )
- if metrics_baseline_path == baseline_path:
- probe = _probe_metrics_baseline_section(metrics_baseline_path)
- metrics_baseline_exists = probe.has_metrics_section
- shared_baseline_payload = probe.payload
- else:
- metrics_baseline_exists = metrics_baseline_path.exists()
- except OSError as exc:
- console.print(
- ui.fmt_contract_error(
- ui.fmt_invalid_baseline_path(
- path=metrics_baseline_arg_path,
- error=exc,
- )
- )
- )
- sys.exit(ExitCode.CONTRACT_ERROR)
-
- if (
- args.update_baseline
- and not args.skip_metrics
- and not args.update_metrics_baseline
- ):
- args.update_metrics_baseline = True
- _configure_metrics_mode(
- args=args,
- metrics_baseline_exists=metrics_baseline_exists,
- )
- if (
- args.update_metrics_baseline
- and metrics_baseline_path == baseline_path
- and not baseline_exists
- and not args.update_baseline
- ):
- # Unified baseline needs clone payload before metrics can be embedded.
- args.update_baseline = True
-
- if args.quiet:
- args.no_progress = True
-
- if not args.quiet:
- print_banner(root=root_path)
-
- output_paths = _resolve_output_paths(
- args,
- report_path_origins=report_path_origins,
- report_generated_at_utc=report_generated_at_utc,
- )
- _validate_report_ui_flags(args=args, output_paths=output_paths)
- cache_path = _resolve_cache_path(
- root_path=root_path,
- args=args,
- from_args=cache_path_from_args,
- )
- return (
- args,
- root_path,
- baseline_path,
- baseline_exists,
- metrics_baseline_path,
- metrics_baseline_exists,
- output_paths,
- cache_path,
- shared_baseline_payload,
- changed_paths,
- analysis_started_at_utc,
- report_generated_at_utc,
- )
-
- (
- args,
- root_path,
- baseline_path,
- baseline_exists,
- metrics_baseline_path,
- metrics_baseline_exists,
- output_paths,
- cache_path,
- shared_baseline_payload,
- changed_paths,
- analysis_started_at_utc,
- report_generated_at_utc,
- ) = _prepare_run_inputs()
-
- cache = Cache(
- cache_path,
- root=root_path,
- max_size_bytes=args.max_cache_size_mb * 1024 * 1024,
- min_loc=args.min_loc,
- min_stmt=args.min_stmt,
- block_min_loc=args.block_min_loc,
- block_min_stmt=args.block_min_stmt,
- segment_min_loc=args.segment_min_loc,
- segment_min_stmt=args.segment_min_stmt,
- collect_api_surface=bool(args.api_surface),
- )
- cache.load()
- if cache.load_warning:
- console.print(f"[warning]{cache.load_warning}[/warning]")
-
- boot = bootstrap(
- args=args,
- root=root_path,
- output_paths=output_paths,
- cache_path=cache_path,
- )
- discovery_result, processing_result, analysis_result = _run_analysis_stages(
- args=args,
- boot=boot,
- cache=cache,
- )
-
- gating_mode = (
- args.fail_on_new
- or args.fail_threshold >= 0
- or args.fail_complexity >= 0
- or args.fail_coupling >= 0
- or args.fail_cohesion >= 0
- or args.fail_cycles
- or args.fail_dead_code
- or args.fail_health >= 0
- or args.fail_on_new_metrics
- or args.fail_on_typing_regression
- or args.fail_on_docstring_regression
- or args.fail_on_api_break
- or args.min_typing_coverage >= 0
- or args.min_docstring_coverage >= 0
- )
- source_read_contract_failure = (
- bool(processing_result.source_read_failures)
- and gating_mode
- and not args.update_baseline
- )
- baseline_state = _resolve_clone_baseline_state(
- args=args,
- baseline_path=baseline_path,
- baseline_exists=baseline_exists,
- analysis=analysis_result,
- shared_baseline_payload=(
- shared_baseline_payload if metrics_baseline_path == baseline_path else None
- ),
- )
- metrics_baseline_state = _resolve_metrics_baseline_state(
- args=args,
- metrics_baseline_path=metrics_baseline_path,
- metrics_baseline_exists=metrics_baseline_exists,
- baseline_updated_path=baseline_state.updated_path,
- analysis=analysis_result,
- shared_baseline_payload=(
- shared_baseline_payload if metrics_baseline_path == baseline_path else None
- ),
- )
-
- try:
- report_cache_path = cache_path.resolve()
- except OSError:
- report_cache_path = cache_path
-
- cache_status, cache_schema_version = _resolve_cache_status(cache)
-
- report_meta = _build_report_meta(
- codeclone_version=__version__,
- scan_root=root_path,
- baseline_path=baseline_path,
- baseline=baseline_state.baseline,
- baseline_loaded=baseline_state.loaded,
- baseline_status=baseline_state.status.value,
- cache_path=report_cache_path,
- cache_used=cache_status == CacheStatus.OK,
- cache_status=cache_status.value,
- cache_schema_version=cache_schema_version,
- files_skipped_source_io=len(processing_result.source_read_failures),
- metrics_baseline_path=metrics_baseline_path,
- metrics_baseline=metrics_baseline_state.baseline,
- metrics_baseline_loaded=metrics_baseline_state.loaded,
- metrics_baseline_status=metrics_baseline_state.status.value,
- health_score=(
- analysis_result.project_metrics.health.total
- if analysis_result.project_metrics
- else None
- ),
- health_grade=(
- analysis_result.project_metrics.health.grade
- if analysis_result.project_metrics
- else None
- ),
- analysis_mode=("clones_only" if args.skip_metrics else "full"),
- metrics_computed=_metrics_computed(args),
- min_loc=args.min_loc,
- min_stmt=args.min_stmt,
- block_min_loc=args.block_min_loc,
- block_min_stmt=args.block_min_stmt,
- segment_min_loc=args.segment_min_loc,
- segment_min_stmt=args.segment_min_stmt,
- design_complexity_threshold=DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD,
- design_coupling_threshold=DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD,
- design_cohesion_threshold=DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD,
- analysis_started_at_utc=analysis_started_at_utc,
- report_generated_at_utc=report_generated_at_utc,
- )
-
- baseline_for_diff = (
- baseline_state.baseline
- if baseline_state.trusted_for_diff
- else Baseline(baseline_path)
- )
- new_func, new_block = baseline_for_diff.diff(
- analysis_result.func_groups,
- analysis_result.block_groups,
- )
- new_clones_count = len(new_func) + len(new_block)
-
- metrics_diff: MetricsDiff | None = None
- if (
- analysis_result.project_metrics is not None
- and metrics_baseline_state.trusted_for_diff
- ):
- metrics_diff = metrics_baseline_state.baseline.diff(
- analysis_result.project_metrics
- )
- coverage_adoption_diff_available = bool(
- metrics_baseline_state.trusted_for_diff
- and getattr(
- metrics_baseline_state.baseline,
- "has_coverage_adoption_snapshot",
- False,
- )
- )
- api_surface_diff_available = bool(
- metrics_baseline_state.trusted_for_diff
- and getattr(metrics_baseline_state.baseline, "api_surface_snapshot", None)
- is not None
- )
-
- _print_summary(
- console=cast("_PrinterLike", console),
- quiet=args.quiet,
- files_found=discovery_result.files_found,
- files_analyzed=processing_result.files_analyzed,
- cache_hits=discovery_result.cache_hits,
- files_skipped=processing_result.files_skipped,
- analyzed_lines=(
- processing_result.analyzed_lines
- + int(getattr(discovery_result, "cached_lines", 0))
- ),
- analyzed_functions=(
- processing_result.analyzed_functions
- + int(getattr(discovery_result, "cached_functions", 0))
- ),
- analyzed_methods=(
- processing_result.analyzed_methods
- + int(getattr(discovery_result, "cached_methods", 0))
- ),
- analyzed_classes=(
- processing_result.analyzed_classes
- + int(getattr(discovery_result, "cached_classes", 0))
- ),
- func_clones_count=analysis_result.func_clones_count,
- block_clones_count=analysis_result.block_clones_count,
- segment_clones_count=analysis_result.segment_clones_count,
- suppressed_golden_fixture_groups=len(
- getattr(analysis_result, "suppressed_clone_groups", ())
- ),
- suppressed_segment_groups=analysis_result.suppressed_segment_groups,
- new_clones_count=new_clones_count,
- )
-
- if analysis_result.project_metrics is not None:
- pm = analysis_result.project_metrics
- metrics_payload_map = _as_mapping(analysis_result.metrics_payload)
- overloaded_modules_summary = _as_mapping(
- _as_mapping(metrics_payload_map.get("overloaded_modules")).get("summary")
- )
- adoption_summary = _as_mapping(
- _as_mapping(metrics_payload_map.get("coverage_adoption")).get("summary")
- )
- api_surface_summary = _as_mapping(
- _as_mapping(metrics_payload_map.get("api_surface")).get("summary")
- )
- coverage_join_summary = _as_mapping(
- _as_mapping(metrics_payload_map.get("coverage_join")).get("summary")
- )
- overloaded_modules_summary_map = _as_mapping(overloaded_modules_summary)
- coverage_join_source = str(coverage_join_summary.get("source", "")).strip()
- _print_metrics(
- console=cast("_PrinterLike", console),
- quiet=args.quiet,
- metrics=MetricsSnapshot(
- complexity_avg=pm.complexity_avg,
- complexity_max=pm.complexity_max,
- high_risk_count=len(pm.high_risk_functions),
- coupling_avg=pm.coupling_avg,
- coupling_max=pm.coupling_max,
- cohesion_avg=pm.cohesion_avg,
- cohesion_max=pm.cohesion_max,
- cycles_count=len(pm.dependency_cycles),
- dead_code_count=len(pm.dead_code),
- health_total=pm.health.total,
- health_grade=pm.health.grade,
- suppressed_dead_code_count=analysis_result.suppressed_dead_code_items,
- overloaded_modules_candidates=_as_int(
- overloaded_modules_summary_map.get("candidates")
- ),
- overloaded_modules_total=_as_int(
- overloaded_modules_summary_map.get("total")
- ),
- overloaded_modules_population_status=str(
- overloaded_modules_summary_map.get("population_status", "")
- ),
- overloaded_modules_top_score=_coerce.as_float(
- overloaded_modules_summary_map.get("top_score")
- ),
- adoption_param_permille=(
- _as_int(adoption_summary.get("param_permille"))
- if adoption_summary
- else None
- ),
- adoption_return_permille=(
- _as_int(adoption_summary.get("return_permille"))
- if adoption_summary
- else None
- ),
- adoption_docstring_permille=(
- _as_int(adoption_summary.get("docstring_permille"))
- if adoption_summary
- else None
- ),
- adoption_any_annotation_count=_as_int(
- adoption_summary.get("typing_any_count")
- ),
- api_surface_enabled=bool(api_surface_summary.get("enabled")),
- api_surface_modules=_as_int(api_surface_summary.get("modules")),
- api_surface_public_symbols=_as_int(
- api_surface_summary.get("public_symbols")
- ),
- api_surface_added=(
- len(metrics_diff.new_api_symbols)
- if metrics_diff is not None and api_surface_diff_available
- else 0
- ),
- api_surface_breaking=(
- len(metrics_diff.new_api_breaking_changes)
- if metrics_diff is not None and api_surface_diff_available
- else 0
- ),
- coverage_join_status=str(
- coverage_join_summary.get("status", "")
- ).strip(),
- coverage_join_overall_permille=_as_int(
- coverage_join_summary.get("overall_permille")
- ),
- coverage_join_coverage_hotspots=_as_int(
- coverage_join_summary.get("coverage_hotspots")
- ),
- coverage_join_scope_gap_hotspots=_as_int(
- coverage_join_summary.get("scope_gap_hotspots")
- ),
- coverage_join_threshold_percent=_as_int(
- coverage_join_summary.get("hotspot_threshold_percent")
- ),
- coverage_join_source_label=(
- Path(coverage_join_source).name if coverage_join_source else ""
- ),
- ),
- )
-
- report_artifacts = report(
- boot=boot,
- discovery=discovery_result,
- processing=processing_result,
- analysis=analysis_result,
- report_meta=report_meta,
- new_func=new_func,
- new_block=new_block,
- html_builder=build_html_report,
- metrics_diff=metrics_diff,
- coverage_adoption_diff_available=coverage_adoption_diff_available,
- api_surface_diff_available=api_surface_diff_available,
- include_report_document=bool(changed_paths),
- )
- changed_clone_gate = (
- _changed_clone_gate_from_report(
- report_artifacts.report_document or {},
- changed_paths=changed_paths,
- )
- if args.changed_only and report_artifacts.report_document is not None
- else None
- )
- if changed_clone_gate is not None:
- _print_changed_scope(
- console=cast("_PrinterLike", console),
- quiet=args.quiet,
- changed_scope=ChangedScopeSnapshot(
- paths_count=len(changed_clone_gate.changed_paths),
- findings_total=changed_clone_gate.findings_total,
- findings_new=changed_clone_gate.findings_new,
- findings_known=changed_clone_gate.findings_known,
- ),
- )
- html_report_path = _write_report_outputs(
- args=args,
- output_paths=output_paths,
- report_artifacts=report_artifacts,
- open_html_report=args.open_html_report,
- )
-
- _enforce_gating(
- args=args,
- boot=boot,
- analysis=analysis_result,
- processing=processing_result,
- source_read_contract_failure=source_read_contract_failure,
- baseline_failure_code=baseline_state.failure_code,
- metrics_baseline_failure_code=metrics_baseline_state.failure_code,
- new_func=set(changed_clone_gate.new_func) if changed_clone_gate else new_func,
- new_block=(
- set(changed_clone_gate.new_block) if changed_clone_gate else new_block
- ),
- metrics_diff=metrics_diff,
- html_report_path=html_report_path,
- clone_threshold_total=(
- changed_clone_gate.total_clone_groups if changed_clone_gate else None
- ),
- )
-
- notice_new_clones_count = (
- len(changed_clone_gate.new_func) + len(changed_clone_gate.new_block)
- if changed_clone_gate is not None
- else new_clones_count
- )
- if (
- not args.update_baseline
- and not args.fail_on_new
- and notice_new_clones_count > 0
- ):
- console.print(ui.WARN_NEW_CLONES_WITHOUT_FAIL)
-
- if not args.quiet:
- elapsed = time.monotonic() - run_started_at
- console.print()
- console.print(ui.fmt_pipeline_done(elapsed))
-
-
-def main() -> None:
- try:
- _main_impl()
- except SystemExit:
- raise
- except Exception as exc:
- console.print(
- ui.fmt_internal_error(
- exc,
- issues_url=ISSUES_URL,
- debug=_is_debug_enabled(),
- )
- )
- sys.exit(ExitCode.INTERNAL_ERROR)
-
-
-if __name__ == "__main__":
- main()
diff --git a/codeclone/config/__init__.py b/codeclone/config/__init__.py
new file mode 100644
index 0000000..557317f
--- /dev/null
+++ b/codeclone/config/__init__.py
@@ -0,0 +1,4 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
diff --git a/codeclone/config/argparse_builder.py b/codeclone/config/argparse_builder.py
new file mode 100644
index 0000000..aec5a63
--- /dev/null
+++ b/codeclone/config/argparse_builder.py
@@ -0,0 +1,106 @@
+from __future__ import annotations
+
+import argparse
+import sys
+from typing import NoReturn
+
+from .. import ui_messages as ui
+from ..contracts import ExitCode, cli_help_epilog
+from .spec import ARGUMENT_GROUP_TITLES, DEFAULTS_BY_DEST, OPTIONS, OptionSpec
+
+
+class _ArgumentParser(argparse.ArgumentParser):
+ def error(self, message: str) -> NoReturn:
+ self.print_usage(sys.stderr)
+ self.exit(
+ int(ExitCode.CONTRACT_ERROR),
+ f"CONTRACT ERROR: {message}\n",
+ )
+
+
+class _HelpFormatter(argparse.RawTextHelpFormatter):
+ """Product-oriented help formatter extension point."""
+
+
+def _add_option(
+ group: argparse._ArgumentGroup,
+ *,
+ option: OptionSpec,
+ version: str,
+) -> None:
+ if option.cli_kind == "positional":
+ group.add_argument(
+ option.dest,
+ nargs=option.nargs,
+ metavar=option.metavar,
+ help=option.help_text,
+ )
+ return
+
+ argument_kwargs: dict[str, object] = {"help": option.help_text}
+
+ if option.cli_kind == "value":
+ argument_kwargs.update(
+ dest=option.dest,
+ nargs=option.nargs,
+ const=option.const,
+ metavar=option.metavar,
+ )
+ if option.value_type is not None:
+ argument_kwargs["type"] = option.value_type
+ elif option.cli_kind == "optional_path":
+ argument_kwargs.update(
+ dest=option.dest,
+ nargs="?",
+ const=option.const,
+ metavar=option.metavar or "FILE",
+ )
+ elif option.cli_kind == "bool_optional":
+ argument_kwargs.update(
+ action=argparse.BooleanOptionalAction,
+ default=argparse.SUPPRESS,
+ )
+ elif option.cli_kind in {"store_true", "store_false"}:
+ argument_kwargs.update(
+ dest=option.dest,
+ action=option.cli_kind,
+ default=argparse.SUPPRESS,
+ )
+ elif option.cli_kind == "help":
+ argument_kwargs["action"] = "help"
+ elif option.cli_kind == "version":
+ argument_kwargs.update(
+ action="version",
+ version=ui.version_output(version),
+ )
+ else:
+ raise RuntimeError(f"Unsupported CLI option kind: {option.cli_kind}")
+
+ group.add_argument(*option.flags, **argument_kwargs) # type: ignore[arg-type]
+
+
+def build_parser(version: str) -> _ArgumentParser:
+ parser = _ArgumentParser(
+ prog="codeclone",
+ description="Structural code quality analysis for Python.",
+ add_help=False,
+ formatter_class=_HelpFormatter,
+ epilog=cli_help_epilog(),
+ )
+
+ for group_title in ARGUMENT_GROUP_TITLES:
+ argument_group = parser.add_argument_group(group_title)
+ for option in OPTIONS:
+ if option.group != group_title or option.cli_kind is None:
+ continue
+ _add_option(
+ argument_group,
+ option=option,
+ version=version,
+ )
+
+ parser.set_defaults(**DEFAULTS_BY_DEST)
+ return parser
+
+
+__all__ = ["_ArgumentParser", "_HelpFormatter", "build_parser"]
diff --git a/codeclone/config/pyproject_loader.py b/codeclone/config/pyproject_loader.py
new file mode 100644
index 0000000..3c2cd20
--- /dev/null
+++ b/codeclone/config/pyproject_loader.py
@@ -0,0 +1,216 @@
+from __future__ import annotations
+
+import importlib
+import sys
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from ..findings.clones.golden_fixtures import (
+ GoldenFixturePatternError,
+ normalize_golden_fixture_patterns,
+)
+from .spec import CONFIG_KEY_SPECS, PATH_CONFIG_KEYS, ConfigKeySpec
+
+if TYPE_CHECKING:
+ from collections.abc import Callable, Mapping, Set
+
+
+class ConfigValidationError(ValueError):
+ """Raised when pyproject.toml contains invalid CodeClone configuration."""
+
+
+def validate_config_value(
+ *,
+ key: str,
+ value: object,
+ config_key_specs: Mapping[str, ConfigKeySpec] = CONFIG_KEY_SPECS,
+) -> object:
+ spec = config_key_specs[key]
+ if value is None:
+ if spec.allow_none:
+ return None
+ raise ConfigValidationError(
+ "Invalid value type for tool.codeclone."
+ f"{key}: expected {spec.expected_name or spec.expected_type.__name__}"
+ )
+
+ expected_type = spec.expected_type
+ if expected_type is bool:
+ return _validated_config_instance(
+ key=key,
+ value=value,
+ expected_type=bool,
+ expected_name="bool",
+ )
+
+ if expected_type is int:
+ return _validated_config_instance(
+ key=key,
+ value=value,
+ expected_type=int,
+ expected_name="int",
+ reject_bool=True,
+ )
+
+ if expected_type is str:
+ return _validated_config_instance(
+ key=key,
+ value=value,
+ expected_type=str,
+ expected_name="str",
+ )
+
+ if expected_type is list:
+ return _validated_string_list(key=key, value=value)
+
+ raise ConfigValidationError(f"Unsupported config key spec for tool.codeclone.{key}")
+
+
+def load_pyproject_config(
+ root_path: Path,
+ *,
+ load_toml: Callable[[Path], object] | None = None,
+ config_key_specs: Mapping[str, ConfigKeySpec] = CONFIG_KEY_SPECS,
+ path_config_keys: Set[str] | frozenset[str] = PATH_CONFIG_KEYS,
+) -> dict[str, object]:
+ config_path = root_path / "pyproject.toml"
+ if not config_path.exists():
+ return {}
+
+ load_toml_fn = _load_toml if load_toml is None else load_toml
+
+ payload: object
+ try:
+ payload = load_toml_fn(config_path)
+ except OSError as exc:
+ raise ConfigValidationError(
+ f"Cannot read pyproject.toml at {config_path}: {exc}"
+ ) from exc
+ except ValueError as exc:
+ raise ConfigValidationError(f"Invalid TOML in {config_path}: {exc}") from exc
+
+ if not isinstance(payload, dict):
+ raise ConfigValidationError(
+ f"Invalid pyproject payload at {config_path}: root must be object"
+ )
+
+ tool_obj = payload.get("tool")
+ if tool_obj is None:
+ return {}
+ if not isinstance(tool_obj, dict):
+ raise ConfigValidationError(
+ f"Invalid pyproject payload at {config_path}: 'tool' must be object"
+ )
+
+ codeclone_obj = tool_obj.get("codeclone")
+ if codeclone_obj is None:
+ return {}
+ if not isinstance(codeclone_obj, dict):
+ raise ConfigValidationError(
+ "Invalid pyproject payload at "
+ f"{config_path}: 'tool.codeclone' must be object"
+ )
+
+ unknown = sorted(set(codeclone_obj.keys()) - set(config_key_specs))
+ if unknown:
+ raise ConfigValidationError(
+ "Unknown key(s) in tool.codeclone: " + ", ".join(unknown)
+ )
+
+ validated: dict[str, object] = {}
+ for key in sorted(codeclone_obj.keys()):
+ value = validate_config_value(
+ key=key,
+ value=codeclone_obj[key],
+ config_key_specs=config_key_specs,
+ )
+ validated[key] = normalize_path_config_value(
+ key=key,
+ value=value,
+ root_path=root_path,
+ path_config_keys=path_config_keys,
+ )
+ return validated
+
+
+def normalize_path_config_value(
+ *,
+ key: str,
+ value: object,
+ root_path: Path,
+ path_config_keys: Set[str] | frozenset[str] = PATH_CONFIG_KEYS,
+) -> object:
+ if key not in path_config_keys:
+ return value
+ if not isinstance(value, str):
+ return value
+
+ path = Path(value).expanduser()
+ if path.is_absolute():
+ return str(path)
+ return str(root_path / path)
+
+
+def _validated_config_instance(
+ *,
+ key: str,
+ value: object,
+ expected_type: type[object],
+ expected_name: str,
+ reject_bool: bool = False,
+) -> object:
+ if isinstance(value, expected_type) and (
+ not reject_bool or not isinstance(value, bool)
+ ):
+ return value
+ raise ConfigValidationError(
+ f"Invalid value type for tool.codeclone.{key}: expected {expected_name}"
+ )
+
+
+def _validated_string_list(*, key: str, value: object) -> tuple[str, ...]:
+ if not isinstance(value, list):
+ raise ConfigValidationError(
+ f"Invalid value type for tool.codeclone.{key}: expected list[str]"
+ )
+ if not all(isinstance(item, str) for item in value):
+ raise ConfigValidationError(
+ f"Invalid value type for tool.codeclone.{key}: expected list[str]"
+ )
+ try:
+ return normalize_golden_fixture_patterns(value)
+ except GoldenFixturePatternError as exc:
+ raise ConfigValidationError(str(exc)) from exc
+
+
+def _load_toml(path: Path) -> object:
+ if sys.version_info >= (3, 11):
+ import tomllib
+
+ with path.open("rb") as config_file:
+ return tomllib.load(config_file)
+
+ try:
+ tomli_module = importlib.import_module("tomli")
+ except ModuleNotFoundError as exc:
+ raise ConfigValidationError(
+ "Python 3.10 requires dependency 'tomli' to read pyproject.toml."
+ ) from exc
+
+ load_fn = getattr(tomli_module, "load", None)
+ if not callable(load_fn):
+ raise ConfigValidationError("Invalid 'tomli' module: missing callable 'load'.")
+
+ with path.open("rb") as config_file:
+ return load_fn(config_file)
+
+
+__all__ = [
+ "CONFIG_KEY_SPECS",
+ "PATH_CONFIG_KEYS",
+ "ConfigValidationError",
+ "_load_toml",
+ "load_pyproject_config",
+ "normalize_path_config_value",
+ "validate_config_value",
+]
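The loader's contract in one pass: a missing `[tool.codeclone]` table yields an empty dict, unknown keys fail fast, and path-valued keys are re-anchored at the project root. A minimal sketch against a scratch directory (the path and key values here are hypothetical):

```python
from pathlib import Path

from codeclone.config.pyproject_loader import (
    ConfigValidationError,
    load_pyproject_config,
)

root = Path("/tmp/codeclone-demo")  # hypothetical scratch project root
root.mkdir(parents=True, exist_ok=True)
(root / "pyproject.toml").write_text(
    '[tool.codeclone]\nmin_loc = 12\nbaseline = "ci/base.json"\n'
)

values = load_pyproject_config(root)
assert values["min_loc"] == 12
# Relative path values are rewritten to root-anchored absolute strings.
assert values["baseline"] == str(root / "ci" / "base.json")

(root / "pyproject.toml").write_text("[tool.codeclone]\nnot_a_key = 1\n")
try:
    load_pyproject_config(root)
    raise AssertionError("unknown keys must be rejected")
except ConfigValidationError as exc:
    assert "Unknown key(s)" in str(exc)
```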
diff --git a/codeclone/config/resolver.py b/codeclone/config/resolver.py
new file mode 100644
index 0000000..03ef896
--- /dev/null
+++ b/codeclone/config/resolver.py
@@ -0,0 +1,91 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+ import argparse
+ from collections.abc import Mapping, Sequence
+
+
+@dataclass(frozen=True, slots=True)
+class ResolvedConfig:
+ values: dict[str, object]
+ explicit_cli_dests: frozenset[str]
+ pyproject_values: dict[str, object]
+
+
+def collect_explicit_cli_dests(
+ parser: argparse.ArgumentParser,
+ *,
+ argv: Sequence[str],
+) -> set[str]:
+ option_to_dest: dict[str, str] = {}
+ for action in parser._actions:
+ for option in action.option_strings:
+ option_to_dest[option] = action.dest
+
+ explicit: set[str] = set()
+ for token in argv:
+ if token == "--":
+ break
+ if not token.startswith("-"):
+ continue
+ option = token.split("=", maxsplit=1)[0]
+ dest = option_to_dest.get(option)
+ if dest is not None:
+ explicit.add(dest)
+ return explicit
+
+
+def resolve_config(
+ *,
+ args: argparse.Namespace,
+ config_values: Mapping[str, object],
+ explicit_cli_dests: set[str],
+) -> ResolvedConfig:
+ resolved_values = vars(args).copy()
+ for key, value in config_values.items():
+ if key in explicit_cli_dests:
+ continue
+ resolved_values[key] = value
+
+ return ResolvedConfig(
+ values=resolved_values,
+ explicit_cli_dests=frozenset(explicit_cli_dests),
+ pyproject_values=dict(config_values),
+ )
+
+
+def apply_resolved_config(
+ *,
+ args: argparse.Namespace,
+ resolved: ResolvedConfig,
+) -> None:
+ for key, value in resolved.values.items():
+ setattr(args, key, value)
+
+
+def apply_pyproject_config_overrides(
+ *,
+ args: argparse.Namespace,
+ config_values: Mapping[str, object],
+ explicit_cli_dests: set[str],
+) -> None:
+ apply_resolved_config(
+ args=args,
+ resolved=resolve_config(
+ args=args,
+ config_values=config_values,
+ explicit_cli_dests=explicit_cli_dests,
+ ),
+ )
+
+
+__all__ = [
+ "ResolvedConfig",
+ "apply_pyproject_config_overrides",
+ "apply_resolved_config",
+ "collect_explicit_cli_dests",
+ "resolve_config",
+]
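The resolver encodes a three-level precedence: explicit CLI flags beat `pyproject.toml` values, which beat argparse defaults. A minimal sketch of that contract, assuming `build_parser` from the sibling module; the numbers are hypothetical:

```python
from codeclone.config.argparse_builder import build_parser
from codeclone.config.resolver import (
    apply_pyproject_config_overrides,
    collect_explicit_cli_dests,
)

parser = build_parser(version="0.0.0")
argv = ["--min-loc", "15"]
args = parser.parse_args(argv)
explicit = collect_explicit_cli_dests(parser, argv=argv)

apply_pyproject_config_overrides(
    args=args,
    config_values={"min_loc": 99, "min_stmt": 3},  # hypothetical pyproject values
    explicit_cli_dests=explicit,
)
assert args.min_loc == 15  # explicit flag wins over pyproject
assert args.min_stmt == 3  # pyproject wins over the argparse default
```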
diff --git a/codeclone/config/spec.py b/codeclone/config/spec.py
new file mode 100644
index 0000000..798e2bf
--- /dev/null
+++ b/codeclone/config/spec.py
@@ -0,0 +1,770 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Final, Literal
+
+from .. import ui_messages as ui
+from ..contracts import (
+ DEFAULT_BASELINE_PATH,
+ DEFAULT_BLOCK_MIN_LOC,
+ DEFAULT_BLOCK_MIN_STMT,
+ DEFAULT_COHESION_THRESHOLD,
+ DEFAULT_COMPLEXITY_THRESHOLD,
+ DEFAULT_COUPLING_THRESHOLD,
+ DEFAULT_COVERAGE_MIN,
+ DEFAULT_HEALTH_THRESHOLD,
+ DEFAULT_HTML_REPORT_PATH,
+ DEFAULT_JSON_REPORT_PATH,
+ DEFAULT_MARKDOWN_REPORT_PATH,
+ DEFAULT_MAX_BASELINE_SIZE_MB,
+ DEFAULT_MAX_CACHE_SIZE_MB,
+ DEFAULT_MIN_LOC,
+ DEFAULT_MIN_STMT,
+ DEFAULT_PROCESSES,
+ DEFAULT_ROOT,
+ DEFAULT_SARIF_REPORT_PATH,
+ DEFAULT_SEGMENT_MIN_LOC,
+ DEFAULT_SEGMENT_MIN_STMT,
+ DEFAULT_TEXT_REPORT_PATH,
+)
+
+CliKind = Literal[
+ "positional",
+ "value",
+ "optional_path",
+ "bool_optional",
+ "store_true",
+ "store_false",
+ "help",
+ "version",
+]
+
+_UNSET: Final[object] = object()
+_INFER_PYPROJECT_KEY: Final[object] = object()
+
+
+@dataclass(frozen=True, slots=True)
+class ConfigKeySpec:
+ expected_type: type[object]
+ allow_none: bool = False
+ expected_name: str | None = None
+
+
+@dataclass(frozen=True, slots=True)
+class OptionSpec:
+ dest: str
+ group: str | None
+ cli_kind: CliKind | None = None
+ flags: tuple[str, ...] = ()
+ default: object = _UNSET
+ value_type: type[object] | None = None
+ const: object | None = None
+ nargs: str | int | None = None
+ metavar: str | None = None
+ help_text: str | None = None
+ pyproject_key: str | None = None
+ config_spec: ConfigKeySpec | None = None
+ path_value: bool = False
+
+ @property
+ def has_default(self) -> bool:
+ return self.default is not _UNSET
+
+
+def _option(
+ *,
+ dest: str,
+ group: str | None,
+ cli_kind: CliKind | None = None,
+ flags: tuple[str, ...] = (),
+ default: object = _UNSET,
+ value_type: type[object] | None = None,
+ const: object | None = None,
+ nargs: str | int | None = None,
+ metavar: str | None = None,
+ help_text: str | None = None,
+ pyproject_type: type[object] | None = None,
+ allow_none: bool = False,
+ expected_name: str | None = None,
+ pyproject_key: object = _INFER_PYPROJECT_KEY,
+ path_value: bool = False,
+) -> OptionSpec:
+ config_spec = (
+ ConfigKeySpec(
+ expected_type=pyproject_type,
+ allow_none=allow_none,
+ expected_name=expected_name,
+ )
+ if pyproject_type is not None
+ else None
+ )
+ resolved_pyproject_key: str | None
+ if pyproject_type is None:
+ resolved_pyproject_key = None
+ elif pyproject_key is _INFER_PYPROJECT_KEY:
+ resolved_pyproject_key = dest
+ elif pyproject_key is None or isinstance(pyproject_key, str):
+ resolved_pyproject_key = pyproject_key
+ else:
+ raise TypeError("pyproject_key must be str | None when pyproject_type is set")
+ return OptionSpec(
+ dest=dest,
+ group=group,
+ cli_kind=cli_kind,
+ flags=flags,
+ default=default,
+ value_type=value_type,
+ const=const,
+ nargs=nargs,
+ metavar=metavar,
+ help_text=help_text,
+ pyproject_key=resolved_pyproject_key,
+ config_spec=config_spec,
+ path_value=path_value,
+ )
+
+
+ARGUMENT_GROUP_TITLES: Final[tuple[str, ...]] = (
+ "Target",
+ "Analysis",
+ "Baselines and CI",
+ "Quality gates",
+ "Analysis stages",
+ "Reporting",
+ "Output and UI",
+ "General",
+)
+
+OPTIONS: Final[tuple[OptionSpec, ...]] = (
+ _option(
+ dest="root",
+ group="Target",
+ cli_kind="positional",
+ default=DEFAULT_ROOT,
+ nargs="?",
+ help_text=ui.HELP_ROOT,
+ ),
+ _option(
+ dest="min_loc",
+ group="Analysis",
+ cli_kind="value",
+ flags=("--min-loc",),
+ default=DEFAULT_MIN_LOC,
+ value_type=int,
+ help_text=ui.HELP_MIN_LOC,
+ pyproject_type=int,
+ ),
+ _option(
+ dest="min_stmt",
+ group="Analysis",
+ cli_kind="value",
+ flags=("--min-stmt",),
+ default=DEFAULT_MIN_STMT,
+ value_type=int,
+ help_text=ui.HELP_MIN_STMT,
+ pyproject_type=int,
+ ),
+ _option(
+ dest="block_min_loc",
+ group="Analysis",
+ default=DEFAULT_BLOCK_MIN_LOC,
+ pyproject_type=int,
+ ),
+ _option(
+ dest="block_min_stmt",
+ group="Analysis",
+ default=DEFAULT_BLOCK_MIN_STMT,
+ pyproject_type=int,
+ ),
+ _option(
+ dest="segment_min_loc",
+ group="Analysis",
+ default=DEFAULT_SEGMENT_MIN_LOC,
+ pyproject_type=int,
+ ),
+ _option(
+ dest="segment_min_stmt",
+ group="Analysis",
+ default=DEFAULT_SEGMENT_MIN_STMT,
+ pyproject_type=int,
+ ),
+ _option(
+ dest="golden_fixture_paths",
+ group="Analysis",
+ default=(),
+ pyproject_type=list,
+ expected_name="list[str]",
+ ),
+ _option(
+ dest="processes",
+ group="Analysis",
+ cli_kind="value",
+ flags=("--processes",),
+ default=DEFAULT_PROCESSES,
+ value_type=int,
+ help_text=ui.HELP_PROCESSES,
+ pyproject_type=int,
+ ),
+ _option(
+ dest="changed_only",
+ group="Analysis",
+ cli_kind="bool_optional",
+ flags=("--changed-only",),
+ default=False,
+ help_text=ui.HELP_CHANGED_ONLY,
+ ),
+ _option(
+ dest="diff_against",
+ group="Analysis",
+ cli_kind="value",
+ flags=("--diff-against",),
+ default=None,
+ metavar="GIT_REF",
+ help_text=ui.HELP_DIFF_AGAINST,
+ ),
+ _option(
+ dest="paths_from_git_diff",
+ group="Analysis",
+ cli_kind="value",
+ flags=("--paths-from-git-diff",),
+ default=None,
+ metavar="GIT_REF",
+ help_text=ui.HELP_PATHS_FROM_GIT_DIFF,
+ ),
+ _option(
+ dest="cache_path",
+ group="Analysis",
+ cli_kind="optional_path",
+ flags=("--cache-path",),
+ default=None,
+ metavar="FILE",
+ help_text=ui.HELP_CACHE_PATH,
+ pyproject_type=str,
+ allow_none=True,
+ path_value=True,
+ ),
+ _option(
+ dest="cache_path",
+ group="Analysis",
+ cli_kind="optional_path",
+ flags=("--cache-dir",),
+ metavar="FILE",
+ help_text=ui.HELP_CACHE_DIR_LEGACY,
+ pyproject_key=None,
+ ),
+ _option(
+ dest="max_cache_size_mb",
+ group="Analysis",
+ cli_kind="value",
+ flags=("--max-cache-size-mb",),
+ default=DEFAULT_MAX_CACHE_SIZE_MB,
+ value_type=int,
+ metavar="MB",
+ help_text=ui.HELP_MAX_CACHE_SIZE_MB,
+ pyproject_type=int,
+ ),
+ _option(
+ dest="baseline",
+ group="Baselines and CI",
+ cli_kind="optional_path",
+ flags=("--baseline",),
+ default=DEFAULT_BASELINE_PATH,
+ const=DEFAULT_BASELINE_PATH,
+ metavar="FILE",
+ help_text=ui.HELP_BASELINE,
+ pyproject_type=str,
+ path_value=True,
+ ),
+ _option(
+ dest="max_baseline_size_mb",
+ group="Baselines and CI",
+ cli_kind="value",
+ flags=("--max-baseline-size-mb",),
+ default=DEFAULT_MAX_BASELINE_SIZE_MB,
+ value_type=int,
+ metavar="MB",
+ help_text=ui.HELP_MAX_BASELINE_SIZE_MB,
+ pyproject_type=int,
+ ),
+ _option(
+ dest="update_baseline",
+ group="Baselines and CI",
+ cli_kind="bool_optional",
+ flags=("--update-baseline",),
+ default=False,
+ help_text=ui.HELP_UPDATE_BASELINE,
+ pyproject_type=bool,
+ ),
+ _option(
+ dest="metrics_baseline",
+ group="Baselines and CI",
+ cli_kind="optional_path",
+ flags=("--metrics-baseline",),
+ default=DEFAULT_BASELINE_PATH,
+ const=DEFAULT_BASELINE_PATH,
+ metavar="FILE",
+ help_text=ui.HELP_METRICS_BASELINE,
+ pyproject_type=str,
+ path_value=True,
+ ),
+ _option(
+ dest="update_metrics_baseline",
+ group="Baselines and CI",
+ cli_kind="bool_optional",
+ flags=("--update-metrics-baseline",),
+ default=False,
+ help_text=ui.HELP_UPDATE_METRICS_BASELINE,
+ pyproject_type=bool,
+ ),
+ _option(
+ dest="ci",
+ group="Baselines and CI",
+ cli_kind="bool_optional",
+ flags=("--ci",),
+ default=False,
+ help_text=ui.HELP_CI,
+ pyproject_type=bool,
+ ),
+ _option(
+ dest="api_surface",
+ group="Baselines and CI",
+ cli_kind="bool_optional",
+ flags=("--api-surface",),
+ default=False,
+ help_text=ui.HELP_API_SURFACE,
+ pyproject_type=bool,
+ ),
+ _option(
+ dest="coverage_xml",
+ group="Baselines and CI",
+ cli_kind="value",
+ flags=("--coverage",),
+ default=None,
+ metavar="FILE",
+ help_text=ui.HELP_COVERAGE,
+ pyproject_type=str,
+ allow_none=True,
+ path_value=True,
+ ),
+ _option(
+ dest="fail_on_new",
+ group="Quality gates",
+ cli_kind="bool_optional",
+ flags=("--fail-on-new",),
+ default=False,
+ help_text=ui.HELP_FAIL_ON_NEW,
+ pyproject_type=bool,
+ ),
+ _option(
+ dest="fail_on_new_metrics",
+ group="Quality gates",
+ cli_kind="bool_optional",
+ flags=("--fail-on-new-metrics",),
+ default=False,
+ help_text=ui.HELP_FAIL_ON_NEW_METRICS,
+ pyproject_type=bool,
+ ),
+ _option(
+ dest="fail_threshold",
+ group="Quality gates",
+ cli_kind="value",
+ flags=("--fail-threshold",),
+ default=-1,
+ value_type=int,
+ metavar="MAX_CLONES",
+ help_text=ui.HELP_FAIL_THRESHOLD,
+ pyproject_type=int,
+ ),
+ _option(
+ dest="fail_complexity",
+ group="Quality gates",
+ cli_kind="value",
+ flags=("--fail-complexity",),
+ default=-1,
+ value_type=int,
+ nargs="?",
+ const=DEFAULT_COMPLEXITY_THRESHOLD,
+ metavar="CC_MAX",
+ help_text=ui.HELP_FAIL_COMPLEXITY,
+ pyproject_type=int,
+ ),
+ _option(
+ dest="fail_coupling",
+ group="Quality gates",
+ cli_kind="value",
+ flags=("--fail-coupling",),
+ default=-1,
+ value_type=int,
+ nargs="?",
+ const=DEFAULT_COUPLING_THRESHOLD,
+ metavar="CBO_MAX",
+ help_text=ui.HELP_FAIL_COUPLING,
+ pyproject_type=int,
+ ),
+ _option(
+ dest="fail_cohesion",
+ group="Quality gates",
+ cli_kind="value",
+ flags=("--fail-cohesion",),
+ default=-1,
+ value_type=int,
+ nargs="?",
+ const=DEFAULT_COHESION_THRESHOLD,
+ metavar="LCOM4_MAX",
+ help_text=ui.HELP_FAIL_COHESION,
+ pyproject_type=int,
+ ),
+ _option(
+ dest="fail_cycles",
+ group="Quality gates",
+ cli_kind="bool_optional",
+ flags=("--fail-cycles",),
+ default=False,
+ help_text=ui.HELP_FAIL_CYCLES,
+ pyproject_type=bool,
+ ),
+ _option(
+ dest="fail_dead_code",
+ group="Quality gates",
+ cli_kind="bool_optional",
+ flags=("--fail-dead-code",),
+ default=False,
+ help_text=ui.HELP_FAIL_DEAD_CODE,
+ pyproject_type=bool,
+ ),
+ _option(
+ dest="fail_health",
+ group="Quality gates",
+ cli_kind="value",
+ flags=("--fail-health",),
+ default=-1,
+ value_type=int,
+ nargs="?",
+ const=DEFAULT_HEALTH_THRESHOLD,
+ metavar="SCORE_MIN",
+ help_text=ui.HELP_FAIL_HEALTH,
+ pyproject_type=int,
+ ),
+ _option(
+ dest="fail_on_typing_regression",
+ group="Quality gates",
+ cli_kind="bool_optional",
+ flags=("--fail-on-typing-regression",),
+ default=False,
+ help_text=ui.HELP_FAIL_ON_TYPING_REGRESSION,
+ pyproject_type=bool,
+ ),
+ _option(
+ dest="fail_on_docstring_regression",
+ group="Quality gates",
+ cli_kind="bool_optional",
+ flags=("--fail-on-docstring-regression",),
+ default=False,
+ help_text=ui.HELP_FAIL_ON_DOCSTRING_REGRESSION,
+ pyproject_type=bool,
+ ),
+ _option(
+ dest="fail_on_api_break",
+ group="Quality gates",
+ cli_kind="bool_optional",
+ flags=("--fail-on-api-break",),
+ default=False,
+ help_text=ui.HELP_FAIL_ON_API_BREAK,
+ pyproject_type=bool,
+ ),
+ _option(
+ dest="fail_on_untested_hotspots",
+ group="Quality gates",
+ cli_kind="bool_optional",
+ flags=("--fail-on-untested-hotspots",),
+ default=False,
+ help_text=ui.HELP_FAIL_ON_UNTESTED_HOTSPOTS,
+ pyproject_type=bool,
+ ),
+ _option(
+ dest="min_typing_coverage",
+ group="Quality gates",
+ cli_kind="value",
+ flags=("--min-typing-coverage",),
+ default=-1,
+ value_type=int,
+ metavar="PERCENT",
+ help_text=ui.HELP_MIN_TYPING_COVERAGE,
+ pyproject_type=int,
+ ),
+ _option(
+ dest="min_docstring_coverage",
+ group="Quality gates",
+ cli_kind="value",
+ flags=("--min-docstring-coverage",),
+ default=-1,
+ value_type=int,
+ metavar="PERCENT",
+ help_text=ui.HELP_MIN_DOCSTRING_COVERAGE,
+ pyproject_type=int,
+ ),
+ _option(
+ dest="coverage_min",
+ group="Quality gates",
+ cli_kind="value",
+ flags=("--coverage-min",),
+ default=DEFAULT_COVERAGE_MIN,
+ value_type=int,
+ metavar="PERCENT",
+ help_text=ui.HELP_COVERAGE_MIN,
+ pyproject_type=int,
+ ),
+ _option(
+ dest="skip_metrics",
+ group="Analysis stages",
+ cli_kind="bool_optional",
+ flags=("--skip-metrics",),
+ default=False,
+ help_text=ui.HELP_SKIP_METRICS,
+ pyproject_type=bool,
+ ),
+ _option(
+ dest="skip_dead_code",
+ group="Analysis stages",
+ cli_kind="bool_optional",
+ flags=("--skip-dead-code",),
+ default=False,
+ help_text=ui.HELP_SKIP_DEAD_CODE,
+ pyproject_type=bool,
+ ),
+ _option(
+ dest="skip_dependencies",
+ group="Analysis stages",
+ cli_kind="bool_optional",
+ flags=("--skip-dependencies",),
+ default=False,
+ help_text=ui.HELP_SKIP_DEPENDENCIES,
+ pyproject_type=bool,
+ ),
+ _option(
+ dest="html_out",
+ group="Reporting",
+ cli_kind="optional_path",
+ flags=("--html",),
+ default=None,
+ const=DEFAULT_HTML_REPORT_PATH,
+ metavar="FILE",
+ help_text=ui.HELP_HTML,
+ pyproject_type=str,
+ allow_none=True,
+ path_value=True,
+ ),
+ _option(
+ dest="json_out",
+ group="Reporting",
+ cli_kind="optional_path",
+ flags=("--json",),
+ default=None,
+ const=DEFAULT_JSON_REPORT_PATH,
+ metavar="FILE",
+ help_text=ui.HELP_JSON,
+ pyproject_type=str,
+ allow_none=True,
+ path_value=True,
+ ),
+ _option(
+ dest="md_out",
+ group="Reporting",
+ cli_kind="optional_path",
+ flags=("--md",),
+ default=None,
+ const=DEFAULT_MARKDOWN_REPORT_PATH,
+ metavar="FILE",
+ help_text=ui.HELP_MD,
+ pyproject_type=str,
+ allow_none=True,
+ path_value=True,
+ ),
+ _option(
+ dest="sarif_out",
+ group="Reporting",
+ cli_kind="optional_path",
+ flags=("--sarif",),
+ default=None,
+ const=DEFAULT_SARIF_REPORT_PATH,
+ metavar="FILE",
+ help_text=ui.HELP_SARIF,
+ pyproject_type=str,
+ allow_none=True,
+ path_value=True,
+ ),
+ _option(
+ dest="text_out",
+ group="Reporting",
+ cli_kind="optional_path",
+ flags=("--text",),
+ default=None,
+ const=DEFAULT_TEXT_REPORT_PATH,
+ metavar="FILE",
+ help_text=ui.HELP_TEXT,
+ pyproject_type=str,
+ allow_none=True,
+ path_value=True,
+ ),
+ _option(
+ dest="timestamped_report_paths",
+ group="Reporting",
+ cli_kind="bool_optional",
+ flags=("--timestamped-report-paths",),
+ default=False,
+ help_text=ui.HELP_TIMESTAMPED_REPORT_PATHS,
+ ),
+ _option(
+ dest="open_html_report",
+ group="Output and UI",
+ cli_kind="bool_optional",
+ flags=("--open-html-report",),
+ default=False,
+ help_text=ui.HELP_OPEN_HTML_REPORT,
+ ),
+ _option(
+ dest="no_progress",
+ group="Output and UI",
+ cli_kind="store_true",
+ flags=("--no-progress",),
+ default=False,
+ help_text=ui.HELP_NO_PROGRESS,
+ pyproject_type=bool,
+ ),
+ _option(
+ dest="no_progress",
+ group="Output and UI",
+ cli_kind="store_false",
+ flags=("--progress",),
+ help_text=ui.HELP_PROGRESS,
+ pyproject_key=None,
+ ),
+ _option(
+ dest="no_color",
+ group="Output and UI",
+ cli_kind="store_true",
+ flags=("--no-color",),
+ default=False,
+ help_text=ui.HELP_NO_COLOR,
+ pyproject_type=bool,
+ ),
+ _option(
+ dest="no_color",
+ group="Output and UI",
+ cli_kind="store_false",
+ flags=("--color",),
+ help_text=ui.HELP_COLOR,
+ pyproject_key=None,
+ ),
+ _option(
+ dest="quiet",
+ group="Output and UI",
+ cli_kind="bool_optional",
+ flags=("--quiet",),
+ default=False,
+ help_text=ui.HELP_QUIET,
+ pyproject_type=bool,
+ ),
+ _option(
+ dest="verbose",
+ group="Output and UI",
+ cli_kind="bool_optional",
+ flags=("--verbose",),
+ default=False,
+ help_text=ui.HELP_VERBOSE,
+ pyproject_type=bool,
+ ),
+ _option(
+ dest="debug",
+ group="Output and UI",
+ cli_kind="bool_optional",
+ flags=("--debug",),
+ default=False,
+ help_text=ui.HELP_DEBUG,
+ pyproject_type=bool,
+ ),
+ _option(
+ dest="help",
+ group="General",
+ cli_kind="help",
+ flags=("-h", "--help"),
+ help_text="Show this help message and exit.",
+ ),
+ _option(
+ dest="version",
+ group="General",
+ cli_kind="version",
+ flags=("--version",),
+ help_text=ui.HELP_VERSION,
+ ),
+)
+
+
+def _build_defaults_by_dest() -> dict[str, object]:
+ defaults: dict[str, object] = {}
+ for spec in OPTIONS:
+ if not spec.has_default or spec.dest in defaults:
+ continue
+ defaults[spec.dest] = spec.default
+ return defaults
+
+
+def _build_pyproject_specs() -> dict[str, ConfigKeySpec]:
+ config_specs: dict[str, ConfigKeySpec] = {}
+ for spec in OPTIONS:
+ if spec.pyproject_key is None or spec.config_spec is None:
+ continue
+ if spec.pyproject_key in config_specs:
+ existing = config_specs[spec.pyproject_key]
+ if existing != spec.config_spec:
+ raise RuntimeError(
+ f"Conflicting pyproject spec for {spec.pyproject_key}"
+ )
+ continue
+ config_specs[spec.pyproject_key] = spec.config_spec
+ return config_specs
+
+
+DEFAULTS_BY_DEST: Final[dict[str, object]] = _build_defaults_by_dest()
+CONFIG_KEY_SPECS: Final[dict[str, ConfigKeySpec]] = _build_pyproject_specs()
+PATH_CONFIG_KEYS: Final[frozenset[str]] = frozenset(
+ spec.pyproject_key
+ for spec in OPTIONS
+ if spec.pyproject_key is not None and spec.path_value
+)
+TESTABLE_CLI_OPTIONS: Final[tuple[OptionSpec, ...]] = tuple(
+ spec
+ for spec in OPTIONS
+ if spec.cli_kind is not None and spec.cli_kind not in {"help", "version"}
+)
+PYPROJECT_OPTIONS: Final[tuple[OptionSpec, ...]] = tuple(
+ spec for spec in OPTIONS if spec.pyproject_key is not None and spec.config_spec
+)
+
+__all__ = [
+ "ARGUMENT_GROUP_TITLES",
+ "CONFIG_KEY_SPECS",
+ "DEFAULTS_BY_DEST",
+ "DEFAULT_BASELINE_PATH",
+ "DEFAULT_BLOCK_MIN_LOC",
+ "DEFAULT_BLOCK_MIN_STMT",
+ "DEFAULT_HTML_REPORT_PATH",
+ "DEFAULT_JSON_REPORT_PATH",
+ "DEFAULT_MARKDOWN_REPORT_PATH",
+ "DEFAULT_MAX_BASELINE_SIZE_MB",
+ "DEFAULT_MAX_CACHE_SIZE_MB",
+ "DEFAULT_MIN_LOC",
+ "DEFAULT_MIN_STMT",
+ "DEFAULT_PROCESSES",
+ "DEFAULT_ROOT",
+ "DEFAULT_SARIF_REPORT_PATH",
+ "DEFAULT_SEGMENT_MIN_LOC",
+ "DEFAULT_SEGMENT_MIN_STMT",
+ "DEFAULT_TEXT_REPORT_PATH",
+ "OPTIONS",
+ "PATH_CONFIG_KEYS",
+ "PYPROJECT_OPTIONS",
+ "TESTABLE_CLI_OPTIONS",
+ "ConfigKeySpec",
+ "OptionSpec",
+]
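Because `DEFAULTS_BY_DEST`, `CONFIG_KEY_SPECS`, and `PATH_CONFIG_KEYS` are all derived from the same `OPTIONS` tuple, the tables cannot drift from the flag definitions. A minimal consistency sketch, with the expected values taken from the defaults above:

```python
from codeclone.config.spec import (
    CONFIG_KEY_SPECS,
    DEFAULTS_BY_DEST,
    OPTIONS,
    PATH_CONFIG_KEYS,
)

assert DEFAULTS_BY_DEST["min_loc"] == 10  # DEFAULT_MIN_LOC
assert CONFIG_KEY_SPECS["min_loc"].expected_type is int
assert "baseline" in PATH_CONFIG_KEYS  # path values get root-anchored
# Aliases such as --cache-dir reuse a dest without registering a second
# pyproject key, so the dest appears twice in OPTIONS.
assert sum(1 for spec in OPTIONS if spec.dest == "cache_path") == 2
```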
diff --git a/codeclone/contracts.py b/codeclone/contracts.py
deleted file mode 100644
index 70a76ee..0000000
--- a/codeclone/contracts.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# SPDX-License-Identifier: MPL-2.0
-# Copyright (c) 2026 Den Rozhnovskiy
-
-from __future__ import annotations
-
-from enum import IntEnum
-from typing import Final
-
-BASELINE_SCHEMA_VERSION: Final = "2.1"
-BASELINE_FINGERPRINT_VERSION: Final = "1"
-
-CACHE_VERSION: Final = "2.5"
-REPORT_SCHEMA_VERSION: Final = "2.8"
-METRICS_BASELINE_SCHEMA_VERSION: Final = "1.2"
-
-DEFAULT_COMPLEXITY_THRESHOLD: Final = 20
-DEFAULT_COUPLING_THRESHOLD: Final = 10
-DEFAULT_COHESION_THRESHOLD: Final = 4
-DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD: Final = 20
-DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD: Final = 10
-DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD: Final = 4
-DEFAULT_HEALTH_THRESHOLD: Final = 60
-
-COMPLEXITY_RISK_LOW_MAX: Final = 10
-COMPLEXITY_RISK_MEDIUM_MAX: Final = 20
-COUPLING_RISK_LOW_MAX: Final = 5
-COUPLING_RISK_MEDIUM_MAX: Final = 10
-COHESION_RISK_MEDIUM_MAX: Final = 3
-
-HEALTH_WEIGHTS: Final[dict[str, float]] = {
- "clones": 0.25,
- "complexity": 0.20,
- "coupling": 0.10,
- "cohesion": 0.15,
- "dead_code": 0.10,
- "dependencies": 0.10,
- "coverage": 0.10,
-}
-
-
-class ExitCode(IntEnum):
- SUCCESS = 0
- CONTRACT_ERROR = 2
- GATING_FAILURE = 3
- INTERNAL_ERROR = 5
-
-
-REPOSITORY_URL: Final = "https://github.com/orenlab/codeclone"
-ISSUES_URL: Final = "https://github.com/orenlab/codeclone/issues"
-DOCS_URL: Final = "https://orenlab.github.io/codeclone/"
-
-
-def cli_help_epilog() -> str:
- return "\n".join(
- [
- "Exit codes:",
- " 0 Success.",
- " 2 Contract error: untrusted or invalid baseline, invalid output",
- " configuration, incompatible versions, or unreadable sources in",
- " CI/gating mode.",
- " 3 Gating failure: new clones, threshold violations, or metrics",
- " quality gate failures.",
- " 5 Internal error: unexpected exception.",
- "",
- f"Repository: {REPOSITORY_URL}",
- f"Issues: {ISSUES_URL}",
- f"Docs: {DOCS_URL}",
- ]
- )
diff --git a/codeclone/contracts/__init__.py b/codeclone/contracts/__init__.py
new file mode 100644
index 0000000..f8ad78f
--- /dev/null
+++ b/codeclone/contracts/__init__.py
@@ -0,0 +1,141 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from enum import IntEnum
+from typing import Final
+
+BASELINE_SCHEMA_VERSION: Final = "2.1"
+BASELINE_FINGERPRINT_VERSION: Final = "1"
+
+CACHE_VERSION: Final = "2.6"
+REPORT_SCHEMA_VERSION: Final = "2.10"
+METRICS_BASELINE_SCHEMA_VERSION: Final = "1.2"
+
+DEFAULT_COMPLEXITY_THRESHOLD: Final = 20
+DEFAULT_COUPLING_THRESHOLD: Final = 10
+DEFAULT_COHESION_THRESHOLD: Final = 4
+DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD: Final = 20
+DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD: Final = 10
+DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD: Final = 4
+DEFAULT_HEALTH_THRESHOLD: Final = 60
+DEFAULT_ROOT: Final = "."
+DEFAULT_MIN_LOC: Final = 10
+DEFAULT_MIN_STMT: Final = 6
+DEFAULT_BLOCK_MIN_LOC: Final = 20
+DEFAULT_BLOCK_MIN_STMT: Final = 8
+DEFAULT_SEGMENT_MIN_LOC: Final = 20
+DEFAULT_SEGMENT_MIN_STMT: Final = 10
+DEFAULT_PROCESSES: Final = 4
+DEFAULT_MAX_CACHE_SIZE_MB: Final = 50
+DEFAULT_MAX_BASELINE_SIZE_MB: Final = 5
+DEFAULT_COVERAGE_MIN: Final = 50
+DEFAULT_BASELINE_PATH: Final = "codeclone.baseline.json"
+DEFAULT_HTML_REPORT_PATH: Final = ".cache/codeclone/report.html"
+DEFAULT_JSON_REPORT_PATH: Final = ".cache/codeclone/report.json"
+DEFAULT_MARKDOWN_REPORT_PATH: Final = ".cache/codeclone/report.md"
+DEFAULT_SARIF_REPORT_PATH: Final = ".cache/codeclone/report.sarif"
+DEFAULT_TEXT_REPORT_PATH: Final = ".cache/codeclone/report.txt"
+
+COMPLEXITY_RISK_LOW_MAX: Final = 10
+COMPLEXITY_RISK_MEDIUM_MAX: Final = 20
+COUPLING_RISK_LOW_MAX: Final = 5
+COUPLING_RISK_MEDIUM_MAX: Final = 10
+COHESION_RISK_MEDIUM_MAX: Final = 3
+HEALTH_DEPENDENCY_CYCLE_PENALTY: Final = 25
+HEALTH_DEPENDENCY_DEPTH_LEVEL_PENALTY: Final = 4
+HEALTH_DEPENDENCY_DEPTH_AVG_MULTIPLIER: Final = 2.0
+HEALTH_DEPENDENCY_DEPTH_P95_MARGIN: Final = 1
+
+HEALTH_WEIGHTS: Final[dict[str, float]] = {
+ "clones": 0.25,
+ "complexity": 0.20,
+ "coupling": 0.10,
+ "cohesion": 0.15,
+ "dead_code": 0.10,
+ "dependencies": 0.10,
+ "coverage": 0.10,
+}
+
+
+class ExitCode(IntEnum):
+ SUCCESS = 0
+ CONTRACT_ERROR = 2
+ GATING_FAILURE = 3
+ INTERNAL_ERROR = 5
+
+
+REPOSITORY_URL: Final = "https://github.com/orenlab/codeclone"
+ISSUES_URL: Final = "https://github.com/orenlab/codeclone/issues"
+DOCS_URL: Final = "https://orenlab.github.io/codeclone/"
+
+
+def cli_help_epilog() -> str:
+ return "\n".join(
+ [
+ "Exit codes:",
+ " 0 Success.",
+ " 2 Contract error: untrusted or invalid baseline, invalid output",
+ " configuration, incompatible versions, or unreadable sources in",
+ " CI/gating mode.",
+ " 3 Gating failure: new clones, threshold violations, or metrics",
+ " quality gate failures.",
+ " 5 Internal error: unexpected exception.",
+ "",
+ f"Repository: {REPOSITORY_URL}",
+ f"Issues: {ISSUES_URL}",
+ f"Docs: {DOCS_URL}",
+ ]
+ )
+
+
+__all__ = [
+ "BASELINE_FINGERPRINT_VERSION",
+ "BASELINE_SCHEMA_VERSION",
+ "CACHE_VERSION",
+ "COHESION_RISK_MEDIUM_MAX",
+ "COMPLEXITY_RISK_LOW_MAX",
+ "COMPLEXITY_RISK_MEDIUM_MAX",
+ "COUPLING_RISK_LOW_MAX",
+ "COUPLING_RISK_MEDIUM_MAX",
+ "DEFAULT_BASELINE_PATH",
+ "DEFAULT_BLOCK_MIN_LOC",
+ "DEFAULT_BLOCK_MIN_STMT",
+ "DEFAULT_COHESION_THRESHOLD",
+ "DEFAULT_COMPLEXITY_THRESHOLD",
+ "DEFAULT_COUPLING_THRESHOLD",
+ "DEFAULT_COVERAGE_MIN",
+ "DEFAULT_HEALTH_THRESHOLD",
+ "DEFAULT_HTML_REPORT_PATH",
+ "DEFAULT_JSON_REPORT_PATH",
+ "DEFAULT_MARKDOWN_REPORT_PATH",
+ "DEFAULT_MAX_BASELINE_SIZE_MB",
+ "DEFAULT_MAX_CACHE_SIZE_MB",
+ "DEFAULT_MIN_LOC",
+ "DEFAULT_MIN_STMT",
+ "DEFAULT_PROCESSES",
+ "DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD",
+ "DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD",
+ "DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD",
+ "DEFAULT_ROOT",
+ "DEFAULT_SARIF_REPORT_PATH",
+ "DEFAULT_SEGMENT_MIN_LOC",
+ "DEFAULT_SEGMENT_MIN_STMT",
+ "DEFAULT_TEXT_REPORT_PATH",
+ "DOCS_URL",
+ "HEALTH_DEPENDENCY_CYCLE_PENALTY",
+ "HEALTH_DEPENDENCY_DEPTH_AVG_MULTIPLIER",
+ "HEALTH_DEPENDENCY_DEPTH_LEVEL_PENALTY",
+ "HEALTH_DEPENDENCY_DEPTH_P95_MARGIN",
+ "HEALTH_WEIGHTS",
+ "ISSUES_URL",
+ "METRICS_BASELINE_SCHEMA_VERSION",
+ "REPORT_SCHEMA_VERSION",
+ "REPOSITORY_URL",
+ "ExitCode",
+ "cli_help_epilog",
+]
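The health weights are normalized, so a weighted sum of per-category scores in [0, 100] stays in [0, 100]. A sanity sketch only, not the scoring implementation (the actual aggregation lives elsewhere in the codebase, and the per-category scores below are hypothetical):

```python
from codeclone.contracts import DEFAULT_HEALTH_THRESHOLD, HEALTH_WEIGHTS

assert abs(sum(HEALTH_WEIGHTS.values()) - 1.0) < 1e-9

scores = dict.fromkeys(HEALTH_WEIGHTS, 60)  # hypothetical per-category scores
health = round(sum(HEALTH_WEIGHTS[key] * scores[key] for key in HEALTH_WEIGHTS))
assert health == DEFAULT_HEALTH_THRESHOLD == 60
```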
diff --git a/codeclone/errors.py b/codeclone/contracts/errors.py
similarity index 85%
rename from codeclone/errors.py
rename to codeclone/contracts/errors.py
index 7b9331f..f19c34b 100644
--- a/codeclone/errors.py
+++ b/codeclone/contracts/errors.py
@@ -37,3 +37,14 @@ class BaselineValidationError(BaselineSchemaError):
def __init__(self, message: str, *, status: str = "invalid_type") -> None:
super().__init__(message)
self.status = status
+
+
+__all__ = [
+ "BaselineSchemaError",
+ "BaselineValidationError",
+ "CacheError",
+ "CodeCloneError",
+ "FileProcessingError",
+ "ParseError",
+ "ValidationError",
+]
diff --git a/codeclone/contracts/schemas.py b/codeclone/contracts/schemas.py
new file mode 100644
index 0000000..ec1eb49
--- /dev/null
+++ b/codeclone/contracts/schemas.py
@@ -0,0 +1,85 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from typing import TypedDict
+
+
+class AnalysisProfile(TypedDict):
+ min_loc: int
+ min_stmt: int
+ block_min_loc: int
+ block_min_stmt: int
+ segment_min_loc: int
+ segment_min_stmt: int
+ collect_api_surface: bool
+
+
+class AnalysisProfileMeta(TypedDict):
+ min_loc: int
+ min_stmt: int
+ block_min_loc: int
+ block_min_stmt: int
+ segment_min_loc: int
+ segment_min_stmt: int
+
+
+class ReportMeta(TypedDict):
+ """
+ Canonical report metadata contract shared by HTML, JSON, and TXT reports.
+
+ Key semantics:
+ - python_version: runtime major.minor string for human readability (e.g. "3.14")
+ - python_tag: runtime compatibility tag used by baseline/cache contracts
+ (e.g. "cp314")
+ - baseline_*: values loaded from baseline metadata for audit/provenance
+ - cache_*: cache status/provenance for run transparency
+ """
+
+ codeclone_version: str
+ project_name: str
+ scan_root: str
+ python_version: str
+ python_tag: str
+ baseline_path: str
+ baseline_fingerprint_version: str | None
+ baseline_schema_version: str | None
+ baseline_python_tag: str | None
+ baseline_generator_name: str | None
+ baseline_generator_version: str | None
+ baseline_payload_sha256: str | None
+ baseline_payload_sha256_verified: bool
+ baseline_loaded: bool
+ baseline_status: str
+ cache_path: str
+ cache_used: bool
+ cache_status: str
+ cache_schema_version: str | None
+ files_skipped_source_io: int
+ metrics_baseline_path: str
+ metrics_baseline_loaded: bool
+ metrics_baseline_status: str
+ metrics_baseline_schema_version: str | None
+ metrics_baseline_payload_sha256: str | None
+ metrics_baseline_payload_sha256_verified: bool
+ health_score: int | None
+ health_grade: str | None
+ analysis_mode: str
+ metrics_computed: list[str]
+ analysis_profile: AnalysisProfileMeta
+ design_complexity_threshold: int
+ design_coupling_threshold: int
+ design_cohesion_threshold: int
+ analysis_started_at_utc: str | None
+ report_generated_at_utc: str
+
+
+__all__ = [
+ "AnalysisProfile",
+ "AnalysisProfileMeta",
+ "ReportMeta",
+]
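These TypedDicts are structural contracts: they are plain dicts at runtime and are enforced only by the type checker. A minimal sketch, using the default analysis thresholds from `contracts`:

```python
from codeclone.contracts.schemas import AnalysisProfileMeta

profile: AnalysisProfileMeta = {
    "min_loc": 10,
    "min_stmt": 6,
    "block_min_loc": 20,
    "block_min_stmt": 8,
    "segment_min_loc": 20,
    "segment_min_stmt": 10,
}
assert isinstance(profile, dict)  # no runtime class is involved
```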
diff --git a/codeclone/core/__init__.py b/codeclone/core/__init__.py
new file mode 100644
index 0000000..557317f
--- /dev/null
+++ b/codeclone/core/__init__.py
@@ -0,0 +1,4 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
diff --git a/codeclone/core/_types.py b/codeclone/core/_types.py
new file mode 100644
index 0000000..3336374
--- /dev/null
+++ b/codeclone/core/_types.py
@@ -0,0 +1,353 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from argparse import Namespace
+from collections.abc import Mapping
+from dataclasses import dataclass
+from hashlib import sha256
+from pathlib import Path
+
+import orjson
+
+from ..analysis.normalizer import NormalizationConfig
+from ..cache.entries import FileStat
+from ..cache.projection import SegmentReportProjection
+from ..contracts import DEFAULT_PROCESSES
+from ..models import (
+ BlockUnit,
+ ClassMetrics,
+ CoverageJoinResult,
+ DeadCandidate,
+ FileMetrics,
+ GroupItem,
+ GroupItemLike,
+ ModuleApiSurface,
+ ModuleDep,
+ ModuleDocstringCoverage,
+ ModuleTypingCoverage,
+ ProjectMetrics,
+ SecuritySurface,
+ SegmentGroupItem,
+ SegmentUnit,
+ StructuralFindingGroup,
+ Suggestion,
+ SuppressedCloneGroup,
+ Unit,
+)
+from ..utils.coerce import as_int, as_mapping, as_str
+
+MAX_FILE_SIZE = 10 * 1024 * 1024
+DEFAULT_BATCH_SIZE = 100
+PARALLEL_MIN_FILES_PER_WORKER = 8
+PARALLEL_MIN_FILES_FLOOR = 16
+DEFAULT_RUNTIME_PROCESSES = DEFAULT_PROCESSES
+
+
+@dataclass(frozen=True, slots=True)
+class OutputPaths:
+ html: Path | None = None
+ json: Path | None = None
+ text: Path | None = None
+ md: Path | None = None
+ sarif: Path | None = None
+
+
+@dataclass(frozen=True, slots=True)
+class BootstrapResult:
+ root: Path
+ config: NormalizationConfig
+ args: Namespace
+ output_paths: OutputPaths
+ cache_path: Path
+
+
+@dataclass(frozen=True, slots=True)
+class DiscoveryResult:
+ files_found: int
+ cache_hits: int
+ files_skipped: int
+ all_file_paths: tuple[str, ...]
+ cached_units: tuple[GroupItem, ...]
+ cached_blocks: tuple[GroupItem, ...]
+ cached_segments: tuple[GroupItem, ...]
+ cached_class_metrics: tuple[ClassMetrics, ...]
+ cached_module_deps: tuple[ModuleDep, ...]
+ cached_dead_candidates: tuple[DeadCandidate, ...]
+ cached_referenced_names: frozenset[str]
+ files_to_process: tuple[str, ...]
+ skipped_warnings: tuple[str, ...]
+ cached_security_surfaces: tuple[SecuritySurface, ...] = ()
+ cached_referenced_qualnames: frozenset[str] = frozenset()
+ cached_typing_modules: tuple[ModuleTypingCoverage, ...] = ()
+ cached_docstring_modules: tuple[ModuleDocstringCoverage, ...] = ()
+ cached_api_modules: tuple[ModuleApiSurface, ...] = ()
+ cached_structural_findings: tuple[StructuralFindingGroup, ...] = ()
+ cached_segment_report_projection: SegmentReportProjection | None = None
+ cached_lines: int = 0
+ cached_functions: int = 0
+ cached_methods: int = 0
+ cached_classes: int = 0
+ cached_source_stats_by_file: tuple[tuple[str, int, int, int, int], ...] = ()
+
+
+@dataclass(frozen=True, slots=True)
+class FileProcessResult:
+ filepath: str
+ success: bool
+ error: str | None = None
+ units: list[Unit] | None = None
+ blocks: list[BlockUnit] | None = None
+ segments: list[SegmentUnit] | None = None
+ lines: int = 0
+ functions: int = 0
+ methods: int = 0
+ classes: int = 0
+ stat: FileStat | None = None
+ error_kind: str | None = None
+ file_metrics: FileMetrics | None = None
+ structural_findings: list[StructuralFindingGroup] | None = None
+
+
+@dataclass(frozen=True, slots=True)
+class ProcessingResult:
+ units: tuple[GroupItem, ...]
+ blocks: tuple[GroupItem, ...]
+ segments: tuple[GroupItem, ...]
+ class_metrics: tuple[ClassMetrics, ...]
+ module_deps: tuple[ModuleDep, ...]
+ dead_candidates: tuple[DeadCandidate, ...]
+ referenced_names: frozenset[str]
+ files_analyzed: int
+ files_skipped: int
+ analyzed_lines: int
+ analyzed_functions: int
+ analyzed_methods: int
+ analyzed_classes: int
+ failed_files: tuple[str, ...]
+ source_read_failures: tuple[str, ...]
+ security_surfaces: tuple[SecuritySurface, ...] = ()
+ referenced_qualnames: frozenset[str] = frozenset()
+ typing_modules: tuple[ModuleTypingCoverage, ...] = ()
+ docstring_modules: tuple[ModuleDocstringCoverage, ...] = ()
+ api_modules: tuple[ModuleApiSurface, ...] = ()
+ structural_findings: tuple[StructuralFindingGroup, ...] = ()
+ source_stats_by_file: tuple[tuple[str, int, int, int, int], ...] = ()
+
+
+@dataclass(frozen=True, slots=True)
+class AnalysisResult:
+ func_groups: Mapping[str, list[GroupItem]]
+ block_groups: Mapping[str, list[GroupItem]]
+ block_groups_report: Mapping[str, list[GroupItem]]
+ segment_groups: Mapping[str, list[GroupItem]]
+ suppressed_segment_groups: int
+ block_group_facts: dict[str, dict[str, str]]
+ func_clones_count: int
+ block_clones_count: int
+ segment_clones_count: int
+ files_analyzed_or_cached: int
+ project_metrics: ProjectMetrics | None
+ metrics_payload: dict[str, object] | None
+ suggestions: tuple[Suggestion, ...]
+ segment_groups_raw_digest: str
+ suppressed_clone_groups: tuple[SuppressedCloneGroup, ...] = ()
+ coverage_join: CoverageJoinResult | None = None
+ suppressed_dead_code_items: int = 0
+ structural_findings: tuple[StructuralFindingGroup, ...] = ()
+
+
+@dataclass(frozen=True, slots=True)
+class ReportArtifacts:
+ html: str | None = None
+ json: str | None = None
+ text: str | None = None
+ md: str | None = None
+ sarif: str | None = None
+ report_document: dict[str, object] | None = None
+
+
+def _as_sorted_str_tuple(value: object) -> tuple[str, ...]:
+ if not isinstance(value, list):
+ return ()
+ return tuple(sorted({item for item in value if isinstance(item, str) and item}))
+
+
+def _group_item_sort_key(item: GroupItemLike) -> tuple[str, int, int, str]:
+ return (
+ as_str(item.get("filepath")),
+ as_int(item.get("start_line")),
+ as_int(item.get("end_line")),
+ as_str(item.get("qualname")),
+ )
+
+
+def _segment_projection_item_sort_key(
+ item: GroupItemLike,
+) -> tuple[str, str, int, int]:
+ return (
+ as_str(item.get("filepath")),
+ as_str(item.get("qualname")),
+ as_int(item.get("start_line")),
+ as_int(item.get("end_line")),
+ )
+
+
+def _segment_groups_digest(segment_groups: Mapping[str, list[GroupItem]]) -> str:
+ normalized_rows: list[
+ tuple[str, tuple[tuple[str, str, int, int, int, str, str], ...]]
+ ] = []
+ for group_key in sorted(segment_groups):
+ items = sorted(segment_groups[group_key], key=_segment_projection_item_sort_key)
+ normalized_items = [
+ (
+ as_str(item.get("filepath")),
+ as_str(item.get("qualname")),
+ as_int(item.get("start_line")),
+ as_int(item.get("end_line")),
+ as_int(item.get("size")),
+ as_str(item.get("segment_hash")),
+ as_str(item.get("segment_sig")),
+ )
+ for item in items
+ ]
+ normalized_rows.append((group_key, tuple(normalized_items)))
+ payload = orjson.dumps(tuple(normalized_rows), option=orjson.OPT_SORT_KEYS)
+ return sha256(payload).hexdigest()
+
+
+def _coerce_segment_report_projection(
+ value: object,
+) -> SegmentReportProjection | None:
+ row = as_mapping(value)
+ if not row:
+ return None
+ match row.get("digest"), row.get("suppressed"), row.get("groups"):
+ case str() as digest, int() as suppressed, dict() as groups:
+ pass
+ case _:
+ return None
+ normalized_groups: dict[str, list[SegmentGroupItem]] = {}
+ for group_key, items in groups.items():
+ if not isinstance(group_key, str) or not isinstance(items, list):
+ return None
+ normalized_items: list[SegmentGroupItem] = []
+ for item in items:
+ if not isinstance(item, dict):
+ return None
+ segment_hash = item.get("segment_hash")
+ segment_sig = item.get("segment_sig")
+ filepath = item.get("filepath")
+ qualname = item.get("qualname")
+ start_line = item.get("start_line")
+ end_line = item.get("end_line")
+ size = item.get("size")
+ if not (
+ isinstance(segment_hash, str)
+ and isinstance(segment_sig, str)
+ and isinstance(filepath, str)
+ and isinstance(qualname, str)
+ and isinstance(start_line, int)
+ and isinstance(end_line, int)
+ and isinstance(size, int)
+ ):
+ return None
+ normalized_items.append(
+ SegmentGroupItem(
+ segment_hash=segment_hash,
+ segment_sig=segment_sig,
+ filepath=filepath,
+ qualname=qualname,
+ start_line=start_line,
+ end_line=end_line,
+ size=size,
+ )
+ )
+ normalized_groups[group_key] = normalized_items
+ return {
+ "digest": digest,
+ "suppressed": suppressed,
+ "groups": normalized_groups,
+ }
+
+
+def _module_dep_sort_key(dep: ModuleDep) -> tuple[str, str, str, int]:
+ return dep.source, dep.target, dep.import_type, dep.line
+
+
+def _class_metric_sort_key(metric: ClassMetrics) -> tuple[str, int, int, str]:
+ return metric.filepath, metric.start_line, metric.end_line, metric.qualname
+
+
+def _dead_candidate_sort_key(item: DeadCandidate) -> tuple[str, int, int, str]:
+ return item.filepath, item.start_line, item.end_line, item.qualname
+
+
+def _module_names_from_units(units: tuple[GroupItemLike, ...]) -> frozenset[str]:
+ modules: set[str] = set()
+ for item in units:
+ qualname = as_str(item.get("qualname")) if isinstance(item, Mapping) else ""
+ module_name = qualname.split(":", 1)[0] if ":" in qualname else qualname
+ if module_name:
+ modules.add(module_name)
+ return frozenset(sorted(modules))
+
+
+def _unit_to_group_item(unit: Unit) -> GroupItem:
+ return {
+ "qualname": unit.qualname,
+ "filepath": unit.filepath,
+ "start_line": unit.start_line,
+ "end_line": unit.end_line,
+ "loc": unit.loc,
+ "stmt_count": unit.stmt_count,
+ "fingerprint": unit.fingerprint,
+ "loc_bucket": unit.loc_bucket,
+ "cyclomatic_complexity": unit.cyclomatic_complexity,
+ "nesting_depth": unit.nesting_depth,
+ "risk": unit.risk,
+ "raw_hash": unit.raw_hash,
+ "entry_guard_count": unit.entry_guard_count,
+ "entry_guard_terminal_profile": unit.entry_guard_terminal_profile,
+ "entry_guard_has_side_effect_before": unit.entry_guard_has_side_effect_before,
+ "terminal_kind": unit.terminal_kind,
+ "try_finally_profile": unit.try_finally_profile,
+ "side_effect_order_profile": unit.side_effect_order_profile,
+ }
+
+
+def _block_to_group_item(block: BlockUnit) -> GroupItem:
+ return {
+ "block_hash": block.block_hash,
+ "filepath": block.filepath,
+ "qualname": block.qualname,
+ "start_line": block.start_line,
+ "end_line": block.end_line,
+ "size": block.size,
+ }
+
+
+def _segment_to_group_item(segment: SegmentUnit) -> GroupItem:
+ return {
+ "filepath": segment.filepath,
+ "qualname": segment.qualname,
+ "start_line": segment.start_line,
+ "end_line": segment.end_line,
+ "size": segment.size,
+ "segment_hash": segment.segment_hash,
+ "segment_sig": segment.segment_sig,
+ }
+
+
+def _should_collect_structural_findings(output_paths: OutputPaths) -> bool:
+ return bool(
+ output_paths.html
+ or output_paths.json
+ or output_paths.md
+ or output_paths.text
+ or output_paths.sarif
+ )
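`_segment_groups_digest` sorts group keys and items before hashing, so the fingerprint is independent of insertion order. A minimal sketch of that property — the helper is module-private, and the group items below are hypothetical:

```python
from codeclone.core._types import _segment_groups_digest

item_a = {
    "filepath": "pkg/a.py", "qualname": "pkg.a:f",
    "start_line": 1, "end_line": 9, "size": 9,
    "segment_hash": "h1", "segment_sig": "s1",
}
item_b = {**item_a, "filepath": "pkg/b.py", "qualname": "pkg.b:g"}

# Same group contents, different insertion order: identical digest.
assert _segment_groups_digest({"g1": [item_a, item_b]}) == _segment_groups_digest(
    {"g1": [item_b, item_a]}
)
```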
diff --git a/codeclone/core/api_surface_payload.py b/codeclone/core/api_surface_payload.py
new file mode 100644
index 0000000..7ec0ff8
--- /dev/null
+++ b/codeclone/core/api_surface_payload.py
@@ -0,0 +1,98 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+from ..models import ApiBreakingChange, ApiSurfaceSnapshot
+from ..utils.coerce import as_int, as_str
+
+
+def _api_surface_summary(api_surface: ApiSurfaceSnapshot | None) -> dict[str, object]:
+ modules = api_surface.modules if api_surface is not None else ()
+ return {
+ "enabled": api_surface is not None,
+ "modules": len(modules),
+ "public_symbols": sum(len(module.symbols) for module in modules),
+ "added": 0,
+ "breaking": 0,
+ "strict_types": False,
+ }
+
+
+def _api_surface_rows(
+ api_surface: ApiSurfaceSnapshot | None,
+) -> list[dict[str, object]]:
+ if api_surface is None:
+ return []
+ rows: list[dict[str, object]] = []
+ for module in api_surface.modules:
+ rows.extend(
+ {
+ "record_kind": "symbol",
+ "module": module.module,
+ "filepath": module.filepath,
+ "qualname": symbol.qualname,
+ "start_line": symbol.start_line,
+ "end_line": symbol.end_line,
+ "symbol_kind": symbol.kind,
+ "exported_via": symbol.exported_via,
+ "params_total": len(symbol.params),
+ "params": [
+ {
+ "name": param.name,
+ "kind": param.kind,
+ "has_default": param.has_default,
+ "annotated": bool(param.annotation_hash),
+ }
+ for param in symbol.params
+ ],
+ "returns_annotated": bool(symbol.returns_hash),
+ }
+ for symbol in module.symbols
+ )
+ return sorted(
+ rows,
+ key=lambda item: (
+ as_str(item.get("filepath")),
+ as_int(item.get("start_line")),
+ as_int(item.get("end_line")),
+ as_str(item.get("qualname")),
+ as_str(item.get("record_kind")),
+ ),
+ )
+
+
+def _breaking_api_surface_rows(changes: Sequence[object]) -> list[dict[str, object]]:
+ rows: list[dict[str, object]] = []
+ for change in changes:
+ if not isinstance(change, ApiBreakingChange):
+ continue
+ module_name, _, _local_name = change.qualname.partition(":")
+ rows.append(
+ {
+ "record_kind": "breaking_change",
+ "module": module_name,
+ "filepath": change.filepath,
+ "qualname": change.qualname,
+ "start_line": change.start_line,
+ "end_line": change.end_line,
+ "symbol_kind": change.symbol_kind,
+ "change_kind": change.change_kind,
+ "detail": change.detail,
+ }
+ )
+ return sorted(
+ rows,
+ key=lambda item: (
+ as_str(item.get("filepath")),
+ as_int(item.get("start_line")),
+ as_int(item.get("end_line")),
+ as_str(item.get("qualname")),
+ as_str(item.get("change_kind")),
+ ),
+ )
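With the API-surface stage disabled, the summary keeps a stable shape, which spares report templates from None checks. A minimal sketch of that behavior:

```python
from codeclone.core.api_surface_payload import _api_surface_summary

summary = _api_surface_summary(None)
assert summary["enabled"] is False
assert summary["modules"] == 0 and summary["public_symbols"] == 0
```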
diff --git a/codeclone/core/bootstrap.py b/codeclone/core/bootstrap.py
new file mode 100644
index 0000000..1043a27
--- /dev/null
+++ b/codeclone/core/bootstrap.py
@@ -0,0 +1,41 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from argparse import Namespace
+from pathlib import Path
+
+from ..analysis.normalizer import NormalizationConfig
+from ._types import BootstrapResult, OutputPaths
+
+
+def bootstrap(
+ *,
+ args: Namespace,
+ root: Path,
+ output_paths: OutputPaths,
+ cache_path: Path,
+) -> BootstrapResult:
+ return BootstrapResult(
+ root=root,
+ config=NormalizationConfig(),
+ args=args,
+ output_paths=output_paths,
+ cache_path=cache_path,
+ )
+
+
+def _resolve_optional_runtime_path(value: object, *, root: Path) -> Path | None:
+ text = str(value).strip() if value is not None else ""
+ if not text:
+ return None
+ candidate = Path(text).expanduser()
+ resolved = candidate if candidate.is_absolute() else root / candidate
+ try:
+ return resolved.resolve()
+ except OSError:
+ return resolved.absolute()
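`_resolve_optional_runtime_path` treats blank or None values as "no path" and anchors relative values at the project root. A minimal sketch, using a hypothetical root:

```python
from pathlib import Path

from codeclone.core.bootstrap import _resolve_optional_runtime_path

root = Path("/tmp/proj")  # hypothetical project root
assert _resolve_optional_runtime_path(None, root=root) is None
assert _resolve_optional_runtime_path("   ", root=root) is None

resolved = _resolve_optional_runtime_path("out/report.html", root=root)
assert resolved is not None and resolved.is_absolute()
```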
diff --git a/codeclone/core/coverage_payload.py b/codeclone/core/coverage_payload.py
new file mode 100644
index 0000000..1380b71
--- /dev/null
+++ b/codeclone/core/coverage_payload.py
@@ -0,0 +1,173 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from ..models import CoverageJoinResult, ProjectMetrics
+from ..utils.coerce import as_int, as_str
+
+
+def _permille(numerator: int, denominator: int) -> int:
+ if denominator <= 0:
+ return 0
+ return round((1000.0 * float(numerator)) / float(denominator))
+
+
+def _coverage_join_summary(
+ coverage_join: CoverageJoinResult | None,
+) -> dict[str, object]:
+ if coverage_join is None:
+ return {}
+ return {
+ "status": coverage_join.status,
+ "source": coverage_join.coverage_xml,
+ "files": coverage_join.files,
+ "units": len(coverage_join.units),
+ "measured_units": coverage_join.measured_units,
+ "overall_executable_lines": coverage_join.overall_executable_lines,
+ "overall_covered_lines": coverage_join.overall_covered_lines,
+ "overall_permille": _permille(
+ coverage_join.overall_covered_lines,
+ coverage_join.overall_executable_lines,
+ ),
+ "missing_from_report_units": sum(
+ 1
+ for fact in coverage_join.units
+ if fact.coverage_status == "missing_from_report"
+ ),
+ "coverage_hotspots": coverage_join.coverage_hotspots,
+ "scope_gap_hotspots": coverage_join.scope_gap_hotspots,
+ "hotspot_threshold_percent": coverage_join.hotspot_threshold_percent,
+ "invalid_reason": coverage_join.invalid_reason,
+ }
+
+
+def _coverage_join_rows(
+ coverage_join: CoverageJoinResult | None,
+) -> list[dict[str, object]]:
+ if coverage_join is None or coverage_join.status != "ok":
+ return []
+    threshold = float(coverage_join.hotspot_threshold_percent)
+    rows: list[dict[str, object]] = []
+    for fact in coverage_join.units:
+        risky = fact.risk in {"medium", "high"}
+        coverage_hotspot = (
+            risky
+            and fact.coverage_status == "measured"
+            and (fact.coverage_permille / 10.0) < threshold
+        )
+        scope_gap_hotspot = risky and fact.coverage_status == "missing_from_report"
+        rows.append(
+            {
+                "qualname": fact.qualname,
+                "filepath": fact.filepath,
+                "start_line": fact.start_line,
+                "end_line": fact.end_line,
+                "cyclomatic_complexity": fact.cyclomatic_complexity,
+                "risk": fact.risk,
+                "executable_lines": fact.executable_lines,
+                "covered_lines": fact.covered_lines,
+                "coverage_permille": fact.coverage_permille,
+                "coverage_status": fact.coverage_status,
+                "coverage_hotspot": coverage_hotspot,
+                "scope_gap_hotspot": scope_gap_hotspot,
+                # A review item is any hotspot of either kind.
+                "coverage_review_item": coverage_hotspot or scope_gap_hotspot,
+            }
+        )
+    return sorted(
+        rows,
+        key=lambda item: (
+            0 if bool(item.get("coverage_hotspot")) else 1,
+            0 if bool(item.get("scope_gap_hotspot")) else 1,
+            {"high": 0, "medium": 1, "low": 2}.get(as_str(item.get("risk")), 3),
+            as_int(item.get("coverage_permille"), 0),
+            -as_int(item.get("cyclomatic_complexity"), 0),
+            as_str(item.get("filepath")),
+            as_int(item.get("start_line")),
+            as_str(item.get("qualname")),
+        ),
+    )
+
+
+def _coverage_adoption_rows(project_metrics: ProjectMetrics) -> list[dict[str, object]]:
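+    """Merge typing and docstring coverage into per-module adoption rows."""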
+ docstring_by_module = {
+ (item.filepath, item.module): item for item in project_metrics.docstring_modules
+ }
+ rows: list[dict[str, object]] = []
+ seen_keys: set[tuple[str, str]] = set()
+ for typing_item in project_metrics.typing_modules:
+ key = (typing_item.filepath, typing_item.module)
+ seen_keys.add(key)
+ docstring_item = docstring_by_module.get(key)
+ doc_total = docstring_item.public_symbol_total if docstring_item else 0
+ doc_documented = (
+ docstring_item.public_symbol_documented if docstring_item else 0
+ )
+ rows.append(
+ {
+ "module": typing_item.module,
+ "filepath": typing_item.filepath,
+ "callable_count": typing_item.callable_count,
+ "params_total": typing_item.params_total,
+ "params_annotated": typing_item.params_annotated,
+ "param_permille": _permille(
+ typing_item.params_annotated,
+ typing_item.params_total,
+ ),
+ "returns_total": typing_item.returns_total,
+ "returns_annotated": typing_item.returns_annotated,
+ "return_permille": _permille(
+ typing_item.returns_annotated,
+ typing_item.returns_total,
+ ),
+ "any_annotation_count": typing_item.any_annotation_count,
+ "public_symbol_total": doc_total,
+ "public_symbol_documented": doc_documented,
+ "docstring_permille": _permille(doc_documented, doc_total),
+ }
+ )
+ for docstring_item in project_metrics.docstring_modules:
+ key = (docstring_item.filepath, docstring_item.module)
+ if key in seen_keys:
+ continue
+ rows.append(
+ {
+ "module": docstring_item.module,
+ "filepath": docstring_item.filepath,
+ "callable_count": 0,
+ "params_total": 0,
+ "params_annotated": 0,
+ "param_permille": 0,
+ "returns_total": 0,
+ "returns_annotated": 0,
+ "return_permille": 0,
+ "any_annotation_count": 0,
+ "public_symbol_total": docstring_item.public_symbol_total,
+ "public_symbol_documented": docstring_item.public_symbol_documented,
+ "docstring_permille": _permille(
+ docstring_item.public_symbol_documented,
+ docstring_item.public_symbol_total,
+ ),
+ }
+ )
+ return sorted(
+ rows,
+ key=lambda item: (
+ as_int(item.get("param_permille")),
+ as_int(item.get("docstring_permille")),
+ as_int(item.get("return_permille")),
+ as_str(item.get("module")),
+ ),
+ )
diff --git a/codeclone/core/discovery.py b/codeclone/core/discovery.py
new file mode 100644
index 0000000..99ddcf1
--- /dev/null
+++ b/codeclone/core/discovery.py
@@ -0,0 +1,220 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from collections.abc import Mapping, Sequence
+
+from ..cache.store import Cache, file_stat_signature
+from ..models import (
+ ClassMetrics,
+ DeadCandidate,
+ GroupItem,
+ ModuleApiSurface,
+ ModuleDep,
+ ModuleDocstringCoverage,
+ ModuleTypingCoverage,
+ SecuritySurface,
+ StructuralFindingGroup,
+)
+from ..scanner import iter_py_files
+from ._types import (
+ BootstrapResult,
+ DiscoveryResult,
+ _class_metric_sort_key,
+ _coerce_segment_report_projection,
+ _dead_candidate_sort_key,
+ _group_item_sort_key,
+ _module_dep_sort_key,
+ _should_collect_structural_findings,
+)
+from .discovery_cache import (
+ decode_cached_structural_finding_group as _decode_cached_structural_finding_group,
+)
+from .discovery_cache import (
+ load_cached_metrics_extended as _load_cached_metrics_extended,
+)
+from .discovery_cache import usable_cached_source_stats as _usable_cached_source_stats
+
+DiscoveryBuffers = tuple[
+ list[GroupItem],
+ list[GroupItem],
+ list[GroupItem],
+ list[ClassMetrics],
+ list[ModuleDep],
+ list[DeadCandidate],
+ set[str],
+ set[str],
+ list[ModuleTypingCoverage],
+ list[ModuleDocstringCoverage],
+ list[ModuleApiSurface],
+ list[SecuritySurface],
+ list[str],
+ list[str],
+]
+
+
+def _group_items_from_cache(rows: Sequence[Mapping[str, object]]) -> list[GroupItem]:
+ return [dict(row) for row in rows]
+
+
+def _new_discovery_buffers() -> DiscoveryBuffers:
+ # Keep buffer order aligned with DiscoveryBuffers above.
+ return [], [], [], [], [], [], set(), set(), [], [], [], [], [], []
+
+
+def discover(*, boot: BootstrapResult, cache: Cache) -> DiscoveryResult:
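+    """Walk Python files under the scan root, reusing cache entries whose
+    stat signature still matches; the rest are queued for processing."""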
+ files_found = 0
+ cache_hits = 0
+ files_skipped = 0
+ collect_structural_findings = _should_collect_structural_findings(boot.output_paths)
+ cached_segment_projection = _coerce_segment_report_projection(
+ getattr(cache, "segment_report_projection", None)
+ )
+ (
+ cached_units,
+ cached_blocks,
+ cached_segments,
+ cached_class_metrics,
+ cached_module_deps,
+ cached_dead_candidates,
+ cached_referenced_names,
+ cached_referenced_qualnames,
+ cached_typing_modules,
+ cached_docstring_modules,
+ cached_api_modules,
+ cached_security_surfaces,
+ files_to_process,
+ skipped_warnings,
+ ) = _new_discovery_buffers()
+ cached_sf: list[StructuralFindingGroup] = []
+ cached_source_stats_by_file: list[tuple[str, int, int, int, int]] = []
+ cached_lines = 0
+ cached_functions = 0
+ cached_methods = 0
+ cached_classes = 0
+ all_file_paths: list[str] = []
+
+ for filepath in iter_py_files(str(boot.root)):
+ files_found += 1
+ all_file_paths.append(filepath)
+ try:
+ stat = file_stat_signature(filepath)
+ except OSError as exc:
+ files_skipped += 1
+ skipped_warnings.append(f"{filepath}: {exc}")
+ continue
+ cached = cache.get_file_entry(filepath)
+ if cached and cached.get("stat") == stat:
+ cached_source_stats = _usable_cached_source_stats(
+ cached,
+ skip_metrics=boot.args.skip_metrics,
+ collect_structural_findings=collect_structural_findings,
+ )
+ if cached_source_stats is None:
+ files_to_process.append(filepath)
+ continue
+ cache_hits += 1
+ lines, functions, methods, classes = cached_source_stats
+ cached_lines += lines
+ cached_functions += functions
+ cached_methods += methods
+ cached_classes += classes
+ cached_source_stats_by_file.append(
+ (filepath, lines, functions, methods, classes)
+ )
+ cached_units.extend(_group_items_from_cache(cached["units"]))
+ cached_blocks.extend(_group_items_from_cache(cached["blocks"]))
+ cached_segments.extend(_group_items_from_cache(cached["segments"]))
+ if not boot.args.skip_metrics:
+ (
+ class_metrics,
+ module_deps,
+ dead_candidates,
+ referenced_names,
+ referenced_qualnames,
+ typing_coverage,
+ docstring_coverage,
+ api_surface,
+ security_surfaces,
+ ) = _load_cached_metrics_extended(cached, filepath=filepath)
+ cached_class_metrics.extend(class_metrics)
+ cached_module_deps.extend(module_deps)
+ cached_dead_candidates.extend(dead_candidates)
+ cached_referenced_names.update(referenced_names)
+ cached_referenced_qualnames.update(referenced_qualnames)
+ if typing_coverage is not None:
+ cached_typing_modules.append(typing_coverage)
+ if docstring_coverage is not None:
+ cached_docstring_modules.append(docstring_coverage)
+ if api_surface is not None:
+ cached_api_modules.append(api_surface)
+ cached_security_surfaces.extend(security_surfaces)
+ if collect_structural_findings:
+ cached_sf.extend(
+ _decode_cached_structural_finding_group(group_dict, filepath)
+ for group_dict in cached.get("structural_findings") or []
+ )
+ continue
+ files_to_process.append(filepath)
+
+ cache.prune_file_entries(all_file_paths)
+
+ return DiscoveryResult(
+ files_found=files_found,
+ cache_hits=cache_hits,
+ files_skipped=files_skipped,
+ all_file_paths=tuple(all_file_paths),
+ cached_units=tuple(sorted(cached_units, key=_group_item_sort_key)),
+ cached_blocks=tuple(sorted(cached_blocks, key=_group_item_sort_key)),
+ cached_segments=tuple(sorted(cached_segments, key=_group_item_sort_key)),
+ cached_class_metrics=tuple(
+ sorted(cached_class_metrics, key=_class_metric_sort_key)
+ ),
+ cached_module_deps=tuple(sorted(cached_module_deps, key=_module_dep_sort_key)),
+ cached_dead_candidates=tuple(
+ sorted(cached_dead_candidates, key=_dead_candidate_sort_key)
+ ),
+ cached_referenced_names=frozenset(cached_referenced_names),
+ cached_security_surfaces=tuple(
+ sorted(
+ cached_security_surfaces,
+ key=lambda item: (
+ item.filepath,
+ item.start_line,
+ item.end_line,
+ item.qualname,
+ item.category,
+ item.capability,
+ item.evidence_symbol,
+ ),
+ )
+ ),
+ cached_referenced_qualnames=frozenset(cached_referenced_qualnames),
+ cached_typing_modules=tuple(
+ sorted(cached_typing_modules, key=lambda item: (item.filepath, item.module))
+ ),
+ cached_docstring_modules=tuple(
+ sorted(
+ cached_docstring_modules,
+ key=lambda item: (item.filepath, item.module),
+ )
+ ),
+ cached_api_modules=tuple(
+ sorted(cached_api_modules, key=lambda item: (item.filepath, item.module))
+ ),
+ files_to_process=tuple(files_to_process),
+ skipped_warnings=tuple(sorted(skipped_warnings)),
+ cached_structural_findings=tuple(cached_sf),
+ cached_segment_report_projection=cached_segment_projection,
+ cached_lines=cached_lines,
+ cached_functions=cached_functions,
+ cached_methods=cached_methods,
+ cached_classes=cached_classes,
+ cached_source_stats_by_file=tuple(
+ sorted(cached_source_stats_by_file, key=lambda row: row[0])
+ ),
+ )
diff --git a/codeclone/core/discovery_cache.py b/codeclone/core/discovery_cache.py
new file mode 100644
index 0000000..995fdd4
--- /dev/null
+++ b/codeclone/core/discovery_cache.py
@@ -0,0 +1,581 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+from typing import Literal
+
+from ..cache.entries import (
+ CacheEntry,
+ ClassMetricsDict,
+ DeadCandidateDict,
+ ModuleDepDict,
+ SecuritySurfaceDict,
+ StructuralFindingGroupDict,
+)
+from ..models import (
+ ApiParamSpec,
+ ClassMetrics,
+ DeadCandidate,
+ ModuleApiSurface,
+ ModuleDep,
+ ModuleDocstringCoverage,
+ ModuleTypingCoverage,
+ PublicSymbol,
+ SecuritySurface,
+ SecuritySurfaceCategory,
+ SecuritySurfaceClassificationMode,
+ SecuritySurfaceEvidenceKind,
+ SecuritySurfaceLocationScope,
+ StructuralFindingGroup,
+ StructuralFindingOccurrence,
+)
+from ..paths import is_test_filepath
+from ..utils.coerce import as_mapping
+from ._types import _as_sorted_str_tuple
+
+_ApiParamKind = Literal["pos_only", "pos_or_kw", "vararg", "kw_only", "kwarg"]
+_PublicSymbolKind = Literal["function", "class", "method", "constant"]
+_ExportedViaKind = Literal["all", "name"]
+_RiskLevel = Literal["low", "medium", "high"]
+_ImportType = Literal["import", "from_import"]
+_DeadCandidateKind = Literal["function", "class", "method", "import"]
+
+
+def _api_param_kind(value: object) -> _ApiParamKind | None:
+ match value:
+ case "pos_only":
+ return "pos_only"
+ case "pos_or_kw":
+ return "pos_or_kw"
+ case "vararg":
+ return "vararg"
+ case "kw_only":
+ return "kw_only"
+ case "kwarg":
+ return "kwarg"
+ case _:
+ return None
+
+
+def _public_symbol_kind(value: object) -> _PublicSymbolKind | None:
+ match value:
+ case "function":
+ return "function"
+ case "class":
+ return "class"
+ case "method":
+ return "method"
+ case "constant":
+ return "constant"
+ case _:
+ return None
+
+
+def _exported_via_kind(value: object) -> _ExportedViaKind | None:
+ match value:
+ case "all":
+ return "all"
+ case "name":
+ return "name"
+ case _:
+ return None
+
+
+def _risk_level(value: object) -> _RiskLevel | None:
+ match value:
+ case "low":
+ return "low"
+ case "medium":
+ return "medium"
+ case "high":
+ return "high"
+ case _:
+ return None
+
+
+def _import_type(value: object) -> _ImportType | None:
+ match value:
+ case "import":
+ return "import"
+ case "from_import":
+ return "from_import"
+ case _:
+ return None
+
+
+def _dead_candidate_kind(value: object) -> _DeadCandidateKind | None:
+ match value:
+ case "function":
+ return "function"
+ case "class":
+ return "class"
+ case "method":
+ return "method"
+ case "import":
+ return "import"
+ case _:
+ return None
+
+
+def _security_surface_category(value: object) -> SecuritySurfaceCategory | None:
+ match value:
+ case (
+ "archive_extraction"
+ | "crypto_transport"
+ | "database_boundary"
+ | "deserialization"
+ | "dynamic_execution"
+ | "dynamic_loading"
+ | "filesystem_mutation"
+ | "identity_token"
+ | "network_boundary"
+ | "process_boundary"
+ ):
+ return value
+ case _:
+ return None
+
+
+def _security_surface_location_scope(
+ value: object,
+) -> SecuritySurfaceLocationScope | None:
+ match value:
+ case "module" | "class" | "callable":
+ return value
+ case _:
+ return None
+
+
+def _security_surface_classification_mode(
+ value: object,
+) -> SecuritySurfaceClassificationMode | None:
+ match value:
+ case "exact_builtin" | "exact_call" | "exact_import":
+ return value
+ case _:
+ return None
+
+
+def _security_surface_evidence_kind(
+ value: object,
+) -> SecuritySurfaceEvidenceKind | None:
+ match value:
+ case "builtin" | "call" | "import":
+ return value
+ case _:
+ return None
+
+
+def decode_cached_structural_finding_group(
+ group_dict: StructuralFindingGroupDict,
+ filepath: str,
+) -> StructuralFindingGroup:
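+    """Rehydrate a StructuralFindingGroup from its cached dict form."""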
+ finding_kind = group_dict["finding_kind"]
+ finding_key = group_dict["finding_key"]
+ signature = group_dict["signature"]
+ items = tuple(
+ StructuralFindingOccurrence(
+ finding_kind=finding_kind,
+ finding_key=finding_key,
+ file_path=filepath,
+ qualname=item["qualname"],
+ start=item["start"],
+ end=item["end"],
+ signature=signature,
+ )
+ for item in group_dict["items"]
+ )
+ return StructuralFindingGroup(
+ finding_kind=finding_kind,
+ finding_key=finding_key,
+ signature=signature,
+ items=items,
+ )
+
+
+def _cache_entry_has_metrics(entry: CacheEntry) -> bool:
+ metric_keys = (
+ "class_metrics",
+ "module_deps",
+ "dead_candidates",
+ "referenced_names",
+ "referenced_qualnames",
+ "import_names",
+ "class_names",
+ )
+    return all(isinstance(entry.get(key), list) for key in metric_keys)
+
+
+def _cache_entry_has_structural_findings(entry: CacheEntry) -> bool:
+ return "structural_findings" in entry
+
+
+def _cache_entry_source_stats(entry: CacheEntry) -> tuple[int, int, int, int] | None:
+ stats_obj = entry.get("source_stats")
+ if not isinstance(stats_obj, dict):
+ return None
+ lines = stats_obj.get("lines")
+ functions = stats_obj.get("functions")
+ methods = stats_obj.get("methods")
+ classes = stats_obj.get("classes")
+ if not (
+ isinstance(lines, int)
+ and isinstance(functions, int)
+ and isinstance(methods, int)
+ and isinstance(classes, int)
+ and lines >= 0
+ and functions >= 0
+ and methods >= 0
+ and classes >= 0
+ ):
+ return None
+ return lines, functions, methods, classes
+
+
+def usable_cached_source_stats(
+ entry: CacheEntry,
+ *,
+ skip_metrics: bool,
+ collect_structural_findings: bool,
+) -> tuple[int, int, int, int] | None:
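+    """Return cached source stats only when the entry also satisfies the
+    current run's metric and structural-finding requirements."""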
+ if not skip_metrics and not _cache_entry_has_metrics(entry):
+ return None
+ if collect_structural_findings and not _cache_entry_has_structural_findings(entry):
+ return None
+ return _cache_entry_source_stats(entry)
+
+
+def _cache_dict_module_fields(
+ value: object,
+) -> tuple[Mapping[str, object], str, str] | None:
+ if not isinstance(value, Mapping):
+ return None
+ row = as_mapping(value)
+ module = row.get("module")
+ filepath = row.get("filepath")
+ if not isinstance(module, str) or not isinstance(filepath, str):
+ return None
+ return row, module, filepath
+
+
+def _cache_dict_int_fields(
+ row: Mapping[str, object],
+ *keys: str,
+) -> tuple[int, ...] | None:
+ values: list[int] = []
+ for key in keys:
+ value = row.get(key)
+ if not isinstance(value, int):
+ return None
+ values.append(value)
+ return tuple(values)
+
+
+def _api_param_fields(
+ row: Mapping[str, object],
+) -> tuple[str, _ApiParamKind, bool, str] | None:
+ name = row.get("name")
+ validated_kind = _api_param_kind(row.get("kind"))
+ has_default = row.get("has_default")
+ annotation_hash = row.get("annotation_hash", "")
+ if (
+ not isinstance(name, str)
+ or validated_kind is None
+ or not isinstance(has_default, bool)
+ or not isinstance(annotation_hash, str)
+ ):
+ return None
+ return name, validated_kind, has_default, annotation_hash
+
+
+def _typing_coverage_from_cache_dict(value: object) -> ModuleTypingCoverage | None:
+ row_info = _cache_dict_module_fields(value)
+ if row_info is None:
+ return None
+ row, module, filepath = row_info
+ int_fields = _cache_dict_int_fields(
+ row,
+ "callable_count",
+ "params_total",
+ "params_annotated",
+ "returns_total",
+ "returns_annotated",
+ "any_annotation_count",
+ )
+ if int_fields is None:
+ return None
+ return ModuleTypingCoverage(
+ module=module,
+ filepath=filepath,
+ callable_count=int_fields[0],
+ params_total=int_fields[1],
+ params_annotated=int_fields[2],
+ returns_total=int_fields[3],
+ returns_annotated=int_fields[4],
+ any_annotation_count=int_fields[5],
+ )
+
+
+def _docstring_coverage_from_cache_dict(
+ value: object,
+) -> ModuleDocstringCoverage | None:
+ row_info = _cache_dict_module_fields(value)
+ if row_info is None:
+ return None
+ row, module, filepath = row_info
+ totals = _cache_dict_int_fields(
+ row,
+ "public_symbol_total",
+ "public_symbol_documented",
+ )
+ if totals is None:
+ return None
+ return ModuleDocstringCoverage(
+ module=module,
+ filepath=filepath,
+ public_symbol_total=totals[0],
+ public_symbol_documented=totals[1],
+ )
+
+
+def _api_param_spec_from_cache_dict(value: object) -> ApiParamSpec | None:
+ row = as_mapping(value)
+ if not row:
+ return None
+ fields = _api_param_fields(row)
+ if fields is None:
+ return None
+ name, validated_kind, has_default, annotation_hash = fields
+ return ApiParamSpec(
+ name=name,
+ kind=validated_kind,
+ has_default=has_default,
+ annotation_hash=annotation_hash,
+ )
+
+
+def _public_symbol_from_cache_dict(value: object) -> PublicSymbol | None:
+ row = as_mapping(value)
+ if not row:
+ return None
+ qualname = row.get("qualname")
+ start_line = row.get("start_line")
+ end_line = row.get("end_line")
+ returns_hash = row.get("returns_hash", "")
+ params_raw = row.get("params", [])
+ validated_kind = _public_symbol_kind(row.get("kind"))
+ validated_exported_via = _exported_via_kind(row.get("exported_via", "name"))
+ if (
+ not isinstance(qualname, str)
+ or validated_kind is None
+ or not isinstance(start_line, int)
+ or not isinstance(end_line, int)
+ or validated_exported_via is None
+ or not isinstance(returns_hash, str)
+ or not isinstance(params_raw, list)
+ ):
+ return None
+ params: list[ApiParamSpec] = []
+ for param in params_raw:
+ if not isinstance(param, dict):
+ return None
+ parsed = _api_param_spec_from_cache_dict(param)
+ if parsed is None:
+ return None
+ params.append(parsed)
+ return PublicSymbol(
+ qualname=qualname,
+ kind=validated_kind,
+ start_line=start_line,
+ end_line=end_line,
+ params=tuple(params),
+ returns_hash=returns_hash,
+ exported_via=validated_exported_via,
+ )
+
+
+def _api_surface_from_cache_dict(value: object) -> ModuleApiSurface | None:
+ row_info = _cache_dict_module_fields(value)
+ if row_info is None:
+ return None
+ row, module, filepath = row_info
+ all_declared_raw = row.get("all_declared", [])
+ symbols_raw = row.get("symbols", [])
+ if (
+ not isinstance(all_declared_raw, list)
+ or not isinstance(symbols_raw, list)
+ or not all(isinstance(item, str) for item in all_declared_raw)
+ ):
+ return None
+ symbols: list[PublicSymbol] = []
+ for item in symbols_raw:
+ parsed = _public_symbol_from_cache_dict(item)
+ if parsed is None:
+ return None
+ symbols.append(parsed)
+ return ModuleApiSurface(
+ module=module,
+ filepath=filepath,
+ all_declared=tuple(sorted(set(all_declared_raw))) or None,
+ symbols=tuple(sorted(symbols, key=lambda item: item.qualname)),
+ )
+
+
+def _class_metric_from_cache_row(metric_row: ClassMetricsDict) -> ClassMetrics | None:
+ risk_coupling = _risk_level(metric_row["risk_coupling"])
+ risk_cohesion = _risk_level(metric_row["risk_cohesion"])
+ if (
+ not metric_row.get("qualname")
+ or not metric_row.get("filepath")
+ or risk_coupling is None
+ or risk_cohesion is None
+ ):
+ return None
+ return ClassMetrics(
+ qualname=metric_row["qualname"],
+ filepath=metric_row["filepath"],
+ start_line=metric_row["start_line"],
+ end_line=metric_row["end_line"],
+ cbo=metric_row["cbo"],
+ lcom4=metric_row["lcom4"],
+ method_count=metric_row["method_count"],
+ instance_var_count=metric_row["instance_var_count"],
+ risk_coupling=risk_coupling,
+ risk_cohesion=risk_cohesion,
+ coupled_classes=_as_sorted_str_tuple(metric_row.get("coupled_classes", [])),
+ )
+
+
+def _module_dep_from_cache_row(dep_row: ModuleDepDict) -> ModuleDep | None:
+ import_type = _import_type(dep_row["import_type"])
+ if not dep_row.get("source") or not dep_row.get("target") or import_type is None:
+ return None
+ return ModuleDep(
+ source=dep_row["source"],
+ target=dep_row["target"],
+ import_type=import_type,
+ line=dep_row["line"],
+ )
+
+
+def _dead_candidate_from_cache_row(dead_row: DeadCandidateDict) -> DeadCandidate | None:
+ kind = _dead_candidate_kind(dead_row["kind"])
+ if (
+ not dead_row.get("qualname")
+ or not dead_row.get("local_name")
+ or not dead_row.get("filepath")
+ or kind is None
+ ):
+ return None
+ return DeadCandidate(
+ qualname=dead_row["qualname"],
+ local_name=dead_row["local_name"],
+ filepath=dead_row["filepath"],
+ start_line=dead_row["start_line"],
+ end_line=dead_row["end_line"],
+ kind=kind,
+ suppressed_rules=_as_sorted_str_tuple(dead_row.get("suppressed_rules", [])),
+ )
+
+
+def _security_surface_from_cache_row(
+ surface_row: SecuritySurfaceDict,
+) -> SecuritySurface | None:
+ category = _security_surface_category(surface_row.get("category"))
+ location_scope = _security_surface_location_scope(surface_row.get("location_scope"))
+ classification_mode = _security_surface_classification_mode(
+ surface_row.get("classification_mode")
+ )
+ evidence_kind = _security_surface_evidence_kind(surface_row.get("evidence_kind"))
+ if (
+ category is None
+ or location_scope is None
+ or classification_mode is None
+ or evidence_kind is None
+ ):
+ return None
+ return SecuritySurface(
+ category=category,
+ capability=surface_row["capability"],
+ module=surface_row["module"],
+ filepath=surface_row["filepath"],
+ qualname=surface_row["qualname"],
+ start_line=surface_row["start_line"],
+ end_line=surface_row["end_line"],
+ location_scope=location_scope,
+ classification_mode=classification_mode,
+ evidence_kind=evidence_kind,
+ evidence_symbol=surface_row["evidence_symbol"],
+ )
+
+
+def load_cached_metrics_extended(
+ entry: CacheEntry,
+ *,
+ filepath: str,
+) -> tuple[
+ tuple[ClassMetrics, ...],
+ tuple[ModuleDep, ...],
+ tuple[DeadCandidate, ...],
+ frozenset[str],
+ frozenset[str],
+ ModuleTypingCoverage | None,
+ ModuleDocstringCoverage | None,
+ ModuleApiSurface | None,
+ tuple[SecuritySurface, ...],
+]:
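+    """Decode the per-file metric artifacts stored in a cache entry, dropping
+    rows that fail validation; reference sets are emptied for test files."""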
+ class_metrics_rows: list[ClassMetricsDict] = entry.get("class_metrics", [])
+ class_metrics_items: list[ClassMetrics] = []
+ for metric_row in class_metrics_rows:
+ parsed_metric = _class_metric_from_cache_row(metric_row)
+ if parsed_metric is not None:
+ class_metrics_items.append(parsed_metric)
+ class_metrics = tuple(class_metrics_items)
+ module_dep_rows: list[ModuleDepDict] = entry.get("module_deps", [])
+ module_dep_items: list[ModuleDep] = []
+ for dep_row in module_dep_rows:
+ parsed_dep = _module_dep_from_cache_row(dep_row)
+ if parsed_dep is not None:
+ module_dep_items.append(parsed_dep)
+ module_deps = tuple(module_dep_items)
+ dead_rows: list[DeadCandidateDict] = entry.get("dead_candidates", [])
+ dead_candidate_items: list[DeadCandidate] = []
+ for dead_row in dead_rows:
+ parsed_dead = _dead_candidate_from_cache_row(dead_row)
+ if parsed_dead is not None:
+ dead_candidate_items.append(parsed_dead)
+ dead_candidates = tuple(dead_candidate_items)
+ referenced_names = (
+ frozenset()
+ if is_test_filepath(filepath)
+ else frozenset(entry.get("referenced_names", []))
+ )
+ referenced_qualnames = (
+ frozenset()
+ if is_test_filepath(filepath)
+ else frozenset(entry.get("referenced_qualnames", []))
+ )
+ security_surface_rows: list[SecuritySurfaceDict] = entry.get(
+ "security_surfaces", []
+ )
+ security_surface_items: list[SecuritySurface] = []
+ for surface_row in security_surface_rows:
+ parsed_surface = _security_surface_from_cache_row(surface_row)
+ if parsed_surface is not None:
+ security_surface_items.append(parsed_surface)
+ return (
+ class_metrics,
+ module_deps,
+ dead_candidates,
+ referenced_names,
+ referenced_qualnames,
+ _typing_coverage_from_cache_dict(entry.get("typing_coverage")),
+ _docstring_coverage_from_cache_dict(entry.get("docstring_coverage")),
+ _api_surface_from_cache_dict(entry.get("api_surface")),
+ tuple(security_surface_items),
+ )
diff --git a/codeclone/core/metrics_payload.py b/codeclone/core/metrics_payload.py
new file mode 100644
index 0000000..cc69d18
--- /dev/null
+++ b/codeclone/core/metrics_payload.py
@@ -0,0 +1,323 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from collections.abc import Mapping, Sequence
+
+from ..analysis.suppressions import (
+ DEAD_CODE_RULE_ID,
+ INLINE_CODECLONE_SUPPRESSION_SOURCE,
+)
+from ..domain.findings import CATEGORY_COHESION, CATEGORY_COMPLEXITY, CATEGORY_COUPLING
+from ..domain.quality import CONFIDENCE_HIGH, RISK_LOW
+from ..metrics.overloaded_modules import build_overloaded_modules_payload
+from ..models import (
+ ClassMetrics,
+ CoverageJoinResult,
+ DeadItem,
+ DepGraph,
+ GroupItemLike,
+ MetricsDiff,
+ ModuleDep,
+ ProjectMetrics,
+ SecuritySurface,
+)
+from ..utils.coerce import as_int, as_mapping, as_sequence, as_str
+from .api_surface_payload import (
+ _api_surface_rows,
+ _api_surface_summary,
+ _breaking_api_surface_rows,
+)
+from .coverage_payload import (
+ _coverage_adoption_rows,
+ _coverage_join_rows,
+ _coverage_join_summary,
+ _permille,
+)
+from .security_surfaces_payload import build_security_surfaces_payload
+
+
+def _enrich_metrics_report_payload(
+ *,
+ metrics_payload: Mapping[str, object],
+ metrics_diff: MetricsDiff | None,
+ coverage_adoption_diff_available: bool,
+ api_surface_diff_available: bool,
+) -> dict[str, object]:
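+    """Layer baseline-diff deltas onto the coverage_adoption and api_surface
+    sections of an existing metrics payload."""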
+ enriched = {
+ key: (dict(value) if isinstance(value, Mapping) else value)
+ for key, value in metrics_payload.items()
+ }
+ coverage_adoption = dict(as_mapping(enriched.get("coverage_adoption")))
+ coverage_summary = dict(as_mapping(coverage_adoption.get("summary")))
+ if coverage_summary:
+ coverage_summary["baseline_diff_available"] = coverage_adoption_diff_available
+ coverage_summary["param_delta"] = (
+ int(metrics_diff.typing_param_permille_delta)
+ if metrics_diff is not None and coverage_adoption_diff_available
+ else 0
+ )
+ coverage_summary["return_delta"] = (
+ int(metrics_diff.typing_return_permille_delta)
+ if metrics_diff is not None and coverage_adoption_diff_available
+ else 0
+ )
+ coverage_summary["docstring_delta"] = (
+ int(metrics_diff.docstring_permille_delta)
+ if metrics_diff is not None and coverage_adoption_diff_available
+ else 0
+ )
+ coverage_adoption["summary"] = coverage_summary
+ enriched["coverage_adoption"] = coverage_adoption
+
+ api_surface = dict(as_mapping(enriched.get("api_surface")))
+ api_summary = dict(as_mapping(api_surface.get("summary")))
+ api_items = list(as_sequence(api_surface.get("items")))
+ if api_summary:
+ api_summary["baseline_diff_available"] = api_surface_diff_available
+ api_summary["added"] = (
+ len(metrics_diff.new_api_symbols)
+ if metrics_diff is not None and api_surface_diff_available
+ else 0
+ )
+ api_summary["breaking"] = (
+ len(metrics_diff.new_api_breaking_changes)
+ if metrics_diff is not None and api_surface_diff_available
+ else 0
+ )
+ api_surface["summary"] = api_summary
+ if (
+ metrics_diff is not None
+ and api_surface_diff_available
+ and metrics_diff.new_api_breaking_changes
+ ):
+ api_items.extend(
+ _breaking_api_surface_rows(metrics_diff.new_api_breaking_changes)
+ )
+ api_surface["items"] = api_items
+ if api_surface:
+ enriched["api_surface"] = api_surface
+ return enriched
+
+
+def build_metrics_report_payload(
+ *,
+ scan_root: str = "",
+ project_metrics: ProjectMetrics,
+ dep_graph: DepGraph | None = None,
+ coverage_join: CoverageJoinResult | None = None,
+ units: Sequence[GroupItemLike],
+ class_metrics: Sequence[ClassMetrics],
+ module_deps: Sequence[ModuleDep] = (),
+ security_surfaces: Sequence[SecuritySurface] = (),
+ source_stats_by_file: Sequence[tuple[str, int, int, int, int]] = (),
+ suppressed_dead_code: Sequence[DeadItem] = (),
+) -> dict[str, object]:
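+    """Assemble the metrics report payload: complexity, coupling, cohesion,
+    dependencies, dead code, health, coverage, API surface, overloaded
+    modules, and security surfaces."""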
+ sorted_units = sorted(
+ units,
+ key=lambda item: (
+ as_int(item.get("cyclomatic_complexity"), 0),
+ as_int(item.get("nesting_depth"), 0),
+ as_str(item.get("qualname")),
+ ),
+ reverse=True,
+ )
+ complexity_rows = [
+ {
+ "qualname": as_str(item.get("qualname")),
+ "filepath": as_str(item.get("filepath")),
+ "start_line": as_int(item.get("start_line"), 0),
+ "end_line": as_int(item.get("end_line"), 0),
+ "cyclomatic_complexity": as_int(item.get("cyclomatic_complexity"), 1),
+ "nesting_depth": as_int(item.get("nesting_depth"), 0),
+ "risk": as_str(item.get("risk"), RISK_LOW),
+ }
+ for item in sorted_units
+ ]
+ classes_sorted = sorted(
+ class_metrics,
+ key=lambda item: (item.cbo, item.lcom4, item.qualname),
+ reverse=True,
+ )
+ coupling_rows = [
+ {
+ "qualname": metric.qualname,
+ "filepath": metric.filepath,
+ "start_line": metric.start_line,
+ "end_line": metric.end_line,
+ "cbo": metric.cbo,
+ "risk": metric.risk_coupling,
+ "coupled_classes": list(metric.coupled_classes),
+ }
+ for metric in classes_sorted
+ ]
+ cohesion_rows = [
+ {
+ "qualname": metric.qualname,
+ "filepath": metric.filepath,
+ "start_line": metric.start_line,
+ "end_line": metric.end_line,
+ "lcom4": metric.lcom4,
+ "risk": metric.risk_cohesion,
+ "method_count": metric.method_count,
+ "instance_var_count": metric.instance_var_count,
+ }
+ for metric in classes_sorted
+ ]
+ active_dead_items = tuple(project_metrics.dead_code)
+ suppressed_dead_items = tuple(suppressed_dead_code)
+ coverage_adoption_rows = _coverage_adoption_rows(project_metrics)
+ api_surface_summary = _api_surface_summary(project_metrics.api_surface)
+ api_surface_items = _api_surface_rows(project_metrics.api_surface)
+ coverage_join_summary = _coverage_join_summary(coverage_join)
+ coverage_join_items = _coverage_join_rows(coverage_join)
+
+ def _serialize_dead_item(
+ item: DeadItem,
+ *,
+ suppressed: bool = False,
+ ) -> dict[str, object]:
+ payload: dict[str, object] = {
+ "qualname": item.qualname,
+ "filepath": item.filepath,
+ "start_line": item.start_line,
+ "end_line": item.end_line,
+ "kind": item.kind,
+ "confidence": item.confidence,
+ }
+ if suppressed:
+ payload["suppressed_by"] = [
+ {
+ "rule": DEAD_CODE_RULE_ID,
+ "source": INLINE_CODECLONE_SUPPRESSION_SOURCE,
+ }
+ ]
+ return payload
+
+ payload = {
+ CATEGORY_COMPLEXITY: {
+ "functions": complexity_rows,
+ "summary": {
+ "total": len(complexity_rows),
+ "average": round(project_metrics.complexity_avg, 2),
+ "max": project_metrics.complexity_max,
+ "high_risk": len(project_metrics.high_risk_functions),
+ },
+ },
+ CATEGORY_COUPLING: {
+ "classes": coupling_rows,
+ "summary": {
+ "total": len(coupling_rows),
+ "average": round(project_metrics.coupling_avg, 2),
+ "max": project_metrics.coupling_max,
+ "high_risk": len(project_metrics.high_risk_classes),
+ },
+ },
+ CATEGORY_COHESION: {
+ "classes": cohesion_rows,
+ "summary": {
+ "total": len(cohesion_rows),
+ "average": round(project_metrics.cohesion_avg, 2),
+ "max": project_metrics.cohesion_max,
+ "low_cohesion": len(project_metrics.low_cohesion_classes),
+ },
+ },
+ "dependencies": {
+ "modules": project_metrics.dependency_modules,
+ "edges": project_metrics.dependency_edges,
+ "max_depth": project_metrics.dependency_max_depth,
+ "avg_depth": (
+ round(dep_graph.avg_depth, 2) if dep_graph is not None else 0.0
+ ),
+ "p95_depth": dep_graph.p95_depth if dep_graph is not None else 0,
+ "cycles": [list(cycle) for cycle in project_metrics.dependency_cycles],
+ "longest_chains": [
+ list(chain) for chain in project_metrics.dependency_longest_chains
+ ],
+ "edge_list": [
+ {
+ "source": edge.source,
+ "target": edge.target,
+ "import_type": edge.import_type,
+ "line": edge.line,
+ }
+ for edge in project_metrics.dependency_edge_list
+ ],
+ },
+ "dead_code": {
+ "items": [_serialize_dead_item(item) for item in active_dead_items],
+ "suppressed_items": [
+ _serialize_dead_item(item, suppressed=True)
+ for item in suppressed_dead_items
+ ],
+ "summary": {
+ "total": len(active_dead_items),
+ "critical": sum(
+ 1
+ for item in active_dead_items
+ if item.confidence == CONFIDENCE_HIGH
+ ),
+ "high_confidence": sum(
+ 1
+ for item in active_dead_items
+ if item.confidence == CONFIDENCE_HIGH
+ ),
+ "suppressed": len(suppressed_dead_items),
+ },
+ },
+ "health": {
+ "score": project_metrics.health.total,
+ "grade": project_metrics.health.grade,
+ "dimensions": dict(project_metrics.health.dimensions),
+ },
+ "coverage_adoption": {
+ "summary": {
+ "modules": len(coverage_adoption_rows),
+ "params_total": project_metrics.typing_param_total,
+ "params_annotated": project_metrics.typing_param_annotated,
+ "param_permille": _permille(
+ project_metrics.typing_param_annotated,
+ project_metrics.typing_param_total,
+ ),
+ "returns_total": project_metrics.typing_return_total,
+ "returns_annotated": project_metrics.typing_return_annotated,
+ "return_permille": _permille(
+ project_metrics.typing_return_annotated,
+ project_metrics.typing_return_total,
+ ),
+ "public_symbol_total": project_metrics.docstring_public_total,
+ "public_symbol_documented": project_metrics.docstring_public_documented,
+ "docstring_permille": _permille(
+ project_metrics.docstring_public_documented,
+ project_metrics.docstring_public_total,
+ ),
+ "typing_any_count": project_metrics.typing_any_count,
+ },
+ "items": coverage_adoption_rows,
+ },
+ "api_surface": {
+ "summary": dict(api_surface_summary),
+ "items": api_surface_items,
+ },
+ "overloaded_modules": build_overloaded_modules_payload(
+ scan_root=scan_root,
+ source_stats_by_file=source_stats_by_file,
+ units=units,
+ class_metrics=class_metrics,
+ module_deps=module_deps,
+ ),
+ "security_surfaces": build_security_surfaces_payload(
+ scan_root=scan_root,
+ surfaces=security_surfaces,
+ ),
+ }
+ if coverage_join is not None:
+ payload["coverage_join"] = {
+ "summary": dict(coverage_join_summary),
+ "items": coverage_join_items,
+ }
+ return payload
diff --git a/codeclone/core/parallelism.py b/codeclone/core/parallelism.py
new file mode 100644
index 0000000..3750670
--- /dev/null
+++ b/codeclone/core/parallelism.py
@@ -0,0 +1,355 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from collections.abc import Callable, Sequence
+from concurrent.futures import ProcessPoolExecutor, as_completed
+
+from ..cache.entries import SourceStatsDict
+from ..cache.store import Cache
+from ..models import (
+ ClassMetrics,
+ DeadCandidate,
+ GroupItem,
+ ModuleApiSurface,
+ ModuleDep,
+ ModuleDocstringCoverage,
+ ModuleTypingCoverage,
+ SecuritySurface,
+ StructuralFindingGroup,
+)
+from ._types import (
+ DEFAULT_BATCH_SIZE,
+ DEFAULT_RUNTIME_PROCESSES,
+ PARALLEL_MIN_FILES_FLOOR,
+ PARALLEL_MIN_FILES_PER_WORKER,
+ BootstrapResult,
+ DiscoveryResult,
+ FileProcessResult,
+ ProcessingResult,
+ _block_to_group_item,
+ _class_metric_sort_key,
+ _dead_candidate_sort_key,
+ _group_item_sort_key,
+ _module_dep_sort_key,
+ _segment_to_group_item,
+ _should_collect_structural_findings,
+ _unit_to_group_item,
+)
+from .worker import _invoke_process_file
+
+
+def _parallel_min_files(processes: int) -> int:
+ return max(PARALLEL_MIN_FILES_FLOOR, processes * PARALLEL_MIN_FILES_PER_WORKER)
+
+
+def _resolve_process_count(processes: object) -> int:
+ if not isinstance(processes, int):
+ return DEFAULT_RUNTIME_PROCESSES
+ return max(1, processes)
+
+
+def _should_use_parallel(files_count: int, processes: int) -> bool:
+ if processes <= 1:
+ return False
+ return files_count >= _parallel_min_files(processes)
+
+
+def process(
+ *,
+ boot: BootstrapResult,
+ discovery: DiscoveryResult,
+ cache: Cache,
+ on_advance: Callable[[], None] | None = None,
+ on_worker_error: Callable[[str], None] | None = None,
+ on_parallel_fallback: Callable[[Exception], None] | None = None,
+ batch_size: int = DEFAULT_BATCH_SIZE,
+) -> ProcessingResult:
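+    """Analyze the files discovery could not serve from cache, in parallel
+    when the workload is large enough, and merge the results with the
+    cached buffers."""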
+ files_to_process = discovery.files_to_process
+ if not files_to_process:
+ return ProcessingResult(
+ units=discovery.cached_units,
+ blocks=discovery.cached_blocks,
+ segments=discovery.cached_segments,
+ class_metrics=discovery.cached_class_metrics,
+ module_deps=discovery.cached_module_deps,
+ dead_candidates=discovery.cached_dead_candidates,
+ referenced_names=discovery.cached_referenced_names,
+ security_surfaces=discovery.cached_security_surfaces,
+ referenced_qualnames=discovery.cached_referenced_qualnames,
+ typing_modules=discovery.cached_typing_modules,
+ docstring_modules=discovery.cached_docstring_modules,
+ api_modules=discovery.cached_api_modules,
+ files_analyzed=0,
+ files_skipped=discovery.files_skipped,
+ analyzed_lines=0,
+ analyzed_functions=0,
+ analyzed_methods=0,
+ analyzed_classes=0,
+ failed_files=(),
+ source_read_failures=(),
+ structural_findings=discovery.cached_structural_findings,
+ source_stats_by_file=discovery.cached_source_stats_by_file,
+ )
+
+ all_units: list[GroupItem] = list(discovery.cached_units)
+ all_blocks: list[GroupItem] = list(discovery.cached_blocks)
+ all_segments: list[GroupItem] = list(discovery.cached_segments)
+ all_class_metrics: list[ClassMetrics] = list(discovery.cached_class_metrics)
+ all_module_deps: list[ModuleDep] = list(discovery.cached_module_deps)
+ all_dead_candidates: list[DeadCandidate] = list(discovery.cached_dead_candidates)
+ all_referenced_names: set[str] = set(discovery.cached_referenced_names)
+ all_referenced_qualnames: set[str] = set(discovery.cached_referenced_qualnames)
+ all_security_surfaces: list[SecuritySurface] = list(
+ discovery.cached_security_surfaces
+ )
+ all_typing_modules: list[ModuleTypingCoverage] = list(
+ discovery.cached_typing_modules
+ )
+ all_docstring_modules: list[ModuleDocstringCoverage] = list(
+ discovery.cached_docstring_modules
+ )
+ all_api_modules: list[ModuleApiSurface] = list(discovery.cached_api_modules)
+
+ collect_structural_findings = _should_collect_structural_findings(boot.output_paths)
+ collect_api_surface = not boot.args.skip_metrics and bool(
+ getattr(boot.args, "api_surface", False)
+ )
+ api_include_private_modules = bool(
+ getattr(boot.args, "api_include_private_modules", False)
+ )
+ files_analyzed = 0
+ files_skipped = discovery.files_skipped
+ analyzed_lines = 0
+ analyzed_functions = 0
+ analyzed_methods = 0
+ analyzed_classes = 0
+ all_structural_findings: list[StructuralFindingGroup] = list(
+ discovery.cached_structural_findings
+ )
+ source_stats_by_file: dict[str, tuple[int, int, int, int]] = {
+ filepath: (lines, functions, methods, classes)
+ for (
+ filepath,
+ lines,
+ functions,
+ methods,
+ classes,
+ ) in discovery.cached_source_stats_by_file
+ }
+ failed_files: list[str] = []
+ source_read_failures: list[str] = []
+ root_str = str(boot.root)
+ processes = _resolve_process_count(boot.args.processes)
+ min_loc = int(boot.args.min_loc)
+ min_stmt = int(boot.args.min_stmt)
+ block_min_loc = int(boot.args.block_min_loc)
+ block_min_stmt = int(boot.args.block_min_stmt)
+ segment_min_loc = int(boot.args.segment_min_loc)
+ segment_min_stmt = int(boot.args.segment_min_stmt)
+
+ def _accept_result(result: FileProcessResult) -> None:
+ nonlocal files_analyzed
+ nonlocal files_skipped
+ nonlocal analyzed_lines
+ nonlocal analyzed_functions
+ nonlocal analyzed_methods
+ nonlocal analyzed_classes
+
+ if result.success and result.stat is not None:
+ source_stats_payload = SourceStatsDict(
+ lines=result.lines,
+ functions=result.functions,
+ methods=result.methods,
+ classes=result.classes,
+ )
+ structural_payload = (
+ result.structural_findings if collect_structural_findings else None
+ )
+ try:
+ cache.put_file_entry(
+ result.filepath,
+ result.stat,
+ result.units or [],
+ result.blocks or [],
+ result.segments or [],
+ source_stats=source_stats_payload,
+ file_metrics=result.file_metrics,
+ structural_findings=structural_payload,
+ )
+ except TypeError as exc:
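+                # put_file_entry may predate the source_stats keyword;
+                # retry without it and re-raise unrelated TypeErrors.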
+ if "source_stats" not in str(exc):
+ raise
+ cache.put_file_entry(
+ result.filepath,
+ result.stat,
+ result.units or [],
+ result.blocks or [],
+ result.segments or [],
+ file_metrics=result.file_metrics,
+ structural_findings=structural_payload,
+ )
+ files_analyzed += 1
+ analyzed_lines += result.lines
+ analyzed_functions += result.functions
+ analyzed_methods += result.methods
+ analyzed_classes += result.classes
+ source_stats_by_file[result.filepath] = (
+ result.lines,
+ result.functions,
+ result.methods,
+ result.classes,
+ )
+ if result.units:
+ all_units.extend(_unit_to_group_item(unit) for unit in result.units)
+ if result.blocks:
+ all_blocks.extend(
+ _block_to_group_item(block) for block in result.blocks
+ )
+ if result.segments:
+ all_segments.extend(
+ _segment_to_group_item(segment) for segment in result.segments
+ )
+ if result.structural_findings:
+ all_structural_findings.extend(result.structural_findings)
+ if not boot.args.skip_metrics and result.file_metrics is not None:
+ all_class_metrics.extend(result.file_metrics.class_metrics)
+ all_module_deps.extend(result.file_metrics.module_deps)
+ all_dead_candidates.extend(result.file_metrics.dead_candidates)
+ all_referenced_names.update(result.file_metrics.referenced_names)
+ all_referenced_qualnames.update(
+ result.file_metrics.referenced_qualnames
+ )
+ all_security_surfaces.extend(result.file_metrics.security_surfaces)
+ if result.file_metrics.typing_coverage is not None:
+ all_typing_modules.append(result.file_metrics.typing_coverage)
+ if result.file_metrics.docstring_coverage is not None:
+ all_docstring_modules.append(result.file_metrics.docstring_coverage)
+ if result.file_metrics.api_surface is not None:
+ all_api_modules.append(result.file_metrics.api_surface)
+ return
+
+ files_skipped += 1
+ failure = f"{result.filepath}: {result.error}"
+ failed_files.append(failure)
+ if result.error_kind == "source_read_error":
+ source_read_failures.append(failure)
+
+ def _run_sequential(files: Sequence[str]) -> None:
+ for filepath in files:
+ _accept_result(
+ _invoke_process_file(
+ filepath,
+ root_str,
+ boot.config,
+ min_loc,
+ min_stmt,
+ collect_structural_findings=collect_structural_findings,
+ collect_api_surface=collect_api_surface,
+ api_include_private_modules=api_include_private_modules,
+ block_min_loc=block_min_loc,
+ block_min_stmt=block_min_stmt,
+ segment_min_loc=segment_min_loc,
+ segment_min_stmt=segment_min_stmt,
+ )
+ )
+ if on_advance is not None:
+ on_advance()
+
+ if _should_use_parallel(len(files_to_process), processes):
+ try:
+ with ProcessPoolExecutor(max_workers=processes) as executor:
+ for idx in range(0, len(files_to_process), batch_size):
+ batch = files_to_process[idx : idx + batch_size]
+                    # Futures are hashable, so map each one straight to its
+                    # source file instead of going through id() indirection.
+                    future_to_path = {
+                        executor.submit(
+                            _invoke_process_file,
+                            filepath,
+                            root_str,
+                            boot.config,
+                            min_loc,
+                            min_stmt,
+                            collect_structural_findings=collect_structural_findings,
+                            collect_api_surface=collect_api_surface,
+                            api_include_private_modules=api_include_private_modules,
+                            block_min_loc=block_min_loc,
+                            block_min_stmt=block_min_stmt,
+                            segment_min_loc=segment_min_loc,
+                            segment_min_stmt=segment_min_stmt,
+                        ): filepath
+                        for filepath in batch
+                    }
+                    for future in as_completed(future_to_path):
+                        filepath = future_to_path[future]
+ try:
+ _accept_result(future.result())
+ except Exception as exc: # pragma: no cover - worker crash
+ files_skipped += 1
+ failed_files.append(f"{filepath}: {exc}")
+ if on_worker_error is not None:
+ on_worker_error(str(exc))
+ if on_advance is not None:
+ on_advance()
+ except (OSError, RuntimeError, PermissionError) as exc:
+ if on_parallel_fallback is not None:
+ on_parallel_fallback(exc)
+ _run_sequential(files_to_process)
+ else:
+ _run_sequential(files_to_process)
+
+ return ProcessingResult(
+ units=tuple(sorted(all_units, key=_group_item_sort_key)),
+ blocks=tuple(sorted(all_blocks, key=_group_item_sort_key)),
+ segments=tuple(sorted(all_segments, key=_group_item_sort_key)),
+ class_metrics=tuple(sorted(all_class_metrics, key=_class_metric_sort_key)),
+ module_deps=tuple(sorted(all_module_deps, key=_module_dep_sort_key)),
+ dead_candidates=tuple(
+ sorted(all_dead_candidates, key=_dead_candidate_sort_key)
+ ),
+ referenced_names=frozenset(all_referenced_names),
+ security_surfaces=tuple(
+ sorted(
+ all_security_surfaces,
+ key=lambda item: (
+ item.filepath,
+ item.start_line,
+ item.end_line,
+ item.qualname,
+ item.category,
+ item.capability,
+ item.evidence_symbol,
+ ),
+ )
+ ),
+ referenced_qualnames=frozenset(all_referenced_qualnames),
+ typing_modules=tuple(
+ sorted(all_typing_modules, key=lambda item: (item.filepath, item.module))
+ ),
+ docstring_modules=tuple(
+ sorted(all_docstring_modules, key=lambda item: (item.filepath, item.module))
+ ),
+ api_modules=tuple(
+ sorted(all_api_modules, key=lambda item: (item.filepath, item.module))
+ ),
+ files_analyzed=files_analyzed,
+ files_skipped=files_skipped,
+ analyzed_lines=analyzed_lines,
+ analyzed_functions=analyzed_functions,
+ analyzed_methods=analyzed_methods,
+ analyzed_classes=analyzed_classes,
+ failed_files=tuple(sorted(failed_files)),
+ source_read_failures=tuple(sorted(source_read_failures)),
+ structural_findings=tuple(all_structural_findings),
+ source_stats_by_file=tuple(
+ (filepath, *stats)
+ for filepath, stats in sorted(source_stats_by_file.items())
+ ),
+ )
diff --git a/codeclone/core/pipeline.py b/codeclone/core/pipeline.py
new file mode 100644
index 0000000..96abf1b
--- /dev/null
+++ b/codeclone/core/pipeline.py
@@ -0,0 +1,363 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from collections.abc import Mapping, Sequence
+
+from ..contracts import DEFAULT_COVERAGE_MIN
+from ..findings.clones.golden_fixtures import (
+ build_suppressed_clone_groups,
+ split_clone_groups_for_golden_fixtures,
+)
+from ..findings.clones.grouping import (
+ build_block_groups,
+ build_groups,
+ build_segment_groups,
+)
+from ..findings.structural.detectors import (
+ build_clone_cohort_structural_findings,
+)
+from ..metrics._base import MetricProjectContext
+from ..metrics.coverage_join import CoverageJoinParseError, build_coverage_join
+from ..metrics.dead_code import find_suppressed_unused
+from ..metrics.registry import (
+ METRIC_FAMILIES,
+ build_project_metrics,
+ project_metrics_defaults,
+)
+from ..models import (
+ ClassMetrics,
+ CoverageJoinResult,
+ DeadCandidate,
+ DeadItem,
+ DepGraph,
+ GroupItemLike,
+ ModuleApiSurface,
+ ModuleDep,
+ ModuleDocstringCoverage,
+ ModuleTypingCoverage,
+ ProjectMetrics,
+ SecuritySurface,
+ StructuralFindingGroup,
+ Suggestion,
+)
+from ..report.blocks import prepare_block_report_groups
+from ..report.explain import build_block_group_facts
+from ..report.segments import prepare_segment_report_groups
+from ..report.suggestions import generate_suggestions
+from ._types import (
+ AnalysisResult,
+ BootstrapResult,
+ DiscoveryResult,
+ ProcessingResult,
+ _segment_groups_digest,
+ _should_collect_structural_findings,
+)
+from .bootstrap import _resolve_optional_runtime_path
+from .metrics_payload import build_metrics_report_payload
+
+
+def _artifact_dep_graph(value: object, default: DepGraph) -> DepGraph:
+ return value if isinstance(value, DepGraph) else default
+
+
+def _artifact_dead_items(
+ value: object,
+ default: tuple[DeadItem, ...],
+) -> tuple[DeadItem, ...]:
+ if isinstance(value, tuple) and all(isinstance(item, DeadItem) for item in value):
+ return value
+ return default
+
+
+def compute_project_metrics(
+ *,
+ units: Sequence[GroupItemLike],
+ class_metrics: Sequence[ClassMetrics],
+ module_deps: Sequence[ModuleDep],
+ dead_candidates: Sequence[DeadCandidate],
+ referenced_names: frozenset[str],
+ referenced_qualnames: frozenset[str],
+ security_surfaces: Sequence[SecuritySurface] = (),
+ typing_modules: Sequence[ModuleTypingCoverage] = (),
+ docstring_modules: Sequence[ModuleDocstringCoverage] = (),
+ api_modules: Sequence[ModuleApiSurface] = (),
+ files_found: int,
+ files_analyzed_or_cached: int,
+ function_clone_groups: int,
+ block_clone_groups: int,
+ skip_dependencies: bool,
+ skip_dead_code: bool,
+) -> tuple[ProjectMetrics, DepGraph, tuple[DeadItem, ...]]:
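+    """Run every registered metric family over the project context and fold
+    the results into ProjectMetrics plus its dep-graph and dead-code
+    artifacts."""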
+ context = MetricProjectContext(
+ units=tuple(units),
+ class_metrics=tuple(class_metrics),
+ module_deps=tuple(module_deps),
+ dead_candidates=tuple(dead_candidates),
+ referenced_names=referenced_names,
+ referenced_qualnames=referenced_qualnames,
+ security_surfaces=tuple(security_surfaces),
+ typing_modules=tuple(typing_modules),
+ docstring_modules=tuple(docstring_modules),
+ api_modules=tuple(api_modules),
+ files_found=files_found,
+ files_analyzed_or_cached=files_analyzed_or_cached,
+ function_clone_groups=function_clone_groups,
+ block_clone_groups=block_clone_groups,
+ skip_dependencies=skip_dependencies,
+ skip_dead_code=skip_dead_code,
+ )
+ project_fields = project_metrics_defaults()
+ dep_graph = DepGraph(
+ modules=frozenset(),
+ edges=(),
+ cycles=(),
+ max_depth=0,
+ avg_depth=0.0,
+ p95_depth=0,
+ longest_chains=(),
+ )
+ dead_items: tuple[DeadItem, ...] = ()
+ for family in METRIC_FAMILIES.values():
+ aggregate = family.aggregate([family.compute(context)])
+ project_fields.update(aggregate.project_fields)
+ dep_graph = _artifact_dep_graph(aggregate.artifacts.get("dep_graph"), dep_graph)
+ dead_items = _artifact_dead_items(
+ aggregate.artifacts.get("dead_items"),
+ dead_items,
+ )
+ return build_project_metrics(project_fields), dep_graph, dead_items
+
+
+def compute_suggestions(
+ *,
+ project_metrics: ProjectMetrics,
+ units: Sequence[GroupItemLike],
+ class_metrics: Sequence[ClassMetrics],
+ func_groups: Mapping[str, Sequence[GroupItemLike]],
+ block_groups: Mapping[str, Sequence[GroupItemLike]],
+ segment_groups: Mapping[str, Sequence[GroupItemLike]],
+ block_group_facts: Mapping[str, Mapping[str, str]] | None = None,
+ structural_findings: Sequence[StructuralFindingGroup] | None = None,
+ scan_root: str = "",
+) -> tuple[Suggestion, ...]:
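+    """Thin wrapper around report.suggestions.generate_suggestions."""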
+ return generate_suggestions(
+ project_metrics=project_metrics,
+ units=units,
+ class_metrics=class_metrics,
+ func_groups=func_groups,
+ block_groups=block_groups,
+ segment_groups=segment_groups,
+ block_group_facts=block_group_facts,
+ structural_findings=structural_findings,
+ scan_root=scan_root,
+ )
+
+
+def analyze(
+ *,
+ boot: BootstrapResult,
+ discovery: DiscoveryResult,
+ processing: ProcessingResult,
+) -> AnalysisResult:
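+    """Group clones, compute project metrics and suggestions, and join
+    coverage data when a coverage XML path is supplied."""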
+ golden_fixture_paths = tuple(
+ str(pattern).strip()
+ for pattern in getattr(boot.args, "golden_fixture_paths", ())
+ if str(pattern).strip()
+ )
+ func_split = split_clone_groups_for_golden_fixtures(
+ groups=build_groups(processing.units),
+ kind="function",
+ golden_fixture_paths=golden_fixture_paths,
+ scan_root=str(boot.root),
+ )
+ block_split = split_clone_groups_for_golden_fixtures(
+ groups=build_block_groups(processing.blocks),
+ kind="block",
+ golden_fixture_paths=golden_fixture_paths,
+ scan_root=str(boot.root),
+ )
+ segment_split = split_clone_groups_for_golden_fixtures(
+ groups=build_segment_groups(processing.segments),
+ kind="segment",
+ golden_fixture_paths=golden_fixture_paths,
+ scan_root=str(boot.root),
+ )
+
+ func_groups = func_split.active_groups
+ block_groups = block_split.active_groups
+ segment_groups_raw = segment_split.active_groups
+ segment_groups_raw_digest = _segment_groups_digest(segment_groups_raw)
+ cached_projection = discovery.cached_segment_report_projection
+ if (
+ cached_projection is not None
+ and cached_projection.get("digest") == segment_groups_raw_digest
+ ):
+ projection_groups = cached_projection.get("groups", {})
+ segment_groups = {
+ group_key: [
+ {
+ "segment_hash": str(item["segment_hash"]),
+ "segment_sig": str(item["segment_sig"]),
+ "filepath": str(item["filepath"]),
+ "qualname": str(item["qualname"]),
+ "start_line": int(item["start_line"]),
+ "end_line": int(item["end_line"]),
+ "size": int(item["size"]),
+ }
+ for item in projection_groups[group_key]
+ ]
+ for group_key in sorted(projection_groups)
+ }
+ suppressed_segment_groups = int(cached_projection.get("suppressed", 0))
+ else:
+ segment_groups, suppressed_segment_groups = prepare_segment_report_groups(
+ segment_groups_raw
+ )
+
+ block_groups_report = prepare_block_report_groups(block_groups)
+ suppressed_block_groups_report = prepare_block_report_groups(
+ block_split.suppressed_groups
+ )
+ if segment_split.suppressed_groups:
+ suppressed_segment_groups_report, _ = prepare_segment_report_groups(
+ segment_split.suppressed_groups
+ )
+ else:
+ suppressed_segment_groups_report = {}
+ suppressed_clone_groups = (
+ *build_suppressed_clone_groups(
+ kind="function",
+ groups=func_split.suppressed_groups,
+ matched_patterns=func_split.matched_patterns,
+ ),
+ *build_suppressed_clone_groups(
+ kind="block",
+ groups=suppressed_block_groups_report,
+ matched_patterns=block_split.matched_patterns,
+ ),
+ *build_suppressed_clone_groups(
+ kind="segment",
+ groups=suppressed_segment_groups_report,
+ matched_patterns=segment_split.matched_patterns,
+ ),
+ )
+ block_group_facts = build_block_group_facts(
+ {**block_groups_report, **suppressed_block_groups_report}
+ )
+
+ func_clones_count = len(func_groups)
+ block_clones_count = len(block_groups)
+ segment_clones_count = len(segment_groups)
+ files_analyzed_or_cached = processing.files_analyzed + discovery.cache_hits
+
+ project_metrics: ProjectMetrics | None = None
+ metrics_payload: dict[str, object] | None = None
+ suggestions: tuple[Suggestion, ...] = ()
+ suppressed_dead_items: tuple[DeadItem, ...] = ()
+ coverage_join: CoverageJoinResult | None = None
+ cohort_structural_findings: tuple[StructuralFindingGroup, ...] = ()
+ if _should_collect_structural_findings(boot.output_paths):
+ cohort_structural_findings = build_clone_cohort_structural_findings(
+ func_groups=func_groups
+ )
+ combined_structural_findings = (
+ *processing.structural_findings,
+ *cohort_structural_findings,
+ )
+ if not boot.args.skip_metrics:
+ project_metrics, dep_graph, _ = compute_project_metrics(
+ units=processing.units,
+ class_metrics=processing.class_metrics,
+ module_deps=processing.module_deps,
+ dead_candidates=processing.dead_candidates,
+ referenced_names=processing.referenced_names,
+ referenced_qualnames=processing.referenced_qualnames,
+ security_surfaces=processing.security_surfaces,
+ typing_modules=processing.typing_modules,
+ docstring_modules=processing.docstring_modules,
+ api_modules=processing.api_modules,
+ files_found=discovery.files_found,
+ files_analyzed_or_cached=files_analyzed_or_cached,
+ function_clone_groups=func_clones_count,
+ block_clone_groups=block_clones_count,
+ skip_dependencies=boot.args.skip_dependencies,
+ skip_dead_code=boot.args.skip_dead_code,
+ )
+ if not boot.args.skip_dead_code:
+ suppressed_dead_items = find_suppressed_unused(
+ definitions=tuple(processing.dead_candidates),
+ referenced_names=processing.referenced_names,
+ referenced_qualnames=processing.referenced_qualnames,
+ )
+ suggestions = compute_suggestions(
+ project_metrics=project_metrics,
+ units=processing.units,
+ class_metrics=processing.class_metrics,
+ func_groups=func_groups,
+ block_groups=block_groups_report,
+ segment_groups=segment_groups,
+ block_group_facts=block_group_facts,
+ structural_findings=combined_structural_findings,
+ scan_root=str(boot.root),
+ )
+ coverage_xml_path = _resolve_optional_runtime_path(
+ getattr(boot.args, "coverage_xml", None),
+ root=boot.root,
+ )
+ if coverage_xml_path is not None:
+ try:
+ coverage_join = build_coverage_join(
+ coverage_xml=coverage_xml_path,
+ root_path=boot.root,
+ units=processing.units,
+ hotspot_threshold_percent=int(
+ getattr(boot.args, "coverage_min", DEFAULT_COVERAGE_MIN)
+ ),
+ )
+ except CoverageJoinParseError as exc:
+ coverage_join = CoverageJoinResult(
+ coverage_xml=str(coverage_xml_path),
+ status="invalid",
+ hotspot_threshold_percent=int(
+ getattr(boot.args, "coverage_min", DEFAULT_COVERAGE_MIN)
+ ),
+ invalid_reason=str(exc),
+ )
+ metrics_payload = build_metrics_report_payload(
+ scan_root=str(boot.root),
+ project_metrics=project_metrics,
+ dep_graph=dep_graph,
+ coverage_join=coverage_join,
+ units=processing.units,
+ class_metrics=processing.class_metrics,
+ module_deps=processing.module_deps,
+ security_surfaces=processing.security_surfaces,
+ source_stats_by_file=processing.source_stats_by_file,
+ suppressed_dead_code=suppressed_dead_items,
+ )
+
+ return AnalysisResult(
+ func_groups=func_groups,
+ block_groups=block_groups,
+ block_groups_report=block_groups_report,
+ segment_groups=segment_groups,
+ suppressed_clone_groups=tuple(suppressed_clone_groups),
+ suppressed_segment_groups=suppressed_segment_groups,
+ block_group_facts=block_group_facts,
+ func_clones_count=func_clones_count,
+ block_clones_count=block_clones_count,
+ segment_clones_count=segment_clones_count,
+ files_analyzed_or_cached=files_analyzed_or_cached,
+ project_metrics=project_metrics,
+ metrics_payload=metrics_payload,
+ suggestions=suggestions,
+ segment_groups_raw_digest=segment_groups_raw_digest,
+ coverage_join=coverage_join,
+ suppressed_dead_code_items=len(suppressed_dead_items),
+ structural_findings=combined_structural_findings,
+ )
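
The branch above reuses a previously cached segment-report projection whenever the digest of the freshly built raw groups matches the cached one, and only falls back to prepare_segment_report_groups on a miss. A minimal sketch of that digest-gated reuse, with hypothetical _digest/_project helpers standing in for the real ones:

import hashlib
import json


def _digest(groups: dict[str, list[dict[str, object]]]) -> str:
    # Stable digest over a canonical JSON encoding of the raw groups.
    encoded = json.dumps(groups, sort_keys=True, separators=(",", ":"))
    return hashlib.sha256(encoded.encode("utf-8")).hexdigest()


def _project(groups: dict[str, list[dict[str, object]]]) -> dict[str, object]:
    # Placeholder for the expensive report-shaping step.
    return {"groups": groups, "suppressed": 0}


def project_with_cache(
    groups: dict[str, list[dict[str, object]]],
    cached: dict[str, object] | None,
) -> dict[str, object]:
    digest = _digest(groups)
    if cached is not None and cached.get("digest") == digest:
        return cached  # hit: the previous projection is still valid
    projection = _project(groups)
    projection["digest"] = digest
    return projection
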
diff --git a/codeclone/core/reporting.py b/codeclone/core/reporting.py
new file mode 100644
index 0000000..d43683f
--- /dev/null
+++ b/codeclone/core/reporting.py
@@ -0,0 +1,267 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from collections.abc import Callable, Collection, Mapping
+
+from ..contracts import DEFAULT_COVERAGE_MIN
+from ..models import MetricsDiff
+from ..report.gates.evaluator import GateResult, GateState
+from ..report.gates.evaluator import MetricGateConfig as _MetricGateConfig
+from ..report.gates.evaluator import evaluate_gate_state as _evaluate_gate_state
+from ..report.gates.evaluator import (
+ gate_state_from_project_metrics as _gate_state_from_metrics,
+)
+from ..report.renderers.json import render_json_report_document
+from ..report.renderers.text import render_text_report_document
+from ._types import (
+ AnalysisResult,
+ BootstrapResult,
+ DiscoveryResult,
+ ProcessingResult,
+ ReportArtifacts,
+)
+from .metrics_payload import _enrich_metrics_report_payload
+
+MetricGateConfig = _MetricGateConfig
+GatingResult = GateResult
+
+
+def _coerce_metrics_diff(value: object | None) -> MetricsDiff | None:
+ return value if isinstance(value, MetricsDiff) else None
+
+
+def _load_markdown_report_renderer() -> Callable[..., str]:
+ from ..report.renderers.markdown import to_markdown_report
+
+ return to_markdown_report
+
+
+def _load_sarif_report_renderer() -> Callable[..., str]:
+ from ..report.renderers.sarif import to_sarif_report
+
+ return to_sarif_report
+
+
+def _load_report_document_builder() -> Callable[..., dict[str, object]]:
+ from ..report.document.builder import build_report_document
+
+ return build_report_document
+
+
+def report(
+ *,
+ boot: BootstrapResult,
+ discovery: DiscoveryResult,
+ processing: ProcessingResult,
+ analysis: AnalysisResult,
+ report_meta: Mapping[str, object],
+ new_func: Collection[str],
+ new_block: Collection[str],
+ html_builder: Callable[..., str] | None = None,
+ metrics_diff: object | None = None,
+ coverage_adoption_diff_available: bool = False,
+ api_surface_diff_available: bool = False,
+ include_report_document: bool = False,
+) -> ReportArtifacts:
+ contents: dict[str, str | None] = {
+ "html": None,
+ "json": None,
+ "md": None,
+ "sarif": None,
+ "text": None,
+ }
+ structural_findings = (
+ analysis.structural_findings if analysis.structural_findings else None
+ )
+ report_inventory = {
+ "files": {
+ "total_found": discovery.files_found,
+ "analyzed": processing.files_analyzed,
+ "cached": discovery.cache_hits,
+ "skipped": processing.files_skipped,
+ "source_io_skipped": len(processing.source_read_failures),
+ },
+ "code": {
+ "parsed_lines": processing.analyzed_lines + discovery.cached_lines,
+ "functions": processing.analyzed_functions + discovery.cached_functions,
+ "methods": processing.analyzed_methods + discovery.cached_methods,
+ "classes": processing.analyzed_classes + discovery.cached_classes,
+ },
+ "file_list": list(discovery.all_file_paths),
+ }
+ report_document: dict[str, object] | None = None
+ needs_report_document = (
+ include_report_document
+ or boot.output_paths.html is not None
+ or any(
+ path is not None
+ for path in (
+ boot.output_paths.json,
+ boot.output_paths.md,
+ boot.output_paths.sarif,
+ boot.output_paths.text,
+ )
+ )
+ )
+ if needs_report_document:
+ build_report_document = _load_report_document_builder()
+ validated_metrics_diff = _coerce_metrics_diff(metrics_diff)
+ metrics_for_report = (
+ _enrich_metrics_report_payload(
+ metrics_payload=analysis.metrics_payload,
+ metrics_diff=validated_metrics_diff,
+ coverage_adoption_diff_available=coverage_adoption_diff_available,
+ api_surface_diff_available=api_surface_diff_available,
+ )
+ if analysis.metrics_payload is not None
+ else None
+ )
+ report_document = build_report_document(
+ func_groups=analysis.func_groups,
+ block_groups=analysis.block_groups_report,
+ segment_groups=analysis.segment_groups,
+ suppressed_clone_groups=analysis.suppressed_clone_groups,
+ meta=report_meta,
+ inventory=report_inventory,
+ block_facts=analysis.block_group_facts,
+ new_function_group_keys=new_func,
+ new_block_group_keys=new_block,
+ new_segment_group_keys=set(analysis.segment_groups.keys()),
+ metrics=metrics_for_report,
+ suggestions=analysis.suggestions,
+ structural_findings=structural_findings,
+ )
+
+ if boot.output_paths.html and html_builder is not None:
+ validated_metrics_diff = _coerce_metrics_diff(metrics_diff)
+ metrics_for_html = (
+ _enrich_metrics_report_payload(
+ metrics_payload=analysis.metrics_payload,
+ metrics_diff=validated_metrics_diff,
+ coverage_adoption_diff_available=coverage_adoption_diff_available,
+ api_surface_diff_available=api_surface_diff_available,
+ )
+ if analysis.metrics_payload is not None
+ else None
+ )
+ contents["html"] = html_builder(
+ func_groups=analysis.func_groups,
+ block_groups=analysis.block_groups_report,
+ segment_groups=analysis.segment_groups,
+ block_group_facts=analysis.block_group_facts,
+ new_function_group_keys=new_func,
+ new_block_group_keys=new_block,
+ report_meta=report_meta,
+ metrics=metrics_for_html,
+ suggestions=analysis.suggestions,
+ structural_findings=structural_findings,
+ report_document=report_document,
+ metrics_diff=metrics_diff,
+ title="CodeClone Report",
+ context_lines=3,
+ max_snippet_lines=220,
+ )
+
+ if any(
+ path is not None
+ for path in (
+ boot.output_paths.json,
+ boot.output_paths.md,
+ boot.output_paths.sarif,
+ boot.output_paths.text,
+ )
+ ):
+ assert report_document is not None
+
+ if boot.output_paths.json and report_document is not None:
+ contents["json"] = render_json_report_document(report_document)
+
+ def _render_projection_artifact(renderer: Callable[..., str]) -> str:
+ assert report_document is not None
+ return renderer(
+ report_document=report_document,
+ meta=report_meta,
+ inventory=report_inventory,
+ func_groups=analysis.func_groups,
+ block_groups=analysis.block_groups_report,
+ segment_groups=analysis.segment_groups,
+ block_facts=analysis.block_group_facts,
+ new_function_group_keys=new_func,
+ new_block_group_keys=new_block,
+ new_segment_group_keys=set(analysis.segment_groups.keys()),
+ metrics=analysis.metrics_payload,
+ suggestions=analysis.suggestions,
+ structural_findings=structural_findings,
+ )
+
+ for key, output_path, loader in (
+ ("md", boot.output_paths.md, _load_markdown_report_renderer),
+ ("sarif", boot.output_paths.sarif, _load_sarif_report_renderer),
+ ):
+ if output_path and report_document is not None:
+ contents[key] = _render_projection_artifact(loader())
+
+ if boot.output_paths.text and report_document is not None:
+ contents["text"] = render_text_report_document(report_document)
+
+ return ReportArtifacts(
+ html=contents["html"],
+ json=contents["json"],
+ md=contents["md"],
+ sarif=contents["sarif"],
+ text=contents["text"],
+ report_document=report_document,
+ )
+
+
+def gate(
+ *,
+ boot: BootstrapResult,
+ analysis: AnalysisResult,
+ new_func: Collection[str],
+ new_block: Collection[str],
+ metrics_diff: MetricsDiff | None,
+) -> GatingResult:
+ config = MetricGateConfig(
+ fail_complexity=boot.args.fail_complexity,
+ fail_coupling=boot.args.fail_coupling,
+ fail_cohesion=boot.args.fail_cohesion,
+ fail_cycles=boot.args.fail_cycles,
+ fail_dead_code=boot.args.fail_dead_code,
+ fail_health=boot.args.fail_health,
+ fail_on_new_metrics=boot.args.fail_on_new_metrics,
+ fail_on_typing_regression=bool(
+ getattr(boot.args, "fail_on_typing_regression", False)
+ ),
+ fail_on_docstring_regression=bool(
+ getattr(boot.args, "fail_on_docstring_regression", False)
+ ),
+ fail_on_api_break=bool(getattr(boot.args, "fail_on_api_break", False)),
+ fail_on_untested_hotspots=bool(
+ getattr(boot.args, "fail_on_untested_hotspots", False)
+ ),
+ min_typing_coverage=int(getattr(boot.args, "min_typing_coverage", -1)),
+ min_docstring_coverage=int(getattr(boot.args, "min_docstring_coverage", -1)),
+ coverage_min=int(getattr(boot.args, "coverage_min", DEFAULT_COVERAGE_MIN)),
+ fail_on_new=bool(getattr(boot.args, "fail_on_new", False)),
+ fail_threshold=int(getattr(boot.args, "fail_threshold", -1)),
+ )
+ clone_new_count = len(tuple(new_func)) + len(tuple(new_block))
+ clone_total = analysis.func_clones_count + analysis.block_clones_count
+ if analysis.project_metrics is None:
+ state = GateState(clone_new_count=clone_new_count, clone_total=clone_total)
+ else:
+ state = _gate_state_from_metrics(
+ project_metrics=analysis.project_metrics,
+ coverage_join=analysis.coverage_join,
+ metrics_diff=metrics_diff,
+ clone_new_count=clone_new_count,
+ clone_total=clone_total,
+ )
+ result = _evaluate_gate_state(state=state, config=config)
+ return GatingResult(exit_code=result.exit_code, reasons=result.reasons)
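
report only builds the report document when at least one consumer (HTML, JSON, Markdown, SARIF, or text output) is actually configured, and loads the heavier renderers through deferred-import helpers. A reduced sketch of that build-on-demand gating, using a generic build_document callable rather than the real builder:

from collections.abc import Callable


def maybe_build_document(
    output_paths: dict[str, str | None],
    build_document: Callable[[], dict[str, object]],
    *,
    force: bool = False,
) -> dict[str, object] | None:
    # Skip the expensive document build entirely when no renderer
    # will consume it and the caller did not ask for it explicitly.
    if force or any(path is not None for path in output_paths.values()):
        return build_document()
    return None


# With every output disabled, nothing is built:
assert maybe_build_document({"json": None, "md": None}, dict) is None
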
diff --git a/codeclone/core/security_surfaces_payload.py b/codeclone/core/security_surfaces_payload.py
new file mode 100644
index 0000000..40d2a16
--- /dev/null
+++ b/codeclone/core/security_surfaces_payload.py
@@ -0,0 +1,104 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from collections import Counter
+from collections.abc import Sequence
+
+from ..domain.source_scope import SOURCE_KIND_BREAKDOWN_KEYS
+from ..models import SecuritySurface
+from ..paths import classify_source_kind
+
+
+def _security_surface_source_kind(
+ surface: SecuritySurface,
+ *,
+ scan_root: str,
+) -> str:
+ return classify_source_kind(surface.filepath, scan_root=scan_root)
+
+
+def _security_surface_sort_key(
+ surface: SecuritySurface,
+ *,
+ scan_root: str,
+) -> tuple[str, int, int, str, str, str, str]:
+ source_kind = _security_surface_source_kind(surface, scan_root=scan_root)
+ return (
+ source_kind,
+ surface.start_line,
+ surface.end_line,
+ surface.filepath,
+ surface.qualname,
+ surface.category,
+ surface.capability,
+ )
+
+
+def build_security_surfaces_payload(
+ *,
+ scan_root: str,
+ surfaces: Sequence[SecuritySurface],
+) -> dict[str, object]:
+ sorted_surfaces = tuple(
+ sorted(
+ surfaces,
+ key=lambda surface: _security_surface_sort_key(
+ surface,
+ scan_root=scan_root,
+ ),
+ )
+ )
+ category_counts = Counter(surface.category for surface in sorted_surfaces)
+ source_kind_counts = Counter(
+ _security_surface_source_kind(surface, scan_root=scan_root)
+ for surface in sorted_surfaces
+ )
+ return {
+ "summary": {
+ "items": len(sorted_surfaces),
+ "modules": len({surface.module for surface in sorted_surfaces}),
+ "exact_items": len(sorted_surfaces),
+ "category_count": len(category_counts),
+ "categories": {
+ category: category_counts[category]
+ for category in sorted(category_counts)
+ },
+ "by_source_kind": {
+ kind: source_kind_counts.get(kind, 0)
+ for kind in SOURCE_KIND_BREAKDOWN_KEYS
+ },
+ "production": source_kind_counts.get("production", 0),
+ "tests": source_kind_counts.get("tests", 0),
+ "fixtures": source_kind_counts.get("fixtures", 0),
+ "other": source_kind_counts.get("other", 0),
+ "report_only": True,
+ },
+ "items": [
+ {
+ "category": surface.category,
+ "capability": surface.capability,
+ "module": surface.module,
+ "filepath": surface.filepath,
+ "qualname": surface.qualname,
+ "start_line": surface.start_line,
+ "end_line": surface.end_line,
+ "source_kind": _security_surface_source_kind(
+ surface,
+ scan_root=scan_root,
+ ),
+ "location_scope": surface.location_scope,
+ "classification_mode": surface.classification_mode,
+ "evidence_kind": surface.evidence_kind,
+ "evidence_symbol": surface.evidence_symbol,
+ }
+ for surface in sorted_surfaces
+ ],
+ }
+
+
+__all__ = ["build_security_surfaces_payload"]
diff --git a/codeclone/core/worker.py b/codeclone/core/worker.py
new file mode 100644
index 0000000..4cbd52c
--- /dev/null
+++ b/codeclone/core/worker.py
@@ -0,0 +1,171 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+import inspect
+import os
+from collections.abc import Callable
+from pathlib import Path
+
+from ..analysis.normalizer import NormalizationConfig
+from ..analysis.units import extract_units_and_stats_from_source
+from ..cache.entries import FileStat
+from ..contracts import (
+ DEFAULT_BLOCK_MIN_LOC,
+ DEFAULT_BLOCK_MIN_STMT,
+ DEFAULT_SEGMENT_MIN_LOC,
+ DEFAULT_SEGMENT_MIN_STMT,
+)
+from ..scanner import module_name_from_path
+from ._types import MAX_FILE_SIZE, FileProcessResult
+
+
+def process_file(
+ filepath: str,
+ root: str,
+ cfg: NormalizationConfig,
+ min_loc: int,
+ min_stmt: int,
+ collect_structural_findings: bool = True,
+ collect_api_surface: bool = False,
+ api_include_private_modules: bool = False,
+ block_min_loc: int = DEFAULT_BLOCK_MIN_LOC,
+ block_min_stmt: int = DEFAULT_BLOCK_MIN_STMT,
+ segment_min_loc: int = DEFAULT_SEGMENT_MIN_LOC,
+ segment_min_stmt: int = DEFAULT_SEGMENT_MIN_STMT,
+) -> FileProcessResult:
+ try:
+ try:
+ stat_result = os.stat(filepath)
+ if stat_result.st_size > MAX_FILE_SIZE:
+ return FileProcessResult(
+ filepath=filepath,
+ success=False,
+ error=(
+ f"File too large: {stat_result.st_size} bytes "
+ f"(max {MAX_FILE_SIZE})"
+ ),
+ error_kind="file_too_large",
+ )
+ except OSError as exc:
+ return FileProcessResult(
+ filepath=filepath,
+ success=False,
+ error=f"Cannot stat file: {exc}",
+ error_kind="stat_error",
+ )
+ stat: FileStat = {
+ "mtime_ns": stat_result.st_mtime_ns,
+ "size": stat_result.st_size,
+ }
+ try:
+ source = Path(filepath).read_text("utf-8")
+ except UnicodeDecodeError as exc:
+ return FileProcessResult(
+ filepath=filepath,
+ success=False,
+ error=f"Encoding error: {exc}",
+ error_kind="source_read_error",
+ )
+ except OSError as exc:
+ return FileProcessResult(
+ filepath=filepath,
+ success=False,
+ error=f"Cannot read file: {exc}",
+ error_kind="source_read_error",
+ )
+ module_name = module_name_from_path(root, filepath)
+ units, blocks, segments, source_stats, file_metrics, structural_findings = (
+ extract_units_and_stats_from_source(
+ source=source,
+ filepath=filepath,
+ module_name=module_name,
+ cfg=cfg,
+ min_loc=min_loc,
+ min_stmt=min_stmt,
+ block_min_loc=block_min_loc,
+ block_min_stmt=block_min_stmt,
+ segment_min_loc=segment_min_loc,
+ segment_min_stmt=segment_min_stmt,
+ collect_structural_findings=collect_structural_findings,
+ collect_api_surface=collect_api_surface,
+ api_include_private_modules=api_include_private_modules,
+ )
+ )
+ return FileProcessResult(
+ filepath=filepath,
+ success=True,
+ units=units,
+ blocks=blocks,
+ segments=segments,
+ lines=source_stats.lines,
+ functions=source_stats.functions,
+ methods=source_stats.methods,
+ classes=source_stats.classes,
+ stat=stat,
+ file_metrics=file_metrics,
+ structural_findings=structural_findings,
+ )
+ except Exception as exc: # pragma: no cover - defensive shell around workers
+ return FileProcessResult(
+ filepath=filepath,
+ success=False,
+ error=f"Unexpected error: {type(exc).__name__}: {exc}",
+ error_kind="unexpected_error",
+ )
+
+
+def _invoke_process_file(
+ filepath: str,
+ root: str,
+ cfg: NormalizationConfig,
+ min_loc: int,
+ min_stmt: int,
+ *,
+ collect_structural_findings: bool,
+ collect_api_surface: bool,
+ api_include_private_modules: bool,
+ block_min_loc: int,
+ block_min_stmt: int,
+ segment_min_loc: int,
+ segment_min_stmt: int,
+) -> FileProcessResult:
+ optional_kwargs: dict[str, object] = {
+ "collect_structural_findings": collect_structural_findings,
+ "collect_api_surface": collect_api_surface,
+ "api_include_private_modules": api_include_private_modules,
+ "block_min_loc": block_min_loc,
+ "block_min_stmt": block_min_stmt,
+ "segment_min_loc": segment_min_loc,
+ "segment_min_stmt": segment_min_stmt,
+ }
+ try:
+ signature = inspect.signature(process_file)
+ except (TypeError, ValueError):
+ supported_kwargs = optional_kwargs
+ else:
+ parameters = tuple(signature.parameters.values())
+ if any(
+ parameter.kind == inspect.Parameter.VAR_KEYWORD for parameter in parameters
+ ):
+ supported_kwargs = optional_kwargs
+ else:
+ supported_names = {parameter.name for parameter in parameters}
+ supported_kwargs = {
+ key: value
+ for key, value in optional_kwargs.items()
+ if key in supported_names
+ }
+ process_callable: Callable[..., FileProcessResult] = process_file
+ return process_callable(
+ filepath,
+ root,
+ cfg,
+ min_loc,
+ min_stmt,
+ **supported_kwargs,
+ )
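
_invoke_process_file inspects process_file's signature before forwarding optional keyword arguments, so a monkeypatched or older variant that lacks the newer parameters is still callable. The shim reduces to this shape:

import inspect
from collections.abc import Callable


def call_with_supported_kwargs(func: Callable[..., object], **kwargs: object) -> object:
    # Forward only the keywords the callee declares, unless it takes **kwargs.
    parameters = inspect.signature(func).parameters.values()
    if any(p.kind is inspect.Parameter.VAR_KEYWORD for p in parameters):
        return func(**kwargs)
    supported = {p.name for p in parameters}
    return func(**{key: value for key, value in kwargs.items() if key in supported})


def legacy_worker(filepath: str) -> str:  # hypothetical older entrypoint
    return filepath


assert call_with_supported_kwargs(legacy_worker, filepath="a.py", block_min_loc=20) == "a.py"
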
diff --git a/codeclone/domain/__init__.py b/codeclone/domain/__init__.py
index 61cd04f..9135843 100644
--- a/codeclone/domain/__init__.py
+++ b/codeclone/domain/__init__.py
@@ -3,135 +3,3 @@
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
# SPDX-License-Identifier: MPL-2.0
# Copyright (c) 2026 Den Rozhnovskiy
-
-from .findings import (
- CATEGORY_CLONE,
- CATEGORY_COHESION,
- CATEGORY_COMPLEXITY,
- CATEGORY_COUPLING,
- CATEGORY_DEAD_CODE,
- CATEGORY_DEPENDENCY,
- CATEGORY_STRUCTURAL,
- CLONE_KIND_BLOCK,
- CLONE_KIND_FUNCTION,
- CLONE_KIND_SEGMENT,
- CLONE_NOVELTY_KNOWN,
- CLONE_NOVELTY_NEW,
- FAMILY_CLONE,
- FAMILY_CLONES,
- FAMILY_DEAD_CODE,
- FAMILY_DESIGN,
- FAMILY_METRICS,
- FAMILY_STRUCTURAL,
- FINDING_KIND_CLASS_HOTSPOT,
- FINDING_KIND_CLONE_GROUP,
- FINDING_KIND_CYCLE,
- FINDING_KIND_FUNCTION_HOTSPOT,
- FINDING_KIND_UNUSED_SYMBOL,
- STRUCTURAL_KIND_CLONE_COHORT_DRIFT,
- STRUCTURAL_KIND_CLONE_GUARD_EXIT_DIVERGENCE,
- STRUCTURAL_KIND_DUPLICATED_BRANCHES,
- SYMBOL_KIND_CLASS,
- SYMBOL_KIND_FUNCTION,
- SYMBOL_KIND_IMPORT,
- SYMBOL_KIND_METHOD,
-)
-from .quality import (
- CONFIDENCE_HIGH,
- CONFIDENCE_LOW,
- CONFIDENCE_MEDIUM,
- EFFORT_EASY,
- EFFORT_HARD,
- EFFORT_MODERATE,
- EFFORT_WEIGHT,
- HEALTH_GRADE_A,
- HEALTH_GRADE_B,
- HEALTH_GRADE_C,
- HEALTH_GRADE_D,
- HEALTH_GRADE_F,
- HEALTH_GRADES,
- RISK_HIGH,
- RISK_LOW,
- RISK_MEDIUM,
- SEVERITY_CRITICAL,
- SEVERITY_INFO,
- SEVERITY_ORDER,
- SEVERITY_RANK,
- SEVERITY_WARNING,
-)
-from .source_scope import (
- IMPACT_SCOPE_MIXED,
- IMPACT_SCOPE_NON_RUNTIME,
- IMPACT_SCOPE_RUNTIME,
- SOURCE_KIND_BREAKDOWN_KEYS,
- SOURCE_KIND_FIXTURES,
- SOURCE_KIND_MIXED,
- SOURCE_KIND_ORDER,
- SOURCE_KIND_OTHER,
- SOURCE_KIND_PRODUCTION,
- SOURCE_KIND_TESTS,
-)
-
-__all__ = [
- "CATEGORY_CLONE",
- "CATEGORY_COHESION",
- "CATEGORY_COMPLEXITY",
- "CATEGORY_COUPLING",
- "CATEGORY_DEAD_CODE",
- "CATEGORY_DEPENDENCY",
- "CATEGORY_STRUCTURAL",
- "CLONE_KIND_BLOCK",
- "CLONE_KIND_FUNCTION",
- "CLONE_KIND_SEGMENT",
- "CLONE_NOVELTY_KNOWN",
- "CLONE_NOVELTY_NEW",
- "CONFIDENCE_HIGH",
- "CONFIDENCE_LOW",
- "CONFIDENCE_MEDIUM",
- "EFFORT_EASY",
- "EFFORT_HARD",
- "EFFORT_MODERATE",
- "EFFORT_WEIGHT",
- "FAMILY_CLONE",
- "FAMILY_CLONES",
- "FAMILY_DEAD_CODE",
- "FAMILY_DESIGN",
- "FAMILY_METRICS",
- "FAMILY_STRUCTURAL",
- "FINDING_KIND_CLASS_HOTSPOT",
- "FINDING_KIND_CLONE_GROUP",
- "FINDING_KIND_CYCLE",
- "FINDING_KIND_FUNCTION_HOTSPOT",
- "FINDING_KIND_UNUSED_SYMBOL",
- "HEALTH_GRADES",
- "HEALTH_GRADE_A",
- "HEALTH_GRADE_B",
- "HEALTH_GRADE_C",
- "HEALTH_GRADE_D",
- "HEALTH_GRADE_F",
- "IMPACT_SCOPE_MIXED",
- "IMPACT_SCOPE_NON_RUNTIME",
- "IMPACT_SCOPE_RUNTIME",
- "RISK_HIGH",
- "RISK_LOW",
- "RISK_MEDIUM",
- "SEVERITY_CRITICAL",
- "SEVERITY_INFO",
- "SEVERITY_ORDER",
- "SEVERITY_RANK",
- "SEVERITY_WARNING",
- "SOURCE_KIND_BREAKDOWN_KEYS",
- "SOURCE_KIND_FIXTURES",
- "SOURCE_KIND_MIXED",
- "SOURCE_KIND_ORDER",
- "SOURCE_KIND_OTHER",
- "SOURCE_KIND_PRODUCTION",
- "SOURCE_KIND_TESTS",
- "STRUCTURAL_KIND_CLONE_COHORT_DRIFT",
- "STRUCTURAL_KIND_CLONE_GUARD_EXIT_DIVERGENCE",
- "STRUCTURAL_KIND_DUPLICATED_BRANCHES",
- "SYMBOL_KIND_CLASS",
- "SYMBOL_KIND_FUNCTION",
- "SYMBOL_KIND_IMPORT",
- "SYMBOL_KIND_METHOD",
-]
diff --git a/codeclone/extractor.py b/codeclone/extractor.py
deleted file mode 100644
index bacbef4..0000000
--- a/codeclone/extractor.py
+++ /dev/null
@@ -1,1149 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# SPDX-License-Identifier: MPL-2.0
-# Copyright (c) 2026 Den Rozhnovskiy
-
-from __future__ import annotations
-
-import ast
-import io
-import math
-import os
-import signal
-import tokenize
-from contextlib import contextmanager
-from dataclasses import dataclass, field
-from hashlib import sha1 as _sha1
-from typing import TYPE_CHECKING, Literal, NamedTuple
-
-from . import qualnames as _qualnames
-from .blocks import extract_blocks, extract_segments
-from .cfg import CFGBuilder
-from .errors import ParseError
-from .fingerprint import bucket_loc, sha1
-from .metrics import (
- cohesion_risk,
- compute_cbo,
- compute_lcom4,
- coupling_risk,
- cyclomatic_complexity,
- risk_level,
-)
-from .metrics.adoption import collect_module_adoption
-from .metrics.api_surface import collect_module_api_surface
-from .models import (
- BlockUnit,
- ClassMetrics,
- DeadCandidate,
- FileMetrics,
- ModuleDep,
- SegmentUnit,
- SourceStats,
- StructuralFindingGroup,
- Unit,
-)
-from .normalize import (
- AstNormalizer,
- NormalizationConfig,
- normalized_ast_dump_from_list,
- stmt_hashes,
-)
-from .paths import is_test_filepath
-from .structural_findings import scan_function_structure
-from .suppressions import (
- DeclarationTarget,
- bind_suppressions_to_declarations,
- build_suppression_index,
- extract_suppression_directives,
- suppression_target_key,
-)
-
-if TYPE_CHECKING:
- from collections.abc import Iterator, Mapping
-
- from .suppressions import SuppressionTargetKey
-
-__all__ = [
- "Unit",
- "extract_units_and_stats_from_source",
-]
-
-# =========================
-# Helpers
-# =========================
-
-PARSE_TIMEOUT_SECONDS = 5
-
-
-class _ParseTimeoutError(Exception):
- pass
-
-
-# Any named declaration: function, async function, or class.
-_NamedDeclarationNode = _qualnames.FunctionNode | ast.ClassDef
-# Unique key for a declaration's token index: (start_line, end_line, qualname).
-_DeclarationTokenIndexKey = tuple[int, int, str]
-_DECLARATION_TOKEN_STRINGS = frozenset({"def", "async", "class"})
-
-
-def _consumed_cpu_seconds(resource_module: object) -> float:
- """Return consumed CPU seconds for the current process."""
- try:
- usage = resource_module.getrusage( # type: ignore[attr-defined]
- resource_module.RUSAGE_SELF # type: ignore[attr-defined]
- )
- return float(usage.ru_utime) + float(usage.ru_stime)
- except Exception:
- return 0.0
-
-
-@contextmanager
-def _parse_limits(timeout_s: int) -> Iterator[None]:
- if os.name != "posix" or timeout_s <= 0:
- yield
- return
-
- old_handler = signal.getsignal(signal.SIGALRM)
-
- def _timeout_handler(_signum: int, _frame: object) -> None:
- raise _ParseTimeoutError("AST parsing timeout")
-
- old_limits: tuple[int, int] | None = None
- try:
- signal.signal(signal.SIGALRM, _timeout_handler)
- signal.setitimer(signal.ITIMER_REAL, timeout_s)
-
- try:
- import resource
-
- old_limits = resource.getrlimit(resource.RLIMIT_CPU)
- soft, hard = old_limits
- consumed_cpu_s = _consumed_cpu_seconds(resource)
- desired_soft = max(1, timeout_s + math.ceil(consumed_cpu_s))
- if soft == resource.RLIM_INFINITY:
- candidate_soft = desired_soft
- else:
- # Never reduce finite soft limits and avoid immediate SIGXCPU
- # when the process already consumed more CPU than timeout_s.
- candidate_soft = max(soft, desired_soft)
- if hard == resource.RLIM_INFINITY:
- new_soft = candidate_soft
- else:
- new_soft = min(max(1, hard), candidate_soft)
- # Never lower hard limit: raising it back may be disallowed for
- # unprivileged processes and can lead to process termination later.
- resource.setrlimit(resource.RLIMIT_CPU, (new_soft, hard))
- except Exception:
- # If resource is unavailable or cannot be set, rely on alarm only.
- pass
-
- yield
- finally:
- signal.setitimer(signal.ITIMER_REAL, 0)
- signal.signal(signal.SIGALRM, old_handler)
- if old_limits is not None:
- try:
- import resource
-
- resource.setrlimit(resource.RLIMIT_CPU, old_limits)
- except Exception:
- pass
-
-
-def _parse_with_limits(source: str, timeout_s: int) -> ast.AST:
- try:
- with _parse_limits(timeout_s):
- return ast.parse(source)
- except _ParseTimeoutError as e:
- raise ParseError(str(e)) from e
-
-
-def _stmt_count(node: ast.AST) -> int:
- body = getattr(node, "body", None)
- return len(body) if isinstance(body, list) else 0
-
-
-def _source_tokens(source: str) -> tuple[tokenize.TokenInfo, ...]:
- try:
- return tuple(tokenize.generate_tokens(io.StringIO(source).readline))
- except tokenize.TokenError:
- return ()
-
-
-def _declaration_token_name(node: ast.AST) -> str:
- if isinstance(node, ast.ClassDef):
- return "class"
- if isinstance(node, ast.AsyncFunctionDef):
- return "async"
- return "def"
-
-
-def _declaration_token_index(
- *,
- source_tokens: tuple[tokenize.TokenInfo, ...],
- start_line: int,
- start_col: int,
- declaration_token: str,
- source_token_index: Mapping[_DeclarationTokenIndexKey, int] | None = None,
-) -> int | None:
- if source_token_index is not None:
- return source_token_index.get((start_line, start_col, declaration_token))
- for idx, token in enumerate(source_tokens):
- if token.start != (start_line, start_col):
- continue
- if token.type == tokenize.NAME and token.string == declaration_token:
- return idx
- return None
-
-
-def _build_declaration_token_index(
- source_tokens: tuple[tokenize.TokenInfo, ...],
-) -> Mapping[_DeclarationTokenIndexKey, int]:
- indexed: dict[_DeclarationTokenIndexKey, int] = {}
- for idx, token in enumerate(source_tokens):
- if token.type == tokenize.NAME and token.string in _DECLARATION_TOKEN_STRINGS:
- indexed[(token.start[0], token.start[1], token.string)] = idx
- return indexed
-
-
-def _scan_declaration_colon_line(
- *,
- source_tokens: tuple[tokenize.TokenInfo, ...],
- start_index: int,
-) -> int | None:
- nesting = 0
- for token in source_tokens[start_index + 1 :]:
- if token.type == tokenize.OP:
- if token.string in "([{":
- nesting += 1
- continue
- if token.string in ")]}":
- if nesting > 0:
- nesting -= 1
- continue
- if token.string == ":" and nesting == 0:
- return token.start[0]
- if token.type == tokenize.NEWLINE and nesting == 0:
- return None
- return None
-
-
-def _fallback_declaration_end_line(node: ast.AST, *, start_line: int) -> int:
- body = getattr(node, "body", None)
- if not isinstance(body, list) or not body:
- return start_line
-
- first_body_line = int(getattr(body[0], "lineno", 0))
- if first_body_line <= 0 or first_body_line == start_line:
- return start_line
- return max(start_line, first_body_line - 1)
-
-
-def _declaration_end_line(
- node: ast.AST,
- *,
- source_tokens: tuple[tokenize.TokenInfo, ...],
- source_token_index: Mapping[_DeclarationTokenIndexKey, int] | None = None,
-) -> int:
- start_line = int(getattr(node, "lineno", 0))
- start_col = int(getattr(node, "col_offset", 0))
- if start_line <= 0:
- return 0
-
- declaration_token = _declaration_token_name(node)
- start_index = _declaration_token_index(
- source_tokens=source_tokens,
- start_line=start_line,
- start_col=start_col,
- declaration_token=declaration_token,
- source_token_index=source_token_index,
- )
- if start_index is None:
- return _fallback_declaration_end_line(node, start_line=start_line)
-
- colon_line = _scan_declaration_colon_line(
- source_tokens=source_tokens,
- start_index=start_index,
- )
- if colon_line is not None:
- return colon_line
- return _fallback_declaration_end_line(node, start_line=start_line)
-
-
-# =========================
-# CFG fingerprinting
-# =========================
-
-
-def _cfg_fingerprint_and_complexity(
- node: _qualnames.FunctionNode,
- cfg: NormalizationConfig,
- qualname: str,
-) -> tuple[str, int]:
- """
- Generate a structural fingerprint for a function using CFG analysis.
-
- The fingerprint is computed by:
- 1. Building a Control Flow Graph (CFG) from the function
- 2. Normalizing each CFG block's statements (variable names, constants, etc.)
- 3. Creating a canonical representation of the CFG structure
- 4. Hashing the representation with SHA-1
-
- Functions with identical control flow and normalized statements will
- produce the same fingerprint, even if they differ in variable names,
- constants, or type annotations.
-
- Args:
- node: Function AST node to fingerprint
- cfg: Normalization configuration (what to ignore)
- qualname: Qualified name for logging/debugging
-
- Returns:
- 40-character hex SHA-1 hash of the normalized CFG
- """
- builder = CFGBuilder()
- graph = builder.build(qualname, node)
- cfg_normalizer = AstNormalizer(cfg)
-
- # Use generator to avoid building large list of strings
- parts: list[str] = []
- for block in sorted(graph.blocks, key=lambda b: b.id):
- succ_ids = ",".join(
- str(s.id) for s in sorted(block.successors, key=lambda s: s.id)
- )
- block_dump = normalized_ast_dump_from_list(
- block.statements,
- cfg,
- normalizer=cfg_normalizer,
- )
- parts.append(f"BLOCK[{block.id}]:{block_dump}|SUCCESSORS:{succ_ids}")
- return sha1("|".join(parts)), cyclomatic_complexity(graph)
-
-
-def _raw_source_hash_for_range(
- source_lines: list[str],
- start_line: int,
- end_line: int,
-) -> str:
- window = "".join(source_lines[start_line - 1 : end_line]).strip()
- no_space = "".join(window.split())
- return _sha1(no_space.encode("utf-8")).hexdigest()
-
-
-def _resolve_import_target(
- module_name: str,
- import_node: ast.ImportFrom,
-) -> str:
- if import_node.level <= 0:
- return import_node.module or ""
-
- parent_parts = module_name.split(".")
- keep = max(0, len(parent_parts) - import_node.level)
- prefix = parent_parts[:keep]
- if import_node.module:
- return ".".join([*prefix, import_node.module])
- return ".".join(prefix)
-
-
-_PROTOCOL_MODULE_NAMES = frozenset({"typing", "typing_extensions"})
-
-
-@dataclass(slots=True)
-class _ModuleWalkState:
- import_names: set[str] = field(default_factory=set)
- deps: list[ModuleDep] = field(default_factory=list)
- referenced_names: set[str] = field(default_factory=set)
- imported_symbol_bindings: dict[str, set[str]] = field(default_factory=dict)
- imported_module_aliases: dict[str, str] = field(default_factory=dict)
- name_nodes: list[ast.Name] = field(default_factory=list)
- attr_nodes: list[ast.Attribute] = field(default_factory=list)
- protocol_symbol_aliases: set[str] = field(default_factory=lambda: {"Protocol"})
- protocol_module_aliases: set[str] = field(
- default_factory=lambda: set(_PROTOCOL_MODULE_NAMES)
- )
-
-
-def _append_module_dep(
- *,
- module_name: str,
- target: str,
- import_type: Literal["import", "from_import"],
- line: int,
- state: _ModuleWalkState,
-) -> None:
- state.deps.append(
- ModuleDep(
- source=module_name,
- target=target,
- import_type=import_type,
- line=line,
- )
- )
-
-
-def _collect_import_node(
- *,
- node: ast.Import,
- module_name: str,
- state: _ModuleWalkState,
- collect_referenced_names: bool,
-) -> None:
- line = int(getattr(node, "lineno", 0))
- for alias in node.names:
- alias_name = alias.asname or alias.name.split(".", 1)[0]
- state.import_names.add(alias_name)
- _append_module_dep(
- module_name=module_name,
- target=alias.name,
- import_type="import",
- line=line,
- state=state,
- )
- if collect_referenced_names:
- state.imported_module_aliases[alias_name] = alias.name
- if alias.name in _PROTOCOL_MODULE_NAMES:
- state.protocol_module_aliases.add(alias_name)
-
-
-def _dotted_expr_name(expr: ast.expr) -> str | None:
- if isinstance(expr, ast.Name):
- return expr.id
- if isinstance(expr, ast.Attribute):
- prefix = _dotted_expr_name(expr.value)
- if prefix is None:
- return None
- return f"{prefix}.{expr.attr}"
- return None
-
-
-def _collect_import_from_node(
- *,
- node: ast.ImportFrom,
- module_name: str,
- state: _ModuleWalkState,
- collect_referenced_names: bool,
-) -> None:
- target = _resolve_import_target(module_name, node)
- if target:
- state.import_names.add(target.split(".", 1)[0])
- _append_module_dep(
- module_name=module_name,
- target=target,
- import_type="from_import",
- line=int(getattr(node, "lineno", 0)),
- state=state,
- )
-
- if node.module in _PROTOCOL_MODULE_NAMES:
- for alias in node.names:
- if alias.name == "Protocol":
- state.protocol_symbol_aliases.add(alias.asname or alias.name)
-
- if not collect_referenced_names or not target:
- return
-
- for alias in node.names:
- if alias.name == "*":
- continue
- alias_name = alias.asname or alias.name
- state.imported_symbol_bindings.setdefault(alias_name, set()).add(
- f"{target}:{alias.name}"
- )
-
-
-def _is_protocol_class(
- class_node: ast.ClassDef,
- *,
- protocol_symbol_aliases: frozenset[str],
- protocol_module_aliases: frozenset[str],
-) -> bool:
- for base in class_node.bases:
- base_name = _dotted_expr_name(base)
- if base_name is None:
- continue
- if base_name in protocol_symbol_aliases:
- return True
- if "." in base_name and base_name.rsplit(".", 1)[-1] == "Protocol":
- module_alias = base_name.rsplit(".", 1)[0]
- if module_alias in protocol_module_aliases:
- return True
- return False
-
-
-def _is_non_runtime_candidate(node: _qualnames.FunctionNode) -> bool:
- for decorator in node.decorator_list:
- name = _dotted_expr_name(decorator)
- if name is None:
- continue
- terminal = name.rsplit(".", 1)[-1]
- if terminal in {"overload", "abstractmethod"}:
- return True
- return False
-
-
-def _node_line_span(node: ast.AST) -> tuple[int, int] | None:
- start = int(getattr(node, "lineno", 0))
- end = int(getattr(node, "end_lineno", 0))
- if start <= 0 or end <= 0:
- return None
- return start, end
-
-
-def _eligible_unit_shape(
- node: _qualnames.FunctionNode,
- *,
- min_loc: int,
- min_stmt: int,
-) -> tuple[int, int, int, int] | None:
- span = _node_line_span(node)
- if span is None:
- return None
- start, end = span
- if end < start:
- return None
- loc = end - start + 1
- stmt_count = _stmt_count(node)
- if loc < min_loc or stmt_count < min_stmt:
- return None
- return start, end, loc, stmt_count
-
-
-def _class_metrics_for_node(
- *,
- module_name: str,
- class_qualname: str,
- class_node: ast.ClassDef,
- filepath: str,
- module_import_names: set[str],
- module_class_names: set[str],
-) -> ClassMetrics | None:
- span = _node_line_span(class_node)
- if span is None:
- return None
- start, end = span
- cbo, coupled_classes = compute_cbo(
- class_node,
- module_import_names=module_import_names,
- module_class_names=module_class_names,
- )
- lcom4, method_count, instance_var_count = compute_lcom4(class_node)
- return ClassMetrics(
- qualname=f"{module_name}:{class_qualname}",
- filepath=filepath,
- start_line=start,
- end_line=end,
- cbo=cbo,
- lcom4=lcom4,
- method_count=method_count,
- instance_var_count=instance_var_count,
- risk_coupling=coupling_risk(cbo),
- risk_cohesion=cohesion_risk(lcom4),
- coupled_classes=coupled_classes,
- )
-
-
-def _dead_candidate_kind(local_name: str) -> Literal["function", "method"]:
- return "method" if "." in local_name else "function"
-
-
-def _should_skip_dead_candidate(
- local_name: str,
- node: _qualnames.FunctionNode,
- *,
- protocol_class_qualnames: set[str],
-) -> bool:
- if _is_non_runtime_candidate(node):
- return True
- if "." not in local_name:
- return False
- owner_qualname = local_name.rsplit(".", 1)[0]
- return owner_qualname in protocol_class_qualnames
-
-
-def _build_dead_candidate(
- *,
- module_name: str,
- local_name: str,
- node: _NamedDeclarationNode,
- filepath: str,
- kind: Literal["class", "function", "method"],
- suppression_index: Mapping[SuppressionTargetKey, tuple[str, ...]],
- start_line: int,
- end_line: int,
-) -> DeadCandidate:
- qualname = f"{module_name}:{local_name}"
- return DeadCandidate(
- qualname=qualname,
- local_name=node.name,
- filepath=filepath,
- start_line=start_line,
- end_line=end_line,
- kind=kind,
- suppressed_rules=suppression_index.get(
- suppression_target_key(
- filepath=filepath,
- qualname=qualname,
- start_line=start_line,
- end_line=end_line,
- kind=kind,
- ),
- (),
- ),
- )
-
-
-def _dead_candidate_for_unit(
- *,
- module_name: str,
- local_name: str,
- node: _qualnames.FunctionNode,
- filepath: str,
- suppression_index: Mapping[SuppressionTargetKey, tuple[str, ...]],
- protocol_class_qualnames: set[str],
-) -> DeadCandidate | None:
- span = _node_line_span(node)
- if span is None:
- return None
- if _should_skip_dead_candidate(
- local_name,
- node,
- protocol_class_qualnames=protocol_class_qualnames,
- ):
- return None
- start, end = span
- return _build_dead_candidate(
- module_name=module_name,
- local_name=local_name,
- node=node,
- filepath=filepath,
- kind=_dead_candidate_kind(local_name),
- suppression_index=suppression_index,
- start_line=start,
- end_line=end,
- )
-
-
-def _collect_load_reference_node(
- *,
- node: ast.AST,
- state: _ModuleWalkState,
-) -> None:
- if isinstance(node, ast.Name) and isinstance(node.ctx, ast.Load):
- state.referenced_names.add(node.id)
- state.name_nodes.append(node)
- return
- if isinstance(node, ast.Attribute) and isinstance(node.ctx, ast.Load):
- state.referenced_names.add(node.attr)
- state.attr_nodes.append(node)
-
-
-def _resolve_referenced_qualnames(
- *,
- module_name: str,
- collector: _qualnames.QualnameCollector,
- state: _ModuleWalkState,
-) -> frozenset[str]:
- top_level_class_by_name = {
- class_qualname: class_qualname
- for class_qualname, _class_node in collector.class_nodes
- if "." not in class_qualname
- }
- local_method_qualnames = frozenset(
- f"{module_name}:{local_name}"
- for local_name, _node in collector.units
- if "." in local_name
- )
-
- resolved: set[str] = set()
- for name_node in state.name_nodes:
- for qualname in state.imported_symbol_bindings.get(name_node.id, ()):
- resolved.add(qualname)
-
- for attr_node in state.attr_nodes:
- base = attr_node.value
- if isinstance(base, ast.Name):
- imported_module = state.imported_module_aliases.get(base.id)
- if imported_module is not None:
- resolved.add(f"{imported_module}:{attr_node.attr}")
- else:
- class_qualname = top_level_class_by_name.get(base.id)
- if class_qualname is not None:
- local_method_qualname = (
- f"{module_name}:{class_qualname}.{attr_node.attr}"
- )
- if local_method_qualname in local_method_qualnames:
- resolved.add(local_method_qualname)
-
- return frozenset(resolved)
-
-
-class _ModuleWalkResult(NamedTuple):
- import_names: frozenset[str]
- module_deps: tuple[ModuleDep, ...]
- referenced_names: frozenset[str]
- referenced_qualnames: frozenset[str]
- protocol_symbol_aliases: frozenset[str]
- protocol_module_aliases: frozenset[str]
-
-
-def _collect_module_walk_data(
- *,
- tree: ast.AST,
- module_name: str,
- collector: _qualnames.QualnameCollector,
- collect_referenced_names: bool,
-) -> _ModuleWalkResult:
- """Single ast.walk that collects imports, deps, names, qualnames & protocol aliases.
-
- Reduces the hot path to one tree walk plus one local qualname resolution phase.
- """
- state = _ModuleWalkState()
- for node in ast.walk(tree):
- if isinstance(node, ast.Import):
- _collect_import_node(
- node=node,
- module_name=module_name,
- state=state,
- collect_referenced_names=collect_referenced_names,
- )
- elif isinstance(node, ast.ImportFrom):
- _collect_import_from_node(
- node=node,
- module_name=module_name,
- state=state,
- collect_referenced_names=collect_referenced_names,
- )
- elif collect_referenced_names:
- _collect_load_reference_node(node=node, state=state)
-
- deps_sorted = tuple(
- sorted(
- state.deps,
- key=lambda dep: (dep.source, dep.target, dep.import_type, dep.line),
- )
- )
- resolved = (
- _resolve_referenced_qualnames(
- module_name=module_name,
- collector=collector,
- state=state,
- )
- if collect_referenced_names
- else frozenset()
- )
-
- return _ModuleWalkResult(
- import_names=frozenset(state.import_names),
- module_deps=deps_sorted,
- referenced_names=frozenset(state.referenced_names),
- referenced_qualnames=resolved,
- protocol_symbol_aliases=frozenset(state.protocol_symbol_aliases),
- protocol_module_aliases=frozenset(state.protocol_module_aliases),
- )
-
-
-def _collect_dead_candidates(
- *,
- filepath: str,
- module_name: str,
- collector: _qualnames.QualnameCollector,
- protocol_symbol_aliases: frozenset[str] = frozenset({"Protocol"}),
- protocol_module_aliases: frozenset[str] = frozenset(
- {"typing", "typing_extensions"}
- ),
- suppression_rules_by_target: Mapping[SuppressionTargetKey, tuple[str, ...]]
- | None = None,
-) -> tuple[DeadCandidate, ...]:
- protocol_class_qualnames = {
- class_qualname
- for class_qualname, class_node in collector.class_nodes
- if _is_protocol_class(
- class_node,
- protocol_symbol_aliases=protocol_symbol_aliases,
- protocol_module_aliases=protocol_module_aliases,
- )
- }
-
- candidates: list[DeadCandidate] = []
- suppression_index = (
- suppression_rules_by_target if suppression_rules_by_target is not None else {}
- )
- for local_name, node in collector.units:
- candidate = _dead_candidate_for_unit(
- module_name=module_name,
- local_name=local_name,
- node=node,
- filepath=filepath,
- suppression_index=suppression_index,
- protocol_class_qualnames=protocol_class_qualnames,
- )
- if candidate is not None:
- candidates.append(candidate)
-
- for class_qualname, class_node in collector.class_nodes:
- span = _node_line_span(class_node)
- if span is not None:
- start, end = span
- candidates.append(
- _build_dead_candidate(
- module_name=module_name,
- local_name=class_qualname,
- node=class_node,
- filepath=filepath,
- kind="class",
- suppression_index=suppression_index,
- start_line=start,
- end_line=end,
- )
- )
-
- return tuple(
- sorted(
- candidates,
- key=lambda item: (
- item.filepath,
- item.start_line,
- item.end_line,
- item.qualname,
- ),
- )
- )
-
-
-def _collect_declaration_targets(
- *,
- filepath: str,
- module_name: str,
- collector: _qualnames.QualnameCollector,
- source_tokens: tuple[tokenize.TokenInfo, ...] = (),
- source_token_index: Mapping[_DeclarationTokenIndexKey, int] | None = None,
- include_inline_lines: bool = False,
-) -> tuple[DeclarationTarget, ...]:
- declarations: list[DeclarationTarget] = []
- declaration_specs: list[
- tuple[str, ast.AST, Literal["function", "method", "class"]]
- ] = [
- (
- local_name,
- node,
- "method" if "." in local_name else "function",
- )
- for local_name, node in collector.units
- ]
- declaration_specs.extend(
- (class_qualname, class_node, "class")
- for class_qualname, class_node in collector.class_nodes
- )
-
- for qualname_suffix, node, kind in declaration_specs:
- start = int(getattr(node, "lineno", 0))
- end = int(getattr(node, "end_lineno", 0))
- if start > 0 and end > 0:
- declaration_end_line = (
- _declaration_end_line(
- node,
- source_tokens=source_tokens,
- source_token_index=source_token_index,
- )
- if include_inline_lines
- else None
- )
- declarations.append(
- DeclarationTarget(
- filepath=filepath,
- qualname=f"{module_name}:{qualname_suffix}",
- start_line=start,
- end_line=end,
- kind=kind,
- declaration_end_line=declaration_end_line,
- )
- )
-
- return tuple(
- sorted(
- declarations,
- key=lambda item: (
- item.filepath,
- item.start_line,
- item.end_line,
- item.qualname,
- item.kind,
- ),
- )
- )
-
-
-def _build_suppression_index_for_source(
- *,
- source: str,
- filepath: str,
- module_name: str,
- collector: _qualnames.QualnameCollector,
-) -> Mapping[SuppressionTargetKey, tuple[str, ...]]:
- suppression_directives = extract_suppression_directives(source)
- if not suppression_directives:
- return {}
-
- needs_inline_binding = any(
- directive.binding == "inline" for directive in suppression_directives
- )
- source_tokens: tuple[tokenize.TokenInfo, ...] = ()
- source_token_index: Mapping[_DeclarationTokenIndexKey, int] | None = None
- if needs_inline_binding:
- source_tokens = _source_tokens(source)
- if source_tokens:
- source_token_index = _build_declaration_token_index(source_tokens)
-
- declaration_targets = _collect_declaration_targets(
- filepath=filepath,
- module_name=module_name,
- collector=collector,
- source_tokens=source_tokens,
- source_token_index=source_token_index,
- include_inline_lines=needs_inline_binding,
- )
- suppression_bindings = bind_suppressions_to_declarations(
- directives=suppression_directives,
- declarations=declaration_targets,
- )
- return build_suppression_index(suppression_bindings)
-
-
-# =========================
-# Public API
-# =========================
-
-
-def extract_units_and_stats_from_source(
- source: str,
- filepath: str,
- module_name: str,
- cfg: NormalizationConfig,
- min_loc: int,
- min_stmt: int,
- *,
- block_min_loc: int = 20,
- block_min_stmt: int = 8,
- segment_min_loc: int = 20,
- segment_min_stmt: int = 10,
- collect_structural_findings: bool = True,
- collect_api_surface: bool = False,
- api_include_private_modules: bool = False,
-) -> tuple[
- list[Unit],
- list[BlockUnit],
- list[SegmentUnit],
- SourceStats,
- FileMetrics,
- list[StructuralFindingGroup],
-]:
- try:
- tree = _parse_with_limits(source, PARSE_TIMEOUT_SECONDS)
- except SyntaxError as e:
- raise ParseError(f"Failed to parse {filepath}: {e}") from e
- if not isinstance(tree, ast.Module):
- raise ParseError(f"Failed to parse {filepath}: expected module AST root")
-
- collector = _qualnames.QualnameCollector()
- collector.visit(tree)
- source_lines = source.splitlines()
- source_line_count = len(source_lines)
-
- is_test_file = is_test_filepath(filepath)
-
- # Single-pass AST walk replaces 3 separate functions / 4 walks.
- _walk = _collect_module_walk_data(
- tree=tree,
- module_name=module_name,
- collector=collector,
- collect_referenced_names=not is_test_file,
- )
- import_names = _walk.import_names
- module_deps = _walk.module_deps
- referenced_names = _walk.referenced_names
- referenced_qualnames = _walk.referenced_qualnames
- protocol_symbol_aliases = _walk.protocol_symbol_aliases
- protocol_module_aliases = _walk.protocol_module_aliases
-
- suppression_index = _build_suppression_index_for_source(
- source=source,
- filepath=filepath,
- module_name=module_name,
- collector=collector,
- )
- class_names = frozenset(class_node.name for _, class_node in collector.class_nodes)
- module_import_names = set(import_names)
- module_class_names = set(class_names)
- class_metrics: list[ClassMetrics] = []
-
- units: list[Unit] = []
- block_units: list[BlockUnit] = []
- segment_units: list[SegmentUnit] = []
- structural_findings: list[StructuralFindingGroup] = []
-
- for local_name, node in collector.units:
- unit_shape = _eligible_unit_shape(
- node,
- min_loc=min_loc,
- min_stmt=min_stmt,
- )
- if unit_shape is None:
- continue
- start, end, loc, stmt_count = unit_shape
-
- qualname = f"{module_name}:{local_name}"
- fingerprint, complexity = _cfg_fingerprint_and_complexity(node, cfg, qualname)
- structure_facts = scan_function_structure(
- node,
- filepath,
- qualname,
- collect_findings=collect_structural_findings,
- )
- depth = structure_facts.nesting_depth
- risk = risk_level(complexity)
- raw_hash = _raw_source_hash_for_range(source_lines, start, end)
-
- units.append(
- Unit(
- qualname=qualname,
- filepath=filepath,
- start_line=start,
- end_line=end,
- loc=loc,
- stmt_count=stmt_count,
- fingerprint=fingerprint,
- loc_bucket=bucket_loc(loc),
- cyclomatic_complexity=complexity,
- nesting_depth=depth,
- risk=risk,
- raw_hash=raw_hash,
- entry_guard_count=structure_facts.entry_guard_count,
- entry_guard_terminal_profile=(
- structure_facts.entry_guard_terminal_profile
- ),
- entry_guard_has_side_effect_before=(
- structure_facts.entry_guard_has_side_effect_before
- ),
- terminal_kind=structure_facts.terminal_kind,
- try_finally_profile=structure_facts.try_finally_profile,
- side_effect_order_profile=structure_facts.side_effect_order_profile,
- )
- )
-
- needs_blocks = (
- not local_name.endswith("__init__")
- and loc >= block_min_loc
- and stmt_count >= block_min_stmt
- )
- needs_segments = loc >= segment_min_loc and stmt_count >= segment_min_stmt
-
- if needs_blocks or needs_segments:
- body = getattr(node, "body", None)
- hashes: list[str] | None = None
- if isinstance(body, list):
- hashes = stmt_hashes(body, cfg)
-
- if needs_blocks:
- block_units.extend(
- extract_blocks(
- node,
- filepath=filepath,
- qualname=qualname,
- cfg=cfg,
- block_size=4,
- max_blocks=15,
- precomputed_hashes=hashes,
- )
- )
-
- if needs_segments:
- segment_units.extend(
- extract_segments(
- node,
- filepath=filepath,
- qualname=qualname,
- cfg=cfg,
- window_size=6,
- max_segments=60,
- precomputed_hashes=hashes,
- )
- )
-
- if collect_structural_findings:
- structural_findings.extend(structure_facts.structural_findings)
-
- for class_qualname, class_node in collector.class_nodes:
- class_metric = _class_metrics_for_node(
- module_name=module_name,
- class_qualname=class_qualname,
- class_node=class_node,
- filepath=filepath,
- module_import_names=module_import_names,
- module_class_names=module_class_names,
- )
- if class_metric is not None:
- class_metrics.append(class_metric)
-
- dead_candidates = _collect_dead_candidates(
- filepath=filepath,
- module_name=module_name,
- collector=collector,
- protocol_symbol_aliases=protocol_symbol_aliases,
- protocol_module_aliases=protocol_module_aliases,
- suppression_rules_by_target=suppression_index,
- )
-
- sorted_class_metrics = tuple(
- sorted(
- class_metrics,
- key=lambda item: (
- item.filepath,
- item.start_line,
- item.end_line,
- item.qualname,
- ),
- )
- )
- typing_coverage, docstring_coverage = collect_module_adoption(
- tree=tree,
- module_name=module_name,
- filepath=filepath,
- collector=collector,
- imported_names=import_names,
- )
- api_surface = None
- if collect_api_surface:
- api_surface = collect_module_api_surface(
- tree=tree,
- module_name=module_name,
- filepath=filepath,
- collector=collector,
- imported_names=import_names,
- include_private_modules=api_include_private_modules,
- )
-
- return (
- units,
- block_units,
- segment_units,
- SourceStats(
- lines=source_line_count,
- functions=collector.function_count,
- methods=collector.method_count,
- classes=collector.class_count,
- ),
- FileMetrics(
- class_metrics=sorted_class_metrics,
- module_deps=module_deps,
- dead_candidates=dead_candidates,
- referenced_names=referenced_names,
- import_names=import_names,
- class_names=class_names,
- referenced_qualnames=referenced_qualnames,
- typing_coverage=typing_coverage,
- docstring_coverage=docstring_coverage,
- api_surface=api_surface,
- ),
- structural_findings,
- )
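
The removed _cfg_fingerprint_and_complexity docstring describes why fingerprints collide for functions that differ only in identifiers or constants: everything name- or value-shaped is normalized away before hashing. A toy illustration of that idea (this is not the real normalizer, which works block by block over the CFG rather than on whole trees):

import ast
import hashlib


class _Blank(ast.NodeTransformer):
    # Toy normalizer: erase names and constant values so only the
    # tree structure contributes to the hash.
    def visit_FunctionDef(self, node: ast.FunctionDef) -> ast.FunctionDef:
        node.name = "_"
        self.generic_visit(node)
        return node

    def visit_arg(self, node: ast.arg) -> ast.arg:
        node.arg = "_"
        return node

    def visit_Name(self, node: ast.Name) -> ast.Name:
        node.id = "_"
        return node

    def visit_Constant(self, node: ast.Constant) -> ast.Constant:
        node.value = None
        return node


def toy_fingerprint(source: str) -> str:
    tree = _Blank().visit(ast.parse(source))
    return hashlib.sha1(ast.dump(tree).encode("utf-8")).hexdigest()


# Different names and constants, same structure => same fingerprint.
assert toy_fingerprint("def f(x):\n    return x + 1\n") == toy_fingerprint(
    "def g(total):\n    return total + 42\n"
)
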
diff --git a/codeclone/findings/__init__.py b/codeclone/findings/__init__.py
new file mode 100644
index 0000000..9135843
--- /dev/null
+++ b/codeclone/findings/__init__.py
@@ -0,0 +1,5 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
diff --git a/codeclone/findings/clones/__init__.py b/codeclone/findings/clones/__init__.py
new file mode 100644
index 0000000..9135843
--- /dev/null
+++ b/codeclone/findings/clones/__init__.py
@@ -0,0 +1,5 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
diff --git a/codeclone/golden_fixtures.py b/codeclone/findings/clones/golden_fixtures.py
similarity index 97%
rename from codeclone/golden_fixtures.py
rename to codeclone/findings/clones/golden_fixtures.py
index 3b6fe47..b60caaa 100644
--- a/codeclone/golden_fixtures.py
+++ b/codeclone/findings/clones/golden_fixtures.py
@@ -11,15 +11,15 @@
from pathlib import PurePosixPath
from typing import Literal
-from .domain.source_scope import SOURCE_KIND_FIXTURES, SOURCE_KIND_TESTS
-from .models import (
+from ...domain.source_scope import SOURCE_KIND_FIXTURES, SOURCE_KIND_TESTS
+from ...models import (
GroupItem,
GroupItemLike,
GroupMap,
GroupMapLike,
SuppressedCloneGroup,
)
-from .paths import classify_source_kind, normalize_repo_path, relative_repo_path
+from ...paths import classify_source_kind, normalize_repo_path, relative_repo_path
CloneGroupKind = Literal["function", "block", "segment"]
diff --git a/codeclone/grouping.py b/codeclone/findings/clones/grouping.py
similarity index 98%
rename from codeclone/grouping.py
rename to codeclone/findings/clones/grouping.py
index c4590b3..7aa37dc 100644
--- a/codeclone/grouping.py
+++ b/codeclone/findings/clones/grouping.py
@@ -9,7 +9,7 @@
from typing import TYPE_CHECKING
if TYPE_CHECKING:
- from .models import GroupItemsLike, GroupMap
+ from ...models import GroupItemsLike, GroupMap
def _group_items_by_key(
diff --git a/codeclone/findings/ids.py b/codeclone/findings/ids.py
new file mode 100644
index 0000000..d09cf0d
--- /dev/null
+++ b/codeclone/findings/ids.py
@@ -0,0 +1,31 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+
+def clone_group_id(kind: str, group_key: str) -> str:
+ return f"clone:{kind}:{group_key}"
+
+
+def structural_group_id(finding_kind: str, finding_key: str) -> str:
+ return f"structural:{finding_kind}:{finding_key}"
+
+
+def dead_code_group_id(subject_key: str) -> str:
+ return f"dead_code:{subject_key}"
+
+
+def design_group_id(category: str, subject_key: str) -> str:
+ return f"design:{category}:{subject_key}"
+
+
+__all__ = [
+ "clone_group_id",
+ "dead_code_group_id",
+ "design_group_id",
+ "structural_group_id",
+]
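
These helpers give every finding family a single namespaced ID scheme; IDs remain stable across runs as long as the underlying group keys do, which lets downstream consumers diff or suppress findings by exact match. For example (the group keys here are made up):

assert clone_group_id("function", "a1b2c3") == "clone:function:a1b2c3"
assert design_group_id("coupling", "pkg.mod:Widget") == "design:coupling:pkg.mod:Widget"
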
diff --git a/codeclone/findings/structural/__init__.py b/codeclone/findings/structural/__init__.py
new file mode 100644
index 0000000..9135843
--- /dev/null
+++ b/codeclone/findings/structural/__init__.py
@@ -0,0 +1,5 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
diff --git a/codeclone/structural_findings.py b/codeclone/findings/structural/detectors.py
similarity index 99%
rename from codeclone/structural_findings.py
rename to codeclone/findings/structural/detectors.py
index 2d805d7..0ebe138 100644
--- a/codeclone/structural_findings.py
+++ b/codeclone/findings/structural/detectors.py
@@ -21,13 +21,13 @@
from hashlib import sha1
from typing import TYPE_CHECKING, overload
-from ._coerce import as_int, as_str
-from .domain.findings import (
+from ...domain.findings import (
STRUCTURAL_KIND_CLONE_COHORT_DRIFT,
STRUCTURAL_KIND_CLONE_GUARD_EXIT_DIVERGENCE,
STRUCTURAL_KIND_DUPLICATED_BRANCHES,
)
-from .models import GroupItemLike, StructuralFindingGroup, StructuralFindingOccurrence
+from ...models import GroupItemLike, StructuralFindingGroup, StructuralFindingOccurrence
+from ...utils.coerce import as_int, as_str
if TYPE_CHECKING:
from collections.abc import Mapping, Sequence
diff --git a/codeclone/fingerprint.py b/codeclone/fingerprint.py
deleted file mode 100644
index 72adaee..0000000
--- a/codeclone/fingerprint.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# SPDX-License-Identifier: MPL-2.0
-# Copyright (c) 2026 Den Rozhnovskiy
-
-from __future__ import annotations
-
-import hashlib
-
-
-def sha1(s: str) -> str:
- return hashlib.sha1(s.encode("utf-8")).hexdigest()
-
-
-def bucket_loc(loc: int) -> str:
- # Helps avoid grouping wildly different sizes if desired
- if loc < 20:
- return "0-19"
- if loc < 50:
- return "20-49"
- if loc < 100:
- return "50-99"
- return "100+"
diff --git a/codeclone/html_report.py b/codeclone/html_report.py
deleted file mode 100644
index 16ceab5..0000000
--- a/codeclone/html_report.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# SPDX-License-Identifier: MPL-2.0
-# Copyright (c) 2026 Den Rozhnovskiy
-
-"""Public facade for HTML report generation.
-
-Re-exports build_html_report from the new _html_report package and
-keeps backward-compatible imports that tests and downstream code rely on.
-"""
-
-from __future__ import annotations
-
-from ._html_report import build_html_report
-from ._html_snippets import (
- _FileCache,
- _pygments_css,
- _render_code_block,
- _try_pygments,
-)
-
-__all__ = [
- "_FileCache",
- "_pygments_css",
- "_render_code_block",
- "_try_pygments",
- "build_html_report",
-]
diff --git a/codeclone/main.py b/codeclone/main.py
new file mode 100644
index 0000000..7e17b85
--- /dev/null
+++ b/codeclone/main.py
@@ -0,0 +1,17 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
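+# Thin entry point: re-export the CLI workflow's main() so this module can
+# also be executed directly via "python -m codeclone.main".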
+from .surfaces.cli.workflow import main
+
+__all__ = ["main"]
+
+
+if __name__ == "__main__":
+ main()
diff --git a/codeclone/mcp_service.py b/codeclone/mcp_service.py
deleted file mode 100644
index cbed02b..0000000
--- a/codeclone/mcp_service.py
+++ /dev/null
@@ -1,4727 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# SPDX-License-Identifier: MPL-2.0
-# Copyright (c) 2026 Den Rozhnovskiy
-
-from __future__ import annotations
-
-import hashlib
-import subprocess
-from argparse import Namespace
-from collections import OrderedDict
-from collections.abc import Iterable, Mapping, Sequence
-from dataclasses import dataclass
-from json import JSONDecodeError
-from pathlib import Path
-from threading import RLock
-from typing import Final, Literal, cast
-
-import orjson
-
-from . import __version__
-from ._cli_args import (
- DEFAULT_BASELINE_PATH,
- DEFAULT_BLOCK_MIN_LOC,
- DEFAULT_BLOCK_MIN_STMT,
- DEFAULT_MAX_BASELINE_SIZE_MB,
- DEFAULT_MAX_CACHE_SIZE_MB,
- DEFAULT_MIN_LOC,
- DEFAULT_MIN_STMT,
- DEFAULT_SEGMENT_MIN_LOC,
- DEFAULT_SEGMENT_MIN_STMT,
-)
-from ._cli_baselines import (
- CloneBaselineState,
- MetricsBaselineState,
- probe_metrics_baseline_section,
- resolve_clone_baseline_state,
- resolve_metrics_baseline_state,
-)
-from ._cli_config import ConfigValidationError, load_pyproject_config
-from ._cli_meta import _build_report_meta, _current_report_timestamp_utc
-from ._cli_runtime import (
- resolve_cache_path,
- resolve_cache_status,
- validate_numeric_args,
-)
-from ._coerce import as_float as _as_float
-from ._coerce import as_int as _as_int
-from ._git_diff import validate_git_diff_ref
-from .baseline import Baseline
-from .cache import Cache, CacheStatus
-from .contracts import (
- DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD,
- DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD,
- DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD,
- DOCS_URL,
- REPORT_SCHEMA_VERSION,
- ExitCode,
-)
-from .domain.findings import (
- CATEGORY_CLONE,
- CATEGORY_COHESION,
- CATEGORY_COMPLEXITY,
- CATEGORY_COUPLING,
- CATEGORY_DEAD_CODE,
- CATEGORY_DEPENDENCY,
- CATEGORY_STRUCTURAL,
- CLONE_KIND_SEGMENT,
- FAMILY_CLONE,
- FAMILY_CLONES,
- FAMILY_DEAD_CODE,
- FAMILY_DESIGN,
- FAMILY_STRUCTURAL,
-)
-from .domain.quality import (
- CONFIDENCE_HIGH,
- CONFIDENCE_LOW,
- CONFIDENCE_MEDIUM,
- EFFORT_EASY,
- EFFORT_HARD,
- EFFORT_MODERATE,
- SEVERITY_CRITICAL,
- SEVERITY_INFO,
- SEVERITY_WARNING,
-)
-from .domain.source_scope import (
- SOURCE_KIND_FIXTURES,
- SOURCE_KIND_MIXED,
- SOURCE_KIND_ORDER,
- SOURCE_KIND_OTHER,
- SOURCE_KIND_PRODUCTION,
- SOURCE_KIND_TESTS,
-)
-from .models import CoverageJoinResult, MetricsDiff, ProjectMetrics, Suggestion
-from .pipeline import (
- GatingResult,
- MetricGateConfig,
- OutputPaths,
- analyze,
- bootstrap,
- discover,
- metric_gate_reasons,
- process,
- report,
-)
-from .report.json_contract import (
- clone_group_id,
- dead_code_group_id,
- design_group_id,
- structural_group_id,
-)
-
-AnalysisMode = Literal["full", "clones_only"]
-CachePolicy = Literal["reuse", "refresh", "off"]
-FreshnessKind = Literal["fresh", "mixed", "reused"]
-HotlistKind = Literal[
- "most_actionable",
- "highest_spread",
- "highest_priority",
- "production_hotspots",
- "test_fixture_hotspots",
-]
-FindingFamilyFilter = Literal["all", "clone", "structural", "dead_code", "design"]
-FindingNoveltyFilter = Literal["all", "new", "known"]
-FindingSort = Literal["default", "priority", "severity", "spread"]
-DetailLevel = Literal["summary", "normal", "full"]
-ComparisonFocus = Literal["all", "clones", "structural", "metrics"]
-PRSummaryFormat = Literal["markdown", "json"]
-HelpTopic = Literal[
- "workflow",
- "analysis_profile",
- "suppressions",
- "baseline",
- "coverage",
- "latest_runs",
- "review_state",
- "changed_scope",
-]
-HelpDetail = Literal["compact", "normal"]
-MetricsDetailFamily = Literal[
- "complexity",
- "coupling",
- "cohesion",
- "coverage_adoption",
- "coverage_join",
- "dependencies",
- "dead_code",
- "api_surface",
- "god_modules",
- "overloaded_modules",
- "health",
-]
-ReportSection = Literal[
- "all",
- "meta",
- "inventory",
- "findings",
- "metrics",
- "metrics_detail",
- "derived",
- "changed",
- "integrity",
-]
-HealthScope = Literal["repository"]
-SummaryFocus = Literal["repository", "production", "changed_paths"]
-
-_LEGACY_CACHE_PATH = Path("~/.cache/codeclone/cache.json").expanduser()
-_REPORT_DUMMY_PATH = Path(".cache/codeclone/report.json")
-_HEALTH_SCOPE_REPOSITORY: Final[HealthScope] = "repository"
-_FOCUS_REPOSITORY: Final[SummaryFocus] = "repository"
-_FOCUS_PRODUCTION: Final[SummaryFocus] = "production"
-_FOCUS_CHANGED_PATHS: Final[SummaryFocus] = "changed_paths"
-_MCP_CONFIG_KEYS = frozenset(
- {
- "min_loc",
- "min_stmt",
- "block_min_loc",
- "block_min_stmt",
- "segment_min_loc",
- "segment_min_stmt",
- "processes",
- "cache_path",
- "max_cache_size_mb",
- "baseline",
- "max_baseline_size_mb",
- "metrics_baseline",
- "api_surface",
- "coverage_xml",
- "coverage_min",
- "golden_fixture_paths",
- }
-)
-_RESOURCE_SECTION_MAP: Final[dict[str, ReportSection]] = {
- "report.json": "all",
- "summary": "meta",
- "health": "metrics",
- "changed": "changed",
- "overview": "derived",
-}
-_SEVERITY_WEIGHT: Final[dict[str, float]] = {
- SEVERITY_CRITICAL: 1.0,
- SEVERITY_WARNING: 0.6,
- SEVERITY_INFO: 0.2,
-}
-_EFFORT_WEIGHT: Final[dict[str, float]] = {
- EFFORT_EASY: 1.0,
- EFFORT_MODERATE: 0.6,
- EFFORT_HARD: 0.3,
-}
-_NOVELTY_WEIGHT: Final[dict[str, float]] = {"new": 1.0, "known": 0.5}
-_RUNTIME_WEIGHT: Final[dict[str, float]] = {
- "production": 1.0,
- "mixed": 0.8,
- "tests": 0.4,
- "fixtures": 0.2,
- "other": 0.5,
-}
-_CONFIDENCE_WEIGHT: Final[dict[str, float]] = {
- CONFIDENCE_HIGH: 1.0,
- CONFIDENCE_MEDIUM: 0.7,
- CONFIDENCE_LOW: 0.3,
-}
-# Canonical report groups use FAMILY_CLONES ("clones"), while individual finding
-# payloads use FAMILY_CLONE ("clone").
-_VALID_ANALYSIS_MODES = frozenset({"full", "clones_only"})
-_VALID_CACHE_POLICIES = frozenset({"reuse", "refresh", "off"})
-_VALID_FINDING_FAMILIES = frozenset(
- {"all", "clone", "structural", "dead_code", "design"}
-)
-_VALID_FINDING_NOVELTY = frozenset({"all", "new", "known"})
-_VALID_FINDING_SORT = frozenset({"default", "priority", "severity", "spread"})
-_VALID_DETAIL_LEVELS = frozenset({"summary", "normal", "full"})
-_VALID_COMPARISON_FOCUS = frozenset({"all", "clones", "structural", "metrics"})
-_VALID_PR_SUMMARY_FORMATS = frozenset({"markdown", "json"})
-_VALID_HELP_TOPICS = frozenset(
- {
- "workflow",
- "analysis_profile",
- "suppressions",
- "baseline",
- "coverage",
- "latest_runs",
- "review_state",
- "changed_scope",
- }
-)
-_VALID_HELP_DETAILS = frozenset({"compact", "normal"})
-DEFAULT_MCP_HISTORY_LIMIT = 4
-MAX_MCP_HISTORY_LIMIT = 10
-_VALID_REPORT_SECTIONS = frozenset(
- {
- "all",
- "meta",
- "inventory",
- "findings",
- "metrics",
- "metrics_detail",
- "derived",
- "changed",
- "integrity",
- }
-)
-_VALID_HOTLIST_KINDS = frozenset(
- {
- "most_actionable",
- "highest_spread",
- "highest_priority",
- "production_hotspots",
- "test_fixture_hotspots",
- }
-)
-_VALID_SEVERITIES = frozenset({SEVERITY_CRITICAL, SEVERITY_WARNING, SEVERITY_INFO})
-_SOURCE_KIND_BREAKDOWN_ORDER: Final[tuple[str, ...]] = (
- SOURCE_KIND_PRODUCTION,
- SOURCE_KIND_TESTS,
- SOURCE_KIND_FIXTURES,
- SOURCE_KIND_MIXED,
- SOURCE_KIND_OTHER,
-)
-_COMPACT_ITEM_PATH_KEYS: Final[frozenset[str]] = frozenset(
- {"relative_path", "path", "filepath", "file"}
-)
-_COMPACT_ITEM_EMPTY_VALUES: Final[tuple[object, ...]] = ("", None, [], {}, ())
-_HOTLIST_REPORT_KEYS: Final[dict[str, str]] = {
- "most_actionable": "most_actionable_ids",
- "highest_spread": "highest_spread_ids",
- "production_hotspots": "production_hotspot_ids",
- "test_fixture_hotspots": "test_fixture_hotspot_ids",
-}
-_CHECK_TO_DIMENSION: Final[dict[str, str]] = {
- "cohesion": "cohesion",
- "coupling": "coupling",
- "dead_code": "dead_code",
- "complexity": "complexity",
- "clones": "clones",
-}
-_DESIGN_CHECK_CONTEXT: Final[dict[str, dict[str, object]]] = {
- "complexity": {
- "category": CATEGORY_COMPLEXITY,
- "metric": "cyclomatic_complexity",
- "operator": ">",
- "default_threshold": DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD,
- },
- "coupling": {
- "category": CATEGORY_COUPLING,
- "metric": "cbo",
- "operator": ">",
- "default_threshold": DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD,
- },
- "cohesion": {
- "category": CATEGORY_COHESION,
- "metric": "lcom4",
- "operator": ">=",
- "default_threshold": DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD,
- },
-}
-_VALID_METRICS_DETAIL_FAMILIES = frozenset(
- {
- "complexity",
- "coupling",
- "cohesion",
- "coverage_adoption",
- "coverage_join",
- "dependencies",
- "dead_code",
- "api_surface",
- "god_modules",
- "overloaded_modules",
- "health",
- }
-)
-_METRICS_DETAIL_FAMILY_ALIASES: Final[dict[str, str]] = {
- "god_modules": "overloaded_modules",
-}
-_SHORT_RUN_ID_LENGTH = 8
-_SHORT_HASH_ID_LENGTH = 6
-
-
-@dataclass(frozen=True)
-class MCPHelpTopicSpec:
- summary: str
- key_points: tuple[str, ...]
- recommended_tools: tuple[str, ...]
- doc_links: tuple[tuple[str, str], ...]
- warnings: tuple[str, ...] = ()
- anti_patterns: tuple[str, ...] = ()
-
-
-_MCP_BOOK_URL: Final = f"{DOCS_URL}book/"
-_MCP_GUIDE_URL: Final = f"{DOCS_URL}mcp/"
-_MCP_INTERFACE_DOC_LINK: Final[tuple[str, str]] = (
- "MCP interface contract",
- f"{_MCP_BOOK_URL}20-mcp-interface/",
-)
-_BASELINE_DOC_LINK: Final[tuple[str, str]] = (
- "Baseline contract",
- f"{_MCP_BOOK_URL}06-baseline/",
-)
-_CONFIG_DOC_LINK: Final[tuple[str, str]] = (
- "Config and defaults",
- f"{_MCP_BOOK_URL}04-config-and-defaults/",
-)
-_REPORT_DOC_LINK: Final[tuple[str, str]] = (
- "Report contract",
- f"{_MCP_BOOK_URL}08-report/",
-)
-_CLI_DOC_LINK: Final[tuple[str, str]] = (
- "CLI contract",
- f"{_MCP_BOOK_URL}09-cli/",
-)
-_PIPELINE_DOC_LINK: Final[tuple[str, str]] = (
- "Core pipeline",
- f"{_MCP_BOOK_URL}05-core-pipeline/",
-)
-_SUPPRESSIONS_DOC_LINK: Final[tuple[str, str]] = (
- "Inline suppressions contract",
- f"{_MCP_BOOK_URL}19-inline-suppressions/",
-)
-_MCP_GUIDE_DOC_LINK: Final[tuple[str, str]] = ("MCP usage guide", _MCP_GUIDE_URL)
-_HELP_TOPIC_SPECS: Final[dict[str, MCPHelpTopicSpec]] = {
- "workflow": MCPHelpTopicSpec(
- summary=(
- "CodeClone MCP is triage-first and budget-aware. Start with a "
- "summary or production triage, then narrow through hotspots or "
- "focused checks before opening one finding in detail."
- ),
- key_points=(
- "Recommended first pass: analyze_repository or analyze_changed_paths.",
- (
- "Start with default or pyproject-resolved thresholds; lower them "
- "only for an explicit higher-sensitivity follow-up pass."
- ),
- (
- "Use get_run_summary or get_production_triage before broad "
- "finding listing."
- ),
- (
- "Prefer list_hotspots or focused check_* tools over "
- "list_findings on noisy repositories."
- ),
- ("Use get_finding and get_remediation only after selecting an issue."),
- (
- "get_report_section(section='all') is an exception path, not "
- "a default first step."
- ),
- ),
- recommended_tools=(
- "analyze_repository",
- "analyze_changed_paths",
- "get_run_summary",
- "get_production_triage",
- "list_hotspots",
- "check_clones",
- "check_dead_code",
- "get_finding",
- "get_remediation",
- ),
- doc_links=(_MCP_INTERFACE_DOC_LINK, _MCP_GUIDE_DOC_LINK),
- warnings=(
- (
- "Broad list_findings calls burn context quickly on large or "
- "noisy repositories."
- ),
- (
- "Prefer generate_pr_summary(format='markdown') unless machine "
- "JSON is explicitly required."
- ),
- ),
- anti_patterns=(
- "Starting exploration with list_findings on a noisy repository.",
- "Using get_report_section(section='all') as the default first step.",
- (
- "Escalating detail on larger lists instead of opening one "
- "finding with get_finding."
- ),
- ),
- ),
- "analysis_profile": MCPHelpTopicSpec(
- summary=(
- "CodeClone default analysis is intentionally conservative: stable "
- "first-pass review, baseline-aware governance, and CI-friendly "
- "signal over maximum local sensitivity."
- ),
- key_points=(
- (
- "Default thresholds are intentionally conservative and "
- "production-friendly."
- ),
- (
- "A clean default run does not rule out smaller local "
- "duplication or repetition."
- ),
- (
- "Lowering thresholds increases sensitivity and can surface "
- "smaller functions, tighter windows, and finer local signals."
- ),
- (
- "Lower-threshold runs are best for exploratory local review, "
- "not as a silent replacement for the default governance profile."
- ),
- "Interpret results in the context of the active threshold profile.",
- ),
- recommended_tools=(
- "analyze_repository",
- "analyze_changed_paths",
- "get_run_summary",
- "compare_runs",
- ),
- doc_links=(
- _CONFIG_DOC_LINK,
- _PIPELINE_DOC_LINK,
- _MCP_INTERFACE_DOC_LINK,
- ),
- warnings=(
- (
- "Do not treat a default-threshold run as proof that no smaller "
- "local clone or repetition exists."
- ),
- (
- "Lower-threshold runs usually increase noise and should be read "
- "as higher-sensitivity exploratory passes."
- ),
- "Run comparisons are most meaningful when profiles are aligned.",
- ),
- anti_patterns=(
- (
- "Assuming a clean default pass means no finer-grained "
- "duplication exists anywhere in the repository."
- ),
- (
- "Lowering thresholds for exploration and then interpreting the "
- "result as if it had the same meaning as the conservative "
- "default pass."
- ),
- (
- "Mixing low-threshold exploratory output into baseline or CI "
- "reasoning without acknowledging the profile change."
- ),
- ),
- ),
- "suppressions": MCPHelpTopicSpec(
- summary=(
- "CodeClone supports explicit inline suppressions for selected "
- "findings. They are local policy, not analysis truth, and should "
- "stay narrow and declaration-scoped."
- ),
- key_points=(
- "Current syntax uses codeclone: ignore[rule-id,...].",
- "Binding is declaration-scoped: def, async def, or class.",
- (
- "Supported placement is the previous line or inline on the "
- "declaration or header line."
- ),
- (
- "Suppressions are target-specific and do not imply file-wide "
- "or cascading scope."
- ),
- (
- "Use suppressions for accepted dynamic or runtime false "
- "positives, not to hide broad classes of debt."
- ),
- ),
- recommended_tools=("get_finding", "get_remediation"),
- doc_links=(_SUPPRESSIONS_DOC_LINK, _MCP_INTERFACE_DOC_LINK),
- warnings=(
- (
- "MCP explains suppression semantics but never creates or "
- "updates suppressions."
- ),
- ),
- anti_patterns=(
- "Treating suppressions as file-wide or inherited state.",
- (
- "Using suppressions to hide broad structural debt instead of "
- "accepted false positives."
- ),
- ),
- ),
- "baseline": MCPHelpTopicSpec(
- summary=(
- "A baseline is CodeClone's accepted comparison snapshot for clones "
- "and optional metrics. It separates known debt from new regressions "
- "and is trust-checked before use."
- ),
- key_points=(
- (
- "Canonical baseline schema is v2.0 with meta and clone keys; "
- "metrics may be embedded for unified flows."
- ),
- (
- "Compatibility depends on generator identity, supported "
- "schema version, fingerprint version, python tag, and payload "
- "integrity."
- ),
- (
- "Known means already present in the trusted baseline; new "
- "means not accepted by baseline."
- ),
- (
- "In CI and gating contexts, untrusted baseline states are "
- "contract errors rather than soft warnings."
- ),
- "MCP is read-only and does not update or rewrite baselines.",
- ),
- recommended_tools=("get_run_summary", "evaluate_gates", "compare_runs"),
- doc_links=(_BASELINE_DOC_LINK,),
- warnings=(
- "Baseline trust semantics directly affect new-vs-known classification.",
- ),
- anti_patterns=(
- "Treating baseline as mutable MCP session state.",
- "Assuming an untrusted baseline is only cosmetic in CI contexts.",
- ),
- ),
- "coverage": MCPHelpTopicSpec(
- summary=(
- "Coverage join is an external current-run signal: CodeClone reads "
- "an existing Cobertura XML report and joins line hits to risky "
- "function spans."
- ),
- key_points=(
- "Use Cobertura XML such as `coverage xml` output from coverage.py.",
- "Coverage join does not become baseline truth and does not affect health.",
- (
- "Coverage hotspot gating is current-run only and focuses on "
- "medium/high-risk functions measured below the configured "
- "threshold."
- ),
- (
- "Functions missing from the supplied coverage.xml are surfaced "
- "as scope gaps, not labeled as untested."
- ),
- "Use metrics_detail(family='coverage_join') for bounded drill-down.",
- ),
- recommended_tools=(
- "analyze_repository",
- "analyze_changed_paths",
- "get_run_summary",
- "get_report_section",
- "evaluate_gates",
- ),
- doc_links=(
- _MCP_INTERFACE_DOC_LINK,
- _CLI_DOC_LINK,
- _REPORT_DOC_LINK,
- ),
- warnings=(
- "Coverage join is only as accurate as the external XML path mapping.",
- "It does not infer branch coverage and does not execute tests.",
- "Use fail-on-untested-hotspots only with a valid joined coverage input.",
- ),
- anti_patterns=(
- "Treating missing coverage XML as zero coverage without stating it.",
- "Reading coverage join as a baseline-aware trend signal.",
- "Assuming dynamic runtime dispatch is visible through a static line join.",
- ),
- ),
- "latest_runs": MCPHelpTopicSpec(
- summary=(
- "latest/* resources point to the most recent analysis run in the "
- "current MCP session. They are convenience handles, not persistent "
- "truth anchors."
- ),
- key_points=(
- "Run history is in-memory only and bounded by history-limit.",
- "The latest pointer moves when a newer analyze_* call registers a run.",
- "A fresh repository state requires a fresh analyze run.",
- (
- "Short run ids are convenience handles derived from canonical "
- "run identity."
- ),
- (
- "Do not assume latest/* is globally current outside the "
- "active MCP session."
- ),
- ),
- recommended_tools=(
- "analyze_repository",
- "analyze_changed_paths",
- "get_run_summary",
- "compare_runs",
- ),
- doc_links=(_MCP_INTERFACE_DOC_LINK, _MCP_GUIDE_DOC_LINK),
- warnings=(
- (
- "latest/* can point at a different repository after a later "
- "analyze call in the same session."
- ),
- ),
- anti_patterns=(
- (
- "Assuming latest/* remains tied to one repository across the "
- "whole client session."
- ),
- (
- "Using latest/* as a substitute for starting a fresh run when "
- "freshness matters."
- ),
- ),
- ),
- "review_state": MCPHelpTopicSpec(
- summary=(
- "Reviewed state in MCP is session-local workflow state. It helps "
- "long sessions track review progress without modifying canonical "
- "findings, baseline, or persisted artifacts."
- ),
- key_points=(
- "Review markers are in-memory only.",
- "They do not change report truth, finding identity, or CI semantics.",
- "They are useful for triage workflows across long sessions.",
- (
- "They should not be interpreted as acceptance, suppression, "
- "or baseline update."
- ),
- ),
- recommended_tools=(
- "list_hotspots",
- "get_finding",
- "mark_finding_reviewed",
- "list_reviewed_findings",
- ),
- doc_links=(_MCP_INTERFACE_DOC_LINK, _MCP_GUIDE_DOC_LINK),
- warnings=(
- "Reviewed markers disappear when the MCP session is cleared or restarted.",
- ),
- anti_patterns=(
- "Treating reviewed state as a persistent acceptance signal.",
- "Assuming reviewed findings are removed from canonical report truth.",
- ),
- ),
- "changed_scope": MCPHelpTopicSpec(
- summary=(
- "Changed-scope analysis narrows review to findings that touch a "
- "selected change set. It is for PR and patch review, not a "
- "replacement for full canonical analysis."
- ),
- key_points=(
- (
- "Use analyze_changed_paths with explicit changed_paths or "
- "git_diff_ref for review-focused runs."
- ),
- (
- "Start with the same conservative profile as the default "
- "review, then lower thresholds only when you explicitly want "
- "a higher-sensitivity changed-files pass."
- ),
- (
- "Changed-scope is best for asking what new issues touch "
- "modified files and whether anything should block CI."
- ),
- "Prefer production triage and hotspot views before broad listing.",
- "If repository-wide truth is needed, run full analysis first.",
- ),
- recommended_tools=(
- "analyze_changed_paths",
- "get_run_summary",
- "get_production_triage",
- "evaluate_gates",
- "generate_pr_summary",
- ),
- doc_links=(_MCP_INTERFACE_DOC_LINK, _MCP_GUIDE_DOC_LINK),
- warnings=(
- (
- "Changed-scope narrows review focus; it does not replace the "
- "full canonical report for repository-wide truth."
- ),
- ),
- anti_patterns=(
- "Using changed-scope as if it were the only source of repository truth.",
- (
- "Starting changed-files review with broad listing instead of "
- "compact triage."
- ),
- ),
- ),
-}
-
-
-def _suggestion_finding_id_payload(suggestion: object) -> str:
- if not hasattr(suggestion, "finding_family"):
- return ""
- family = str(getattr(suggestion, "finding_family", "")).strip()
- if family == FAMILY_CLONES:
- kind = str(getattr(suggestion, "finding_kind", "")).strip()
- subject_key = str(getattr(suggestion, "subject_key", "")).strip()
- return clone_group_id(kind or CLONE_KIND_SEGMENT, subject_key)
- if family == FAMILY_STRUCTURAL:
- return structural_group_id(
- str(getattr(suggestion, "finding_kind", "")).strip() or CATEGORY_STRUCTURAL,
- str(getattr(suggestion, "subject_key", "")).strip(),
- )
- category = str(getattr(suggestion, "category", "")).strip()
- subject_key = str(getattr(suggestion, "subject_key", "")).strip()
- if category == CATEGORY_DEAD_CODE:
- return dead_code_group_id(subject_key)
- return design_group_id(
- category,
- subject_key or str(getattr(suggestion, "title", "")),
- )
-
-
-@dataclass(frozen=True, slots=True)
-class _CloneShortIdEntry:
- canonical_id: str
- alias: str
- token: str
- suffix: str
-
- def render(self, prefix_length: int) -> str:
- if prefix_length <= 0:
- prefix_length = len(self.token)
- return f"{self.alias}:{self.token[:prefix_length]}{self.suffix}"
-
-
-def _partitioned_short_id(alias: str, remainder: str) -> str:
- first, _, rest = remainder.partition(":")
- return f"{alias}:{first}:{rest}" if rest else f"{alias}:{first}"
-
-
-def _clone_short_id_entry_payload(canonical_id: str) -> _CloneShortIdEntry:
- _prefix, _, remainder = canonical_id.partition(":")
- clone_kind, _, group_key = remainder.partition(":")
- hashes = [part for part in group_key.split("|") if part]
- if clone_kind == "function":
- fingerprint = hashes[0] if hashes else group_key
- bucket = ""
- if "|" in group_key:
- bucket = "|" + group_key.split("|")[-1]
- return _CloneShortIdEntry(
- canonical_id=canonical_id,
- alias="fn",
- token=fingerprint,
- suffix=bucket,
- )
- alias = {"block": "blk", "segment": "seg"}.get(clone_kind, "clone")
- combined = "|".join(hashes) if hashes else group_key
- token = hashlib.sha256(combined.encode()).hexdigest()
- return _CloneShortIdEntry(
- canonical_id=canonical_id,
- alias=alias,
- token=token,
- suffix=f"|x{len(hashes) or 1}",
- )
-
-
-def _disambiguated_clone_short_ids_payload(
- canonical_ids: Sequence[str],
-) -> dict[str, str]:
- clone_entries = [
- _clone_short_id_entry_payload(canonical_id) for canonical_id in canonical_ids
- ]
- max_token_length = max((len(entry.token) for entry in clone_entries), default=0)
- for prefix_length in range(_SHORT_HASH_ID_LENGTH + 2, max_token_length + 1, 2):
- candidates = {
- entry.canonical_id: entry.render(prefix_length) for entry in clone_entries
- }
- if len(set(candidates.values())) == len(candidates):
- return candidates
- return {
- entry.canonical_id: entry.render(max_token_length) for entry in clone_entries
- }
-
-
-def _leaf_symbol_name_payload(value: object) -> str:
- text = str(value).strip()
- if not text:
- return ""
- if ":" in text:
- text = text.rsplit(":", maxsplit=1)[-1]
- if "." in text:
- text = text.rsplit(".", maxsplit=1)[-1]
- return text
-
-
-def _base_short_finding_id_payload(canonical_id: str) -> str:
- prefix, _, remainder = canonical_id.partition(":")
- if prefix == "clone":
- return _clone_short_id_entry_payload(canonical_id).render(_SHORT_HASH_ID_LENGTH)
- if prefix == "structural":
- finding_kind, _, finding_key = remainder.partition(":")
- return f"struct:{finding_kind}:{finding_key[:_SHORT_HASH_ID_LENGTH]}"
- if prefix == "dead_code":
- return f"dead:{_leaf_symbol_name_payload(remainder)}"
- if prefix == "design":
- category, _, subject_key = remainder.partition(":")
- return f"design:{category}:{_leaf_symbol_name_payload(subject_key)}"
- return canonical_id
-
-
-def _disambiguated_short_finding_id_payload(canonical_id: str) -> str:
- prefix, _, remainder = canonical_id.partition(":")
- if prefix == "clone":
- return _clone_short_id_entry_payload(canonical_id).render(0)
- if prefix == "structural":
- return _partitioned_short_id("struct", remainder)
- if prefix == "dead_code":
- return f"dead:{remainder}"
- if prefix == "design":
- return _partitioned_short_id("design", remainder)
- return canonical_id
-
-
-def _json_text_payload(
- payload: object,
- *,
- sort_keys: bool = True,
-) -> str:
- options = orjson.OPT_INDENT_2
- if sort_keys:
- options |= orjson.OPT_SORT_KEYS
- return orjson.dumps(payload, option=options).decode("utf-8")
-
-
-def _git_diff_lines_payload(
- *,
- root_path: Path,
- git_diff_ref: str,
-) -> tuple[str, ...]:
- try:
- validated_ref = validate_git_diff_ref(git_diff_ref)
- except ValueError as exc:
- raise MCPGitDiffError(str(exc)) from exc
- try:
- completed = subprocess.run(
- ["git", "diff", "--name-only", validated_ref, "--"],
- cwd=root_path,
- check=True,
- capture_output=True,
- text=True,
- timeout=30,
- )
- except (OSError, subprocess.CalledProcessError, subprocess.TimeoutExpired) as exc:
- raise MCPGitDiffError(
- f"Unable to resolve changed paths from git diff ref '{validated_ref}'."
- ) from exc
- return tuple(
- sorted({line.strip() for line in completed.stdout.splitlines() if line.strip()})
- )
-
-
-def _load_report_document_payload(report_json: str) -> dict[str, object]:
- try:
- payload = orjson.loads(report_json)
- except JSONDecodeError as exc:
- raise MCPServiceError(
- f"Generated canonical report is not valid JSON: {exc}"
- ) from exc
- if not isinstance(payload, dict):
- raise MCPServiceError("Generated canonical report must be a JSON object.")
- return dict(payload)
-
-
-def _validated_history_limit(history_limit: int) -> int:
- if not 1 <= history_limit <= MAX_MCP_HISTORY_LIMIT:
- raise ValueError(
- f"history_limit must be between 1 and {MAX_MCP_HISTORY_LIMIT}."
- )
- return history_limit
-
-
-class MCPServiceError(RuntimeError):
- """Base class for CodeClone MCP service errors."""
-
-
-class MCPServiceContractError(MCPServiceError):
- """Raised when an MCP request violates the CodeClone service contract."""
-
-
-class MCPRunNotFoundError(MCPServiceError):
- """Raised when a requested MCP run is not available in the in-memory registry."""
-
-
-class MCPFindingNotFoundError(MCPServiceError):
- """Raised when a requested finding id is not present in the selected run."""
-
-
-class MCPGitDiffError(MCPServiceError):
- """Raised when changed paths cannot be resolved from a git ref."""
-
-
-class _BufferConsole:
- def __init__(self) -> None:
- self.messages: list[str] = []
-
- def print(self, *objects: object, **_kwargs: object) -> None:
- text = " ".join(str(obj) for obj in objects).strip()
- if text:
- self.messages.append(text)
-
-
-@dataclass(frozen=True, slots=True)
-class MCPAnalysisRequest:
- root: str | None = None
- analysis_mode: AnalysisMode = "full"
- respect_pyproject: bool = True
- changed_paths: tuple[str, ...] = ()
- git_diff_ref: str | None = None
- processes: int | None = None
- min_loc: int | None = None
- min_stmt: int | None = None
- block_min_loc: int | None = None
- block_min_stmt: int | None = None
- segment_min_loc: int | None = None
- segment_min_stmt: int | None = None
- api_surface: bool | None = None
- coverage_xml: str | None = None
- coverage_min: int | None = None
- complexity_threshold: int | None = None
- coupling_threshold: int | None = None
- cohesion_threshold: int | None = None
- baseline_path: str | None = None
- metrics_baseline_path: str | None = None
- max_baseline_size_mb: int | None = None
- cache_policy: CachePolicy = "reuse"
- cache_path: str | None = None
- max_cache_size_mb: int | None = None
-
-
-@dataclass(frozen=True, slots=True)
-class MCPGateRequest:
- run_id: str | None = None
- fail_on_new: bool = False
- fail_threshold: int = -1
- fail_complexity: int = -1
- fail_coupling: int = -1
- fail_cohesion: int = -1
- fail_cycles: bool = False
- fail_dead_code: bool = False
- fail_health: int = -1
- fail_on_new_metrics: bool = False
- fail_on_typing_regression: bool = False
- fail_on_docstring_regression: bool = False
- fail_on_api_break: bool = False
- fail_on_untested_hotspots: bool = False
- min_typing_coverage: int = -1
- min_docstring_coverage: int = -1
- coverage_min: int = 50
-
-
-@dataclass(frozen=True, slots=True)
-class MCPRunRecord:
- run_id: str
- root: Path
- request: MCPAnalysisRequest
- comparison_settings: tuple[object, ...]
- report_document: dict[str, object]
- summary: dict[str, object]
- changed_paths: tuple[str, ...]
- changed_projection: dict[str, object] | None
- warnings: tuple[str, ...]
- failures: tuple[str, ...]
- func_clones_count: int
- block_clones_count: int
- project_metrics: ProjectMetrics | None
- coverage_join: CoverageJoinResult | None
- suggestions: tuple[Suggestion, ...]
- new_func: frozenset[str]
- new_block: frozenset[str]
- metrics_diff: MetricsDiff | None
-
-
-class CodeCloneMCPRunStore:
- def __init__(self, *, history_limit: int = DEFAULT_MCP_HISTORY_LIMIT) -> None:
- self._history_limit = _validated_history_limit(history_limit)
- self._lock = RLock()
- self._records: OrderedDict[str, MCPRunRecord] = OrderedDict()
- self._latest_run_id: str | None = None
-
- def register(self, record: MCPRunRecord) -> MCPRunRecord:
- with self._lock:
- self._records.pop(record.run_id, None)
- self._records[record.run_id] = record
- self._records.move_to_end(record.run_id)
- self._latest_run_id = record.run_id
- while len(self._records) > self._history_limit:
- self._records.popitem(last=False)
- return record
-
- def get(self, run_id: str | None = None) -> MCPRunRecord:
- with self._lock:
- resolved_run_id = self._resolve_run_id(run_id)
- if resolved_run_id is None:
- raise MCPRunNotFoundError("No matching MCP analysis run is available.")
- return self._records[resolved_run_id]
-
- def _resolve_run_id(self, run_id: str | None) -> str | None:
- if run_id is None:
- return self._latest_run_id
- if run_id in self._records:
- return run_id
- matches = [
- candidate for candidate in self._records if candidate.startswith(run_id)
- ]
- if len(matches) == 1:
- return matches[0]
- if len(matches) > 1:
- raise MCPServiceContractError(
- f"Run id '{run_id}' is ambiguous in this MCP session."
- )
- return None
-
- def records(self) -> tuple[MCPRunRecord, ...]:
- with self._lock:
- return tuple(self._records.values())
-
- def clear(self) -> tuple[str, ...]:
- with self._lock:
- removed_run_ids = tuple(self._records.keys())
- self._records.clear()
- self._latest_run_id = None
- return removed_run_ids
-
-
-class CodeCloneMCPService:
- def __init__(self, *, history_limit: int = DEFAULT_MCP_HISTORY_LIMIT) -> None:
- self._runs = CodeCloneMCPRunStore(history_limit=history_limit)
- self._state_lock = RLock()
- self._review_state: dict[str, OrderedDict[str, str | None]] = {}
- self._last_gate_results: dict[str, dict[str, object]] = {}
- self._spread_max_cache: dict[str, int] = {}
-
- def analyze_repository(self, request: MCPAnalysisRequest) -> dict[str, object]:
- self._validate_analysis_request(request)
- root_path = self._resolve_root(request.root)
- analysis_started_at_utc = _current_report_timestamp_utc()
- changed_paths = self._resolve_request_changed_paths(
- root_path=root_path,
- changed_paths=request.changed_paths,
- git_diff_ref=request.git_diff_ref,
- )
- args = self._build_args(root_path=root_path, request=request)
- (
- baseline_path,
- baseline_exists,
- metrics_baseline_path,
- metrics_baseline_exists,
- shared_baseline_payload,
- ) = self._resolve_baseline_inputs(root_path=root_path, args=args)
- cache_path = self._resolve_cache_path(root_path=root_path, args=args)
- cache = self._build_cache(
- root_path=root_path,
- args=args,
- cache_path=cache_path,
- policy=request.cache_policy,
- )
- console = _BufferConsole()
-
- boot = bootstrap(
- args=args,
- root=root_path,
- output_paths=OutputPaths(json=_REPORT_DUMMY_PATH),
- cache_path=cache_path,
- )
- discovery_result = discover(boot=boot, cache=cache)
- processing_result = process(boot=boot, discovery=discovery_result, cache=cache)
- analysis_result = analyze(
- boot=boot,
- discovery=discovery_result,
- processing=processing_result,
- )
-
- clone_baseline_state = resolve_clone_baseline_state(
- args=args,
- baseline_path=baseline_path,
- baseline_exists=baseline_exists,
- func_groups=analysis_result.func_groups,
- block_groups=analysis_result.block_groups,
- codeclone_version=__version__,
- console=console,
- shared_baseline_payload=(
- shared_baseline_payload
- if metrics_baseline_path == baseline_path
- else None
- ),
- )
- metrics_baseline_state = resolve_metrics_baseline_state(
- args=args,
- metrics_baseline_path=metrics_baseline_path,
- metrics_baseline_exists=metrics_baseline_exists,
- baseline_updated_path=clone_baseline_state.updated_path,
- project_metrics=analysis_result.project_metrics,
- console=console,
- shared_baseline_payload=(
- shared_baseline_payload
- if metrics_baseline_path == baseline_path
- else None
- ),
- )
-
- cache_status, cache_schema_version = resolve_cache_status(cache)
- report_meta = _build_report_meta(
- codeclone_version=__version__,
- scan_root=root_path,
- baseline_path=baseline_path,
- baseline=clone_baseline_state.baseline,
- baseline_loaded=clone_baseline_state.loaded,
- baseline_status=clone_baseline_state.status.value,
- cache_path=cache_path,
- cache_used=cache_status == CacheStatus.OK,
- cache_status=cache_status.value,
- cache_schema_version=cache_schema_version,
- files_skipped_source_io=len(processing_result.source_read_failures),
- metrics_baseline_path=metrics_baseline_path,
- metrics_baseline=metrics_baseline_state.baseline,
- metrics_baseline_loaded=metrics_baseline_state.loaded,
- metrics_baseline_status=metrics_baseline_state.status.value,
- health_score=(
- analysis_result.project_metrics.health.total
- if analysis_result.project_metrics is not None
- else None
- ),
- health_grade=(
- analysis_result.project_metrics.health.grade
- if analysis_result.project_metrics is not None
- else None
- ),
- analysis_mode=request.analysis_mode,
- metrics_computed=self._metrics_computed(request.analysis_mode),
- min_loc=_as_int(args.min_loc, DEFAULT_MIN_LOC),
- min_stmt=_as_int(args.min_stmt, DEFAULT_MIN_STMT),
- block_min_loc=_as_int(args.block_min_loc, DEFAULT_BLOCK_MIN_LOC),
- block_min_stmt=_as_int(args.block_min_stmt, DEFAULT_BLOCK_MIN_STMT),
- segment_min_loc=_as_int(args.segment_min_loc, DEFAULT_SEGMENT_MIN_LOC),
- segment_min_stmt=_as_int(args.segment_min_stmt, DEFAULT_SEGMENT_MIN_STMT),
- design_complexity_threshold=_as_int(
- getattr(
- args,
- "design_complexity_threshold",
- DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD,
- ),
- DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD,
- ),
- design_coupling_threshold=_as_int(
- getattr(
- args,
- "design_coupling_threshold",
- DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD,
- ),
- DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD,
- ),
- design_cohesion_threshold=_as_int(
- getattr(
- args,
- "design_cohesion_threshold",
- DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD,
- ),
- DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD,
- ),
- analysis_started_at_utc=analysis_started_at_utc,
- report_generated_at_utc=_current_report_timestamp_utc(),
- )
-
- baseline_for_diff = (
- clone_baseline_state.baseline
- if clone_baseline_state.trusted_for_diff
- else Baseline(baseline_path)
- )
- new_func, new_block = baseline_for_diff.diff(
- analysis_result.func_groups,
- analysis_result.block_groups,
- )
- metrics_diff = None
- if (
- analysis_result.project_metrics is not None
- and metrics_baseline_state.trusted_for_diff
- ):
- metrics_diff = metrics_baseline_state.baseline.diff(
- analysis_result.project_metrics
- )
-
- report_artifacts = report(
- boot=boot,
- discovery=discovery_result,
- processing=processing_result,
- analysis=analysis_result,
- report_meta=report_meta,
- new_func=new_func,
- new_block=new_block,
- metrics_diff=metrics_diff,
- )
- report_json = report_artifacts.json
- if report_json is None:
- raise MCPServiceError("CodeClone MCP expected a canonical JSON report.")
- report_document = self._load_report_document(report_json)
- run_id = self._report_digest(report_document)
-
- warning_items = set(console.messages)
- if cache.load_warning:
- warning_items.add(cache.load_warning)
- warning_items.update(discovery_result.skipped_warnings)
- warnings = tuple(sorted(warning_items))
- failures = tuple(
- sorted(
- {
- *processing_result.failed_files,
- *processing_result.source_read_failures,
- }
- )
- )
-
- base_summary = self._build_run_summary_payload(
- run_id=run_id,
- root_path=root_path,
- request=request,
- report_document=report_document,
- baseline_state=clone_baseline_state,
- metrics_baseline_state=metrics_baseline_state,
- cache_status=cache_status,
- new_func=new_func,
- new_block=new_block,
- metrics_diff=metrics_diff,
- warnings=warnings,
- failures=failures,
- )
- provisional_record = MCPRunRecord(
- run_id=run_id,
- root=root_path,
- request=request,
- comparison_settings=self._comparison_settings(args=args, request=request),
- report_document=report_document,
- summary=base_summary,
- changed_paths=changed_paths,
- changed_projection=None,
- warnings=warnings,
- failures=failures,
- func_clones_count=analysis_result.func_clones_count,
- block_clones_count=analysis_result.block_clones_count,
- project_metrics=analysis_result.project_metrics,
- coverage_join=analysis_result.coverage_join,
- suggestions=analysis_result.suggestions,
- new_func=frozenset(new_func),
- new_block=frozenset(new_block),
- metrics_diff=metrics_diff,
- )
- changed_projection = self._build_changed_projection(provisional_record)
- summary = self._augment_summary_with_changed(
- summary=base_summary,
- changed_paths=changed_paths,
- changed_projection=changed_projection,
- )
- record = MCPRunRecord(
- run_id=run_id,
- root=root_path,
- request=request,
- comparison_settings=self._comparison_settings(args=args, request=request),
- report_document=report_document,
- summary=summary,
- changed_paths=changed_paths,
- changed_projection=changed_projection,
- warnings=warnings,
- failures=failures,
- func_clones_count=analysis_result.func_clones_count,
- block_clones_count=analysis_result.block_clones_count,
- project_metrics=analysis_result.project_metrics,
- coverage_join=analysis_result.coverage_join,
- suggestions=analysis_result.suggestions,
- new_func=frozenset(new_func),
- new_block=frozenset(new_block),
- metrics_diff=metrics_diff,
- )
- self._runs.register(record)
- self._prune_session_state()
- return self._summary_payload(record.summary, record=record)
-
- def analyze_changed_paths(self, request: MCPAnalysisRequest) -> dict[str, object]:
- if not request.changed_paths and request.git_diff_ref is None:
- raise MCPServiceContractError(
- "analyze_changed_paths requires changed_paths or git_diff_ref."
- )
- analysis_summary = self.analyze_repository(request)
- record = self._runs.get(str(analysis_summary.get("run_id", "")) or None)
- return self._changed_analysis_payload(record)
-
- def get_run_summary(self, run_id: str | None = None) -> dict[str, object]:
- record = self._runs.get(run_id)
- return self._summary_payload(record.summary, record=record)
-
- def compare_runs(
- self,
- *,
- run_id_before: str,
- run_id_after: str | None = None,
- focus: ComparisonFocus = "all",
- ) -> dict[str, object]:
- validated_focus = cast(
- "ComparisonFocus",
- self._validate_choice("focus", focus, _VALID_COMPARISON_FOCUS),
- )
- before = self._runs.get(run_id_before)
- after = self._runs.get(run_id_after)
- before_findings = self._comparison_index(before, focus=validated_focus)
- after_findings = self._comparison_index(after, focus=validated_focus)
- before_ids = set(before_findings)
- after_ids = set(after_findings)
- regressions = sorted(after_ids - before_ids)
- improvements = sorted(before_ids - after_ids)
- common = before_ids & after_ids
- health_before = self._summary_health_score(before.summary)
- health_after = self._summary_health_score(after.summary)
- comparability = self._comparison_scope(before=before, after=after)
- comparable = bool(comparability["comparable"])
- health_delta = (
- health_after - health_before
- if comparable and health_before is not None and health_after is not None
- else None
- )
- verdict = (
- self._comparison_verdict(
- regressions=len(regressions),
- improvements=len(improvements),
- health_delta=health_delta,
- )
- if comparable
- else "incomparable"
- )
- regressions_payload = (
- [
- self._comparison_finding_card(
- after,
- after_findings[finding_id],
- )
- for finding_id in regressions
- ]
- if comparable
- else []
- )
- improvements_payload = (
- [
- self._comparison_finding_card(
- before,
- before_findings[finding_id],
- )
- for finding_id in improvements
- ]
- if comparable
- else []
- )
- payload: dict[str, object] = {
- "before": {
- "run_id": self._short_run_id(before.run_id),
- "health": health_before,
- },
- "after": {
- "run_id": self._short_run_id(after.run_id),
- "health": health_after,
- },
- "comparable": comparable,
- "health_delta": health_delta,
- "verdict": verdict,
- "regressions": regressions_payload,
- "improvements": improvements_payload,
- "unchanged": len(common) if comparable else None,
- "summary": self._comparison_summary_text(
- comparable=comparable,
- comparability_reason=str(comparability["reason"]),
- regressions=len(regressions),
- improvements=len(improvements),
- health_delta=health_delta,
- ),
- }
- if not comparable:
- payload["reason"] = comparability["reason"]
- return payload
-
- def evaluate_gates(self, request: MCPGateRequest) -> dict[str, object]:
- record = self._runs.get(request.run_id)
- gate_result = self._evaluate_gate_snapshot(record=record, request=request)
- result = {
- "run_id": self._short_run_id(record.run_id),
- "would_fail": gate_result.exit_code != 0,
- "exit_code": gate_result.exit_code,
- "reasons": list(gate_result.reasons),
- "config": {
- "fail_on_new": request.fail_on_new,
- "fail_threshold": request.fail_threshold,
- "fail_complexity": request.fail_complexity,
- "fail_coupling": request.fail_coupling,
- "fail_cohesion": request.fail_cohesion,
- "fail_cycles": request.fail_cycles,
- "fail_dead_code": request.fail_dead_code,
- "fail_health": request.fail_health,
- "fail_on_new_metrics": request.fail_on_new_metrics,
- "fail_on_typing_regression": request.fail_on_typing_regression,
- "fail_on_docstring_regression": request.fail_on_docstring_regression,
- "fail_on_api_break": request.fail_on_api_break,
- "fail_on_untested_hotspots": request.fail_on_untested_hotspots,
- "min_typing_coverage": request.min_typing_coverage,
- "min_docstring_coverage": request.min_docstring_coverage,
- "coverage_min": request.coverage_min,
- },
- }
- with self._state_lock:
- self._last_gate_results[record.run_id] = dict(result)
- return result
-
- def _evaluate_gate_snapshot(
- self,
- *,
- record: MCPRunRecord,
- request: MCPGateRequest,
- ) -> GatingResult:
- reasons: list[str] = []
- if request.fail_on_untested_hotspots:
- if record.coverage_join is None:
- raise MCPServiceContractError(
- "Coverage gating requires a run created with coverage_xml."
- )
- if record.coverage_join.status != "ok":
- detail = record.coverage_join.invalid_reason or "invalid coverage input"
- raise MCPServiceContractError(
- "Coverage gating requires a valid Cobertura XML input. "
- f"Reason: {detail}"
- )
- if record.project_metrics is not None:
- metric_reasons = metric_gate_reasons(
- project_metrics=record.project_metrics,
- coverage_join=record.coverage_join,
- metrics_diff=record.metrics_diff,
- config=MetricGateConfig(
- fail_complexity=request.fail_complexity,
- fail_coupling=request.fail_coupling,
- fail_cohesion=request.fail_cohesion,
- fail_cycles=request.fail_cycles,
- fail_dead_code=request.fail_dead_code,
- fail_health=request.fail_health,
- fail_on_new_metrics=request.fail_on_new_metrics,
- fail_on_typing_regression=request.fail_on_typing_regression,
- fail_on_docstring_regression=request.fail_on_docstring_regression,
- fail_on_api_break=request.fail_on_api_break,
- fail_on_untested_hotspots=request.fail_on_untested_hotspots,
- min_typing_coverage=request.min_typing_coverage,
- min_docstring_coverage=request.min_docstring_coverage,
- coverage_min=request.coverage_min,
- ),
- )
- reasons.extend(f"metric:{reason}" for reason in metric_reasons)
-
- if request.fail_on_new and (record.new_func or record.new_block):
- reasons.append("clone:new")
-
- total_clone_groups = record.func_clones_count + record.block_clones_count
- if 0 <= request.fail_threshold < total_clone_groups:
- reasons.append(
- f"clone:threshold:{total_clone_groups}:{request.fail_threshold}"
- )
-
- if reasons:
- return GatingResult(
- exit_code=int(ExitCode.GATING_FAILURE),
- reasons=tuple(reasons),
- )
- return GatingResult(exit_code=int(ExitCode.SUCCESS), reasons=())
-
- def get_report_section(
- self,
- *,
- run_id: str | None = None,
- section: ReportSection = "all",
- family: MetricsDetailFamily | None = None,
- path: str | None = None,
- offset: int = 0,
- limit: int = 50,
- ) -> dict[str, object]:
- validated_section = cast(
- "ReportSection",
- self._validate_choice("section", section, _VALID_REPORT_SECTIONS),
- )
- record = self._runs.get(run_id)
- report_document = record.report_document
- if validated_section == "all":
- return dict(report_document)
- if validated_section == "changed":
- if record.changed_projection is None:
- raise MCPServiceContractError(
- "Report section 'changed' is not available in this run."
- )
- return dict(record.changed_projection)
- if validated_section == "metrics":
- metrics = self._as_mapping(report_document.get("metrics"))
- return {"summary": dict(self._as_mapping(metrics.get("summary")))}
- if validated_section == "metrics_detail":
- metrics = self._as_mapping(report_document.get("metrics"))
- if not metrics:
- raise MCPServiceContractError(
- "Report section 'metrics_detail' is not available in this run."
- )
- validated_family_input = self._validate_optional_choice(
- "family",
- family,
- _VALID_METRICS_DETAIL_FAMILIES,
- )
- normalized_family = (
- _METRICS_DETAIL_FAMILY_ALIASES.get(
- str(validated_family_input),
- str(validated_family_input),
- )
- if validated_family_input is not None
- else None
- )
- validated_family = cast("MetricsDetailFamily | None", normalized_family)
- return self._metrics_detail_payload(
- metrics=metrics,
- family=validated_family,
- path=path,
- offset=offset,
- limit=limit,
- )
- if validated_section == "derived":
- return self._derived_section_payload(record)
- payload = report_document.get(validated_section)
- if not isinstance(payload, Mapping):
- raise MCPServiceContractError(
- f"Report section '{validated_section}' is not available in this run."
- )
- return dict(payload)
-
- def list_findings(
- self,
- *,
- run_id: str | None = None,
- family: FindingFamilyFilter = "all",
- category: str | None = None,
- severity: str | None = None,
- source_kind: str | None = None,
- novelty: FindingNoveltyFilter = "all",
- sort_by: FindingSort = "default",
- detail_level: DetailLevel = "summary",
- changed_paths: Sequence[str] = (),
- git_diff_ref: str | None = None,
- exclude_reviewed: bool = False,
- offset: int = 0,
- limit: int = 50,
- max_results: int | None = None,
- ) -> dict[str, object]:
- validated_family = cast(
- "FindingFamilyFilter",
- self._validate_choice("family", family, _VALID_FINDING_FAMILIES),
- )
- validated_novelty = cast(
- "FindingNoveltyFilter",
- self._validate_choice("novelty", novelty, _VALID_FINDING_NOVELTY),
- )
- validated_sort = cast(
- "FindingSort",
- self._validate_choice("sort_by", sort_by, _VALID_FINDING_SORT),
- )
- validated_detail = cast(
- "DetailLevel",
- self._validate_choice("detail_level", detail_level, _VALID_DETAIL_LEVELS),
- )
- validated_severity = self._validate_optional_choice(
- "severity",
- severity,
- _VALID_SEVERITIES,
- )
- record = self._runs.get(run_id)
- paths_filter = self._resolve_query_changed_paths(
- record=record,
- changed_paths=changed_paths,
- git_diff_ref=git_diff_ref,
- )
- normalized_limit = max(
- 1,
- min(max_results if max_results is not None else limit, 200),
- )
- filtered = self._query_findings(
- record=record,
- family=validated_family,
- category=category,
- severity=validated_severity,
- source_kind=source_kind,
- novelty=validated_novelty,
- sort_by=validated_sort,
- detail_level=validated_detail,
- changed_paths=paths_filter,
- exclude_reviewed=exclude_reviewed,
- )
- total = len(filtered)
- normalized_offset = max(0, offset)
- items = filtered[normalized_offset : normalized_offset + normalized_limit]
- next_offset = normalized_offset + len(items)
- return {
- "run_id": self._short_run_id(record.run_id),
- "detail_level": validated_detail,
- "sort_by": validated_sort,
- "changed_paths": list(paths_filter),
- "offset": normalized_offset,
- "limit": normalized_limit,
- "returned": len(items),
- "total": total,
- "next_offset": next_offset if next_offset < total else None,
- "items": items,
- }
-
- def get_finding(
- self,
- *,
- finding_id: str,
- run_id: str | None = None,
- detail_level: DetailLevel = "normal",
- ) -> dict[str, object]:
- record = self._runs.get(run_id)
- validated_detail = cast(
- "DetailLevel",
- self._validate_choice("detail_level", detail_level, _VALID_DETAIL_LEVELS),
- )
- canonical_id = self._resolve_canonical_finding_id(record, finding_id)
- for finding in self._base_findings(record):
- if str(finding.get("id")) == canonical_id:
- return self._decorate_finding(
- record,
- finding,
- detail_level=validated_detail,
- )
- raise MCPFindingNotFoundError(
- f"Finding id '{finding_id}' was not found in run "
- f"'{self._short_run_id(record.run_id)}'."
- )
-
- def get_remediation(
- self,
- *,
- finding_id: str,
- run_id: str | None = None,
- detail_level: DetailLevel = "normal",
- ) -> dict[str, object]:
- validated_detail = cast(
- "DetailLevel",
- self._validate_choice("detail_level", detail_level, _VALID_DETAIL_LEVELS),
- )
- record = self._runs.get(run_id)
- canonical_id = self._resolve_canonical_finding_id(record, finding_id)
- finding = self.get_finding(
- finding_id=canonical_id,
- run_id=record.run_id,
- detail_level="full",
- )
- remediation = self._as_mapping(finding.get("remediation"))
- if not remediation:
- raise MCPFindingNotFoundError(
- f"Finding id '{finding_id}' does not expose remediation guidance."
- )
- return {
- "run_id": self._short_run_id(record.run_id),
- "finding_id": self._short_finding_id(record, canonical_id),
- "detail_level": validated_detail,
- "remediation": self._project_remediation(
- remediation,
- detail_level=validated_detail,
- ),
- }
-
- def list_hotspots(
- self,
- *,
- kind: HotlistKind,
- run_id: str | None = None,
- detail_level: DetailLevel = "summary",
- changed_paths: Sequence[str] = (),
- git_diff_ref: str | None = None,
- exclude_reviewed: bool = False,
- limit: int = 10,
- max_results: int | None = None,
- ) -> dict[str, object]:
- validated_kind = cast(
- "HotlistKind",
- self._validate_choice("kind", kind, _VALID_HOTLIST_KINDS),
- )
- validated_detail = cast(
- "DetailLevel",
- self._validate_choice("detail_level", detail_level, _VALID_DETAIL_LEVELS),
- )
- record = self._runs.get(run_id)
- paths_filter = self._resolve_query_changed_paths(
- record=record,
- changed_paths=changed_paths,
- git_diff_ref=git_diff_ref,
- )
- rows = self._hotspot_rows(
- record=record,
- kind=validated_kind,
- detail_level=validated_detail,
- changed_paths=paths_filter,
- exclude_reviewed=exclude_reviewed,
- )
- normalized_limit = max(
- 1,
- min(max_results if max_results is not None else limit, 50),
- )
- return {
- "run_id": self._short_run_id(record.run_id),
- "kind": validated_kind,
- "detail_level": validated_detail,
- "changed_paths": list(paths_filter),
- "returned": min(len(rows), normalized_limit),
- "total": len(rows),
- "items": [dict(self._as_mapping(item)) for item in rows[:normalized_limit]],
- }
-
- def get_production_triage(
- self,
- *,
- run_id: str | None = None,
- max_hotspots: int = 3,
- max_suggestions: int = 3,
- ) -> dict[str, object]:
- record = self._runs.get(run_id)
- summary = self._summary_payload(record.summary, record=record)
- findings = self._base_findings(record)
- findings_breakdown = self._source_kind_breakdown(
- self._finding_source_kind(finding) for finding in findings
- )
- suggestion_rows = self._triage_suggestion_rows(record)
- suggestion_breakdown = self._source_kind_breakdown(
- row.get("source_kind") for row in suggestion_rows
- )
- hotspot_limit = max(1, min(max_hotspots, 10))
- suggestion_limit = max(1, min(max_suggestions, 10))
- production_hotspots = self._hotspot_rows(
- record=record,
- kind="production_hotspots",
- detail_level="summary",
- changed_paths=(),
- exclude_reviewed=False,
- )
- production_suggestions = [
- dict(row)
- for row in suggestion_rows
- if str(row.get("source_kind", "")) == SOURCE_KIND_PRODUCTION
- ]
- payload: dict[str, object] = {
- "run_id": self._short_run_id(record.run_id),
- "focus": _FOCUS_PRODUCTION,
- "health_scope": _HEALTH_SCOPE_REPOSITORY,
- "baseline": dict(self._as_mapping(summary.get("baseline"))),
- "health": dict(self._summary_health_payload(summary)),
- "cache": dict(self._as_mapping(summary.get("cache"))),
- "findings": {
- "total": len(findings),
- "by_source_kind": findings_breakdown,
- "new_by_source_kind": dict(
- self._as_mapping(
- self._as_mapping(summary.get("findings")).get(
- "new_by_source_kind"
- )
- )
- ),
- "outside_focus": len(findings)
- - findings_breakdown[SOURCE_KIND_PRODUCTION],
- },
- "top_hotspots": {
- "kind": "production_hotspots",
- "available": len(production_hotspots),
- "returned": min(len(production_hotspots), hotspot_limit),
- "items": [
- dict(self._as_mapping(item))
- for item in production_hotspots[:hotspot_limit]
- ],
- },
- "suggestions": {
- "total": len(suggestion_rows),
- "by_source_kind": suggestion_breakdown,
- "outside_focus": len(suggestion_rows)
- - suggestion_breakdown[SOURCE_KIND_PRODUCTION],
- },
- "top_suggestions": {
- "available": len(production_suggestions),
- "returned": min(len(production_suggestions), suggestion_limit),
- "items": production_suggestions[:suggestion_limit],
- },
- }
- analysis_profile = self._summary_analysis_profile_payload(summary)
- if analysis_profile:
- payload["analysis_profile"] = analysis_profile
- coverage_join = self._summary_coverage_join_payload(record)
- if coverage_join:
- payload["coverage_join"] = coverage_join
- return payload
-
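-    # Help topics come from static specs; warnings and anti-patterns are
-    # included only at the "normal" detail level.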
- def get_help(
- self,
- *,
- topic: HelpTopic,
- detail: HelpDetail = "compact",
- ) -> dict[str, object]:
- validated_topic = cast(
- "HelpTopic",
- self._validate_choice("topic", topic, _VALID_HELP_TOPICS),
- )
- validated_detail = cast(
- "HelpDetail",
- self._validate_choice("detail", detail, _VALID_HELP_DETAILS),
- )
- spec = _HELP_TOPIC_SPECS[validated_topic]
- payload: dict[str, object] = {
- "topic": validated_topic,
- "detail": validated_detail,
- "summary": spec.summary,
- "key_points": list(spec.key_points),
- "recommended_tools": list(spec.recommended_tools),
- "doc_links": [
- {"title": title, "url": url} for title, url in spec.doc_links
- ],
- }
- if validated_detail == "normal":
- if spec.warnings:
- payload["warnings"] = list(spec.warnings)
- if spec.anti_patterns:
- payload["anti_patterns"] = list(spec.anti_patterns)
- return payload
-
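-    # The PR summary combines findings touching the changed paths,
-    # improvements against the previous run for the same root, and the
-    # session's last gate evaluation.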
- def generate_pr_summary(
- self,
- *,
- run_id: str | None = None,
- changed_paths: Sequence[str] = (),
- git_diff_ref: str | None = None,
- format: PRSummaryFormat = "markdown",
- ) -> dict[str, object]:
- output_format = cast(
- "PRSummaryFormat",
- self._validate_choice("format", format, _VALID_PR_SUMMARY_FORMATS),
- )
- record = self._runs.get(run_id)
- paths_filter = self._resolve_query_changed_paths(
- record=record,
- changed_paths=changed_paths,
- git_diff_ref=git_diff_ref,
- prefer_record_paths=True,
- )
- changed_items = self._query_findings(
- record=record,
- detail_level="summary",
- changed_paths=paths_filter,
- )
- previous = self._previous_run_for_root(record)
- resolved: list[dict[str, object]] = []
- if previous is not None:
- compare_payload = self.compare_runs(
- run_id_before=previous.run_id,
- run_id_after=record.run_id,
- focus="all",
- )
- resolved = cast("list[dict[str, object]]", compare_payload["improvements"])
- with self._state_lock:
- gate_result = dict(
- self._last_gate_results.get(
- record.run_id,
- {"would_fail": False, "reasons": []},
- )
- )
- verdict = self._changed_verdict(
- changed_projection={
- "total": len(changed_items),
- "new": sum(
- 1 for item in changed_items if str(item.get("novelty", "")) == "new"
- ),
- },
- health_delta=self._summary_health_delta(record.summary),
- )
- payload: dict[str, object] = {
- "run_id": self._short_run_id(record.run_id),
- "changed_files": len(paths_filter),
- "health": self._summary_health_payload(record.summary),
- "health_delta": self._summary_health_delta(record.summary),
- "verdict": verdict,
- "new_findings_in_changed_files": changed_items,
- "resolved": resolved,
- "blocking_gates": list(cast(Sequence[str], gate_result.get("reasons", []))),
- }
- if output_format == "json":
- return payload
- return {
- "run_id": self._short_run_id(record.run_id),
- "format": output_format,
- "content": self._render_pr_summary_markdown(payload),
- }
-
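-    # Review state is per-run and ordered; re-marking a finding moves it to
-    # the end of the review map.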
- def mark_finding_reviewed(
- self,
- *,
- finding_id: str,
- run_id: str | None = None,
- note: str | None = None,
- ) -> dict[str, object]:
-        record = self._runs.get(run_id)
-        canonical_id = self._resolve_canonical_finding_id(record, finding_id)
-        self.get_finding(
-            finding_id=canonical_id,
-            run_id=record.run_id,
-            detail_level="normal",
-        )
-        normalized_note = (
-            note.strip() if isinstance(note, str) and note.strip() else None
-        )
-        # Snapshot the stored note and count inside the lock so the response
-        # cannot race with session pruning or clearing.
-        with self._state_lock:
-            review_map = self._review_state.setdefault(record.run_id, OrderedDict())
-            review_map[canonical_id] = normalized_note
-            review_map.move_to_end(canonical_id)
-            reviewed_count = len(review_map)
-        return {
-            "run_id": self._short_run_id(record.run_id),
-            "finding_id": self._short_finding_id(record, canonical_id),
-            "reviewed": True,
-            "note": normalized_note,
-            "reviewed_count": reviewed_count,
-        }
-
- def list_reviewed_findings(
- self,
- *,
- run_id: str | None = None,
- ) -> dict[str, object]:
- record = self._runs.get(run_id)
- with self._state_lock:
- review_items = tuple(
- self._review_state.get(record.run_id, OrderedDict()).items()
- )
- items = []
- for finding_id, note in review_items:
- try:
- finding = self.get_finding(finding_id=finding_id, run_id=record.run_id)
- except MCPFindingNotFoundError:
- continue
- items.append(
- {
- "finding_id": self._short_finding_id(record, finding_id),
- "note": note,
- "finding": self._project_finding_detail(
- record,
- finding,
- detail_level="summary",
- ),
- }
- )
- return {
- "run_id": self._short_run_id(record.run_id),
- "reviewed_count": len(items),
- "items": items,
- }
-
- def clear_session_runs(self) -> dict[str, object]:
- removed_run_ids = self._runs.clear()
- with self._state_lock:
- cleared_review_entries = sum(
- len(entries) for entries in self._review_state.values()
- )
- cleared_gate_results = len(self._last_gate_results)
- cleared_spread_cache_entries = len(self._spread_max_cache)
- self._review_state.clear()
- self._last_gate_results.clear()
- self._spread_max_cache.clear()
- return {
- "cleared_runs": len(removed_run_ids),
- "cleared_run_ids": [
- self._short_run_id(run_id) for run_id in removed_run_ids
- ],
- "cleared_review_entries": cleared_review_entries,
- "cleared_gate_results": cleared_gate_results,
- "cleared_spread_cache_entries": cleared_spread_cache_entries,
- }
-
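-    # Granular check tools: resolve a compatible run (or demand a fresh
-    # analysis), filter the finding query by family/category, and return a
-    # slim, check-scoped health payload.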
- def check_complexity(
- self,
- *,
- run_id: str | None = None,
- root: str | None = None,
- path: str | None = None,
- min_complexity: int | None = None,
- max_results: int = 10,
- detail_level: DetailLevel = "summary",
- ) -> dict[str, object]:
- validated_detail = cast(
- "DetailLevel",
- self._validate_choice("detail_level", detail_level, _VALID_DETAIL_LEVELS),
- )
- record = self._resolve_granular_record(
- run_id=run_id,
- root=root,
- analysis_mode="full",
- )
- findings = self._query_findings(
- record=record,
- family="design",
- category=CATEGORY_COMPLEXITY,
- detail_level=validated_detail,
- changed_paths=self._path_filter_tuple(path),
- sort_by="priority",
- )
- if min_complexity is not None:
- findings = [
- finding
- for finding in findings
- if _as_int(
- self._as_mapping(finding.get("facts")).get(
- "cyclomatic_complexity",
- 0,
- )
- )
- >= min_complexity
- ]
- return self._granular_payload(
- record=record,
- check="complexity",
- items=findings,
- detail_level=validated_detail,
- max_results=max_results,
- path=path,
- threshold_context=self._design_threshold_context(
- record=record,
- check="complexity",
- path=path,
- items=findings,
- requested_min=min_complexity,
- ),
- )
-
- def check_clones(
- self,
- *,
- run_id: str | None = None,
- root: str | None = None,
- path: str | None = None,
- clone_type: str | None = None,
- source_kind: str | None = None,
- max_results: int = 10,
- detail_level: DetailLevel = "summary",
- ) -> dict[str, object]:
- validated_detail = cast(
- "DetailLevel",
- self._validate_choice("detail_level", detail_level, _VALID_DETAIL_LEVELS),
- )
- record = self._resolve_granular_record(
- run_id=run_id,
- root=root,
- analysis_mode="clones_only",
- )
- findings = self._query_findings(
- record=record,
- family="clone",
- source_kind=source_kind,
- detail_level=validated_detail,
- changed_paths=self._path_filter_tuple(path),
- sort_by="priority",
- )
- if clone_type is not None:
- findings = [
- finding
- for finding in findings
- if str(finding.get("clone_type", "")).strip() == clone_type
- ]
- return self._granular_payload(
- record=record,
- check="clones",
- items=findings,
- detail_level=validated_detail,
- max_results=max_results,
- path=path,
- )
-
- def check_coupling(
- self,
- *,
- run_id: str | None = None,
- root: str | None = None,
- path: str | None = None,
- max_results: int = 10,
- detail_level: DetailLevel = "summary",
- ) -> dict[str, object]:
- return self._check_design_metric(
- run_id=run_id,
- root=root,
- path=path,
- max_results=max_results,
- detail_level=detail_level,
- category=CATEGORY_COUPLING,
- check="coupling",
- )
-
- def check_cohesion(
- self,
- *,
- run_id: str | None = None,
- root: str | None = None,
- path: str | None = None,
- max_results: int = 10,
- detail_level: DetailLevel = "summary",
- ) -> dict[str, object]:
- return self._check_design_metric(
- run_id=run_id,
- root=root,
- path=path,
- max_results=max_results,
- detail_level=detail_level,
- category=CATEGORY_COHESION,
- check="cohesion",
- )
-
- def _check_design_metric(
- self,
- *,
- run_id: str | None,
- root: str | None,
- path: str | None,
- max_results: int,
- detail_level: DetailLevel,
- category: str,
- check: str,
- ) -> dict[str, object]:
- validated_detail = cast(
- "DetailLevel",
- self._validate_choice("detail_level", detail_level, _VALID_DETAIL_LEVELS),
- )
- record = self._resolve_granular_record(
- run_id=run_id,
- root=root,
- analysis_mode="full",
- )
- findings = self._query_findings(
- record=record,
- family="design",
- category=category,
- detail_level=validated_detail,
- changed_paths=self._path_filter_tuple(path),
- sort_by="priority",
- )
- return self._granular_payload(
- record=record,
- check=check,
- items=findings,
- detail_level=validated_detail,
- max_results=max_results,
- path=path,
- threshold_context=self._design_threshold_context(
- record=record,
- check=check,
- path=path,
- items=findings,
- ),
- )
-
- def check_dead_code(
- self,
- *,
- run_id: str | None = None,
- root: str | None = None,
- path: str | None = None,
- min_severity: str | None = None,
- max_results: int = 10,
- detail_level: DetailLevel = "summary",
- ) -> dict[str, object]:
- validated_detail = cast(
- "DetailLevel",
- self._validate_choice("detail_level", detail_level, _VALID_DETAIL_LEVELS),
- )
- validated_min_severity = self._validate_optional_choice(
- "min_severity",
- min_severity,
- _VALID_SEVERITIES,
- )
- record = self._resolve_granular_record(
- run_id=run_id,
- root=root,
- analysis_mode="full",
- )
- findings = self._query_findings(
- record=record,
- family="dead_code",
- detail_level=validated_detail,
- changed_paths=self._path_filter_tuple(path),
- sort_by="priority",
- )
- if validated_min_severity is not None:
- findings = [
- finding
- for finding in findings
- if self._severity_rank(str(finding.get("severity", "")))
- >= self._severity_rank(validated_min_severity)
- ]
- return self._granular_payload(
- record=record,
- check="dead_code",
- items=findings,
- detail_level=validated_detail,
- max_results=max_results,
- path=path,
- )
-
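-    # Resource router: codeclone://schema, codeclone://latest/<suffix>, and
-    # codeclone://runs/<run_id>/<suffix> are supported; production triage is
-    # exposed only under latest.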
- def read_resource(self, uri: str) -> str:
- if uri == "codeclone://schema":
- return _json_text_payload(self._schema_resource_payload())
- if uri == "codeclone://latest/triage":
- latest = self._runs.get()
- return _json_text_payload(self.get_production_triage(run_id=latest.run_id))
- latest_prefix = "codeclone://latest/"
- run_prefix = "codeclone://runs/"
- if uri.startswith(latest_prefix):
- latest = self._runs.get()
- suffix = uri[len(latest_prefix) :]
- return self._render_resource(latest, suffix)
- if not uri.startswith(run_prefix):
- raise MCPServiceContractError(f"Unsupported CodeClone resource URI: {uri}")
- remainder = uri[len(run_prefix) :]
- run_id, sep, suffix = remainder.partition("/")
- if not sep:
- raise MCPServiceContractError(f"Unsupported CodeClone resource URI: {uri}")
- record = self._runs.get(run_id)
- return self._render_resource(record, suffix)
-
- def _render_resource(self, record: MCPRunRecord, suffix: str) -> str:
- if suffix == "summary":
- return _json_text_payload(
- self._summary_payload(record.summary, record=record)
- )
- if suffix == "triage":
- raise MCPServiceContractError(
- "Production triage is exposed only as codeclone://latest/triage."
- )
- if suffix == "health":
- return _json_text_payload(self._summary_health_payload(record.summary))
- if suffix == "gates":
- with self._state_lock:
- gate_result = self._last_gate_results.get(record.run_id)
- if gate_result is None:
- raise MCPServiceContractError(
- "No gate evaluation result is available in this MCP session."
- )
- return _json_text_payload(gate_result)
- if suffix == "changed":
- if record.changed_projection is None:
- raise MCPServiceContractError(
- "Changed-findings projection is not available in this run."
- )
- return _json_text_payload(record.changed_projection)
- if suffix == "schema":
- return _json_text_payload(self._schema_resource_payload())
- if suffix == "report.json":
- return _json_text_payload(record.report_document, sort_keys=False)
- if suffix == "overview":
- return _json_text_payload(
- self.list_hotspots(kind="highest_spread", run_id=record.run_id)
- )
- finding_prefix = "findings/"
- if suffix.startswith(finding_prefix):
- finding_id = suffix[len(finding_prefix) :]
- return _json_text_payload(
- self.get_finding(run_id=record.run_id, finding_id=finding_id)
- )
- raise MCPServiceContractError(
- f"Unsupported CodeClone resource suffix '{suffix}'."
- )
-
- def _resolve_request_changed_paths(
- self,
- *,
- root_path: Path,
- changed_paths: Sequence[str],
- git_diff_ref: str | None,
- ) -> tuple[str, ...]:
- if changed_paths and git_diff_ref is not None:
- raise MCPServiceContractError(
- "Provide changed_paths or git_diff_ref, not both."
- )
- if git_diff_ref is not None:
- return self._git_diff_paths(root_path=root_path, git_diff_ref=git_diff_ref)
- if not changed_paths:
- return ()
- return self._normalize_changed_paths(root_path=root_path, paths=changed_paths)
-
- def _resolve_query_changed_paths(
- self,
- *,
- record: MCPRunRecord,
- changed_paths: Sequence[str],
- git_diff_ref: str | None,
- prefer_record_paths: bool = False,
- ) -> tuple[str, ...]:
- if changed_paths or git_diff_ref is not None:
- return self._resolve_request_changed_paths(
- root_path=record.root,
- changed_paths=changed_paths,
- git_diff_ref=git_diff_ref,
- )
- if prefer_record_paths:
- return record.changed_paths
- return ()
-
- def _normalize_changed_paths(
- self,
- *,
- root_path: Path,
- paths: Sequence[str],
- ) -> tuple[str, ...]:
- normalized: set[str] = set()
- for raw_path in paths:
- candidate = Path(str(raw_path)).expanduser()
- if candidate.is_absolute():
- try:
- relative = candidate.resolve().relative_to(root_path)
- except (OSError, ValueError) as exc:
- raise MCPServiceContractError(
- f"Changed path '{raw_path}' is outside root '{root_path}'."
- ) from exc
- normalized.add(relative.as_posix())
- continue
- cleaned = self._normalize_relative_path(candidate.as_posix())
- if cleaned:
- normalized.add(cleaned)
- return tuple(sorted(normalized))
-
- def _git_diff_paths(
- self,
- *,
- root_path: Path,
- git_diff_ref: str,
- ) -> tuple[str, ...]:
- lines = _git_diff_lines_payload(
- root_path=root_path,
- git_diff_ref=git_diff_ref,
- )
- return self._normalize_changed_paths(root_path=root_path, paths=lines)
-
- def _prune_session_state(self) -> None:
- active_run_ids = {record.run_id for record in self._runs.records()}
- with self._state_lock:
- for state_map in (
- self._review_state,
- self._last_gate_results,
- self._spread_max_cache,
- ):
- stale_run_ids = [
- run_id for run_id in state_map if run_id not in active_run_ids
- ]
- for run_id in stale_run_ids:
- state_map.pop(run_id, None)
-
- def _summary_health_score(self, summary: Mapping[str, object]) -> int | None:
- health = self._summary_health_payload(summary)
- if health.get("available") is False:
- return None
- score = health.get("score", 0)
- return _as_int(score, 0)
-
- def _summary_health_delta(self, summary: Mapping[str, object]) -> int | None:
- if self._summary_health_payload(summary).get("available") is False:
- return None
- metrics_diff = self._as_mapping(summary.get("metrics_diff"))
- value = metrics_diff.get("health_delta", 0)
- return _as_int(value, 0)
-
- def _summary_health_payload(
- self,
- summary: Mapping[str, object],
- ) -> dict[str, object]:
- if str(summary.get("analysis_mode", "")) == "clones_only":
- return {"available": False, "reason": "metrics_skipped"}
- health = dict(self._as_mapping(summary.get("health")))
- if health:
- return health
- return {"available": False, "reason": "unavailable"}
-
- @staticmethod
- def _short_run_id(run_id: str) -> str:
- return run_id[:_SHORT_RUN_ID_LENGTH]
-
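-    # Short finding ids are derived per run; when two canonical ids collapse
-    # to the same short id, a disambiguated form keeps the short-to-canonical
-    # mapping bijective.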
- def _finding_id_maps(
- self,
- record: MCPRunRecord,
- ) -> tuple[dict[str, str], dict[str, str]]:
- canonical_ids = sorted(
- str(finding.get("id", ""))
- for finding in self._base_findings(record)
- if str(finding.get("id", ""))
- )
- base_ids = {
- canonical_id: self._base_short_finding_id(canonical_id)
- for canonical_id in canonical_ids
- }
- grouped: dict[str, list[str]] = {}
- for canonical_id, short_id in base_ids.items():
- grouped.setdefault(short_id, []).append(canonical_id)
- canonical_to_short: dict[str, str] = {}
- short_to_canonical: dict[str, str] = {}
- for short_id, group in grouped.items():
- if len(group) == 1:
- canonical_id = group[0]
- canonical_to_short[canonical_id] = short_id
- short_to_canonical[short_id] = canonical_id
- continue
- disambiguated_ids = self._disambiguated_short_finding_ids(group)
- for canonical_id, disambiguated in disambiguated_ids.items():
- canonical_to_short[canonical_id] = disambiguated
- short_to_canonical[disambiguated] = canonical_id
- return canonical_to_short, short_to_canonical
-
- @staticmethod
- def _base_short_finding_id(canonical_id: str) -> str:
- return _base_short_finding_id_payload(canonical_id)
-
- @staticmethod
- def _disambiguated_short_finding_id(canonical_id: str) -> str:
- return _disambiguated_short_finding_id_payload(canonical_id)
-
- def _disambiguated_short_finding_ids(
- self,
- canonical_ids: Sequence[str],
- ) -> dict[str, str]:
- clone_ids = [
- canonical_id
- for canonical_id in canonical_ids
- if canonical_id.startswith("clone:")
- ]
- if len(clone_ids) == len(canonical_ids):
- clone_short_ids = _disambiguated_clone_short_ids_payload(clone_ids)
- if len(set(clone_short_ids.values())) == len(clone_short_ids):
- return clone_short_ids
- return {
- canonical_id: self._disambiguated_short_finding_id(canonical_id)
- for canonical_id in canonical_ids
- }
-
- def _short_finding_id(
- self,
- record: MCPRunRecord,
- canonical_id: str,
- ) -> str:
- canonical_to_short, _short_to_canonical = self._finding_id_maps(record)
- return canonical_to_short.get(canonical_id, canonical_id)
-
- def _resolve_canonical_finding_id(
- self,
- record: MCPRunRecord,
- finding_id: str,
- ) -> str:
- canonical_to_short, short_to_canonical = self._finding_id_maps(record)
- if finding_id in canonical_to_short:
- return finding_id
- canonical = short_to_canonical.get(finding_id)
- if canonical is not None:
- return canonical
- raise MCPFindingNotFoundError(
- f"Finding id '{finding_id}' was not found in run "
- f"'{self._short_run_id(record.run_id)}'."
- )
-
- def _leaf_symbol_name(self, value: object) -> str:
- return _leaf_symbol_name_payload(value)
-
- @staticmethod
- def _comparison_settings(
- *,
- args: Namespace,
- request: MCPAnalysisRequest,
- ) -> tuple[object, ...]:
- return (
- request.analysis_mode,
- _as_int(args.min_loc, DEFAULT_MIN_LOC),
- _as_int(args.min_stmt, DEFAULT_MIN_STMT),
- _as_int(args.block_min_loc, DEFAULT_BLOCK_MIN_LOC),
- _as_int(args.block_min_stmt, DEFAULT_BLOCK_MIN_STMT),
- _as_int(args.segment_min_loc, DEFAULT_SEGMENT_MIN_LOC),
- _as_int(args.segment_min_stmt, DEFAULT_SEGMENT_MIN_STMT),
- _as_int(
- args.design_complexity_threshold,
- DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD,
- ),
- _as_int(
- args.design_coupling_threshold,
- DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD,
- ),
- _as_int(
- args.design_cohesion_threshold,
- DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD,
- ),
- )
-
- @staticmethod
- def _comparison_scope(
- *,
- before: MCPRunRecord,
- after: MCPRunRecord,
- ) -> dict[str, object]:
- same_root = before.root == after.root
- same_analysis_settings = before.comparison_settings == after.comparison_settings
- if same_root and same_analysis_settings:
- reason = "comparable"
- elif not same_root and not same_analysis_settings:
- reason = "different_root_and_analysis_settings"
- elif not same_root:
- reason = "different_root"
- else:
- reason = "different_analysis_settings"
- return {
- "comparable": same_root and same_analysis_settings,
- "same_root": same_root,
- "same_analysis_settings": same_analysis_settings,
- "reason": reason,
- }
-
- @staticmethod
- def _severity_rank(severity: str) -> int:
- return {
- SEVERITY_CRITICAL: 3,
- SEVERITY_WARNING: 2,
- SEVERITY_INFO: 1,
- }.get(severity, 0)
-
- def _path_filter_tuple(self, path: str | None) -> tuple[str, ...]:
- if not path:
- return ()
- cleaned = self._normalize_relative_path(Path(path).as_posix())
- return (cleaned,) if cleaned else ()
-
- def _normalize_relative_path(self, path: str) -> str:
- cleaned = path.strip()
- if cleaned == ".":
- return ""
- if cleaned.startswith("./"):
- cleaned = cleaned[2:]
- cleaned = cleaned.rstrip("/")
- if ".." in Path(cleaned).parts:
-            raise MCPServiceContractError(f"Path traversal is not allowed: '{path}'.")
- return cleaned
-
- def _previous_run_for_root(self, record: MCPRunRecord) -> MCPRunRecord | None:
- previous: MCPRunRecord | None = None
- for item in self._runs.records():
- if item.run_id == record.run_id:
- return previous
- if item.root == record.root:
- previous = item
- return None
-
- @staticmethod
- def _record_supports_analysis_mode(
- record: MCPRunRecord,
- *,
- analysis_mode: AnalysisMode,
- ) -> bool:
- record_mode = record.request.analysis_mode
- if analysis_mode == "clones_only":
- return record_mode in {"clones_only", "full"}
- return record_mode == "full"
-
- def _latest_compatible_record(
- self,
- *,
- analysis_mode: AnalysisMode,
- root_path: Path | None = None,
- ) -> MCPRunRecord | None:
- for item in reversed(self._runs.records()):
- if root_path is not None and item.root != root_path:
- continue
- if self._record_supports_analysis_mode(
- item,
- analysis_mode=analysis_mode,
- ):
- return item
- return None
-
- def _resolve_granular_record(
- self,
- *,
- run_id: str | None,
- root: str | None,
- analysis_mode: AnalysisMode,
- ) -> MCPRunRecord:
- if run_id is not None:
- record = self._runs.get(run_id)
- if self._record_supports_analysis_mode(record, analysis_mode=analysis_mode):
- return record
- raise MCPServiceContractError(
- "Selected MCP run is not compatible with this check. "
- f"Call analyze_repository(root='{record.root}', "
- "analysis_mode='full') first."
- )
- root_path = self._resolve_optional_root(root)
- latest_record = self._latest_compatible_record(
- analysis_mode=analysis_mode,
- root_path=root_path,
- )
- if latest_record is not None:
- return latest_record
- if root_path is not None:
- raise MCPRunNotFoundError(
- f"No compatible MCP analysis run is available for root: {root_path}. "
- f"Call analyze_repository(root='{root_path}') or "
- f"analyze_changed_paths(root='{root_path}', changed_paths=[...]) first."
- )
- raise MCPRunNotFoundError(
- "No compatible MCP analysis run is available. "
- "Call analyze_repository(root='/path/to/repo') or "
- "analyze_changed_paths(root='/path/to/repo', changed_paths=[...]) first."
- )
-
- def _base_findings(self, record: MCPRunRecord) -> list[dict[str, object]]:
- report_document = record.report_document
- findings = self._as_mapping(report_document.get("findings"))
- groups = self._as_mapping(findings.get("groups"))
- clone_groups = self._as_mapping(groups.get(FAMILY_CLONES))
- return [
- *self._dict_list(clone_groups.get("functions")),
- *self._dict_list(clone_groups.get("blocks")),
- *self._dict_list(clone_groups.get("segments")),
- *self._dict_list(
- self._as_mapping(groups.get(FAMILY_STRUCTURAL)).get("groups")
- ),
- *self._dict_list(
- self._as_mapping(groups.get(FAMILY_DEAD_CODE)).get("groups")
- ),
- *self._dict_list(self._as_mapping(groups.get(FAMILY_DESIGN)).get("groups")),
- ]
-
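-    # Core finding pipeline: cache the run's max spread, filter, attach
-    # remediation, score priority, sort, then decorate each finding at the
-    # requested detail level.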
- def _query_findings(
- self,
- *,
- record: MCPRunRecord,
- family: FindingFamilyFilter = "all",
- category: str | None = None,
- severity: str | None = None,
- source_kind: str | None = None,
- novelty: FindingNoveltyFilter = "all",
- sort_by: FindingSort = "default",
- detail_level: DetailLevel = "normal",
- changed_paths: Sequence[str] = (),
- exclude_reviewed: bool = False,
- ) -> list[dict[str, object]]:
- findings = self._base_findings(record)
- max_spread_value = max(
- (self._spread_value(finding) for finding in findings),
- default=0,
- )
- with self._state_lock:
- self._spread_max_cache[record.run_id] = max_spread_value
- filtered = [
- finding
- for finding in findings
- if self._matches_finding_filters(
- finding=finding,
- family=family,
- category=category,
- severity=severity,
- source_kind=source_kind,
- novelty=novelty,
- )
- and (
- not changed_paths
- or self._finding_touches_paths(
- finding=finding,
- changed_paths=changed_paths,
- )
- )
- and (not exclude_reviewed or not self._finding_is_reviewed(record, finding))
- ]
- remediation_map = {
- str(finding.get("id", "")): self._remediation_for_finding(record, finding)
- for finding in filtered
- }
- priority_map = {
- str(finding.get("id", "")): self._priority_score(
- record,
- finding,
- remediation=remediation_map[str(finding.get("id", ""))],
- max_spread_value=max_spread_value,
- )
- for finding in filtered
- }
- ordered = self._sort_findings(
- record=record,
- findings=filtered,
- sort_by=sort_by,
- priority_map=priority_map,
- )
- return [
- self._decorate_finding(
- record,
- finding,
- detail_level=detail_level,
- remediation=remediation_map[str(finding.get("id", ""))],
- priority_payload=priority_map[str(finding.get("id", ""))],
- max_spread_value=max_spread_value,
- )
- for finding in ordered
- ]
-
- def _sort_findings(
- self,
- *,
- record: MCPRunRecord,
- findings: Sequence[Mapping[str, object]],
- sort_by: FindingSort,
- priority_map: Mapping[str, Mapping[str, object]] | None = None,
- ) -> list[dict[str, object]]:
- finding_rows = [dict(finding) for finding in findings]
- if sort_by == "default":
- return finding_rows
- if sort_by == "severity":
- finding_rows.sort(
- key=lambda finding: (
- -self._severity_rank(str(finding.get("severity", ""))),
- str(finding.get("id", "")),
- )
- )
- elif sort_by == "spread":
- finding_rows.sort(
- key=lambda finding: (
- -self._spread_value(finding),
- -_as_float(finding.get("priority", 0.0), 0.0),
- str(finding.get("id", "")),
- )
- )
-        else:
-
-            def priority_value(finding: Mapping[str, object]) -> float:
-                # Prefer the precomputed priority map; fall back to scoring
-                # the finding on demand.
-                if priority_map is not None:
-                    payload = self._as_mapping(
-                        priority_map.get(str(finding.get("id", "")))
-                    )
-                    return _as_float(payload.get("score", 0.0), 0.0)
-                return _as_float(
-                    self._priority_score(record, finding)["score"],
-                    0.0,
-                )
-
-            finding_rows.sort(
-                key=lambda finding: (
-                    -priority_value(finding),
-                    -self._severity_rank(str(finding.get("severity", ""))),
-                    str(finding.get("id", "")),
-                )
-            )
- return finding_rows
-
- def _decorate_finding(
- self,
- record: MCPRunRecord,
- finding: Mapping[str, object],
- *,
- detail_level: DetailLevel,
- remediation: Mapping[str, object] | None = None,
- priority_payload: Mapping[str, object] | None = None,
- max_spread_value: int | None = None,
- ) -> dict[str, object]:
- resolved_remediation = (
- remediation
- if remediation is not None
- else self._remediation_for_finding(record, finding)
- )
- resolved_priority_payload = (
- dict(priority_payload)
- if priority_payload is not None
- else self._priority_score(
- record,
- finding,
- remediation=resolved_remediation,
- max_spread_value=max_spread_value,
- )
- )
- payload = dict(finding)
- payload["priority_score"] = resolved_priority_payload["score"]
- payload["priority_factors"] = resolved_priority_payload["factors"]
- payload["locations"] = self._locations_for_finding(
- record,
- finding,
- include_uri=detail_level == "full",
- )
- payload["html_anchor"] = f"finding-{finding.get('id', '')}"
- if resolved_remediation is not None:
- payload["remediation"] = resolved_remediation
- return self._project_finding_detail(
- record,
- payload,
- detail_level=detail_level,
- )
-
- def _project_finding_detail(
- self,
- record: MCPRunRecord,
- finding: Mapping[str, object],
- *,
- detail_level: DetailLevel,
- ) -> dict[str, object]:
- if detail_level == "full":
- full_payload = dict(finding)
- full_payload["id"] = self._short_finding_id(
- record,
- str(finding.get("id", "")),
- )
- return full_payload
- payload: dict[str, object] = {
- "id": self._short_finding_id(record, str(finding.get("id", ""))),
- "kind": self._finding_kind_label(finding),
- "severity": str(finding.get("severity", "")),
- "novelty": str(finding.get("novelty", "")),
- "scope": self._finding_source_kind(finding),
- "count": _as_int(finding.get("count", 0), 0),
- "spread": dict(self._as_mapping(finding.get("spread"))),
- "priority": round(_as_float(finding.get("priority_score", 0.0), 0.0), 2),
- }
- clone_type = str(finding.get("clone_type", "")).strip()
- if clone_type:
- payload["type"] = clone_type
- locations = [
- self._as_mapping(item)
- for item in self._as_sequence(finding.get("locations"))
- ]
- if detail_level == "summary":
- remediation = self._as_mapping(finding.get("remediation"))
- if remediation:
- payload["effort"] = str(remediation.get("effort", ""))
- payload["locations"] = [
- summary_location
- for summary_location in (
- self._summary_location_string(location) for location in locations
- )
- if summary_location
- ]
- return payload
- remediation = self._as_mapping(finding.get("remediation"))
- if remediation:
- payload["remediation"] = self._project_remediation(
- remediation,
- detail_level="normal",
- )
- payload["locations"] = [
- projected
- for projected in (
- self._normal_location_payload(location) for location in locations
- )
- if projected
- ]
- return payload
-
- def _finding_summary_card(
- self,
- record: MCPRunRecord,
- finding: Mapping[str, object],
- ) -> dict[str, object]:
- return self._finding_summary_card_payload(
- record,
- self._decorate_finding(record, finding, detail_level="full"),
- )
-
- def _finding_summary_card_payload(
- self,
- record: MCPRunRecord,
- finding: Mapping[str, object],
- ) -> dict[str, object]:
- return self._project_finding_detail(record, finding, detail_level="summary")
-
- def _comparison_finding_card(
- self,
- record: MCPRunRecord,
- finding: Mapping[str, object],
- ) -> dict[str, object]:
- summary_card = self._finding_summary_card(record, finding)
- return {
- "id": summary_card.get("id"),
- "kind": summary_card.get("kind"),
- "severity": summary_card.get("severity"),
- }
-
- @staticmethod
- def _finding_kind_label(finding: Mapping[str, object]) -> str:
- family = str(finding.get("family", "")).strip()
- kind = str(finding.get("kind", finding.get("category", ""))).strip()
- if family == FAMILY_CLONE:
- clone_kind = str(
- finding.get("clone_kind", finding.get("category", kind))
- ).strip()
- return f"{clone_kind}_clone" if clone_kind else "clone"
- if family == FAMILY_DEAD_CODE:
- return "dead_code"
- return kind or family
-
- @staticmethod
- def _summary_location_string(location: Mapping[str, object]) -> str:
- path = str(location.get("file", "")).strip()
- line = _as_int(location.get("line", 0), 0)
- if not path:
- return ""
- return f"{path}:{line}" if line > 0 else path
-
- def _normal_location_payload(
- self,
- location: Mapping[str, object],
- ) -> dict[str, object]:
- path = str(location.get("file", "")).strip()
- if not path:
- return {}
- payload: dict[str, object] = {
- "path": path,
- "line": _as_int(location.get("line", 0), 0),
- "end_line": _as_int(location.get("end_line", 0), 0),
- }
- symbol = self._leaf_symbol_name(location.get("symbol"))
- if symbol:
- payload["symbol"] = symbol
- return payload
-
- def _matches_finding_filters(
- self,
- *,
- finding: Mapping[str, object],
- family: FindingFamilyFilter,
- category: str | None = None,
- severity: str | None,
- source_kind: str | None,
- novelty: FindingNoveltyFilter,
- ) -> bool:
- finding_family = str(finding.get("family", "")).strip()
- if family != "all" and finding_family != family:
- return False
- if (
- category is not None
- and str(finding.get("category", "")).strip() != category
- ):
- return False
- if (
- severity is not None
- and str(finding.get("severity", "")).strip() != severity
- ):
- return False
- dominant_kind = str(
- self._as_mapping(finding.get("source_scope")).get("dominant_kind", "")
- ).strip()
- if source_kind is not None and dominant_kind != source_kind:
- return False
- return novelty == "all" or str(finding.get("novelty", "")).strip() == novelty
-
- def _finding_touches_paths(
- self,
- *,
- finding: Mapping[str, object],
- changed_paths: Sequence[str],
- ) -> bool:
- normalized_paths = tuple(changed_paths)
- for item in self._as_sequence(finding.get("items")):
- relative_path = str(self._as_mapping(item).get("relative_path", "")).strip()
- if relative_path and self._path_matches(relative_path, normalized_paths):
- return True
- return False
-
- @staticmethod
- def _path_matches(relative_path: str, changed_paths: Sequence[str]) -> bool:
- for candidate in changed_paths:
- if relative_path == candidate or relative_path.startswith(candidate + "/"):
- return True
- return False
-
- def _finding_is_reviewed(
- self,
- record: MCPRunRecord,
- finding: Mapping[str, object],
- ) -> bool:
- with self._state_lock:
- review_map = self._review_state.get(record.run_id, OrderedDict())
- return str(finding.get("id", "")) in review_map
-
- def _include_hotspot_finding(
- self,
- *,
- record: MCPRunRecord,
- finding: Mapping[str, object],
- changed_paths: Sequence[str],
- exclude_reviewed: bool,
- ) -> bool:
- if changed_paths and not self._finding_touches_paths(
- finding=finding,
- changed_paths=changed_paths,
- ):
- return False
- return not exclude_reviewed or not self._finding_is_reviewed(record, finding)
-
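-    # Priority is the geometric mean of six weights (severity, effort,
-    # novelty, runtime scope, spread, confidence); each factor is floored at
-    # 0.01 so one zero-ish weight cannot null the whole score.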
- def _priority_score(
- self,
- record: MCPRunRecord,
- finding: Mapping[str, object],
- *,
- remediation: Mapping[str, object] | None = None,
- max_spread_value: int | None = None,
- ) -> dict[str, object]:
- spread_weight = self._spread_weight(
- record,
- finding,
- max_spread_value=max_spread_value,
- )
- factors = {
- "severity_weight": _SEVERITY_WEIGHT.get(
- str(finding.get("severity", "")),
- 0.2,
- ),
- "effort_weight": _EFFORT_WEIGHT.get(
- (
- str(remediation.get("effort", EFFORT_MODERATE))
- if remediation is not None
- else EFFORT_MODERATE
- ),
- 0.6,
- ),
- "novelty_weight": _NOVELTY_WEIGHT.get(
- str(finding.get("novelty", "")),
- 0.7,
- ),
- "runtime_weight": _RUNTIME_WEIGHT.get(
- str(
- self._as_mapping(finding.get("source_scope")).get(
- "dominant_kind",
- "other",
- )
- ),
- 0.5,
- ),
- "spread_weight": spread_weight,
- "confidence_weight": _CONFIDENCE_WEIGHT.get(
- str(finding.get("confidence", CONFIDENCE_MEDIUM)),
- 0.7,
- ),
- }
- product = 1.0
- for value in factors.values():
- product *= max(_as_float(value, 0.01), 0.01)
- score = product ** (1.0 / max(len(factors), 1))
- return {
- "score": round(score, 4),
- "factors": {
- key: round(_as_float(value, 0.0), 4) for key, value in factors.items()
- },
- }
-
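-    # Spread weight normalizes a finding's spread against the run-wide
-    # maximum (cached per run), clamped to [0.2, 1.0]; 0.3 is the fallback
-    # when the run carries no spread signal.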
- def _spread_weight(
- self,
- record: MCPRunRecord,
- finding: Mapping[str, object],
- *,
- max_spread_value: int | None = None,
- ) -> float:
- spread_value = self._spread_value(finding)
- if max_spread_value is None:
- with self._state_lock:
- max_spread_value = self._spread_max_cache.get(record.run_id)
- if max_spread_value is None:
- max_spread_value = max(
- (self._spread_value(item) for item in self._base_findings(record)),
- default=0,
- )
- with self._state_lock:
- self._spread_max_cache[record.run_id] = max_spread_value
- max_value = max_spread_value
- if max_value <= 0:
- return 0.3
- return max(0.2, min(1.0, spread_value / max_value))
-
- def _spread_value(self, finding: Mapping[str, object]) -> int:
- spread = self._as_mapping(finding.get("spread"))
- files = _as_int(spread.get("files", 0), 0)
- functions = _as_int(spread.get("functions", 0), 0)
- count = _as_int(finding.get("count", 0), 0)
- return max(files, functions, count, 1)
-
- def _locations_for_finding(
- self,
- record: MCPRunRecord,
- finding: Mapping[str, object],
- *,
- include_uri: bool = True,
- ) -> list[dict[str, object]]:
- locations: list[dict[str, object]] = []
- for item in self._as_sequence(finding.get("items")):
- item_map = self._as_mapping(item)
- relative_path = str(item_map.get("relative_path", "")).strip()
- if not relative_path:
- continue
- line = _as_int(item_map.get("start_line", 0) or 0, 0)
- end_line = _as_int(item_map.get("end_line", 0) or 0, 0)
- symbol = str(item_map.get("qualname", item_map.get("module", ""))).strip()
- location: dict[str, object] = {
- "file": relative_path,
- "line": line,
- "end_line": end_line,
- "symbol": symbol,
- }
- if include_uri:
- absolute_path = (record.root / relative_path).resolve()
- uri = absolute_path.as_uri()
- if line > 0:
- uri = f"{uri}#L{line}"
- location["uri"] = uri
- locations.append(location)
- deduped: list[dict[str, object]] = []
- seen: set[tuple[str, int, str]] = set()
- for location in locations:
- key = (
- str(location.get("file", "")),
- _as_int(location.get("line", 0), 0),
- str(location.get("symbol", "")),
- )
- if key not in seen:
- seen.add(key)
- deduped.append(location)
- return deduped
-
- @staticmethod
- def _suggestion_finding_id(suggestion: object) -> str:
- return _suggestion_finding_id_payload(suggestion)
-
- def _remediation_for_finding(
- self,
- record: MCPRunRecord,
- finding: Mapping[str, object],
- ) -> dict[str, object] | None:
- suggestion = self._suggestion_for_finding(record, str(finding.get("id", "")))
- if suggestion is None:
- return None
- source_kind = str(getattr(suggestion, "source_kind", "other"))
- spread_files = _as_int(getattr(suggestion, "spread_files", 0), 0)
- spread_functions = _as_int(getattr(suggestion, "spread_functions", 0), 0)
- title = str(getattr(suggestion, "title", "")).strip()
- severity = str(finding.get("severity", "")).strip()
- novelty = str(finding.get("novelty", "known")).strip()
- count = _as_int(
- getattr(suggestion, "fact_count", 0) or finding.get("count", 0) or 0,
- 0,
- )
- safe_refactor_shape = self._safe_refactor_shape(suggestion)
- effort = str(getattr(suggestion, "effort", EFFORT_MODERATE))
- confidence = str(getattr(suggestion, "confidence", CONFIDENCE_MEDIUM))
- risk_level = self._risk_level_for_effort(effort)
- return {
- "effort": effort,
- "priority": _as_float(getattr(suggestion, "priority", 0.0), 0.0),
- "confidence": confidence,
- "safe_refactor_shape": safe_refactor_shape,
- "steps": list(getattr(suggestion, "steps", ())),
- "risk_level": risk_level,
- "why_now": self._why_now_text(
- title=title,
- severity=severity,
- novelty=novelty,
- count=count,
- source_kind=source_kind,
- spread_files=spread_files,
- spread_functions=spread_functions,
- effort=effort,
- ),
- "blast_radius": {
- "files": spread_files,
- "functions": spread_functions,
- "is_production": source_kind == "production",
- },
- }
-
- def _suggestion_for_finding(
- self,
- record: MCPRunRecord,
- finding_id: str,
- ) -> object | None:
- for suggestion in record.suggestions:
- if self._suggestion_finding_id(suggestion) == finding_id:
- return suggestion
- return None
-
- @staticmethod
- def _safe_refactor_shape(suggestion: object) -> str:
- category = str(getattr(suggestion, "category", "")).strip()
- clone_type = str(getattr(suggestion, "clone_type", "")).strip()
- title = str(getattr(suggestion, "title", "")).strip()
- if category == CATEGORY_CLONE and clone_type == "Type-1":
- return "Keep one canonical implementation and route callers through it."
- if category == CATEGORY_CLONE and clone_type == "Type-2":
- return "Extract shared implementation with explicit parameters."
- if category == CATEGORY_CLONE and "Block" in title:
- return "Extract the repeated statement sequence into a helper."
- if category == CATEGORY_STRUCTURAL:
- return "Extract the repeated branch family into a named helper."
- if category == CATEGORY_COMPLEXITY:
- return "Split the function into smaller named steps."
- if category == CATEGORY_COUPLING:
- return "Isolate responsibilities and invert unnecessary dependencies."
- if category == CATEGORY_COHESION:
- return "Split the class by responsibility boundary."
- if category == CATEGORY_DEAD_CODE:
- return "Delete the unused symbol or document intentional reachability."
- if category == CATEGORY_DEPENDENCY:
- return "Break the cycle by moving shared abstractions to a lower layer."
- return "Extract the repeated logic into a shared, named abstraction."
-
- @staticmethod
- def _risk_level_for_effort(effort: str) -> str:
- return {
- EFFORT_EASY: "low",
- EFFORT_MODERATE: "medium",
- EFFORT_HARD: "high",
- }.get(effort, "medium")
-
- @staticmethod
- def _why_now_text(
- *,
- title: str,
- severity: str,
- novelty: str,
- count: int,
- source_kind: str,
- spread_files: int,
- spread_functions: int,
- effort: str,
- ) -> str:
- novelty_text = "new regression" if novelty == "new" else "known debt"
- context = (
- "production code"
- if source_kind == "production"
- else source_kind or "mixed scope"
- )
- spread_text = f"{spread_files} files / {spread_functions} functions"
- count_text = f"{count} instances" if count > 0 else "localized issue"
- return (
- f"{severity.upper()} {title} in {context} — {count_text}, "
- f"{spread_text}, {effort} fix, {novelty_text}."
- )
-
- def _project_remediation(
- self,
- remediation: Mapping[str, object],
- *,
- detail_level: DetailLevel,
- ) -> dict[str, object]:
- if detail_level == "full":
- return dict(remediation)
- projected = {
- "effort": remediation.get("effort"),
- "risk": remediation.get("risk_level"),
- "shape": remediation.get("safe_refactor_shape"),
- "why_now": remediation.get("why_now"),
- }
- if detail_level == "summary":
- return projected
- projected["steps"] = list(self._as_sequence(remediation.get("steps")))
- return projected
-
- def _hotspot_rows(
- self,
- *,
- record: MCPRunRecord,
- kind: HotlistKind,
- detail_level: DetailLevel,
- changed_paths: Sequence[str],
- exclude_reviewed: bool,
- ) -> list[dict[str, object]]:
- findings = self._base_findings(record)
- finding_index = {str(finding.get("id", "")): finding for finding in findings}
- max_spread_value = max(
- (self._spread_value(finding) for finding in findings),
- default=0,
- )
- with self._state_lock:
- self._spread_max_cache[record.run_id] = max_spread_value
- remediation_map = {
- str(finding.get("id", "")): self._remediation_for_finding(record, finding)
- for finding in findings
- }
- priority_map = {
- str(finding.get("id", "")): self._priority_score(
- record,
- finding,
- remediation=remediation_map[str(finding.get("id", ""))],
- max_spread_value=max_spread_value,
- )
- for finding in findings
- }
- derived = self._as_mapping(record.report_document.get("derived"))
- hotlists = self._as_mapping(derived.get("hotlists"))
- if kind == "highest_priority":
- ordered_ids = [
- str(finding.get("id", ""))
- for finding in self._sort_findings(
- record=record,
- findings=findings,
- sort_by="priority",
- priority_map=priority_map,
- )
- ]
- else:
- hotlist_key = _HOTLIST_REPORT_KEYS.get(kind)
- if hotlist_key is None:
- return []
- ordered_ids = [
- str(item)
- for item in self._as_sequence(hotlists.get(hotlist_key))
- if str(item)
- ]
- rows: list[dict[str, object]] = []
- for finding_id in ordered_ids:
- finding = finding_index.get(finding_id)
- if finding is None or not self._include_hotspot_finding(
- record=record,
- finding=finding,
- changed_paths=changed_paths,
- exclude_reviewed=exclude_reviewed,
- ):
- continue
- finding_id_key = str(finding.get("id", ""))
- rows.append(
- self._decorate_finding(
- record,
- finding,
- detail_level=detail_level,
- remediation=remediation_map[finding_id_key],
- priority_payload=priority_map[finding_id_key],
- max_spread_value=max_spread_value,
- )
- )
- return rows
-
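-    # The changed projection summarizes findings touching the recorded
-    # changed paths and derives a regressed/improved/stable verdict.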
- def _build_changed_projection(
- self,
- record: MCPRunRecord,
- ) -> dict[str, object] | None:
- if not record.changed_paths:
- return None
- items = self._query_findings(
- record=record,
- detail_level="summary",
- changed_paths=record.changed_paths,
- )
- new_count = sum(1 for item in items if str(item.get("novelty", "")) == "new")
- known_count = sum(
- 1 for item in items if str(item.get("novelty", "")) == "known"
- )
- new_by_source_kind = self._source_kind_breakdown(
- item.get("source_kind")
- for item in items
- if str(item.get("novelty", "")) == "new"
- )
- health_delta = self._summary_health_delta(record.summary)
- return {
- "run_id": self._short_run_id(record.run_id),
- "changed_paths": list(record.changed_paths),
- "total": len(items),
- "new": new_count,
- "known": known_count,
- "new_by_source_kind": new_by_source_kind,
- "items": items,
- "health": dict(self._summary_health_payload(record.summary)),
- "health_delta": health_delta,
- "verdict": self._changed_verdict(
- changed_projection={"new": new_count, "total": len(items)},
- health_delta=health_delta,
- ),
- }
-
- def _changed_analysis_payload(
- self,
- record: MCPRunRecord,
- ) -> dict[str, object]:
- changed_projection = self._as_mapping(record.changed_projection)
- health = self._summary_health_payload(record.summary)
- health_payload = (
- {
- "score": health.get("score"),
- "grade": health.get("grade"),
- }
- if health.get("available") is not False
- else dict(health)
- )
- return {
- "run_id": self._short_run_id(record.run_id),
- "focus": _FOCUS_CHANGED_PATHS,
- "health_scope": _HEALTH_SCOPE_REPOSITORY,
- "baseline": dict(self._summary_baseline_payload(record.summary)),
- "changed_files": len(record.changed_paths),
- "health": health_payload,
- "analysis_profile": self._summary_analysis_profile_payload(record.summary),
- "health_delta": (
- _as_int(changed_projection.get("health_delta", 0), 0)
- if changed_projection.get("health_delta") is not None
- else None
- ),
- "verdict": str(changed_projection.get("verdict", "stable")),
- "new_findings": _as_int(changed_projection.get("new", 0), 0),
- "new_by_source_kind": dict(
- self._as_mapping(changed_projection.get("new_by_source_kind"))
- ),
- "resolved_findings": 0,
- "changed_findings": [],
- "coverage_join": self._summary_coverage_join_payload(record),
- }
-
- def _augment_summary_with_changed(
- self,
- *,
- summary: Mapping[str, object],
- changed_paths: Sequence[str],
- changed_projection: Mapping[str, object] | None,
- ) -> dict[str, object]:
- payload = dict(summary)
- if changed_paths:
- payload["changed_paths"] = list(changed_paths)
- if changed_projection is not None:
- payload["changed_findings"] = {
- "total": _as_int(changed_projection.get("total", 0), 0),
- "new": _as_int(changed_projection.get("new", 0), 0),
- "known": _as_int(changed_projection.get("known", 0), 0),
- "items": [
- dict(self._as_mapping(item))
- for item in self._as_sequence(changed_projection.get("items"))[:10]
- ],
- }
- payload["health_delta"] = (
- _as_int(changed_projection.get("health_delta", 0), 0)
- if changed_projection.get("health_delta") is not None
- else None
- )
- payload["verdict"] = str(changed_projection.get("verdict", "stable"))
- return payload
-
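-    # Verdict rules: any new finding or a negative health delta means
-    # "regressed"; no findings in scope plus a positive health delta means
-    # "improved"; everything else is "stable".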
- @staticmethod
- def _changed_verdict(
- *,
- changed_projection: Mapping[str, object],
- health_delta: int | None,
- ) -> str:
- if _as_int(changed_projection.get("new", 0), 0) > 0 or (
- health_delta is not None and health_delta < 0
- ):
- return "regressed"
- if (
- _as_int(changed_projection.get("total", 0), 0) == 0
- and health_delta is not None
- and health_delta > 0
- ):
- return "improved"
- return "stable"
-
- def _comparison_index(
- self,
- record: MCPRunRecord,
- *,
- focus: ComparisonFocus,
- ) -> dict[str, dict[str, object]]:
- findings = self._base_findings(record)
- if focus == "clones":
- findings = [f for f in findings if str(f.get("family", "")) == FAMILY_CLONE]
- elif focus == "structural":
- findings = [
- f for f in findings if str(f.get("family", "")) == FAMILY_STRUCTURAL
- ]
- elif focus == "metrics":
- findings = [
- f
- for f in findings
- if str(f.get("family", "")) in {FAMILY_DESIGN, FAMILY_DEAD_CODE}
- ]
- return {str(finding.get("id", "")): dict(finding) for finding in findings}
-
- @staticmethod
- def _comparison_verdict(
- *,
- regressions: int,
- improvements: int,
- health_delta: int | None,
- ) -> str:
- has_negative_signal = regressions > 0 or (
- health_delta is not None and health_delta < 0
- )
- has_positive_signal = improvements > 0 or (
- health_delta is not None and health_delta > 0
- )
- if has_negative_signal and has_positive_signal:
- return "mixed"
- if has_negative_signal:
- return "regressed"
- if has_positive_signal:
- return "improved"
- return "stable"
-
- @staticmethod
- def _comparison_summary_text(
- *,
- comparable: bool,
- comparability_reason: str,
- regressions: int,
- improvements: int,
- health_delta: int | None,
- ) -> str:
- if not comparable:
- reason_text = {
- "different_root": "different roots",
- "different_analysis_settings": "different analysis settings",
- "different_root_and_analysis_settings": (
- "different roots and analysis settings"
- ),
- }.get(comparability_reason, "incomparable runs")
- return f"Finding and run health deltas omitted ({reason_text})"
- if health_delta is None:
- return (
- f"{improvements} findings resolved, {regressions} new regressions; "
- "run health delta omitted (metrics unavailable)"
- )
- return (
- f"{improvements} findings resolved, {regressions} new regressions, "
- f"run health delta {health_delta:+d}"
- )
-
- def _render_pr_summary_markdown(self, payload: Mapping[str, object]) -> str:
- health = self._as_mapping(payload.get("health"))
- score = health.get("score", "n/a")
- grade = health.get("grade", "n/a")
- delta = _as_int(payload.get("health_delta", 0), 0)
- changed_items = [
- self._as_mapping(item)
- for item in self._as_sequence(payload.get("new_findings_in_changed_files"))
- ]
- resolved = [
- self._as_mapping(item)
- for item in self._as_sequence(payload.get("resolved"))
- ]
- blocking_gates = [
- str(item)
- for item in self._as_sequence(payload.get("blocking_gates"))
- if str(item)
- ]
-        delta_text = (
-            f"{delta:+d}" if payload.get("health_delta") is not None else "n/a"
-        )
-        health_line = (
-            f"Health: {score}/100 ({grade}) | Delta: {delta_text} | "
-            f"Verdict: {payload.get('verdict', 'stable')}"
-        )
- lines = [
- "## CodeClone Summary",
- "",
- health_line,
- "",
- f"### New findings in changed files ({len(changed_items)})",
- ]
- if not changed_items:
- lines.append("- None")
- else:
- lines.extend(
- [
- (
- f"- **{str(item.get('severity', 'info')).upper()}** "
- f"{item.get('kind', 'finding')} in "
- f"`{self._finding_display_location(item)}`"
- )
- for item in changed_items[:10]
- ]
- )
- lines.extend(["", f"### Resolved ({len(resolved)})"])
- if not resolved:
- lines.append("- None")
- else:
- lines.extend(
- [
- (
- f"- {item.get('kind', 'finding')} in "
- f"`{self._finding_display_location(item)}`"
- )
- for item in resolved[:10]
- ]
- )
- lines.extend(["", "### Blocking gates"])
- if not blocking_gates:
- lines.append("- none")
- else:
- lines.extend([f"- `{reason}`" for reason in blocking_gates])
- return "\n".join(lines)
-
- def _finding_display_location(self, finding: Mapping[str, object]) -> str:
- locations = self._as_sequence(finding.get("locations"))
- if not locations:
- return "(unknown)"
- first = locations[0]
- if isinstance(first, str):
- return first
- location = self._as_mapping(first)
- path = str(location.get("path", location.get("file", ""))).strip()
- line = _as_int(location.get("line", 0), 0)
- if not path:
- return "(unknown)"
- return f"{path}:{line}" if line > 0 else path
-
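-    # Shared response shape for granular checks; health is slimmed down to
-    # the single dimension mapped to the check, when one exists.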
- def _granular_payload(
- self,
- *,
- record: MCPRunRecord,
- check: str,
- items: Sequence[Mapping[str, object]],
- detail_level: DetailLevel,
- max_results: int,
- path: str | None,
- threshold_context: Mapping[str, object] | None = None,
- ) -> dict[str, object]:
- bounded_items = [dict(item) for item in items[: max(1, max_results)]]
- full_health = dict(self._as_mapping(record.summary.get("health")))
- dimensions = self._as_mapping(full_health.get("dimensions"))
- relevant_dimension = _CHECK_TO_DIMENSION.get(check)
- slim_dimensions = (
- {relevant_dimension: dimensions.get(relevant_dimension)}
- if relevant_dimension and relevant_dimension in dimensions
- else dict(dimensions)
- )
- payload: dict[str, object] = {
- "run_id": self._short_run_id(record.run_id),
- "check": check,
- "detail_level": detail_level,
- "path": path,
- "returned": len(bounded_items),
- "total": len(items),
- "health": {
- "score": full_health.get("score"),
- "grade": full_health.get("grade"),
- "dimensions": slim_dimensions,
- },
- "items": bounded_items,
- }
- if threshold_context:
- payload["threshold_context"] = dict(threshold_context)
- return payload
-
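-    # When a design check comes back empty, surface the effective threshold
-    # and the highest measured value below it so callers can see how close
-    # the code is to tripping the check.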
- def _design_threshold_context(
- self,
- *,
- record: MCPRunRecord,
- check: str,
- path: str | None,
- items: Sequence[Mapping[str, object]],
- requested_min: int | None = None,
- ) -> dict[str, object] | None:
- if items:
- return None
- spec = _DESIGN_CHECK_CONTEXT.get(check)
- if spec is None:
- return None
- category = str(spec["category"])
- metric = str(spec["metric"])
- operator = str(spec["operator"])
- normalized_path = self._normalize_relative_path(path or "")
- metrics = self._as_mapping(record.report_document.get("metrics"))
- families = self._as_mapping(metrics.get("families"))
- family = self._as_mapping(families.get(category))
- metric_items = [
- self._as_mapping(item)
- for item in self._as_sequence(family.get("items"))
- if not normalized_path
- or self._metric_item_matches_path(
- self._as_mapping(item),
- normalized_path,
- )
- ]
- if not metric_items:
- return None
- values = [_as_int(item.get(metric), 0) for item in metric_items]
- finding_threshold = self._design_finding_threshold(
- record=record,
- check=check,
- )
- threshold = finding_threshold
- threshold_kind = "finding_threshold"
- if requested_min is not None and requested_min > finding_threshold:
- threshold = requested_min
- threshold_kind = "requested_min"
- highest_below = self._highest_below_threshold(
- values=values,
- operator=operator,
- threshold=threshold,
- )
- payload: dict[str, object] = {
- "metric": metric,
- "threshold": threshold,
- "threshold_kind": threshold_kind,
- "measured_units": len(metric_items),
- }
- if threshold_kind != "finding_threshold":
- payload["finding_threshold"] = finding_threshold
- if highest_below is not None:
- payload["highest_below_threshold"] = highest_below
- return payload
-
- def _design_finding_threshold(
- self,
- *,
- record: MCPRunRecord,
- check: str,
- ) -> int:
- spec = _DESIGN_CHECK_CONTEXT[check]
- category = str(spec["category"])
- default_threshold = _as_int(spec["default_threshold"])
- findings = self._as_mapping(record.report_document.get("findings"))
- thresholds = self._as_mapping(
- self._as_mapping(findings.get("thresholds")).get("design_findings")
- )
- threshold_payload = self._as_mapping(thresholds.get(category))
- if threshold_payload:
- return _as_int(threshold_payload.get("value"), default_threshold)
- request_value = {
- "complexity": record.request.complexity_threshold,
- "coupling": record.request.coupling_threshold,
- "cohesion": record.request.cohesion_threshold,
- }.get(check)
- return _as_int(request_value, default_threshold)
-
- @staticmethod
- def _highest_below_threshold(
- *,
- values: Sequence[int],
- operator: str,
- threshold: int,
- ) -> int | None:
- if operator == ">":
- below = [value for value in values if value <= threshold]
- elif operator == ">=":
- below = [value for value in values if value < threshold]
- else:
- return None
- if not below:
- return None
- return max(below)
-
- @staticmethod
- def _normalized_source_kind(value: object) -> str:
- normalized = str(value).strip().lower()
- if normalized in SOURCE_KIND_ORDER:
- return normalized
- return SOURCE_KIND_OTHER
-
- def _finding_source_kind(self, finding: Mapping[str, object]) -> str:
- source_scope = self._as_mapping(finding.get("source_scope"))
- return self._normalized_source_kind(source_scope.get("dominant_kind"))
-
- def _source_kind_breakdown(
- self,
- source_kinds: Iterable[object],
- ) -> dict[str, int]:
- breakdown = dict.fromkeys(_SOURCE_KIND_BREAKDOWN_ORDER, 0)
- for value in source_kinds:
- breakdown[self._normalized_source_kind(value)] += 1
- return breakdown
-
- def _triage_suggestion_rows(self, record: MCPRunRecord) -> list[dict[str, object]]:
- derived = self._as_mapping(record.report_document.get("derived"))
- canonical_rows = self._dict_list(derived.get("suggestions"))
- suggestion_source_kinds = {
- self._suggestion_finding_id(suggestion): self._normalized_source_kind(
- getattr(suggestion, "source_kind", SOURCE_KIND_OTHER)
- )
- for suggestion in record.suggestions
- }
- rows: list[dict[str, object]] = []
- for row in canonical_rows:
- canonical_finding_id = str(row.get("finding_id", ""))
- action = self._as_mapping(row.get("action"))
- try:
- finding_id = self._short_finding_id(
- record,
- self._resolve_canonical_finding_id(record, canonical_finding_id),
- )
- except MCPFindingNotFoundError:
- finding_id = self._base_short_finding_id(canonical_finding_id)
- rows.append(
- {
- "id": f"suggestion:{finding_id}",
- "finding_id": finding_id,
- "title": str(row.get("title", "")),
- "summary": str(row.get("summary", "")),
- "effort": str(action.get("effort", "")),
- "steps": list(self._as_sequence(action.get("steps"))),
- "source_kind": suggestion_source_kinds.get(
- canonical_finding_id,
- SOURCE_KIND_OTHER,
- ),
- }
- )
- return rows
-
- def _derived_section_payload(self, record: MCPRunRecord) -> dict[str, object]:
- derived = self._as_mapping(record.report_document.get("derived"))
- if not derived:
- raise MCPServiceContractError(
- "Report section 'derived' is not available in this run."
- )
- suggestions = self._triage_suggestion_rows(record)
- canonical_to_short, _ = self._finding_id_maps(record)
- hotlists = self._as_mapping(derived.get("hotlists"))
- projected_hotlists: dict[str, list[str]] = {}
- for hotlist_key, hotlist_ids in hotlists.items():
- projected_hotlists[hotlist_key] = [
- canonical_to_short.get(
- str(finding_id),
- self._base_short_finding_id(str(finding_id)),
- )
- for finding_id in self._as_sequence(hotlist_ids)
- if str(finding_id)
- ]
- return {
- "suggestions": suggestions,
- "hotlists": projected_hotlists,
- }
-
- @staticmethod
- def _schema_resource_payload() -> dict[str, object]:
- return {
- "$schema": "https://json-schema.org/draft/2020-12/schema",
- "title": "CodeCloneCanonicalReport",
- "type": "object",
- "required": [
- "report_schema_version",
- "meta",
- "inventory",
- "findings",
- "derived",
- "integrity",
- ],
- "properties": {
- "report_schema_version": {
- "type": "string",
- "const": REPORT_SCHEMA_VERSION,
- },
- "meta": {"type": "object"},
- "inventory": {"type": "object"},
- "findings": {"type": "object"},
- "metrics": {"type": "object"},
- "derived": {"type": "object"},
- "integrity": {"type": "object"},
- },
- }
-
- def _validate_analysis_request(self, request: MCPAnalysisRequest) -> None:
- self._validate_choice(
- "analysis_mode",
- request.analysis_mode,
- _VALID_ANALYSIS_MODES,
- )
- self._validate_choice(
- "cache_policy",
- request.cache_policy,
- _VALID_CACHE_POLICIES,
- )
- if request.cache_policy == "refresh":
- raise MCPServiceContractError(
- "cache_policy='refresh' is not supported by the read-only "
- "CodeClone MCP server. Use 'reuse' or 'off'."
- )
- if request.analysis_mode == "clones_only" and request.coverage_xml is not None:
- raise MCPServiceContractError(
- "coverage_xml requires analysis_mode='full' because coverage join "
- "depends on metrics-enabled analysis."
- )
-
- @staticmethod
- def _validate_choice(
- name: str,
- value: str,
- allowed: Sequence[str] | frozenset[str],
- ) -> str:
- if value not in allowed:
- allowed_list = ", ".join(sorted(allowed))
- raise MCPServiceContractError(
- f"Invalid value for {name}: {value!r}. Expected one of: {allowed_list}."
- )
- return value
-
- def _validate_optional_choice(
- self,
- name: str,
- value: str | None,
- allowed: Sequence[str] | frozenset[str],
- ) -> str | None:
- if value is None:
- return None
- return self._validate_choice(name, value, allowed)
-
- @staticmethod
- def _resolve_root(root: str | None) -> Path:
- cleaned_root = "" if root is None else str(root).strip()
- if not cleaned_root:
- raise MCPServiceContractError(
- "MCP analysis requires an absolute repository root. "
- "Omitted or relative roots are unsafe because the MCP server "
- "working directory may not match the client workspace."
- )
- candidate = Path(cleaned_root).expanduser()
- if not candidate.is_absolute():
- raise MCPServiceContractError(
- f"MCP requires an absolute repository root; got relative root "
- f"{cleaned_root!r}. Relative roots like '.' are unsafe because "
- "the MCP server working directory may not match the client "
- "workspace."
- )
- try:
- root_path = candidate.resolve()
- except OSError as exc:
- raise MCPServiceContractError(
- f"Invalid root path '{cleaned_root}': {exc}"
- ) from exc
- if not root_path.exists():
- raise MCPServiceContractError(f"Root path does not exist: {root_path}")
- if not root_path.is_dir():
- raise MCPServiceContractError(f"Root path is not a directory: {root_path}")
- return root_path
-
- def _resolve_optional_root(self, root: str | None) -> Path | None:
- cleaned_root = "" if root is None else str(root).strip()
- if not cleaned_root:
- return None
- return self._resolve_root(cleaned_root)
-
- def _build_args(self, *, root_path: Path, request: MCPAnalysisRequest) -> Namespace:
- args = Namespace(
- root=str(root_path),
- min_loc=DEFAULT_MIN_LOC,
- min_stmt=DEFAULT_MIN_STMT,
- block_min_loc=DEFAULT_BLOCK_MIN_LOC,
- block_min_stmt=DEFAULT_BLOCK_MIN_STMT,
- segment_min_loc=DEFAULT_SEGMENT_MIN_LOC,
- segment_min_stmt=DEFAULT_SEGMENT_MIN_STMT,
- processes=None,
- cache_path=None,
- max_cache_size_mb=DEFAULT_MAX_CACHE_SIZE_MB,
- baseline=DEFAULT_BASELINE_PATH,
- max_baseline_size_mb=DEFAULT_MAX_BASELINE_SIZE_MB,
- update_baseline=False,
- fail_on_new=False,
- fail_threshold=-1,
- ci=False,
- fail_complexity=-1,
- fail_coupling=-1,
- fail_cohesion=-1,
- fail_cycles=False,
- fail_dead_code=False,
- fail_health=-1,
- fail_on_new_metrics=False,
- fail_on_typing_regression=False,
- fail_on_docstring_regression=False,
- fail_on_api_break=False,
- min_typing_coverage=-1,
- min_docstring_coverage=-1,
- api_surface=False,
- coverage_xml=None,
- fail_on_untested_hotspots=False,
- coverage_min=50,
- design_complexity_threshold=DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD,
- design_coupling_threshold=DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD,
- design_cohesion_threshold=DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD,
- update_metrics_baseline=False,
- metrics_baseline=DEFAULT_BASELINE_PATH,
- skip_metrics=False,
- skip_dead_code=False,
- skip_dependencies=False,
- golden_fixture_paths=(),
- html_out=None,
- json_out=None,
- md_out=None,
- sarif_out=None,
- text_out=None,
- no_progress=True,
- no_color=True,
- quiet=True,
- verbose=False,
- debug=False,
- open_html_report=False,
- timestamped_report_paths=False,
- )
- if request.respect_pyproject:
- try:
- config_values = load_pyproject_config(root_path)
- except ConfigValidationError as exc:
- raise MCPServiceContractError(str(exc)) from exc
- for key in sorted(_MCP_CONFIG_KEYS.intersection(config_values)):
- setattr(args, key, config_values[key])
-
- self._apply_request_overrides(args=args, root_path=root_path, request=request)
-
- if request.analysis_mode == "clones_only":
- args.skip_metrics = True
- args.skip_dead_code = True
- args.skip_dependencies = True
- else:
- args.skip_metrics = False
- args.skip_dead_code = False
- args.skip_dependencies = False
-
- if not validate_numeric_args(args):
- raise MCPServiceContractError(
- "Numeric analysis settings must be non-negative and thresholds "
- "must be >= -1. Coverage thresholds must be between 0 and 100."
- )
-
- return args
-
- def _apply_request_overrides(
- self,
- *,
- args: Namespace,
- root_path: Path,
- request: MCPAnalysisRequest,
- ) -> None:
- override_map: dict[str, object | None] = {
- "processes": request.processes,
- "min_loc": request.min_loc,
- "min_stmt": request.min_stmt,
- "block_min_loc": request.block_min_loc,
- "block_min_stmt": request.block_min_stmt,
- "segment_min_loc": request.segment_min_loc,
- "segment_min_stmt": request.segment_min_stmt,
- "api_surface": request.api_surface,
- "coverage_min": request.coverage_min,
- "max_baseline_size_mb": request.max_baseline_size_mb,
- "max_cache_size_mb": request.max_cache_size_mb,
- "design_complexity_threshold": request.complexity_threshold,
- "design_coupling_threshold": request.coupling_threshold,
- "design_cohesion_threshold": request.cohesion_threshold,
- }
- for key, value in override_map.items():
- if value is not None:
- setattr(args, key, value)
-
- if request.baseline_path is not None:
- args.baseline = str(
- self._resolve_optional_path(request.baseline_path, root_path)
- )
- if request.metrics_baseline_path is not None:
- args.metrics_baseline = str(
- self._resolve_optional_path(request.metrics_baseline_path, root_path)
- )
- if request.cache_path is not None:
- args.cache_path = str(
- self._resolve_optional_path(request.cache_path, root_path)
- )
- if request.coverage_xml is not None:
- args.coverage_xml = str(
- self._resolve_optional_path(request.coverage_xml, root_path)
- )
-
- @staticmethod
- def _resolve_optional_path(value: str, root_path: Path) -> Path:
- candidate = Path(value).expanduser()
- resolved = candidate if candidate.is_absolute() else root_path / candidate
- try:
- return resolved.resolve()
- except OSError as exc:
- raise MCPServiceContractError(
- f"Invalid path '{value}' relative to '{root_path}': {exc}"
- ) from exc
-
- def _resolve_baseline_inputs(
- self,
- *,
- root_path: Path,
- args: Namespace,
- ) -> tuple[Path, bool, Path, bool, dict[str, object] | None]:
- baseline_path = self._resolve_optional_path(str(args.baseline), root_path)
- baseline_exists = baseline_path.exists()
-
- metrics_baseline_arg_path = self._resolve_optional_path(
- str(args.metrics_baseline),
- root_path,
- )
- shared_baseline_payload: dict[str, object] | None = None
- if metrics_baseline_arg_path == baseline_path:
- probe = probe_metrics_baseline_section(metrics_baseline_arg_path)
- metrics_baseline_exists = probe.has_metrics_section
- shared_baseline_payload = probe.payload
- else:
- metrics_baseline_exists = metrics_baseline_arg_path.exists()
-
- return (
- baseline_path,
- baseline_exists,
- metrics_baseline_arg_path,
- metrics_baseline_exists,
- shared_baseline_payload,
- )
-
- @staticmethod
- def _resolve_cache_path(*, root_path: Path, args: Namespace) -> Path:
- return resolve_cache_path(
- root_path=root_path,
- args=args,
- from_args=bool(args.cache_path),
- legacy_cache_path=_LEGACY_CACHE_PATH,
- console=_BufferConsole(),
- )
-
- @staticmethod
- def _build_cache(
- *,
- root_path: Path,
- args: Namespace,
- cache_path: Path,
- policy: CachePolicy,
- ) -> Cache:
- cache = Cache(
- cache_path,
- root=root_path,
- max_size_bytes=_as_int(args.max_cache_size_mb, 0) * 1024 * 1024,
- min_loc=_as_int(args.min_loc, DEFAULT_MIN_LOC),
- min_stmt=_as_int(args.min_stmt, DEFAULT_MIN_STMT),
- block_min_loc=_as_int(args.block_min_loc, DEFAULT_BLOCK_MIN_LOC),
- block_min_stmt=_as_int(args.block_min_stmt, DEFAULT_BLOCK_MIN_STMT),
- segment_min_loc=_as_int(args.segment_min_loc, DEFAULT_SEGMENT_MIN_LOC),
- segment_min_stmt=_as_int(
- args.segment_min_stmt,
- DEFAULT_SEGMENT_MIN_STMT,
- ),
- collect_api_surface=bool(getattr(args, "api_surface", False)),
- )
- if policy != "off":
- cache.load()
- return cache
-
- @staticmethod
- def _metrics_computed(analysis_mode: AnalysisMode) -> tuple[str, ...]:
- return (
- ()
- if analysis_mode == "clones_only"
- else (
- "complexity",
- "coupling",
- "cohesion",
- "health",
- "dependencies",
- "dead_code",
- )
- )
-
- @staticmethod
- def _load_report_document(report_json: str) -> dict[str, object]:
- return _load_report_document_payload(report_json)
-
- def _report_digest(self, report_document: Mapping[str, object]) -> str:
- integrity = self._as_mapping(report_document.get("integrity"))
- digest = self._as_mapping(integrity.get("digest"))
- value = digest.get("value")
- if not isinstance(value, str) or not value:
- raise MCPServiceError("Canonical report digest is missing.")
- return value
-
- def _build_run_summary_payload(
- self,
- *,
- run_id: str,
- root_path: Path,
- request: MCPAnalysisRequest,
- report_document: Mapping[str, object],
- baseline_state: CloneBaselineState,
- metrics_baseline_state: MetricsBaselineState,
- cache_status: CacheStatus,
- new_func: Sequence[str] | set[str],
- new_block: Sequence[str] | set[str],
- metrics_diff: MetricsDiff | None,
- warnings: Sequence[str],
- failures: Sequence[str],
- ) -> dict[str, object]:
- meta = self._as_mapping(report_document.get("meta"))
- meta_baseline = self._as_mapping(meta.get("baseline"))
- meta_metrics_baseline = self._as_mapping(meta.get("metrics_baseline"))
- meta_cache = self._as_mapping(meta.get("cache"))
- inventory = self._as_mapping(report_document.get("inventory"))
- findings = self._as_mapping(report_document.get("findings"))
- metrics = self._as_mapping(report_document.get("metrics"))
- metrics_summary = self._as_mapping(metrics.get("summary"))
- summary = self._as_mapping(findings.get("summary"))
- analysis_profile = self._summary_analysis_profile_payload(meta)
- payload = {
- "run_id": run_id,
- "root": str(root_path),
- "analysis_mode": request.analysis_mode,
- "codeclone_version": meta.get("codeclone_version", __version__),
- "python_tag": str(meta.get("python_tag", "")),
- "report_schema_version": report_document.get(
- "report_schema_version",
- REPORT_SCHEMA_VERSION,
- ),
- "baseline": {
- "path": meta_baseline.get(
- "path",
- str(root_path / DEFAULT_BASELINE_PATH),
- ),
- "loaded": bool(meta_baseline.get("loaded", baseline_state.loaded)),
- "status": str(meta_baseline.get("status", baseline_state.status.value)),
- "trusted_for_diff": baseline_state.trusted_for_diff,
- "python_tag": meta_baseline.get("python_tag"),
- },
- "metrics_baseline": {
- "path": meta_metrics_baseline.get(
- "path",
- str(root_path / DEFAULT_BASELINE_PATH),
- ),
- "loaded": bool(
- meta_metrics_baseline.get(
- "loaded",
- metrics_baseline_state.loaded,
- )
- ),
- "status": str(
- meta_metrics_baseline.get(
- "status",
- metrics_baseline_state.status.value,
- )
- ),
- "trusted_for_diff": metrics_baseline_state.trusted_for_diff,
- },
- "cache": {
- "path": meta_cache.get("path"),
- "status": str(meta_cache.get("status", cache_status.value)),
- "used": bool(meta_cache.get("used", False)),
- "schema_version": meta_cache.get("schema_version"),
- },
- "inventory": dict(inventory),
- "findings_summary": dict(summary),
- "health": dict(self._as_mapping(metrics_summary.get("health"))),
- "baseline_diff": {
- "new_function_clone_groups": len(new_func),
- "new_block_clone_groups": len(new_block),
- "new_clone_groups_total": len(new_func) + len(new_block),
- },
- "metrics_diff": self._metrics_diff_payload(metrics_diff),
- "warnings": list(warnings),
- "failures": list(failures),
- }
- if analysis_profile:
- payload["analysis_profile"] = analysis_profile
- payload["cache"] = self._summary_cache_payload(payload)
- payload["health"] = self._summary_health_payload(payload)
- return payload
-
- def _summary_payload(
- self,
- summary: Mapping[str, object],
- *,
- record: MCPRunRecord | None = None,
- ) -> dict[str, object]:
- inventory = self._as_mapping(summary.get("inventory"))
- if (
- not summary.get("run_id")
- and not record
- and "inventory" in summary
- and not summary.get("baseline")
- ):
- return {
- "focus": _FOCUS_REPOSITORY,
- "health_scope": _HEALTH_SCOPE_REPOSITORY,
- "inventory": self._summary_inventory_payload(inventory),
- "health": self._summary_health_payload(summary),
- }
- resolved_run_id = (
- record.run_id if record is not None else str(summary.get("run_id", ""))
- )
- payload: dict[str, object] = {
- "run_id": self._short_run_id(resolved_run_id) if resolved_run_id else "",
- "focus": _FOCUS_REPOSITORY,
- "health_scope": _HEALTH_SCOPE_REPOSITORY,
- "version": str(summary.get("codeclone_version", __version__)),
- "schema": str(summary.get("report_schema_version", REPORT_SCHEMA_VERSION)),
- "mode": str(summary.get("analysis_mode", "")),
- "baseline": self._summary_baseline_payload(summary),
- "metrics_baseline": self._summary_metrics_baseline_payload(summary),
- "cache": self._summary_cache_payload(summary),
- "inventory": self._summary_inventory_payload(inventory),
- "health": self._summary_health_payload(summary),
- "findings": self._summary_findings_payload(summary, record=record),
- "diff": self._summary_diff_payload(summary),
- "warnings": list(self._as_sequence(summary.get("warnings"))),
- "failures": list(self._as_sequence(summary.get("failures"))),
- }
- analysis_profile = self._summary_analysis_profile_payload(summary)
- if analysis_profile:
- payload["analysis_profile"] = analysis_profile
- if record is not None:
- coverage_join = self._summary_coverage_join_payload(record)
- if coverage_join:
- payload["coverage_join"] = coverage_join
- return payload
-
- def _summary_analysis_profile_payload(
- self,
- summary: Mapping[str, object],
- ) -> dict[str, int]:
- analysis_profile = self._as_mapping(summary.get("analysis_profile"))
- if not analysis_profile:
- return {}
- keys = (
- "min_loc",
- "min_stmt",
- "block_min_loc",
- "block_min_stmt",
- "segment_min_loc",
- "segment_min_stmt",
- )
- payload = {key: _as_int(analysis_profile.get(key), -1) for key in keys}
- return {key: value for key, value in payload.items() if value >= 0}
-
- def _summary_baseline_payload(
- self,
- summary: Mapping[str, object],
- ) -> dict[str, object]:
- return self._summary_trusted_state_payload(summary, key="baseline")
-
- def _summary_metrics_baseline_payload(
- self,
- summary: Mapping[str, object],
- ) -> dict[str, object]:
- return self._summary_trusted_state_payload(summary, key="metrics_baseline")
-
- def _summary_trusted_state_payload(
- self,
- summary: Mapping[str, object],
- *,
- key: str,
- ) -> dict[str, object]:
- baseline = self._as_mapping(summary.get(key))
- trusted = bool(baseline.get("trusted_for_diff", False))
- payload: dict[str, object] = {
- "loaded": bool(baseline.get("loaded", False)),
- "status": str(baseline.get("status", "")),
- "trusted": trusted,
- }
- if key == "baseline":
- payload["compared_without_valid_baseline"] = not trusted
- baseline_python_tag = baseline.get("python_tag")
- runtime_python_tag = summary.get("python_tag")
- if isinstance(baseline_python_tag, str) and baseline_python_tag.strip():
- payload["baseline_python_tag"] = baseline_python_tag
- if isinstance(runtime_python_tag, str) and runtime_python_tag.strip():
- payload["runtime_python_tag"] = runtime_python_tag
- return payload
-
- def _summary_cache_payload(
- self,
- summary: Mapping[str, object],
- ) -> dict[str, object]:
- cache = dict(self._as_mapping(summary.get("cache")))
- if not cache:
- return {}
- return {
- "used": bool(cache.get("used", False)),
- "freshness": self._effective_freshness(summary),
- }
-
- def _effective_freshness(
- self,
- summary: Mapping[str, object],
- ) -> FreshnessKind:
- inventory = self._as_mapping(summary.get("inventory"))
- files = self._as_mapping(inventory.get("files"))
- analyzed = max(0, _as_int(files.get("analyzed", 0), 0))
- cached = max(0, _as_int(files.get("cached", 0), 0))
- cache = self._as_mapping(summary.get("cache"))
- cache_used = bool(cache.get("used"))
- if cache_used and cached > 0 and analyzed == 0:
- return "reused"
- if cache_used and cached > 0 and analyzed > 0:
- return "mixed"
- return "fresh"
-
- def _summary_inventory_payload(
- self,
- inventory: Mapping[str, object],
- ) -> dict[str, object]:
- if not inventory:
- return {}
- files = self._as_mapping(inventory.get("files"))
- code = self._as_mapping(inventory.get("code"))
- total_files = _as_int(
- files.get(
- "total_found",
- files.get(
- "analyzed",
- len(
- self._as_sequence(
- self._as_mapping(inventory.get("file_registry")).get(
- "items"
- )
- )
- ),
- ),
- ),
- 0,
- )
- functions = _as_int(code.get("functions", 0), 0) + _as_int(
- code.get("methods", 0),
- 0,
- )
- return {
- "files": total_files,
- "lines": _as_int(code.get("parsed_lines", 0), 0),
- "functions": functions,
- "classes": _as_int(code.get("classes", 0), 0),
- }
-
- def _summary_findings_payload(
- self,
- summary: Mapping[str, object],
- *,
- record: MCPRunRecord | None,
- ) -> dict[str, object]:
- findings_summary = self._as_mapping(summary.get("findings_summary"))
- if record is None:
- return {
- "total": _as_int(findings_summary.get("total", 0), 0),
- "new": 0,
- "known": 0,
- "by_family": {},
- "production": 0,
- "new_by_source_kind": self._source_kind_breakdown(()),
- }
- findings = self._base_findings(record)
- by_family: dict[str, int] = {
- "clones": 0,
- "structural": 0,
- "dead_code": 0,
- "design": 0,
- }
- new_count = 0
- known_count = 0
- production_count = 0
- new_by_source_kind = self._source_kind_breakdown(
- self._finding_source_kind(finding)
- for finding in findings
- if str(finding.get("novelty", "")).strip() == "new"
- )
- for finding in findings:
- family = str(finding.get("family", "")).strip()
- family_key = "clones" if family == FAMILY_CLONE else family
- if family_key in by_family:
- by_family[family_key] += 1
- if str(finding.get("novelty", "")).strip() == "new":
- new_count += 1
- else:
- known_count += 1
- if self._finding_source_kind(finding) == SOURCE_KIND_PRODUCTION:
- production_count += 1
- return {
- "total": len(findings),
- "new": new_count,
- "known": known_count,
- "by_family": {key: value for key, value in by_family.items() if value > 0},
- "production": production_count,
- "new_by_source_kind": new_by_source_kind,
- }
-
- def _summary_diff_payload(
- self,
- summary: Mapping[str, object],
- ) -> dict[str, object]:
- baseline_diff = self._as_mapping(summary.get("baseline_diff"))
- metrics_diff = self._as_mapping(summary.get("metrics_diff"))
- return {
- "new_clones": _as_int(baseline_diff.get("new_clone_groups_total", 0), 0),
- "health_delta": (
- _as_int(metrics_diff.get("health_delta", 0), 0)
- if metrics_diff
- and self._summary_health_payload(summary).get("available") is not False
- else None
- ),
- "typing_param_permille_delta": _as_int(
- metrics_diff.get("typing_param_permille_delta", 0),
- 0,
- ),
- "typing_return_permille_delta": _as_int(
- metrics_diff.get("typing_return_permille_delta", 0),
- 0,
- ),
- "docstring_permille_delta": _as_int(
- metrics_diff.get("docstring_permille_delta", 0),
- 0,
- ),
- "api_breaking_changes": _as_int(
- metrics_diff.get("api_breaking_changes", 0),
- 0,
- ),
- "new_api_symbols": _as_int(
- metrics_diff.get("new_api_symbols", 0),
- 0,
- ),
- }
-
- def _summary_coverage_join_payload(
- self,
- record: MCPRunRecord,
- ) -> dict[str, object]:
- metrics = self._as_mapping(record.report_document.get("metrics"))
- families = self._as_mapping(metrics.get("families"))
- coverage_join = self._as_mapping(families.get("coverage_join"))
- summary = self._as_mapping(coverage_join.get("summary"))
- if not summary:
- return {}
- payload: dict[str, object] = {
- "status": str(summary.get("status", "")).strip(),
- "overall_permille": _as_int(summary.get("overall_permille", 0), 0),
- "coverage_hotspots": _as_int(summary.get("coverage_hotspots", 0), 0),
- "scope_gap_hotspots": _as_int(summary.get("scope_gap_hotspots", 0), 0),
- "hotspot_threshold_percent": _as_int(
- summary.get("hotspot_threshold_percent", 0),
- 0,
- ),
- }
- source_value = summary.get("source")
- source = source_value.strip() if isinstance(source_value, str) else ""
- if source:
- payload["source"] = source
- invalid_reason_value = summary.get("invalid_reason")
- invalid_reason = (
- invalid_reason_value.strip()
- if isinstance(invalid_reason_value, str)
- else ""
- )
- if invalid_reason:
- payload["invalid_reason"] = invalid_reason
- return payload
-
- def _metrics_detail_payload(
- self,
- *,
- metrics: Mapping[str, object],
- family: MetricsDetailFamily | None,
- path: str | None,
- offset: int,
- limit: int,
- ) -> dict[str, object]:
- summary = dict(self._as_mapping(metrics.get("summary")))
- families = self._as_mapping(metrics.get("families"))
- normalized_path = self._normalize_relative_path(path or "")
- if family is None and not normalized_path:
- return {
- "summary": summary,
- "_hint": "Use family and/or path parameters to access per-item detail.",
- }
- normalized_offset = max(0, offset)
- normalized_limit = max(1, min(limit, 200))
- family_names: Sequence[str] = (
- (family,) if family is not None else tuple(sorted(families))
- )
- items: list[dict[str, object]] = []
- for family_name in family_names:
- family_payload = self._as_mapping(families.get(family_name))
- for item in self._as_sequence(family_payload.get("items")):
- item_map = self._as_mapping(item)
- if normalized_path and not self._metric_item_matches_path(
- item_map,
- normalized_path,
- ):
- continue
- compact_item = self._compact_metrics_item(item_map)
- if family is None:
- compact_item = {"family": family_name, **compact_item}
- items.append(compact_item)
- if family is None:
- items.sort(
- key=lambda item: (
- str(item.get("family", "")),
- str(item.get("path", "")),
- str(item.get("qualname", "")),
- _as_int(item.get("start_line", 0), 0),
- )
- )
- page = items[normalized_offset : normalized_offset + normalized_limit]
- return {
- "family": family,
- "path": normalized_path or None,
- "offset": normalized_offset,
- "limit": normalized_limit,
- "returned": len(page),
- "total": len(items),
- "has_more": normalized_offset + len(page) < len(items),
- "items": page,
- }
-
- def _metric_item_matches_path(
- self,
- item: Mapping[str, object],
- normalized_path: str,
- ) -> bool:
- path_value = (
- str(item.get("relative_path", "")).strip()
- or str(item.get("path", "")).strip()
- or str(item.get("filepath", "")).strip()
- or str(item.get("file", "")).strip()
- )
- if not path_value:
- return False
- return self._path_matches(path_value, (normalized_path,))
-
- @staticmethod
- def _compact_metrics_item(
- item: Mapping[str, object],
- ) -> dict[str, object]:
- compact: dict[str, object] = {}
- path_value = (
- str(item.get("relative_path", "")).strip()
- or str(item.get("path", "")).strip()
- or str(item.get("filepath", "")).strip()
- or str(item.get("file", "")).strip()
- )
- if path_value:
- compact["path"] = path_value
- for key, value in item.items():
- if (
- key not in _COMPACT_ITEM_PATH_KEYS
- and value not in _COMPACT_ITEM_EMPTY_VALUES
- ):
- compact[str(key)] = value
- return compact
-
- @staticmethod
- def _metrics_diff_payload(
- metrics_diff: MetricsDiff | None,
- ) -> dict[str, object] | None:
- if metrics_diff is None:
- return None
- new_high_risk_functions = tuple(
- cast(Sequence[str], getattr(metrics_diff, "new_high_risk_functions", ()))
- )
- new_high_coupling_classes = tuple(
- cast(Sequence[str], getattr(metrics_diff, "new_high_coupling_classes", ()))
- )
- new_cycles = tuple(
- cast(Sequence[object], getattr(metrics_diff, "new_cycles", ()))
- )
- new_dead_code = tuple(
- cast(Sequence[str], getattr(metrics_diff, "new_dead_code", ()))
- )
- health_delta = getattr(metrics_diff, "health_delta", 0)
- return {
- "new_high_risk_functions": len(new_high_risk_functions),
- "new_high_coupling_classes": len(new_high_coupling_classes),
- "new_cycles": len(new_cycles),
- "new_dead_code": len(new_dead_code),
- "health_delta": _as_int(health_delta, 0),
- "typing_param_permille_delta": _as_int(
- getattr(metrics_diff, "typing_param_permille_delta", 0),
- 0,
- ),
- "typing_return_permille_delta": _as_int(
- getattr(metrics_diff, "typing_return_permille_delta", 0),
- 0,
- ),
- "docstring_permille_delta": _as_int(
- getattr(metrics_diff, "docstring_permille_delta", 0),
- 0,
- ),
- "api_breaking_changes": len(
- tuple(
- cast(
- Sequence[object],
- getattr(metrics_diff, "new_api_breaking_changes", ()),
- )
- )
- ),
- "new_api_symbols": len(tuple(getattr(metrics_diff, "new_api_symbols", ()))),
- }
-
- def _dict_list(self, value: object) -> list[dict[str, object]]:
- return [dict(self._as_mapping(item)) for item in self._as_sequence(value)]
-
- @staticmethod
- def _as_mapping(value: object) -> Mapping[str, object]:
- return value if isinstance(value, Mapping) else {}
-
- @staticmethod
- def _as_sequence(value: object) -> Sequence[object]:
- if isinstance(value, Sequence) and not isinstance(
- value,
- (str, bytes, bytearray),
- ):
- return value
- return ()
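
The removed service narrows untyped report payloads with _as_mapping and _as_sequence instead of trusting their shape: unknown values collapse to safe empty defaults rather than raising. A minimal standalone sketch of that defensive-coercion pattern, lifted from the helpers above:

    from collections.abc import Mapping, Sequence

    def as_mapping(value: object) -> Mapping[str, object]:
        # Anything that is not a mapping becomes an empty dict.
        return value if isinstance(value, Mapping) else {}

    def as_sequence(value: object) -> Sequence[object]:
        # Strings and byte strings are deliberately not treated as sequences.
        if isinstance(value, Sequence) and not isinstance(value, (str, bytes, bytearray)):
            return value
        return ()

    report: dict[str, object] = {"integrity": {"digest": {"value": "abc"}}}
    digest = as_mapping(as_mapping(report.get("integrity")).get("digest"))
    assert digest.get("value") == "abc"
    assert as_sequence("not-a-list") == ()
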
diff --git a/codeclone/meta_markers.py b/codeclone/meta_markers/__init__.py
similarity index 100%
rename from codeclone/meta_markers.py
rename to codeclone/meta_markers/__init__.py
diff --git a/codeclone/metrics/__init__.py b/codeclone/metrics/__init__.py
index 0551b7d..9135843 100644
--- a/codeclone/metrics/__init__.py
+++ b/codeclone/metrics/__init__.py
@@ -3,42 +3,3 @@
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
# SPDX-License-Identifier: MPL-2.0
# Copyright (c) 2026 Den Rozhnovskiy
-
-from __future__ import annotations
-
-from .cohesion import cohesion_risk, compute_lcom4
-from .complexity import cyclomatic_complexity, nesting_depth, risk_level
-from .coupling import compute_cbo, coupling_risk
-from .coverage_join import CoverageJoinParseError, build_coverage_join
-from .dead_code import find_suppressed_unused, find_unused
-from .dependencies import (
- build_dep_graph,
- build_import_graph,
- find_cycles,
- longest_chains,
- max_depth,
-)
-from .health import HealthInputs, compute_health
-from .overloaded_modules import build_overloaded_modules_payload
-
-__all__ = [
- "CoverageJoinParseError",
- "HealthInputs",
- "build_coverage_join",
- "build_dep_graph",
- "build_import_graph",
- "build_overloaded_modules_payload",
- "cohesion_risk",
- "compute_cbo",
- "compute_health",
- "compute_lcom4",
- "coupling_risk",
- "cyclomatic_complexity",
- "find_cycles",
- "find_suppressed_unused",
- "find_unused",
- "longest_chains",
- "max_depth",
- "nesting_depth",
- "risk_level",
-]
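
With the re-exports removed, codeclone/metrics/__init__.py keeps only the license header, so callers presumably import from the concrete submodules instead of the package root. A sketch of the import change for a hypothetical call site (not shown in this patch):

    # before: from codeclone.metrics import HealthInputs, compute_health
    from codeclone.metrics.health import HealthInputs, compute_health
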
diff --git a/codeclone/metrics/_base.py b/codeclone/metrics/_base.py
new file mode 100644
index 0000000..e34da0e
--- /dev/null
+++ b/codeclone/metrics/_base.py
@@ -0,0 +1,64 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+ from collections.abc import Callable
+
+ from ..models import (
+ ClassMetrics,
+ DeadCandidate,
+ GroupItemLike,
+ ModuleApiSurface,
+ ModuleDep,
+ ModuleDocstringCoverage,
+ ModuleTypingCoverage,
+ SecuritySurface,
+ )
+
+MetricResult = dict[str, object]
+
+
+@dataclass(frozen=True, slots=True)
+class MetricAggregate:
+ project_fields: dict[str, object]
+ artifacts: dict[str, object] = field(default_factory=dict)
+
+
+@dataclass(slots=True)
+class MetricProjectContext:
+ units: tuple[GroupItemLike, ...]
+ class_metrics: tuple[ClassMetrics, ...]
+ module_deps: tuple[ModuleDep, ...]
+ dead_candidates: tuple[DeadCandidate, ...]
+ referenced_names: frozenset[str]
+ referenced_qualnames: frozenset[str]
+ security_surfaces: tuple[SecuritySurface, ...] = ()
+ typing_modules: tuple[ModuleTypingCoverage, ...] = ()
+ docstring_modules: tuple[ModuleDocstringCoverage, ...] = ()
+ api_modules: tuple[ModuleApiSurface, ...] = ()
+ files_found: int = 0
+ files_analyzed_or_cached: int = 0
+ function_clone_groups: int = 0
+ block_clone_groups: int = 0
+ skip_dependencies: bool = False
+ skip_dead_code: bool = False
+ memo: dict[str, MetricResult] = field(default_factory=dict)
+
+
+@dataclass(frozen=True, slots=True)
+class MetricFamily:
+ name: str
+ compute: Callable[[MetricProjectContext], MetricResult]
+ aggregate: Callable[[list[MetricResult]], MetricAggregate]
+ report_section: str
+ baseline_key: str | None
+ gate_keys: tuple[str, ...]
+ skippable_flag: str | None
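
MetricFamily pairs a per-project compute step with an aggregate step, and MetricProjectContext carries the shared inputs plus a memo for cross-family reuse. A toy sketch of how a family could be declared and run, assuming the dataclasses above are importable; the "unit_count" family and its report_section value are illustrative, not part of this patch:

    from codeclone.metrics._base import (
        MetricAggregate,
        MetricFamily,
        MetricProjectContext,
        MetricResult,
    )

    def _compute_unit_count(context: MetricProjectContext) -> MetricResult:
        return {"unit_count": len(context.units)}

    def _aggregate_unit_count(results: list[MetricResult]) -> MetricAggregate:
        first = results[0] if results else {}
        return MetricAggregate(project_fields={"unit_count": first.get("unit_count", 0)})

    toy_family = MetricFamily(
        name="unit_count",  # hypothetical family
        compute=_compute_unit_count,
        aggregate=_aggregate_unit_count,
        report_section="inventory",
        baseline_key=None,
        gate_keys=(),
        skippable_flag=None,
    )

    context = MetricProjectContext(
        units=(),
        class_metrics=(),
        module_deps=(),
        dead_candidates=(),
        referenced_names=frozenset(),
        referenced_qualnames=frozenset(),
    )
    aggregate = toy_family.aggregate([toy_family.compute(context)])
    assert aggregate.project_fields == {"unit_count": 0}
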
diff --git a/codeclone/metrics/complexity.py b/codeclone/metrics/complexity.py
index fa98f9d..97808e3 100644
--- a/codeclone/metrics/complexity.py
+++ b/codeclone/metrics/complexity.py
@@ -15,7 +15,7 @@
if TYPE_CHECKING:
from collections.abc import Iterable
- from ..cfg_model import CFG
+ from ..analysis.cfg_model import CFG
ControlNode = (
ast.If
diff --git a/codeclone/metrics/coverage_join.py b/codeclone/metrics/coverage_join.py
index 08c8278..386b16b 100644
--- a/codeclone/metrics/coverage_join.py
+++ b/codeclone/metrics/coverage_join.py
@@ -10,11 +10,11 @@
from collections.abc import Sequence
from dataclasses import dataclass
from pathlib import Path
-from typing import Literal, cast
+from typing import Literal
from xml.etree import ElementTree
-from .._coerce import as_int, as_str
from ..models import CoverageJoinResult, GroupItemLike, UnitCoverageFact
+from ..utils.coerce import as_int, as_str
__all__ = [
"CoverageJoinParseError",
@@ -221,8 +221,10 @@ def _resolve_unit_path(filepath: str) -> str:
def _risk_level(value: object) -> _Risk:
risk = as_str(value, "low")
- if risk in {"low", "medium", "high"}:
- return cast(_Risk, risk)
+ if risk == "medium":
+ return "medium"
+ if risk == "high":
+ return "high"
return "low"
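
Dropping the cast works because returning each literal explicitly lets the type checker narrow every branch to a member of the _Risk Literal, whereas a set-membership test does not narrow str to Literal. A standalone sketch of the same shape:

    from typing import Literal

    Risk = Literal["low", "medium", "high"]

    def risk_level(value: str) -> Risk:
        if value == "medium":
            return "medium"
        if value == "high":
            return "high"
        return "low"  # default branch, mirroring the patched _risk_level

    assert risk_level("high") == "high"
    assert risk_level("unknown") == "low"
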
diff --git a/codeclone/metrics/dead_code.py b/codeclone/metrics/dead_code.py
index b6306d9..599dc1c 100644
--- a/codeclone/metrics/dead_code.py
+++ b/codeclone/metrics/dead_code.py
@@ -9,11 +9,11 @@
from dataclasses import replace
from typing import Literal
+from ..analysis.suppressions import DEAD_CODE_RULE_ID
from ..domain.findings import SYMBOL_KIND_FUNCTION, SYMBOL_KIND_METHOD
from ..domain.quality import CONFIDENCE_HIGH, CONFIDENCE_MEDIUM
from ..models import DeadCandidate, DeadItem
from ..paths import is_test_filepath
-from ..suppressions import DEAD_CODE_RULE_ID
_TEST_NAME_PREFIXES = ("test_", "pytest_")
_DYNAMIC_METHOD_PREFIXES = ("visit_",)
diff --git a/codeclone/metrics/dependencies.py b/codeclone/metrics/dependencies.py
index 48ba032..573cc9e 100644
--- a/codeclone/metrics/dependencies.py
+++ b/codeclone/metrics/dependencies.py
@@ -6,6 +6,7 @@
from __future__ import annotations
+from math import ceil
from typing import TYPE_CHECKING
from ..models import DepGraph, ModuleDep
@@ -16,6 +17,37 @@
DepAdjacency = dict[str, set[str]]
+def _internal_roots(
+ modules: Iterable[str],
+ deps: Sequence[ModuleDep],
+) -> frozenset[str]:
+ roots: set[str] = set()
+ for module_name in modules:
+ if module_name:
+ roots.add(module_name.split(".", 1)[0])
+ for dep in deps:
+ if dep.source:
+ roots.add(dep.source.split(".", 1)[0])
+ return frozenset(sorted(roots))
+
+
+def _is_internal_target(target: str, *, internal_roots: frozenset[str]) -> bool:
+ if not target:
+ return False
+ return target.split(".", 1)[0] in internal_roots
+
+
+def _unique_sorted_edges(deps: Sequence[ModuleDep]) -> tuple[ModuleDep, ...]:
+ return tuple(
+ sorted(
+ {
+ (dep.source, dep.target, dep.import_type, dep.line): dep for dep in deps
+ }.values(),
+ key=lambda dep: (dep.source, dep.target, dep.import_type, dep.line),
+ )
+ )
+
+
def build_import_graph(
*,
modules: Iterable[str],
@@ -123,6 +155,23 @@ def max_depth(graph: DepAdjacency) -> int:
return best
+def depth_profile(graph: DepAdjacency) -> tuple[float, int]:
+ if not graph:
+ return 0.0, 0
+
+ memo: dict[str, int] = {}
+ depths = sorted(
+ _longest_path_from(node, graph=graph, visiting=set(), memo=memo)
+ for node in sorted(graph)
+ )
+ if not depths:
+ return 0.0, 0
+
+ avg_depth = sum(depths) / len(depths)
+ percentile_index = max(0, ceil(len(depths) * 0.95) - 1)
+ return avg_depth, int(depths[percentile_index])
+
+
def _longest_path_nodes_from(
node: str,
*,
@@ -180,22 +229,44 @@ def longest_chains(
def build_dep_graph(*, modules: Iterable[str], deps: Sequence[ModuleDep]) -> DepGraph:
- graph = build_import_graph(modules=modules, deps=deps)
- cycles = find_cycles(graph)
- depth = max_depth(graph)
- chains = longest_chains(graph)
- unique_edges = tuple(
+ base_modules = frozenset(
sorted(
{
- (dep.source, dep.target, dep.import_type, dep.line): dep for dep in deps
- }.values(),
- key=lambda dep: (dep.source, dep.target, dep.import_type, dep.line),
+ str(module_name).strip()
+ for module_name in modules
+ if str(module_name).strip()
+ }
)
)
+ internal_roots = _internal_roots(base_modules, deps)
+ internal_edges = _unique_sorted_edges(
+ tuple(
+ dep
+ for dep in deps
+ if dep.source
+ and _is_internal_target(dep.target, internal_roots=internal_roots)
+ )
+ )
+ graph_modules = frozenset(
+ sorted(
+ {
+ *base_modules,
+ *(dep.source for dep in internal_edges if dep.source),
+ *(dep.target for dep in internal_edges if dep.target),
+ }
+ )
+ )
+ graph = build_import_graph(modules=graph_modules, deps=internal_edges)
+ cycles = find_cycles(graph)
+ depth = max_depth(graph)
+ avg_depth, p95_depth = depth_profile(graph)
+ chains = longest_chains(graph)
return DepGraph(
modules=frozenset(graph.keys()),
- edges=unique_edges,
+ edges=internal_edges,
cycles=cycles,
max_depth=depth,
+ avg_depth=avg_depth,
+ p95_depth=p95_depth,
longest_chains=chains,
)
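
depth_profile reports the mean longest-path depth and an index-based 95th percentile, and build_dep_graph now filters edges to internal targets before measuring, so third-party imports no longer inflate the graph. A worked sketch of the percentile arithmetic for a three-module chain a -> b -> c, where the longest paths are {a: 2, b: 1, c: 0}:

    from math import ceil

    depths = sorted([2, 1, 0])                        # [0, 1, 2]
    avg_depth = sum(depths) / len(depths)             # 1.0
    p95_index = max(0, ceil(len(depths) * 0.95) - 1)  # ceil(2.85) - 1 == 2
    p95_depth = depths[p95_index]                     # 2, equal to max_depth here
    assert (avg_depth, p95_depth) == (1.0, 2)

    # Internal filtering: with modules {"pkg.a", "pkg.b"} the internal roots are
    # {"pkg"}, so an edge pkg.a -> os is dropped while pkg.a -> pkg.b is kept.
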
diff --git a/codeclone/metrics/health.py b/codeclone/metrics/health.py
index 9f0ab67..354bb5a 100644
--- a/codeclone/metrics/health.py
+++ b/codeclone/metrics/health.py
@@ -7,9 +7,16 @@
from __future__ import annotations
from dataclasses import dataclass
+from math import ceil
from typing import Literal
-from ..contracts import HEALTH_WEIGHTS
+from ..contracts import (
+ HEALTH_DEPENDENCY_CYCLE_PENALTY,
+ HEALTH_DEPENDENCY_DEPTH_AVG_MULTIPLIER,
+ HEALTH_DEPENDENCY_DEPTH_LEVEL_PENALTY,
+ HEALTH_DEPENDENCY_DEPTH_P95_MARGIN,
+ HEALTH_WEIGHTS,
+)
from ..models import HealthScore
@@ -29,6 +36,8 @@ class HealthInputs:
low_cohesion_classes: int
dependency_cycles: int
dependency_max_depth: int
+ dependency_avg_depth: float
+ dependency_p95_depth: int
dead_code_items: int
@@ -54,6 +63,26 @@ def _safe_div(numerator: float, denominator: float) -> float:
return numerator / denominator
+def _dependency_expected_tail(*, avg_depth: float, p95_depth: int) -> int:
+ avg_based = ceil(max(0.0, avg_depth) * HEALTH_DEPENDENCY_DEPTH_AVG_MULTIPLIER)
+ p95_based = max(0, p95_depth) + HEALTH_DEPENDENCY_DEPTH_P95_MARGIN
+ return max(avg_based, p95_based)
+
+
+def _dependency_tail_pressure(
+ *,
+ max_depth: int,
+ avg_depth: float,
+ p95_depth: int,
+) -> int:
+ if max_depth <= 0:
+ return 0
+ return max(
+ 0,
+ max_depth - _dependency_expected_tail(avg_depth=avg_depth, p95_depth=p95_depth),
+ )
+
+
# Piecewise clone-density curve: mild penalty for low density,
# steep in the structural-debt zone, brutal when it's systemic.
_CLONE_BREAKPOINTS: tuple[tuple[float, float], ...] = (
@@ -104,8 +133,13 @@ def compute_health(inputs: HealthInputs) -> HealthScore:
dead_code_score = _clamp_score(100 - inputs.dead_code_items * 8)
dependency_score = _clamp_score(
100
- - inputs.dependency_cycles * 25
- - max(0, inputs.dependency_max_depth - 6) * 4
+ - inputs.dependency_cycles * HEALTH_DEPENDENCY_CYCLE_PENALTY
+ - _dependency_tail_pressure(
+ max_depth=inputs.dependency_max_depth,
+ avg_depth=inputs.dependency_avg_depth,
+ p95_depth=inputs.dependency_p95_depth,
+ )
+ * HEALTH_DEPENDENCY_DEPTH_LEVEL_PENALTY
)
coverage_score = _clamp_score(
_safe_div(inputs.files_analyzed_or_cached * 100.0, max(1, inputs.files_found))
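
The reworked dependency score penalizes only the portion of max_depth that exceeds an expected tail derived from the average and p95 depths, so a single deep chain in an otherwise shallow graph costs less than uniformly deep imports. The multipliers live in codeclone.contracts and their values are not visible in this patch; the sketch below assumes illustrative stand-ins:

    from math import ceil

    AVG_MULTIPLIER = 2.0  # assumed stand-in for HEALTH_DEPENDENCY_DEPTH_AVG_MULTIPLIER
    P95_MARGIN = 2        # assumed stand-in for HEALTH_DEPENDENCY_DEPTH_P95_MARGIN

    def tail_pressure(max_depth: int, avg_depth: float, p95_depth: int) -> int:
        if max_depth <= 0:
            return 0
        expected = max(
            ceil(max(0.0, avg_depth) * AVG_MULTIPLIER),
            max(0, p95_depth) + P95_MARGIN,
        )
        return max(0, max_depth - expected)

    # One deep outlier (max 12) in a shallow graph (avg 2.0, p95 4):
    # expected tail is max(4, 6) == 6, so only 12 - 6 == 6 levels are charged.
    assert tail_pressure(12, 2.0, 4) == 6
    # A graph whose max depth sits at its expected tail is not penalized at all.
    assert tail_pressure(5, 2.0, 4) == 0
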
diff --git a/codeclone/metrics/overloaded_modules.py b/codeclone/metrics/overloaded_modules.py
index 46b414b..e151879 100644
--- a/codeclone/metrics/overloaded_modules.py
+++ b/codeclone/metrics/overloaded_modules.py
@@ -11,7 +11,6 @@
from collections.abc import Sequence
from math import floor
-from .._coerce import as_float, as_int, as_sequence, as_str
from ..domain.source_scope import (
SOURCE_KIND_FIXTURES,
SOURCE_KIND_OTHER,
@@ -20,6 +19,7 @@
)
from ..models import ClassMetrics, GroupItemLike, ModuleDep
from ..scanner import module_name_from_path
+from ..utils.coerce import as_float, as_int, as_sequence, as_str
_CANDIDATE = "candidate"
_NON_CANDIDATE = "non_candidate"
diff --git a/codeclone/metrics/registry.py b/codeclone/metrics/registry.py
new file mode 100644
index 0000000..23c6df8
--- /dev/null
+++ b/codeclone/metrics/registry.py
@@ -0,0 +1,755 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from typing import TypeGuard
+
+from ..domain.findings import CATEGORY_COHESION, CATEGORY_COMPLEXITY, CATEGORY_COUPLING
+from ..domain.quality import RISK_HIGH
+from ..models import (
+ ApiSurfaceSnapshot,
+ DeadItem,
+ DepGraph,
+ HealthScore,
+ ModuleDep,
+ ModuleDocstringCoverage,
+ ModuleTypingCoverage,
+ ProjectMetrics,
+)
+from ..utils.coerce import as_int as _as_int
+from ..utils.coerce import as_str as _as_str
+from ._base import MetricAggregate, MetricFamily, MetricProjectContext, MetricResult
+from .dead_code import find_unused
+from .dependencies import build_dep_graph
+from .health import HealthInputs, compute_health
+
+
+def _group_item_sort_key(item: object) -> tuple[str, int, int, str]:
+ if not isinstance(item, dict):
+ return "", 0, 0, ""
+ return (
+ _as_str(item.get("filepath")),
+ _as_int(item.get("start_line")),
+ _as_int(item.get("end_line")),
+ _as_str(item.get("qualname")),
+ )
+
+
+def _class_metric_sort_key(metric: object) -> tuple[str, int, int, str]:
+ filepath = getattr(metric, "filepath", "")
+ start_line = getattr(metric, "start_line", 0)
+ end_line = getattr(metric, "end_line", 0)
+ qualname = getattr(metric, "qualname", "")
+ return str(filepath), int(start_line), int(end_line), str(qualname)
+
+
+def _module_names_from_units(units: tuple[object, ...]) -> frozenset[str]:
+ modules: set[str] = set()
+ for item in units:
+ if not isinstance(item, dict):
+ continue
+ qualname = _as_str(item.get("qualname"))
+ module_name = qualname.split(":", 1)[0] if ":" in qualname else qualname
+ if module_name:
+ modules.add(module_name)
+ return frozenset(sorted(modules))
+
+
+def _empty_dep_graph() -> DepGraph:
+ return DepGraph(
+ modules=frozenset(),
+ edges=(),
+ cycles=(),
+ max_depth=0,
+ avg_depth=0.0,
+ p95_depth=0,
+ longest_chains=(),
+ )
+
+
+_EMPTY_HEALTH_SCORE = compute_health(
+ HealthInputs(
+ files_found=0,
+ files_analyzed_or_cached=0,
+ function_clone_groups=0,
+ block_clone_groups=0,
+ complexity_avg=0.0,
+ complexity_max=0,
+ high_risk_functions=0,
+ coupling_avg=0.0,
+ coupling_max=0,
+ high_risk_classes=0,
+ cohesion_avg=0.0,
+ low_cohesion_classes=0,
+ dependency_cycles=0,
+ dependency_max_depth=0,
+ dependency_avg_depth=0.0,
+ dependency_p95_depth=0,
+ dead_code_items=0,
+ )
+)
+
+
+def _is_tuple_of_str(value: object) -> TypeGuard[tuple[str, ...]]:
+ return isinstance(value, tuple) and all(isinstance(item, str) for item in value)
+
+
+def _is_tuple_of_tuple_str(value: object) -> TypeGuard[tuple[tuple[str, ...], ...]]:
+ return isinstance(value, tuple) and all(_is_tuple_of_str(item) for item in value)
+
+
+def _is_tuple_of_dead_items(value: object) -> TypeGuard[tuple[DeadItem, ...]]:
+ return isinstance(value, tuple) and all(
+ isinstance(item, DeadItem) for item in value
+ )
+
+
+def _is_tuple_of_module_deps(value: object) -> TypeGuard[tuple[ModuleDep, ...]]:
+ return isinstance(value, tuple) and all(
+ isinstance(item, ModuleDep) for item in value
+ )
+
+
+def _is_tuple_of_typing_modules(
+ value: object,
+) -> TypeGuard[tuple[ModuleTypingCoverage, ...]]:
+ return isinstance(value, tuple) and all(
+ isinstance(item, ModuleTypingCoverage) for item in value
+ )
+
+
+def _is_tuple_of_docstring_modules(
+ value: object,
+) -> TypeGuard[tuple[ModuleDocstringCoverage, ...]]:
+ return isinstance(value, tuple) and all(
+ isinstance(item, ModuleDocstringCoverage) for item in value
+ )
+
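# Illustrative sketch, not part of registry.py: a TypeGuard lets the checker
# narrow an untyped MetricResult value to a concrete tuple type at the call
# site, without a cast. The guard below is a hypothetical mirror of
# _is_tuple_of_str for ints.
def _is_tuple_of_int(value: object) -> TypeGuard[tuple[int, ...]]:
    return isinstance(value, tuple) and all(
        isinstance(item, int) for item in value
    )

_demo_payload: object = (1, 2, 3)
if _is_tuple_of_int(_demo_payload):
    # Inside this branch the checker treats _demo_payload as tuple[int, ...].
    assert sum(_demo_payload) == 6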
+
+def project_metrics_defaults() -> dict[str, object]:
+ return {
+ "complexity_avg": 0.0,
+ "complexity_max": 0,
+ "high_risk_functions": (),
+ "coupling_avg": 0.0,
+ "coupling_max": 0,
+ "high_risk_classes": (),
+ "cohesion_avg": 0.0,
+ "cohesion_max": 0,
+ "low_cohesion_classes": (),
+ "dependency_modules": 0,
+ "dependency_edges": 0,
+ "dependency_edge_list": (),
+ "dependency_cycles": (),
+ "dependency_max_depth": 0,
+ "dependency_longest_chains": (),
+ "dead_code": (),
+ "health": _EMPTY_HEALTH_SCORE,
+ "typing_param_total": 0,
+ "typing_param_annotated": 0,
+ "typing_return_total": 0,
+ "typing_return_annotated": 0,
+ "typing_any_count": 0,
+ "docstring_public_total": 0,
+ "docstring_public_documented": 0,
+ "typing_modules": (),
+ "docstring_modules": (),
+ "api_surface": None,
+ }
+
+
+def build_project_metrics(project_fields: dict[str, object]) -> ProjectMetrics:
+ return ProjectMetrics(
+ complexity_avg=_result_float(project_fields, "complexity_avg"),
+ complexity_max=_result_int(project_fields, "complexity_max"),
+ high_risk_functions=_result_tuple_str(project_fields, "high_risk_functions"),
+ coupling_avg=_result_float(project_fields, "coupling_avg"),
+ coupling_max=_result_int(project_fields, "coupling_max"),
+ high_risk_classes=_result_tuple_str(project_fields, "high_risk_classes"),
+ cohesion_avg=_result_float(project_fields, "cohesion_avg"),
+ cohesion_max=_result_int(project_fields, "cohesion_max"),
+ low_cohesion_classes=_result_tuple_str(project_fields, "low_cohesion_classes"),
+ dependency_modules=_result_int(project_fields, "dependency_modules"),
+ dependency_edges=_result_int(project_fields, "dependency_edges"),
+ dependency_edge_list=_result_module_deps(
+ project_fields,
+ "dependency_edge_list",
+ ),
+ dependency_cycles=_result_nested_tuple_str(
+ project_fields,
+ "dependency_cycles",
+ ),
+ dependency_max_depth=_result_int(project_fields, "dependency_max_depth"),
+ dependency_longest_chains=_result_nested_tuple_str(
+ project_fields,
+ "dependency_longest_chains",
+ ),
+ dead_code=_result_dead_items(project_fields, "dead_code"),
+ health=_result_health(project_fields, "health"),
+ typing_param_total=_result_int(project_fields, "typing_param_total"),
+ typing_param_annotated=_result_int(project_fields, "typing_param_annotated"),
+ typing_return_total=_result_int(project_fields, "typing_return_total"),
+ typing_return_annotated=_result_int(
+ project_fields,
+ "typing_return_annotated",
+ ),
+ typing_any_count=_result_int(project_fields, "typing_any_count"),
+ docstring_public_total=_result_int(project_fields, "docstring_public_total"),
+ docstring_public_documented=_result_int(
+ project_fields,
+ "docstring_public_documented",
+ ),
+ typing_modules=_result_typing_modules(project_fields, "typing_modules"),
+ docstring_modules=_result_docstring_modules(
+ project_fields,
+ "docstring_modules",
+ ),
+ api_surface=_result_api_surface(project_fields, "api_surface"),
+ )
+
+
+def _result_float(result: dict[str, object], key: str) -> float:
+ value = result.get(key)
+ return float(value) if isinstance(value, int | float) else 0.0
+
+
+def _result_int(result: dict[str, object], key: str) -> int:
+ return _as_int(result.get(key), 0)
+
+
+def _result_tuple_str(result: dict[str, object], key: str) -> tuple[str, ...]:
+ value = result.get(key, ())
+ return value if _is_tuple_of_str(value) else ()
+
+
+def _result_nested_tuple_str(
+ result: dict[str, object],
+ key: str,
+) -> tuple[tuple[str, ...], ...]:
+ value = result.get(key, ())
+ return value if _is_tuple_of_tuple_str(value) else ()
+
+
+def _result_dead_items(
+ result: dict[str, object],
+ key: str,
+) -> tuple[DeadItem, ...]:
+ value = result.get(key, ())
+ return value if _is_tuple_of_dead_items(value) else ()
+
+
+def _result_module_deps(
+ result: dict[str, object],
+ key: str,
+) -> tuple[ModuleDep, ...]:
+ value = result.get(key, ())
+ return value if _is_tuple_of_module_deps(value) else ()
+
+
+def _result_health(result: dict[str, object], key: str) -> HealthScore:
+ value = result.get(key)
+ return value if isinstance(value, HealthScore) else _EMPTY_HEALTH_SCORE
+
+
+def _result_typing_modules(
+ result: dict[str, object],
+ key: str,
+) -> tuple[ModuleTypingCoverage, ...]:
+ value = result.get(key, ())
+ return value if _is_tuple_of_typing_modules(value) else ()
+
+
+def _result_docstring_modules(
+ result: dict[str, object],
+ key: str,
+) -> tuple[ModuleDocstringCoverage, ...]:
+ value = result.get(key, ())
+ return value if _is_tuple_of_docstring_modules(value) else ()
+
+
+def _result_api_surface(
+ result: dict[str, object],
+ key: str,
+) -> ApiSurfaceSnapshot | None:
+ value = result.get(key)
+ return value if isinstance(value, ApiSurfaceSnapshot) else None
+
+
+def _memoized_result(
+ context: MetricProjectContext,
+ *,
+ family_name: str,
+ builder: Callable[[MetricProjectContext], MetricResult],
+) -> MetricResult:
+ cached = context.memo.get(family_name)
+ if cached is not None:
+ return cached
+ result = builder(context)
+ context.memo[family_name] = result
+ return result
+
+
+def _first_result(results: list[MetricResult]) -> MetricResult:
+ return results[0] if results else {}
+
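# Illustrative sketch, not part of registry.py: the memo on the context
# guarantees each family's builder runs at most once per analysis run, and
# repeated compute calls hand back the identical cached dict.
_memo_demo_context = MetricProjectContext(
    units=(),
    class_metrics=(),
    module_deps=(),
    dead_candidates=(),
    referenced_names=frozenset(),
    referenced_qualnames=frozenset(),
)
_memo_demo_calls: list[str] = []

def _memo_demo_builder(context: MetricProjectContext) -> MetricResult:
    _memo_demo_calls.append("built")
    return {"value": 1}

_first = _memoized_result(
    _memo_demo_context, family_name="demo", builder=_memo_demo_builder
)
_second = _memoized_result(
    _memo_demo_context, family_name="demo", builder=_memo_demo_builder
)
assert _first is _second and _memo_demo_calls == ["built"]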
+
+def _build_complexity_result(context: MetricProjectContext) -> MetricResult:
+ unit_rows = tuple(sorted(context.units, key=_group_item_sort_key))
+ complexities = tuple(
+ max(1, _as_int(row.get("cyclomatic_complexity"), 1)) for row in unit_rows
+ )
+ complexity_max = max(complexities) if complexities else 0
+ complexity_avg = (
+ float(sum(complexities)) / float(len(complexities)) if complexities else 0.0
+ )
+ high_risk_functions = tuple(
+ sorted(
+ {
+ _as_str(row.get("qualname"))
+ for row in unit_rows
+ if _as_str(row.get("risk")) == RISK_HIGH
+ }
+ )
+ )
+ return {
+ "complexity_avg": complexity_avg,
+ "complexity_max": complexity_max,
+ "high_risk_functions": high_risk_functions,
+ }
+
+
+def _summarize_class_metric_family(
+ context: MetricProjectContext,
+ *,
+ value_attr: str,
+ risk_attr: str,
+) -> tuple[float, int, tuple[str, ...]]:
+ classes_sorted = tuple(sorted(context.class_metrics, key=_class_metric_sort_key))
+ values = tuple(
+ _as_int(getattr(metric, value_attr, 0), 0) for metric in classes_sorted
+ )
+ value_max = max(values) if values else 0
+ value_avg = float(sum(values)) / float(len(values)) if values else 0.0
+ high_risk_symbols = tuple(
+ sorted(
+ {
+ metric.qualname
+ for metric in classes_sorted
+ if str(getattr(metric, risk_attr, "")) == RISK_HIGH
+ }
+ )
+ )
+ return value_avg, value_max, high_risk_symbols
+
+
+def _compute_complexity_family(context: MetricProjectContext) -> MetricResult:
+ return _memoized_result(
+ context,
+ family_name=CATEGORY_COMPLEXITY,
+ builder=_build_complexity_result,
+ )
+
+
+def _aggregate_complexity_family(results: list[MetricResult]) -> MetricAggregate:
+ result = _first_result(results)
+ return MetricAggregate(
+ project_fields={
+ "complexity_avg": _result_float(result, "complexity_avg"),
+ "complexity_max": _result_int(result, "complexity_max"),
+ "high_risk_functions": _result_tuple_str(result, "high_risk_functions"),
+ }
+ )
+
+
+def _build_coupling_result(context: MetricProjectContext) -> MetricResult:
+ coupling_avg, coupling_max, high_risk_classes = _summarize_class_metric_family(
+ context,
+ value_attr="cbo",
+ risk_attr="risk_coupling",
+ )
+ return {
+ "coupling_avg": coupling_avg,
+ "coupling_max": coupling_max,
+ "high_risk_classes": high_risk_classes,
+ }
+
+
+def _compute_coupling_family(context: MetricProjectContext) -> MetricResult:
+ return _memoized_result(
+ context,
+ family_name=CATEGORY_COUPLING,
+ builder=_build_coupling_result,
+ )
+
+
+def _aggregate_coupling_family(results: list[MetricResult]) -> MetricAggregate:
+ result = _first_result(results)
+ return MetricAggregate(
+ project_fields={
+ "coupling_avg": _result_float(result, "coupling_avg"),
+ "coupling_max": _result_int(result, "coupling_max"),
+ "high_risk_classes": _result_tuple_str(result, "high_risk_classes"),
+ }
+ )
+
+
+def _build_cohesion_result(context: MetricProjectContext) -> MetricResult:
+ cohesion_avg, cohesion_max, low_cohesion_classes = _summarize_class_metric_family(
+ context,
+ value_attr="lcom4",
+ risk_attr="risk_cohesion",
+ )
+ return {
+ "cohesion_avg": cohesion_avg,
+ "cohesion_max": cohesion_max,
+ "low_cohesion_classes": low_cohesion_classes,
+ }
+
+
+def _compute_cohesion_family(context: MetricProjectContext) -> MetricResult:
+ return _memoized_result(
+ context,
+ family_name=CATEGORY_COHESION,
+ builder=_build_cohesion_result,
+ )
+
+
+def _aggregate_cohesion_family(results: list[MetricResult]) -> MetricAggregate:
+ result = _first_result(results)
+ return MetricAggregate(
+ project_fields={
+ "cohesion_avg": _result_float(result, "cohesion_avg"),
+ "cohesion_max": _result_int(result, "cohesion_max"),
+ "low_cohesion_classes": _result_tuple_str(result, "low_cohesion_classes"),
+ }
+ )
+
+
+def _build_dependencies_result(context: MetricProjectContext) -> MetricResult:
+ dep_graph = _empty_dep_graph()
+ if not context.skip_dependencies:
+ dep_graph = build_dep_graph(
+ modules=_module_names_from_units(tuple(context.units)),
+ deps=context.module_deps,
+ )
+ return {
+ "dependency_modules": len(dep_graph.modules),
+ "dependency_edges": len(dep_graph.edges),
+ "dependency_edge_list": dep_graph.edges,
+ "dependency_cycles": dep_graph.cycles,
+ "dependency_max_depth": dep_graph.max_depth,
+ "dependency_avg_depth": dep_graph.avg_depth,
+ "dependency_p95_depth": dep_graph.p95_depth,
+ "dependency_longest_chains": dep_graph.longest_chains,
+ "dep_graph": dep_graph,
+ }
+
+
+def _compute_dependencies_family(context: MetricProjectContext) -> MetricResult:
+ return _memoized_result(
+ context,
+ family_name="dependencies",
+ builder=_build_dependencies_result,
+ )
+
+
+def _aggregate_dependencies_family(results: list[MetricResult]) -> MetricAggregate:
+ result = _first_result(results)
+ dep_graph = result.get("dep_graph")
+ return MetricAggregate(
+ project_fields={
+ "dependency_modules": _result_int(result, "dependency_modules"),
+ "dependency_edges": _result_int(result, "dependency_edges"),
+ "dependency_edge_list": _result_module_deps(result, "dependency_edge_list"),
+ "dependency_cycles": _result_nested_tuple_str(result, "dependency_cycles"),
+ "dependency_max_depth": _result_int(result, "dependency_max_depth"),
+ "dependency_longest_chains": _result_nested_tuple_str(
+ result,
+ "dependency_longest_chains",
+ ),
+ },
+ artifacts=({"dep_graph": dep_graph} if isinstance(dep_graph, DepGraph) else {}),
+ )
+
+
+def _build_dead_code_result(context: MetricProjectContext) -> MetricResult:
+ dead_items: tuple[DeadItem, ...] = ()
+ if not context.skip_dead_code:
+ dead_items = find_unused(
+ definitions=tuple(context.dead_candidates),
+ referenced_names=context.referenced_names,
+ referenced_qualnames=context.referenced_qualnames,
+ )
+ return {
+ "dead_code": dead_items,
+ "dead_items": dead_items,
+ }
+
+
+def _compute_dead_code_family(context: MetricProjectContext) -> MetricResult:
+ return _memoized_result(
+ context,
+ family_name="dead_code",
+ builder=_build_dead_code_result,
+ )
+
+
+def _aggregate_dead_code_family(results: list[MetricResult]) -> MetricAggregate:
+ result = _first_result(results)
+ dead_items = result.get("dead_items")
+ return MetricAggregate(
+ project_fields={
+ "dead_code": _result_dead_items(result, "dead_code"),
+ },
+ artifacts=({"dead_items": dead_items} if isinstance(dead_items, tuple) else {}),
+ )
+
+
+def _build_health_result(context: MetricProjectContext) -> MetricResult:
+ complexity = _compute_complexity_family(context)
+ coupling = _compute_coupling_family(context)
+ cohesion = _compute_cohesion_family(context)
+ dependencies = _compute_dependencies_family(context)
+ dead_code = _compute_dead_code_family(context)
+ health = compute_health(
+ HealthInputs(
+ files_found=context.files_found,
+ files_analyzed_or_cached=context.files_analyzed_or_cached,
+ function_clone_groups=context.function_clone_groups,
+ block_clone_groups=context.block_clone_groups,
+ complexity_avg=_result_float(complexity, "complexity_avg"),
+ complexity_max=_result_int(complexity, "complexity_max"),
+ high_risk_functions=len(
+ _result_tuple_str(complexity, "high_risk_functions")
+ ),
+ coupling_avg=_result_float(coupling, "coupling_avg"),
+ coupling_max=_result_int(coupling, "coupling_max"),
+ high_risk_classes=len(_result_tuple_str(coupling, "high_risk_classes")),
+ cohesion_avg=_result_float(cohesion, "cohesion_avg"),
+ low_cohesion_classes=len(
+ _result_tuple_str(cohesion, "low_cohesion_classes")
+ ),
+ dependency_cycles=len(
+ _result_nested_tuple_str(dependencies, "dependency_cycles")
+ ),
+ dependency_max_depth=_result_int(dependencies, "dependency_max_depth"),
+ dependency_avg_depth=_result_float(dependencies, "dependency_avg_depth"),
+ dependency_p95_depth=_result_int(dependencies, "dependency_p95_depth"),
+ dead_code_items=len(_result_dead_items(dead_code, "dead_code")),
+ )
+ )
+ return {"health": health}
+
+
+def _compute_health_family(context: MetricProjectContext) -> MetricResult:
+ return _memoized_result(
+ context,
+ family_name="health",
+ builder=_build_health_result,
+ )
+
+
+def _aggregate_health_family(results: list[MetricResult]) -> MetricAggregate:
+ result = _first_result(results)
+ return MetricAggregate(project_fields={"health": _result_health(result, "health")})
+
+
+def _build_coverage_adoption_result(context: MetricProjectContext) -> MetricResult:
+ typing_rows = tuple(
+ sorted(context.typing_modules, key=lambda item: (item.filepath, item.module))
+ )
+ docstring_rows = tuple(
+ sorted(context.docstring_modules, key=lambda item: (item.filepath, item.module))
+ )
+ return {
+ "typing_param_total": sum(item.params_total for item in typing_rows),
+ "typing_param_annotated": sum(item.params_annotated for item in typing_rows),
+ "typing_return_total": sum(item.returns_total for item in typing_rows),
+ "typing_return_annotated": sum(item.returns_annotated for item in typing_rows),
+ "typing_any_count": sum(item.any_annotation_count for item in typing_rows),
+ "docstring_public_total": sum(
+ item.public_symbol_total for item in docstring_rows
+ ),
+ "docstring_public_documented": sum(
+ item.public_symbol_documented for item in docstring_rows
+ ),
+ "typing_modules": typing_rows,
+ "docstring_modules": docstring_rows,
+ }
+
+
+def _compute_coverage_adoption_family(context: MetricProjectContext) -> MetricResult:
+ return _memoized_result(
+ context,
+ family_name="coverage_adoption",
+ builder=_build_coverage_adoption_result,
+ )
+
+
+def _aggregate_coverage_adoption_family(results: list[MetricResult]) -> MetricAggregate:
+ result = _first_result(results)
+ return MetricAggregate(
+ project_fields={
+ "typing_param_total": _result_int(result, "typing_param_total"),
+ "typing_param_annotated": _result_int(result, "typing_param_annotated"),
+ "typing_return_total": _result_int(result, "typing_return_total"),
+ "typing_return_annotated": _result_int(
+ result,
+ "typing_return_annotated",
+ ),
+ "typing_any_count": _result_int(result, "typing_any_count"),
+ "docstring_public_total": _result_int(result, "docstring_public_total"),
+ "docstring_public_documented": _result_int(
+ result,
+ "docstring_public_documented",
+ ),
+ "typing_modules": _result_typing_modules(result, "typing_modules"),
+ "docstring_modules": _result_docstring_modules(
+ result,
+ "docstring_modules",
+ ),
+ }
+ )
+
+
+def _build_api_surface_result(context: MetricProjectContext) -> MetricResult:
+ api_rows = tuple(
+ sorted(context.api_modules, key=lambda item: (item.filepath, item.module))
+ )
+ return {
+ "api_surface": ApiSurfaceSnapshot(modules=api_rows) if api_rows else None,
+ }
+
+
+def _compute_api_surface_family(context: MetricProjectContext) -> MetricResult:
+ return _memoized_result(
+ context,
+ family_name="api_surface",
+ builder=_build_api_surface_result,
+ )
+
+
+def _aggregate_api_surface_family(results: list[MetricResult]) -> MetricAggregate:
+ result = _first_result(results)
+ return MetricAggregate(project_fields={"api_surface": result.get("api_surface")})
+
+
+def _compute_report_only_family(_context: MetricProjectContext) -> MetricResult:
+ return {}
+
+
+def _aggregate_empty_family(_results: list[MetricResult]) -> MetricAggregate:
+ return MetricAggregate(project_fields={})
+
+
+METRIC_FAMILIES: dict[str, MetricFamily] = {
+ CATEGORY_COMPLEXITY: MetricFamily(
+ name=CATEGORY_COMPLEXITY,
+ compute=_compute_complexity_family,
+ aggregate=_aggregate_complexity_family,
+ report_section=CATEGORY_COMPLEXITY,
+ baseline_key="max_complexity",
+ gate_keys=("complexity_threshold", "new_high_risk_functions"),
+ skippable_flag="skip_metrics",
+ ),
+ CATEGORY_COUPLING: MetricFamily(
+ name=CATEGORY_COUPLING,
+ compute=_compute_coupling_family,
+ aggregate=_aggregate_coupling_family,
+ report_section=CATEGORY_COUPLING,
+ baseline_key="max_coupling",
+ gate_keys=("coupling_threshold", "new_high_coupling_classes"),
+ skippable_flag="skip_metrics",
+ ),
+ CATEGORY_COHESION: MetricFamily(
+ name=CATEGORY_COHESION,
+ compute=_compute_cohesion_family,
+ aggregate=_aggregate_cohesion_family,
+ report_section=CATEGORY_COHESION,
+ baseline_key="max_cohesion",
+ gate_keys=("cohesion_threshold",),
+ skippable_flag="skip_metrics",
+ ),
+ "dependencies": MetricFamily(
+ name="dependencies",
+ compute=_compute_dependencies_family,
+ aggregate=_aggregate_dependencies_family,
+ report_section="dependencies",
+ baseline_key="dependency_cycles",
+ gate_keys=("dependency_cycles", "new_dependency_cycles"),
+ skippable_flag="skip_metrics",
+ ),
+ "dead_code": MetricFamily(
+ name="dead_code",
+ compute=_compute_dead_code_family,
+ aggregate=_aggregate_dead_code_family,
+ report_section="dead_code",
+ baseline_key="dead_code_items",
+ gate_keys=("dead_code_high_confidence", "new_dead_code"),
+ skippable_flag="skip_metrics",
+ ),
+ "health": MetricFamily(
+ name="health",
+ compute=_compute_health_family,
+ aggregate=_aggregate_health_family,
+ report_section="health",
+ baseline_key="health_score",
+ gate_keys=("health_threshold", "health_regression"),
+ skippable_flag="skip_metrics",
+ ),
+ "coverage_adoption": MetricFamily(
+ name="coverage_adoption",
+ compute=_compute_coverage_adoption_family,
+ aggregate=_aggregate_coverage_adoption_family,
+ report_section="coverage_adoption",
+ baseline_key="typing_param_permille",
+ gate_keys=(
+ "typing_coverage_threshold",
+ "docstring_coverage_threshold",
+ "typing_regression",
+ "docstring_regression",
+ ),
+ skippable_flag="skip_metrics",
+ ),
+ "api_surface": MetricFamily(
+ name="api_surface",
+ compute=_compute_api_surface_family,
+ aggregate=_aggregate_api_surface_family,
+ report_section="api_surface",
+ baseline_key=None,
+ gate_keys=("api_breaking_changes",),
+ skippable_flag="skip_metrics",
+ ),
+ "overloaded_modules": MetricFamily(
+ name="overloaded_modules",
+ compute=_compute_report_only_family,
+ aggregate=_aggregate_empty_family,
+ report_section="overloaded_modules",
+ baseline_key=None,
+ gate_keys=(),
+ skippable_flag="skip_metrics",
+ ),
+ "security_surfaces": MetricFamily(
+ name="security_surfaces",
+ compute=_compute_report_only_family,
+ aggregate=_aggregate_empty_family,
+ report_section="security_surfaces",
+ baseline_key=None,
+ gate_keys=(),
+ skippable_flag="skip_metrics",
+ ),
+ "coverage_join": MetricFamily(
+ name="coverage_join",
+ compute=_compute_report_only_family,
+ aggregate=_aggregate_empty_family,
+ report_section="coverage_join",
+ baseline_key=None,
+ gate_keys=("coverage_hotspots",),
+ skippable_flag="skip_metrics",
+ ),
+}
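The `METRIC_FAMILIES` registry above turns per-family dispatch into data: each entry pairs a memoized `compute` with an `aggregate` that projects results into report fields, and because `_memoized_result` keys on `family_name`, `_build_health_result` can reuse the other families' results without re-running them. A minimal sketch of how a driver might walk the registry — the bare `context` argument stands in for however the pipeline actually builds a `MetricProjectContext`, and `collect_aggregates` is a hypothetical helper, not code from this change:

```python
# Sketch only: iterate the registry and collect one aggregate per family.
# Assumes METRIC_FAMILIES and a ready MetricProjectContext are in scope.
def collect_aggregates(context) -> dict[str, object]:
    aggregates: dict[str, object] = {}
    for name, family in METRIC_FAMILIES.items():
        result = family.compute(context)        # memoized per family_name
        aggregates[name] = family.aggregate([result])
    return aggregates
```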
diff --git a/codeclone/metrics_baseline.py b/codeclone/metrics_baseline.py
deleted file mode 100644
index ed4197e..0000000
--- a/codeclone/metrics_baseline.py
+++ /dev/null
@@ -1,1317 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# SPDX-License-Identifier: MPL-2.0
-# Copyright (c) 2026 Den Rozhnovskiy
-
-from __future__ import annotations
-
-import hashlib
-import hmac
-from datetime import datetime, timezone
-from enum import Enum
-from json import JSONDecodeError
-from pathlib import Path
-from typing import TYPE_CHECKING, Any, Final, Literal, cast
-
-import orjson
-
-from . import __version__
-from ._json_io import read_json_object as _read_json_object
-from ._json_io import write_json_document_atomically as _write_json_document_atomically
-from ._schema_validation import validate_top_level_structure
-from .baseline import current_python_tag
-from .cache_paths import runtime_filepath_from_wire, wire_filepath_from_runtime
-from .contracts import BASELINE_SCHEMA_VERSION, METRICS_BASELINE_SCHEMA_VERSION
-from .errors import BaselineValidationError
-from .metrics.api_surface import compare_api_surfaces
-from .models import (
- ApiBreakingChange,
- ApiParamSpec,
- ApiSurfaceSnapshot,
- MetricsDiff,
- MetricsSnapshot,
- ModuleApiSurface,
- ProjectMetrics,
- PublicSymbol,
-)
-
-if TYPE_CHECKING:
- from collections.abc import Mapping
-
-METRICS_BASELINE_GENERATOR: Final = "codeclone"
-MAX_METRICS_BASELINE_SIZE_BYTES: Final = 5 * 1024 * 1024
-
-
-class MetricsBaselineStatus(str, Enum):
- OK = "ok"
- MISSING = "missing"
- TOO_LARGE = "too_large"
- INVALID_JSON = "invalid_json"
- INVALID_TYPE = "invalid_type"
- MISSING_FIELDS = "missing_fields"
- MISMATCH_SCHEMA_VERSION = "mismatch_schema_version"
- MISMATCH_PYTHON_VERSION = "mismatch_python_version"
- GENERATOR_MISMATCH = "generator_mismatch"
- INTEGRITY_MISSING = "integrity_missing"
- INTEGRITY_FAILED = "integrity_failed"
-
-
-METRICS_BASELINE_UNTRUSTED_STATUSES: Final[frozenset[MetricsBaselineStatus]] = (
- frozenset(
- {
- MetricsBaselineStatus.MISSING,
- MetricsBaselineStatus.TOO_LARGE,
- MetricsBaselineStatus.INVALID_JSON,
- MetricsBaselineStatus.INVALID_TYPE,
- MetricsBaselineStatus.MISSING_FIELDS,
- MetricsBaselineStatus.MISMATCH_SCHEMA_VERSION,
- MetricsBaselineStatus.MISMATCH_PYTHON_VERSION,
- MetricsBaselineStatus.GENERATOR_MISMATCH,
- MetricsBaselineStatus.INTEGRITY_MISSING,
- MetricsBaselineStatus.INTEGRITY_FAILED,
- }
- )
-)
-
-_TOP_LEVEL_REQUIRED_KEYS = frozenset({"meta", "metrics"})
-_TOP_LEVEL_ALLOWED_KEYS = _TOP_LEVEL_REQUIRED_KEYS | frozenset(
- {"clones", "api_surface"}
-)
-_META_REQUIRED_KEYS = frozenset(
- {"generator", "schema_version", "python_tag", "created_at", "payload_sha256"}
-)
-_METRICS_REQUIRED_KEYS = frozenset(
- {
- "max_complexity",
- "high_risk_functions",
- "max_coupling",
- "high_coupling_classes",
- "max_cohesion",
- "low_cohesion_classes",
- "dependency_cycles",
- "dependency_max_depth",
- "dead_code_items",
- "health_score",
- "health_grade",
- }
-)
-_METRICS_OPTIONAL_KEYS = frozenset(
- {
- "typing_param_permille",
- "typing_return_permille",
- "docstring_permille",
- "typing_any_count",
- }
-)
-_METRICS_PAYLOAD_SHA256_KEY = "metrics_payload_sha256"
-_API_SURFACE_PAYLOAD_SHA256_KEY = "api_surface_payload_sha256"
-
-
-def coerce_metrics_baseline_status(
- raw_status: str | MetricsBaselineStatus | None,
-) -> MetricsBaselineStatus:
- if isinstance(raw_status, MetricsBaselineStatus):
- return raw_status
- if isinstance(raw_status, str):
- try:
- return MetricsBaselineStatus(raw_status)
- except ValueError:
- return MetricsBaselineStatus.INVALID_TYPE
- return MetricsBaselineStatus.INVALID_TYPE
-
-
-def snapshot_from_project_metrics(project_metrics: ProjectMetrics) -> MetricsSnapshot:
- return MetricsSnapshot(
- max_complexity=int(project_metrics.complexity_max),
- high_risk_functions=tuple(sorted(set(project_metrics.high_risk_functions))),
- max_coupling=int(project_metrics.coupling_max),
- high_coupling_classes=tuple(sorted(set(project_metrics.high_risk_classes))),
- max_cohesion=int(project_metrics.cohesion_max),
- low_cohesion_classes=tuple(sorted(set(project_metrics.low_cohesion_classes))),
- dependency_cycles=tuple(
- sorted({tuple(cycle) for cycle in project_metrics.dependency_cycles})
- ),
- dependency_max_depth=int(project_metrics.dependency_max_depth),
- dead_code_items=tuple(
- sorted({item.qualname for item in project_metrics.dead_code})
- ),
- health_score=int(project_metrics.health.total),
- health_grade=project_metrics.health.grade,
- typing_param_permille=_permille(
- project_metrics.typing_param_annotated,
- project_metrics.typing_param_total,
- ),
- typing_return_permille=_permille(
- project_metrics.typing_return_annotated,
- project_metrics.typing_return_total,
- ),
- docstring_permille=_permille(
- project_metrics.docstring_public_documented,
- project_metrics.docstring_public_total,
- ),
- typing_any_count=int(project_metrics.typing_any_count),
- )
-
-
-def _permille(numerator: int, denominator: int) -> int:
- if denominator <= 0:
- return 0
- return round((1000.0 * float(numerator)) / float(denominator))
-
-
-def _canonical_json(payload: object) -> str:
- return orjson.dumps(payload, option=orjson.OPT_SORT_KEYS).decode("utf-8")
-
-
-def _snapshot_payload(
- snapshot: MetricsSnapshot,
- *,
- include_adoption: bool = True,
-) -> dict[str, object]:
- payload: dict[str, object] = {
- "max_complexity": int(snapshot.max_complexity),
- "high_risk_functions": list(snapshot.high_risk_functions),
- "max_coupling": int(snapshot.max_coupling),
- "high_coupling_classes": list(snapshot.high_coupling_classes),
- "max_cohesion": int(snapshot.max_cohesion),
- "low_cohesion_classes": list(snapshot.low_cohesion_classes),
- "dependency_cycles": [list(cycle) for cycle in snapshot.dependency_cycles],
- "dependency_max_depth": int(snapshot.dependency_max_depth),
- "dead_code_items": list(snapshot.dead_code_items),
- "health_score": int(snapshot.health_score),
- "health_grade": snapshot.health_grade,
- }
- if include_adoption:
- payload.update(
- {
- "typing_param_permille": int(snapshot.typing_param_permille),
- "typing_return_permille": int(snapshot.typing_return_permille),
- "docstring_permille": int(snapshot.docstring_permille),
- "typing_any_count": int(snapshot.typing_any_count),
- }
- )
- return payload
-
-
-def _compute_payload_sha256(
- snapshot: MetricsSnapshot,
- *,
- include_adoption: bool = True,
-) -> str:
- canonical = _canonical_json(
- _snapshot_payload(snapshot, include_adoption=include_adoption)
- )
- return hashlib.sha256(canonical.encode("utf-8")).hexdigest()
-
-
-def _now_utc_z() -> str:
-    return (
-        datetime.now(timezone.utc)
-        .replace(microsecond=0)
-        .isoformat()
-        .replace("+00:00", "Z")
-    )
-
-
-class MetricsBaseline:
- __slots__ = (
- "api_surface_payload_sha256",
- "api_surface_snapshot",
- "created_at",
- "generator_name",
- "generator_version",
- "has_coverage_adoption_snapshot",
- "is_embedded_in_clone_baseline",
- "path",
- "payload_sha256",
- "python_tag",
- "schema_version",
- "snapshot",
- )
-
- def __init__(self, path: str | Path) -> None:
- self.path = Path(path)
- self.generator_name: str | None = None
- self.generator_version: str | None = None
- self.schema_version: str | None = None
- self.python_tag: str | None = None
- self.created_at: str | None = None
- self.payload_sha256: str | None = None
- self.snapshot: MetricsSnapshot | None = None
- self.has_coverage_adoption_snapshot = False
- self.api_surface_payload_sha256: str | None = None
- self.api_surface_snapshot: ApiSurfaceSnapshot | None = None
- self.is_embedded_in_clone_baseline = False
-
- def load(
- self,
- *,
- max_size_bytes: int | None = None,
- preloaded_payload: dict[str, object] | None = None,
- ) -> None:
- try:
- exists = self.path.exists()
- except OSError as e:
- raise BaselineValidationError(
- f"Cannot stat metrics baseline file at {self.path}: {e}",
- status=MetricsBaselineStatus.INVALID_TYPE,
- ) from e
- if not exists:
- return
-
- size_limit = (
- MAX_METRICS_BASELINE_SIZE_BYTES
- if max_size_bytes is None
- else max_size_bytes
- )
- try:
- file_size = self.path.stat().st_size
- except OSError as e:
- raise BaselineValidationError(
- f"Cannot stat metrics baseline file at {self.path}: {e}",
- status=MetricsBaselineStatus.INVALID_TYPE,
- ) from e
- if file_size > size_limit:
- raise BaselineValidationError(
- "Metrics baseline file is too large "
- f"({file_size} bytes, max {size_limit} bytes) at {self.path}.",
- status=MetricsBaselineStatus.TOO_LARGE,
- )
-
- if preloaded_payload is None:
- payload = _load_json_object(self.path)
- else:
- if not isinstance(preloaded_payload, dict):
- raise BaselineValidationError(
- f"Metrics baseline payload must be an object at {self.path}",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
- payload = preloaded_payload
- _validate_top_level_structure(payload, path=self.path)
- self.is_embedded_in_clone_baseline = "clones" in payload
-
- meta_obj = payload.get("meta")
- metrics_obj = payload.get("metrics")
- if not isinstance(meta_obj, dict):
- raise BaselineValidationError(
- f"Invalid metrics baseline schema at {self.path}: "
- "'meta' must be object",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
- if not isinstance(metrics_obj, dict):
- raise BaselineValidationError(
- f"Invalid metrics baseline schema at {self.path}: "
- "'metrics' must be object",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
-
- _validate_required_keys(meta_obj, _META_REQUIRED_KEYS, path=self.path)
- _validate_required_keys(metrics_obj, _METRICS_REQUIRED_KEYS, path=self.path)
- _validate_exact_keys(
- metrics_obj,
- _METRICS_REQUIRED_KEYS | _METRICS_OPTIONAL_KEYS,
- path=self.path,
- )
-
- generator_name, generator_version = _parse_generator(meta_obj, path=self.path)
- schema_version = _require_str(meta_obj, "schema_version", path=self.path)
- python_tag = _require_str(meta_obj, "python_tag", path=self.path)
- created_at = _require_str(meta_obj, "created_at", path=self.path)
- payload_sha256 = _extract_metrics_payload_sha256(meta_obj, path=self.path)
- api_surface_payload_sha256 = _extract_optional_payload_sha256(
- meta_obj,
- key=_API_SURFACE_PAYLOAD_SHA256_KEY,
- )
-
- self.generator_name = generator_name
- self.generator_version = generator_version
- self.schema_version = schema_version
- self.python_tag = python_tag
- self.created_at = created_at
- self.payload_sha256 = payload_sha256
- self.api_surface_payload_sha256 = api_surface_payload_sha256
- self.snapshot = _parse_snapshot(metrics_obj, path=self.path)
- self.has_coverage_adoption_snapshot = _has_coverage_adoption_snapshot(
- metrics_obj,
- )
- self.api_surface_snapshot = _parse_api_surface_snapshot(
- payload.get("api_surface"),
- path=self.path,
- root=self.path.parent,
- )
-
- def save(self) -> None:
- if self.snapshot is None:
- raise BaselineValidationError(
- "Metrics baseline snapshot is missing.",
- status=MetricsBaselineStatus.MISSING_FIELDS,
- )
- payload = _build_payload(
- snapshot=self.snapshot,
- schema_version=self.schema_version or METRICS_BASELINE_SCHEMA_VERSION,
- python_tag=self.python_tag or current_python_tag(),
- generator_name=self.generator_name or METRICS_BASELINE_GENERATOR,
- generator_version=self.generator_version or __version__,
- created_at=self.created_at or _now_utc_z(),
- include_adoption=self.has_coverage_adoption_snapshot,
- api_surface_snapshot=self.api_surface_snapshot,
- api_surface_root=self.path.parent,
- )
- payload_meta = cast("Mapping[str, Any]", payload["meta"])
- payload_metrics_hash = _require_str(
- payload_meta,
- "payload_sha256",
- path=self.path,
- )
- payload_api_surface_hash = _optional_require_str(
- payload_meta,
- _API_SURFACE_PAYLOAD_SHA256_KEY,
- path=self.path,
- )
- existing: dict[str, Any] | None = None
- try:
- if self.path.exists():
- loaded = _load_json_object(self.path)
- if "clones" in loaded:
- existing = loaded
- except BaselineValidationError as e:
- raise BaselineValidationError(
- f"Cannot read existing baseline file at {self.path}: {e}",
- status=MetricsBaselineStatus.INVALID_JSON,
- ) from e
-
- if existing is not None:
- existing_meta, clones_obj = _require_embedded_clone_baseline_payload(
- existing, path=self.path
- )
- merged_schema_version = _resolve_embedded_schema_version(
- existing_meta, path=self.path
- )
- merged_meta = dict(existing_meta)
- merged_meta["schema_version"] = merged_schema_version
- merged_meta[_METRICS_PAYLOAD_SHA256_KEY] = payload_metrics_hash
- if payload_api_surface_hash is None:
- merged_meta.pop(_API_SURFACE_PAYLOAD_SHA256_KEY, None)
- else:
- merged_meta[_API_SURFACE_PAYLOAD_SHA256_KEY] = payload_api_surface_hash
- merged_payload: dict[str, object] = {
- "meta": merged_meta,
- "clones": clones_obj,
- "metrics": payload["metrics"],
- }
- api_surface_payload = payload.get("api_surface")
- if api_surface_payload is not None:
- merged_payload["api_surface"] = api_surface_payload
- self.path.parent.mkdir(parents=True, exist_ok=True)
- _atomic_write_json(self.path, merged_payload)
- self.is_embedded_in_clone_baseline = True
- self.schema_version = merged_schema_version
- self.python_tag = _require_str(merged_meta, "python_tag", path=self.path)
- self.created_at = _require_str(merged_meta, "created_at", path=self.path)
- self.payload_sha256 = _require_str(
- merged_meta, _METRICS_PAYLOAD_SHA256_KEY, path=self.path
- )
- self.api_surface_payload_sha256 = _optional_require_str(
- merged_meta,
- _API_SURFACE_PAYLOAD_SHA256_KEY,
- path=self.path,
- )
- self.generator_name, self.generator_version = _parse_generator(
- merged_meta, path=self.path
- )
- return
-
- self.path.parent.mkdir(parents=True, exist_ok=True)
- _atomic_write_json(self.path, payload)
- self.is_embedded_in_clone_baseline = False
- self.schema_version = _require_str(
- payload_meta, "schema_version", path=self.path
- )
- self.python_tag = _require_str(payload_meta, "python_tag", path=self.path)
- self.created_at = _require_str(payload_meta, "created_at", path=self.path)
- self.payload_sha256 = payload_metrics_hash
- self.api_surface_payload_sha256 = payload_api_surface_hash
-
- def verify_compatibility(self, *, runtime_python_tag: str) -> None:
- if self.generator_name != METRICS_BASELINE_GENERATOR:
- raise BaselineValidationError(
- "Metrics baseline generator mismatch: expected 'codeclone'.",
- status=MetricsBaselineStatus.GENERATOR_MISMATCH,
- )
- expected_schema = (
- BASELINE_SCHEMA_VERSION
- if self.is_embedded_in_clone_baseline
- else METRICS_BASELINE_SCHEMA_VERSION
- )
- if not _is_compatible_metrics_schema(
- baseline_version=self.schema_version,
- expected_version=expected_schema,
- ):
- raise BaselineValidationError(
- "Metrics baseline schema version mismatch: "
- f"baseline={self.schema_version}, "
- f"expected={expected_schema}.",
- status=MetricsBaselineStatus.MISMATCH_SCHEMA_VERSION,
- )
- if self.python_tag != runtime_python_tag:
- raise BaselineValidationError(
- "Metrics baseline python tag mismatch: "
- f"baseline={self.python_tag}, current={runtime_python_tag}.",
- status=MetricsBaselineStatus.MISMATCH_PYTHON_VERSION,
- )
- self.verify_integrity()
-
- def verify_integrity(self) -> None:
- if self.snapshot is None:
- raise BaselineValidationError(
- "Metrics baseline snapshot is missing.",
- status=MetricsBaselineStatus.MISSING_FIELDS,
- )
- if not isinstance(self.payload_sha256, str):
- raise BaselineValidationError(
- "Metrics baseline integrity payload hash is missing.",
- status=MetricsBaselineStatus.INTEGRITY_MISSING,
- )
-        if len(self.payload_sha256) != 64:
-            raise BaselineValidationError(
-                "Metrics baseline integrity payload hash is malformed.",
-                status=MetricsBaselineStatus.INTEGRITY_MISSING,
-            )
- expected = _compute_payload_sha256(
- self.snapshot,
- include_adoption=self.has_coverage_adoption_snapshot,
- )
- if not hmac.compare_digest(self.payload_sha256, expected):
- raise BaselineValidationError(
- "Metrics baseline integrity check failed: payload_sha256 mismatch.",
- status=MetricsBaselineStatus.INTEGRITY_FAILED,
- )
- if self.api_surface_snapshot is not None:
- if (
- not isinstance(self.api_surface_payload_sha256, str)
- or len(self.api_surface_payload_sha256) != 64
- ):
-                raise BaselineValidationError(
-                    "Metrics baseline API surface integrity payload hash "
-                    "is missing or malformed.",
-                    status=MetricsBaselineStatus.INTEGRITY_MISSING,
-                )
- expected_api = _compute_api_surface_payload_sha256(
- self.api_surface_snapshot,
- root=self.path.parent,
- )
- legacy_absolute_expected_api = _compute_api_surface_payload_sha256(
- self.api_surface_snapshot
- )
- legacy_expected_api = _compute_legacy_api_surface_payload_sha256(
- self.api_surface_snapshot,
- root=self.path.parent,
- )
- legacy_absolute_qualname_expected_api = (
- _compute_legacy_api_surface_payload_sha256(self.api_surface_snapshot)
- )
- if not (
- hmac.compare_digest(self.api_surface_payload_sha256, expected_api)
- or hmac.compare_digest(
- self.api_surface_payload_sha256,
- legacy_absolute_expected_api,
- )
- or hmac.compare_digest(
- self.api_surface_payload_sha256,
- legacy_expected_api,
- )
- or hmac.compare_digest(
- self.api_surface_payload_sha256,
- legacy_absolute_qualname_expected_api,
- )
- ):
- raise BaselineValidationError(
- "Metrics baseline integrity check failed: "
- "api_surface payload_sha256 mismatch.",
- status=MetricsBaselineStatus.INTEGRITY_FAILED,
- )
-
- @staticmethod
- def from_project_metrics(
- *,
- project_metrics: ProjectMetrics,
- path: str | Path,
- schema_version: str | None = None,
- python_tag: str | None = None,
- generator_version: str | None = None,
- include_adoption: bool = True,
- include_api_surface: bool = True,
- ) -> MetricsBaseline:
- baseline = MetricsBaseline(path)
- baseline.generator_name = METRICS_BASELINE_GENERATOR
- baseline.generator_version = generator_version or __version__
- baseline.schema_version = schema_version or METRICS_BASELINE_SCHEMA_VERSION
- baseline.python_tag = python_tag or current_python_tag()
- baseline.created_at = _now_utc_z()
- baseline.snapshot = snapshot_from_project_metrics(project_metrics)
- baseline.payload_sha256 = _compute_payload_sha256(
- baseline.snapshot,
- include_adoption=include_adoption,
- )
- baseline.has_coverage_adoption_snapshot = include_adoption
- baseline.api_surface_snapshot = (
- project_metrics.api_surface if include_api_surface else None
- )
- baseline.api_surface_payload_sha256 = (
- _compute_api_surface_payload_sha256(
- baseline.api_surface_snapshot,
- root=baseline.path.parent,
- )
- if baseline.api_surface_snapshot is not None
- else None
- )
- return baseline
-
- def diff(self, current: ProjectMetrics) -> MetricsDiff:
- if self.snapshot is None:
- snapshot = MetricsSnapshot(
- max_complexity=0,
- high_risk_functions=(),
- max_coupling=0,
- high_coupling_classes=(),
- max_cohesion=0,
- low_cohesion_classes=(),
- dependency_cycles=(),
- dependency_max_depth=0,
- dead_code_items=(),
- health_score=0,
- health_grade="F",
- typing_param_permille=0,
- typing_return_permille=0,
- docstring_permille=0,
- typing_any_count=0,
- )
- else:
- snapshot = self.snapshot
-
- current_snapshot = snapshot_from_project_metrics(current)
-
- new_high_risk_functions = tuple(
- sorted(
- set(current_snapshot.high_risk_functions)
- - set(snapshot.high_risk_functions)
- )
- )
- new_high_coupling_classes = tuple(
- sorted(
- set(current_snapshot.high_coupling_classes)
- - set(snapshot.high_coupling_classes)
- )
- )
- new_cycles = tuple(
- sorted(
- set(current_snapshot.dependency_cycles)
- - set(snapshot.dependency_cycles)
- )
- )
- new_dead_code = tuple(
- sorted(
- set(current_snapshot.dead_code_items) - set(snapshot.dead_code_items)
- )
- )
- added_api_symbols: tuple[str, ...]
- api_breaking_changes: tuple[ApiBreakingChange, ...]
- if self.api_surface_snapshot is None:
- added_api_symbols = ()
- api_breaking_changes = ()
- else:
- added_api_symbols, api_breaking_changes = compare_api_surfaces(
- baseline=self.api_surface_snapshot,
- current=current.api_surface,
- strict_types=False,
- )
-
- return MetricsDiff(
- new_high_risk_functions=new_high_risk_functions,
- new_high_coupling_classes=new_high_coupling_classes,
- new_cycles=new_cycles,
- new_dead_code=new_dead_code,
- health_delta=current_snapshot.health_score - snapshot.health_score,
- typing_param_permille_delta=(
- current_snapshot.typing_param_permille - snapshot.typing_param_permille
- ),
- typing_return_permille_delta=(
- current_snapshot.typing_return_permille
- - snapshot.typing_return_permille
- ),
- docstring_permille_delta=(
- current_snapshot.docstring_permille - snapshot.docstring_permille
- ),
- new_api_symbols=added_api_symbols,
- new_api_breaking_changes=api_breaking_changes,
- )
-
-
-def _is_compatible_metrics_schema(
- *,
- baseline_version: str | None,
- expected_version: str,
-) -> bool:
- if baseline_version is None:
- return False
- baseline_major_minor = _parse_major_minor(baseline_version)
- expected_major_minor = _parse_major_minor(expected_version)
- if baseline_major_minor is None or expected_major_minor is None:
- return baseline_version == expected_version
- baseline_major, baseline_minor = baseline_major_minor
- expected_major, expected_minor = expected_major_minor
- return baseline_major == expected_major and baseline_minor <= expected_minor
-
-
-def _has_coverage_adoption_snapshot(metrics_obj: Mapping[str, object]) -> bool:
- return all(
- key in metrics_obj
- for key in (
- "typing_param_permille",
- "typing_return_permille",
- "docstring_permille",
- )
- )
-
-
-def _parse_major_minor(version: str) -> tuple[int, int] | None:
- parts = version.split(".")
- if len(parts) != 2 or not all(part.isdigit() for part in parts):
- return None
- return int(parts[0]), int(parts[1])
-
-
-def _atomic_write_json(path: Path, payload: dict[str, object]) -> None:
- _write_json_document_atomically(
- path,
- payload,
- indent=True,
- trailing_newline=True,
- )
-
-
-def _load_json_object(path: Path) -> dict[str, Any]:
- try:
- return _read_json_object(path)
- except OSError as e:
- raise BaselineValidationError(
- f"Cannot read metrics baseline file at {path}: {e}",
- status=MetricsBaselineStatus.INVALID_JSON,
- ) from e
- except JSONDecodeError as e:
- raise BaselineValidationError(
- f"Corrupted metrics baseline file at {path}: {e}",
- status=MetricsBaselineStatus.INVALID_JSON,
- ) from e
- except TypeError:
- raise BaselineValidationError(
- f"Metrics baseline payload must be an object at {path}",
- status=MetricsBaselineStatus.INVALID_TYPE,
- ) from None
-
-
-def _validate_top_level_structure(payload: dict[str, Any], *, path: Path) -> None:
- validate_top_level_structure(
- payload,
- path=path,
- required_keys=_TOP_LEVEL_REQUIRED_KEYS,
- allowed_keys=_TOP_LEVEL_ALLOWED_KEYS,
- schema_label="metrics baseline",
- missing_status=MetricsBaselineStatus.MISSING_FIELDS,
- extra_status=MetricsBaselineStatus.INVALID_TYPE,
- )
-
-
-def _validate_required_keys(
- payload: Mapping[str, Any],
- required: frozenset[str],
- *,
- path: Path,
-) -> None:
- missing = required - set(payload.keys())
- if missing:
- raise BaselineValidationError(
- "Invalid metrics baseline schema at "
- f"{path}: missing required fields: {', '.join(sorted(missing))}",
- status=MetricsBaselineStatus.MISSING_FIELDS,
- )
-
-
-def _validate_exact_keys(
- payload: Mapping[str, Any],
- required: frozenset[str],
- *,
- path: Path,
-) -> None:
- extra = set(payload.keys()) - set(required)
- if extra:
- raise BaselineValidationError(
- "Invalid metrics baseline schema at "
- f"{path}: unexpected fields: {', '.join(sorted(extra))}",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
-
-
-def _require_str(payload: Mapping[str, Any], key: str, *, path: Path) -> str:
- value = payload.get(key)
- if isinstance(value, str):
- return value
- raise BaselineValidationError(
- f"Invalid metrics baseline schema at {path}: {key!r} must be str",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
-
-
-def _extract_metrics_payload_sha256(
- payload: Mapping[str, Any],
- *,
- path: Path,
-) -> str:
- direct = payload.get(_METRICS_PAYLOAD_SHA256_KEY)
- if isinstance(direct, str):
- return direct
- return _require_str(payload, "payload_sha256", path=path)
-
-
-def _extract_optional_payload_sha256(
- payload: Mapping[str, Any],
- *,
- key: str,
-) -> str | None:
- value = payload.get(key)
- return value if isinstance(value, str) else None
-
-
-def _require_int(payload: Mapping[str, Any], key: str, *, path: Path) -> int:
- value = payload.get(key)
- if isinstance(value, bool):
- raise BaselineValidationError(
- f"Invalid metrics baseline schema at {path}: {key!r} must be int",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
- if isinstance(value, int):
- return value
- raise BaselineValidationError(
- f"Invalid metrics baseline schema at {path}: {key!r} must be int",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
-
-
-def _optional_require_str(
- payload: Mapping[str, Any],
- key: str,
- *,
- path: Path,
-) -> str | None:
- value = payload.get(key)
- if value is None:
- return None
- if isinstance(value, str):
- return value
- raise BaselineValidationError(
- f"Invalid metrics baseline schema at {path}: {key!r} must be str",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
-
-
-def _require_str_list(payload: Mapping[str, Any], key: str, *, path: Path) -> list[str]:
- value = payload.get(key)
- if not isinstance(value, list):
- raise BaselineValidationError(
- f"Invalid metrics baseline schema at {path}: {key!r} must be list[str]",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
- if not all(isinstance(item, str) for item in value):
- raise BaselineValidationError(
- f"Invalid metrics baseline schema at {path}: {key!r} must be list[str]",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
- return value
-
-
-def _parse_cycles(
- payload: Mapping[str, Any],
- *,
- key: str,
- path: Path,
-) -> tuple[tuple[str, ...], ...]:
- value = payload.get(key)
- if not isinstance(value, list):
- raise BaselineValidationError(
- f"Invalid metrics baseline schema at {path}: {key!r} must be list",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
-
- cycles: list[tuple[str, ...]] = []
- for cycle in value:
- if not isinstance(cycle, list):
- raise BaselineValidationError(
- "Invalid metrics baseline schema at "
- f"{path}: {key!r} cycle item must be list[str]",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
- if not all(isinstance(item, str) for item in cycle):
- raise BaselineValidationError(
- "Invalid metrics baseline schema at "
- f"{path}: {key!r} cycle item must be list[str]",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
- cycles.append(tuple(cycle))
- return tuple(sorted(set(cycles)))
-
-
-def _parse_generator(
- meta: Mapping[str, Any],
- *,
- path: Path,
-) -> tuple[str, str | None]:
- generator = meta.get("generator")
- if isinstance(generator, str):
- version_value = meta.get("generator_version")
- if version_value is None:
- version_value = meta.get("codeclone_version")
- if version_value is None:
- return generator, None
- if not isinstance(version_value, str):
- raise BaselineValidationError(
- "Invalid metrics baseline schema at "
- f"{path}: generator_version must be str",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
- return generator, version_value
-
- if isinstance(generator, dict):
- allowed_keys = {"name", "version"}
- extra = set(generator.keys()) - allowed_keys
- if extra:
- raise BaselineValidationError(
- f"Invalid metrics baseline schema at {path}: "
- f"unexpected generator keys: {', '.join(sorted(extra))}",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
- name = generator.get("name")
- version = generator.get("version")
- if not isinstance(name, str):
- raise BaselineValidationError(
- "Invalid metrics baseline schema at "
- f"{path}: generator.name must be str",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
- if version is not None and not isinstance(version, str):
- raise BaselineValidationError(
- "Invalid metrics baseline schema at "
- f"{path}: generator.version must be str",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
- return name, version if isinstance(version, str) else None
-
- raise BaselineValidationError(
- f"Invalid metrics baseline schema at {path}: generator must be object or str",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
-
-
-def _require_embedded_clone_baseline_payload(
- payload: Mapping[str, Any],
- *,
- path: Path,
-) -> tuple[dict[str, Any], dict[str, Any]]:
- meta_obj = payload.get("meta")
- clones_obj = payload.get("clones")
- if not isinstance(meta_obj, dict):
- raise BaselineValidationError(
- f"Invalid baseline schema at {path}: 'meta' must be object",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
- if not isinstance(clones_obj, dict):
- raise BaselineValidationError(
- f"Invalid baseline schema at {path}: 'clones' must be object",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
- _require_str(meta_obj, "payload_sha256", path=path)
- _require_str(meta_obj, "python_tag", path=path)
- _require_str(meta_obj, "created_at", path=path)
- functions = clones_obj.get("functions")
- blocks = clones_obj.get("blocks")
- if not isinstance(functions, list) or not all(
- isinstance(item, str) for item in functions
- ):
- raise BaselineValidationError(
- f"Invalid baseline schema at {path}: 'clones.functions' must be list[str]",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
- if not isinstance(blocks, list) or not all(
- isinstance(item, str) for item in blocks
- ):
- raise BaselineValidationError(
- f"Invalid baseline schema at {path}: 'clones.blocks' must be list[str]",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
- return meta_obj, clones_obj
-
-
-def _resolve_embedded_schema_version(meta: Mapping[str, Any], *, path: Path) -> str:
- raw_version = _require_str(meta, "schema_version", path=path)
- parts = raw_version.split(".")
- if len(parts) not in {2, 3} or not all(part.isdigit() for part in parts):
- raise BaselineValidationError(
- "Invalid baseline schema at "
- f"{path}: 'schema_version' must be semver string",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
- major = int(parts[0])
- if major >= 2:
- return raw_version
- return BASELINE_SCHEMA_VERSION
-
-
-def _parse_snapshot(
- payload: Mapping[str, Any],
- *,
- path: Path,
-) -> MetricsSnapshot:
- grade = _require_str(payload, "health_grade", path=path)
- if grade not in {"A", "B", "C", "D", "F"}:
- raise BaselineValidationError(
- "Invalid metrics baseline schema at "
- f"{path}: 'health_grade' must be one of A/B/C/D/F",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
-
- return MetricsSnapshot(
- max_complexity=_require_int(payload, "max_complexity", path=path),
- high_risk_functions=tuple(
- sorted(set(_require_str_list(payload, "high_risk_functions", path=path)))
- ),
- max_coupling=_require_int(payload, "max_coupling", path=path),
- high_coupling_classes=tuple(
- sorted(set(_require_str_list(payload, "high_coupling_classes", path=path)))
- ),
- max_cohesion=_require_int(payload, "max_cohesion", path=path),
- low_cohesion_classes=tuple(
- sorted(set(_require_str_list(payload, "low_cohesion_classes", path=path)))
- ),
- dependency_cycles=_parse_cycles(payload, key="dependency_cycles", path=path),
- dependency_max_depth=_require_int(payload, "dependency_max_depth", path=path),
- dead_code_items=tuple(
- sorted(set(_require_str_list(payload, "dead_code_items", path=path)))
- ),
- health_score=_require_int(payload, "health_score", path=path),
- health_grade=cast("Literal['A', 'B', 'C', 'D', 'F']", grade),
- typing_param_permille=_optional_int(
- payload,
- "typing_param_permille",
- path=path,
- ),
- typing_return_permille=_optional_int(
- payload,
- "typing_return_permille",
- path=path,
- ),
- docstring_permille=_optional_int(payload, "docstring_permille", path=path),
- typing_any_count=_optional_int(payload, "typing_any_count", path=path),
- )
-
-
-def _optional_int(payload: Mapping[str, Any], key: str, *, path: Path) -> int:
- value = payload.get(key)
- if value is None:
- return 0
- return _require_int(payload, key, path=path)
-
-
-def _parse_api_surface_snapshot(
- payload: object,
- *,
- path: Path,
- root: Path | None = None,
-) -> ApiSurfaceSnapshot | None:
- if payload is None:
- return None
- if not isinstance(payload, dict):
- raise BaselineValidationError(
- f"Invalid metrics baseline schema at {path}: 'api_surface' must be object",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
- raw_modules = payload.get("modules", [])
- if not isinstance(raw_modules, list):
- raise BaselineValidationError(
- f"Invalid metrics baseline schema at {path}: "
- "'api_surface.modules' must be list",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
- modules: list[ModuleApiSurface] = []
- for raw_module in raw_modules:
- if not isinstance(raw_module, dict):
- raise BaselineValidationError(
- f"Invalid metrics baseline schema at {path}: "
- "api surface module must be object",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
- module = _require_str(raw_module, "module", path=path)
- wire_filepath = _require_str(raw_module, "filepath", path=path)
- filepath = runtime_filepath_from_wire(wire_filepath, root=root)
- all_declared = _require_str_list_or_none(raw_module, "all_declared", path=path)
- raw_symbols = raw_module.get("symbols", [])
- if not isinstance(raw_symbols, list):
- raise BaselineValidationError(
- f"Invalid metrics baseline schema at {path}: "
- "api surface symbols must be list",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
- symbols: list[PublicSymbol] = []
- for raw_symbol in raw_symbols:
- if not isinstance(raw_symbol, dict):
- raise BaselineValidationError(
- f"Invalid metrics baseline schema at {path}: "
- "api surface symbol must be object",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
- local_name = _optional_require_str(raw_symbol, "local_name", path=path)
- legacy_qualname = _optional_require_str(raw_symbol, "qualname", path=path)
- if local_name is None and legacy_qualname is None:
- raise BaselineValidationError(
- f"Invalid metrics baseline schema at {path}: "
- "api surface symbol requires 'local_name' or 'qualname'",
- status=MetricsBaselineStatus.MISSING_FIELDS,
- )
- if local_name is None:
- assert legacy_qualname is not None
- qualname = legacy_qualname
- else:
- qualname = _compose_api_surface_qualname(
- module=module,
- local_name=local_name,
- )
- kind = _require_str(raw_symbol, "kind", path=path)
- exported_via = _require_str(raw_symbol, "exported_via", path=path)
- params_raw = raw_symbol.get("params", [])
- if not isinstance(params_raw, list):
- raise BaselineValidationError(
- f"Invalid metrics baseline schema at {path}: "
- "api surface params must be list",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
- params: list[ApiParamSpec] = []
- for raw_param in params_raw:
- if not isinstance(raw_param, dict):
- raise BaselineValidationError(
- f"Invalid metrics baseline schema at {path}: "
- "api param must be object",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
- name = _require_str(raw_param, "name", path=path)
- param_kind = _require_str(raw_param, "kind", path=path)
- has_default = raw_param.get("has_default")
- annotation_hash = _optional_require_str(
- raw_param,
- "annotation_hash",
- path=path,
- )
- if not isinstance(has_default, bool):
- raise BaselineValidationError(
- f"Invalid metrics baseline schema at {path}: "
- "api param 'has_default' must be bool",
- status=MetricsBaselineStatus.INVALID_TYPE,
- )
- params.append(
- ApiParamSpec(
- name=name,
- kind=cast(
- (
- "Literal['pos_only', 'pos_or_kw', "
- "'vararg', 'kw_only', 'kwarg']"
- ),
- param_kind,
- ),
- has_default=has_default,
- annotation_hash=annotation_hash or "",
- )
- )
- symbols.append(
- PublicSymbol(
- qualname=qualname,
- kind=cast(
- "Literal['function', 'class', 'method', 'constant']",
- kind,
- ),
- start_line=_require_int(raw_symbol, "start_line", path=path),
- end_line=_require_int(raw_symbol, "end_line", path=path),
- params=tuple(params),
- returns_hash=_optional_require_str(
- raw_symbol,
- "returns_hash",
- path=path,
- )
- or "",
- exported_via=cast("Literal['all', 'name']", exported_via),
- )
- )
- modules.append(
- ModuleApiSurface(
- module=module,
- filepath=filepath,
- symbols=tuple(sorted(symbols, key=lambda item: item.qualname)),
- all_declared=tuple(all_declared) if all_declared is not None else None,
- )
- )
- return ApiSurfaceSnapshot(
- modules=tuple(sorted(modules, key=lambda item: (item.filepath, item.module)))
- )
-
-
-def _require_str_list_or_none(
- payload: Mapping[str, Any],
- key: str,
- *,
- path: Path,
-) -> list[str] | None:
- value = payload.get(key)
- if value is None:
- return None
- return _require_str_list(payload, key, path=path)
-
-
-def _api_surface_snapshot_payload(
- snapshot: ApiSurfaceSnapshot,
- *,
- root: Path | None = None,
- legacy_qualname: bool = False,
-) -> dict[str, object]:
- return {
- "modules": [
- {
- "module": module.module,
- "filepath": wire_filepath_from_runtime(module.filepath, root=root),
- "all_declared": list(module.all_declared or ()),
- "symbols": [
- {
- ("qualname" if legacy_qualname else "local_name"): (
- symbol.qualname
- if legacy_qualname
- else _local_name_from_qualname(
- module=module.module,
- qualname=symbol.qualname,
- )
- ),
- "kind": symbol.kind,
- "start_line": symbol.start_line,
- "end_line": symbol.end_line,
- "params": [
- {
- "name": param.name,
- "kind": param.kind,
- "has_default": param.has_default,
- "annotation_hash": param.annotation_hash,
- }
- for param in symbol.params
- ],
- "returns_hash": symbol.returns_hash,
- "exported_via": symbol.exported_via,
- }
- for symbol in sorted(
- module.symbols,
- key=lambda item: item.qualname,
- )
- ],
- }
- for module in sorted(
- snapshot.modules,
- key=lambda item: (item.filepath, item.module),
- )
- ]
- }
-
-
-def _compute_api_surface_payload_sha256(
- snapshot: ApiSurfaceSnapshot,
- *,
- root: Path | None = None,
-) -> str:
- canonical = _canonical_json(_api_surface_snapshot_payload(snapshot, root=root))
- return hashlib.sha256(canonical.encode("utf-8")).hexdigest()
-
-
-def _compute_legacy_api_surface_payload_sha256(
- snapshot: ApiSurfaceSnapshot,
- *,
- root: Path | None = None,
-) -> str:
- canonical = _canonical_json(
- _api_surface_snapshot_payload(snapshot, root=root, legacy_qualname=True)
- )
- return hashlib.sha256(canonical.encode("utf-8")).hexdigest()
-
-
-def _compose_api_surface_qualname(*, module: str, local_name: str) -> str:
- return f"{module}:{local_name}"
-
-
-def _local_name_from_qualname(*, module: str, qualname: str) -> str:
- prefix = f"{module}:"
- if qualname.startswith(prefix):
- return qualname[len(prefix) :]
- return qualname
-
-
-def _build_payload(
- *,
- snapshot: MetricsSnapshot,
- schema_version: str,
- python_tag: str,
- generator_name: str,
- generator_version: str,
- created_at: str,
- include_adoption: bool = True,
- api_surface_snapshot: ApiSurfaceSnapshot | None = None,
- api_surface_root: Path | None = None,
-) -> dict[str, Any]:
- payload_sha256 = _compute_payload_sha256(
- snapshot,
- include_adoption=include_adoption,
- )
- payload: dict[str, Any] = {
- "meta": {
- "generator": {
- "name": generator_name,
- "version": generator_version,
- },
- "schema_version": schema_version,
- "python_tag": python_tag,
- "created_at": created_at,
- "payload_sha256": payload_sha256,
- },
- "metrics": _snapshot_payload(
- snapshot,
- include_adoption=include_adoption,
- ),
- }
- if api_surface_snapshot is not None:
- payload["meta"][_API_SURFACE_PAYLOAD_SHA256_KEY] = (
- _compute_api_surface_payload_sha256(
- api_surface_snapshot,
- root=api_surface_root,
- )
- )
- payload["api_surface"] = _api_surface_snapshot_payload(
- api_surface_snapshot,
- root=api_surface_root,
- )
- return payload
-
-
-__all__ = [
- "BASELINE_SCHEMA_VERSION",
- "MAX_METRICS_BASELINE_SIZE_BYTES",
- "METRICS_BASELINE_GENERATOR",
- "METRICS_BASELINE_SCHEMA_VERSION",
- "METRICS_BASELINE_UNTRUSTED_STATUSES",
- "MetricsBaseline",
- "MetricsBaselineStatus",
- "coerce_metrics_baseline_status",
- "current_python_tag",
- "snapshot_from_project_metrics",
-]
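The deleted `metrics_baseline.py` anchored its integrity checks on a canonical-JSON SHA-256 (`orjson` with `OPT_SORT_KEYS`) compared via `hmac.compare_digest`. A self-contained sketch of that pattern using only the standard library — note that `json.dumps(sort_keys=True)` is a stand-in for orjson's canonical form, so the two encoders do not produce byte-identical hashes:

```python
import hashlib
import hmac
import json


def payload_sha256(payload: dict[str, object]) -> str:
    # Canonical form: sorted keys, fixed separators, UTF-8 bytes.
    canonical = json.dumps(payload, sort_keys=True, separators=(",", ":"))
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()


def verify(payload: dict[str, object], expected_hex: str) -> bool:
    # Constant-time comparison, mirroring the deleted verify_integrity().
    return hmac.compare_digest(payload_sha256(payload), expected_hex)
```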
diff --git a/codeclone/models.py b/codeclone/models.py
index 4814fc1..26cd95e 100644
--- a/codeclone/models.py
+++ b/codeclone/models.py
@@ -93,6 +93,8 @@ class DepGraph:
edges: tuple[ModuleDep, ...]
cycles: tuple[tuple[str, ...], ...]
max_depth: int
+ avg_depth: float
+ p95_depth: int
longest_chains: tuple[tuple[str, ...], ...]
@@ -117,6 +119,42 @@ class DeadCandidate:
suppressed_rules: tuple[str, ...] = field(default_factory=tuple)
+SecuritySurfaceCategory = Literal[
+ "archive_extraction",
+ "crypto_transport",
+ "database_boundary",
+ "deserialization",
+ "dynamic_execution",
+ "dynamic_loading",
+ "filesystem_mutation",
+ "identity_token",
+ "network_boundary",
+ "process_boundary",
+]
+SecuritySurfaceLocationScope = Literal["module", "class", "callable"]
+SecuritySurfaceClassificationMode = Literal[
+ "exact_builtin",
+ "exact_call",
+ "exact_import",
+]
+SecuritySurfaceEvidenceKind = Literal["builtin", "call", "import"]
+
+
+@dataclass(frozen=True, slots=True)
+class SecuritySurface:
+ category: SecuritySurfaceCategory
+ capability: str
+ module: str
+ filepath: str
+ qualname: str
+ start_line: int
+ end_line: int
+ location_scope: SecuritySurfaceLocationScope
+ classification_mode: SecuritySurfaceClassificationMode
+ evidence_kind: SecuritySurfaceEvidenceKind
+ evidence_symbol: str
+
+
@dataclass(frozen=True, slots=True)
class FileMetrics:
class_metrics: tuple[ClassMetrics, ...]
@@ -125,6 +163,7 @@ class FileMetrics:
referenced_names: frozenset[str]
import_names: frozenset[str]
class_names: frozenset[str]
+ security_surfaces: tuple[SecuritySurface, ...] = ()
referenced_qualnames: frozenset[str] = field(default_factory=frozenset)
typing_coverage: ModuleTypingCoverage | None = None
docstring_coverage: ModuleDocstringCoverage | None = None
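`DepGraph` gains `avg_depth` and `p95_depth` next to the existing `max_depth`. The diff does not show how `build_dep_graph` derives them; a nearest-rank percentile over per-module chain depths is one plausible reading, sketched below purely for illustration:

```python
import math


def depth_summary(depths: list[int]) -> tuple[int, float, int]:
    """Return (max_depth, avg_depth, p95_depth) for a list of chain depths."""
    if not depths:
        return 0, 0.0, 0
    ordered = sorted(depths)
    avg = sum(ordered) / len(ordered)
    # Nearest-rank p95: smallest depth with >= 95% of samples at or below it.
    rank = max(1, math.ceil(0.95 * len(ordered)))
    return ordered[-1], avg, ordered[rank - 1]
```

For example, `depth_summary([1, 2, 2, 3, 7])` returns `(7, 3.0, 7)`.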
diff --git a/codeclone/paths.py b/codeclone/paths/__init__.py
similarity index 98%
rename from codeclone/paths.py
rename to codeclone/paths/__init__.py
index d93428f..f12522a 100644
--- a/codeclone/paths.py
+++ b/codeclone/paths/__init__.py
@@ -8,7 +8,7 @@
from pathlib import Path
-from .domain.source_scope import (
+from ..domain.source_scope import (
SOURCE_KIND_FIXTURES,
SOURCE_KIND_OTHER,
SOURCE_KIND_PRODUCTION,
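The rename turns the module `codeclone/paths.py` into the package `codeclone/paths/`, which moves the code one level deeper in the package tree — hence the single-dot relative import becoming two dots. Shown as comments, since relative imports only resolve inside a package:

```python
# codeclone/paths.py           ->  from .domain...  reaches codeclone.domain
# codeclone/paths/__init__.py  ->  from .domain...  would now mean
#                                  codeclone.paths.domain, so the sibling
#                                  package requires the parent-relative form:
# from ..domain.source_scope import SOURCE_KIND_PRODUCTION
```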
diff --git a/codeclone/pipeline.py b/codeclone/pipeline.py
deleted file mode 100644
index 50d2c58..0000000
--- a/codeclone/pipeline.py
+++ /dev/null
@@ -1,2773 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# SPDX-License-Identifier: MPL-2.0
-# Copyright (c) 2026 Den Rozhnovskiy
-
-from __future__ import annotations
-
-import inspect
-import os
-from collections.abc import Mapping
-from concurrent.futures import ProcessPoolExecutor, as_completed
-from dataclasses import dataclass
-from hashlib import sha256
-from pathlib import Path
-from typing import TYPE_CHECKING, Literal, cast
-
-import orjson
-
-from ._coerce import as_int, as_str
-from .cache import (
- ApiParamSpecDict,
- Cache,
- CacheEntry,
- ClassMetricsDict,
- DeadCandidateDict,
- FileStat,
- ModuleDepDict,
- PublicSymbolDict,
- SegmentReportProjection,
- SourceStatsDict,
- StructuralFindingGroupDict,
- file_stat_signature,
-)
-from .contracts import ExitCode
-from .domain.findings import CATEGORY_COHESION, CATEGORY_COMPLEXITY, CATEGORY_COUPLING
-from .domain.quality import CONFIDENCE_HIGH, RISK_HIGH, RISK_LOW
-from .extractor import extract_units_and_stats_from_source
-from .golden_fixtures import (
- build_suppressed_clone_groups,
- split_clone_groups_for_golden_fixtures,
-)
-from .grouping import build_block_groups, build_groups, build_segment_groups
-from .metrics import (
- CoverageJoinParseError,
- HealthInputs,
- build_coverage_join,
- build_dep_graph,
- build_overloaded_modules_payload,
- compute_health,
- find_suppressed_unused,
- find_unused,
-)
-from .models import (
- ApiBreakingChange,
- ApiParamSpec,
- ApiSurfaceSnapshot,
- BlockUnit,
- ClassMetrics,
- CoverageJoinResult,
- DeadCandidate,
- DeadItem,
- DepGraph,
- FileMetrics,
- GroupItem,
- GroupItemLike,
- GroupMap,
- MetricsDiff,
- ModuleApiSurface,
- ModuleDep,
- ModuleDocstringCoverage,
- ModuleTypingCoverage,
- ProjectMetrics,
- PublicSymbol,
- SegmentUnit,
- StructuralFindingGroup,
- StructuralFindingOccurrence,
- Suggestion,
- SuppressedCloneGroup,
- Unit,
-)
-from .normalize import NormalizationConfig
-from .paths import is_test_filepath
-from .report.blocks import prepare_block_report_groups
-from .report.explain import build_block_group_facts
-from .report.json_contract import build_report_document
-from .report.segments import prepare_segment_report_groups
-from .report.serialize import render_json_report_document, render_text_report_document
-from .report.suggestions import generate_suggestions
-from .scanner import iter_py_files, module_name_from_path
-from .structural_findings import build_clone_cohort_structural_findings
-from .suppressions import DEAD_CODE_RULE_ID, INLINE_CODECLONE_SUPPRESSION_SOURCE
-
-if TYPE_CHECKING:
- from argparse import Namespace
-    from collections.abc import Callable, Collection, Sequence
-
-MAX_FILE_SIZE = 10 * 1024 * 1024
-DEFAULT_BATCH_SIZE = 100
-PARALLEL_MIN_FILES_PER_WORKER = 8
-PARALLEL_MIN_FILES_FLOOR = 16
-DEFAULT_RUNTIME_PROCESSES = 4
-
-_as_int = as_int
-_as_str = as_str
-
-
-@dataclass(frozen=True, slots=True)
-class OutputPaths:
- html: Path | None = None
- json: Path | None = None
- text: Path | None = None
- md: Path | None = None
- sarif: Path | None = None
-
-
-@dataclass(frozen=True, slots=True)
-class BootstrapResult:
- root: Path
- config: NormalizationConfig
- args: Namespace
- output_paths: OutputPaths
- cache_path: Path
-
-
-@dataclass(frozen=True, slots=True)
-class DiscoveryResult:
- files_found: int
- cache_hits: int
- files_skipped: int
- all_file_paths: tuple[str, ...]
- cached_units: tuple[GroupItem, ...]
- cached_blocks: tuple[GroupItem, ...]
- cached_segments: tuple[GroupItem, ...]
- cached_class_metrics: tuple[ClassMetrics, ...]
- cached_module_deps: tuple[ModuleDep, ...]
- cached_dead_candidates: tuple[DeadCandidate, ...]
- cached_referenced_names: frozenset[str]
- files_to_process: tuple[str, ...]
- skipped_warnings: tuple[str, ...]
- cached_referenced_qualnames: frozenset[str] = frozenset()
- cached_typing_modules: tuple[ModuleTypingCoverage, ...] = ()
- cached_docstring_modules: tuple[ModuleDocstringCoverage, ...] = ()
- cached_api_modules: tuple[ModuleApiSurface, ...] = ()
- cached_structural_findings: tuple[StructuralFindingGroup, ...] = ()
- cached_segment_report_projection: SegmentReportProjection | None = None
- cached_lines: int = 0
- cached_functions: int = 0
- cached_methods: int = 0
- cached_classes: int = 0
- cached_source_stats_by_file: tuple[tuple[str, int, int, int, int], ...] = ()
-
-
-@dataclass(frozen=True, slots=True)
-class FileProcessResult:
- filepath: str
- success: bool
- error: str | None = None
- units: list[Unit] | None = None
- blocks: list[BlockUnit] | None = None
- segments: list[SegmentUnit] | None = None
- lines: int = 0
- functions: int = 0
- methods: int = 0
- classes: int = 0
- stat: FileStat | None = None
- error_kind: str | None = None
- file_metrics: FileMetrics | None = None
- structural_findings: list[StructuralFindingGroup] | None = None
-
-
-@dataclass(frozen=True, slots=True)
-class ProcessingResult:
- units: tuple[GroupItem, ...]
- blocks: tuple[GroupItem, ...]
- segments: tuple[GroupItem, ...]
- class_metrics: tuple[ClassMetrics, ...]
- module_deps: tuple[ModuleDep, ...]
- dead_candidates: tuple[DeadCandidate, ...]
- referenced_names: frozenset[str]
- files_analyzed: int
- files_skipped: int
- analyzed_lines: int
- analyzed_functions: int
- analyzed_methods: int
- analyzed_classes: int
- failed_files: tuple[str, ...]
- source_read_failures: tuple[str, ...]
- referenced_qualnames: frozenset[str] = frozenset()
- typing_modules: tuple[ModuleTypingCoverage, ...] = ()
- docstring_modules: tuple[ModuleDocstringCoverage, ...] = ()
- api_modules: tuple[ModuleApiSurface, ...] = ()
- structural_findings: tuple[StructuralFindingGroup, ...] = ()
- source_stats_by_file: tuple[tuple[str, int, int, int, int], ...] = ()
-
-
-@dataclass(frozen=True, slots=True)
-class AnalysisResult:
- func_groups: GroupMap
- block_groups: GroupMap
- block_groups_report: GroupMap
- segment_groups: GroupMap
- suppressed_segment_groups: int
- block_group_facts: dict[str, dict[str, str]]
- func_clones_count: int
- block_clones_count: int
- segment_clones_count: int
- files_analyzed_or_cached: int
- project_metrics: ProjectMetrics | None
- metrics_payload: dict[str, object] | None
- suggestions: tuple[Suggestion, ...]
- segment_groups_raw_digest: str
- suppressed_clone_groups: tuple[SuppressedCloneGroup, ...] = ()
- coverage_join: CoverageJoinResult | None = None
- suppressed_dead_code_items: int = 0
- structural_findings: tuple[StructuralFindingGroup, ...] = ()
-
-
-@dataclass(frozen=True, slots=True)
-class GatingResult:
- exit_code: int
- reasons: tuple[str, ...]
-
-
-@dataclass(frozen=True, slots=True)
-class ReportArtifacts:
- html: str | None = None
- json: str | None = None
- text: str | None = None
- md: str | None = None
- sarif: str | None = None
- report_document: dict[str, object] | None = None
-
-
-@dataclass(frozen=True, slots=True)
-class MetricGateConfig:
- fail_complexity: int
- fail_coupling: int
- fail_cohesion: int
- fail_cycles: bool
- fail_dead_code: bool
- fail_health: int
- fail_on_new_metrics: bool
- fail_on_typing_regression: bool = False
- fail_on_docstring_regression: bool = False
- fail_on_api_break: bool = False
- fail_on_untested_hotspots: bool = False
- min_typing_coverage: int = -1
- min_docstring_coverage: int = -1
- coverage_min: int = 50
-
-
-def _as_sorted_str_tuple(value: object) -> tuple[str, ...]:
- if not isinstance(value, list):
- return ()
- return tuple(sorted({item for item in value if isinstance(item, str) and item}))
-
-
-def _group_item_sort_key(item: GroupItemLike) -> tuple[str, int, int, str]:
- return (
- _as_str(item.get("filepath")),
- _as_int(item.get("start_line")),
- _as_int(item.get("end_line")),
- _as_str(item.get("qualname")),
- )
-
-
-def _segment_projection_item_sort_key(item: GroupItemLike) -> tuple[str, str, int, int]:
- return (
- _as_str(item.get("filepath")),
- _as_str(item.get("qualname")),
- _as_int(item.get("start_line")),
- _as_int(item.get("end_line")),
- )
-
-
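-# Content digest of the raw segment groups: items are sorted within each group,
-# groups are sorted by key, then the whole structure is serialized with orjson
-# (sorted keys) and hashed with sha256, so the digest is stable across runs and
-# process orderings.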
-def _segment_groups_digest(segment_groups: GroupMap) -> str:
- normalized_rows: list[
- tuple[str, tuple[tuple[str, str, int, int, int, str, str], ...]]
- ] = []
- for group_key in sorted(segment_groups):
- items = sorted(segment_groups[group_key], key=_segment_projection_item_sort_key)
- normalized_items: list[tuple[str, str, int, int, int, str, str]] = [
- (
- _as_str(item.get("filepath")),
- _as_str(item.get("qualname")),
- _as_int(item.get("start_line")),
- _as_int(item.get("end_line")),
- _as_int(item.get("size")),
- _as_str(item.get("segment_hash")),
- _as_str(item.get("segment_sig")),
- )
- for item in items
- ]
- normalized_rows.append((group_key, tuple(normalized_items)))
- payload = orjson.dumps(tuple(normalized_rows), option=orjson.OPT_SORT_KEYS)
- return sha256(payload).hexdigest()
-
-
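-# Shape-check a cached segment report projection; any structural mismatch
-# returns None so callers fall back to recomputing the projection.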
-def _coerce_segment_report_projection(
- value: object,
-) -> SegmentReportProjection | None:
- if not isinstance(value, dict):
- return None
- digest = value.get("digest")
- suppressed = value.get("suppressed")
- groups = value.get("groups")
- if (
- not isinstance(digest, str)
- or not isinstance(suppressed, int)
- or not isinstance(groups, dict)
- ):
- return None
- if not all(
- isinstance(group_key, str) and isinstance(items, list)
- for group_key, items in groups.items()
- ):
- return None
- return cast("SegmentReportProjection", value)
-
-
-def _module_dep_sort_key(dep: ModuleDep) -> tuple[str, str, str, int]:
- return dep.source, dep.target, dep.import_type, dep.line
-
-
-def _class_metric_sort_key(metric: ClassMetrics) -> tuple[str, int, int, str]:
- return metric.filepath, metric.start_line, metric.end_line, metric.qualname
-
-
-def _dead_candidate_sort_key(item: DeadCandidate) -> tuple[str, int, int, str]:
- return item.filepath, item.start_line, item.end_line, item.qualname
-
-
-def _unit_to_group_item(unit: Unit) -> GroupItem:
- return {
- "qualname": unit.qualname,
- "filepath": unit.filepath,
- "start_line": unit.start_line,
- "end_line": unit.end_line,
- "loc": unit.loc,
- "stmt_count": unit.stmt_count,
- "fingerprint": unit.fingerprint,
- "loc_bucket": unit.loc_bucket,
- "cyclomatic_complexity": unit.cyclomatic_complexity,
- "nesting_depth": unit.nesting_depth,
- "risk": unit.risk,
- "raw_hash": unit.raw_hash,
- "entry_guard_count": unit.entry_guard_count,
- "entry_guard_terminal_profile": unit.entry_guard_terminal_profile,
- "entry_guard_has_side_effect_before": unit.entry_guard_has_side_effect_before,
- "terminal_kind": unit.terminal_kind,
- "try_finally_profile": unit.try_finally_profile,
- "side_effect_order_profile": unit.side_effect_order_profile,
- }
-
-
-def _block_to_group_item(block: BlockUnit) -> GroupItem:
- return {
- "block_hash": block.block_hash,
- "filepath": block.filepath,
- "qualname": block.qualname,
- "start_line": block.start_line,
- "end_line": block.end_line,
- "size": block.size,
- }
-
-
-def _segment_to_group_item(segment: SegmentUnit) -> GroupItem:
- return {
- "segment_hash": segment.segment_hash,
- "segment_sig": segment.segment_sig,
- "filepath": segment.filepath,
- "qualname": segment.qualname,
- "start_line": segment.start_line,
- "end_line": segment.end_line,
- "size": segment.size,
- }
-
-
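-# A process pool only pays for its startup cost with enough work per worker;
-# below this per-run minimum, _should_use_parallel() keeps execution sequential.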
-def _parallel_min_files(processes: int) -> int:
- return max(PARALLEL_MIN_FILES_FLOOR, processes * PARALLEL_MIN_FILES_PER_WORKER)
-
-
-def _resolve_process_count(processes: object) -> int:
- if processes is None:
- return DEFAULT_RUNTIME_PROCESSES
- return max(1, _as_int(processes, DEFAULT_RUNTIME_PROCESSES))
-
-
-def _should_collect_structural_findings(output_paths: OutputPaths) -> bool:
- return any(
- path is not None
- for path in (
- output_paths.html,
- output_paths.json,
- output_paths.md,
- output_paths.sarif,
- output_paths.text,
- )
- )
-
-
-def _should_use_parallel(files_count: int, processes: int) -> bool:
- if processes <= 1:
- return False
- return files_count >= _parallel_min_files(processes)
-
-
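-# The buffer tuple below is positional: its order must stay in sync with the
-# unpacking at the top of discover().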
-def _new_discovery_buffers() -> tuple[
- list[GroupItem],
- list[GroupItem],
- list[GroupItem],
- list[ClassMetrics],
- list[ModuleDep],
- list[DeadCandidate],
- set[str],
- set[str],
- list[ModuleTypingCoverage],
- list[ModuleDocstringCoverage],
- list[ModuleApiSurface],
- list[str],
- list[str],
-]:
- return [], [], [], [], [], [], set(), set(), [], [], [], [], []
-
-
-def _decode_cached_structural_finding_group(
- group_dict: StructuralFindingGroupDict,
- filepath: str,
-) -> StructuralFindingGroup:
- """Convert a StructuralFindingGroupDict (from cache) to a StructuralFindingGroup."""
- finding_kind = group_dict["finding_kind"]
- finding_key = group_dict["finding_key"]
- signature = group_dict["signature"]
- items = tuple(
- StructuralFindingOccurrence(
- finding_kind=finding_kind,
- finding_key=finding_key,
- file_path=filepath,
- qualname=item["qualname"],
- start=item["start"],
- end=item["end"],
- signature=signature,
- )
- for item in group_dict["items"]
- )
- return StructuralFindingGroup(
- finding_kind=finding_kind,
- finding_key=finding_key,
- signature=signature,
- items=items,
- )
-
-
-def bootstrap(
- *,
- args: Namespace,
- root: Path,
- output_paths: OutputPaths,
- cache_path: Path,
-) -> BootstrapResult:
- return BootstrapResult(
- root=root,
- config=NormalizationConfig(),
- args=args,
- output_paths=output_paths,
- cache_path=cache_path,
- )
-
-
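-# Unset or blank values mean "not configured". Relative paths resolve against
-# the project root; if resolve() fails, fall back to a plain absolute path.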
-def _resolve_optional_runtime_path(value: object, *, root: Path) -> Path | None:
- text = str(value).strip() if value is not None else ""
- if not text:
- return None
- candidate = Path(text).expanduser()
- resolved = candidate if candidate.is_absolute() else root / candidate
- try:
- return resolved.resolve()
- except OSError:
- return resolved.absolute()
-
-
-def _cache_entry_has_metrics(entry: CacheEntry) -> bool:
- metric_keys = (
- "class_metrics",
- "module_deps",
- "dead_candidates",
- "referenced_names",
- "referenced_qualnames",
- "import_names",
- "class_names",
- )
- return all(key in entry and isinstance(entry.get(key), list) for key in metric_keys)
-
-
-def _cache_entry_has_structural_findings(entry: CacheEntry) -> bool:
- return "structural_findings" in entry
-
-
-def _cache_entry_source_stats(entry: CacheEntry) -> tuple[int, int, int, int] | None:
- stats_obj = entry.get("source_stats")
- if not isinstance(stats_obj, dict):
- return None
- lines = stats_obj.get("lines")
- functions = stats_obj.get("functions")
- methods = stats_obj.get("methods")
- classes = stats_obj.get("classes")
- if not (
- isinstance(lines, int)
- and isinstance(functions, int)
- and isinstance(methods, int)
- and isinstance(classes, int)
- and lines >= 0
- and functions >= 0
- and methods >= 0
- and classes >= 0
- ):
- return None
- return lines, functions, methods, classes
-
-
-def _usable_cached_source_stats(
- entry: CacheEntry,
- *,
- skip_metrics: bool,
- collect_structural_findings: bool,
-) -> tuple[int, int, int, int] | None:
- if not skip_metrics and not _cache_entry_has_metrics(entry):
- return None
- if collect_structural_findings and not _cache_entry_has_structural_findings(entry):
- return None
- return _cache_entry_source_stats(entry)
-
-
-def _cache_dict_module_fields(
- value: object,
-) -> tuple[Mapping[str, object], str, str] | None:
- if not isinstance(value, dict):
- return None
- row = cast("Mapping[str, object]", value)
- module = row.get("module")
- filepath = row.get("filepath")
- if not isinstance(module, str) or not isinstance(filepath, str):
- return None
- return row, module, filepath
-
-
-def _cache_dict_int_fields(
- row: Mapping[str, object],
- *keys: str,
-) -> tuple[int, ...] | None:
- values: list[int] = []
- for key in keys:
- value = row.get(key)
- if not isinstance(value, int):
- return None
- values.append(value)
- return tuple(values)
-
-
-def _typing_coverage_from_cache_dict(
- value: object,
-) -> ModuleTypingCoverage | None:
- row_info = _cache_dict_module_fields(value)
- if row_info is None:
- return None
- row, module, filepath = row_info
- int_fields = _cache_dict_int_fields(
- row,
- "callable_count",
- "params_total",
- "params_annotated",
- "returns_total",
- "returns_annotated",
- "any_annotation_count",
- )
- if int_fields is None:
- return None
- (
- callable_count,
- params_total,
- params_annotated,
- returns_total,
- returns_annotated,
- any_annotation_count,
- ) = int_fields
- return ModuleTypingCoverage(
- module=module,
- filepath=filepath,
- callable_count=callable_count,
- params_total=params_total,
- params_annotated=params_annotated,
- returns_total=returns_total,
- returns_annotated=returns_annotated,
- any_annotation_count=any_annotation_count,
- )
-
-
-def _docstring_coverage_from_cache_dict(
- value: object,
-) -> ModuleDocstringCoverage | None:
- row_info = _cache_dict_module_fields(value)
- if row_info is None:
- return None
- row, module, filepath = row_info
- totals = _cache_dict_int_fields(
- row,
- "public_symbol_total",
- "public_symbol_documented",
- )
- if totals is None:
- return None
- public_symbol_total, public_symbol_documented = totals
- return ModuleDocstringCoverage(
- module=module,
- filepath=filepath,
- public_symbol_total=public_symbol_total,
- public_symbol_documented=public_symbol_documented,
- )
-
-
-def _api_param_spec_from_cache_dict(value: ApiParamSpecDict) -> ApiParamSpec | None:
- name = value.get("name")
- kind = value.get("kind")
- has_default = value.get("has_default")
- annotation_hash = value.get("annotation_hash", "")
- if (
- not isinstance(name, str)
- or not isinstance(kind, str)
- or not isinstance(has_default, bool)
- or not isinstance(annotation_hash, str)
- ):
- return None
- return ApiParamSpec(
- name=name,
- kind=cast(
- "Literal['pos_only', 'pos_or_kw', 'vararg', 'kw_only', 'kwarg']",
- kind,
- ),
- has_default=has_default,
- annotation_hash=annotation_hash,
- )
-
-
-def _public_symbol_from_cache_dict(
- value: PublicSymbolDict,
-) -> PublicSymbol | None:
- qualname = value.get("qualname")
- kind = value.get("kind")
- start_line = value.get("start_line")
- end_line = value.get("end_line")
- exported_via = value.get("exported_via", "name")
- returns_hash = value.get("returns_hash", "")
- params_raw = value.get("params", [])
- if (
- not isinstance(qualname, str)
- or not isinstance(kind, str)
- or not isinstance(start_line, int)
- or not isinstance(end_line, int)
- or not isinstance(exported_via, str)
- or not isinstance(returns_hash, str)
- or not isinstance(params_raw, list)
- ):
- return None
- params = []
- for param in params_raw:
- if not isinstance(param, dict):
- return None
- parsed = _api_param_spec_from_cache_dict(param)
- if parsed is None:
- return None
- params.append(parsed)
- return PublicSymbol(
- qualname=qualname,
- kind=cast("Literal['function', 'class', 'method', 'constant']", kind),
- start_line=start_line,
- end_line=end_line,
- params=tuple(params),
- returns_hash=returns_hash,
- exported_via=cast("Literal['all', 'name']", exported_via),
- )
-
-
-def _api_surface_from_cache_dict(value: object) -> ModuleApiSurface | None:
- row_info = _cache_dict_module_fields(value)
- if row_info is None:
- return None
- row, module, filepath = row_info
- all_declared_raw = row.get("all_declared", [])
- symbols_raw = row.get("symbols", [])
- if (
- not isinstance(all_declared_raw, list)
- or not isinstance(symbols_raw, list)
- or not all(isinstance(item, str) for item in all_declared_raw)
- ):
- return None
- symbols: list[PublicSymbol] = []
- for item in symbols_raw:
- if not isinstance(item, dict):
- return None
- parsed = _public_symbol_from_cache_dict(cast("PublicSymbolDict", item))
- if parsed is None:
- return None
- symbols.append(parsed)
- return ModuleApiSurface(
- module=module,
- filepath=filepath,
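-        # A missing and an empty __all__ both normalize to None here.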
- all_declared=tuple(sorted(set(all_declared_raw))) or None,
- symbols=tuple(sorted(symbols, key=lambda item: item.qualname)),
- )
-
-
-def _load_cached_metrics_extended(
- entry: CacheEntry,
- *,
- filepath: str,
-) -> tuple[
- tuple[ClassMetrics, ...],
- tuple[ModuleDep, ...],
- tuple[DeadCandidate, ...],
- frozenset[str],
- frozenset[str],
- ModuleTypingCoverage | None,
- ModuleDocstringCoverage | None,
- ModuleApiSurface | None,
-]:
- class_metrics_rows: list[ClassMetricsDict] = entry.get("class_metrics", [])
- class_metrics = tuple(
- ClassMetrics(
- qualname=row["qualname"],
- filepath=row["filepath"],
- start_line=row["start_line"],
- end_line=row["end_line"],
- cbo=row["cbo"],
- lcom4=row["lcom4"],
- method_count=row["method_count"],
- instance_var_count=row["instance_var_count"],
- risk_coupling=cast(
- "Literal['low', 'medium', 'high']",
- row["risk_coupling"],
- ),
- risk_cohesion=cast(
- "Literal['low', 'medium', 'high']",
- row["risk_cohesion"],
- ),
- coupled_classes=_as_sorted_str_tuple(row.get("coupled_classes", [])),
- )
- for row in class_metrics_rows
- if row.get("qualname") and row.get("filepath")
- )
-
- module_dep_rows: list[ModuleDepDict] = entry.get("module_deps", [])
- module_deps = tuple(
- ModuleDep(
- source=row["source"],
- target=row["target"],
- import_type=cast("Literal['import', 'from_import']", row["import_type"]),
- line=row["line"],
- )
- for row in module_dep_rows
- if row.get("source") and row.get("target")
- )
-
- dead_rows: list[DeadCandidateDict] = entry.get("dead_candidates", [])
- dead_candidates = tuple(
- DeadCandidate(
- qualname=row["qualname"],
- local_name=row["local_name"],
- filepath=row["filepath"],
- start_line=row["start_line"],
- end_line=row["end_line"],
- kind=cast(
- "Literal['function', 'class', 'method', 'import']",
- row["kind"],
- ),
- suppressed_rules=tuple(sorted(set(row.get("suppressed_rules", [])))),
- )
- for row in dead_rows
- if row.get("qualname") and row.get("local_name") and row.get("filepath")
- )
-
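-    # References that only occur in test files are dropped so that test-only
-    # usage does not keep otherwise-dead production code alive.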
- referenced_names = (
- frozenset()
- if is_test_filepath(filepath)
- else frozenset(entry.get("referenced_names", []))
- )
- referenced_qualnames = (
- frozenset()
- if is_test_filepath(filepath)
- else frozenset(entry.get("referenced_qualnames", []))
- )
- typing_coverage = _typing_coverage_from_cache_dict(entry.get("typing_coverage"))
- docstring_coverage = _docstring_coverage_from_cache_dict(
- entry.get("docstring_coverage")
- )
- api_surface = _api_surface_from_cache_dict(entry.get("api_surface"))
- return (
- class_metrics,
- module_deps,
- dead_candidates,
- referenced_names,
- referenced_qualnames,
- typing_coverage,
- docstring_coverage,
- api_surface,
- )
-
-
-def discover(*, boot: BootstrapResult, cache: Cache) -> DiscoveryResult:
- files_found = 0
- cache_hits = 0
- files_skipped = 0
- collect_structural_findings = _should_collect_structural_findings(boot.output_paths)
- cached_segment_projection = _coerce_segment_report_projection(
- getattr(cache, "segment_report_projection", None)
- )
-
- (
- cached_units,
- cached_blocks,
- cached_segments,
- cached_class_metrics,
- cached_module_deps,
- cached_dead_candidates,
- cached_referenced_names,
- cached_referenced_qualnames,
- cached_typing_modules,
- cached_docstring_modules,
- cached_api_modules,
- files_to_process,
- skipped_warnings,
- ) = _new_discovery_buffers()
- cached_sf: list[StructuralFindingGroup] = []
- cached_source_stats_by_file: list[tuple[str, int, int, int, int]] = []
- cached_lines = 0
- cached_functions = 0
- cached_methods = 0
- cached_classes = 0
- all_file_paths: list[str] = []
-
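-    # Single pass over the tree: each file either serves its cached rows (stat
-    # signature unchanged and the entry carries everything this run needs) or
-    # is queued for reprocessing.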
- for filepath in iter_py_files(str(boot.root)):
- files_found += 1
- all_file_paths.append(filepath)
- try:
- stat = file_stat_signature(filepath)
- except OSError as exc:
- files_skipped += 1
- skipped_warnings.append(f"{filepath}: {exc}")
- continue
-
- cached = cache.get_file_entry(filepath)
- if cached and cached.get("stat") == stat:
- cached_source_stats = _usable_cached_source_stats(
- cached,
- skip_metrics=boot.args.skip_metrics,
- collect_structural_findings=collect_structural_findings,
- )
- if cached_source_stats is None:
- files_to_process.append(filepath)
- continue
-
- cache_hits += 1
- lines, functions, methods, classes = cached_source_stats
- cached_lines += lines
- cached_functions += functions
- cached_methods += methods
- cached_classes += classes
- cached_source_stats_by_file.append(
- (filepath, lines, functions, methods, classes)
- )
- cached_units.extend(cast("list[GroupItem]", cast(object, cached["units"])))
- cached_blocks.extend(
- cast("list[GroupItem]", cast(object, cached["blocks"]))
- )
- cached_segments.extend(
- cast("list[GroupItem]", cast(object, cached["segments"]))
- )
-
- if not boot.args.skip_metrics:
- (
- class_metrics,
- module_deps,
- dead_candidates,
- referenced_names,
- referenced_qualnames,
- typing_coverage,
- docstring_coverage,
- api_surface,
- ) = _load_cached_metrics_extended(cached, filepath=filepath)
- cached_class_metrics.extend(class_metrics)
- cached_module_deps.extend(module_deps)
- cached_dead_candidates.extend(dead_candidates)
- cached_referenced_names.update(referenced_names)
- cached_referenced_qualnames.update(referenced_qualnames)
- if typing_coverage is not None:
- cached_typing_modules.append(typing_coverage)
- if docstring_coverage is not None:
- cached_docstring_modules.append(docstring_coverage)
- if api_surface is not None:
- cached_api_modules.append(api_surface)
- if collect_structural_findings:
- cached_sf.extend(
- _decode_cached_structural_finding_group(group_dict, filepath)
- for group_dict in cached.get("structural_findings") or []
- )
- continue
-
- files_to_process.append(filepath)
-
- return DiscoveryResult(
- files_found=files_found,
- cache_hits=cache_hits,
- files_skipped=files_skipped,
- all_file_paths=tuple(all_file_paths),
- cached_units=tuple(sorted(cached_units, key=_group_item_sort_key)),
- cached_blocks=tuple(sorted(cached_blocks, key=_group_item_sort_key)),
- cached_segments=tuple(sorted(cached_segments, key=_group_item_sort_key)),
- cached_class_metrics=tuple(
- sorted(cached_class_metrics, key=_class_metric_sort_key)
- ),
- cached_module_deps=tuple(sorted(cached_module_deps, key=_module_dep_sort_key)),
- cached_dead_candidates=tuple(
- sorted(cached_dead_candidates, key=_dead_candidate_sort_key)
- ),
- cached_referenced_names=frozenset(cached_referenced_names),
- cached_referenced_qualnames=frozenset(cached_referenced_qualnames),
- cached_typing_modules=tuple(
- sorted(cached_typing_modules, key=lambda item: (item.filepath, item.module))
- ),
- cached_docstring_modules=tuple(
- sorted(
- cached_docstring_modules,
- key=lambda item: (item.filepath, item.module),
- )
- ),
- cached_api_modules=tuple(
- sorted(cached_api_modules, key=lambda item: (item.filepath, item.module))
- ),
- files_to_process=tuple(files_to_process),
- skipped_warnings=tuple(sorted(skipped_warnings)),
- cached_structural_findings=tuple(cached_sf),
- cached_segment_report_projection=cached_segment_projection,
- cached_lines=cached_lines,
- cached_functions=cached_functions,
- cached_methods=cached_methods,
- cached_classes=cached_classes,
- cached_source_stats_by_file=tuple(
- sorted(cached_source_stats_by_file, key=lambda row: row[0])
- ),
- )
-
-
-def process_file(
- filepath: str,
- root: str,
- cfg: NormalizationConfig,
- min_loc: int,
- min_stmt: int,
- collect_structural_findings: bool = True,
- collect_api_surface: bool = False,
- api_include_private_modules: bool = False,
- block_min_loc: int = 20,
- block_min_stmt: int = 8,
- segment_min_loc: int = 20,
- segment_min_stmt: int = 10,
-) -> FileProcessResult:
- try:
- try:
- stat_result = os.stat(filepath)
- if stat_result.st_size > MAX_FILE_SIZE:
- return FileProcessResult(
- filepath=filepath,
- success=False,
- error=(
- f"File too large: {stat_result.st_size} bytes "
- f"(max {MAX_FILE_SIZE})"
- ),
- error_kind="file_too_large",
- )
- except OSError as exc:
- return FileProcessResult(
- filepath=filepath,
- success=False,
- error=f"Cannot stat file: {exc}",
- error_kind="stat_error",
- )
-
- stat: FileStat = {
- "mtime_ns": stat_result.st_mtime_ns,
- "size": stat_result.st_size,
- }
-
- try:
- source = Path(filepath).read_text("utf-8")
- except UnicodeDecodeError as exc:
- return FileProcessResult(
- filepath=filepath,
- success=False,
- error=f"Encoding error: {exc}",
- error_kind="source_read_error",
- )
- except OSError as exc:
- return FileProcessResult(
- filepath=filepath,
- success=False,
- error=f"Cannot read file: {exc}",
- error_kind="source_read_error",
- )
-
- module_name = module_name_from_path(root, filepath)
- units, blocks, segments, source_stats, file_metrics, sf = (
- extract_units_and_stats_from_source(
- source=source,
- filepath=filepath,
- module_name=module_name,
- cfg=cfg,
- min_loc=min_loc,
- min_stmt=min_stmt,
- block_min_loc=block_min_loc,
- block_min_stmt=block_min_stmt,
- segment_min_loc=segment_min_loc,
- segment_min_stmt=segment_min_stmt,
- collect_structural_findings=collect_structural_findings,
- collect_api_surface=collect_api_surface,
- api_include_private_modules=api_include_private_modules,
- )
- )
-
- return FileProcessResult(
- filepath=filepath,
- success=True,
- units=units,
- blocks=blocks,
- segments=segments,
- lines=source_stats.lines,
- functions=source_stats.functions,
- methods=source_stats.methods,
- classes=source_stats.classes,
- stat=stat,
- file_metrics=file_metrics,
- structural_findings=sf,
- )
- except Exception as exc: # pragma: no cover - defensive shell around workers
- return FileProcessResult(
- filepath=filepath,
- success=False,
- error=f"Unexpected error: {type(exc).__name__}: {exc}",
- error_kind="unexpected_error",
- )
-
-
-def _invoke_process_file(
- filepath: str,
- root: str,
- cfg: NormalizationConfig,
- min_loc: int,
- min_stmt: int,
- *,
- collect_structural_findings: bool,
- collect_api_surface: bool,
- api_include_private_modules: bool,
- block_min_loc: int,
- block_min_stmt: int,
- segment_min_loc: int,
- segment_min_stmt: int,
-) -> FileProcessResult:
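-    # Pass only the keyword arguments the live process_file signature accepts,
-    # so swapped-in implementations (older versions or, e.g., test doubles)
-    # that lack the newer parameters keep working.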
- optional_kwargs: dict[str, object] = {
- "collect_structural_findings": collect_structural_findings,
- "collect_api_surface": collect_api_surface,
- "api_include_private_modules": api_include_private_modules,
- "block_min_loc": block_min_loc,
- "block_min_stmt": block_min_stmt,
- "segment_min_loc": segment_min_loc,
- "segment_min_stmt": segment_min_stmt,
- }
- try:
- signature = inspect.signature(process_file)
- except (TypeError, ValueError):
- supported_kwargs = optional_kwargs
- else:
- parameters = tuple(signature.parameters.values())
- if any(
- parameter.kind == inspect.Parameter.VAR_KEYWORD for parameter in parameters
- ):
- supported_kwargs = optional_kwargs
- else:
- supported_names = {parameter.name for parameter in parameters}
- supported_kwargs = {
- key: value
- for key, value in optional_kwargs.items()
- if key in supported_names
- }
- process_callable = cast("Callable[..., FileProcessResult]", process_file)
- return process_callable(
- filepath,
- root,
- cfg,
- min_loc,
- min_stmt,
- **supported_kwargs,
- )
-
-
-def process(
- *,
- boot: BootstrapResult,
- discovery: DiscoveryResult,
- cache: Cache,
- on_advance: Callable[[], None] | None = None,
- on_worker_error: Callable[[str], None] | None = None,
- on_parallel_fallback: Callable[[Exception], None] | None = None,
- batch_size: int = DEFAULT_BATCH_SIZE,
-) -> ProcessingResult:
- files_to_process = discovery.files_to_process
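-    # Fast path: every file was served from cache, so the result is just the
-    # cached aggregates from discovery.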
- if not files_to_process:
- return ProcessingResult(
- units=discovery.cached_units,
- blocks=discovery.cached_blocks,
- segments=discovery.cached_segments,
- class_metrics=discovery.cached_class_metrics,
- module_deps=discovery.cached_module_deps,
- dead_candidates=discovery.cached_dead_candidates,
- referenced_names=discovery.cached_referenced_names,
- referenced_qualnames=discovery.cached_referenced_qualnames,
- typing_modules=discovery.cached_typing_modules,
- docstring_modules=discovery.cached_docstring_modules,
- api_modules=discovery.cached_api_modules,
- files_analyzed=0,
- files_skipped=discovery.files_skipped,
- analyzed_lines=0,
- analyzed_functions=0,
- analyzed_methods=0,
- analyzed_classes=0,
- failed_files=(),
- source_read_failures=(),
- structural_findings=discovery.cached_structural_findings,
- source_stats_by_file=discovery.cached_source_stats_by_file,
- )
-
- all_units: list[GroupItem] = list(discovery.cached_units)
- all_blocks: list[GroupItem] = list(discovery.cached_blocks)
- all_segments: list[GroupItem] = list(discovery.cached_segments)
-
- all_class_metrics: list[ClassMetrics] = list(discovery.cached_class_metrics)
- all_module_deps: list[ModuleDep] = list(discovery.cached_module_deps)
- all_dead_candidates: list[DeadCandidate] = list(discovery.cached_dead_candidates)
- all_referenced_names: set[str] = set(discovery.cached_referenced_names)
- all_referenced_qualnames: set[str] = set(discovery.cached_referenced_qualnames)
- all_typing_modules: list[ModuleTypingCoverage] = list(
- discovery.cached_typing_modules
- )
- all_docstring_modules: list[ModuleDocstringCoverage] = list(
- discovery.cached_docstring_modules
- )
- all_api_modules: list[ModuleApiSurface] = list(discovery.cached_api_modules)
- collect_structural_findings = _should_collect_structural_findings(boot.output_paths)
- collect_api_surface = not boot.args.skip_metrics and bool(
- getattr(boot.args, "api_surface", False)
- )
- api_include_private_modules = bool(
- getattr(boot.args, "api_include_private_modules", False)
- )
-
- files_analyzed = 0
- files_skipped = discovery.files_skipped
- analyzed_lines = 0
- analyzed_functions = 0
- analyzed_methods = 0
- analyzed_classes = 0
-
- all_structural_findings: list[StructuralFindingGroup] = list(
- discovery.cached_structural_findings
- )
- source_stats_by_file: dict[str, tuple[int, int, int, int]] = {
- filepath: (lines, functions, methods, classes)
- for filepath, lines, functions, methods, classes in (
- discovery.cached_source_stats_by_file
- )
- }
- failed_files: list[str] = []
- source_read_failures: list[str] = []
- root_str = str(boot.root)
- # Keep process-count fallback in the core runtime so non-CLI callers such as
- # the MCP service do not need to guess or mirror parallelism policy.
- processes = _resolve_process_count(boot.args.processes)
- min_loc = int(boot.args.min_loc)
- min_stmt = int(boot.args.min_stmt)
- block_min_loc = int(boot.args.block_min_loc)
- block_min_stmt = int(boot.args.block_min_stmt)
- segment_min_loc = int(boot.args.segment_min_loc)
- segment_min_stmt = int(boot.args.segment_min_stmt)
-
- def _accept_result(result: FileProcessResult) -> None:
- nonlocal files_analyzed
- nonlocal files_skipped
- nonlocal analyzed_lines
- nonlocal analyzed_functions
- nonlocal analyzed_methods
- nonlocal analyzed_classes
-
- if result.success and result.stat is not None:
- source_stats_payload = SourceStatsDict(
- lines=result.lines,
- functions=result.functions,
- methods=result.methods,
- classes=result.classes,
- )
- structural_payload = (
- result.structural_findings if collect_structural_findings else None
- )
- try:
- cache.put_file_entry(
- result.filepath,
- result.stat,
- result.units or [],
- result.blocks or [],
- result.segments or [],
- source_stats=source_stats_payload,
- file_metrics=result.file_metrics,
- structural_findings=structural_payload,
- )
- except TypeError as exc:
- if "source_stats" not in str(exc):
- raise
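-                # Older Cache.put_file_entry signatures lack the source_stats
-                # keyword; retry without it so the entry is still written.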
- cache.put_file_entry(
- result.filepath,
- result.stat,
- result.units or [],
- result.blocks or [],
- result.segments or [],
- file_metrics=result.file_metrics,
- structural_findings=structural_payload,
- )
- files_analyzed += 1
- analyzed_lines += result.lines
- analyzed_functions += result.functions
- analyzed_methods += result.methods
- analyzed_classes += result.classes
- source_stats_by_file[result.filepath] = (
- result.lines,
- result.functions,
- result.methods,
- result.classes,
- )
-
- if result.units:
- all_units.extend(_unit_to_group_item(unit) for unit in result.units)
- if result.blocks:
- all_blocks.extend(
- _block_to_group_item(block) for block in result.blocks
- )
- if result.segments:
- all_segments.extend(
- _segment_to_group_item(segment) for segment in result.segments
- )
- if result.structural_findings:
- all_structural_findings.extend(result.structural_findings)
-
- if not boot.args.skip_metrics and result.file_metrics is not None:
- all_class_metrics.extend(result.file_metrics.class_metrics)
- all_module_deps.extend(result.file_metrics.module_deps)
- all_dead_candidates.extend(result.file_metrics.dead_candidates)
- all_referenced_names.update(result.file_metrics.referenced_names)
- all_referenced_qualnames.update(
- result.file_metrics.referenced_qualnames
- )
- if result.file_metrics.typing_coverage is not None:
- all_typing_modules.append(result.file_metrics.typing_coverage)
- if result.file_metrics.docstring_coverage is not None:
- all_docstring_modules.append(result.file_metrics.docstring_coverage)
- if result.file_metrics.api_surface is not None:
- all_api_modules.append(result.file_metrics.api_surface)
- return
-
- files_skipped += 1
- failure = f"{result.filepath}: {result.error}"
- failed_files.append(failure)
- if result.error_kind == "source_read_error":
- source_read_failures.append(failure)
-
- def _run_sequential(files: Sequence[str]) -> None:
- for filepath in files:
- _accept_result(
- _invoke_process_file(
- filepath,
- root_str,
- boot.config,
- min_loc,
- min_stmt,
- collect_structural_findings=collect_structural_findings,
- collect_api_surface=collect_api_surface,
- api_include_private_modules=api_include_private_modules,
- block_min_loc=block_min_loc,
- block_min_stmt=block_min_stmt,
- segment_min_loc=segment_min_loc,
- segment_min_stmt=segment_min_stmt,
- )
- )
- if on_advance is not None:
- on_advance()
-
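-    # Batching bounds the number of in-flight futures; a crashed worker only
-    # skips its own file, while pool-level failures fall back to sequential
-    # processing via on_parallel_fallback.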
- if _should_use_parallel(len(files_to_process), processes):
- try:
- with ProcessPoolExecutor(max_workers=processes) as executor:
- for idx in range(0, len(files_to_process), batch_size):
- batch = files_to_process[idx : idx + batch_size]
- futures = [
- executor.submit(
- _invoke_process_file,
- filepath,
- root_str,
- boot.config,
- min_loc,
- min_stmt,
- collect_structural_findings=collect_structural_findings,
- collect_api_surface=collect_api_surface,
- api_include_private_modules=api_include_private_modules,
- block_min_loc=block_min_loc,
- block_min_stmt=block_min_stmt,
- segment_min_loc=segment_min_loc,
- segment_min_stmt=segment_min_stmt,
- )
- for filepath in batch
- ]
-                    # Future objects hash by identity, so they can key the
-                    # path map directly.
-                    future_to_path = dict(zip(futures, batch, strict=True))
-                    for future in as_completed(futures):
-                        filepath = future_to_path[future]
- try:
- _accept_result(future.result())
- except Exception as exc: # pragma: no cover - worker crash
- files_skipped += 1
- failed_files.append(f"{filepath}: {exc}")
- if on_worker_error is not None:
- on_worker_error(str(exc))
- if on_advance is not None:
- on_advance()
- except (OSError, RuntimeError, PermissionError) as exc:
- if on_parallel_fallback is not None:
- on_parallel_fallback(exc)
- _run_sequential(files_to_process)
- else:
- _run_sequential(files_to_process)
-
- return ProcessingResult(
- units=tuple(sorted(all_units, key=_group_item_sort_key)),
- blocks=tuple(sorted(all_blocks, key=_group_item_sort_key)),
- segments=tuple(sorted(all_segments, key=_group_item_sort_key)),
- class_metrics=tuple(sorted(all_class_metrics, key=_class_metric_sort_key)),
- module_deps=tuple(sorted(all_module_deps, key=_module_dep_sort_key)),
- dead_candidates=tuple(
- sorted(all_dead_candidates, key=_dead_candidate_sort_key)
- ),
- referenced_names=frozenset(all_referenced_names),
- referenced_qualnames=frozenset(all_referenced_qualnames),
- typing_modules=tuple(
- sorted(all_typing_modules, key=lambda item: (item.filepath, item.module))
- ),
- docstring_modules=tuple(
- sorted(all_docstring_modules, key=lambda item: (item.filepath, item.module))
- ),
- api_modules=tuple(
- sorted(all_api_modules, key=lambda item: (item.filepath, item.module))
- ),
- files_analyzed=files_analyzed,
- files_skipped=files_skipped,
- analyzed_lines=analyzed_lines,
- analyzed_functions=analyzed_functions,
- analyzed_methods=analyzed_methods,
- analyzed_classes=analyzed_classes,
- failed_files=tuple(sorted(failed_files)),
- source_read_failures=tuple(sorted(source_read_failures)),
- structural_findings=tuple(all_structural_findings),
- source_stats_by_file=tuple(
- (filepath, *stats)
- for filepath, stats in sorted(source_stats_by_file.items())
- ),
- )
-
-
-def _module_names_from_units(units: Sequence[GroupItemLike]) -> frozenset[str]:
-    modules: set[str] = set()
-    for unit in units:
-        qualname = _as_str(unit.get("qualname"))
-        module_name = qualname.partition(":")[0]
-        if module_name:
-            modules.add(module_name)
-    return frozenset(modules)
-
-
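-# Aggregate per-file facts into project-level metrics plus a health score.
-# Dependency and dead-code analysis yield empty results when the corresponding
-# skip_* flags are set.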
-def compute_project_metrics(
- *,
- units: Sequence[GroupItemLike],
- class_metrics: Sequence[ClassMetrics],
- module_deps: Sequence[ModuleDep],
- dead_candidates: Sequence[DeadCandidate],
- referenced_names: frozenset[str],
- referenced_qualnames: frozenset[str],
- typing_modules: Sequence[ModuleTypingCoverage] = (),
- docstring_modules: Sequence[ModuleDocstringCoverage] = (),
- api_modules: Sequence[ModuleApiSurface] = (),
- files_found: int,
- files_analyzed_or_cached: int,
- function_clone_groups: int,
- block_clone_groups: int,
- skip_dependencies: bool,
- skip_dead_code: bool,
-) -> tuple[ProjectMetrics, DepGraph, tuple[DeadItem, ...]]:
- unit_rows = sorted(units, key=_group_item_sort_key)
- complexities = tuple(
- max(1, _as_int(row.get("cyclomatic_complexity"), 1)) for row in unit_rows
- )
- complexity_max = max(complexities) if complexities else 0
- complexity_avg = (
- float(sum(complexities)) / float(len(complexities)) if complexities else 0.0
- )
- high_risk_functions = tuple(
- sorted(
- {
- _as_str(row.get("qualname"))
- for row in unit_rows
- if _as_str(row.get("risk")) == RISK_HIGH
- }
- )
- )
-
- classes_sorted = tuple(sorted(class_metrics, key=_class_metric_sort_key))
- coupling_values = tuple(metric.cbo for metric in classes_sorted)
- coupling_max = max(coupling_values) if coupling_values else 0
- coupling_avg = (
- float(sum(coupling_values)) / float(len(coupling_values))
- if coupling_values
- else 0.0
- )
- high_risk_classes = tuple(
- sorted(
- {
- metric.qualname
- for metric in classes_sorted
- if metric.risk_coupling == RISK_HIGH
- }
- )
- )
-
- cohesion_values = tuple(metric.lcom4 for metric in classes_sorted)
- cohesion_max = max(cohesion_values) if cohesion_values else 0
- cohesion_avg = (
- float(sum(cohesion_values)) / float(len(cohesion_values))
- if cohesion_values
- else 0.0
- )
- low_cohesion_classes = tuple(
- sorted(
- {
- metric.qualname
- for metric in classes_sorted
- if metric.risk_cohesion == RISK_HIGH
- }
- )
- )
-
- dep_graph = DepGraph(
- modules=frozenset(),
- edges=(),
- cycles=(),
- max_depth=0,
- longest_chains=(),
- )
- if not skip_dependencies:
- dep_graph = build_dep_graph(
- modules=_module_names_from_units(unit_rows),
- deps=module_deps,
- )
-
- dead_items: tuple[DeadItem, ...] = ()
- if not skip_dead_code:
- dead_items = find_unused(
- definitions=tuple(dead_candidates),
- referenced_names=referenced_names,
- referenced_qualnames=referenced_qualnames,
- )
-
- typing_rows = tuple(
- sorted(typing_modules, key=lambda item: (item.filepath, item.module))
- )
- docstring_rows = tuple(
- sorted(docstring_modules, key=lambda item: (item.filepath, item.module))
- )
- api_rows = tuple(sorted(api_modules, key=lambda item: (item.filepath, item.module)))
- typing_param_total = sum(item.params_total for item in typing_rows)
- typing_param_annotated = sum(item.params_annotated for item in typing_rows)
- typing_return_total = sum(item.returns_total for item in typing_rows)
- typing_return_annotated = sum(item.returns_annotated for item in typing_rows)
- typing_any_count = sum(item.any_annotation_count for item in typing_rows)
- docstring_public_total = sum(item.public_symbol_total for item in docstring_rows)
- docstring_public_documented = sum(
- item.public_symbol_documented for item in docstring_rows
- )
-
- health = compute_health(
- HealthInputs(
- files_found=files_found,
- files_analyzed_or_cached=files_analyzed_or_cached,
- function_clone_groups=function_clone_groups,
- block_clone_groups=block_clone_groups,
- complexity_avg=complexity_avg,
- complexity_max=complexity_max,
- high_risk_functions=len(high_risk_functions),
- coupling_avg=coupling_avg,
- coupling_max=coupling_max,
- high_risk_classes=len(high_risk_classes),
- cohesion_avg=cohesion_avg,
- low_cohesion_classes=len(low_cohesion_classes),
- dependency_cycles=len(dep_graph.cycles),
- dependency_max_depth=dep_graph.max_depth,
- dead_code_items=len(dead_items),
- )
- )
-
- project_metrics = ProjectMetrics(
- complexity_avg=complexity_avg,
- complexity_max=complexity_max,
- high_risk_functions=high_risk_functions,
- coupling_avg=coupling_avg,
- coupling_max=coupling_max,
- high_risk_classes=high_risk_classes,
- cohesion_avg=cohesion_avg,
- cohesion_max=cohesion_max,
- low_cohesion_classes=low_cohesion_classes,
- dependency_modules=len(dep_graph.modules),
- dependency_edges=len(dep_graph.edges),
- dependency_edge_list=dep_graph.edges,
- dependency_cycles=dep_graph.cycles,
- dependency_max_depth=dep_graph.max_depth,
- dependency_longest_chains=dep_graph.longest_chains,
- dead_code=dead_items,
- health=health,
- typing_param_total=typing_param_total,
- typing_param_annotated=typing_param_annotated,
- typing_return_total=typing_return_total,
- typing_return_annotated=typing_return_annotated,
- typing_any_count=typing_any_count,
- docstring_public_total=docstring_public_total,
- docstring_public_documented=docstring_public_documented,
- typing_modules=typing_rows,
- docstring_modules=docstring_rows,
- api_surface=ApiSurfaceSnapshot(modules=api_rows) if api_rows else None,
- )
- return project_metrics, dep_graph, dead_items
-
-
-def compute_suggestions(
- *,
- project_metrics: ProjectMetrics,
- units: Sequence[GroupItemLike],
- class_metrics: Sequence[ClassMetrics],
- func_groups: Mapping[str, Sequence[GroupItemLike]],
- block_groups: Mapping[str, Sequence[GroupItemLike]],
- segment_groups: Mapping[str, Sequence[GroupItemLike]],
- block_group_facts: Mapping[str, Mapping[str, str]] | None = None,
- structural_findings: Sequence[StructuralFindingGroup] | None = None,
- scan_root: str = "",
-) -> tuple[Suggestion, ...]:
- return generate_suggestions(
- project_metrics=project_metrics,
- units=units,
- class_metrics=class_metrics,
- func_groups=func_groups,
- block_groups=block_groups,
- segment_groups=segment_groups,
- block_group_facts=block_group_facts,
- structural_findings=structural_findings,
- scan_root=scan_root,
- )
-
-
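-# Ratios are reported as integer per-mille (0-1000): _permille(3, 8) == 375,
-# i.e. 37.5%; a non-positive denominator yields 0.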
-def _permille(numerator: int, denominator: int) -> int:
- if denominator <= 0:
- return 0
- return round((1000.0 * float(numerator)) / float(denominator))
-
-
-def _coverage_join_summary(
- coverage_join: CoverageJoinResult | None,
-) -> dict[str, object]:
- if coverage_join is None:
- return {}
- return {
- "status": coverage_join.status,
- "source": coverage_join.coverage_xml,
- "files": coverage_join.files,
- "units": len(coverage_join.units),
- "measured_units": coverage_join.measured_units,
- "overall_executable_lines": coverage_join.overall_executable_lines,
- "overall_covered_lines": coverage_join.overall_covered_lines,
- "overall_permille": _permille(
- coverage_join.overall_covered_lines,
- coverage_join.overall_executable_lines,
- ),
- "missing_from_report_units": sum(
- 1
- for fact in coverage_join.units
- if fact.coverage_status == "missing_from_report"
- ),
- "coverage_hotspots": coverage_join.coverage_hotspots,
- "scope_gap_hotspots": coverage_join.scope_gap_hotspots,
- "hotspot_threshold_percent": coverage_join.hotspot_threshold_percent,
- "invalid_reason": coverage_join.invalid_reason,
- }
-
-
-def _coverage_join_rows(
- coverage_join: CoverageJoinResult | None,
-) -> list[dict[str, object]]:
- if coverage_join is None or coverage_join.status != "ok":
- return []
- return sorted(
- (
- {
- "qualname": fact.qualname,
- "filepath": fact.filepath,
- "start_line": fact.start_line,
- "end_line": fact.end_line,
- "cyclomatic_complexity": fact.cyclomatic_complexity,
- "risk": fact.risk,
- "executable_lines": fact.executable_lines,
- "covered_lines": fact.covered_lines,
- "coverage_permille": fact.coverage_permille,
- "coverage_status": fact.coverage_status,
- "coverage_hotspot": (
- fact.risk in {"medium", "high"}
- and fact.coverage_status == "measured"
- and (fact.coverage_permille / 10.0)
- < float(coverage_join.hotspot_threshold_percent)
- ),
- "scope_gap_hotspot": (
- fact.risk in {"medium", "high"}
- and fact.coverage_status == "missing_from_report"
- ),
- "coverage_review_item": (
- (
- fact.risk in {"medium", "high"}
- and fact.coverage_status == "measured"
- and (fact.coverage_permille / 10.0)
- < float(coverage_join.hotspot_threshold_percent)
- )
- or (
- fact.risk in {"medium", "high"}
- and fact.coverage_status == "missing_from_report"
- )
- ),
- }
- for fact in coverage_join.units
- ),
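-        # Review-worthy rows sort first: measured hotspots, then scope gaps,
-        # then high risk before low, lowest coverage, highest complexity, and
-        # finally stable location/name ordering.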
- key=lambda item: (
- 0 if bool(item.get("coverage_hotspot")) else 1,
- 0 if bool(item.get("scope_gap_hotspot")) else 1,
- {"high": 0, "medium": 1, "low": 2}.get(_as_str(item.get("risk")), 3),
- _as_int(item.get("coverage_permille"), 0),
- -_as_int(item.get("cyclomatic_complexity"), 0),
- _as_str(item.get("filepath")),
- _as_int(item.get("start_line")),
- _as_str(item.get("qualname")),
- ),
- )
-
-
-def _coverage_adoption_rows(
- project_metrics: ProjectMetrics,
-) -> list[dict[str, object]]:
- docstring_by_module = {
- (item.filepath, item.module): item for item in project_metrics.docstring_modules
- }
- rows: list[dict[str, object]] = []
- seen_keys: set[tuple[str, str]] = set()
- for typing_item in project_metrics.typing_modules:
- key = (typing_item.filepath, typing_item.module)
- seen_keys.add(key)
- docstring_item = docstring_by_module.get(key)
- doc_total = docstring_item.public_symbol_total if docstring_item else 0
- doc_documented = (
- docstring_item.public_symbol_documented if docstring_item else 0
- )
- rows.append(
- {
- "module": typing_item.module,
- "filepath": typing_item.filepath,
- "callable_count": typing_item.callable_count,
- "params_total": typing_item.params_total,
- "params_annotated": typing_item.params_annotated,
- "param_permille": _permille(
- typing_item.params_annotated,
- typing_item.params_total,
- ),
- "returns_total": typing_item.returns_total,
- "returns_annotated": typing_item.returns_annotated,
- "return_permille": _permille(
- typing_item.returns_annotated,
- typing_item.returns_total,
- ),
- "any_annotation_count": typing_item.any_annotation_count,
- "public_symbol_total": doc_total,
- "public_symbol_documented": doc_documented,
- "docstring_permille": _permille(doc_documented, doc_total),
- }
- )
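-    # Modules that have docstring data but no typing data still get a row;
-    # their typing fields are zeroed so every row carries the same keys.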
- for docstring_item in project_metrics.docstring_modules:
- key = (docstring_item.filepath, docstring_item.module)
- if key in seen_keys:
- continue
- rows.append(
- {
- "module": docstring_item.module,
- "filepath": docstring_item.filepath,
- "callable_count": 0,
- "params_total": 0,
- "params_annotated": 0,
- "param_permille": 0,
- "returns_total": 0,
- "returns_annotated": 0,
- "return_permille": 0,
- "any_annotation_count": 0,
- "public_symbol_total": docstring_item.public_symbol_total,
- "public_symbol_documented": docstring_item.public_symbol_documented,
- "docstring_permille": _permille(
- docstring_item.public_symbol_documented,
- docstring_item.public_symbol_total,
- ),
- }
- )
- return sorted(
- rows,
- key=lambda item: (
- _as_int(item.get("param_permille")),
- _as_int(item.get("docstring_permille")),
- _as_int(item.get("return_permille")),
- _as_str(item.get("module")),
- ),
- )
-
-
-def _api_surface_summary(
- api_surface: ApiSurfaceSnapshot | None,
-) -> dict[str, object]:
- modules = api_surface.modules if api_surface is not None else ()
- return {
- "enabled": api_surface is not None,
- "modules": len(modules),
- "public_symbols": sum(len(module.symbols) for module in modules),
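-        # "added" and "breaking" are placeholders; _enrich_metrics_report_payload
-        # overwrites them when a baseline diff is available.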
- "added": 0,
- "breaking": 0,
- "strict_types": False,
- }
-
-
-def _api_surface_rows(
- api_surface: ApiSurfaceSnapshot | None,
-) -> list[dict[str, object]]:
- if api_surface is None:
- return []
- rows: list[dict[str, object]] = []
- for module in api_surface.modules:
- rows.extend(
- {
- "record_kind": "symbol",
- "module": module.module,
- "filepath": module.filepath,
- "qualname": symbol.qualname,
- "start_line": symbol.start_line,
- "end_line": symbol.end_line,
- "symbol_kind": symbol.kind,
- "exported_via": symbol.exported_via,
- "params_total": len(symbol.params),
- "params": [
- {
- "name": param.name,
- "kind": param.kind,
- "has_default": param.has_default,
- "annotated": bool(param.annotation_hash),
- }
- for param in symbol.params
- ],
- "returns_annotated": bool(symbol.returns_hash),
- }
- for symbol in module.symbols
- )
- return sorted(
- rows,
- key=lambda item: (
- _as_str(item.get("filepath")),
- _as_int(item.get("start_line")),
- _as_int(item.get("end_line")),
- _as_str(item.get("qualname")),
- _as_str(item.get("record_kind")),
- ),
- )
-
-
-def _breaking_api_surface_rows(
- changes: Sequence[object],
-) -> list[dict[str, object]]:
- rows: list[dict[str, object]] = []
- for change in changes:
- if not isinstance(change, ApiBreakingChange):
- continue
- module_name, _, _local_name = change.qualname.partition(":")
- rows.append(
- {
- "record_kind": "breaking_change",
- "module": module_name,
- "filepath": change.filepath,
- "qualname": change.qualname,
- "start_line": change.start_line,
- "end_line": change.end_line,
- "symbol_kind": change.symbol_kind,
- "change_kind": change.change_kind,
- "detail": change.detail,
- }
- )
- return sorted(
- rows,
- key=lambda item: (
- _as_str(item.get("filepath")),
- _as_int(item.get("start_line")),
- _as_int(item.get("end_line")),
- _as_str(item.get("qualname")),
- _as_str(item.get("change_kind")),
- ),
- )
-
-
-def _enrich_metrics_report_payload(
- *,
- metrics_payload: Mapping[str, object],
- metrics_diff: MetricsDiff | None,
- coverage_adoption_diff_available: bool,
- api_surface_diff_available: bool,
-) -> dict[str, object]:
- enriched = {
- key: (dict(value) if isinstance(value, Mapping) else value)
- for key, value in metrics_payload.items()
- }
- coverage_adoption = dict(
- cast("Mapping[str, object]", enriched.get("coverage_adoption", {}))
- )
- coverage_summary = dict(
- cast("Mapping[str, object]", coverage_adoption.get("summary", {}))
- )
- if coverage_summary:
- coverage_summary["baseline_diff_available"] = coverage_adoption_diff_available
- coverage_summary["param_delta"] = (
- int(metrics_diff.typing_param_permille_delta)
- if metrics_diff is not None and coverage_adoption_diff_available
- else 0
- )
- coverage_summary["return_delta"] = (
- int(metrics_diff.typing_return_permille_delta)
- if metrics_diff is not None and coverage_adoption_diff_available
- else 0
- )
- coverage_summary["docstring_delta"] = (
- int(metrics_diff.docstring_permille_delta)
- if metrics_diff is not None and coverage_adoption_diff_available
- else 0
- )
- coverage_adoption["summary"] = coverage_summary
- enriched["coverage_adoption"] = coverage_adoption
-
- api_surface = dict(cast("Mapping[str, object]", enriched.get("api_surface", {})))
- api_summary = dict(cast("Mapping[str, object]", api_surface.get("summary", {})))
- api_items = list(cast("Sequence[object]", api_surface.get("items", ())))
- if api_summary:
- api_summary["baseline_diff_available"] = api_surface_diff_available
- api_summary["added"] = (
- len(metrics_diff.new_api_symbols)
- if metrics_diff is not None and api_surface_diff_available
- else 0
- )
- api_summary["breaking"] = (
- len(metrics_diff.new_api_breaking_changes)
- if metrics_diff is not None and api_surface_diff_available
- else 0
- )
- api_surface["summary"] = api_summary
- if (
- metrics_diff is not None
- and api_surface_diff_available
- and metrics_diff.new_api_breaking_changes
- ):
- api_items.extend(
- _breaking_api_surface_rows(metrics_diff.new_api_breaking_changes)
- )
- api_surface["items"] = api_items
- if api_surface:
- enriched["api_surface"] = api_surface
- return enriched
-
-
-def build_metrics_report_payload(
- *,
- scan_root: str = "",
- project_metrics: ProjectMetrics,
- coverage_join: CoverageJoinResult | None = None,
- units: Sequence[GroupItemLike],
- class_metrics: Sequence[ClassMetrics],
- module_deps: Sequence[ModuleDep] = (),
- source_stats_by_file: Sequence[tuple[str, int, int, int, int]] = (),
- suppressed_dead_code: Sequence[DeadItem] = (),
-) -> dict[str, object]:
- sorted_units = sorted(
- units,
- key=lambda item: (
- _as_int(item.get("cyclomatic_complexity")),
- _as_int(item.get("nesting_depth")),
- _as_str(item.get("qualname")),
- ),
- reverse=True,
- )
- complexity_rows = [
- {
- "qualname": _as_str(item.get("qualname")),
- "filepath": _as_str(item.get("filepath")),
- "start_line": _as_int(item.get("start_line")),
- "end_line": _as_int(item.get("end_line")),
- "cyclomatic_complexity": _as_int(item.get("cyclomatic_complexity"), 1),
- "nesting_depth": _as_int(item.get("nesting_depth")),
- "risk": _as_str(item.get("risk"), RISK_LOW),
- }
- for item in sorted_units
- ]
- classes_sorted = sorted(
- class_metrics,
- key=lambda item: (item.cbo, item.lcom4, item.qualname),
- reverse=True,
- )
- coupling_rows = [
- {
- "qualname": metric.qualname,
- "filepath": metric.filepath,
- "start_line": metric.start_line,
- "end_line": metric.end_line,
- "cbo": metric.cbo,
- "risk": metric.risk_coupling,
- "coupled_classes": list(metric.coupled_classes),
- }
- for metric in classes_sorted
- ]
- cohesion_rows = [
- {
- "qualname": metric.qualname,
- "filepath": metric.filepath,
- "start_line": metric.start_line,
- "end_line": metric.end_line,
- "lcom4": metric.lcom4,
- "risk": metric.risk_cohesion,
- "method_count": metric.method_count,
- "instance_var_count": metric.instance_var_count,
- }
- for metric in classes_sorted
- ]
- active_dead_items = tuple(project_metrics.dead_code)
- suppressed_dead_items = tuple(suppressed_dead_code)
- coverage_adoption_rows = _coverage_adoption_rows(project_metrics)
- api_surface_summary = _api_surface_summary(project_metrics.api_surface)
- api_surface_items = _api_surface_rows(project_metrics.api_surface)
- coverage_join_summary = _coverage_join_summary(coverage_join)
- coverage_join_items = _coverage_join_rows(coverage_join)
-
- def _serialize_dead_item(
- item: DeadItem,
- *,
- suppressed: bool = False,
- ) -> dict[str, object]:
- payload: dict[str, object] = {
- "qualname": item.qualname,
- "filepath": item.filepath,
- "start_line": item.start_line,
- "end_line": item.end_line,
- "kind": item.kind,
- "confidence": item.confidence,
- }
- if suppressed:
- payload["suppressed_by"] = [
- {
- "rule": DEAD_CODE_RULE_ID,
- "source": INLINE_CODECLONE_SUPPRESSION_SOURCE,
- }
- ]
- return payload
-
- payload = {
- CATEGORY_COMPLEXITY: {
- "functions": complexity_rows,
- "summary": {
- "total": len(complexity_rows),
- "average": round(project_metrics.complexity_avg, 2),
- "max": project_metrics.complexity_max,
- "high_risk": len(project_metrics.high_risk_functions),
- },
- },
- CATEGORY_COUPLING: {
- "classes": coupling_rows,
- "summary": {
- "total": len(coupling_rows),
- "average": round(project_metrics.coupling_avg, 2),
- "max": project_metrics.coupling_max,
- "high_risk": len(project_metrics.high_risk_classes),
- },
- },
- CATEGORY_COHESION: {
- "classes": cohesion_rows,
- "summary": {
- "total": len(cohesion_rows),
- "average": round(project_metrics.cohesion_avg, 2),
- "max": project_metrics.cohesion_max,
- "low_cohesion": len(project_metrics.low_cohesion_classes),
- },
- },
- "dependencies": {
- "modules": project_metrics.dependency_modules,
- "edges": project_metrics.dependency_edges,
- "max_depth": project_metrics.dependency_max_depth,
- "cycles": [list(cycle) for cycle in project_metrics.dependency_cycles],
- "longest_chains": [
- list(chain) for chain in project_metrics.dependency_longest_chains
- ],
- "edge_list": [
- {
- "source": edge.source,
- "target": edge.target,
- "import_type": edge.import_type,
- "line": edge.line,
- }
- for edge in project_metrics.dependency_edge_list
- ],
- },
- "dead_code": {
- "items": [_serialize_dead_item(item) for item in active_dead_items],
- "suppressed_items": [
- _serialize_dead_item(item, suppressed=True)
- for item in suppressed_dead_items
- ],
- "summary": {
- "total": len(active_dead_items),
- "critical": sum(
- 1
- for item in active_dead_items
- if item.confidence == CONFIDENCE_HIGH
- ),
- "high_confidence": sum(
- 1
- for item in active_dead_items
- if item.confidence == CONFIDENCE_HIGH
- ),
- "suppressed": len(suppressed_dead_items),
- },
- },
- "health": {
- "score": project_metrics.health.total,
- "grade": project_metrics.health.grade,
- "dimensions": dict(project_metrics.health.dimensions),
- },
- "coverage_adoption": {
- "summary": {
- "modules": len(coverage_adoption_rows),
- "params_total": project_metrics.typing_param_total,
- "params_annotated": project_metrics.typing_param_annotated,
- "param_permille": _permille(
- project_metrics.typing_param_annotated,
- project_metrics.typing_param_total,
- ),
- "returns_total": project_metrics.typing_return_total,
- "returns_annotated": project_metrics.typing_return_annotated,
- "return_permille": _permille(
- project_metrics.typing_return_annotated,
- project_metrics.typing_return_total,
- ),
- "public_symbol_total": project_metrics.docstring_public_total,
- "public_symbol_documented": project_metrics.docstring_public_documented,
- "docstring_permille": _permille(
- project_metrics.docstring_public_documented,
- project_metrics.docstring_public_total,
- ),
- "typing_any_count": project_metrics.typing_any_count,
- },
- "items": coverage_adoption_rows,
- },
- "api_surface": {
- "summary": dict(api_surface_summary),
- "items": api_surface_items,
- },
- "overloaded_modules": build_overloaded_modules_payload(
- scan_root=scan_root,
- source_stats_by_file=source_stats_by_file,
- units=units,
- class_metrics=class_metrics,
- module_deps=module_deps,
- ),
- }
- if coverage_join is not None:
- payload["coverage_join"] = {
- "summary": dict(coverage_join_summary),
- "items": coverage_join_items,
- }
- return payload
-
-
-def analyze(
- *,
- boot: BootstrapResult,
- discovery: DiscoveryResult,
- processing: ProcessingResult,
-) -> AnalysisResult:
- golden_fixture_paths = tuple(
- str(pattern).strip()
- for pattern in getattr(boot.args, "golden_fixture_paths", ())
- if str(pattern).strip()
- )
-
- func_split = split_clone_groups_for_golden_fixtures(
- groups=build_groups(processing.units),
- kind="function",
- golden_fixture_paths=golden_fixture_paths,
- scan_root=str(boot.root),
- )
- block_split = split_clone_groups_for_golden_fixtures(
- groups=build_block_groups(processing.blocks),
- kind="block",
- golden_fixture_paths=golden_fixture_paths,
- scan_root=str(boot.root),
- )
- segment_split = split_clone_groups_for_golden_fixtures(
- groups=build_segment_groups(processing.segments),
- kind="segment",
- golden_fixture_paths=golden_fixture_paths,
- scan_root=str(boot.root),
- )
-
- func_groups = func_split.active_groups
- block_groups = block_split.active_groups
- segment_groups_raw = segment_split.active_groups
- segment_groups_raw_digest = _segment_groups_digest(segment_groups_raw)
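-    # Reuse the cached report projection only when it was computed from exactly
-    # these raw groups (digest match); otherwise rebuild it from the raw groups.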
- cached_projection = discovery.cached_segment_report_projection
- if (
- cached_projection is not None
- and cached_projection.get("digest") == segment_groups_raw_digest
- ):
- projection_groups = cached_projection.get("groups", {})
- segment_groups = {
- group_key: [
- {
- "segment_hash": str(item["segment_hash"]),
- "segment_sig": str(item["segment_sig"]),
- "filepath": str(item["filepath"]),
- "qualname": str(item["qualname"]),
- "start_line": int(item["start_line"]),
- "end_line": int(item["end_line"]),
- "size": int(item["size"]),
- }
- for item in projection_groups[group_key]
- ]
- for group_key in sorted(projection_groups)
- }
- suppressed_segment_groups = int(cached_projection.get("suppressed", 0))
- else:
- segment_groups, suppressed_segment_groups = prepare_segment_report_groups(
- segment_groups_raw
- )
-
- block_groups_report = prepare_block_report_groups(block_groups)
- suppressed_block_groups_report = prepare_block_report_groups(
- block_split.suppressed_groups
- )
- if segment_split.suppressed_groups:
- suppressed_segment_groups_report, _ = prepare_segment_report_groups(
- segment_split.suppressed_groups
- )
- else:
- suppressed_segment_groups_report = {}
- suppressed_clone_groups = (
- *build_suppressed_clone_groups(
- kind="function",
- groups=func_split.suppressed_groups,
- matched_patterns=func_split.matched_patterns,
- ),
- *build_suppressed_clone_groups(
- kind="block",
- groups=suppressed_block_groups_report,
- matched_patterns=block_split.matched_patterns,
- ),
- *build_suppressed_clone_groups(
- kind="segment",
- groups=suppressed_segment_groups_report,
- matched_patterns=segment_split.matched_patterns,
- ),
- )
- block_group_facts = build_block_group_facts(
- {
- **block_groups_report,
- **suppressed_block_groups_report,
- }
- )
-
- func_clones_count = len(func_groups)
- block_clones_count = len(block_groups)
- segment_clones_count = len(segment_groups)
- files_analyzed_or_cached = processing.files_analyzed + discovery.cache_hits
-
- project_metrics: ProjectMetrics | None = None
- metrics_payload: dict[str, object] | None = None
- suggestions: tuple[Suggestion, ...] = ()
- suppressed_dead_items: tuple[DeadItem, ...] = ()
- coverage_join: CoverageJoinResult | None = None
- cohort_structural_findings: tuple[StructuralFindingGroup, ...] = ()
- if _should_collect_structural_findings(boot.output_paths):
- cohort_structural_findings = build_clone_cohort_structural_findings(
- func_groups=func_groups,
- )
- combined_structural_findings = (
- *processing.structural_findings,
- *cohort_structural_findings,
- )
-
- if not boot.args.skip_metrics:
- project_metrics, _, _ = compute_project_metrics(
- units=processing.units,
- class_metrics=processing.class_metrics,
- module_deps=processing.module_deps,
- dead_candidates=processing.dead_candidates,
- referenced_names=processing.referenced_names,
- referenced_qualnames=processing.referenced_qualnames,
- typing_modules=processing.typing_modules,
- docstring_modules=processing.docstring_modules,
- api_modules=processing.api_modules,
- files_found=discovery.files_found,
- files_analyzed_or_cached=files_analyzed_or_cached,
- function_clone_groups=func_clones_count,
- block_clone_groups=block_clones_count,
- skip_dependencies=boot.args.skip_dependencies,
- skip_dead_code=boot.args.skip_dead_code,
- )
- if not boot.args.skip_dead_code:
- suppressed_dead_items = find_suppressed_unused(
- definitions=tuple(processing.dead_candidates),
- referenced_names=processing.referenced_names,
- referenced_qualnames=processing.referenced_qualnames,
- )
- suggestions = compute_suggestions(
- project_metrics=project_metrics,
- units=processing.units,
- class_metrics=processing.class_metrics,
- func_groups=func_groups,
- block_groups=block_groups_report,
- segment_groups=segment_groups,
- block_group_facts=block_group_facts,
- structural_findings=combined_structural_findings,
- scan_root=str(boot.root),
- )
- coverage_xml_path = _resolve_optional_runtime_path(
- getattr(boot.args, "coverage_xml", None),
- root=boot.root,
- )
- if coverage_xml_path is not None:
- try:
- coverage_join = build_coverage_join(
- coverage_xml=coverage_xml_path,
- root_path=boot.root,
- units=processing.units,
- hotspot_threshold_percent=int(
- getattr(boot.args, "coverage_min", 50)
- ),
- )
- except CoverageJoinParseError as exc:
- coverage_join = CoverageJoinResult(
- coverage_xml=str(coverage_xml_path),
- status="invalid",
- hotspot_threshold_percent=int(
- getattr(boot.args, "coverage_min", 50)
- ),
- invalid_reason=str(exc),
- )
- metrics_payload = build_metrics_report_payload(
- scan_root=str(boot.root),
- project_metrics=project_metrics,
- coverage_join=coverage_join,
- units=processing.units,
- class_metrics=processing.class_metrics,
- module_deps=processing.module_deps,
- source_stats_by_file=processing.source_stats_by_file,
- suppressed_dead_code=suppressed_dead_items,
- )
-
- return AnalysisResult(
- func_groups=func_groups,
- block_groups=block_groups,
- block_groups_report=block_groups_report,
- segment_groups=segment_groups,
- suppressed_clone_groups=tuple(suppressed_clone_groups),
- suppressed_segment_groups=suppressed_segment_groups,
- block_group_facts=block_group_facts,
- func_clones_count=func_clones_count,
- block_clones_count=block_clones_count,
- segment_clones_count=segment_clones_count,
- files_analyzed_or_cached=files_analyzed_or_cached,
- project_metrics=project_metrics,
- metrics_payload=metrics_payload,
- suggestions=suggestions,
- segment_groups_raw_digest=segment_groups_raw_digest,
- coverage_join=coverage_join,
- suppressed_dead_code_items=len(suppressed_dead_items),
- structural_findings=combined_structural_findings,
- )
-
-
-def _load_markdown_report_renderer() -> Callable[..., str]:
- from .report.markdown import to_markdown_report
-
- return to_markdown_report
-
-
-def _load_sarif_report_renderer() -> Callable[..., str]:
- from .report.sarif import to_sarif_report
-
- return to_sarif_report
-
-
-def report(
- *,
- boot: BootstrapResult,
- discovery: DiscoveryResult,
- processing: ProcessingResult,
- analysis: AnalysisResult,
- report_meta: Mapping[str, object],
- new_func: Collection[str],
- new_block: Collection[str],
- html_builder: Callable[..., str] | None = None,
- metrics_diff: object | None = None,
- coverage_adoption_diff_available: bool = False,
- api_surface_diff_available: bool = False,
- include_report_document: bool = False,
-) -> ReportArtifacts:
- contents: dict[str, str | None] = {
- "html": None,
- "json": None,
- "md": None,
- "sarif": None,
- "text": None,
- }
-
- sf = analysis.structural_findings if analysis.structural_findings else None
- report_inventory = {
- "files": {
- "total_found": discovery.files_found,
- "analyzed": processing.files_analyzed,
- "cached": discovery.cache_hits,
- "skipped": processing.files_skipped,
- "source_io_skipped": len(processing.source_read_failures),
- },
- "code": {
- "parsed_lines": processing.analyzed_lines + discovery.cached_lines,
- "functions": processing.analyzed_functions + discovery.cached_functions,
- "methods": processing.analyzed_methods + discovery.cached_methods,
- "classes": processing.analyzed_classes + discovery.cached_classes,
- },
- "file_list": list(discovery.all_file_paths),
- }
- report_document: dict[str, object] | None = None
- needs_report_document = (
- include_report_document
- or boot.output_paths.html is not None
- or any(
- path is not None
- for path in (
- boot.output_paths.json,
- boot.output_paths.md,
- boot.output_paths.sarif,
- boot.output_paths.text,
- )
- )
- )
-
- if needs_report_document:
- metrics_for_report = (
- _enrich_metrics_report_payload(
- metrics_payload=analysis.metrics_payload,
- metrics_diff=cast("MetricsDiff | None", metrics_diff),
- coverage_adoption_diff_available=coverage_adoption_diff_available,
- api_surface_diff_available=api_surface_diff_available,
- )
- if analysis.metrics_payload is not None
- else None
- )
- report_document = build_report_document(
- func_groups=analysis.func_groups,
- block_groups=analysis.block_groups_report,
- segment_groups=analysis.segment_groups,
- suppressed_clone_groups=analysis.suppressed_clone_groups,
- meta=report_meta,
- inventory=report_inventory,
- block_facts=analysis.block_group_facts,
- new_function_group_keys=new_func,
- new_block_group_keys=new_block,
- new_segment_group_keys=set(analysis.segment_groups.keys()),
- metrics=metrics_for_report,
- suggestions=analysis.suggestions,
- structural_findings=sf,
- )
-
- if boot.output_paths.html and html_builder is not None:
- metrics_for_html = (
- _enrich_metrics_report_payload(
- metrics_payload=analysis.metrics_payload,
- metrics_diff=cast("MetricsDiff | None", metrics_diff),
- coverage_adoption_diff_available=coverage_adoption_diff_available,
- api_surface_diff_available=api_surface_diff_available,
- )
- if analysis.metrics_payload is not None
- else None
- )
- contents["html"] = html_builder(
- func_groups=analysis.func_groups,
- block_groups=analysis.block_groups_report,
- segment_groups=analysis.segment_groups,
- block_group_facts=analysis.block_group_facts,
- new_function_group_keys=new_func,
- new_block_group_keys=new_block,
- report_meta=report_meta,
- metrics=metrics_for_html,
- suggestions=analysis.suggestions,
- structural_findings=sf,
- report_document=report_document,
- metrics_diff=metrics_diff,
- title="CodeClone Report",
- context_lines=3,
- max_snippet_lines=220,
- )
-
- if any(
- path is not None
- for path in (
- boot.output_paths.json,
- boot.output_paths.md,
- boot.output_paths.sarif,
- boot.output_paths.text,
- )
- ):
- assert report_document is not None
-
- if boot.output_paths.json and report_document is not None:
- contents["json"] = render_json_report_document(report_document)
-
- def _render_projection_artifact(
- renderer: Callable[..., str],
- ) -> str:
- assert report_document is not None
- return renderer(
- report_document=report_document,
- meta=report_meta,
- inventory=report_inventory,
- func_groups=analysis.func_groups,
- block_groups=analysis.block_groups_report,
- segment_groups=analysis.segment_groups,
- block_facts=analysis.block_group_facts,
- new_function_group_keys=new_func,
- new_block_group_keys=new_block,
- new_segment_group_keys=set(analysis.segment_groups.keys()),
- metrics=analysis.metrics_payload,
- suggestions=analysis.suggestions,
- structural_findings=sf,
- )
-
- for key, output_path, loader in (
- ("md", boot.output_paths.md, _load_markdown_report_renderer),
- ("sarif", boot.output_paths.sarif, _load_sarif_report_renderer),
- ):
- if output_path and report_document is not None:
- contents[key] = _render_projection_artifact(loader())
-
- if boot.output_paths.text and report_document is not None:
- contents["text"] = render_text_report_document(report_document)
-
- return ReportArtifacts(
- html=contents["html"],
- json=contents["json"],
- md=contents["md"],
- sarif=contents["sarif"],
- text=contents["text"],
- report_document=report_document,
- )
-
-
-def metric_gate_reasons(
- *,
- project_metrics: ProjectMetrics,
- coverage_join: CoverageJoinResult | None,
- metrics_diff: MetricsDiff | None,
- config: MetricGateConfig,
-) -> tuple[str, ...]:
- reasons: list[str] = []
- _append_threshold_metric_reasons(
- reasons=reasons,
- project_metrics=project_metrics,
- config=config,
- )
- _append_new_metric_diff_reasons(
- reasons=reasons,
- metrics_diff=metrics_diff,
- config=config,
- )
- _append_adoption_metric_reasons(
- reasons=reasons,
- metrics_diff=metrics_diff,
- project_metrics=project_metrics,
- config=config,
- )
- _append_coverage_join_reasons(
- reasons=reasons,
- coverage_join=coverage_join,
- config=config,
- )
- return tuple(reasons)
-
-
-def _append_threshold_metric_reasons(
- *,
- reasons: list[str],
- project_metrics: ProjectMetrics,
- config: MetricGateConfig,
-) -> None:
- threshold_rows = (
- (
- config.fail_complexity >= 0
- and project_metrics.complexity_max > config.fail_complexity,
- "Complexity threshold exceeded: "
- f"max CC={project_metrics.complexity_max}, "
- f"threshold={config.fail_complexity}.",
- ),
- (
- config.fail_coupling >= 0
- and project_metrics.coupling_max > config.fail_coupling,
- "Coupling threshold exceeded: "
- f"max CBO={project_metrics.coupling_max}, "
- f"threshold={config.fail_coupling}.",
- ),
- (
- config.fail_cohesion >= 0
- and project_metrics.cohesion_max > config.fail_cohesion,
- "Cohesion threshold exceeded: "
- f"max LCOM4={project_metrics.cohesion_max}, "
- f"threshold={config.fail_cohesion}.",
- ),
- (
- config.fail_health >= 0
- and project_metrics.health.total < config.fail_health,
- "Health score below threshold: "
- f"score={project_metrics.health.total}, threshold={config.fail_health}.",
- ),
- )
- reasons.extend(message for triggered, message in threshold_rows if triggered)
- if config.fail_cycles and project_metrics.dependency_cycles:
- reasons.append(
- "Dependency cycles detected: "
- f"{len(project_metrics.dependency_cycles)} cycle(s)."
- )
- high_conf_dead = _high_confidence_dead_code_count(project_metrics.dead_code)
- if config.fail_dead_code and high_conf_dead > 0:
- reasons.append(
- f"Dead code detected (high confidence): {high_conf_dead} item(s)."
- )
-
-
-def _append_new_metric_diff_reasons(
- *,
- reasons: list[str],
- metrics_diff: MetricsDiff | None,
- config: MetricGateConfig,
-) -> None:
- if not config.fail_on_new_metrics or metrics_diff is None:
- return
- if metrics_diff.new_high_risk_functions:
- reasons.append(
- "New high-risk functions vs metrics baseline: "
- f"{len(metrics_diff.new_high_risk_functions)}."
- )
- if metrics_diff.new_high_coupling_classes:
- reasons.append(
- "New high-coupling classes vs metrics baseline: "
- f"{len(metrics_diff.new_high_coupling_classes)}."
- )
- if metrics_diff.new_cycles:
- reasons.append(
- "New dependency cycles vs metrics baseline: "
- f"{len(metrics_diff.new_cycles)}."
- )
- if metrics_diff.new_dead_code:
- reasons.append(
- "New dead code items vs metrics baseline: "
- f"{len(metrics_diff.new_dead_code)}."
- )
- if metrics_diff.health_delta < 0:
- reasons.append(
- "Health score regressed vs metrics baseline: "
- f"delta={metrics_diff.health_delta}."
- )
-
-
-def _append_metric_gate_reason(
- *,
- reasons: list[str],
- enabled: bool,
- triggered: bool,
- message: str,
-) -> None:
- if enabled and triggered:
- reasons.append(message)
-
-
-def _append_adoption_metric_reasons(
- *,
- reasons: list[str],
- metrics_diff: MetricsDiff | None,
- project_metrics: ProjectMetrics,
- config: MetricGateConfig,
-) -> None:
- typing_percent = (
- _permille(
- project_metrics.typing_param_annotated,
- project_metrics.typing_param_total,
- )
- / 10.0
- )
- docstring_percent = (
- _permille(
- project_metrics.docstring_public_documented,
- project_metrics.docstring_public_total,
- )
- / 10.0
- )
- if config.min_typing_coverage >= 0 and typing_percent < float(
- config.min_typing_coverage
- ):
- reasons.append(
- "Typing coverage below threshold: "
- f"coverage={typing_percent:.1f}%, threshold={config.min_typing_coverage}%."
- )
- if config.min_docstring_coverage >= 0 and docstring_percent < float(
- config.min_docstring_coverage
- ):
- reasons.append(
- "Docstring coverage below threshold: "
- "coverage="
- f"{docstring_percent:.1f}%, "
- f"threshold={config.min_docstring_coverage}%."
- )
- if metrics_diff is None:
- return
- if config.fail_on_typing_regression:
- typing_delta = int(getattr(metrics_diff, "typing_param_permille_delta", 0))
- return_delta = int(getattr(metrics_diff, "typing_return_permille_delta", 0))
- if typing_delta < 0 or return_delta < 0:
- reasons.append(
- "Typing coverage regressed vs metrics baseline: "
- f"params_delta={typing_delta}, returns_delta={return_delta}."
- )
- docstring_delta = int(getattr(metrics_diff, "docstring_permille_delta", 0))
- _append_metric_gate_reason(
- reasons=reasons,
- enabled=config.fail_on_docstring_regression,
- triggered=docstring_delta < 0,
- message=(
- "Docstring coverage regressed vs metrics baseline: "
- f"delta={docstring_delta}."
- ),
- )
- api_breaking = tuple(
- cast(
- "Sequence[object]",
- getattr(metrics_diff, "new_api_breaking_changes", ()),
- )
- )
- _append_metric_gate_reason(
- reasons=reasons,
- enabled=config.fail_on_api_break,
- triggered=bool(api_breaking),
- message=(
- f"Public API breaking changes vs metrics baseline: {len(api_breaking)}."
- ),
- )
-
-
-def _append_coverage_join_reasons(
- *,
- reasons: list[str],
- coverage_join: CoverageJoinResult | None,
- config: MetricGateConfig,
-) -> None:
- if not config.fail_on_untested_hotspots or coverage_join is None:
- return
- if coverage_join.status != "ok":
- return
- if coverage_join.coverage_hotspots > 0:
- reasons.append(
- "Coverage hotspots detected: "
- f"hotspots={coverage_join.coverage_hotspots}, "
- f"threshold={config.coverage_min}%."
- )
-
-
-def _high_confidence_dead_code_count(items: Sequence[DeadItem]) -> int:
- return sum(1 for item in items if item.confidence == "high")
-
-
-def gate(
- *,
- boot: BootstrapResult,
- analysis: AnalysisResult,
- new_func: Collection[str],
- new_block: Collection[str],
- metrics_diff: MetricsDiff | None,
-) -> GatingResult:
- reasons: list[str] = []
-
- if analysis.project_metrics is not None:
- metric_reasons = metric_gate_reasons(
- project_metrics=analysis.project_metrics,
- coverage_join=analysis.coverage_join,
- metrics_diff=metrics_diff,
- config=MetricGateConfig(
- fail_complexity=boot.args.fail_complexity,
- fail_coupling=boot.args.fail_coupling,
- fail_cohesion=boot.args.fail_cohesion,
- fail_cycles=boot.args.fail_cycles,
- fail_dead_code=boot.args.fail_dead_code,
- fail_health=boot.args.fail_health,
- fail_on_new_metrics=boot.args.fail_on_new_metrics,
- fail_on_typing_regression=bool(
- getattr(boot.args, "fail_on_typing_regression", False)
- ),
- fail_on_docstring_regression=bool(
- getattr(boot.args, "fail_on_docstring_regression", False)
- ),
- fail_on_api_break=bool(getattr(boot.args, "fail_on_api_break", False)),
- fail_on_untested_hotspots=bool(
- getattr(boot.args, "fail_on_untested_hotspots", False)
- ),
- min_typing_coverage=int(getattr(boot.args, "min_typing_coverage", -1)),
- min_docstring_coverage=int(
- getattr(boot.args, "min_docstring_coverage", -1)
- ),
- coverage_min=int(getattr(boot.args, "coverage_min", 50)),
- ),
- )
- reasons.extend(f"metric:{reason}" for reason in metric_reasons)
-
- if boot.args.fail_on_new and (new_func or new_block):
- reasons.append("clone:new")
-
- total_clone_groups = analysis.func_clones_count + analysis.block_clones_count
- if 0 <= boot.args.fail_threshold < total_clone_groups:
- reasons.append(
- f"clone:threshold:{total_clone_groups}:{boot.args.fail_threshold}"
- )
-
- if reasons:
- return GatingResult(
- exit_code=int(ExitCode.GATING_FAILURE),
- reasons=tuple(reasons),
- )
-
- return GatingResult(exit_code=int(ExitCode.SUCCESS), reasons=())
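
The adoption gate above keeps coverage as integer permille and converts it to percent with `_permille(...) / 10.0`, while `-1` (the argparse fallback used in `gate`) disables a threshold entirely. A minimal sketch of that arithmetic, using a stand-in `_permille` whose zero-total behavior is an assumption, not the real helper:

```python
def _permille(part: int, total: int) -> int:
    # Stand-in for codeclone's helper; the zero-total case is an
    # assumption here and may differ in the real implementation.
    return round(1000 * part / total) if total else 0

typing_percent = _permille(873, 912) / 10.0  # 957 permille -> 95.7 percent
assert f"{typing_percent:.1f}" == "95.7"

# min_typing_coverage defaults to -1 above, which disables the check;
# any non-negative integer is compared against the percent value.
min_typing_coverage = 96
if min_typing_coverage >= 0 and typing_percent < float(min_typing_coverage):
    print(f"Typing coverage below threshold: coverage={typing_percent:.1f}%")
```

With these numbers the guard fires (95.7% < 96%), producing exactly the kind of reason string `_append_adoption_metric_reasons` emits.
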
diff --git a/codeclone/qualnames.py b/codeclone/qualnames/__init__.py
similarity index 100%
rename from codeclone/qualnames.py
rename to codeclone/qualnames/__init__.py
diff --git a/codeclone/report/__init__.py b/codeclone/report/__init__.py
index e5869f7..9135843 100644
--- a/codeclone/report/__init__.py
+++ b/codeclone/report/__init__.py
@@ -3,41 +3,3 @@
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
# SPDX-License-Identifier: MPL-2.0
# Copyright (c) 2026 Den Rozhnovskiy
-
-from __future__ import annotations
-
-from ..grouping import build_block_groups, build_groups, build_segment_groups
-from .blocks import prepare_block_report_groups
-from .explain import build_block_group_facts
-from .markdown import render_markdown_report_document, to_markdown_report
-from .sarif import render_sarif_report_document, to_sarif_report
-from .segments import (
- SEGMENT_MIN_UNIQUE_STMT_TYPES,
- prepare_segment_report_groups,
-)
-from .serialize import (
- render_json_report_document,
- render_text_report_document,
-)
-from .suggestions import classify_clone_type, generate_suggestions
-from .types import GroupItem, GroupMap
-
-__all__ = [
- "SEGMENT_MIN_UNIQUE_STMT_TYPES",
- "GroupItem",
- "GroupMap",
- "build_block_group_facts",
- "build_block_groups",
- "build_groups",
- "build_segment_groups",
- "classify_clone_type",
- "generate_suggestions",
- "prepare_block_report_groups",
- "prepare_segment_report_groups",
- "render_json_report_document",
- "render_markdown_report_document",
- "render_sarif_report_document",
- "render_text_report_document",
- "to_markdown_report",
- "to_sarif_report",
-]
diff --git a/codeclone/report/derived.py b/codeclone/report/derived.py
index 6873a08..7b07e30 100644
--- a/codeclone/report/derived.py
+++ b/codeclone/report/derived.py
@@ -7,9 +7,8 @@
from __future__ import annotations
from collections import Counter
-from typing import TYPE_CHECKING, cast
+from typing import TYPE_CHECKING
-from .._coerce import as_int as _as_int
from ..domain.source_scope import (
IMPACT_SCOPE_MIXED,
IMPACT_SCOPE_NON_RUNTIME,
@@ -31,6 +30,7 @@
from ..paths import (
relative_repo_path as _relative_repo_path,
)
+from ..utils.coerce import as_int as _as_int
if TYPE_CHECKING:
from collections.abc import Iterable, Mapping, Sequence
@@ -125,7 +125,7 @@ def normalized_source_kind(value: object) -> SourceKind:
def source_scope_from_counts(
counts: Mapping[SourceKind, int] | Mapping[str, int],
) -> dict[str, object]:
- normalized_counts = cast("Mapping[str, int]", counts)
+ normalized_counts = {str(key): int(value) for key, value in counts.items()}
def _count(kind: str) -> int:
value = normalized_counts.get(kind, 0)
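
The `source_scope_from_counts` change swaps a `cast` for a real coercion. `cast` is erased at runtime, so non-`str` keys would silently miss the later `normalized_counts.get(kind, 0)` lookups; the comprehension guarantees plain `str` keys and `int` values. A minimal sketch with a stand-in key type (`Kind` is illustrative, not the real `SourceKind`):

```python
from typing import Mapping, cast

class Kind:
    # Stand-in for a SourceKind-like key that is not a plain str.
    def __init__(self, name: str) -> None:
        self.name = name
    def __str__(self) -> str:
        return self.name

counts: dict[object, int] = {Kind("runtime"): 3}
casted = cast(Mapping[str, int], counts)  # runtime no-op: the keys stay Kind
normalized = {str(key): int(value) for key, value in counts.items()}

print(casted.get("runtime", 0))      # 0 -- the string lookup misses the Kind key
print(normalized.get("runtime", 0))  # 3 -- keys really are plain strings now
```
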
diff --git a/codeclone/report/document/__init__.py b/codeclone/report/document/__init__.py
new file mode 100644
index 0000000..9135843
--- /dev/null
+++ b/codeclone/report/document/__init__.py
@@ -0,0 +1,5 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
diff --git a/codeclone/report/document/_common.py b/codeclone/report/document/_common.py
new file mode 100644
index 0000000..832c664
--- /dev/null
+++ b/codeclone/report/document/_common.py
@@ -0,0 +1,414 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from collections import Counter
+from collections.abc import Collection, Iterable, Mapping, Sequence
+from typing import TYPE_CHECKING
+
+from ...contracts import (
+ DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD,
+ DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD,
+ DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD,
+)
+from ...domain.findings import (
+ CATEGORY_COHESION,
+ CATEGORY_COMPLEXITY,
+ CATEGORY_COUPLING,
+ CLONE_NOVELTY_KNOWN,
+ CLONE_NOVELTY_NEW,
+ FAMILY_DEAD_CODE,
+)
+from ...domain.quality import (
+ EFFORT_WEIGHT,
+ SEVERITY_RANK,
+)
+from ...findings.structural.detectors import normalize_structural_findings
+from ...utils.coerce import as_int as _as_int
+from ...utils.coerce import as_mapping as _as_mapping
+from ...utils.coerce import as_sequence as _as_sequence
+from ..derived import (
+ normalized_source_kind as _normalized_source_kind,
+)
+from ..derived import (
+ relative_report_path,
+ report_location_from_group_item,
+)
+from ..derived import (
+ source_scope_from_counts as _report_source_scope_from_counts,
+)
+from ..derived import (
+ source_scope_from_locations as _report_source_scope_from_locations,
+)
+
+if TYPE_CHECKING:
+ from ...models import (
+ GroupMapLike,
+ SourceKind,
+ StructuralFindingGroup,
+ SuppressedCloneGroup,
+ )
+
+_OVERLOADED_MODULES_FAMILY = "overloaded_modules"
+_COVERAGE_ADOPTION_FAMILY = "coverage_adoption"
+_API_SURFACE_FAMILY = "api_surface"
+_COVERAGE_JOIN_FAMILY = "coverage_join"
+_SECURITY_SURFACES_FAMILY = "security_surfaces"
+
+
+def _optional_str(value: object) -> str | None:
+ if value is None:
+ return None
+ text = str(value).strip()
+ return text or None
+
+
+def _coerced_nonnegative_threshold(value: object, *, default: int) -> int:
+ threshold = _as_int(value, default)
+ return threshold if threshold >= 0 else default
+
+
+def _design_findings_thresholds_payload(
+ raw_meta: Mapping[str, object] | None,
+) -> dict[str, object]:
+ meta = dict(raw_meta or {})
+ return {
+ "design_findings": {
+ CATEGORY_COMPLEXITY: {
+ "metric": "cyclomatic_complexity",
+ "operator": ">",
+ "value": _coerced_nonnegative_threshold(
+ meta.get("design_complexity_threshold"),
+ default=DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD,
+ ),
+ },
+ CATEGORY_COUPLING: {
+ "metric": "cbo",
+ "operator": ">",
+ "value": _coerced_nonnegative_threshold(
+ meta.get("design_coupling_threshold"),
+ default=DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD,
+ ),
+ },
+ CATEGORY_COHESION: {
+ "metric": "lcom4",
+ "operator": ">=",
+ "value": _coerced_nonnegative_threshold(
+ meta.get("design_cohesion_threshold"),
+ default=DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD,
+ ),
+ },
+ }
+ }
+
+
+def _analysis_profile_payload(
+ raw_meta: Mapping[str, object] | None,
+) -> dict[str, int] | None:
+ meta = dict(raw_meta or {})
+ nested = _as_mapping(meta.get("analysis_profile"))
+ if nested:
+ meta = dict(nested)
+ keys = (
+ "min_loc",
+ "min_stmt",
+ "block_min_loc",
+ "block_min_stmt",
+ "segment_min_loc",
+ "segment_min_stmt",
+ )
+ if any(key not in meta for key in keys):
+ return None
+ payload = {key: _as_int(meta.get(key), -1) for key in keys}
+ if any(value < 0 for value in payload.values()):
+ return None
+ return payload
+
+
+def _normalize_path(value: str) -> str:
+ return value.replace("\\", "/").strip()
+
+
+def _is_absolute_path(value: str) -> bool:
+ normalized = _normalize_path(value)
+ if not normalized:
+ return False
+ if normalized.startswith("/"):
+ return True
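+    # Windows drive-letter paths ("C:/...") are treated as absolute as well.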
+ return len(normalized) > 2 and normalized[1] == ":" and normalized[2] == "/"
+
+
+def _contract_path(
+ value: object,
+ *,
+ scan_root: str,
+) -> tuple[str | None, str | None, str | None]:
+ path_text = _optional_str(value)
+ if path_text is None:
+ return None, None, None
+ normalized_path = _normalize_path(path_text)
+ relative_path = relative_report_path(normalized_path, scan_root=scan_root)
+ if relative_path and relative_path != normalized_path:
+ return relative_path, "in_root", normalized_path
+ if _is_absolute_path(normalized_path):
+ return normalized_path.rsplit("/", maxsplit=1)[-1], "external", normalized_path
+ return normalized_path, "relative", None
+
+
+def _contract_report_location_path(location_path: str, *, scan_root: str) -> str:
+ contract_path, _scope, _absolute = _contract_path(
+ location_path,
+ scan_root=scan_root,
+ )
+ return contract_path or ""
+
+
+def _priority(
+ severity: str,
+ effort: str,
+) -> float:
+ severity_rank = SEVERITY_RANK.get(severity, 1)
+ effort_rank = EFFORT_WEIGHT.get(effort, 1)
+ return float(severity_rank) / float(effort_rank)
+
+
+def _clone_novelty(
+ *,
+ group_key: str,
+ baseline_trusted: bool,
+ new_keys: Collection[str] | None,
+) -> str:
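+    # Without a trusted baseline (or with no recorded keys), every group is new.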
+ if not baseline_trusted:
+ return CLONE_NOVELTY_NEW
+ if new_keys is None:
+ return CLONE_NOVELTY_NEW
+ return CLONE_NOVELTY_NEW if group_key in new_keys else CLONE_NOVELTY_KNOWN
+
+
+def _item_sort_key(item: Mapping[str, object]) -> tuple[str, int, int, str]:
+ return (
+ str(item.get("relative_path", "")),
+ _as_int(item.get("start_line")),
+ _as_int(item.get("end_line")),
+ str(item.get("qualname", "")),
+ )
+
+
+def _parse_bool_text(value: object) -> bool:
+ text = str(value).strip().lower()
+ return text in {"1", "true", "yes"}
+
+
+def _parse_ratio_percent(value: object) -> float | None:
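+    # Accepts "45%", "0.45", or bare "45"; bare values above 1.0 are read as percents.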
+ text = str(value).strip()
+ if not text:
+ return None
+ if text.endswith("%"):
+ try:
+ return float(text[:-1]) / 100.0
+ except ValueError:
+ return None
+ try:
+ numeric = float(text)
+ except ValueError:
+ return None
+ return numeric if numeric <= 1.0 else numeric / 100.0
+
+
+def _normalize_block_machine_facts(
+ *,
+ group_key: str,
+ group_arity: int,
+ block_facts: Mapping[str, str],
+) -> tuple[dict[str, object], dict[str, str]]:
+ facts: dict[str, object] = {
+ "group_key": group_key,
+ "group_arity": group_arity,
+ }
+ display_facts: dict[str, str] = {}
+ for key in sorted(block_facts):
+ value = str(block_facts[key])
+ match key:
+ case "group_arity":
+ facts[key] = _as_int(value)
+ case "block_size" | "consecutive_asserts" | "instance_peer_count":
+ facts[key] = _as_int(value)
+ case "merged_regions":
+ facts[key] = _parse_bool_text(value)
+ case "assert_ratio":
+ ratio = _parse_ratio_percent(value)
+ if ratio is not None:
+ facts[key] = ratio
+ display_facts[key] = value
+ case (
+ "match_rule" | "pattern" | "signature_kind" | "hint" | "hint_confidence"
+ ):
+ facts[key] = value
+ case _:
+ display_facts[key] = value
+ return facts, display_facts
+
+
+def _source_scope_from_filepaths(
+ filepaths: Iterable[str],
+ *,
+ scan_root: str,
+) -> dict[str, object]:
+ counts: Counter[SourceKind] = Counter()
+ for filepath in filepaths:
+ location = report_location_from_group_item(
+ {"filepath": filepath, "start_line": 0, "end_line": 0, "qualname": ""},
+ scan_root=scan_root,
+ )
+ counts[location.source_kind] += 1
+ return _source_scope_from_counts(counts)
+
+
+def _source_scope_from_counts(
+ counts: Mapping[SourceKind, int],
+) -> dict[str, object]:
+ return _report_source_scope_from_counts(counts)
+
+
+def _source_scope_from_locations(
+ locations: Sequence[Mapping[str, object]],
+) -> dict[str, object]:
+ normalized_locations = [
+ {"source_kind": _normalized_source_kind(location.get("source_kind"))}
+ for location in locations
+ ]
+ return _report_source_scope_from_locations(normalized_locations)
+
+
+def _collect_paths_from_metrics(metrics: Mapping[str, object]) -> set[str]:
+ paths: set[str] = set()
+ complexity = _as_mapping(metrics.get(CATEGORY_COMPLEXITY))
+ for item in _as_sequence(complexity.get("functions")):
+ item_map = _as_mapping(item)
+ filepath = _optional_str(item_map.get("filepath"))
+ if filepath is not None:
+ paths.add(filepath)
+ for family_name in (CATEGORY_COUPLING, CATEGORY_COHESION):
+ family = _as_mapping(metrics.get(family_name))
+ for item in _as_sequence(family.get("classes")):
+ item_map = _as_mapping(item)
+ filepath = _optional_str(item_map.get("filepath"))
+ if filepath is not None:
+ paths.add(filepath)
+ dead_code = _as_mapping(metrics.get(FAMILY_DEAD_CODE))
+ for item in _as_sequence(dead_code.get("items")):
+ item_map = _as_mapping(item)
+ filepath = _optional_str(item_map.get("filepath"))
+ if filepath is not None:
+ paths.add(filepath)
+ for item in _as_sequence(dead_code.get("suppressed_items")):
+ item_map = _as_mapping(item)
+ filepath = _optional_str(item_map.get("filepath"))
+ if filepath is not None:
+ paths.add(filepath)
+ overloaded_modules = _as_mapping(metrics.get(_OVERLOADED_MODULES_FAMILY))
+ for item in _as_sequence(overloaded_modules.get("items")):
+ item_map = _as_mapping(item)
+ filepath = _optional_str(item_map.get("filepath"))
+ if filepath is not None:
+ paths.add(filepath)
+ coverage_adoption = _as_mapping(metrics.get(_COVERAGE_ADOPTION_FAMILY))
+ for item in _as_sequence(coverage_adoption.get("items")):
+ item_map = _as_mapping(item)
+ filepath = _optional_str(item_map.get("filepath"))
+ if filepath is not None:
+ paths.add(filepath)
+ api_surface = _as_mapping(metrics.get(_API_SURFACE_FAMILY))
+ for item in _as_sequence(api_surface.get("items")):
+ item_map = _as_mapping(item)
+ filepath = _optional_str(item_map.get("filepath"))
+ if filepath is not None:
+ paths.add(filepath)
+ coverage_join = _as_mapping(metrics.get(_COVERAGE_JOIN_FAMILY))
+ for item in _as_sequence(coverage_join.get("items")):
+ item_map = _as_mapping(item)
+ filepath = _optional_str(item_map.get("filepath"))
+ if filepath is not None:
+ paths.add(filepath)
+ security_surfaces = _as_mapping(metrics.get(_SECURITY_SURFACES_FAMILY))
+ for item in _as_sequence(security_surfaces.get("items")):
+ item_map = _as_mapping(item)
+ filepath = _optional_str(item_map.get("filepath"))
+ if filepath is not None:
+ paths.add(filepath)
+ return paths
+
+
+def _collect_report_file_list(
+ *,
+ inventory: Mapping[str, object] | None,
+ func_groups: GroupMapLike,
+ block_groups: GroupMapLike,
+ segment_groups: GroupMapLike,
+ suppressed_clone_groups: Sequence[SuppressedCloneGroup] | None = None,
+ metrics: Mapping[str, object] | None,
+ structural_findings: Sequence[StructuralFindingGroup] | None,
+) -> list[str]:
+ files: set[str] = set()
+ inventory_map = _as_mapping(inventory)
+ for filepath in _as_sequence(inventory_map.get("file_list")):
+ file_text = _optional_str(filepath)
+ if file_text is not None:
+ files.add(file_text)
+ for groups in (func_groups, block_groups, segment_groups):
+ for items in groups.values():
+ for item in items:
+ filepath = _optional_str(item.get("filepath"))
+ if filepath is not None:
+ files.add(filepath)
+ for suppressed_group in suppressed_clone_groups or ():
+ for item in suppressed_group.items:
+ filepath = _optional_str(item.get("filepath"))
+ if filepath is not None:
+ files.add(filepath)
+ if metrics is not None:
+ files.update(_collect_paths_from_metrics(metrics))
+ if structural_findings:
+ for structural_group in normalize_structural_findings(structural_findings):
+ for occurrence in structural_group.items:
+ filepath = _optional_str(occurrence.file_path)
+ if filepath is not None:
+ files.add(filepath)
+ return sorted(files)
+
+
+def _count_file_lines(filepaths: Sequence[str]) -> int:
+ total = 0
+ for filepath in filepaths:
+ total += _count_file_lines_for_path(filepath)
+ return total
+
+
+def _count_file_lines_for_path(filepath: str) -> int:
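+    # Unreadable files count as zero lines rather than failing the report.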
+ try:
+ with open(filepath, encoding="utf-8", errors="surrogateescape") as handle:
+ return sum(1 for _ in handle)
+ except OSError:
+ return 0
+
+
+def _normalize_nested_string_rows(value: object) -> list[list[str]]:
+ rows: list[tuple[str, ...]] = []
+ for row in _as_sequence(value):
+ modules = tuple(
+ str(module) for module in _as_sequence(row) if str(module).strip()
+ )
+ if modules:
+ rows.append(modules)
+ rows.sort(key=lambda row: (len(row), row))
+ return [list(row) for row in rows]
+
+
+__all__ = [
+ "_collect_report_file_list",
+ "normalize_structural_findings",
+]
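
`_contract_path` above classifies a path three ways: rebased into the scan root (`in_root`), reduced to its basename when absolute but outside the root (`external`), or passed through unchanged (`relative`). A self-contained sketch of those rules, with a simplified `relative_report_path` stand-in (the real helper in `codeclone.report.derived` may handle more cases):

```python
def relative_report_path(path: str, *, scan_root: str) -> str:
    # Simplified stand-in: strip the root prefix when present.
    root = scan_root.rstrip("/") + "/"
    return path[len(root):] if path.startswith(root) else path

def contract_path(value: str, *, scan_root: str):
    normalized = value.replace("\\", "/").strip()
    relative = relative_report_path(normalized, scan_root=scan_root)
    if relative and relative != normalized:
        return relative, "in_root", normalized
    is_absolute = normalized.startswith("/") or (
        len(normalized) > 2 and normalized[1] == ":" and normalized[2] == "/"
    )
    if is_absolute:
        return normalized.rsplit("/", maxsplit=1)[-1], "external", normalized
    return normalized, "relative", None

assert contract_path("/repo/pkg/mod.py", scan_root="/repo") == (
    "pkg/mod.py", "in_root", "/repo/pkg/mod.py")
assert contract_path("/usr/lib/site.py", scan_root="/repo") == (
    "site.py", "external", "/usr/lib/site.py")
assert contract_path("pkg/mod.py", scan_root="/repo") == (
    "pkg/mod.py", "relative", None)
```
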
diff --git a/codeclone/report/document/_design_groups.py b/codeclone/report/document/_design_groups.py
new file mode 100644
index 0000000..01f0a77
--- /dev/null
+++ b/codeclone/report/document/_design_groups.py
@@ -0,0 +1,389 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+
+from ...contracts import (
+ DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD,
+ DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD,
+ DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD,
+)
+from ...domain.findings import (
+ CATEGORY_COHESION,
+ CATEGORY_COMPLEXITY,
+ CATEGORY_COUPLING,
+ CATEGORY_COVERAGE,
+ CATEGORY_DEPENDENCY,
+ FAMILY_DESIGN,
+ FINDING_KIND_COVERAGE_HOTSPOT,
+ FINDING_KIND_COVERAGE_SCOPE_GAP,
+)
+from ...domain.quality import (
+ CONFIDENCE_HIGH,
+ EFFORT_HARD,
+ EFFORT_MODERATE,
+ RISK_LOW,
+ SEVERITY_CRITICAL,
+ SEVERITY_WARNING,
+)
+from ...findings.ids import design_group_id
+from ...utils.coerce import as_float as _as_float
+from ...utils.coerce import as_int as _as_int
+from ...utils.coerce import as_mapping as _as_mapping
+from ...utils.coerce import as_sequence as _as_sequence
+from ..derived import (
+ report_location_from_group_item,
+)
+from ._common import (
+ _COVERAGE_JOIN_FAMILY,
+ _coerced_nonnegative_threshold,
+ _contract_report_location_path,
+ _priority,
+ _source_scope_from_filepaths,
+)
+from ._findings_groups import _single_location_source_scope
+
+
+def _design_singleton_group(
+ *,
+ category: str,
+ kind: str,
+ severity: str,
+ qualname: str,
+ filepath: str,
+ start_line: int,
+ end_line: int,
+ scan_root: str,
+ item_data: Mapping[str, object],
+ facts: Mapping[str, object],
+) -> dict[str, object]:
+ return {
+ "id": design_group_id(category, qualname),
+ "family": FAMILY_DESIGN,
+ "category": category,
+ "kind": kind,
+ "severity": severity,
+ "confidence": CONFIDENCE_HIGH,
+ "priority": _priority(severity, EFFORT_MODERATE),
+ "count": 1,
+ "source_scope": _single_location_source_scope(
+ filepath,
+ scan_root=scan_root,
+ ),
+ "spread": {"files": 1, "functions": 1},
+ "items": [
+ {
+ "relative_path": _contract_report_location_path(
+ filepath,
+ scan_root=scan_root,
+ ),
+ "qualname": qualname,
+ "start_line": start_line,
+ "end_line": end_line,
+ **item_data,
+ }
+ ],
+ "facts": dict(facts),
+ }
+
+
+def _complexity_design_group(
+ item_map: Mapping[str, object],
+ *,
+ threshold: int,
+ scan_root: str,
+) -> dict[str, object] | None:
+ cc = _as_int(item_map.get("cyclomatic_complexity"), 1)
+ if cc <= threshold:
+ return None
+ qualname = str(item_map.get("qualname", ""))
+ filepath = str(item_map.get("relative_path", ""))
+ nesting_depth = _as_int(item_map.get("nesting_depth"))
+ severity = SEVERITY_CRITICAL if cc > 40 else SEVERITY_WARNING
+ return _design_singleton_group(
+ category=CATEGORY_COMPLEXITY,
+ kind="function_hotspot",
+ severity=severity,
+ qualname=qualname,
+ filepath=filepath,
+ start_line=_as_int(item_map.get("start_line")),
+ end_line=_as_int(item_map.get("end_line")),
+ scan_root=scan_root,
+ item_data={
+ "cyclomatic_complexity": cc,
+ "nesting_depth": nesting_depth,
+ "risk": str(item_map.get("risk", RISK_LOW)),
+ },
+ facts={
+ "cyclomatic_complexity": cc,
+ "nesting_depth": nesting_depth,
+ },
+ )
+
+
+def _coupling_design_group(
+ item_map: Mapping[str, object],
+ *,
+ threshold: int,
+ scan_root: str,
+) -> dict[str, object] | None:
+ cbo = _as_int(item_map.get("cbo"))
+ if cbo <= threshold:
+ return None
+ qualname = str(item_map.get("qualname", ""))
+ filepath = str(item_map.get("relative_path", ""))
+ coupled_classes = list(_as_sequence(item_map.get("coupled_classes")))
+ return _design_singleton_group(
+ category=CATEGORY_COUPLING,
+ kind="class_hotspot",
+ severity=SEVERITY_WARNING,
+ qualname=qualname,
+ filepath=filepath,
+ start_line=_as_int(item_map.get("start_line")),
+ end_line=_as_int(item_map.get("end_line")),
+ scan_root=scan_root,
+ item_data={
+ "cbo": cbo,
+ "risk": str(item_map.get("risk", RISK_LOW)),
+ "coupled_classes": coupled_classes,
+ },
+ facts={
+ "cbo": cbo,
+ "coupled_classes": coupled_classes,
+ },
+ )
+
+
+def _cohesion_design_group(
+ item_map: Mapping[str, object],
+ *,
+ threshold: int,
+ scan_root: str,
+) -> dict[str, object] | None:
+ lcom4 = _as_int(item_map.get("lcom4"))
+ if lcom4 < threshold:
+ return None
+ qualname = str(item_map.get("qualname", ""))
+ filepath = str(item_map.get("relative_path", ""))
+ method_count = _as_int(item_map.get("method_count"))
+ instance_var_count = _as_int(item_map.get("instance_var_count"))
+ return _design_singleton_group(
+ category=CATEGORY_COHESION,
+ kind="class_hotspot",
+ severity=SEVERITY_WARNING,
+ qualname=qualname,
+ filepath=filepath,
+ start_line=_as_int(item_map.get("start_line")),
+ end_line=_as_int(item_map.get("end_line")),
+ scan_root=scan_root,
+ item_data={
+ "lcom4": lcom4,
+ "risk": str(item_map.get("risk", RISK_LOW)),
+ "method_count": method_count,
+ "instance_var_count": instance_var_count,
+ },
+ facts={
+ "lcom4": lcom4,
+ "method_count": method_count,
+ "instance_var_count": instance_var_count,
+ },
+ )
+
+
+def _dependency_design_group(
+ cycle: object,
+ *,
+ scan_root: str,
+) -> dict[str, object] | None:
+ modules = [str(module) for module in _as_sequence(cycle) if str(module).strip()]
+ if not modules:
+ return None
+ cycle_key = " -> ".join(modules)
+ return {
+ "id": design_group_id(CATEGORY_DEPENDENCY, cycle_key),
+ "family": FAMILY_DESIGN,
+ "category": CATEGORY_DEPENDENCY,
+ "kind": "cycle",
+ "severity": SEVERITY_CRITICAL,
+ "confidence": CONFIDENCE_HIGH,
+ "priority": _priority(SEVERITY_CRITICAL, EFFORT_HARD),
+ "count": len(modules),
+ "source_scope": _source_scope_from_filepaths(
+ (module.replace(".", "/") + ".py" for module in modules),
+ scan_root=scan_root,
+ ),
+ "spread": {"files": len(modules), "functions": 0},
+ "items": [
+ {
+ "module": module,
+ "relative_path": module.replace(".", "/") + ".py",
+                    "source_kind": report_location_from_group_item(
+                        {
+                            "filepath": module.replace(".", "/") + ".py",
+                            "qualname": "",
+                            "start_line": 0,
+                            "end_line": 0,
+                        },
+                        scan_root=scan_root,
+                    ).source_kind,
+ }
+ for module in modules
+ ],
+ "facts": {
+ "cycle_length": len(modules),
+ },
+ }
+
+
+def _coverage_design_group(
+ item_map: Mapping[str, object],
+ *,
+ threshold_percent: int,
+ scan_root: str,
+) -> dict[str, object] | None:
+ coverage_hotspot = bool(item_map.get("coverage_hotspot"))
+ scope_gap_hotspot = bool(item_map.get("scope_gap_hotspot"))
+ if not coverage_hotspot and not scope_gap_hotspot:
+ return None
+ qualname = str(item_map.get("qualname", "")).strip()
+ filepath = str(item_map.get("relative_path", "")).strip()
+ if not filepath:
+ return None
+ start_line = _as_int(item_map.get("start_line"))
+ end_line = _as_int(item_map.get("end_line"))
+ subject_key = qualname or f"{filepath}:{start_line}:{end_line}"
+ risk = str(item_map.get("risk", RISK_LOW)).strip() or RISK_LOW
+ coverage_status = str(item_map.get("coverage_status", "")).strip()
+ coverage_permille = _as_int(item_map.get("coverage_permille"))
+ covered_lines = _as_int(item_map.get("covered_lines"))
+ executable_lines = _as_int(item_map.get("executable_lines"))
+ complexity = _as_int(item_map.get("cyclomatic_complexity"), 1)
+ severity = SEVERITY_CRITICAL if risk == "high" else SEVERITY_WARNING
+ if scope_gap_hotspot:
+ kind = FINDING_KIND_COVERAGE_SCOPE_GAP
+ detail = "The supplied coverage.xml did not map to this function's file."
+ else:
+ kind = FINDING_KIND_COVERAGE_HOTSPOT
+ detail = "Joined line coverage is below the configured hotspot threshold."
+ return {
+ "id": design_group_id(CATEGORY_COVERAGE, subject_key),
+ "family": FAMILY_DESIGN,
+ "category": CATEGORY_COVERAGE,
+ "kind": kind,
+ "severity": severity,
+ "confidence": CONFIDENCE_HIGH,
+ "priority": _priority(severity, EFFORT_MODERATE),
+ "count": 1,
+ "source_scope": _single_location_source_scope(
+ filepath,
+ scan_root=scan_root,
+ ),
+ "spread": {"files": 1, "functions": 1},
+ "items": [
+ {
+ "relative_path": filepath,
+ "qualname": qualname,
+ "start_line": start_line,
+ "end_line": end_line,
+ "risk": risk,
+ "cyclomatic_complexity": complexity,
+ "coverage_permille": coverage_permille,
+ "coverage_status": coverage_status,
+ "covered_lines": covered_lines,
+ "executable_lines": executable_lines,
+ "coverage_hotspot": coverage_hotspot,
+ "scope_gap_hotspot": scope_gap_hotspot,
+ }
+ ],
+ "facts": {
+ "coverage_permille": coverage_permille,
+ "hotspot_threshold_percent": threshold_percent,
+ "coverage_status": coverage_status,
+ "covered_lines": covered_lines,
+ "executable_lines": executable_lines,
+ "cyclomatic_complexity": complexity,
+ "coverage_hotspot": coverage_hotspot,
+ "scope_gap_hotspot": scope_gap_hotspot,
+ "detail": detail,
+ },
+ }
+
+
+def _build_design_groups(
+ metrics_payload: Mapping[str, object],
+ *,
+ design_thresholds: Mapping[str, object] | None = None,
+ scan_root: str,
+) -> list[dict[str, object]]:
+ families = _as_mapping(metrics_payload.get("families"))
+ thresholds = _as_mapping(design_thresholds)
+ complexity_threshold = _coerced_nonnegative_threshold(
+ _as_mapping(thresholds.get(CATEGORY_COMPLEXITY)).get("value"),
+ default=DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD,
+ )
+ coupling_threshold = _coerced_nonnegative_threshold(
+ _as_mapping(thresholds.get(CATEGORY_COUPLING)).get("value"),
+ default=DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD,
+ )
+ cohesion_threshold = _coerced_nonnegative_threshold(
+ _as_mapping(thresholds.get(CATEGORY_COHESION)).get("value"),
+ default=DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD,
+ )
+ coverage_join = _as_mapping(families.get(_COVERAGE_JOIN_FAMILY))
+ coverage_threshold = _as_int(
+ _as_mapping(coverage_join.get("summary")).get("hotspot_threshold_percent"),
+ 50,
+ )
+ groups: list[dict[str, object]] = []
+
+ complexity = _as_mapping(families.get(CATEGORY_COMPLEXITY))
+ for item in _as_sequence(complexity.get("items")):
+ group = _complexity_design_group(
+ _as_mapping(item),
+ threshold=complexity_threshold,
+ scan_root=scan_root,
+ )
+ if group is not None:
+ groups.append(group)
+
+ coupling = _as_mapping(families.get(CATEGORY_COUPLING))
+ for item in _as_sequence(coupling.get("items")):
+ group = _coupling_design_group(
+ _as_mapping(item),
+ threshold=coupling_threshold,
+ scan_root=scan_root,
+ )
+ if group is not None:
+ groups.append(group)
+
+ cohesion = _as_mapping(families.get(CATEGORY_COHESION))
+ for item in _as_sequence(cohesion.get("items")):
+ group = _cohesion_design_group(
+ _as_mapping(item),
+ threshold=cohesion_threshold,
+ scan_root=scan_root,
+ )
+ if group is not None:
+ groups.append(group)
+
+ dependencies = _as_mapping(families.get("dependencies"))
+ for cycle in _as_sequence(dependencies.get("cycles")):
+ group = _dependency_design_group(cycle, scan_root=scan_root)
+ if group is not None:
+ groups.append(group)
+
+ for item in _as_sequence(coverage_join.get("items")):
+ group = _coverage_design_group(
+ _as_mapping(item),
+ threshold_percent=coverage_threshold,
+ scan_root=scan_root,
+ )
+ if group is not None:
+ groups.append(group)
+
+ groups.sort(key=lambda group: (-_as_float(group["priority"]), str(group["id"])))
+ return groups
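
Design groups are ordered by `priority = severity_rank / effort_weight`, descending, with the group id as tiebreaker. A sketch of that ordering; the rank and weight tables here are illustrative assumptions, since the real ones live in `codeclone.domain.quality`:

```python
SEVERITY_RANK = {"info": 1, "warning": 2, "critical": 3}  # assumed values
EFFORT_WEIGHT = {"easy": 1, "moderate": 2, "hard": 3}     # assumed values

def priority(severity: str, effort: str) -> float:
    # Higher severity raises priority; harder fixes lower it.
    return SEVERITY_RANK.get(severity, 1) / EFFORT_WEIGHT.get(effort, 1)

groups = [
    {"id": "design:complexity:a", "priority": priority("critical", "moderate")},  # 1.5
    {"id": "design:dependency:b", "priority": priority("critical", "hard")},      # 1.0
    {"id": "design:coupling:c", "priority": priority("warning", "moderate")},     # 1.0
]
groups.sort(key=lambda g: (-g["priority"], g["id"]))
print([g["id"] for g in groups])
# ['design:complexity:a', 'design:coupling:c', 'design:dependency:b']
```

Ties on priority (here 1.0) fall back to lexicographic id order, which keeps the report output deterministic.
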
diff --git a/codeclone/report/document/_findings_groups.py b/codeclone/report/document/_findings_groups.py
new file mode 100644
index 0000000..8653708
--- /dev/null
+++ b/codeclone/report/document/_findings_groups.py
@@ -0,0 +1,606 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from collections.abc import Collection, Mapping, Sequence
+from typing import TYPE_CHECKING, Literal
+
+from ...domain.findings import (
+ CLONE_KIND_BLOCK,
+ CLONE_KIND_FUNCTION,
+ CLONE_KIND_SEGMENT,
+ FAMILY_CLONE,
+ FAMILY_DEAD_CODE,
+ FAMILY_STRUCTURAL,
+)
+from ...domain.quality import (
+ CONFIDENCE_HIGH,
+ CONFIDENCE_MEDIUM,
+ EFFORT_EASY,
+ RISK_LOW,
+ SEVERITY_CRITICAL,
+ SEVERITY_INFO,
+ SEVERITY_WARNING,
+)
+from ...findings.structural.detectors import normalize_structural_findings
+from ...utils.coerce import as_float as _as_float
+from ...utils.coerce import as_int as _as_int
+from ...utils.coerce import as_mapping as _as_mapping
+from ...utils.coerce import as_sequence as _as_sequence
+from ..derived import (
+ group_spread,
+ report_location_from_group_item,
+ report_location_from_structural_occurrence,
+)
+from ..suggestions import classify_clone_type
+
+if TYPE_CHECKING:
+ from ...models import (
+ GroupItemLike,
+ GroupMapLike,
+ StructuralFindingGroup,
+ SuppressedCloneGroup,
+ )
+
+from ...findings.ids import clone_group_id, dead_code_group_id, structural_group_id
+from ._common import (
+ _clone_novelty,
+ _contract_report_location_path,
+ _item_sort_key,
+ _normalize_block_machine_facts,
+ _priority,
+ _source_scope_from_locations,
+)
+
+
+def _clone_group_assessment(
+ *,
+ count: int,
+ clone_type: str,
+) -> tuple[str, float]:
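+    # Groups with 4+ occurrences are critical; smaller Type-1/Type-2 groups warn.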
+ match (count >= 4, clone_type in {"Type-1", "Type-2"}):
+ case (True, _):
+ severity = SEVERITY_CRITICAL
+ case (False, True):
+ severity = SEVERITY_WARNING
+ case _:
+ severity = SEVERITY_INFO
+ effort = "easy" if clone_type in {"Type-1", "Type-2"} else "moderate"
+ return severity, _priority(severity, effort)
+
+
+def _build_clone_group_facts(
+ *,
+ group_key: str,
+ kind: Literal["function", "block", "segment"],
+ items: Sequence[GroupItemLike],
+ block_facts: Mapping[str, Mapping[str, str]],
+) -> tuple[dict[str, object], dict[str, str]]:
+ base: dict[str, object] = {
+ "group_key": group_key,
+ "group_arity": len(items),
+ }
+ display_facts: dict[str, str] = {}
+ match kind:
+ case "function":
+ loc_buckets = sorted(
+ {
+ str(item.get("loc_bucket", ""))
+ for item in items
+ if str(item.get("loc_bucket", "")).strip()
+ }
+ )
+ base["loc_buckets"] = loc_buckets
+ case "block" if group_key in block_facts:
+ typed_facts, block_display_facts = _normalize_block_machine_facts(
+ group_key=group_key,
+ group_arity=len(items),
+ block_facts=block_facts[group_key],
+ )
+ base.update(typed_facts)
+ display_facts.update(block_display_facts)
+ case _:
+ pass
+ return base, display_facts
+
+
+def _clone_item_payload(
+ item: GroupItemLike,
+ *,
+ kind: Literal["function", "block", "segment"],
+ scan_root: str,
+) -> dict[str, object]:
+ payload: dict[str, object] = {
+ "relative_path": _contract_report_location_path(
+ str(item.get("filepath", "")),
+ scan_root=scan_root,
+ ),
+ "qualname": str(item.get("qualname", "")),
+ "start_line": _as_int(item.get("start_line", 0)),
+ "end_line": _as_int(item.get("end_line", 0)),
+ }
+ match kind:
+ case "function":
+ payload.update(
+ {
+ "loc": _as_int(item.get("loc", 0)),
+ "stmt_count": _as_int(item.get("stmt_count", 0)),
+ "fingerprint": str(item.get("fingerprint", "")),
+ "loc_bucket": str(item.get("loc_bucket", "")),
+ "cyclomatic_complexity": _as_int(
+ item.get("cyclomatic_complexity", 1)
+ ),
+ "nesting_depth": _as_int(item.get("nesting_depth", 0)),
+ "risk": str(item.get("risk", RISK_LOW)),
+ "raw_hash": str(item.get("raw_hash", "")),
+ }
+ )
+ case "block":
+ payload["size"] = _as_int(item.get("size", 0))
+ case _:
+ payload.update(
+ {
+ "size": _as_int(item.get("size", 0)),
+ "segment_hash": str(item.get("segment_hash", "")),
+ "segment_sig": str(item.get("segment_sig", "")),
+ }
+ )
+ return payload
+
+
+def _build_clone_groups(
+ *,
+ groups: GroupMapLike,
+ kind: Literal["function", "block", "segment"],
+ baseline_trusted: bool,
+ new_keys: Collection[str] | None,
+ block_facts: Mapping[str, Mapping[str, str]],
+ scan_root: str,
+) -> list[dict[str, object]]:
+ encoded_groups: list[dict[str, object]] = []
+ new_key_set = set(new_keys) if new_keys is not None else None
+ for group_key in sorted(groups):
+ items = groups[group_key]
+ clone_type = classify_clone_type(items=items, kind=kind)
+ severity, priority = _clone_group_assessment(
+ count=len(items),
+ clone_type=clone_type,
+ )
+ novelty = _clone_novelty(
+ group_key=group_key,
+ baseline_trusted=baseline_trusted,
+ new_keys=new_key_set,
+ )
+ locations = tuple(
+ report_location_from_group_item(item, scan_root=scan_root) for item in items
+ )
+ source_scope = _source_scope_from_locations(
+ [
+ {
+ "source_kind": location.source_kind,
+ }
+ for location in locations
+ ]
+ )
+ spread_files, spread_functions = group_spread(locations)
+ rows = sorted(
+ [
+ _clone_item_payload(
+ item,
+ kind=kind,
+ scan_root=scan_root,
+ )
+ for item in items
+ ],
+ key=_item_sort_key,
+ )
+ facts, display_facts = _build_clone_group_facts(
+ group_key=group_key,
+ kind=kind,
+ items=items,
+ block_facts=block_facts,
+ )
+ encoded_groups.append(
+ {
+ "id": clone_group_id(kind, group_key),
+ "family": FAMILY_CLONE,
+ "category": kind,
+ "kind": "clone_group",
+ "severity": severity,
+ "confidence": CONFIDENCE_HIGH,
+ "priority": priority,
+ "clone_kind": kind,
+ "clone_type": clone_type,
+ "novelty": novelty,
+ "count": len(items),
+ "source_scope": source_scope,
+ "spread": {
+ "files": spread_files,
+ "functions": spread_functions,
+ },
+ "items": rows,
+ "facts": facts,
+ **({"display_facts": display_facts} if display_facts else {}),
+ }
+ )
+ encoded_groups.sort(
+ key=lambda group: (-_as_int(group.get("count")), str(group["id"]))
+ )
+ return encoded_groups
+
+
+def _build_suppressed_clone_groups(
+ *,
+ groups: Sequence[SuppressedCloneGroup] | None,
+ block_facts: Mapping[str, Mapping[str, str]],
+ scan_root: str,
+) -> dict[str, list[dict[str, object]]]:
+ buckets: dict[str, list[dict[str, object]]] = {
+ CLONE_KIND_FUNCTION: [],
+ CLONE_KIND_BLOCK: [],
+ CLONE_KIND_SEGMENT: [],
+ }
+ for group in groups or ():
+ items = group.items
+ clone_type = classify_clone_type(items=items, kind=group.kind)
+ severity, priority = _clone_group_assessment(
+ count=len(items),
+ clone_type=clone_type,
+ )
+ locations = tuple(
+ report_location_from_group_item(item, scan_root=scan_root) for item in items
+ )
+ source_scope = _source_scope_from_locations(
+ [
+ {
+ "source_kind": location.source_kind,
+ }
+ for location in locations
+ ]
+ )
+ spread_files, spread_functions = group_spread(locations)
+ rows = sorted(
+ [
+ _clone_item_payload(
+ item,
+ kind=group.kind,
+ scan_root=scan_root,
+ )
+ for item in items
+ ],
+ key=_item_sort_key,
+ )
+ facts, display_facts = _build_clone_group_facts(
+ group_key=group.group_key,
+ kind=group.kind,
+ items=items,
+ block_facts=block_facts,
+ )
+ encoded: dict[str, object] = {
+ "id": clone_group_id(group.kind, group.group_key),
+ "family": FAMILY_CLONE,
+ "category": group.kind,
+ "kind": "clone_group",
+ "severity": severity,
+ "confidence": CONFIDENCE_HIGH,
+ "priority": priority,
+ "clone_kind": group.kind,
+ "clone_type": clone_type,
+ "count": len(items),
+ "source_scope": source_scope,
+ "spread": {
+ "files": spread_files,
+ "functions": spread_functions,
+ },
+ "items": rows,
+ "facts": facts,
+ "suppression_rule": group.suppression_rule,
+ "suppression_source": group.suppression_source,
+ "matched_patterns": list(group.matched_patterns),
+ }
+ if display_facts:
+ encoded["display_facts"] = display_facts
+ buckets[group.kind].append(encoded)
+ for bucket in buckets.values():
+ bucket.sort(key=lambda group: (-_as_int(group.get("count")), str(group["id"])))
+ return buckets
+
+
+def _structural_group_assessment(
+ *,
+ finding_kind: str,
+ count: int,
+ spread_functions: int,
+) -> tuple[str, float]:
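+    # Guard-exit and cohort-drift findings can escalate to critical; other
+    # structural kinds cap at warning.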
+ match finding_kind:
+ case "clone_guard_exit_divergence" | "clone_cohort_drift":
+ severity = SEVERITY_WARNING
+ if count >= 3 or spread_functions > 1:
+ severity = SEVERITY_CRITICAL
+ return severity, _priority(severity, "moderate")
+ case _:
+ severity = (
+ SEVERITY_WARNING
+ if count >= 4 or spread_functions > 1
+ else SEVERITY_INFO
+ )
+ return severity, _priority(severity, "moderate")
+
+
+def _csv_values(value: object) -> list[str]:
+ raw = str(value).strip()
+ if not raw:
+ return []
+ return sorted({part.strip() for part in raw.split(",") if part.strip()})
+
+
+def _build_structural_signature(
+ finding_kind: str,
+ signature: Mapping[str, str],
+) -> dict[str, object]:
+ debug = {str(key): str(signature[key]) for key in sorted(signature)}
+ match finding_kind:
+ case "clone_guard_exit_divergence":
+ return {
+ "version": "1",
+ "stable": {
+ "family": "clone_guard_exit_divergence",
+ "cohort_id": str(signature.get("cohort_id", "")),
+ "majority_guard_count": _as_int(
+ signature.get("majority_guard_count")
+ ),
+ "majority_guard_terminal_profile": str(
+ signature.get("majority_guard_terminal_profile", "none")
+ ),
+ "majority_terminal_kind": str(
+ signature.get("majority_terminal_kind", "fallthrough")
+ ),
+ "majority_side_effect_before_guard": (
+ str(signature.get("majority_side_effect_before_guard", "0"))
+ == "1"
+ ),
+ },
+ "debug": debug,
+ }
+ case "clone_cohort_drift":
+ return {
+ "version": "1",
+ "stable": {
+ "family": "clone_cohort_drift",
+ "cohort_id": str(signature.get("cohort_id", "")),
+ "drift_fields": _csv_values(signature.get("drift_fields")),
+ "majority_profile": {
+ "terminal_kind": str(
+ signature.get("majority_terminal_kind", "")
+ ),
+ "guard_exit_profile": str(
+ signature.get("majority_guard_exit_profile", "")
+ ),
+ "try_finally_profile": str(
+ signature.get("majority_try_finally_profile", "")
+ ),
+ "side_effect_order_profile": str(
+ signature.get("majority_side_effect_order_profile", "")
+ ),
+ },
+ },
+ "debug": debug,
+ }
+ case _:
+ return {
+ "version": "1",
+ "stable": {
+ "family": "duplicated_branches",
+ "stmt_shape": str(signature.get("stmt_seq", "")),
+ "terminal_kind": str(signature.get("terminal", "")),
+ "control_flow": {
+ "has_loop": str(signature.get("has_loop", "0")) == "1",
+ "has_try": str(signature.get("has_try", "0")) == "1",
+ "nested_if": str(signature.get("nested_if", "0")) == "1",
+ },
+ },
+ "debug": debug,
+ }
+
+
+def _build_structural_facts(
+ finding_kind: str,
+ signature: Mapping[str, str],
+ *,
+ count: int,
+) -> dict[str, object]:
+ match finding_kind:
+ case "clone_guard_exit_divergence":
+ return {
+ "cohort_id": str(signature.get("cohort_id", "")),
+ "cohort_arity": _as_int(signature.get("cohort_arity")),
+ "divergent_members": _as_int(signature.get("divergent_members"), count),
+ "majority_entry_guard_count": _as_int(
+ signature.get("majority_guard_count"),
+ ),
+ "majority_guard_terminal_profile": str(
+ signature.get("majority_guard_terminal_profile", "none")
+ ),
+ "majority_terminal_kind": str(
+ signature.get("majority_terminal_kind", "fallthrough")
+ ),
+ "majority_side_effect_before_guard": (
+ str(signature.get("majority_side_effect_before_guard", "0")) == "1"
+ ),
+ "guard_count_values": _csv_values(signature.get("guard_count_values")),
+ "guard_terminal_values": _csv_values(
+ signature.get("guard_terminal_values"),
+ ),
+ "terminal_values": _csv_values(signature.get("terminal_values")),
+ "side_effect_before_guard_values": _csv_values(
+ signature.get("side_effect_before_guard_values"),
+ ),
+ }
+ case "clone_cohort_drift":
+ return {
+ "cohort_id": str(signature.get("cohort_id", "")),
+ "cohort_arity": _as_int(signature.get("cohort_arity")),
+ "divergent_members": _as_int(signature.get("divergent_members"), count),
+ "drift_fields": _csv_values(signature.get("drift_fields")),
+ "stable_majority_profile": {
+ "terminal_kind": str(signature.get("majority_terminal_kind", "")),
+ "guard_exit_profile": str(
+ signature.get("majority_guard_exit_profile", "")
+ ),
+ "try_finally_profile": str(
+ signature.get("majority_try_finally_profile", "")
+ ),
+ "side_effect_order_profile": str(
+ signature.get("majority_side_effect_order_profile", "")
+ ),
+ },
+ }
+ case _:
+ return {
+ "occurrence_count": count,
+ "non_overlapping": True,
+ "call_bucket": _as_int(signature.get("calls", "0")),
+ "raise_bucket": _as_int(signature.get("raises", "0")),
+ }
+
+
+def _build_structural_groups(
+ groups: Sequence[StructuralFindingGroup] | None,
+ *,
+ scan_root: str,
+) -> list[dict[str, object]]:
+ normalized_groups = normalize_structural_findings(groups or ())
+ out: list[dict[str, object]] = []
+ for group in normalized_groups:
+ locations = tuple(
+ report_location_from_structural_occurrence(item, scan_root=scan_root)
+ for item in group.items
+ )
+ source_scope = _source_scope_from_locations(
+ [{"source_kind": location.source_kind} for location in locations]
+ )
+ spread_files, spread_functions = group_spread(locations)
+ severity, priority = _structural_group_assessment(
+ finding_kind=group.finding_kind,
+ count=len(group.items),
+ spread_functions=spread_functions,
+ )
+ out.append(
+ {
+ "id": structural_group_id(group.finding_kind, group.finding_key),
+ "family": FAMILY_STRUCTURAL,
+ "category": group.finding_kind,
+ "kind": group.finding_kind,
+ "severity": severity,
+ "confidence": (
+ CONFIDENCE_HIGH
+ if group.finding_kind
+ in {"clone_guard_exit_divergence", "clone_cohort_drift"}
+ else CONFIDENCE_MEDIUM
+ ),
+ "priority": priority,
+ "count": len(group.items),
+ "source_scope": source_scope,
+ "spread": {
+ "files": spread_files,
+ "functions": spread_functions,
+ },
+ "signature": _build_structural_signature(
+ group.finding_kind,
+ group.signature,
+ ),
+ "items": sorted(
+ [
+ {
+ "relative_path": _contract_report_location_path(
+ item.file_path,
+ scan_root=scan_root,
+ ),
+ "qualname": item.qualname,
+ "start_line": item.start,
+ "end_line": item.end,
+ }
+ for item in group.items
+ ],
+ key=_item_sort_key,
+ ),
+ "facts": _build_structural_facts(
+ group.finding_kind,
+ group.signature,
+ count=len(group.items),
+ ),
+ }
+ )
+ out.sort(key=lambda group: (-_as_int(group.get("count")), str(group["id"])))
+ return out
+
+
+def _single_location_source_scope(
+ filepath: str,
+ *,
+ scan_root: str,
+) -> dict[str, object]:
+ location = report_location_from_group_item(
+ {
+ "filepath": filepath,
+ "qualname": "",
+ "start_line": 0,
+ "end_line": 0,
+ },
+ scan_root=scan_root,
+ )
+ return _source_scope_from_locations([{"source_kind": location.source_kind}])
+
+
+def _build_dead_code_groups(
+ metrics_payload: Mapping[str, object],
+ *,
+ scan_root: str,
+) -> list[dict[str, object]]:
+ families = _as_mapping(metrics_payload.get("families"))
+ dead_code = _as_mapping(families.get(FAMILY_DEAD_CODE))
+ groups: list[dict[str, object]] = []
+ for item in _as_sequence(dead_code.get("items")):
+ item_map = _as_mapping(item)
+ qualname = str(item_map.get("qualname", ""))
+ filepath = str(item_map.get("relative_path", ""))
+ confidence = str(item_map.get("confidence", CONFIDENCE_MEDIUM))
+ severity = SEVERITY_WARNING if confidence == CONFIDENCE_HIGH else SEVERITY_INFO
+ groups.append(
+ {
+ "id": dead_code_group_id(qualname),
+ "family": FAMILY_DEAD_CODE,
+ "category": str(item_map.get("kind", "unknown")),
+ "kind": "unused_symbol",
+ "severity": severity,
+ "confidence": confidence,
+ "priority": _priority(severity, EFFORT_EASY),
+ "count": 1,
+ "source_scope": _single_location_source_scope(
+ filepath,
+ scan_root=scan_root,
+ ),
+ "spread": {"files": 1, "functions": 1 if qualname else 0},
+ "items": [
+ {
+ "relative_path": _contract_report_location_path(
+ filepath,
+ scan_root=scan_root,
+ ),
+ "qualname": qualname,
+ "start_line": _as_int(item_map.get("start_line")),
+ "end_line": _as_int(item_map.get("end_line")),
+ }
+ ],
+ "facts": {
+ "kind": str(item_map.get("kind", "unknown")),
+ "confidence": confidence,
+ },
+ }
+ )
+ groups.sort(key=lambda group: (-_as_float(group["priority"]), str(group["id"])))
+ return groups
diff --git a/codeclone/report/document/builder.py b/codeclone/report/document/builder.py
new file mode 100644
index 0000000..9d22dfa
--- /dev/null
+++ b/codeclone/report/document/builder.py
@@ -0,0 +1,114 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
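+"""Assemble the top-level report document from its section payloads."""
+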
+from __future__ import annotations
+
+from collections.abc import Collection, Mapping, Sequence
+from typing import TYPE_CHECKING
+
+from ...contracts import (
+ REPORT_SCHEMA_VERSION,
+)
+from ...utils.coerce import as_mapping as _as_mapping
+
+if TYPE_CHECKING:
+ from ...models import (
+ GroupMapLike,
+ StructuralFindingGroup,
+ Suggestion,
+ SuppressedCloneGroup,
+ )
+
+from ._common import _collect_report_file_list
+from .derived import _build_derived_overview, _build_derived_suggestions
+from .findings import _build_findings_payload
+from .integrity import _build_integrity_payload
+from .inventory import (
+ _baseline_is_trusted,
+ _build_inventory_payload,
+ _build_meta_payload,
+)
+from .metrics import _build_metrics_payload
+
+
+def build_report_document(
+ *,
+ func_groups: GroupMapLike,
+ block_groups: GroupMapLike,
+ segment_groups: GroupMapLike,
+ meta: Mapping[str, object] | None = None,
+ inventory: Mapping[str, object] | None = None,
+ block_facts: Mapping[str, Mapping[str, str]] | None = None,
+ new_function_group_keys: Collection[str] | None = None,
+ new_block_group_keys: Collection[str] | None = None,
+ new_segment_group_keys: Collection[str] | None = None,
+ suppressed_clone_groups: Sequence[SuppressedCloneGroup] | None = None,
+ metrics: Mapping[str, object] | None = None,
+ suggestions: Sequence[Suggestion] | None = None,
+ structural_findings: Sequence[StructuralFindingGroup] | None = None,
+) -> dict[str, object]:
+ report_schema_version = REPORT_SCHEMA_VERSION
+ scan_root = str(_as_mapping(meta).get("scan_root", ""))
+ meta_payload = _build_meta_payload(meta, scan_root=scan_root)
+ design_thresholds = _as_mapping(
+ _as_mapping(meta_payload.get("analysis_thresholds")).get("design_findings")
+ )
+ metrics_payload = _build_metrics_payload(metrics, scan_root=scan_root)
+ file_list = _collect_report_file_list(
+ inventory=inventory,
+ func_groups=func_groups,
+ block_groups=block_groups,
+ segment_groups=segment_groups,
+ suppressed_clone_groups=suppressed_clone_groups,
+ metrics=metrics,
+ structural_findings=structural_findings,
+ )
+ inventory_payload = _build_inventory_payload(
+ inventory=inventory,
+ file_list=file_list,
+ metrics_payload=metrics_payload,
+ scan_root=scan_root,
+ )
+ findings_payload = _build_findings_payload(
+ func_groups=func_groups,
+ block_groups=block_groups,
+ segment_groups=segment_groups,
+ block_facts=block_facts or {},
+ structural_findings=structural_findings,
+ metrics_payload=metrics_payload,
+ baseline_trusted=_baseline_is_trusted(meta_payload),
+ new_function_group_keys=new_function_group_keys,
+ new_block_group_keys=new_block_group_keys,
+ new_segment_group_keys=new_segment_group_keys,
+ suppressed_clone_groups=suppressed_clone_groups,
+ design_thresholds=design_thresholds,
+ scan_root=scan_root,
+ )
+ overview_payload, hotlists_payload = _build_derived_overview(
+ findings=findings_payload,
+ metrics_payload=metrics_payload,
+ )
+ derived_payload = {
+ "suggestions": _build_derived_suggestions(suggestions),
+ "overview": overview_payload,
+ "hotlists": hotlists_payload,
+ }
+ integrity_payload = _build_integrity_payload(
+ report_schema_version=report_schema_version,
+ meta=meta_payload,
+ inventory=inventory_payload,
+ findings=findings_payload,
+ metrics=metrics_payload,
+ )
+ return {
+ "report_schema_version": report_schema_version,
+ "meta": meta_payload,
+ "inventory": inventory_payload,
+ "findings": findings_payload,
+ "metrics": metrics_payload,
+ "derived": derived_payload,
+ "integrity": integrity_payload,
+ }
diff --git a/codeclone/report/document/derived.py b/codeclone/report/document/derived.py
new file mode 100644
index 0000000..add2042
--- /dev/null
+++ b/codeclone/report/document/derived.py
@@ -0,0 +1,425 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
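+"""Derived report sections: overview, hotlists, and suggestion rows."""
+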
+from __future__ import annotations
+
+from collections import Counter
+from collections.abc import Mapping, Sequence
+from typing import TYPE_CHECKING
+
+from ...domain.findings import (
+ CATEGORY_COHESION,
+ CATEGORY_COMPLEXITY,
+ CATEGORY_COUPLING,
+ CATEGORY_DEAD_CODE,
+ CATEGORY_DEPENDENCY,
+ CLONE_KIND_BLOCK,
+ CLONE_KIND_FUNCTION,
+ CLONE_KIND_SEGMENT,
+ FAMILY_CLONE,
+ FAMILY_CLONES,
+ FAMILY_DEAD_CODE,
+ FAMILY_DESIGN,
+ FAMILY_STRUCTURAL,
+)
+from ...domain.quality import (
+ SEVERITY_INFO,
+ SEVERITY_ORDER,
+)
+from ...domain.source_scope import (
+ IMPACT_SCOPE_MIXED,
+ IMPACT_SCOPE_NON_RUNTIME,
+ IMPACT_SCOPE_RUNTIME,
+ SOURCE_KIND_FIXTURES,
+ SOURCE_KIND_MIXED,
+ SOURCE_KIND_OTHER,
+ SOURCE_KIND_PRODUCTION,
+ SOURCE_KIND_TESTS,
+)
+from ...findings.ids import (
+ clone_group_id,
+ dead_code_group_id,
+ design_group_id,
+ structural_group_id,
+)
+from ...utils.coerce import as_float as _as_float
+from ...utils.coerce import as_int as _as_int
+from ...utils.coerce import as_mapping as _as_mapping
+from ...utils.coerce import as_sequence as _as_sequence
+from ..overview import build_directory_hotspots
+from ._common import _contract_report_location_path, _is_absolute_path
+
+if TYPE_CHECKING:
+ from ...models import (
+ Suggestion,
+ )
+
+
+def _sort_flat_finding_ids(
+ groups: Sequence[Mapping[str, object]],
+) -> list[str]:
+ ordered = sorted(
+ groups,
+ key=lambda group: (
+ -_as_float(group.get("priority")),
+ SEVERITY_ORDER.get(str(group.get("severity", SEVERITY_INFO)), 9),
+ -_as_int(_as_mapping(group.get("spread")).get("files")),
+ -_as_int(_as_mapping(group.get("spread")).get("functions")),
+ -_as_int(group.get("count")),
+ str(group.get("id", "")),
+ ),
+ )
+ return [str(group["id"]) for group in ordered]
+
+
+def _sort_highest_spread_ids(
+ groups: Sequence[Mapping[str, object]],
+) -> list[str]:
+ ordered = sorted(
+ groups,
+ key=lambda group: (
+ -_as_int(_as_mapping(group.get("spread")).get("files")),
+ -_as_int(_as_mapping(group.get("spread")).get("functions")),
+ -_as_int(group.get("count")),
+ -_as_float(group.get("priority")),
+ str(group.get("id", "")),
+ ),
+ )
+ return [str(group["id"]) for group in ordered]
+
+
+def _health_snapshot(metrics_payload: Mapping[str, object]) -> dict[str, object]:
+ health = _as_mapping(_as_mapping(metrics_payload.get("families")).get("health"))
+ summary = _as_mapping(health.get("summary"))
+ dimensions = {
+ str(key): _as_int(value)
+ for key, value in _as_mapping(summary.get("dimensions")).items()
+ }
+ strongest = None
+ weakest = None
+ if dimensions:
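+        # Ties break alphabetically, keeping the snapshot deterministic.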
+        strongest = min(dimensions, key=lambda key: (-dimensions[key], key))
+        weakest = min(dimensions, key=lambda key: (dimensions[key], key))
+ return {
+ "score": _as_int(summary.get("score")),
+ "grade": str(summary.get("grade", "")),
+ "strongest_dimension": strongest,
+ "weakest_dimension": weakest,
+ }
+
+
+def _combined_impact_scope(groups: Sequence[Mapping[str, object]]) -> str:
+ impact_scopes = {
+ str(
+ _as_mapping(group.get("source_scope")).get(
+ "impact_scope",
+ IMPACT_SCOPE_NON_RUNTIME,
+ )
+ )
+ for group in groups
+ }
+ if not impact_scopes:
+ return IMPACT_SCOPE_NON_RUNTIME
+ if len(impact_scopes) == 1:
+ return next(iter(impact_scopes))
+ return IMPACT_SCOPE_MIXED
+
+
+def _top_risks(
+ *,
+ dead_code_groups: Sequence[Mapping[str, object]],
+ design_groups: Sequence[Mapping[str, object]],
+ structural_groups: Sequence[Mapping[str, object]],
+ clone_groups: Sequence[Mapping[str, object]],
+) -> list[dict[str, object]]:
+ risks: list[dict[str, object]] = []
+
+ if dead_code_groups:
+ label = (
+ "1 dead code item"
+ if len(dead_code_groups) == 1
+ else f"{len(dead_code_groups)} dead code items"
+ )
+ risks.append(
+ {
+ "kind": "family_summary",
+ "family": FAMILY_DEAD_CODE,
+ "count": len(dead_code_groups),
+ "scope": IMPACT_SCOPE_MIXED
+ if len(
+ {
+ _as_mapping(group.get("source_scope")).get("impact_scope")
+ for group in dead_code_groups
+ }
+ )
+ > 1
+ else str(
+ _as_mapping(dead_code_groups[0].get("source_scope")).get(
+ "impact_scope",
+ IMPACT_SCOPE_NON_RUNTIME,
+ )
+ ),
+ "label": label,
+ }
+ )
+
+ low_cohesion = [
+ group
+ for group in design_groups
+ if str(group.get("category", "")) == CATEGORY_COHESION
+ ]
+ if low_cohesion:
+ label = (
+ "1 low cohesion class"
+ if len(low_cohesion) == 1
+ else f"{len(low_cohesion)} low cohesion classes"
+ )
+ risks.append(
+ {
+ "kind": "family_summary",
+ "family": FAMILY_DESIGN,
+ "category": CATEGORY_COHESION,
+ "count": len(low_cohesion),
+ "scope": _combined_impact_scope(low_cohesion),
+ "label": label,
+ }
+ )
+
+ production_structural = [
+ group
+ for group in structural_groups
+ if str(_as_mapping(group.get("source_scope")).get("impact_scope"))
+ in {IMPACT_SCOPE_RUNTIME, IMPACT_SCOPE_MIXED}
+ ]
+ if production_structural:
+ label = (
+ "1 structural finding in production code"
+ if len(production_structural) == 1
+ else (
+ f"{len(production_structural)} structural findings in production code"
+ )
+ )
+ risks.append(
+ {
+ "kind": "family_summary",
+ "family": FAMILY_STRUCTURAL,
+ "count": len(production_structural),
+ "scope": SOURCE_KIND_PRODUCTION,
+ "label": label,
+ }
+ )
+
+ fixture_test_clones = [
+ group
+ for group in clone_groups
+ if _as_mapping(group.get("source_scope")).get("impact_scope")
+ == IMPACT_SCOPE_NON_RUNTIME
+ and _as_mapping(group.get("source_scope")).get("dominant_kind")
+ in {SOURCE_KIND_TESTS, SOURCE_KIND_FIXTURES}
+ ]
+ if fixture_test_clones:
+ label = (
+ "1 clone group in fixtures/tests"
+ if len(fixture_test_clones) == 1
+ else f"{len(fixture_test_clones)} clone groups in fixtures/tests"
+ )
+ risks.append(
+ {
+ "kind": "family_summary",
+ "family": FAMILY_CLONE,
+ "count": len(fixture_test_clones),
+ "scope": IMPACT_SCOPE_NON_RUNTIME,
+ "label": label,
+ }
+ )
+
+ return risks[:6]
+
+
+def _build_derived_overview(
+ *,
+ findings: Mapping[str, object],
+ metrics_payload: Mapping[str, object],
+) -> tuple[dict[str, object], dict[str, object]]:
+ groups = _as_mapping(findings.get("groups"))
+ clones = _as_mapping(groups.get(FAMILY_CLONES))
+ clone_groups = [
+ *_as_sequence(clones.get("functions")),
+ *_as_sequence(clones.get("blocks")),
+ *_as_sequence(clones.get("segments")),
+ ]
+ structural_groups = _as_sequence(
+ _as_mapping(groups.get(FAMILY_STRUCTURAL)).get("groups")
+ )
+ dead_code_groups = _as_sequence(
+ _as_mapping(groups.get(FAMILY_DEAD_CODE)).get("groups")
+ )
+ design_groups = _as_sequence(_as_mapping(groups.get("design")).get("groups"))
+ flat_groups = [
+ *clone_groups,
+ *structural_groups,
+ *dead_code_groups,
+ *design_groups,
+ ]
+ dominant_kind_counts: Counter[str] = Counter(
+ str(
+ _as_mapping(_as_mapping(group).get("source_scope")).get(
+ "dominant_kind",
+ SOURCE_KIND_OTHER,
+ )
+ )
+ for group in flat_groups
+ )
+ summary = _as_mapping(findings.get("summary"))
+ overview: dict[str, object] = {
+ "families": dict(_as_mapping(summary.get("families"))),
+ "top_risks": _top_risks(
+ dead_code_groups=[_as_mapping(group) for group in dead_code_groups],
+ design_groups=[_as_mapping(group) for group in design_groups],
+ structural_groups=[_as_mapping(group) for group in structural_groups],
+ clone_groups=[_as_mapping(group) for group in clone_groups],
+ ),
+ "source_scope_breakdown": {
+ key: dominant_kind_counts[key]
+ for key in (
+ SOURCE_KIND_PRODUCTION,
+ SOURCE_KIND_TESTS,
+ SOURCE_KIND_FIXTURES,
+ SOURCE_KIND_MIXED,
+ SOURCE_KIND_OTHER,
+ )
+ if dominant_kind_counts[key] > 0
+ },
+ "health_snapshot": _health_snapshot(metrics_payload),
+ "directory_hotspots": build_directory_hotspots(findings=findings),
+ }
+ hotlists: dict[str, object] = {
+ "most_actionable_ids": _sort_flat_finding_ids(
+ [
+ group
+ for group in map(_as_mapping, flat_groups)
+ if str(group.get("severity")) != SEVERITY_INFO
+ ]
+ )[:5],
+ "highest_spread_ids": _sort_highest_spread_ids(
+ list(map(_as_mapping, flat_groups))
+ )[:5],
+ "production_hotspot_ids": _sort_flat_finding_ids(
+ [
+ group
+ for group in map(_as_mapping, flat_groups)
+ if str(_as_mapping(group.get("source_scope")).get("impact_scope"))
+ in {IMPACT_SCOPE_RUNTIME, IMPACT_SCOPE_MIXED}
+ ]
+ )[:5],
+ "test_fixture_hotspot_ids": _sort_flat_finding_ids(
+ [
+ group
+ for group in map(_as_mapping, flat_groups)
+ if str(_as_mapping(group.get("source_scope")).get("impact_scope"))
+ == IMPACT_SCOPE_NON_RUNTIME
+ and str(_as_mapping(group.get("source_scope")).get("dominant_kind"))
+ in {SOURCE_KIND_TESTS, SOURCE_KIND_FIXTURES}
+ ]
+ )[:5],
+ }
+ return overview, hotlists
+
+
+def _representative_location_rows(
+ suggestion: Suggestion,
+) -> list[dict[str, object]]:
+ rows = [
+ {
+ "relative_path": (
+ location.relative_path
+ if (
+ location.relative_path
+ and not _is_absolute_path(location.relative_path)
+ )
+ else _contract_report_location_path(
+ location.filepath,
+ scan_root="",
+ )
+ ),
+ "start_line": location.start_line,
+ "end_line": location.end_line,
+ "qualname": location.qualname,
+ "source_kind": location.source_kind,
+ }
+ for location in suggestion.representative_locations
+ ]
+ rows.sort(
+ key=lambda row: (
+ str(row["relative_path"]),
+ _as_int(row["start_line"]),
+ _as_int(row["end_line"]),
+ str(row["qualname"]),
+ )
+ )
+ return rows[:3]
+
+
+def _suggestion_finding_id(suggestion: Suggestion) -> str:
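+    # Recover the id of the finding group this suggestion was derived from.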
+ if suggestion.finding_family == FAMILY_CLONES:
+ if suggestion.fact_kind.startswith("Function"):
+ return clone_group_id(CLONE_KIND_FUNCTION, suggestion.subject_key)
+ if suggestion.fact_kind.startswith("Block"):
+ return clone_group_id(CLONE_KIND_BLOCK, suggestion.subject_key)
+ return clone_group_id(CLONE_KIND_SEGMENT, suggestion.subject_key)
+ if suggestion.finding_family == FAMILY_STRUCTURAL:
+ return structural_group_id(
+ suggestion.finding_kind or "duplicated_branches",
+ suggestion.subject_key,
+ )
+ if suggestion.category == CATEGORY_DEAD_CODE:
+ return dead_code_group_id(suggestion.subject_key)
+ if suggestion.category in {
+ CATEGORY_COMPLEXITY,
+ CATEGORY_COUPLING,
+ CATEGORY_COHESION,
+ CATEGORY_DEPENDENCY,
+ }:
+ return design_group_id(suggestion.category, suggestion.subject_key)
+ return design_group_id(
+ suggestion.category,
+ suggestion.subject_key or suggestion.title,
+ )
+
+
+def _build_derived_suggestions(
+ suggestions: Sequence[Suggestion] | None,
+) -> list[dict[str, object]]:
+ suggestion_rows = list(suggestions or ())
+ suggestion_rows.sort(
+ key=lambda suggestion: (
+ -suggestion.priority,
+ SEVERITY_ORDER.get(suggestion.severity, 9),
+ suggestion.title,
+ _suggestion_finding_id(suggestion),
+ )
+ )
+ return [
+ {
+ "id": f"suggestion:{_suggestion_finding_id(suggestion)}",
+ "finding_id": _suggestion_finding_id(suggestion),
+ "title": suggestion.title,
+ "summary": suggestion.fact_summary,
+ "location_label": suggestion.location_label or suggestion.location,
+ "representative_locations": _representative_location_rows(suggestion),
+ "action": {
+ "effort": suggestion.effort,
+ "steps": list(suggestion.steps),
+ },
+ }
+ for suggestion in suggestion_rows
+ ]
diff --git a/codeclone/report/document/findings.py b/codeclone/report/document/findings.py
new file mode 100644
index 0000000..43aaedd
--- /dev/null
+++ b/codeclone/report/document/findings.py
@@ -0,0 +1,245 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
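+"""Findings payload: per-family finding groups plus a cross-family summary."""
+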
+from __future__ import annotations
+
+from collections.abc import Collection, Mapping, Sequence
+from typing import TYPE_CHECKING
+
+from ...domain.findings import (
+ CLONE_KIND_BLOCK,
+ CLONE_KIND_FUNCTION,
+ CLONE_KIND_SEGMENT,
+ CLONE_NOVELTY_KNOWN,
+ CLONE_NOVELTY_NEW,
+ FAMILY_CLONES,
+ FAMILY_DEAD_CODE,
+ FAMILY_STRUCTURAL,
+)
+from ...domain.quality import (
+ SEVERITY_CRITICAL,
+ SEVERITY_INFO,
+ SEVERITY_WARNING,
+)
+from ...domain.source_scope import (
+ IMPACT_SCOPE_MIXED,
+ IMPACT_SCOPE_NON_RUNTIME,
+ IMPACT_SCOPE_RUNTIME,
+)
+from ...utils.coerce import as_int as _as_int
+from ...utils.coerce import as_mapping as _as_mapping
+from ...utils.coerce import as_sequence as _as_sequence
+
+if TYPE_CHECKING:
+ from ...models import (
+ GroupMapLike,
+ StructuralFindingGroup,
+ SuppressedCloneGroup,
+ )
+
+from ._design_groups import _build_design_groups
+from ._findings_groups import (
+ _build_clone_groups,
+ _build_dead_code_groups,
+ _build_structural_groups,
+ _build_suppressed_clone_groups,
+)
+
+
+def _findings_summary(
+ *,
+ clone_functions: Sequence[Mapping[str, object]],
+ clone_blocks: Sequence[Mapping[str, object]],
+ clone_segments: Sequence[Mapping[str, object]],
+ structural_groups: Sequence[Mapping[str, object]],
+ dead_code_groups: Sequence[Mapping[str, object]],
+ design_groups: Sequence[Mapping[str, object]],
+ suppressed_clone_groups: Mapping[str, Sequence[Mapping[str, object]]] | None = None,
+ dead_code_suppressed: int = 0,
+) -> dict[str, object]:
+ flat_groups = [
+ *clone_functions,
+ *clone_blocks,
+ *clone_segments,
+ *structural_groups,
+ *dead_code_groups,
+ *design_groups,
+ ]
+ severity_counts = dict.fromkeys(
+ (SEVERITY_CRITICAL, SEVERITY_WARNING, SEVERITY_INFO),
+ 0,
+ )
+ source_scope_counts = dict.fromkeys(
+ (IMPACT_SCOPE_RUNTIME, IMPACT_SCOPE_NON_RUNTIME, IMPACT_SCOPE_MIXED),
+ 0,
+ )
+ for group in flat_groups:
+ severity = str(group.get("severity", SEVERITY_INFO))
+ if severity in severity_counts:
+ severity_counts[severity] += 1
+ impact_scope = str(
+ _as_mapping(group.get("source_scope")).get(
+ "impact_scope",
+ IMPACT_SCOPE_NON_RUNTIME,
+ )
+ )
+ if impact_scope in source_scope_counts:
+ source_scope_counts[impact_scope] += 1
+ clone_groups = [*clone_functions, *clone_blocks, *clone_segments]
+ clone_suppressed_map = _as_mapping(suppressed_clone_groups)
+ suppressed_functions = len(_as_sequence(clone_suppressed_map.get("function")))
+ suppressed_blocks = len(_as_sequence(clone_suppressed_map.get("block")))
+ suppressed_segments = len(_as_sequence(clone_suppressed_map.get("segment")))
+ suppressed_clone_total = (
+ suppressed_functions + suppressed_blocks + suppressed_segments
+ )
+ clones_summary: dict[str, object] = {
+ "functions": len(clone_functions),
+ "blocks": len(clone_blocks),
+ "segments": len(clone_segments),
+ CLONE_NOVELTY_NEW: sum(
+ 1
+ for group in clone_groups
+ if str(group.get("novelty", "")) == CLONE_NOVELTY_NEW
+ ),
+ CLONE_NOVELTY_KNOWN: sum(
+ 1
+ for group in clone_groups
+ if str(group.get("novelty", "")) == CLONE_NOVELTY_KNOWN
+ ),
+ }
+ if suppressed_clone_total > 0:
+ clones_summary.update(
+ {
+ "suppressed": suppressed_clone_total,
+ "suppressed_functions": suppressed_functions,
+ "suppressed_blocks": suppressed_blocks,
+ "suppressed_segments": suppressed_segments,
+ }
+ )
+ suppressed_summary = {
+ FAMILY_DEAD_CODE: max(0, dead_code_suppressed),
+ }
+ if suppressed_clone_total > 0:
+ suppressed_summary[FAMILY_CLONES] = suppressed_clone_total
+ return {
+ "total": len(flat_groups),
+ "families": {
+ FAMILY_CLONES: len(clone_groups),
+ FAMILY_STRUCTURAL: len(structural_groups),
+ FAMILY_DEAD_CODE: len(dead_code_groups),
+ "design": len(design_groups),
+ },
+ "severity": severity_counts,
+ "impact_scope": source_scope_counts,
+ "clones": clones_summary,
+ "suppressed": suppressed_summary,
+ }
+
+
+def _build_findings_payload(
+ *,
+ func_groups: GroupMapLike,
+ block_groups: GroupMapLike,
+ segment_groups: GroupMapLike,
+ block_facts: Mapping[str, Mapping[str, str]],
+ structural_findings: Sequence[StructuralFindingGroup] | None,
+ metrics_payload: Mapping[str, object],
+ baseline_trusted: bool,
+ new_function_group_keys: Collection[str] | None,
+ new_block_group_keys: Collection[str] | None,
+ new_segment_group_keys: Collection[str] | None,
+ suppressed_clone_groups: Sequence[SuppressedCloneGroup] | None,
+ design_thresholds: Mapping[str, object] | None,
+ scan_root: str,
+) -> dict[str, object]:
+ clone_functions = _build_clone_groups(
+ groups=func_groups,
+ kind=CLONE_KIND_FUNCTION,
+ baseline_trusted=baseline_trusted,
+ new_keys=new_function_group_keys,
+ block_facts=block_facts,
+ scan_root=scan_root,
+ )
+ clone_blocks = _build_clone_groups(
+ groups=block_groups,
+ kind=CLONE_KIND_BLOCK,
+ baseline_trusted=baseline_trusted,
+ new_keys=new_block_group_keys,
+ block_facts=block_facts,
+ scan_root=scan_root,
+ )
+ clone_segments = _build_clone_groups(
+ groups=segment_groups,
+ kind=CLONE_KIND_SEGMENT,
+ baseline_trusted=baseline_trusted,
+ new_keys=new_segment_group_keys,
+ block_facts={},
+ scan_root=scan_root,
+ )
+ structural_groups = _build_structural_groups(
+ structural_findings,
+ scan_root=scan_root,
+ )
+ dead_code_groups = _build_dead_code_groups(
+ metrics_payload,
+ scan_root=scan_root,
+ )
+ dead_code_family = _as_mapping(
+ _as_mapping(metrics_payload.get("families")).get(FAMILY_DEAD_CODE)
+ )
+ dead_code_summary = _as_mapping(dead_code_family.get("summary"))
+ dead_code_suppressed = _as_int(
+ dead_code_summary.get(
+ "suppressed",
+ len(_as_sequence(dead_code_family.get("suppressed_items"))),
+ )
+ )
+ design_groups = _build_design_groups(
+ metrics_payload,
+ design_thresholds=design_thresholds,
+ scan_root=scan_root,
+ )
+ suppressed_clone_payload = _build_suppressed_clone_groups(
+ groups=suppressed_clone_groups,
+ block_facts=block_facts,
+ scan_root=scan_root,
+ )
+ clone_groups_payload: dict[str, object] = {
+ "functions": clone_functions,
+ "blocks": clone_blocks,
+ "segments": clone_segments,
+ }
+ if any(suppressed_clone_payload.values()):
+ clone_groups_payload["suppressed"] = {
+ "functions": suppressed_clone_payload[CLONE_KIND_FUNCTION],
+ "blocks": suppressed_clone_payload[CLONE_KIND_BLOCK],
+ "segments": suppressed_clone_payload[CLONE_KIND_SEGMENT],
+ }
+ return {
+ "summary": _findings_summary(
+ clone_functions=clone_functions,
+ clone_blocks=clone_blocks,
+ clone_segments=clone_segments,
+ structural_groups=structural_groups,
+ dead_code_groups=dead_code_groups,
+ design_groups=design_groups,
+ suppressed_clone_groups=suppressed_clone_payload,
+ dead_code_suppressed=dead_code_suppressed,
+ ),
+ "groups": {
+ FAMILY_CLONES: clone_groups_payload,
+ FAMILY_STRUCTURAL: {
+ "groups": structural_groups,
+ },
+ FAMILY_DEAD_CODE: {
+ "groups": dead_code_groups,
+ },
+ "design": {
+ "groups": design_groups,
+ },
+ },
+ }
diff --git a/codeclone/report/document/integrity.py b/codeclone/report/document/integrity.py
new file mode 100644
index 0000000..5360ef8
--- /dev/null
+++ b/codeclone/report/document/integrity.py
@@ -0,0 +1,87 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
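+"""Integrity payload: a sha256 digest over the canonical report sections."""
+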
+from __future__ import annotations
+
+from collections.abc import Mapping, Sequence
+from hashlib import sha256
+
+import orjson
+
+
+def _canonical_integrity_payload(
+ *,
+ report_schema_version: str,
+ meta: Mapping[str, object],
+ inventory: Mapping[str, object],
+ findings: Mapping[str, object],
+ metrics: Mapping[str, object],
+) -> dict[str, object]:
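+    # The volatile "runtime" section (timestamps, absolute paths) and
+    # non-canonical "display_facts" keys are stripped before hashing.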
+ canonical_meta = {
+ str(key): value for key, value in meta.items() if str(key) != "runtime"
+ }
+
+ def _strip_noncanonical(value: object) -> object:
+ if isinstance(value, Mapping):
+ return {
+ str(key): _strip_noncanonical(item)
+ for key, item in value.items()
+ if str(key) != "display_facts"
+ }
+ if isinstance(value, Sequence) and not isinstance(
+ value,
+ (str, bytes, bytearray),
+ ):
+ return [_strip_noncanonical(item) for item in value]
+ return value
+
+ return {
+ "report_schema_version": report_schema_version,
+ "meta": canonical_meta,
+ "inventory": inventory,
+ "findings": _strip_noncanonical(findings),
+ "metrics": metrics,
+ }
+
+
+def _build_integrity_payload(
+ *,
+ report_schema_version: str,
+ meta: Mapping[str, object],
+ inventory: Mapping[str, object],
+ findings: Mapping[str, object],
+ metrics: Mapping[str, object],
+) -> dict[str, object]:
+ canonical_payload = _canonical_integrity_payload(
+ report_schema_version=report_schema_version,
+ meta=meta,
+ inventory=inventory,
+ findings=findings,
+ metrics=metrics,
+ )
+ canonical_json = orjson.dumps(
+ canonical_payload,
+ option=orjson.OPT_SORT_KEYS,
+ )
+ payload_sha = sha256(canonical_json).hexdigest()
+ return {
+ "canonicalization": {
+ "version": "1",
+ "scope": "canonical_only",
+ "sections": [
+ "report_schema_version",
+ "meta",
+ "inventory",
+ "findings",
+ "metrics",
+ ],
+ },
+ "digest": {
+ "verified": True,
+ "algorithm": "sha256",
+ "value": payload_sha,
+ },
+ }
diff --git a/codeclone/report/document/inventory.py b/codeclone/report/document/inventory.py
new file mode 100644
index 0000000..17ed577
--- /dev/null
+++ b/codeclone/report/document/inventory.py
@@ -0,0 +1,218 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
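+"""Inventory and meta payloads: file and code counts plus scan metadata."""
+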
+from __future__ import annotations
+
+from collections.abc import Mapping, Sequence
+
+from ...domain.findings import (
+ CATEGORY_COHESION,
+ CATEGORY_COMPLEXITY,
+)
+from ...utils.coerce import as_int as _as_int
+from ...utils.coerce import as_mapping as _as_mapping
+from ...utils.coerce import as_sequence as _as_sequence
+from ._common import (
+ _analysis_profile_payload,
+ _contract_path,
+ _count_file_lines,
+ _design_findings_thresholds_payload,
+ _optional_str,
+)
+
+
+def _derive_inventory_code_counts(
+ *,
+ metrics_payload: Mapping[str, object],
+ inventory_code: Mapping[str, object],
+ file_list: Sequence[str],
+ cached_files: int,
+) -> dict[str, object]:
+ complexity = _as_mapping(
+ _as_mapping(metrics_payload.get("families")).get(CATEGORY_COMPLEXITY)
+ )
+ cohesion = _as_mapping(
+ _as_mapping(metrics_payload.get("families")).get(CATEGORY_COHESION)
+ )
+ complexity_items = _as_sequence(complexity.get("items"))
+ cohesion_items = _as_sequence(cohesion.get("items"))
+
+ exact_entities = bool(complexity_items or cohesion_items)
+ method_count = sum(
+ _as_int(_as_mapping(item).get("method_count")) for item in cohesion_items
+ )
+ class_count = len(cohesion_items)
+ function_total = max(len(complexity_items) - method_count, 0)
+
+ if not exact_entities:
+ function_total = _as_int(inventory_code.get("functions"))
+ method_count = _as_int(inventory_code.get("methods"))
+ class_count = _as_int(inventory_code.get("classes"))
+
+ parsed_lines_raw = inventory_code.get("parsed_lines")
+ if isinstance(parsed_lines_raw, int) and parsed_lines_raw >= 0:
+ parsed_lines = parsed_lines_raw
+ elif cached_files > 0 and file_list:
+ parsed_lines = _count_file_lines(file_list)
+ else:
+ parsed_lines = _as_int(parsed_lines_raw)
+
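+    # Scope records how the counts were obtained: exact per-entity metrics
+    # cover the analysis root, cache-backed file lists mix runs, and
+    # anything else only describes the current run.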
+ if exact_entities and ((cached_files > 0 and file_list) or parsed_lines > 0):
+ scope = "analysis_root"
+ elif cached_files > 0 and file_list:
+ scope = "mixed"
+ else:
+ scope = "current_run"
+
+ return {
+ "scope": scope,
+ "parsed_lines": parsed_lines,
+ "functions": function_total,
+ "methods": method_count,
+ "classes": class_count,
+ }
+
+
+def _build_inventory_payload(
+ *,
+ inventory: Mapping[str, object] | None,
+ file_list: Sequence[str],
+ metrics_payload: Mapping[str, object],
+ scan_root: str,
+) -> dict[str, object]:
+ inventory_map = _as_mapping(inventory)
+ files_map = _as_mapping(inventory_map.get("files"))
+ code_map = _as_mapping(inventory_map.get("code"))
+ cached_files = _as_int(files_map.get("cached"))
+ file_registry = [
+ path
+ for path in (
+ _contract_path(filepath, scan_root=scan_root)[0] for filepath in file_list
+ )
+ if path is not None
+ ]
+ return {
+ "files": {
+ "total_found": _as_int(files_map.get("total_found"), len(file_list)),
+ "analyzed": _as_int(files_map.get("analyzed")),
+ "cached": cached_files,
+ "skipped": _as_int(files_map.get("skipped")),
+ "source_io_skipped": _as_int(files_map.get("source_io_skipped")),
+ },
+ "code": _derive_inventory_code_counts(
+ metrics_payload=metrics_payload,
+ inventory_code=code_map,
+ file_list=file_list,
+ cached_files=cached_files,
+ ),
+ "file_registry": {
+ "encoding": "relative_path",
+ "items": file_registry,
+ },
+ }
+
+
+def _baseline_is_trusted(meta: Mapping[str, object]) -> bool:
+ baseline = _as_mapping(meta.get("baseline"))
+ return (
+ baseline.get("loaded") is True
+ and str(baseline.get("status", "")).strip().lower() == "ok"
+ )
+
+
+def _build_meta_payload(
+ raw_meta: Mapping[str, object] | None,
+ *,
+ scan_root: str,
+) -> dict[str, object]:
+ meta = dict(raw_meta or {})
+ metrics_computed = sorted(
+ {
+ str(item)
+ for item in _as_sequence(meta.get("metrics_computed"))
+ if str(item).strip()
+ }
+ )
+ baseline_path, baseline_path_scope, baseline_abs = _contract_path(
+ meta.get("baseline_path"),
+ scan_root=scan_root,
+ )
+ cache_path, cache_path_scope, cache_abs = _contract_path(
+ meta.get("cache_path"),
+ scan_root=scan_root,
+ )
+ metrics_baseline_path, metrics_baseline_path_scope, metrics_baseline_abs = (
+ _contract_path(
+ meta.get("metrics_baseline_path"),
+ scan_root=scan_root,
+ )
+ )
+ payload: dict[str, object] = {
+ "codeclone_version": str(meta.get("codeclone_version", "")),
+ "project_name": str(meta.get("project_name", "")),
+ "scan_root": ".",
+ "python_version": str(meta.get("python_version", "")),
+ "python_tag": str(meta.get("python_tag", "")),
+ "analysis_mode": str(meta.get("analysis_mode", "full") or "full"),
+ "report_mode": str(meta.get("report_mode", "full") or "full"),
+ "computed_metric_families": metrics_computed,
+ "analysis_thresholds": _design_findings_thresholds_payload(meta),
+ "baseline": {
+ "path": baseline_path,
+ "path_scope": baseline_path_scope,
+ "loaded": bool(meta.get("baseline_loaded")),
+ "status": _optional_str(meta.get("baseline_status")),
+ "fingerprint_version": _optional_str(
+ meta.get("baseline_fingerprint_version")
+ ),
+ "schema_version": _optional_str(meta.get("baseline_schema_version")),
+ "python_tag": _optional_str(meta.get("baseline_python_tag")),
+ "generator_name": _optional_str(meta.get("baseline_generator_name")),
+ "generator_version": _optional_str(meta.get("baseline_generator_version")),
+ "payload_sha256": _optional_str(meta.get("baseline_payload_sha256")),
+ "payload_sha256_verified": bool(
+ meta.get("baseline_payload_sha256_verified")
+ ),
+ },
+ "cache": {
+ "path": cache_path,
+ "path_scope": cache_path_scope,
+ "used": bool(meta.get("cache_used")),
+ "status": _optional_str(meta.get("cache_status")),
+ "schema_version": _optional_str(meta.get("cache_schema_version")),
+ },
+ "metrics_baseline": {
+ "path": metrics_baseline_path,
+ "path_scope": metrics_baseline_path_scope,
+ "loaded": bool(meta.get("metrics_baseline_loaded")),
+ "status": _optional_str(meta.get("metrics_baseline_status")),
+ "schema_version": _optional_str(
+ meta.get("metrics_baseline_schema_version")
+ ),
+ "payload_sha256": _optional_str(
+ meta.get("metrics_baseline_payload_sha256")
+ ),
+ "payload_sha256_verified": bool(
+ meta.get("metrics_baseline_payload_sha256_verified")
+ ),
+ },
+ "runtime": {
+ "analysis_started_at_utc": _optional_str(
+ meta.get("analysis_started_at_utc")
+ ),
+ "report_generated_at_utc": _optional_str(
+ meta.get("report_generated_at_utc")
+ ),
+ "scan_root_absolute": _optional_str(meta.get("scan_root")),
+ "baseline_path_absolute": baseline_abs,
+ "cache_path_absolute": cache_abs,
+ "metrics_baseline_path_absolute": metrics_baseline_abs,
+ },
+ }
+ analysis_profile = _analysis_profile_payload(meta)
+ if analysis_profile is not None:
+ payload["analysis_profile"] = analysis_profile
+ return payload
diff --git a/codeclone/report/document/metrics.py b/codeclone/report/document/metrics.py
new file mode 100644
index 0000000..dbb165c
--- /dev/null
+++ b/codeclone/report/document/metrics.py
@@ -0,0 +1,781 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
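+"""Metrics payload: normalized per-family metric sections."""
+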
+from __future__ import annotations
+
+from collections.abc import Mapping
+
+from ...analysis.suppressions import INLINE_CODECLONE_SUPPRESSION_SOURCE
+from ...domain.findings import (
+ CATEGORY_COHESION,
+ CATEGORY_COMPLEXITY,
+ CATEGORY_COUPLING,
+ FAMILY_DEAD_CODE,
+)
+from ...domain.quality import (
+ CONFIDENCE_HIGH,
+ CONFIDENCE_MEDIUM,
+ RISK_LOW,
+)
+from ...domain.source_scope import (
+ SOURCE_KIND_FIXTURES,
+ SOURCE_KIND_OTHER,
+ SOURCE_KIND_PRODUCTION,
+ SOURCE_KIND_TESTS,
+)
+from ...metrics.registry import METRIC_FAMILIES
+from ...utils.coerce import as_float as _as_float
+from ...utils.coerce import as_int as _as_int
+from ...utils.coerce import as_mapping as _as_mapping
+from ...utils.coerce import as_sequence as _as_sequence
+from ..derived import normalized_source_kind as _normalized_source_kind
+from ._common import (
+ _contract_path,
+ _normalize_nested_string_rows,
+ _optional_str,
+)
+
+_OVERLOADED_MODULES_FAMILY = "overloaded_modules"
+
+_COVERAGE_ADOPTION_FAMILY = "coverage_adoption"
+
+_API_SURFACE_FAMILY = "api_surface"
+
+_COVERAGE_JOIN_FAMILY = "coverage_join"
+
+_SECURITY_SURFACES_FAMILY = "security_surfaces"
+
+
+def _normalize_metrics_families(
+ metrics: Mapping[str, object] | None,
+ *,
+ scan_root: str,
+) -> dict[str, object]:
+ metrics_map = _as_mapping(metrics)
+ complexity = _as_mapping(metrics_map.get(CATEGORY_COMPLEXITY))
+ complexity_items = sorted(
+ (
+ {
+ "qualname": str(item_map.get("qualname", "")),
+ "relative_path": _contract_path(
+ item_map.get("filepath", ""),
+ scan_root=scan_root,
+ )[0]
+ or "",
+ "start_line": _as_int(item_map.get("start_line")),
+ "end_line": _as_int(item_map.get("end_line")),
+ "cyclomatic_complexity": _as_int(
+ item_map.get("cyclomatic_complexity"),
+ 1,
+ ),
+ "nesting_depth": _as_int(item_map.get("nesting_depth")),
+ "risk": str(item_map.get("risk", RISK_LOW)),
+ }
+ for item in _as_sequence(complexity.get("functions"))
+ for item_map in (_as_mapping(item),)
+ ),
+ key=lambda item: (
+ item["relative_path"],
+ item["start_line"],
+ item["end_line"],
+ item["qualname"],
+ ),
+ )
+
+ coupling = _as_mapping(metrics_map.get(CATEGORY_COUPLING))
+ coupling_items = sorted(
+ (
+ {
+ "qualname": str(item_map.get("qualname", "")),
+ "relative_path": _contract_path(
+ item_map.get("filepath", ""),
+ scan_root=scan_root,
+ )[0]
+ or "",
+ "start_line": _as_int(item_map.get("start_line")),
+ "end_line": _as_int(item_map.get("end_line")),
+ "cbo": _as_int(item_map.get("cbo")),
+ "risk": str(item_map.get("risk", RISK_LOW)),
+ "coupled_classes": sorted(
+ {
+ str(name)
+ for name in _as_sequence(item_map.get("coupled_classes"))
+ if str(name).strip()
+ }
+ ),
+ }
+ for item in _as_sequence(coupling.get("classes"))
+ for item_map in (_as_mapping(item),)
+ ),
+ key=lambda item: (
+ item["relative_path"],
+ item["start_line"],
+ item["end_line"],
+ item["qualname"],
+ ),
+ )
+
+ cohesion = _as_mapping(metrics_map.get(CATEGORY_COHESION))
+ cohesion_items = sorted(
+ (
+ {
+ "qualname": str(item_map.get("qualname", "")),
+ "relative_path": _contract_path(
+ item_map.get("filepath", ""),
+ scan_root=scan_root,
+ )[0]
+ or "",
+ "start_line": _as_int(item_map.get("start_line")),
+ "end_line": _as_int(item_map.get("end_line")),
+ "lcom4": _as_int(item_map.get("lcom4")),
+ "risk": str(item_map.get("risk", RISK_LOW)),
+ "method_count": _as_int(item_map.get("method_count")),
+ "instance_var_count": _as_int(item_map.get("instance_var_count")),
+ }
+ for item in _as_sequence(cohesion.get("classes"))
+ for item_map in (_as_mapping(item),)
+ ),
+ key=lambda item: (
+ item["relative_path"],
+ item["start_line"],
+ item["end_line"],
+ item["qualname"],
+ ),
+ )
+
+ dependencies = _as_mapping(metrics_map.get("dependencies"))
+ dependency_edges = sorted(
+ (
+ {
+ "source": str(item_map.get("source", "")),
+ "target": str(item_map.get("target", "")),
+ "import_type": str(item_map.get("import_type", "")),
+ "line": _as_int(item_map.get("line")),
+ }
+ for item in _as_sequence(dependencies.get("edge_list"))
+ for item_map in (_as_mapping(item),)
+ ),
+ key=lambda item: (
+ item["source"],
+ item["target"],
+ item["import_type"],
+ item["line"],
+ ),
+ )
+ dependency_cycles = _normalize_nested_string_rows(dependencies.get("cycles"))
+ longest_chains = _normalize_nested_string_rows(dependencies.get("longest_chains"))
+
+ dead_code = _as_mapping(metrics_map.get(FAMILY_DEAD_CODE))
+
+ def _normalize_suppressed_by(
+ raw_bindings: object,
+ ) -> list[dict[str, str]]:
+ normalized_bindings = sorted(
+ {
+ (
+ str(binding_map.get("rule", "")).strip(),
+ str(binding_map.get("source", "")).strip(),
+ )
+ for binding in _as_sequence(raw_bindings)
+ for binding_map in (_as_mapping(binding),)
+ if str(binding_map.get("rule", "")).strip()
+ },
+ key=lambda item: (item[0], item[1]),
+ )
+ if not normalized_bindings:
+ return []
+ return [
+ {
+ "rule": rule,
+ "source": source or INLINE_CODECLONE_SUPPRESSION_SOURCE,
+ }
+ for rule, source in normalized_bindings
+ ]
+
+ dead_items = sorted(
+ (
+ {
+ "qualname": str(item_map.get("qualname", "")),
+ "relative_path": _contract_path(
+ item_map.get("filepath", ""),
+ scan_root=scan_root,
+ )[0]
+ or "",
+ "start_line": _as_int(item_map.get("start_line")),
+ "end_line": _as_int(item_map.get("end_line")),
+ "kind": str(item_map.get("kind", "")),
+ "confidence": str(item_map.get("confidence", CONFIDENCE_MEDIUM)),
+ }
+ for item in _as_sequence(dead_code.get("items"))
+ for item_map in (_as_mapping(item),)
+ ),
+ key=lambda item: (
+ item["relative_path"],
+ item["start_line"],
+ item["end_line"],
+ item["qualname"],
+ item["kind"],
+ ),
+ )
+ dead_suppressed_items = sorted(
+ (
+ {
+ "qualname": str(item_map.get("qualname", "")),
+ "relative_path": _contract_path(
+ item_map.get("filepath", ""),
+ scan_root=scan_root,
+ )[0]
+ or "",
+ "start_line": _as_int(item_map.get("start_line")),
+ "end_line": _as_int(item_map.get("end_line")),
+ "kind": str(item_map.get("kind", "")),
+ "confidence": str(item_map.get("confidence", CONFIDENCE_MEDIUM)),
+ "suppressed_by": _normalize_suppressed_by(
+ item_map.get("suppressed_by")
+ ),
+ }
+ for item in _as_sequence(dead_code.get("suppressed_items"))
+ for item_map in (_as_mapping(item),)
+ ),
+ key=lambda item: (
+ item["relative_path"],
+ item["start_line"],
+ item["end_line"],
+ item["qualname"],
+ item["kind"],
+ item["confidence"],
+ tuple(
+ (
+ str(_as_mapping(binding).get("rule", "")),
+ str(_as_mapping(binding).get("source", "")),
+ )
+ for binding in _as_sequence(item.get("suppressed_by"))
+ ),
+ ),
+ )
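+    # Mirror the first suppression binding into flat rule/source fields.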
+ for item in dead_suppressed_items:
+ suppressed_by = _as_sequence(item.get("suppressed_by"))
+ first_binding = _as_mapping(suppressed_by[0]) if suppressed_by else {}
+ item["suppression_rule"] = str(first_binding.get("rule", ""))
+ item["suppression_source"] = str(first_binding.get("source", ""))
+
+ health = _as_mapping(metrics_map.get("health"))
+ health_dimensions = {
+ str(key): _as_int(value)
+ for key, value in sorted(_as_mapping(health.get("dimensions")).items())
+ }
+ overloaded_modules = _as_mapping(metrics_map.get(_OVERLOADED_MODULES_FAMILY))
+ overloaded_modules_detection = _as_mapping(overloaded_modules.get("detection"))
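+    # Candidate modules rank first, then by descending composite score.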
+ overloaded_module_items = sorted(
+ (
+ {
+ "module": str(item_map.get("module", "")).strip(),
+ "relative_path": _contract_path(
+ item_map.get("filepath", ""),
+ scan_root=scan_root,
+ )[0]
+ or "",
+ "source_kind": _normalized_source_kind(item_map.get("source_kind")),
+ "loc": _as_int(item_map.get("loc")),
+ "functions": _as_int(item_map.get("functions")),
+ "methods": _as_int(item_map.get("methods")),
+ "classes": _as_int(item_map.get("classes")),
+ "callable_count": _as_int(item_map.get("callable_count")),
+ "complexity_total": _as_int(item_map.get("complexity_total")),
+ "complexity_max": _as_int(item_map.get("complexity_max")),
+ "fan_in": _as_int(item_map.get("fan_in")),
+ "fan_out": _as_int(item_map.get("fan_out")),
+ "total_deps": _as_int(item_map.get("total_deps")),
+ "import_edges": _as_int(item_map.get("import_edges")),
+ "reimport_edges": _as_int(item_map.get("reimport_edges")),
+ "reimport_ratio": round(
+ _as_float(item_map.get("reimport_ratio")),
+ 4,
+ ),
+ "instability": round(_as_float(item_map.get("instability")), 4),
+ "hub_balance": round(_as_float(item_map.get("hub_balance")), 4),
+ "size_score": round(_as_float(item_map.get("size_score")), 4),
+ "dependency_score": round(
+ _as_float(item_map.get("dependency_score")),
+ 4,
+ ),
+ "shape_score": round(_as_float(item_map.get("shape_score")), 4),
+ "score": round(_as_float(item_map.get("score")), 4),
+ "candidate_status": str(
+ item_map.get("candidate_status", "non_candidate")
+ ),
+ "candidate_reasons": [
+ str(reason)
+ for reason in _as_sequence(item_map.get("candidate_reasons"))
+ if str(reason).strip()
+ ],
+ }
+ for item in _as_sequence(overloaded_modules.get("items"))
+ for item_map in (_as_mapping(item),)
+ ),
+ key=lambda item: (
+ {"candidate": 0, "ranked_only": 1, "non_candidate": 2}.get(
+ str(item["candidate_status"]),
+ 3,
+ ),
+ -_as_float(item["score"]),
+ -_as_float(item["size_score"]),
+ -_as_float(item["dependency_score"]),
+ item["relative_path"],
+ item["module"],
+ ),
+ )
+
+ complexity_summary = _as_mapping(complexity.get("summary"))
+ coupling_summary = _as_mapping(coupling.get("summary"))
+ cohesion_summary = _as_mapping(cohesion.get("summary"))
+ dead_code_summary = _as_mapping(dead_code.get("summary"))
+ overloaded_modules_summary = _as_mapping(overloaded_modules.get("summary"))
+ coverage_adoption = _as_mapping(metrics_map.get(_COVERAGE_ADOPTION_FAMILY))
+ coverage_adoption_summary = _as_mapping(coverage_adoption.get("summary"))
+ coverage_adoption_items = sorted(
+ (
+ {
+ "module": str(item_map.get("module", "")).strip(),
+ "relative_path": _contract_path(
+ item_map.get("filepath", ""),
+ scan_root=scan_root,
+ )[0]
+ or "",
+ "callable_count": _as_int(item_map.get("callable_count")),
+ "params_total": _as_int(item_map.get("params_total")),
+ "params_annotated": _as_int(item_map.get("params_annotated")),
+ "param_permille": _as_int(item_map.get("param_permille")),
+ "returns_total": _as_int(item_map.get("returns_total")),
+ "returns_annotated": _as_int(item_map.get("returns_annotated")),
+ "return_permille": _as_int(item_map.get("return_permille")),
+ "any_annotation_count": _as_int(item_map.get("any_annotation_count")),
+ "public_symbol_total": _as_int(item_map.get("public_symbol_total")),
+ "public_symbol_documented": _as_int(
+ item_map.get("public_symbol_documented")
+ ),
+ "docstring_permille": _as_int(item_map.get("docstring_permille")),
+ }
+ for item in _as_sequence(coverage_adoption.get("items"))
+ for item_map in (_as_mapping(item),)
+ ),
+ key=lambda item: (
+ item["relative_path"],
+ item["module"],
+ ),
+ )
+ api_surface = _as_mapping(metrics_map.get(_API_SURFACE_FAMILY))
+ api_surface_summary = _as_mapping(api_surface.get("summary"))
+ api_surface_items = sorted(
+ (
+ {
+ "record_kind": str(item_map.get("record_kind", "symbol")),
+ "module": str(item_map.get("module", "")).strip(),
+ "relative_path": _contract_path(
+ item_map.get("filepath", ""),
+ scan_root=scan_root,
+ )[0]
+ or "",
+ "qualname": str(item_map.get("qualname", "")),
+ "start_line": _as_int(item_map.get("start_line")),
+ "end_line": _as_int(item_map.get("end_line")),
+ "symbol_kind": str(item_map.get("symbol_kind", "")),
+ "exported_via": _optional_str(item_map.get("exported_via")),
+ "params_total": _as_int(item_map.get("params_total")),
+ "params": [
+ {
+ "name": str(param_map.get("name", "")),
+ "kind": str(param_map.get("kind", "")),
+ "has_default": bool(param_map.get("has_default")),
+ "annotated": bool(param_map.get("annotated")),
+ }
+ for param in _as_sequence(item_map.get("params"))
+ for param_map in (_as_mapping(param),)
+ ],
+ "returns_annotated": bool(item_map.get("returns_annotated")),
+ "change_kind": _optional_str(item_map.get("change_kind")),
+ "detail": _optional_str(item_map.get("detail")),
+ }
+ for item in _as_sequence(api_surface.get("items"))
+ for item_map in (_as_mapping(item),)
+ ),
+ key=lambda item: (
+ item["relative_path"],
+ item["start_line"],
+ item["end_line"],
+ item["qualname"],
+ item["record_kind"],
+ ),
+ )
+ coverage_join = _as_mapping(metrics_map.get(_COVERAGE_JOIN_FAMILY))
+ coverage_join_summary = _as_mapping(coverage_join.get("summary"))
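+    # Hotspots sort first, then higher risk, lower coverage, higher complexity.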
+ coverage_join_items = sorted(
+ (
+ {
+ "relative_path": _contract_path(
+ item_map.get("filepath", ""),
+ scan_root=scan_root,
+ )[0]
+ or "",
+ "qualname": str(item_map.get("qualname", "")).strip(),
+ "start_line": _as_int(item_map.get("start_line")),
+ "end_line": _as_int(item_map.get("end_line")),
+ "cyclomatic_complexity": _as_int(
+ item_map.get("cyclomatic_complexity"),
+ 1,
+ ),
+ "risk": str(item_map.get("risk", RISK_LOW)).strip() or RISK_LOW,
+ "executable_lines": _as_int(item_map.get("executable_lines")),
+ "covered_lines": _as_int(item_map.get("covered_lines")),
+ "coverage_permille": _as_int(item_map.get("coverage_permille")),
+ "coverage_status": str(item_map.get("coverage_status", "")).strip(),
+ "coverage_hotspot": bool(item_map.get("coverage_hotspot")),
+ "scope_gap_hotspot": bool(item_map.get("scope_gap_hotspot")),
+ }
+ for item in _as_sequence(coverage_join.get("items"))
+ for item_map in (_as_mapping(item),)
+ ),
+ key=lambda item: (
+ 0 if bool(item["coverage_hotspot"]) else 1,
+ 0 if bool(item["scope_gap_hotspot"]) else 1,
+ {"high": 0, "medium": 1, "low": 2}.get(str(item["risk"]), 3),
+ _as_int(item["coverage_permille"]),
+ -_as_int(item["cyclomatic_complexity"]),
+ item["relative_path"],
+ _as_int(item["start_line"]),
+ item["qualname"],
+ ),
+ )
+ security_surfaces = _as_mapping(metrics_map.get(_SECURITY_SURFACES_FAMILY))
+ security_surfaces_summary = _as_mapping(security_surfaces.get("summary"))
+ raw_category_counts = _as_mapping(security_surfaces_summary.get("categories"))
+ raw_source_kind_counts = _as_mapping(
+ security_surfaces_summary.get("by_source_kind")
+ )
+ security_surface_items = sorted(
+ (
+ {
+ "category": str(item_map.get("category", "")).strip(),
+ "capability": str(item_map.get("capability", "")).strip(),
+ "module": str(item_map.get("module", "")).strip(),
+ "qualname": str(item_map.get("qualname", "")).strip(),
+ "relative_path": _contract_path(
+ item_map.get("filepath", ""),
+ scan_root=scan_root,
+ )[0]
+ or "",
+ "source_kind": str(item_map.get("source_kind", SOURCE_KIND_OTHER)),
+ "start_line": _as_int(item_map.get("start_line")),
+ "end_line": _as_int(item_map.get("end_line")),
+ "location_scope": str(item_map.get("location_scope", "")).strip(),
+ "classification_mode": str(
+ item_map.get("classification_mode", "")
+ ).strip(),
+ "evidence_kind": str(item_map.get("evidence_kind", "")).strip(),
+ "evidence_symbol": str(item_map.get("evidence_symbol", "")).strip(),
+ }
+ for item in _as_sequence(security_surfaces.get("items"))
+ for item_map in (_as_mapping(item),)
+ ),
+ key=lambda item: (
+ item["relative_path"],
+ item["start_line"],
+ item["end_line"],
+ item["qualname"],
+ item["category"],
+ item["capability"],
+ item["evidence_symbol"],
+ ),
+ )
+ dead_high_confidence = sum(
+ 1
+ for item in dead_items
+ if str(_as_mapping(item).get("confidence", "")).strip().lower()
+ == CONFIDENCE_HIGH
+ )
+
+ family_sections: dict[str, object] = {
+ CATEGORY_COMPLEXITY: {
+ "summary": {
+ "total": len(complexity_items),
+ "average": round(_as_float(complexity_summary.get("average")), 2),
+ "max": _as_int(complexity_summary.get("max")),
+ "high_risk": _as_int(complexity_summary.get("high_risk")),
+ },
+ "items": complexity_items,
+ "items_truncated": False,
+ },
+ CATEGORY_COUPLING: {
+ "summary": {
+ "total": len(coupling_items),
+ "average": round(_as_float(coupling_summary.get("average")), 2),
+ "max": _as_int(coupling_summary.get("max")),
+ "high_risk": _as_int(coupling_summary.get("high_risk")),
+ },
+ "items": coupling_items,
+ "items_truncated": False,
+ },
+ CATEGORY_COHESION: {
+ "summary": {
+ "total": len(cohesion_items),
+ "average": round(_as_float(cohesion_summary.get("average")), 2),
+ "max": _as_int(cohesion_summary.get("max")),
+ "low_cohesion": _as_int(cohesion_summary.get("low_cohesion")),
+ },
+ "items": cohesion_items,
+ "items_truncated": False,
+ },
+ "dependencies": {
+ "summary": {
+ "modules": _as_int(dependencies.get("modules")),
+ "edges": _as_int(dependencies.get("edges")),
+ "cycles": len(dependency_cycles),
+ "max_depth": _as_int(dependencies.get("max_depth")),
+ "avg_depth": round(_as_float(dependencies.get("avg_depth")), 2),
+ "p95_depth": _as_int(dependencies.get("p95_depth")),
+ },
+ "items": dependency_edges,
+ "cycles": dependency_cycles,
+ "longest_chains": longest_chains,
+ "items_truncated": False,
+ },
+ FAMILY_DEAD_CODE: {
+ "summary": {
+ "total": len(dead_items),
+ "high_confidence": dead_high_confidence
+ or _as_int(
+ dead_code_summary.get(
+ "high_confidence", dead_code_summary.get("critical")
+ )
+ ),
+ "suppressed": len(dead_suppressed_items)
+ or _as_int(dead_code_summary.get("suppressed")),
+ },
+ "items": dead_items,
+ "suppressed_items": dead_suppressed_items,
+ "items_truncated": False,
+ },
+ "health": {
+ "summary": {
+ "score": _as_int(health.get("score")),
+ "grade": str(health.get("grade", "")),
+ "dimensions": health_dimensions,
+ },
+ "items": [],
+ "items_truncated": False,
+ },
+ _COVERAGE_ADOPTION_FAMILY: {
+ "summary": {
+ "modules": len(coverage_adoption_items),
+ "params_total": _as_int(coverage_adoption_summary.get("params_total")),
+ "params_annotated": _as_int(
+ coverage_adoption_summary.get("params_annotated")
+ ),
+ "param_permille": _as_int(
+ coverage_adoption_summary.get("param_permille")
+ ),
+ "baseline_diff_available": bool(
+ coverage_adoption_summary.get("baseline_diff_available")
+ ),
+ "param_delta": _as_int(coverage_adoption_summary.get("param_delta")),
+ "returns_total": _as_int(
+ coverage_adoption_summary.get("returns_total")
+ ),
+ "returns_annotated": _as_int(
+ coverage_adoption_summary.get("returns_annotated")
+ ),
+ "return_permille": _as_int(
+ coverage_adoption_summary.get("return_permille")
+ ),
+ "return_delta": _as_int(coverage_adoption_summary.get("return_delta")),
+ "public_symbol_total": _as_int(
+ coverage_adoption_summary.get("public_symbol_total")
+ ),
+ "public_symbol_documented": _as_int(
+ coverage_adoption_summary.get("public_symbol_documented")
+ ),
+ "docstring_permille": _as_int(
+ coverage_adoption_summary.get("docstring_permille")
+ ),
+ "docstring_delta": _as_int(
+ coverage_adoption_summary.get("docstring_delta")
+ ),
+ "typing_any_count": _as_int(
+ coverage_adoption_summary.get("typing_any_count")
+ ),
+ },
+ "items": coverage_adoption_items,
+ "items_truncated": False,
+ },
+ _API_SURFACE_FAMILY: {
+ "summary": {
+ "enabled": bool(api_surface_summary.get("enabled")),
+ "baseline_diff_available": bool(
+ api_surface_summary.get("baseline_diff_available")
+ ),
+ "modules": _as_int(api_surface_summary.get("modules")),
+ "public_symbols": _as_int(api_surface_summary.get("public_symbols")),
+ "added": _as_int(api_surface_summary.get("added")),
+ "breaking": _as_int(api_surface_summary.get("breaking")),
+ "strict_types": bool(api_surface_summary.get("strict_types")),
+ },
+ "items": api_surface_items,
+ "items_truncated": False,
+ },
+ _OVERLOADED_MODULES_FAMILY: {
+ "summary": {
+ "total": len(overloaded_module_items),
+ "candidates": _as_int(overloaded_modules_summary.get("candidates")),
+ "population_status": str(
+ overloaded_modules_summary.get("population_status", "limited")
+ ),
+ "top_score": round(
+ _as_float(overloaded_modules_summary.get("top_score")),
+ 4,
+ ),
+ "average_score": round(
+ _as_float(overloaded_modules_summary.get("average_score")),
+ 4,
+ ),
+ "candidate_score_cutoff": round(
+ _as_float(overloaded_modules_summary.get("candidate_score_cutoff")),
+ 4,
+ ),
+ },
+ "detection": {
+ "version": str(overloaded_modules_detection.get("version", "1")),
+ "scope": str(overloaded_modules_detection.get("scope", "report_only")),
+ "strategy": str(
+ overloaded_modules_detection.get(
+ "strategy",
+ "project_relative_composite",
+ )
+ ),
+ "minimum_population": _as_int(
+ overloaded_modules_detection.get("minimum_population"),
+ ),
+ "size_signals": [
+ str(signal)
+ for signal in _as_sequence(
+ overloaded_modules_detection.get("size_signals")
+ )
+ if str(signal).strip()
+ ],
+ "dependency_signals": [
+ str(signal)
+ for signal in _as_sequence(
+ overloaded_modules_detection.get("dependency_signals")
+ )
+ if str(signal).strip()
+ ],
+ "shape_signals": [
+ str(signal)
+ for signal in _as_sequence(
+ overloaded_modules_detection.get("shape_signals")
+ )
+ if str(signal).strip()
+ ],
+ },
+ "items": overloaded_module_items,
+ "items_truncated": False,
+ },
+ _SECURITY_SURFACES_FAMILY: {
+ "summary": {
+ "items": _as_int(security_surfaces_summary.get("items")),
+ "modules": _as_int(security_surfaces_summary.get("modules")),
+ "exact_items": _as_int(security_surfaces_summary.get("exact_items")),
+ "category_count": _as_int(
+ security_surfaces_summary.get("category_count")
+ ),
+ "categories": {
+ str(key): _as_int(value)
+ for key, value in sorted(raw_category_counts.items())
+ if str(key).strip()
+ },
+ "by_source_kind": {
+ SOURCE_KIND_PRODUCTION: _as_int(
+ raw_source_kind_counts.get(SOURCE_KIND_PRODUCTION)
+ ),
+ SOURCE_KIND_TESTS: _as_int(
+ raw_source_kind_counts.get(SOURCE_KIND_TESTS)
+ ),
+ SOURCE_KIND_FIXTURES: _as_int(
+ raw_source_kind_counts.get(SOURCE_KIND_FIXTURES)
+ ),
+ SOURCE_KIND_OTHER: _as_int(
+ raw_source_kind_counts.get(SOURCE_KIND_OTHER)
+ ),
+ },
+ "production": _as_int(security_surfaces_summary.get("production")),
+ "tests": _as_int(security_surfaces_summary.get("tests")),
+ "fixtures": _as_int(security_surfaces_summary.get("fixtures")),
+ "other": _as_int(security_surfaces_summary.get("other")),
+ "report_only": bool(security_surfaces_summary.get("report_only")),
+ },
+ "items": security_surface_items,
+ "items_truncated": False,
+ },
+ }
+ if coverage_join_summary or coverage_join_items or coverage_join:
+ family_sections[_COVERAGE_JOIN_FAMILY] = {
+ "summary": {
+ "status": str(coverage_join_summary.get("status", "")),
+ "source": _contract_path(
+ coverage_join_summary.get("source", ""),
+ scan_root=scan_root,
+ )[0],
+ "files": _as_int(coverage_join_summary.get("files")),
+ "units": _as_int(coverage_join_summary.get("units")),
+ "measured_units": _as_int(coverage_join_summary.get("measured_units")),
+ "overall_executable_lines": _as_int(
+ coverage_join_summary.get("overall_executable_lines")
+ ),
+ "overall_covered_lines": _as_int(
+ coverage_join_summary.get("overall_covered_lines")
+ ),
+ "overall_permille": _as_int(
+ coverage_join_summary.get("overall_permille")
+ ),
+ "missing_from_report_units": _as_int(
+ coverage_join_summary.get("missing_from_report_units")
+ ),
+ "coverage_hotspots": _as_int(
+ coverage_join_summary.get("coverage_hotspots")
+ ),
+ "scope_gap_hotspots": _as_int(
+ coverage_join_summary.get("scope_gap_hotspots")
+ ),
+ "hotspot_threshold_percent": _as_int(
+ coverage_join_summary.get("hotspot_threshold_percent")
+ ),
+ "invalid_reason": _optional_str(
+ coverage_join_summary.get("invalid_reason")
+ ),
+ },
+ "items": coverage_join_items,
+ "items_truncated": False,
+ }
+ normalized: dict[str, object] = {}
+ for family in METRIC_FAMILIES.values():
+ section = family.report_section
+ if section in family_sections:
+ normalized[section] = family_sections[section]
+ return normalized
+
+
+def _build_metrics_payload(
+ metrics: Mapping[str, object] | None,
+ *,
+ scan_root: str,
+) -> dict[str, object]:
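+    """Shape normalized metric families into the report's metrics payload."""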
+ families = _normalize_metrics_families(metrics, scan_root=scan_root)
+ return {
+ "summary": {
+ family_name: _as_mapping(_as_mapping(family_payload).get("summary"))
+ for family_name, family_payload in families.items()
+ },
+ "families": families,
+ }
diff --git a/codeclone/report/explain.py b/codeclone/report/explain.py
index 73605b0..2a85a1f 100644
--- a/codeclone/report/explain.py
+++ b/codeclone/report/explain.py
@@ -12,7 +12,7 @@
from pathlib import Path
from typing import TYPE_CHECKING
-from .._coerce import as_int
+from ..utils.coerce import as_int
from .explain_contract import (
BLOCK_HINT_ASSERT_ONLY,
BLOCK_HINT_ASSERT_ONLY_LABEL,
diff --git a/codeclone/report/findings.py b/codeclone/report/findings.py
index 350b836..7843967 100644
--- a/codeclone/report/findings.py
+++ b/codeclone/report/findings.py
@@ -6,7 +6,7 @@
"""Deterministic structural-finding helpers for the report layer.
-HTML rendering lives in ``codeclone._html_report._sections._structural``.
+HTML rendering lives in ``codeclone.report.html.sections._structural``.
"""
from __future__ import annotations
diff --git a/codeclone/report/gates/__init__.py b/codeclone/report/gates/__init__.py
new file mode 100644
index 0000000..9135843
--- /dev/null
+++ b/codeclone/report/gates/__init__.py
@@ -0,0 +1,5 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
diff --git a/codeclone/report/gates/evaluator.py b/codeclone/report/gates/evaluator.py
new file mode 100644
index 0000000..62310f2
--- /dev/null
+++ b/codeclone/report/gates/evaluator.py
@@ -0,0 +1,681 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from collections.abc import Callable, Mapping
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+from ...contracts import DEFAULT_COVERAGE_MIN, ExitCode
+from ...metrics.registry import METRIC_FAMILIES
+from ...utils.coerce import as_int as _as_int
+from ...utils.coerce import as_mapping as _as_mapping
+from ...utils.coerce import as_sequence as _as_sequence
+
+if TYPE_CHECKING:
+ from ...models import CoverageJoinResult, ProjectMetrics
+
+
+@dataclass(frozen=True, slots=True)
+class MetricGateConfig:
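+    """Gate thresholds and flags; negative thresholds disable their gate."""
+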
+ fail_complexity: int
+ fail_coupling: int
+ fail_cohesion: int
+ fail_cycles: bool
+ fail_dead_code: bool
+ fail_health: int
+ fail_on_new_metrics: bool
+ fail_on_typing_regression: bool = False
+ fail_on_docstring_regression: bool = False
+ fail_on_api_break: bool = False
+ fail_on_untested_hotspots: bool = False
+ min_typing_coverage: int = -1
+ min_docstring_coverage: int = -1
+ coverage_min: int = DEFAULT_COVERAGE_MIN
+ fail_on_new: bool = False
+ fail_threshold: int = -1
+
+
+@dataclass(frozen=True, slots=True)
+class GateResult:
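+    """Gate evaluation outcome: a process exit code plus machine-readable reasons."""
+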
+ exit_code: int
+ reasons: tuple[str, ...]
+
+
+@dataclass(frozen=True, slots=True)
+class GateState:
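+    """Normalized snapshot of metrics and baseline deltas consumed by gate builders."""
+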
+ clone_new_count: int = 0
+ clone_total: int = 0
+ complexity_max: int = 0
+ coupling_max: int = 0
+ cohesion_max: int = 0
+ dependency_cycles: int = 0
+ dead_high_confidence: int = 0
+ health_score: int = 0
+ typing_param_permille: int = 0
+ docstring_permille: int = 0
+ coverage_join_status: str = ""
+ coverage_hotspots: int = 0
+ api_breaking_changes: int = 0
+ diff_new_high_risk_functions: int = 0
+ diff_new_high_coupling_classes: int = 0
+ diff_new_cycles: int = 0
+ diff_new_dead_code: int = 0
+ diff_health_delta: int = 0
+ diff_typing_param_permille_delta: int = 0
+ diff_typing_return_permille_delta: int = 0
+ diff_docstring_permille_delta: int = 0
+
+
+def summarize_metrics_diff(metrics_diff: object | None) -> dict[str, object] | None:
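+    """Normalize a metrics diff (mapping or object form) into plain integer counters."""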
+ if metrics_diff is None:
+ return None
+
+ if isinstance(metrics_diff, Mapping):
+ payload = metrics_diff
+ return {
+ "new_high_risk_functions": _as_int(
+ payload.get("new_high_risk_functions"),
+ 0,
+ ),
+ "new_high_coupling_classes": _as_int(
+ payload.get("new_high_coupling_classes"),
+ 0,
+ ),
+ "new_cycles": _as_int(payload.get("new_cycles"), 0),
+ "new_dead_code": _as_int(payload.get("new_dead_code"), 0),
+ "health_delta": _as_int(payload.get("health_delta"), 0),
+ "typing_param_permille_delta": _as_int(
+ payload.get("typing_param_permille_delta"),
+ 0,
+ ),
+ "typing_return_permille_delta": _as_int(
+ payload.get("typing_return_permille_delta"),
+ 0,
+ ),
+ "docstring_permille_delta": _as_int(
+ payload.get("docstring_permille_delta"),
+ 0,
+ ),
+ "new_api_symbols": _as_int(payload.get("new_api_symbols"), 0),
+ "api_breaking_changes": _as_int(
+ payload.get("api_breaking_changes"),
+ _as_int(payload.get("new_api_breaking_changes"), 0),
+ ),
+ }
+
+ new_high_risk_functions = tuple(
+ str(item)
+ for item in _as_sequence(getattr(metrics_diff, "new_high_risk_functions", ()))
+ if str(item).strip()
+ )
+ new_high_coupling_classes = tuple(
+ str(item)
+ for item in _as_sequence(getattr(metrics_diff, "new_high_coupling_classes", ()))
+ if str(item).strip()
+ )
+ new_cycles = tuple(
+ tuple(str(part) for part in _as_sequence(item) if str(part).strip())
+ for item in _as_sequence(getattr(metrics_diff, "new_cycles", ()))
+ )
+ new_dead_code = tuple(
+ str(item)
+ for item in _as_sequence(getattr(metrics_diff, "new_dead_code", ()))
+ if str(item).strip()
+ )
+ api_breaking_changes = tuple(
+ _as_sequence(getattr(metrics_diff, "new_api_breaking_changes", ()))
+ )
+ new_api_symbols = tuple(_as_sequence(getattr(metrics_diff, "new_api_symbols", ())))
+ return {
+ "new_high_risk_functions": len(new_high_risk_functions),
+ "new_high_coupling_classes": len(new_high_coupling_classes),
+ "new_cycles": len(new_cycles),
+ "new_dead_code": len(new_dead_code),
+ "health_delta": _as_int(getattr(metrics_diff, "health_delta", 0), 0),
+ "typing_param_permille_delta": _as_int(
+ getattr(metrics_diff, "typing_param_permille_delta", 0),
+ 0,
+ ),
+ "typing_return_permille_delta": _as_int(
+ getattr(metrics_diff, "typing_return_permille_delta", 0),
+ 0,
+ ),
+ "docstring_permille_delta": _as_int(
+ getattr(metrics_diff, "docstring_permille_delta", 0),
+ 0,
+ ),
+ "new_api_symbols": len(new_api_symbols),
+ "api_breaking_changes": len(api_breaking_changes),
+ }
+
+
+def gate_state_from_project_metrics(
+ *,
+ project_metrics: ProjectMetrics,
+ coverage_join: CoverageJoinResult | None,
+ metrics_diff: object | None,
+ clone_new_count: int = 0,
+ clone_total: int = 0,
+) -> GateState:
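+    """Build a GateState from in-memory ProjectMetrics, coverage join, and diff."""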
+ diff_summary = summarize_metrics_diff(metrics_diff) or {}
+ return GateState(
+ clone_new_count=max(clone_new_count, 0),
+ clone_total=max(clone_total, 0),
+ complexity_max=max(int(project_metrics.complexity_max), 0),
+ coupling_max=max(int(project_metrics.coupling_max), 0),
+ cohesion_max=max(int(project_metrics.cohesion_max), 0),
+ dependency_cycles=len(tuple(project_metrics.dependency_cycles)),
+ dead_high_confidence=sum(
+ 1
+ for item in project_metrics.dead_code
+ if str(getattr(item, "confidence", "")).strip().lower() == "high"
+ ),
+ health_score=max(int(project_metrics.health.total), 0),
+ typing_param_permille=_permille(
+ int(project_metrics.typing_param_annotated),
+ int(project_metrics.typing_param_total),
+ ),
+ docstring_permille=_permille(
+ int(project_metrics.docstring_public_documented),
+ int(project_metrics.docstring_public_total),
+ ),
+ coverage_join_status=(
+ str(coverage_join.status) if coverage_join is not None else ""
+ ),
+ coverage_hotspots=(
+ int(coverage_join.coverage_hotspots) if coverage_join is not None else 0
+ ),
+ api_breaking_changes=_as_int(diff_summary.get("api_breaking_changes"), 0),
+ diff_new_high_risk_functions=_as_int(
+ diff_summary.get("new_high_risk_functions"),
+ 0,
+ ),
+ diff_new_high_coupling_classes=_as_int(
+ diff_summary.get("new_high_coupling_classes"),
+ 0,
+ ),
+ diff_new_cycles=_as_int(diff_summary.get("new_cycles"), 0),
+ diff_new_dead_code=_as_int(diff_summary.get("new_dead_code"), 0),
+ diff_health_delta=_as_int(diff_summary.get("health_delta"), 0),
+ diff_typing_param_permille_delta=_as_int(
+ diff_summary.get("typing_param_permille_delta"),
+ 0,
+ ),
+ diff_typing_return_permille_delta=_as_int(
+ diff_summary.get("typing_return_permille_delta"),
+ 0,
+ ),
+ diff_docstring_permille_delta=_as_int(
+ diff_summary.get("docstring_permille_delta"),
+ 0,
+ ),
+ )
+
+
+def metric_gate_reasons_for_state(
+ *,
+ state: GateState,
+ config: MetricGateConfig,
+) -> tuple[str, ...]:
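+    """Collect gate reasons in the stable order defined by ``_GATE_REASON_ORDER``."""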
+ gate_keys = sorted(
+ {
+ gate_key
+ for family in METRIC_FAMILIES.values()
+ for gate_key in family.gate_keys
+ },
+ key=lambda gate_key: (_GATE_REASON_ORDER.get(gate_key, 999), gate_key),
+ )
+ reasons: list[str] = []
+ for gate_key in gate_keys:
+ builder = _GATE_REASON_BUILDERS.get(gate_key)
+ if builder is None:
+ continue
+ reasons.extend(builder(state=state, config=config))
+ return tuple(reasons)
+
+
+_GATE_REASON_ORDER = {
+ "complexity_threshold": 10,
+ "coupling_threshold": 20,
+ "cohesion_threshold": 30,
+ "health_threshold": 40,
+ "dependency_cycles": 50,
+ "dead_code_high_confidence": 60,
+ "new_high_risk_functions": 70,
+ "new_high_coupling_classes": 80,
+ "new_dependency_cycles": 90,
+ "new_dead_code": 100,
+ "health_regression": 110,
+ "typing_coverage_threshold": 120,
+ "docstring_coverage_threshold": 130,
+ "typing_regression": 140,
+ "docstring_regression": 150,
+ "api_breaking_changes": 160,
+ "coverage_hotspots": 170,
+}
+
+
+def _reason_if(triggered: bool, message: str) -> tuple[str, ...]:
+ return (message,) if triggered else ()
+
+
+def _complexity_threshold_reason(
+ *,
+ state: GateState,
+ config: MetricGateConfig,
+) -> tuple[str, ...]:
+ return _reason_if(
+ 0 <= config.fail_complexity < state.complexity_max,
+ "Complexity threshold exceeded: "
+ f"max CC={state.complexity_max}, "
+ f"threshold={config.fail_complexity}.",
+ )
+
+
+def _coupling_threshold_reason(
+ *,
+ state: GateState,
+ config: MetricGateConfig,
+) -> tuple[str, ...]:
+ return _reason_if(
+ 0 <= config.fail_coupling < state.coupling_max,
+ "Coupling threshold exceeded: "
+ f"max CBO={state.coupling_max}, "
+ f"threshold={config.fail_coupling}.",
+ )
+
+
+def _cohesion_threshold_reason(
+ *,
+ state: GateState,
+ config: MetricGateConfig,
+) -> tuple[str, ...]:
+ return _reason_if(
+ 0 <= config.fail_cohesion < state.cohesion_max,
+ "Cohesion threshold exceeded: "
+ f"max LCOM4={state.cohesion_max}, "
+ f"threshold={config.fail_cohesion}.",
+ )
+
+
+def _health_threshold_reason(
+ *,
+ state: GateState,
+ config: MetricGateConfig,
+) -> tuple[str, ...]:
+ return _reason_if(
+ config.fail_health >= 0 and state.health_score < config.fail_health,
+ "Health score below threshold: "
+ f"score={state.health_score}, threshold={config.fail_health}.",
+ )
+
+
+def _dependency_cycles_reason(
+ *,
+ state: GateState,
+ config: MetricGateConfig,
+) -> tuple[str, ...]:
+ return _reason_if(
+ config.fail_cycles and state.dependency_cycles > 0,
+ f"Dependency cycles detected: {state.dependency_cycles} cycle(s).",
+ )
+
+
+def _dead_code_high_confidence_reason(
+ *,
+ state: GateState,
+ config: MetricGateConfig,
+) -> tuple[str, ...]:
+ return _reason_if(
+ config.fail_dead_code and state.dead_high_confidence > 0,
+ f"Dead code detected (high confidence): {state.dead_high_confidence} item(s).",
+ )
+
+
+def _new_high_risk_functions_reason(
+ *,
+ state: GateState,
+ config: MetricGateConfig,
+) -> tuple[str, ...]:
+ return _reason_if(
+ config.fail_on_new_metrics and state.diff_new_high_risk_functions > 0,
+ "New high-risk functions vs metrics baseline: "
+ f"{state.diff_new_high_risk_functions}.",
+ )
+
+
+def _new_high_coupling_classes_reason(
+ *,
+ state: GateState,
+ config: MetricGateConfig,
+) -> tuple[str, ...]:
+ return _reason_if(
+ config.fail_on_new_metrics and state.diff_new_high_coupling_classes > 0,
+ "New high-coupling classes vs metrics baseline: "
+ f"{state.diff_new_high_coupling_classes}.",
+ )
+
+
+def _new_dependency_cycles_reason(
+ *,
+ state: GateState,
+ config: MetricGateConfig,
+) -> tuple[str, ...]:
+ return _reason_if(
+ config.fail_on_new_metrics and state.diff_new_cycles > 0,
+ f"New dependency cycles vs metrics baseline: {state.diff_new_cycles}.",
+ )
+
+
+def _new_dead_code_reason(
+ *,
+ state: GateState,
+ config: MetricGateConfig,
+) -> tuple[str, ...]:
+ return _reason_if(
+ config.fail_on_new_metrics and state.diff_new_dead_code > 0,
+ f"New dead code items vs metrics baseline: {state.diff_new_dead_code}.",
+ )
+
+
+def _health_regression_reason(
+ *,
+ state: GateState,
+ config: MetricGateConfig,
+) -> tuple[str, ...]:
+ return _reason_if(
+ config.fail_on_new_metrics and state.diff_health_delta < 0,
+ f"Health score regressed vs metrics baseline: delta={state.diff_health_delta}.",
+ )
+
+
+def _typing_coverage_threshold_reason(
+ *,
+ state: GateState,
+ config: MetricGateConfig,
+) -> tuple[str, ...]:
+ typing_percent = state.typing_param_permille / 10.0
+ return _reason_if(
+ config.min_typing_coverage >= 0
+ and typing_percent < float(config.min_typing_coverage),
+ "Typing coverage below threshold: "
+ f"coverage={typing_percent:.1f}%, threshold={config.min_typing_coverage}%.",
+ )
+
+
+def _docstring_coverage_threshold_reason(
+ *,
+ state: GateState,
+ config: MetricGateConfig,
+) -> tuple[str, ...]:
+ docstring_percent = state.docstring_permille / 10.0
+ return _reason_if(
+ config.min_docstring_coverage >= 0
+ and docstring_percent < float(config.min_docstring_coverage),
+ "Docstring coverage below threshold: "
+ f"coverage={docstring_percent:.1f}%, "
+ f"threshold={config.min_docstring_coverage}%.",
+ )
+
+
+def _typing_regression_reason(
+ *,
+ state: GateState,
+ config: MetricGateConfig,
+) -> tuple[str, ...]:
+ return _reason_if(
+ config.fail_on_typing_regression
+ and (
+ state.diff_typing_param_permille_delta < 0
+ or state.diff_typing_return_permille_delta < 0
+ ),
+ "Typing coverage regressed vs metrics baseline: "
+ f"params_delta={state.diff_typing_param_permille_delta}, "
+ f"returns_delta={state.diff_typing_return_permille_delta}.",
+ )
+
+
+def _docstring_regression_reason(
+ *,
+ state: GateState,
+ config: MetricGateConfig,
+) -> tuple[str, ...]:
+ return _reason_if(
+ config.fail_on_docstring_regression and state.diff_docstring_permille_delta < 0,
+ "Docstring coverage regressed vs metrics baseline: "
+ f"delta={state.diff_docstring_permille_delta}.",
+ )
+
+
+def _api_breaking_changes_reason(
+ *,
+ state: GateState,
+ config: MetricGateConfig,
+) -> tuple[str, ...]:
+ return _reason_if(
+ config.fail_on_api_break and state.api_breaking_changes > 0,
+ "Public API breaking changes vs metrics baseline: "
+ f"{state.api_breaking_changes}.",
+ )
+
+
+def _coverage_hotspots_reason(
+ *,
+ state: GateState,
+ config: MetricGateConfig,
+) -> tuple[str, ...]:
+ return _reason_if(
+ config.fail_on_untested_hotspots
+ and state.coverage_join_status == "ok"
+ and state.coverage_hotspots > 0,
+ "Coverage hotspots detected: "
+ f"hotspots={state.coverage_hotspots}, "
+ f"threshold={config.coverage_min}%.",
+ )
+
+
+_GATE_REASON_BUILDERS: dict[str, Callable[..., tuple[str, ...]]] = {
+ "complexity_threshold": _complexity_threshold_reason,
+ "coupling_threshold": _coupling_threshold_reason,
+ "cohesion_threshold": _cohesion_threshold_reason,
+ "health_threshold": _health_threshold_reason,
+ "dependency_cycles": _dependency_cycles_reason,
+ "dead_code_high_confidence": _dead_code_high_confidence_reason,
+ "new_high_risk_functions": _new_high_risk_functions_reason,
+ "new_high_coupling_classes": _new_high_coupling_classes_reason,
+ "new_dependency_cycles": _new_dependency_cycles_reason,
+ "new_dead_code": _new_dead_code_reason,
+ "health_regression": _health_regression_reason,
+ "typing_coverage_threshold": _typing_coverage_threshold_reason,
+ "docstring_coverage_threshold": _docstring_coverage_threshold_reason,
+ "typing_regression": _typing_regression_reason,
+ "docstring_regression": _docstring_regression_reason,
+ "api_breaking_changes": _api_breaking_changes_reason,
+ "coverage_hotspots": _coverage_hotspots_reason,
+}
+
+
+def evaluate_gate_state(
+ *,
+ state: GateState,
+ config: MetricGateConfig,
+) -> GateResult:
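+    """Merge metric-gate reasons with clone gates into one GateResult.
+
+    Reasons carry a ``metric:`` or ``clone:`` prefix so callers can attribute
+    failures. A minimal illustrative sketch (threshold values are hypothetical):
+
+    >>> config = MetricGateConfig(
+    ...     fail_complexity=-1, fail_coupling=-1, fail_cohesion=-1,
+    ...     fail_cycles=False, fail_dead_code=False, fail_health=-1,
+    ...     fail_on_new_metrics=False, fail_on_new=True,
+    ... )
+    >>> evaluate_gate_state(state=GateState(clone_new_count=2), config=config).reasons
+    ('clone:new',)
+    """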
+ reasons = [
+ f"metric:{reason}"
+ for reason in metric_gate_reasons_for_state(state=state, config=config)
+ ]
+
+ if config.fail_on_new and state.clone_new_count > 0:
+ reasons.append("clone:new")
+
+ if 0 <= config.fail_threshold < state.clone_total:
+ reasons.append(f"clone:threshold:{state.clone_total}:{config.fail_threshold}")
+
+ if reasons:
+ return GateResult(
+ exit_code=int(ExitCode.GATING_FAILURE),
+ reasons=tuple(reasons),
+ )
+ return GateResult(exit_code=int(ExitCode.SUCCESS), reasons=())
+
+
+# codeclone: ignore[dead-code]
+def metric_gate_reasons(
+ *,
+ report_document: Mapping[str, object],
+ config: MetricGateConfig,
+ metrics_diff: object | None = None,
+) -> tuple[str, ...]:
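+    """Compute metric gate reasons from a normalized report document."""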
+ state = _gate_state_from_report_document(
+ report_document=report_document,
+ metrics_diff=metrics_diff,
+ )
+ return metric_gate_reasons_for_state(state=state, config=config)
+
+
+def evaluate_gates(
+ *,
+ report_document: Mapping[str, object],
+ config: MetricGateConfig,
+ baseline_status: str | None = None,
+ metrics_diff: object | None = None,
+ clone_new_count: int | None = None,
+ clone_total: int | None = None,
+) -> GateResult:
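+    """Evaluate all gates against a report document.
+
+    ``baseline_status`` is accepted for call-site compatibility but unused.
+    """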
+ _ = baseline_status
+ state = _gate_state_from_report_document(
+ report_document=report_document,
+ metrics_diff=metrics_diff,
+ clone_new_count=clone_new_count,
+ clone_total=clone_total,
+ )
+ return evaluate_gate_state(state=state, config=config)
+
+
+def _gate_state_from_report_document(
+ *,
+ report_document: Mapping[str, object],
+ metrics_diff: object | None,
+ clone_new_count: int | None = None,
+ clone_total: int | None = None,
+) -> GateState:
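+    """Rebuild a GateState from report sections, preferring ``metrics_diff`` deltas."""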
+ findings = _as_mapping(report_document.get("findings"))
+ groups = _as_mapping(findings.get("groups"))
+ clone_groups = _as_mapping(groups.get("clones"))
+ function_groups = _as_sequence(clone_groups.get("functions"))
+ block_groups = _as_sequence(clone_groups.get("blocks"))
+ derived_clone_new_count = sum(
+ 1
+ for group in (*function_groups, *block_groups)
+ if str(_as_mapping(group).get("novelty", "")).strip() == "new"
+ )
+ metrics = _as_mapping(report_document.get("metrics"))
+ families = _as_mapping(metrics.get("families"))
+ complexity_summary = _as_mapping(
+ _as_mapping(families.get("complexity")).get("summary")
+ )
+ coupling_summary = _as_mapping(_as_mapping(families.get("coupling")).get("summary"))
+ cohesion_summary = _as_mapping(_as_mapping(families.get("cohesion")).get("summary"))
+ dependencies_summary = _as_mapping(
+ _as_mapping(families.get("dependencies")).get("summary")
+ )
+ dead_code_summary = _as_mapping(
+ _as_mapping(families.get("dead_code")).get("summary")
+ )
+ health_summary = _as_mapping(_as_mapping(families.get("health")).get("summary"))
+ coverage_adoption_summary = _as_mapping(
+ _as_mapping(families.get("coverage_adoption")).get("summary")
+ )
+ api_surface_summary = _as_mapping(
+ _as_mapping(families.get("api_surface")).get("summary")
+ )
+ coverage_join_summary = _as_mapping(
+ _as_mapping(families.get("coverage_join")).get("summary")
+ )
+ diff_summary = summarize_metrics_diff(metrics_diff) or {}
+ prefer_diff_summary = metrics_diff is not None
+ return GateState(
+ clone_new_count=max(
+ clone_new_count if clone_new_count is not None else derived_clone_new_count,
+ 0,
+ ),
+ clone_total=max(
+ clone_total
+ if clone_total is not None
+ else len(function_groups) + len(block_groups),
+ 0,
+ ),
+ complexity_max=_as_int(complexity_summary.get("max"), 0),
+ coupling_max=_as_int(coupling_summary.get("max"), 0),
+ cohesion_max=_as_int(cohesion_summary.get("max"), 0),
+ dependency_cycles=_as_int(dependencies_summary.get("cycles"), 0),
+ dead_high_confidence=_as_int(dead_code_summary.get("high_confidence"), 0),
+ health_score=_as_int(health_summary.get("score"), 0),
+ typing_param_permille=_as_int(
+ coverage_adoption_summary.get("param_permille"), 0
+ ),
+ docstring_permille=_as_int(
+ coverage_adoption_summary.get("docstring_permille"),
+ 0,
+ ),
+ coverage_join_status=str(coverage_join_summary.get("status", "")),
+ coverage_hotspots=_as_int(
+ coverage_join_summary.get("coverage_hotspots"),
+ 0,
+ ),
+ api_breaking_changes=(
+ _as_int(diff_summary.get("api_breaking_changes"), 0)
+ if prefer_diff_summary
+ else _as_int(api_surface_summary.get("breaking"), 0)
+ ),
+ diff_new_high_risk_functions=_as_int(
+ diff_summary.get("new_high_risk_functions"),
+ 0,
+ ),
+ diff_new_high_coupling_classes=_as_int(
+ diff_summary.get("new_high_coupling_classes"),
+ 0,
+ ),
+ diff_new_cycles=_as_int(diff_summary.get("new_cycles"), 0),
+ diff_new_dead_code=_as_int(diff_summary.get("new_dead_code"), 0),
+ diff_health_delta=_as_int(diff_summary.get("health_delta"), 0),
+ diff_typing_param_permille_delta=(
+ _as_int(diff_summary.get("typing_param_permille_delta"), 0)
+ if prefer_diff_summary
+ else _as_int(coverage_adoption_summary.get("param_delta"), 0)
+ ),
+ diff_typing_return_permille_delta=(
+ _as_int(diff_summary.get("typing_return_permille_delta"), 0)
+ if prefer_diff_summary
+ else _as_int(coverage_adoption_summary.get("return_delta"), 0)
+ ),
+ diff_docstring_permille_delta=(
+ _as_int(diff_summary.get("docstring_permille_delta"), 0)
+ if prefer_diff_summary
+ else _as_int(coverage_adoption_summary.get("docstring_delta"), 0)
+ ),
+ )
+
+
+def _permille(numerator: int, denominator: int) -> int:
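+    """Scale a ratio to integer permille, e.g. ``_permille(1, 3) == 333``."""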
+ if denominator <= 0:
+ return 0
+ return round(numerator * 1000 / denominator)
+
+
+__all__ = [
+ "GateResult",
+ "GateState",
+ "MetricGateConfig",
+ "evaluate_gate_state",
+ "evaluate_gates",
+ "gate_state_from_project_metrics",
+ "metric_gate_reasons",
+ "metric_gate_reasons_for_state",
+ "summarize_metrics_diff",
+]
diff --git a/codeclone/_cli_gating.py b/codeclone/report/gates/reasons.py
similarity index 100%
rename from codeclone/_cli_gating.py
rename to codeclone/report/gates/reasons.py
diff --git a/codeclone/_html_report/__init__.py b/codeclone/report/html/__init__.py
similarity index 76%
rename from codeclone/_html_report/__init__.py
rename to codeclone/report/html/__init__.py
index 69b89c1..cdde57d 100644
--- a/codeclone/_html_report/__init__.py
+++ b/codeclone/report/html/__init__.py
@@ -4,10 +4,10 @@
# SPDX-License-Identifier: MPL-2.0
# Copyright (c) 2026 Den Rozhnovskiy
-"""New HTML report package — component-based architecture."""
+"""Canonical HTML report package."""
from __future__ import annotations
-from ._assemble import build_html_report
+from .assemble import build_html_report
__all__ = ["build_html_report"]
diff --git a/codeclone/_html_report/_context.py b/codeclone/report/html/_context.py
similarity index 93%
rename from codeclone/_html_report/_context.py
rename to codeclone/report/html/_context.py
index efac981..02865cb 100644
--- a/codeclone/_html_report/_context.py
+++ b/codeclone/report/html/_context.py
@@ -12,19 +12,19 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING
-from .._coerce import as_mapping as _as_mapping
-from ..contracts import REPORT_SCHEMA_VERSION
-from ..report.overview import build_report_overview, materialize_report_overview
+from ...contracts import REPORT_SCHEMA_VERSION
+from ...utils.coerce import as_mapping as _as_mapping
+from ..overview import build_report_overview, materialize_report_overview
if TYPE_CHECKING:
- from .._html_snippets import _FileCache
- from ..models import (
+ from ...models import (
GroupItemLike,
GroupMapLike,
MetricsDiff,
StructuralFindingGroup,
Suggestion,
)
+ from .widgets.snippets import _FileCache
@dataclass(frozen=True, slots=True)
@@ -63,6 +63,7 @@ class ReportContext:
dependencies_map: Mapping[str, object]
dead_code_map: Mapping[str, object]
overloaded_modules_map: Mapping[str, object]
+ security_surfaces_map: Mapping[str, object]
health_map: Mapping[str, object]
# -- suggestions + structural --
@@ -166,7 +167,7 @@ def build_context(
max_snippet_lines: int = 220,
) -> ReportContext:
"""Build a ReportContext from raw build_html_report parameters."""
- from .._html_escape import _escape_html
+ from .primitives.escape import _escape_html
meta = dict(report_meta or {})
baseline_meta = _as_mapping(meta.get("baseline"))
@@ -177,6 +178,8 @@ def build_context(
inventory_map = _as_mapping(report_document_map.get("inventory"))
derived_map = _as_mapping(report_document_map.get("derived"))
integrity_map = _as_mapping(report_document_map.get("integrity"))
+ report_metrics_map = _as_mapping(report_document_map.get("metrics"))
+ report_metric_families = _as_mapping(report_metrics_map.get("families"))
report_schema_version = str(
meta.get("report_schema_version") or REPORT_SCHEMA_VERSION
@@ -237,6 +240,9 @@ def build_context(
overloaded_modules_map = _as_mapping(metrics_map.get("overloaded_modules"))
if not overloaded_modules_map:
overloaded_modules_map = _as_mapping(metrics_map.get("god_modules"))
+ security_surfaces_map = _as_mapping(report_metric_families.get("security_surfaces"))
+ if not security_surfaces_map:
+ security_surfaces_map = _as_mapping(metrics_map.get("security_surfaces"))
health_map = _as_mapping(metrics_map.get("health"))
suggestions_tuple = tuple(suggestions or ())
@@ -282,6 +288,7 @@ def build_context(
dependencies_map=dependencies_map,
dead_code_map=dead_code_map,
overloaded_modules_map=overloaded_modules_map,
+ security_surfaces_map=security_surfaces_map,
health_map=health_map,
suggestions=suggestions_tuple,
structural_findings=tuple(structural_findings or ()),
diff --git a/codeclone/_html_report/_assemble.py b/codeclone/report/html/assemble.py
similarity index 93%
rename from codeclone/_html_report/_assemble.py
rename to codeclone/report/html/assemble.py
index 13f4964..ed40620 100644
--- a/codeclone/_html_report/_assemble.py
+++ b/codeclone/report/html/assemble.py
@@ -11,28 +11,29 @@
from collections.abc import Collection, Mapping, Sequence
from typing import TYPE_CHECKING
-from .. import __version__, _coerce
-from .._html_css import build_css
-from .._html_escape import _escape_html
-from .._html_js import build_js
-from .._html_snippets import _FileCache, _pygments_css
-from ..contracts import DOCS_URL, ISSUES_URL, REPOSITORY_URL
-from ..domain.quality import CONFIDENCE_HIGH
-from ..structural_findings import normalize_structural_findings
-from ..templates import FONT_CSS_URL, REPORT_TEMPLATE
+from ... import __version__
+from ...contracts import DOCS_URL, ISSUES_URL, REPOSITORY_URL
+from ...domain.quality import CONFIDENCE_HIGH
+from ...findings.structural.detectors import normalize_structural_findings
+from ...utils import coerce as _coerce
from ._context import _meta_pick, build_context
-from ._icons import BRAND_LOGO, ICONS, section_icon_html
-from ._sections._clones import render_clones_panel
-from ._sections._coupling import render_quality_panel
-from ._sections._dead_code import render_dead_code_panel
-from ._sections._dependencies import render_dependencies_panel
-from ._sections._meta import build_topbar_provenance_summary, render_meta_panel
-from ._sections._overview import render_overview_panel
-from ._sections._structural import render_structural_panel
-from ._sections._suggestions import render_suggestions_panel
+from .assets.css import build_css
+from .assets.js import build_js
+from .primitives.escape import _escape_html
+from .sections._clones import render_clones_panel
+from .sections._coupling import render_quality_panel
+from .sections._dead_code import render_dead_code_panel
+from .sections._dependencies import render_dependencies_panel
+from .sections._meta import build_topbar_provenance_summary, render_meta_panel
+from .sections._overview import render_overview_panel
+from .sections._structural import render_structural_panel
+from .sections._suggestions import render_suggestions_panel
+from .template import FONT_CSS_URL, REPORT_TEMPLATE
+from .widgets.icons import BRAND_LOGO, ICONS, section_icon_html
+from .widgets.snippets import _FileCache, _pygments_css
if TYPE_CHECKING:
- from ..models import GroupMapLike, MetricsDiff, StructuralFindingGroup, Suggestion
+ from ...models import GroupMapLike, MetricsDiff, StructuralFindingGroup, Suggestion
def build_html_report(
@@ -124,6 +125,7 @@ def build_html_report(
_as_mapping(ctx.overloaded_modules_map.get("summary")).get("candidates")
)
+ coverage_review_items
+ + _as_int(_as_mapping(ctx.security_surfaces_map.get("summary")).get("items"))
)
def _tab_badge(count: int) -> str:
diff --git a/codeclone/report/html/assets/__init__.py b/codeclone/report/html/assets/__init__.py
new file mode 100644
index 0000000..9135843
--- /dev/null
+++ b/codeclone/report/html/assets/__init__.py
@@ -0,0 +1,5 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
diff --git a/codeclone/_html_css.py b/codeclone/report/html/assets/css.py
similarity index 99%
rename from codeclone/_html_css.py
rename to codeclone/report/html/assets/css.py
index 66a4609..af4776d 100644
--- a/codeclone/_html_css.py
+++ b/codeclone/report/html/assets/css.py
@@ -360,6 +360,11 @@
.insight-warn{border-left-color:var(--warning);background:var(--warning-muted)}
.insight-risk{border-left-color:var(--error);background:var(--error-muted)}
.insight-info{border-left-color:var(--info);background:var(--info-muted)}
+.insight-banner .overview-summary-grid{margin:0}
+.insight-banner .overview-summary-item{background:none;border:none;border-radius:0;padding:0}
+.insight-banner .overview-summary-label{font-size:.76rem;margin-bottom:var(--sp-2);
+ padding-bottom:var(--sp-1);border-bottom:1px solid color-mix(in srgb,var(--border) 55%,transparent)}
+.insight-banner .overview-fact-row{font-size:.78rem}
"""
# ---------------------------------------------------------------------------
@@ -844,7 +849,7 @@
.stat-cards .kpi-detail,.dep-stats .kpi-detail{margin-top:0;align-self:end}
.dep-graph-wrap{overflow:hidden;margin-bottom:var(--sp-4);border:1px solid var(--border);
border-radius:var(--radius-lg);background:var(--bg-surface);padding:var(--sp-4)}
-.dep-graph-svg{width:100%;height:auto;max-height:520px}
+.dep-graph-svg{display:block;width:100%;height:auto;max-height:680px;margin:0 auto}
.dep-graph-svg text{fill:var(--text-secondary);font-family:var(--font-mono)}
.dep-node{transition:fill-opacity var(--dur-fast) var(--ease)}
.dep-edge{transition:stroke-opacity var(--dur-fast) var(--ease)}
@@ -1129,7 +1134,6 @@
.prov-copy-btn svg{width:12px;height:12px}
"""
-
# ---------------------------------------------------------------------------
# Shared micro-interactions
# ---------------------------------------------------------------------------
@@ -1414,7 +1418,6 @@
font-variant-numeric:tabular-nums;opacity:.85}
"""
-
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
diff --git a/codeclone/_html_js.py b/codeclone/report/html/assets/js.py
similarity index 100%
rename from codeclone/_html_js.py
rename to codeclone/report/html/assets/js.py
diff --git a/codeclone/report/html/primitives/__init__.py b/codeclone/report/html/primitives/__init__.py
new file mode 100644
index 0000000..9135843
--- /dev/null
+++ b/codeclone/report/html/primitives/__init__.py
@@ -0,0 +1,5 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
diff --git a/codeclone/_html_data_attrs.py b/codeclone/report/html/primitives/data_attrs.py
similarity index 96%
rename from codeclone/_html_data_attrs.py
rename to codeclone/report/html/primitives/data_attrs.py
index d4e94f3..3c942a1 100644
--- a/codeclone/_html_data_attrs.py
+++ b/codeclone/report/html/primitives/data_attrs.py
@@ -8,7 +8,7 @@
from __future__ import annotations
-from ._html_escape import _escape_html
+from .escape import _escape_html
__all__ = ["_build_data_attrs"]
diff --git a/codeclone/_html_escape.py b/codeclone/report/html/primitives/escape.py
similarity index 100%
rename from codeclone/_html_escape.py
rename to codeclone/report/html/primitives/escape.py
diff --git a/codeclone/_html_filters.py b/codeclone/report/html/primitives/filters.py
similarity index 97%
rename from codeclone/_html_filters.py
rename to codeclone/report/html/primitives/filters.py
index e700fad..f578b16 100644
--- a/codeclone/_html_filters.py
+++ b/codeclone/report/html/primitives/filters.py
@@ -10,7 +10,7 @@
from collections.abc import Sequence
-from ._html_escape import _escape_html
+from .escape import _escape_html
__all__ = [
"CLONE_TYPE_OPTIONS",
diff --git a/codeclone/report/html/primitives/location.py b/codeclone/report/html/primitives/location.py
new file mode 100644
index 0000000..859d0d8
--- /dev/null
+++ b/codeclone/report/html/primitives/location.py
@@ -0,0 +1,48 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+"""Shared location/path helpers for HTML section renderers."""
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+ from .._context import ReportContext
+
+
+def relative_location_path(ctx: ReportContext, item: Mapping[str, object]) -> str:
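+    """Prefer the item's precomputed ``relative_path``; fall back to
+    relativizing ``filepath`` through the report context."""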
+ relative_path = str(item.get("relative_path", "")).strip()
+ if relative_path:
+ return relative_path
+ filepath = str(item.get("filepath", "")).strip()
+ if not filepath:
+ return ""
+ return ctx.relative_path(filepath).strip()
+
+
+def location_file_target(
+ ctx: ReportContext,
+ item: Mapping[str, object],
+ *,
+ relative_path: str,
+) -> str:
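+    """Resolve the clickable file target, anchoring relative paths at ``scan_root``."""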
+ filepath = str(item.get("filepath", "")).strip()
+ if filepath:
+ path_obj = Path(filepath)
+ if path_obj.is_absolute():
+ return filepath
+ if ctx.scan_root:
+ return str((Path(ctx.scan_root) / path_obj).resolve())
+ return filepath
+ if ctx.scan_root and relative_path:
+ return str((Path(ctx.scan_root) / relative_path).resolve())
+ return relative_path
+
+
+__all__ = ["location_file_target", "relative_location_path"]
diff --git a/codeclone/report/html/sections/__init__.py b/codeclone/report/html/sections/__init__.py
new file mode 100644
index 0000000..9135843
--- /dev/null
+++ b/codeclone/report/html/sections/__init__.py
@@ -0,0 +1,5 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
diff --git a/codeclone/_html_report/_sections/_clones.py b/codeclone/report/html/sections/_clones.py
similarity index 97%
rename from codeclone/_html_report/_sections/_clones.py
rename to codeclone/report/html/sections/_clones.py
index 65ab657..b18624f 100644
--- a/codeclone/_html_report/_sections/_clones.py
+++ b/codeclone/report/html/sections/_clones.py
@@ -11,29 +11,31 @@
from collections.abc import Mapping, Sequence
from typing import TYPE_CHECKING, Literal
-from ... import _coerce
-from ..._html_badges import _micro_badges, _source_kind_badge_html, _stat_card
-from ..._html_data_attrs import _build_data_attrs
-from ..._html_escape import _escape_html
-from ..._html_filters import CLONE_TYPE_OPTIONS, SPREAD_OPTIONS, _render_select
-from ..._html_snippets import _render_code_block
-from ...report._source_kinds import SOURCE_KIND_FILTER_VALUES
-from ...report.derived import (
+from codeclone.findings.ids import clone_group_id
+from codeclone.utils import coerce as _coerce
+
+from ..._source_kinds import SOURCE_KIND_FILTER_VALUES
+from ...derived import (
combine_source_kinds,
group_spread,
report_location_from_group_item,
)
-from ...report.explain_contract import format_group_instance_compare_meta
-from ...report.json_contract import clone_group_id
-from ...report.suggestions import classify_clone_type
-from .._components import Tone, insight_block
-from .._glossary import glossary_tip
-from .._icons import ICONS
-from .._tables import render_rows_table
-from .._tabs import render_split_tabs
+from ...explain_contract import format_group_instance_compare_meta
+from ...suggestions import classify_clone_type
+from ..primitives.data_attrs import _build_data_attrs
+from ..primitives.escape import _escape_html
+from ..primitives.filters import CLONE_TYPE_OPTIONS, SPREAD_OPTIONS, _render_select
+from ..widgets.badges import _micro_badges, _source_kind_badge_html, _stat_card
+from ..widgets.components import Tone, insight_block
+from ..widgets.glossary import glossary_tip
+from ..widgets.icons import ICONS
+from ..widgets.snippets import _render_code_block
+from ..widgets.tables import render_rows_table
+from ..widgets.tabs import render_split_tabs
if TYPE_CHECKING:
- from ...models import GroupItemLike
+ from codeclone.models import GroupItemLike
+
from .._context import ReportContext
_as_int = _coerce.as_int
diff --git a/codeclone/_html_report/_sections/_coupling.py b/codeclone/report/html/sections/_coupling.py
similarity index 94%
rename from codeclone/_html_report/_sections/_coupling.py
rename to codeclone/report/html/sections/_coupling.py
index 08fdf22..23a860d 100644
--- a/codeclone/_html_report/_sections/_coupling.py
+++ b/codeclone/report/html/sections/_coupling.py
@@ -10,17 +10,22 @@
from typing import TYPE_CHECKING
-from ... import _coerce
-from ..._html_badges import _micro_badges, _render_chain_flow, _stat_card
-from .._components import Tone, insight_block
-from .._glossary import glossary_tip
-from .._tables import render_rows_table
-from .._tabs import render_split_tabs
+from codeclone.utils import coerce as _coerce
+
+from ..widgets.badges import _micro_badges, _render_chain_flow, _stat_card
+from ..widgets.components import Tone, insight_block
+from ..widgets.glossary import glossary_tip
+from ..widgets.tables import render_rows_table
+from ..widgets.tabs import render_split_tabs
from ._coverage_join import (
coverage_join_quality_count,
coverage_join_quality_summary,
render_coverage_join_panel,
)
+from ._security_surfaces import (
+ render_security_surfaces_panel,
+ security_surfaces_quality_count,
+)
if TYPE_CHECKING:
from collections.abc import Mapping, Sequence
@@ -269,12 +274,12 @@ def render_quality_panel(ctx: ReportContext) -> str:
complexity_summary = _as_mapping(ctx.complexity_map.get("summary"))
overloaded_modules_summary = _as_mapping(ctx.overloaded_modules_map.get("summary"))
coverage_join_summary = coverage_join_quality_summary(ctx)
-
coupling_high_risk = _as_int(coupling_summary.get("high_risk"))
cohesion_low = _as_int(cohesion_summary.get("low_cohesion"))
complexity_high_risk = _as_int(complexity_summary.get("high_risk"))
overloaded_module_candidates = _as_int(overloaded_modules_summary.get("candidates"))
coverage_review_items = coverage_join_quality_count(ctx)
+ security_surface_items = security_surfaces_quality_count(ctx)
coverage_hotspots = _as_int(coverage_join_summary.get("coverage_hotspots"))
coverage_scope_gaps = _as_int(coverage_join_summary.get("scope_gap_hotspots"))
coverage_join_status = str(coverage_join_summary.get("status", "")).strip()
@@ -292,6 +297,7 @@ def render_quality_panel(ctx: ReportContext) -> str:
f"high-coupling: {coupling_high_risk}; "
f"low-cohesion: {cohesion_low}; "
f"overloaded modules: {overloaded_module_candidates}; "
+ f"security surfaces: {security_surface_items}; "
f"max CC {cc_max}; "
f"max CBO {coupling_summary.get('max', 'n/a')}; "
f"max LCOM4 {cohesion_summary.get('max', 'n/a')}."
@@ -441,6 +447,16 @@ def render_quality_panel(ctx: ReportContext) -> str:
coverage_join_panel,
)
)
+ security_surfaces_panel = render_security_surfaces_panel(ctx)
+ if security_surfaces_panel:
+ sub_tabs.append(
+ (
+ "security-surfaces",
+ "Security Surfaces",
+ security_surface_items,
+ security_surfaces_panel,
+ )
+ )
return insight_block(
question="Are there quality hotspots in the codebase?",
diff --git a/codeclone/_html_report/_sections/_coverage_join.py b/codeclone/report/html/sections/_coverage_join.py
similarity index 94%
rename from codeclone/_html_report/_sections/_coverage_join.py
rename to codeclone/report/html/sections/_coverage_join.py
index 5268d50..4821850 100644
--- a/codeclone/_html_report/_sections/_coverage_join.py
+++ b/codeclone/report/html/sections/_coverage_join.py
@@ -11,11 +11,13 @@
from pathlib import Path
from typing import TYPE_CHECKING
-from ... import _coerce
-from ..._html_badges import _micro_badges, _stat_card, _tab_empty_info
-from ..._html_escape import _escape_html
-from .._glossary import glossary_tip
-from .._tables import render_rows_table
+from codeclone.utils import coerce as _coerce
+
+from ..primitives.escape import _escape_html
+from ..primitives.location import location_file_target, relative_location_path
+from ..widgets.badges import _micro_badges, _stat_card, _tab_empty_info
+from ..widgets.glossary import glossary_tip
+from ..widgets.tables import render_rows_table
if TYPE_CHECKING:
from collections.abc import Mapping
@@ -199,7 +201,7 @@ def _coverage_join_empty_description() -> str:
def _location_cell_html(ctx: ReportContext, item: Mapping[str, object]) -> str:
- relative_path = str(item.get("relative_path", "")).strip()
+ relative_path = relative_location_path(ctx, item)
start_line = _as_int(item.get("start_line"))
end_line = _as_int(item.get("end_line"))
line_label = (
@@ -209,11 +211,7 @@ def _location_cell_html(ctx: ReportContext, item: Mapping[str, object]) -> str:
)
if end_line > start_line > 0:
line_label = f"{relative_path}:{start_line}-{end_line}"
- file_target = (
- f"{ctx.scan_root.rstrip('/')}/{relative_path}"
- if ctx.scan_root and relative_path
- else relative_path
- )
+ file_target = location_file_target(ctx, item, relative_path=relative_path)
return (
         f'<a href="{file_target}" '
         f'data-lines="{end_line - start_line + 1 if end_line > start_line > 0 else 1}">'
diff --git a/codeclone/_html_report/_sections/_dead_code.py b/codeclone/report/html/sections/_dead_code.py
similarity index 94%
rename from codeclone/_html_report/_sections/_dead_code.py
rename to codeclone/report/html/sections/_dead_code.py
index eaa5bd2..ffdad1d 100644
--- a/codeclone/_html_report/_sections/_dead_code.py
+++ b/codeclone/report/html/sections/_dead_code.py
@@ -10,12 +10,13 @@
from typing import TYPE_CHECKING
-from ... import _coerce
-from ..._html_badges import _micro_badges, _stat_card
-from .._components import Tone, insight_block
-from .._glossary import glossary_tip
-from .._tables import render_rows_table
-from .._tabs import render_split_tabs
+from codeclone.utils import coerce as _coerce
+
+from ..widgets.badges import _micro_badges, _stat_card
+from ..widgets.components import Tone, insight_block
+from ..widgets.glossary import glossary_tip
+from ..widgets.tables import render_rows_table
+from ..widgets.tabs import render_split_tabs
if TYPE_CHECKING:
from collections.abc import Mapping
diff --git a/codeclone/_html_report/_sections/_dependencies.py b/codeclone/report/html/sections/_dependencies.py
similarity index 73%
rename from codeclone/_html_report/_sections/_dependencies.py
rename to codeclone/report/html/sections/_dependencies.py
index b0df4af..c3fbbfb 100644
--- a/codeclone/_html_report/_sections/_dependencies.py
+++ b/codeclone/report/html/sections/_dependencies.py
@@ -12,29 +12,34 @@
from collections.abc import Mapping, Sequence
from typing import TYPE_CHECKING
-from ... import _coerce
-from ..._html_badges import (
+from codeclone.utils import coerce as _coerce
+
+from ..primitives.escape import _escape_html
+from ..widgets.badges import (
_micro_badges,
_render_chain_flow,
_short_label,
_stat_card,
_tab_empty,
)
-from ..._html_escape import _escape_html
-from .._components import Tone, insight_block
-from .._glossary import glossary_tip
-from .._tables import render_rows_table
+from ..widgets.components import Tone, insight_block
+from ..widgets.glossary import glossary_tip
+from ..widgets.tables import render_rows_table
if TYPE_CHECKING:
from .._context import ReportContext
_as_int = _coerce.as_int
+_as_float = _coerce.as_float
_as_mapping = _coerce.as_mapping
_as_sequence = _coerce.as_sequence
def _select_dep_nodes(
edges: Sequence[tuple[str, str]],
+ *,
+ dep_cycles: Sequence[object],
+ longest_chains: Sequence[object],
) -> tuple[list[str], list[tuple[str, str]]]:
all_nodes = sorted({part for edge in edges for part in edge})
if len(all_nodes) > 20:
@@ -42,7 +47,38 @@ def _select_dep_nodes(
for source, target in edges:
degree_count[source] = degree_count.get(source, 0) + 1
degree_count[target] = degree_count.get(target, 0) + 1
- nodes = sorted(all_nodes, key=lambda node: -degree_count.get(node, 0))[:20]
+ all_node_set = set(all_nodes)
+ nodes: list[str] = []
+ node_set: set[str] = set()
+
+ def _seed_node(node: object) -> None:
+ node_name = str(node).strip()
+ if (
+ not node_name
+ or node_name not in all_node_set
+ or node_name in node_set
+ or len(nodes) >= 20
+ ):
+ return
+ nodes.append(node_name)
+ node_set.add(node_name)
+
+ # Keep the visual graph aligned with the dependency tables. When we
+ # downsample a large graph, cycle members and longest-chain nodes must
+ # remain visible instead of being dropped behind high-degree hubs.
+ for cycle in dep_cycles:
+ for node in _as_sequence(cycle):
+ _seed_node(node)
+ for chain in longest_chains:
+ for node in _as_sequence(chain):
+ _seed_node(node)
+
+ for node in sorted(
+ all_nodes, key=lambda item: (-degree_count.get(item, 0), item)
+ ):
+ _seed_node(node)
+ if len(nodes) >= 20:
+ break
nodes.sort()
else:
nodes = all_nodes
@@ -107,17 +143,58 @@ def _build_layer_groups(
def _layout_dep_graph(
layer_groups: Mapping[int, Sequence[str]],
+ *,
+ in_degree: Mapping[str, int],
+ out_degree: Mapping[str, int],
) -> tuple[int, int, int, dict[str, tuple[float, float]]]:
num_layers = max(layer_groups.keys(), default=0) + 1
max_per_layer = max((len(members) for members in layer_groups.values()), default=1)
- width = max(600, min(1200, max_per_layer * 70 + 140))
- height = max(260, num_layers * 80 + 80)
- pad_x, pad_y = 60.0, 40.0
+ pad_x, pad_y = 56.0, 36.0
+ prefer_horizontal = num_layers >= 6 and num_layers > max_per_layer + 2
+
+ def _ordered_members(members: Sequence[str]) -> list[str]:
+ if not prefer_horizontal or len(members) < 3:
+ return list(members)
+ ranked = sorted(
+ members,
+ key=lambda node: (
+ -(in_degree.get(node, 0) + out_degree.get(node, 0)),
+ node,
+ ),
+ )
+ center = (len(ranked) - 1) / 2
+ slot_order = sorted(
+ range(len(ranked)),
+ key=lambda index: (abs(index - center), index),
+ )
+ ordered = [""] * len(ranked)
+ for node, slot in zip(ranked, slot_order, strict=False):
+ ordered[slot] = node
+ return ordered
+
+ if prefer_horizontal:
+ width = max(920, min(1600, num_layers * 118 + max_per_layer * 28 + 180))
+ height = max(300, max_per_layer * 84 + 104)
+ else:
+ width = max(600, min(1200, max_per_layer * 70 + 140))
+ height = max(260, num_layers * 80 + 80)
positions: dict[str, tuple[float, float]] = {}
for layer_index in range(num_layers):
members = layer_groups.get(layer_index, [])
count = len(members)
+ if prefer_horizontal:
+ members = _ordered_members(members)
+ layer_step = (width - 2 * pad_x) / max(1, num_layers - 1)
+ x = pad_x + layer_index * layer_step
+ fan = min(14.0, layer_step * 0.12)
+ offset_unit = fan / max(1, count - 1)
+ center = (count - 1) / 2
+ for index, node in enumerate(members):
+ y = pad_y + (index + 0.5) * ((height - 2 * pad_y) / max(1, count))
+ positions[node] = (x + (index - center) * offset_unit, y)
+ continue
+
y = pad_y + layer_index * ((height - 2 * pad_y) / max(1, num_layers - 1))
for index, node in enumerate(members):
x = pad_x + (index + 0.5) * ((width - 2 * pad_x) / max(1, count))
@@ -222,10 +299,11 @@ def _render_dep_nodes_and_labels(
cycle_node_set: set[str],
hub_threshold: int,
max_per_layer: int,
+ prefer_horizontal: bool,
) -> tuple[list[str], list[str]]:
nodes_svg: list[str] = []
labels_svg: list[str] = []
- rotate_labels = max_per_layer > 6
+ rotate_labels = prefer_horizontal or max_per_layer > 6
for node in nodes:
x, y = positions[node]
@@ -234,6 +312,7 @@ def _render_dep_nodes_and_labels(
label = _short_label(node)
is_cycle = node in cycle_node_set
is_hub = degree >= hub_threshold and degree > 2
+ is_secondary = not is_hub and not is_cycle
if is_cycle:
fill, fill_opacity, extra = (
@@ -258,19 +337,25 @@ def _render_dep_nodes_and_labels(
f'fill="{fill}" fill-opacity="{fill_opacity}" {extra}/>'
)
- font_size = "10" if is_hub else "9"
+ font_size = "10" if is_hub else ("8" if is_secondary else "9")
if rotate_labels:
+ label_x = (
+ x + radius + (4 if is_secondary else 6 if prefer_horizontal else 0)
+ )
+ label_y = (
+ y - radius - (1 if is_secondary else 2 if prefer_horizontal else 6)
+ )
labels_svg.append(
                 f'<text font-size="{font_size}" '
+ f'transform="translate({label_x:.1f},{label_y:.1f}) rotate(-45)">'
f"{_escape_html(node)}{_escape_html(label)}"
)
continue
labels_svg.append(
             f'<text '
+ f'x="{x:.1f}" y="{y - radius - (4 if is_secondary else 5):.1f}" font-size="{font_size}" text-anchor="middle">'
f"{_escape_html(node)}{_escape_html(label)}"
)
@@ -281,14 +366,24 @@ def _render_dep_svg(
edges: Sequence[tuple[str, str]],
cycle_node_set: set[str],
dep_cycles: Sequence[object],
+ longest_chains: Sequence[object],
) -> str:
if not edges:
return _tab_empty("Dependency graph is not available.")
- nodes, filtered_edges = _select_dep_nodes(edges)
+ nodes, filtered_edges = _select_dep_nodes(
+ edges,
+ dep_cycles=dep_cycles,
+ longest_chains=longest_chains,
+ )
in_degree, out_degree = _build_degree_maps(nodes, filtered_edges)
layer_groups = _build_layer_groups(nodes, filtered_edges, in_degree, out_degree)
- width, height, max_per_layer, positions = _layout_dep_graph(layer_groups)
+ width, height, max_per_layer, positions = _layout_dep_graph(
+ layer_groups,
+ in_degree=in_degree,
+ out_degree=out_degree,
+ )
+ prefer_horizontal = width > height
hub_threshold = _hub_threshold(nodes, in_degree, out_degree)
node_radii = _build_node_radii(
nodes,
@@ -309,15 +404,19 @@ def _render_dep_svg(
cycle_node_set=cycle_node_set,
hub_threshold=hub_threshold,
max_per_layer=max_per_layer,
+ prefer_horizontal=prefer_horizontal,
)
- label_pad = 50 if max_per_layer > 6 else 0
+ label_pad = 44 if prefer_horizontal else (50 if max_per_layer > 6 else 0)
+ label_pad_x = 52 if prefer_horizontal else (28 if max_per_layer > 6 else 0)
+ vb_x = -label_pad_x
vb_y = -label_pad
+ vb_w = width + label_pad_x * 2
vb_h = height + label_pad
return (
         '<svg class="dep-graph-svg" '
- f'