diff --git a/README.md b/README.md index 01371097..0bb6948e 100644 --- a/README.md +++ b/README.md @@ -76,39 +76,24 @@ No config files, no Docker, no JVM, no API keys, no accounts. Point your agent a ### Feature comparison -Comparison last verified: March 2026. Full analysis: COMPETITIVE_ANALYSIS.md - -| Capability | codegraph | [joern](https://github.com/joernio/joern) | [narsil-mcp](https://github.com/postrv/narsil-mcp) | [code-graph-rag](https://github.com/vitali87/code-graph-rag) | [cpg](https://github.com/Fraunhofer-AISEC/cpg) | [GitNexus](https://github.com/abhigyanpatwari/GitNexus) | [CodeMCP](https://github.com/SimplyLiz/CodeMCP) | [axon](https://github.com/harshkedia177/axon) | -|---|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:| -| MCP / AI agent support | **Yes** | — | **Yes** | **Yes** | **Yes** | **Yes** | **Yes** | **Yes** | -| Batch querying | **Yes** | — | — | — | — | — | — | — | -| Composite audit command | **Yes** | — | — | — | — | — | — | — | -| Function-level analysis | **Yes** | **Yes** | **Yes** | **Yes** | **Yes** | **Yes** | **Yes** | **Yes** | -| Multi-language | **11** | **14** | **32** | **11** | **~10** | **12** | **12** | **3** | -| Semantic search | **Yes** | — | **Yes** | **Yes** | — | **Yes** | — | **Yes** | -| Hybrid BM25 + semantic | **Yes** | — | — | — | — | **Yes** | — | **Yes** | -| CODEOWNERS integration | **Yes** | — | — | — | — | — | — | — | -| Architecture boundary rules | **Yes** | — | — | — | — | — | — | — | -| CI validation predicates | **Yes** | — | — | — | — | — | — | — | -| Graph snapshots | **Yes** | — | — | — | — | — | — | — | -| Git diff impact | **Yes** | — | — | — | — | **Yes** | **Yes** | **Yes** | -| Branch structural diff | **Yes** | — | — | — | — | — | — | **Yes** | -| Git co-change analysis | **Yes** | — | — | — | — | — | — | **Yes** | -| Watch mode | **Yes** | — | **Yes** | **Yes** | — | — | **Yes** | **Yes** | -| Dead code / role classification | **Yes** | — | **Yes** | — | — | — | **Yes** | **Yes** | -| 
Cycle detection | **Yes** | — | — | — | — | — | — | — | -| Incremental rebuilds | **O(changed)** | — | O(n) Merkle | — | — | — | Go only | **Yes** | -| Zero config | **Yes** | — | **Yes** | — | — | **Yes** | — | **Yes** | -| Embeddable JS library (`npm install`) | **Yes** | — | — | — | — | — | — | — | -| LLM-optional (works without API keys) | **Yes** | **Yes** | **Yes** | **Yes** | **Yes** | **Yes** | **Yes** | **Yes** | -| Dataflow analysis | **Yes** | **Yes** | — | — | **Yes** | — | — | — | -| Control flow graph (CFG) | **Yes** | **Yes** | — | — | **Yes** | — | — | — | -| AST node querying | **Yes** | **Yes** | — | — | **Yes** | — | — | — | -| Expanded node/edge types | **Yes** | **Yes** | — | — | **Yes** | — | — | — | -| GraphML / Neo4j export | **Yes** | **Yes** | — | — | — | — | — | — | -| Interactive graph viewer | **Yes** | — | — | — | — | — | — | — | -| Commercial use allowed | **Yes** | **Yes** | **Yes** | **Yes** | **Yes** | No | Paid | **Yes** | -| Open source | **Yes** | Yes | Yes | Yes | Yes | No | No | Yes | +Comparison last verified: March 2026. Claims verified against each repo's README/docs. 
Full analysis: COMPETITIVE_ANALYSIS.md + +| Capability | codegraph | [joern](https://github.com/joernio/joern) | [narsil-mcp](https://github.com/postrv/narsil-mcp) | [cpg](https://github.com/Fraunhofer-AISEC/cpg) | [axon](https://github.com/harshkedia177/axon) | [GitNexus](https://github.com/abhigyanpatwari/GitNexus) | +|---|:---:|:---:|:---:|:---:|:---:|:---:| +| Languages | **11** | ~12 | **32** | ~10 | 3 | 13 | +| MCP server | **Yes** | — | **Yes** | **Yes** | **Yes** | **Yes** | +| Dataflow + CFG + AST querying | **Yes** | **Yes** | **Yes**¹ | **Yes** | — | — | +| Hybrid search (BM25 + semantic) | **Yes** | — | — | — | **Yes** | **Yes** | +| Git-aware (diff impact, co-change, branch diff) | **All 3** | — | — | — | **All 3** | — | +| Dead code / role classification | **Yes** | — | **Yes** | — | **Yes** | — | +| Incremental rebuilds | **O(changed)** | — | O(n) | — | **Yes** | Commit-level⁴ | +| Architecture rules + CI gate | **Yes** | — | — | — | — | — | +| Security scanning (SAST / vuln detection) | Intentionally out of scope² | **Yes** | **Yes** | **Yes** | — | — | +| Zero config, `npm install` | **Yes** | — | **Yes** | — | **Yes** | **Yes** | +| Graph export (GraphML / Neo4j / DOT) | **Yes** | **Yes** | — | — | — | — | +| Open source + commercial use | **Yes** (Apache-2.0) | **Yes** (Apache-2.0) | **Yes** (MIT/Apache-2.0) | **Yes** (Apache-2.0) | Source-available³ | Non-commercial⁵ | + +¹ narsil-mcp added CFG and dataflow in recent versions. ² Codegraph focuses on structural understanding, not vulnerability detection — use dedicated SAST tools (Semgrep, CodeQL, Snyk) for that. ³ axon claims MIT in pyproject.toml but has no LICENSE file in the repo. ⁴ GitNexus skips re-index if the git commit hasn't changed, but re-processes the entire repo when it does — no per-file incremental parsing. ⁵ GitNexus uses the PolyForm Noncommercial 1.0.0 license. 
### What makes codegraph different @@ -490,16 +475,16 @@ codegraph registry remove # Unregister |---|---|---|:---:|:---:| | ![JavaScript](https://img.shields.io/badge/-JavaScript-F7DF1E?style=flat-square&logo=javascript&logoColor=black) | `.js`, `.jsx`, `.mjs`, `.cjs` | functions, classes, methods, imports, exports, call sites, constants, dataflow | ✅ | ✅ | | ![TypeScript](https://img.shields.io/badge/-TypeScript-3178C6?style=flat-square&logo=typescript&logoColor=white) | `.ts`, `.tsx` | functions, classes, interfaces, type aliases, methods, imports, exports, call sites, dataflow | ✅ | ✅ | -| ![Python](https://img.shields.io/badge/-Python-3776AB?style=flat-square&logo=python&logoColor=white) | `.py` | functions, classes, methods, imports, decorators, constants, call sites, dataflow | ✅ | ✅ | +| ![Python](https://img.shields.io/badge/-Python-3776AB?style=flat-square&logo=python&logoColor=white) | `.py`, `.pyi` | functions, classes, methods, imports, decorators, constants, call sites, dataflow | ✅ | ✅ | | ![Go](https://img.shields.io/badge/-Go-00ADD8?style=flat-square&logo=go&logoColor=white) | `.go` | functions, methods, structs, interfaces, constants, imports, call sites, dataflow | ✅ | ✅ | | ![Rust](https://img.shields.io/badge/-Rust-000000?style=flat-square&logo=rust&logoColor=white) | `.rs` | functions, methods, structs, enums, traits, constants, `use` imports, call sites, dataflow | ✅ | ✅ | | ![Java](https://img.shields.io/badge/-Java-ED8B00?style=flat-square&logo=openjdk&logoColor=white) | `.java` | classes, methods, constructors, interfaces, enums, imports, call sites, dataflow | ✅ | ✅ | | ![C#](https://img.shields.io/badge/-C%23-512BD4?style=flat-square&logo=dotnet&logoColor=white) | `.cs` | classes, structs, records, interfaces, enums, methods, constructors, properties, using directives, call sites, dataflow | ✅ | ✅ | -| ![PHP](https://img.shields.io/badge/-PHP-777BB4?style=flat-square&logo=php&logoColor=white) | `.php` | functions, classes, interfaces, 
traits, enums, methods, namespace use, call sites, dataflow | ✅ | ✅ | -| ![Ruby](https://img.shields.io/badge/-Ruby-CC342D?style=flat-square&logo=ruby&logoColor=white) | `.rb` | classes, modules, methods, singleton methods, require/require_relative, include/extend, dataflow | — | ✅ | -| ![Terraform](https://img.shields.io/badge/-Terraform-844FBA?style=flat-square&logo=terraform&logoColor=white) | `.tf`, `.hcl` | resource, data, variable, module, output blocks | — | ✅ | +| ![PHP](https://img.shields.io/badge/-PHP-777BB4?style=flat-square&logo=php&logoColor=white) | `.php`, `.phtml` | functions, classes, interfaces, traits, enums, methods, namespace use, call sites, dataflow | ✅ | ✅ | +| ![Ruby](https://img.shields.io/badge/-Ruby-CC342D?style=flat-square&logo=ruby&logoColor=white) | `.rb`, `.rake`, `.gemspec` | classes, modules, methods, singleton methods, require/require_relative, include/extend, dataflow | N/A⁶ | ✅ | +| ![Terraform](https://img.shields.io/badge/-Terraform-844FBA?style=flat-square&logo=terraform&logoColor=white) | `.tf`, `.hcl` | resource, data, variable, module, output blocks | N/A⁶ | ✅ | -> **Type Inference** extracts a per-file type map from annotations (`const x: Router`, `MyType x`, `x: MyType`) and `new` expressions, enabling the edge resolver to connect `x.method()` → `Type.method()`. **Parity** = WASM and native Rust engines produce identical output. +> **Type Inference** extracts a per-file type map from annotations (`const x: Router`, `MyType x`, `x: MyType`) and `new` expressions, enabling the edge resolver to connect `x.method()` → `Type.method()`. **Parity** = WASM and native Rust engines produce identical output. ⁶ Ruby is dynamically typed and Terraform/HCL is declarative, so type inference does not apply to either. 
## ⚙️ How It Works @@ -768,7 +753,7 @@ const { results: fused } = await multiSearchData( ## ⚠️ Limitations -- **No full type inference** — parses `.d.ts` interfaces but doesn't use TypeScript's type checker for overload resolution +- **No TypeScript type-checker integration** — type inference resolves annotations, `new` expressions, and assignment chains, but does not invoke `tsc` for overload resolution or complex generics - **Dynamic calls are best-effort** — complex computed property access and `eval` patterns are not resolved - **Python imports** — resolves relative imports but doesn't follow `sys.path` or virtual environment packages - **Dataflow analysis** — intraprocedural (single-function scope), not interprocedural @@ -778,13 +763,18 @@ const { results: fused } = await multiSearchData( See **[ROADMAP.md](docs/roadmap/ROADMAP.md)** for the full development roadmap and **[STABILITY.md](STABILITY.md)** for the stability policy and versioning guarantees. Current plan: 1. ~~**Rust Core**~~ — **Complete** (v1.3.0) — native tree-sitter parsing via napi-rs, parallel multi-core parsing, incremental re-parsing, import resolution & cycle detection in Rust -2. ~~**Foundation Hardening**~~ — **Complete** (v1.4.0) — parser registry, 12-tool MCP server with multi-repo support, test coverage 62%→75%, `apiKeyCommand` secret resolution, global repo registry -3. ~~**Deep Analysis**~~ — **Complete** (v3.0.0) — dataflow analysis (flows_to, returns, mutates), intraprocedural CFG for all 11 languages, stored AST nodes, expanded node/edge types (parameter, property, constant, contains, parameter_of, receiver), GraphML/GraphSON/Neo4j CSV export, interactive HTML viewer, CLI consolidation, stable JSON schema -4. ~~**Architectural Refactoring**~~ — **Complete** (v3.1.5) — unified AST analysis, composable MCP, domain errors, builder pipeline, embedder subsystem, graph model, qualified names, presentation layer, InMemoryRepository, domain directory grouping, CLI composability -5. 
**Natural Language Queries** — `codegraph ask` command, conversational sessions -6. **Expanded Language Support** — 8 new languages (12 → 20) -7. **GitHub Integration & CI** — reusable GitHub Action, PR review, SARIF output -8. **TypeScript Migration** — gradual migration from JS to TypeScript +2. ~~**Foundation Hardening**~~ — **Complete** (v1.5.0) — parser registry, complete MCP, test coverage, enhanced config, multi-repo MCP +3. ~~**Analysis Expansion**~~ — **Complete** (v2.7.0) — complexity metrics, community detection, flow tracing, co-change, manifesto, boundary rules, check, triage, audit, batch, hybrid search +4. ~~**Deep Analysis & Graph Enrichment**~~ — **Complete** (v3.0.0) — dataflow analysis, intraprocedural CFG, AST node storage, expanded node/edge types, interactive viewer, exports command +5. ~~**Architectural Refactoring**~~ — **Complete** (v3.1.5) — unified AST analysis, composable MCP, domain errors, builder pipeline, graph model, qualified names, presentation layer, CLI composability +6. **Native Analysis Acceleration** — move JS-only build phases to Rust, sub-100ms 1-file rebuilds +7. **TypeScript Migration** — project setup, core type definitions, leaf → core → orchestration migration +8. **Runtime & Extensibility** — event-driven pipeline, plugin system, query caching, pagination +9. **Intelligent Embeddings** — LLM-generated descriptions, enhanced embeddings, module summaries +10. **Natural Language Queries** — `codegraph ask` command, conversational sessions +11. **Expanded Language Support** — 8 new languages (11 → 19) +12. **GitHub Integration & CI** — reusable GitHub Action, LLM-enhanced PR review, SARIF output +13. 
**Visualization & Advanced** — web UI, dead code detection, monorepo, agentic search ## 🤝 Contributing diff --git a/crates/codegraph-core/src/extractors/go.rs b/crates/codegraph-core/src/extractors/go.rs index d9f0c0d6..f860dbac 100644 --- a/crates/codegraph-core/src/extractors/go.rs +++ b/crates/codegraph-core/src/extractors/go.rs @@ -344,11 +344,15 @@ fn extract_go_type_map_depth(node: &Node, source: &[u8], symbols: &mut FileSymbo "var_spec" => { if let Some(type_node) = node.child_by_field_name("type") { if let Some(type_name) = extract_go_type_name(&type_node, source) { - if let Some(name_node) = node.child_by_field_name("name") { - symbols.type_map.push(TypeMapEntry { - name: node_text(&name_node, source).to_string(), - type_name: type_name.to_string(), - }); + for i in 0..node.child_count() { + if let Some(child) = node.child(i) { + if child.kind() == "identifier" { + symbols.type_map.push(TypeMapEntry { + name: node_text(&child, source).to_string(), + type_name: type_name.to_string(), + }); + } + } } } } diff --git a/crates/codegraph-core/src/import_resolution.rs b/crates/codegraph-core/src/import_resolution.rs index f0071502..69480151 100644 --- a/crates/codegraph-core/src/import_resolution.rs +++ b/crates/codegraph-core/src/import_resolution.rs @@ -140,6 +140,7 @@ fn resolve_import_path_inner( ".jsx", ".mjs", ".py", + ".pyi", "/index.ts", "/index.tsx", "/index.js", diff --git a/crates/codegraph-core/src/parser_registry.rs b/crates/codegraph-core/src/parser_registry.rs index f800b275..0dde0bd6 100644 --- a/crates/codegraph-core/src/parser_registry.rs +++ b/crates/codegraph-core/src/parser_registry.rs @@ -50,14 +50,14 @@ impl LanguageKind { } match ext { "js" | "jsx" | "mjs" | "cjs" => Some(Self::JavaScript), - "py" => Some(Self::Python), + "py" | "pyi" => Some(Self::Python), "tf" | "hcl" => Some(Self::Hcl), "go" => Some(Self::Go), "rs" => Some(Self::Rust), "java" => Some(Self::Java), "cs" => Some(Self::CSharp), - "rb" => Some(Self::Ruby), - "php" => 
Some(Self::Php), + "rb" | "rake" | "gemspec" => Some(Self::Ruby), + "php" | "phtml" => Some(Self::Php), _ => None, } } diff --git a/docs/roadmap/BACKLOG.md b/docs/roadmap/BACKLOG.md index f017a518..23f84128 100644 --- a/docs/roadmap/BACKLOG.md +++ b/docs/roadmap/BACKLOG.md @@ -23,14 +23,14 @@ Each item has a short title, description, category, expected benefit, and four a ### Tier 0 — Promote before Phase 4-5 (highest immediate impact) -These two items directly improve agent experience and graph accuracy today, without requiring Rust porting or TypeScript migration. They should be implemented before any Phase 4+ roadmap work begins. +Both items are now **DONE**. These directly improved agent experience and graph accuracy without requiring Rust porting or TypeScript migration. -**Rationale:** Item #83 enriches the *passively-injected* context that agents actually see via hooks — the single highest-leverage surface for reducing blind edits. Item #71 closes the biggest accuracy gap in the graph for TypeScript and Java, where missing type-aware resolution causes hallucinated "no callers" results. +**Rationale:** Item #83 enriches the *passively-injected* context that agents actually see via hooks — the single highest-leverage surface for reducing blind edits. Item #71 closed the biggest accuracy gap in the graph for TypeScript and Java, where missing type-aware resolution caused hallucinated "no callers" results. | ID | Title | Description | Category | Benefit | Zero-dep | Foundation-aligned | Problem-fit (1-5) | Breaking | Depends on | |----|-------|-------------|----------|---------|----------|-------------------|-------------------|----------|------------| | 83 | ~~Hook-optimized `codegraph brief` command~~ | New `codegraph brief ` command designed for Claude Code hook context injection. Returns a compact, token-efficient summary per file: each symbol with its role and caller count (e.g. 
`buildGraph [core, 12 callers]`), blast radius count on importers (`Imported by: src/cli.js (+8 transitive)`), and overall file risk tier. Current `deps --json` output used by `enrich-context.sh` is shallow — just file-level imports/importedBy and symbol names with no role or blast radius info. The `brief` command would include: **(a)** symbol roles in the output — knowing a file defines `core` vs `leaf` symbols changes editing caution; **(b)** per-symbol transitive caller counts — makes blast radius visible without a separate `fn-impact` call; **(c)** file-level risk tier (high/medium/low based on max fan-in and role composition). Output optimized for `additionalContext` injection — single compact block, not verbose JSON. Also add `--brief` flag to `deps` as an alias. | Embeddability | The `enrich-context.sh` hook is the only codegraph context agents actually see (they ignore CLAUDE.md instructions to run commands manually). Making that passively-injected context richer — with roles, caller counts, and risk tiers — directly reduces blind edits to high-impact code. Currently the hook shows `Defines: function buildGraph` but not that it's a core symbol with 12 transitive callers | ✓ | ✓ | 4 | No | — | **DONE** — `codegraph brief ` command with symbol roles, caller counts, and risk tiers. CLI command, MCP tool, and presentation layer. ([#480](https://github.com/optave/codegraph/pull/480)) | -| 71 | Basic type inference for typed languages | Extract type annotations from TypeScript and Java AST nodes (variable declarations, function parameters, return types, generics) to resolve method calls through typed references. Currently `const x: Router = express.Router(); x.get(...)` produces no edge because `x.get` can't be resolved without knowing `x` is a `Router`. Tree-sitter already parses type annotations — we just don't use them for resolution. Start with declared types (no flow inference), which covers the majority of TS/Java code. 
| Resolution | Dramatically improves call graph completeness for TypeScript and Java — the two languages where developers annotate types explicitly and expect tooling to use them. Directly prevents hallucinated "no callers" results for methods called through typed variables | ✓ | ✓ | 5 | No | — | +| 71 | ~~Basic type inference for typed languages~~ | Extract type annotations from TypeScript and Java AST nodes (variable declarations, function parameters, return types, generics) to resolve method calls through typed references. Currently `const x: Router = express.Router(); x.get(...)` produces no edge because `x.get` can't be resolved without knowing `x` is a `Router`. Tree-sitter already parses type annotations — we just don't use them for resolution. Start with declared types (no flow inference), which covers the majority of TS/Java code. | Resolution | Dramatically improves call graph completeness for TypeScript and Java — the two languages where developers annotate types explicitly and expect tooling to use them. Directly prevents hallucinated "no callers" results for methods called through typed variables | ✓ | ✓ | 5 | No | — | **DONE** — Type inference for all typed languages (TS, Java, Go, Rust, C#, PHP, Python). WASM + native engines. ([#501](https://github.com/optave/codegraph/pull/501)) | ### Tier 1 — Zero-dep + Foundation-aligned (build these first) diff --git a/src/domain/graph/builder/stages/build-edges.js b/src/domain/graph/builder/stages/build-edges.js index 085717fa..47d75320 100644 --- a/src/domain/graph/builder/stages/build-edges.js +++ b/src/domain/graph/builder/stages/build-edges.js @@ -128,6 +128,14 @@ function buildCallEdgesNative(ctx, getNodeIdStmt, allEdgeRows, allNodes, native) for (const e of nativeEdges) { allEdgeRows.push([e.sourceId, e.targetId, e.kind, e.confidence, e.dynamic]); } + + // Older native binaries (< 3.2.0) don't emit receiver or type-resolved method-call + // edges. 
Supplement them on the JS side if the native binary missed them. + // TODO: Remove once all published native binaries handle receivers (>= 3.2.0) + const hasReceiver = nativeEdges.some((e) => e.kind === 'receiver'); + if (!hasReceiver) { + supplementReceiverEdges(ctx, nativeFiles, getNodeIdStmt, allEdgeRows); + } } function buildImportedNamesForNative(ctx, relPath, symbols, rootDir) { @@ -147,6 +155,50 @@ function buildImportedNamesForNative(ctx, relPath, symbols, rootDir) { return importedNames; } +// ── Receiver edge supplement for older native binaries ────────────────── + +function supplementReceiverEdges(ctx, nativeFiles, getNodeIdStmt, allEdgeRows) { + const seenCallEdges = new Set(); + // Collect existing edges to avoid duplicates + for (const row of allEdgeRows) { + seenCallEdges.add(`${row[0]}|${row[1]}|${row[2]}`); + } + + for (const nf of nativeFiles) { + const relPath = nf.file; + const typeMap = new Map(nf.typeMap.map((t) => [t.name, t.typeName])); + const fileNodeRow = { id: nf.fileNodeId }; + + for (const call of nf.calls) { + if (!call.receiver || BUILTIN_RECEIVERS.has(call.receiver)) continue; + if (call.receiver === 'this' || call.receiver === 'self' || call.receiver === 'super') + continue; + + const caller = findCaller(call, nf.definitions, relPath, getNodeIdStmt, fileNodeRow); + + // Receiver edge: caller → receiver type node + buildReceiverEdge(ctx, call, caller, relPath, seenCallEdges, allEdgeRows, typeMap); + + // Type-resolved method call: caller → Type.method + const typeName = typeMap.get(call.receiver); + if (typeName) { + const qualifiedName = `${typeName}.${call.name}`; + const targets = (ctx.nodesByName.get(qualifiedName) || []).filter( + (n) => n.kind === 'method', + ); + for (const t of targets) { + const key = `${caller.id}|${t.id}|calls`; + if (t.id !== caller.id && !seenCallEdges.has(key)) { + seenCallEdges.add(key); + const confidence = computeConfidence(relPath, t.file, null); + allEdgeRows.push([caller.id, t.id, 'calls', 
confidence, call.dynamic ? 1 : 0]); + } + } + } + } + } +} + // ── Call edges (JS fallback) ──────────────────────────────────────────── function buildCallEdgesJS(ctx, getNodeIdStmt, allEdgeRows) { @@ -244,11 +296,6 @@ function resolveCallTargets(ctx, call, relPath, importedNames, typeMap) { } function resolveByMethodOrGlobal(ctx, call, relPath, typeMap) { - const methodCandidates = (ctx.nodesByName.get(call.name) || []).filter( - (n) => n.name.endsWith(`.${call.name}`) && n.kind === 'method', - ); - if (methodCandidates.length > 0) return methodCandidates; - // Type-aware resolution: translate variable receiver to its declared type if (call.receiver && typeMap) { const typeName = typeMap.get(call.receiver); diff --git a/src/domain/graph/resolve.js b/src/domain/graph/resolve.js index 5e0ab1d3..5a82a5c6 100644 --- a/src/domain/graph/resolve.js +++ b/src/domain/graph/resolve.js @@ -78,6 +78,7 @@ function resolveImportPathJS(fromFile, importSource, rootDir, aliases) { '.jsx', '.mjs', '.py', + '.pyi', '/index.ts', '/index.tsx', '/index.js', diff --git a/src/domain/parser.js b/src/domain/parser.js index 476e6184..8ccbcd3b 100644 --- a/src/domain/parser.js +++ b/src/domain/parser.js @@ -320,7 +320,7 @@ export const LANGUAGE_REGISTRY = [ }, { id: 'python', - extensions: ['.py'], + extensions: ['.py', '.pyi'], grammarFile: 'tree-sitter-python.wasm', extractor: extractPythonSymbols, required: false, @@ -355,14 +355,14 @@ export const LANGUAGE_REGISTRY = [ }, { id: 'ruby', - extensions: ['.rb'], + extensions: ['.rb', '.rake', '.gemspec'], grammarFile: 'tree-sitter-ruby.wasm', extractor: extractRubySymbols, required: false, }, { id: 'php', - extensions: ['.php'], + extensions: ['.php', '.phtml'], grammarFile: 'tree-sitter-php.wasm', extractor: extractPHPSymbols, required: false, @@ -378,6 +378,31 @@ for (const entry of LANGUAGE_REGISTRY) { export const SUPPORTED_EXTENSIONS = new Set(_extToLang.keys()); +/** + * WASM-based typeMap backfill for older native binaries that don't 
emit typeMap. + * Uses tree-sitter AST extraction instead of regex to avoid false positives from + * matches inside comments and string literals. + * TODO: Remove once all published native binaries include typeMap extraction (>= 3.2.0) + */ +async function backfillTypeMap(filePath, source) { + let code = source; + if (!code) { + try { + code = fs.readFileSync(filePath, 'utf-8'); + } catch { + return { typeMap: [], backfilled: false }; + } + } + const parsers = await createParsers(); + const extracted = wasmExtractSymbols(parsers, filePath, code); + if (!extracted?.symbols?.typeMap) return { typeMap: [], backfilled: false }; + const tm = extracted.symbols.typeMap; + return { + typeMap: tm instanceof Map ? tm : new Map(tm.map((e) => [e.name, e.typeName])), + backfilled: true, + }; +} + /** * WASM extraction helper: picks the right extractor based on file extension. */ @@ -414,7 +439,14 @@ export async function parseFileAuto(filePath, source, opts = {}) { if (native) { const result = native.parseFile(filePath, source, !!opts.dataflow, opts.ast !== false); - return result ? 
patchNativeResult(result) : null; + if (!result) return null; + const patched = patchNativeResult(result); + if (!patched.typeMap || patched.typeMap.length === 0) { + const { typeMap, backfilled } = await backfillTypeMap(filePath, source); + patched.typeMap = typeMap; + if (backfilled) patched._typeMapBackfilled = true; + } + return patched; } // WASM path @@ -442,10 +474,35 @@ export async function parseFilesAuto(filePaths, rootDir, opts = {}) { !!opts.dataflow, opts.ast !== false, ); + const needsTypeMap = []; for (const r of nativeResults) { if (!r) continue; + const patched = patchNativeResult(r); const relPath = path.relative(rootDir, r.file).split(path.sep).join('/'); - result.set(relPath, patchNativeResult(r)); + result.set(relPath, patched); + if (!patched.typeMap || patched.typeMap.length === 0) { + needsTypeMap.push({ filePath: r.file, relPath }); + } + } + // Backfill typeMap via WASM for native binaries that predate the type-map feature + if (needsTypeMap.length > 0) { + const parsers = await createParsers(); + for (const { filePath, relPath } of needsTypeMap) { + try { + const code = fs.readFileSync(filePath, 'utf-8'); + const extracted = wasmExtractSymbols(parsers, filePath, code); + if (extracted?.symbols?.typeMap) { + const symbols = result.get(relPath); + symbols.typeMap = + extracted.symbols.typeMap instanceof Map + ? extracted.symbols.typeMap + : new Map(extracted.symbols.typeMap.map((e) => [e.name, e.typeName])); + symbols._typeMapBackfilled = true; + } + } catch { + /* skip — typeMap is a best-effort backfill */ + } + } } return result; } @@ -519,7 +576,14 @@ export function createParseTreeCache() { export async function parseFileIncremental(cache, filePath, source, opts = {}) { if (cache) { const result = cache.parseFile(filePath, source); - return result ? 
patchNativeResult(result) : null; + if (!result) return null; + const patched = patchNativeResult(result); + if (!patched.typeMap || patched.typeMap.length === 0) { + const { typeMap, backfilled } = await backfillTypeMap(filePath, source); + patched.typeMap = typeMap; + if (backfilled) patched._typeMapBackfilled = true; + } + return patched; } return parseFileAuto(filePath, source, opts); } diff --git a/src/extractors/go.js b/src/extractors/go.js index 33cf44e6..23b5f1b0 100644 --- a/src/extractors/go.js +++ b/src/extractors/go.js @@ -211,13 +211,19 @@ function extractGoTypeMap(node, ctx) { function extractGoTypeMapDepth(node, ctx, depth) { if (depth >= 200) return; - // var x MyType = ... → var_declaration > var_spec + // var x MyType = ... or var x, y MyType → var_declaration > var_spec if (node.type === 'var_spec') { - const nameNode = node.childForFieldName('name'); const typeNode = node.childForFieldName('type'); - if (nameNode && typeNode) { + if (typeNode) { const typeName = extractGoTypeName(typeNode); - if (typeName) ctx.typeMap.set(nameNode.text, typeName); + if (typeName) { + for (let i = 0; i < node.childCount; i++) { + const child = node.child(i); + if (child && child.type === 'identifier') { + ctx.typeMap.set(child.text, typeName); + } + } + } } } diff --git a/src/extractors/javascript.js b/src/extractors/javascript.js index fc52d117..7762959c 100644 --- a/src/extractors/javascript.js +++ b/src/extractors/javascript.js @@ -825,7 +825,8 @@ function extractNewExprTypeName(newExprNode) { } function extractTypeMapWalk(rootNode, typeMap) { - function walk(node) { + function walk(node, depth) { + if (depth >= 200) return; const t = node.type; if (t === 'variable_declarator') { const nameN = node.childForFieldName('name'); @@ -854,10 +855,10 @@ function extractTypeMapWalk(rootNode, typeMap) { } } for (let i = 0; i < node.childCount; i++) { - walk(node.child(i)); + walk(node.child(i), depth + 1); } } - walk(rootNode); + walk(rootNode, 0); } function 
extractReceiverName(objNode) { diff --git a/tests/integration/build.test.js b/tests/integration/build.test.js index 0d0b3d64..a4148642 100644 --- a/tests/integration/build.test.js +++ b/tests/integration/build.test.js @@ -497,7 +497,7 @@ describe('typed method call resolution', () => { '', ].join('\n'), ); - // Force WASM engine — typeMap resolution is JS-only (native deferred) + // Force WASM engine — native binary may not be present in all test environments await buildGraph(typedDir, { skipRegistry: true, engine: 'wasm' }); typedDbPath = path.join(typedDir, '.codegraph', 'graph.db'); });