From 20900e69f409bd4658306884478d957705177780 Mon Sep 17 00:00:00 2001 From: kingchenc Date: Mon, 16 Mar 2026 17:02:07 +0100 Subject: [PATCH 1/2] Fix Python/SQL parsing quality: route false positives, Depends() tracking, SQL size guard, DLL calls - Fix #28: Restrict source-based route extractors (Go/Express/Laravel/Ktor) to their own file extensions. Prevents Python dict .get() from matching Ktor route regex and creating ~125 spurious Route nodes. - Fix #27: Track FastAPI Depends(func_ref) in parameter defaults as CALLS edges. Scans Python function signatures for Depends() patterns so dependency-injected functions (e.g. get_current_user) no longer appear as dead code with in_degree=0. - Fix #62: Add file size guard in cbmParseFile() to prevent tree-sitter SQL parser stack overflow on large .sql files (bulk INSERTs). SQL files >1MB and any file >4MB are skipped with a logged warning. - Fix #29: Detect dynamic DLL resolution patterns (GetProcAddress, dlsym, Resolve) in C/C++ source and create CALLS edges to synthetic stub nodes with dll_name/dll_function metadata. --- internal/httplink/httplink.go | 18 ++- internal/pipeline/pipeline.go | 19 ++- internal/pipeline/pipeline_cbm.go | 256 ++++++++++++++++++++++++++++++ 3 files changed, 283 insertions(+), 10 deletions(-) diff --git a/internal/httplink/httplink.go b/internal/httplink/httplink.go index b4eb4fd..81686ea 100644 --- a/internal/httplink/httplink.go +++ b/internal/httplink/httplink.go @@ -375,14 +375,22 @@ func (l *Linker) discoverRoutes(rootPath string) []RouteHandler { // C# ASP.NET: check attribute decorators routes = append(routes, extractASPNetRoutes(f)...) - // Source-based route discovery (Go gin, Express.js, PHP Laravel, Kotlin Ktor) + // Source-based route discovery — each extractor only runs on its own language's files + // to prevent false positives (e.g., Python dict .get() matching Ktor/Go route regex). if f.FilePath != "" && f.StartLine > 0 && f.EndLine > 0 { + ext := strings.ToLower(filepath.Ext(f.FilePath)) source := readSourceLines(rootPath, f.FilePath, f.StartLine, f.EndLine) if source != "" { - routes = append(routes, extractGoRoutes(f, source)...) - routes = append(routes, extractExpressRoutes(f, source)...) - routes = append(routes, extractLaravelRoutes(f, source)...) - routes = append(routes, extractKtorRoutes(f, source)...) + switch ext { + case ".go": + routes = append(routes, extractGoRoutes(f, source)...) + case ".js", ".ts", ".mjs", ".mts", ".jsx", ".tsx": + routes = append(routes, extractExpressRoutes(f, source)...) + case ".php": + routes = append(routes, extractLaravelRoutes(f, source)...) + case ".kt", ".kts": + routes = append(routes, extractKtorRoutes(f, source)...) + } } } diff --git a/internal/pipeline/pipeline.go b/internal/pipeline/pipeline.go index 0e5efc2..9326a45 100644 --- a/internal/pipeline/pipeline.go +++ b/internal/pipeline/pipeline.go @@ -1328,9 +1328,9 @@ func collectEdgeQNs(results [][]resolvedEdge) (qnSet map[string]struct{}, totalE return qnSet, totalEdges } -// createLSPStubNodes creates stub nodes for LSP-resolved targets that don't exist in the graph. -// This happens for stdlib/external methods (e.g., context.Context.Done) that -// the LSP resolver correctly identifies but aren't indexed as nodes. +// createLSPStubNodes creates stub nodes for targets that don't exist in the graph. +// This handles LSP-resolved targets (stdlib/external methods) and DLL-resolved +// targets (dynamic DLL function references from GetProcAddress/dlsym/Resolve). func (p *Pipeline) createLSPStubNodes(results [][]resolvedEdge, qnToID map[string]int64) { var stubs []*store.Node stubQNs := make(map[string]bool) @@ -1343,7 +1343,7 @@ func (p *Pipeline) createLSPStubNodes(results [][]resolvedEdge, qnToID map[strin continue } strategy, _ := re.Properties["resolution_strategy"].(string) - if !strings.HasPrefix(strategy, "lsp_") { + if !strings.HasPrefix(strategy, "lsp_") && strategy != "dll_resolve" { continue } stubQNs[re.TargetQN] = true @@ -1355,12 +1355,21 @@ func (p *Pipeline) createLSPStubNodes(results [][]resolvedEdge, qnToID map[strin if strings.Count(re.TargetQN, ".") >= 2 { label = "Method" } + + props := map[string]any{"stub": true, "source": strategy} + // Carry over DLL metadata for DLL-resolved stubs + if strategy == "dll_resolve" { + if dllName, ok := re.Properties["dll_name"].(string); ok { + props["dll_name"] = dllName + } + } + stubs = append(stubs, &store.Node{ Project: p.ProjectName, Label: label, Name: name, QualifiedName: re.TargetQN, - Properties: map[string]any{"stub": true, "source": "lsp_resolution"}, + Properties: props, }) } } diff --git a/internal/pipeline/pipeline_cbm.go b/internal/pipeline/pipeline_cbm.go index 42d07bd..bd3ea2e 100644 --- a/internal/pipeline/pipeline_cbm.go +++ b/internal/pipeline/pipeline_cbm.go @@ -1,8 +1,10 @@ package pipeline import ( + "fmt" "log/slog" "path/filepath" + "regexp" "strings" "github.com/DeusData/codebase-memory-mcp/internal/cbm" @@ -19,10 +21,29 @@ type cachedExtraction struct { Language lang.Language } +// Per-language file size limits to prevent tree-sitter stack overflows. +// SQL deeply recurses on bulk INSERTs; Windows has a 1MB default C stack. +const ( + maxSQLFileSize = 1 << 20 // 1 MB + maxGeneralFileSize = 4 << 20 // 4 MB +) + // cbmParseFile reads a file, calls cbm.ExtractFile(), and converts the // result to the same parseResult format used by the batch write infrastructure. // This replaces parseFileAST() — all AST walking happens in C. func cbmParseFile(projectName string, f discover.FileInfo) *parseResult { + // Guard: SQL tree-sitter grammar deeply recurses on large files (bulk INSERT + // dumps), exhausting the C stack — especially on Windows (1 MB default). + if f.Language == lang.SQL && f.Size > maxSQLFileSize { + slog.Info("cbm.skip.large_sql", "path", f.RelPath, "size", f.Size) + return &parseResult{File: f, Err: fmt.Errorf("skipped: SQL file too large (%d bytes, max %d)", f.Size, maxSQLFileSize)} + } + // General safety: files > 4 MB are likely generated/vendored and risk OOM or stack issues. + if f.Size > maxGeneralFileSize { + slog.Info("cbm.skip.large_file", "path", f.RelPath, "size", f.Size) + return &parseResult{File: f, Err: fmt.Errorf("skipped: file too large (%d bytes, max %d)", f.Size, maxGeneralFileSize)} + } + source, cleanup, err := mmapFile(f.Path) if cleanup != nil { defer cleanup() @@ -253,6 +274,12 @@ func (p *Pipeline) resolveFileCallsCBM(relPath string, ext *cachedExtraction) [] } } + // Python: track FastAPI Depends(func_ref) as CALLS edges + edges = append(edges, p.extractPythonDependsEdges(relPath, ext)...) + + // C/C++: track dynamic DLL resolution (GetProcAddress/dlsym/Resolve) as CALLS edges + edges = append(edges, p.extractDLLResolveEdges(relPath, ext)...) + return edges } @@ -603,3 +630,232 @@ func isCheckedException(excName string) bool { } return false } + +// --- Python FastAPI Depends() tracking (#27) --- + +// pythonDependsRe matches Depends(func_ref) patterns in Python function signatures. +// Captures the function reference passed to Depends(). +var pythonDependsRe = regexp.MustCompile(`Depends\(\s*([\w.]+)`) + +// extractPythonDependsEdges scans Python function signatures for Depends(func_ref) +// patterns and creates CALLS edges from the endpoint to the dependency function. +// Without this, functions referenced via Depends() appear as dead code (in_degree=0). +func (p *Pipeline) extractPythonDependsEdges(relPath string, ext *cachedExtraction) []resolvedEdge { + if ext.Language != lang.Python || ext.Result == nil { + return nil + } + + // Early bail: check if any call in this file targets "Depends" + hasDependsCall := false + for _, call := range ext.Result.Calls { + if call.CalleeName == "Depends" || strings.HasSuffix(call.CalleeName, ".Depends") { + hasDependsCall = true + break + } + } + if !hasDependsCall { + return nil + } + + // Read full file source + source := readFileSource(p.RepoPath, relPath) + if len(source) == 0 { + return nil + } + lines := strings.Split(string(source), "\n") + + moduleQN := fqn.ModuleQN(p.ProjectName, relPath) + importMap := p.importMaps[moduleQN] + + var edges []resolvedEdge + seen := make(map[[2]string]bool) + + for _, def := range ext.Result.Definitions { + if def.Label != "Function" && def.Label != "Method" { + continue + } + if def.StartLine <= 0 { + continue + } + + // Extract function signature lines (def line through closing paren + colon). + // Signatures can span multiple lines in Python. + sigEnd := def.StartLine + 15 // scan up to 15 lines for multi-line signatures + if def.EndLine > 0 && sigEnd > def.EndLine { + sigEnd = def.EndLine + } + if sigEnd > len(lines) { + sigEnd = len(lines) + } + + var sig strings.Builder + for i := def.StartLine - 1; i < sigEnd; i++ { + sig.WriteString(lines[i]) + sig.WriteByte('\n') + trimmed := strings.TrimSpace(lines[i]) + // Stop once we hit the colon that ends the function definition + if strings.HasSuffix(trimmed, "):") || strings.HasSuffix(trimmed, ") :") || + strings.HasSuffix(trimmed, ") -> None:") || strings.Contains(trimmed, ") ->") { + break + } + } + + sigStr := sig.String() + matches := pythonDependsRe.FindAllStringSubmatch(sigStr, -1) + for _, m := range matches { + funcRef := m[1] + if funcRef == "" { + continue + } + + key := [2]string{def.QualifiedName, funcRef} + if seen[key] { + continue + } + seen[key] = true + + result := p.registry.Resolve(funcRef, moduleQN, importMap) + if result.QualifiedName == "" { + continue + } + + edges = append(edges, resolvedEdge{ + CallerQN: def.QualifiedName, + TargetQN: result.QualifiedName, + Type: "CALLS", + Properties: map[string]any{ + "confidence": 0.95, + "confidence_band": "high", + "resolution_strategy": "fastapi_depends", + }, + }) + } + } + + if len(edges) > 0 { + slog.Info("pass3.fastapi_depends", "file", relPath, "edges", len(edges)) + } + return edges +} + +// --- C/C++ dynamic DLL resolution tracking (#29) --- + +// DLL resolution patterns for C/C++ dynamic linking. +var ( + // GetProcAddress(handle, "FunctionName") — Win32 API + dllGetProcAddrRe = regexp.MustCompile(`GetProcAddress\s*\(\s*\w+\s*,\s*["'](\w+)["']`) + // dlsym(handle, "function_name") — POSIX + dllDlsymRe = regexp.MustCompile(`dlsym\s*\(\s*\w+\s*,\s*["'](\w+)["']`) + // obj.Resolve("FunctionName") or obj->Resolve("FunctionName") — custom DLL loaders + dllResolveRe = regexp.MustCompile(`(?:->|\.)\s*Resolve\s*\(\s*["'](\w+)["']`) + // LoadLibrary("dll_name.dll") or dlopen("lib.so") — DLL name extraction + dllLoadRe = regexp.MustCompile(`(?:LoadLibrary[AW]?|dlopen)\s*\(\s*["']([^"']+)["']`) +) + +// extractDLLResolveEdges scans C/C++ function source for dynamic DLL resolution +// patterns (GetProcAddress, dlsym, Resolve) and creates CALLS edges to synthetic +// external function nodes, enabling call graph tracking across DLL boundaries. +func (p *Pipeline) extractDLLResolveEdges(relPath string, ext *cachedExtraction) []resolvedEdge { + if ext.Language != lang.CPP && ext.Language != lang.C { + return nil + } + if ext.Result == nil { + return nil + } + + // Early bail: check if any call targets a DLL resolution function + hasDLLCall := false + for _, call := range ext.Result.Calls { + name := call.CalleeName + if name == "GetProcAddress" || name == "GetProcAddressA" || name == "GetProcAddressW" || + name == "dlsym" || strings.HasSuffix(name, ".Resolve") || strings.HasSuffix(name, "->Resolve") { + hasDLLCall = true + break + } + } + if !hasDLLCall { + return nil + } + + // Read full file source + source := readFileSource(p.RepoPath, relPath) + if len(source) == 0 { + return nil + } + sourceStr := string(source) + sourceLines := strings.Split(sourceStr, "\n") + + // Extract DLL name from LoadLibrary/dlopen calls (best-effort) + dllName := "external" + if m := dllLoadRe.FindStringSubmatch(sourceStr); m != nil { + dllName = filepath.Base(m[1]) + // Strip extension + if ext := filepath.Ext(dllName); ext != "" { + dllName = strings.TrimSuffix(dllName, ext) + } + } + + moduleQN := fqn.ModuleQN(p.ProjectName, relPath) + + var edges []resolvedEdge + seen := make(map[[2]string]bool) + + for _, def := range ext.Result.Definitions { + if def.Label != "Function" && def.Label != "Method" { + continue + } + if def.StartLine <= 0 || def.EndLine <= 0 { + continue + } + + // Extract function body source + endLine := def.EndLine + if endLine > len(sourceLines) { + endLine = len(sourceLines) + } + var body strings.Builder + for i := def.StartLine - 1; i < endLine; i++ { + body.WriteString(sourceLines[i]) + body.WriteByte('\n') + } + bodyStr := body.String() + + // Match DLL resolution patterns + for _, re := range []*regexp.Regexp{dllGetProcAddrRe, dllDlsymRe, dllResolveRe} { + for _, m := range re.FindAllStringSubmatch(bodyStr, -1) { + funcName := m[1] + if funcName == "" { + continue + } + + callerQN := def.QualifiedName + targetQN := moduleQN + ".dll." + dllName + "." + funcName + + key := [2]string{callerQN, targetQN} + if seen[key] { + continue + } + seen[key] = true + + edges = append(edges, resolvedEdge{ + CallerQN: callerQN, + TargetQN: targetQN, + Type: "CALLS", + Properties: map[string]any{ + "confidence": 0.85, + "confidence_band": "high", + "resolution_strategy": "dll_resolve", + "dll_name": dllName, + "dll_function": funcName, + }, + }) + } + } + } + + if len(edges) > 0 { + slog.Info("pass3.dll_resolve", "file", relPath, "edges", len(edges)) + } + + return edges +} From 45822d2c04796189c5e431fb89d37d08cd3919c1 Mon Sep 17 00:00:00 2001 From: kingchenc Date: Mon, 16 Mar 2026 17:40:01 +0100 Subject: [PATCH 2/2] Fix Python/SQL parsing quality: route false positives, Depends() tracking, SQL size guard, DLL calls - Fix #28: Restrict source-based route extractors (Go/Express/Laravel/Ktor) to their own file extensions. Prevents Python dict .get() from matching Ktor route regex and creating ~125 spurious Route nodes. - Fix #27: Track FastAPI Depends(func_ref) in parameter defaults as CALLS edges. Scans Python function signatures for Depends() patterns so dependency-injected functions no longer appear as dead code (in_degree=0). Includes fallback for import aliases (e.g. `import X as _Y`). - Fix #62: Add file size guard in cbmParseFile() to prevent tree-sitter SQL parser stack overflow on large .sql files (bulk INSERTs). SQL files >1MB and any file >4MB are skipped with a logged warning. - Fix #29: Detect dynamic DLL resolution patterns (GetProcAddress, dlsym, Resolve) in C/C++ source and create CALLS edges to synthetic stub nodes with dll_name/dll_function metadata. --- internal/pipeline/pipeline_cbm.go | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/internal/pipeline/pipeline_cbm.go b/internal/pipeline/pipeline_cbm.go index bd3ea2e..4ccd592 100644 --- a/internal/pipeline/pipeline_cbm.go +++ b/internal/pipeline/pipeline_cbm.go @@ -716,7 +716,18 @@ func (p *Pipeline) extractPythonDependsEdges(relPath string, ext *cachedExtracti result := p.registry.Resolve(funcRef, moduleQN, importMap) if result.QualifiedName == "" { - continue + // Fallback for import aliases: Depends(_dep_require_admin) where + // _dep_require_admin is aliased from "require_admin". Extract the + // original function name from the import path and retry. + if importPath, ok := importMap[funcRef]; ok { + if lastDot := strings.LastIndex(importPath, "."); lastDot >= 0 { + originalName := importPath[lastDot+1:] + result = p.registry.Resolve(originalName, moduleQN, importMap) + } + } + if result.QualifiedName == "" { + continue + } } edges = append(edges, resolvedEdge{