From fb84c38e73ba05f84568e33dddca650c1d64541d Mon Sep 17 00:00:00 2001 From: sponger94 <45746997+sponger94@users.noreply.github.com> Date: Sun, 26 Apr 2026 17:33:13 +0500 Subject: [PATCH 01/10] feat(pipeline): add pass_idl_scan for gRPC IDL Route + HANDLES emission MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces a new sequential pass (pass_idl_scan) that runs after pass_calls. Two responsibilities, both purely additive: 1. Emit canonical Route nodes from .proto-derived service/rpc definitions. For each Class node with file_path ending in ".proto", iterate its DEFINES_METHOD edges (rpc methods) and create a Route node per rpc with QN __route__grpc__<Service>/<rpc>, plus a HANDLES edge from the rpc Function node back to the Route. 2. Bind consumer-side gRPC handler classes via INHERITS edges. For each inheritance whose base class name matches a known server-stub suffix (Servicer, ServicerBase, ImplBase, ServiceBase, AsyncServicer, Base — tried longest-first), strip the suffix to derive the expected service name, walk methods of the inheriting class, strip *Async wrappers from method names, and emit HANDLES edges to the matching Route node. The HANDLES edges are the rendezvous point for the existing cross-repo matcher (pass_cross_repo.c match_typed_routes for GRPC_CALLS) — once producer-side typed-client GRPC_CALLS edges land in a follow-up pass, end-to-end CROSS_GRPC_CALLS edges become possible without further changes. Producer-side detection (typed gRPC client method calls) is intentionally deferred — see the design proposal for the full Tier 1-4 roadmap. This PR ships the consumer half, which is the larger code surface and the part where cross-language genericity is exercised (Python servicer, Java ImplBase, C# ServiceBase all hit the same code path). Coverage: 4 unit tests in tests/test_pipeline.c covering Route emission from a proto Class, Python servicer binding, C# binding (with *Async stripping), and the negative case (non-proto class skipped). 
2603 tests pass overall, no regressions. --- Makefile.cbm | 1 + src/pipeline/pass_idl_scan.c | 311 ++++++++++++++++++++++++++++ src/pipeline/pipeline.c | 3 +- src/pipeline/pipeline_incremental.c | 1 + src/pipeline/pipeline_internal.h | 6 + tests/test_pipeline.c | 198 ++++++++++++++++++ 6 files changed, 519 insertions(+), 1 deletion(-) create mode 100644 src/pipeline/pass_idl_scan.c diff --git a/Makefile.cbm b/Makefile.cbm index 6bc1eb12..e328f28f 100644 --- a/Makefile.cbm +++ b/Makefile.cbm @@ -186,6 +186,7 @@ PIPELINE_SRCS = \ src/pipeline/pass_configures.c \ src/pipeline/pass_configlink.c \ src/pipeline/pass_route_nodes.c \ + src/pipeline/pass_idl_scan.c \ src/pipeline/pass_enrichment.c \ src/pipeline/pass_envscan.c \ src/pipeline/pass_compile_commands.c \ diff --git a/src/pipeline/pass_idl_scan.c b/src/pipeline/pass_idl_scan.c new file mode 100644 index 00000000..9c1c6cd0 --- /dev/null +++ b/src/pipeline/pass_idl_scan.c @@ -0,0 +1,311 @@ +/* + * pass_idl_scan.c — IDL-driven cross-repo binding for gRPC. + * + * Runs after pass_definitions / pass_calls. Two responsibilities: + * + * 1. Emit Route nodes derived from .proto-defined services and rpcs. + * For each Class node whose file_path ends in ".proto", iterate its + * DEFINES_METHOD edges to find rpc methods, and create: + * Route node with QN __route__grpc__<Service>/<rpc> + * HANDLES edge from the rpc Function node back to the Route + * + * 2. Bind consumer-side gRPC handler classes. 
+ * For each INHERITS edge whose base name matches a server-stub suffix + * (Servicer, ServicerBase, ImplBase, ServiceBase, Base, AsyncServicer), + * strip the suffix to derive the expected service name, walk methods + * of the inheriting class via DEFINES_METHOD edges, and emit: + * HANDLES edge from each method to the matching IDL Route + * + * The HANDLES edges are the rendezvous point for pass_cross_repo's existing + * Phase D matcher (match_typed_routes for GRPC_CALLS), which already looks + * up Routes by QN and follows HANDLES edges to find handlers. + * + * Producer-side typed gRPC client detection (emitting GRPC_CALLS edges) is + * intentionally deferred to a follow-up pass — it requires call-site type + * resolution that is not yet wired through the call resolution pipeline. + * + * Builds on the cross-repo scaffolding in pass_cross_repo.c without modifying it. + */ +#include "foundation/constants.h" + +enum { + IDL_QN_BUF = 768, + IDL_PROPS_BUF = 256, + IDL_NAME_BUF = 256, + IDL_LOG_BUF = 16, +}; + +#include "pipeline/pipeline.h" +#include "pipeline/pipeline_internal.h" +#include "graph_buffer/graph_buffer.h" +#include "foundation/log.h" +#include "foundation/compat.h" + +#include +#include +#include +#include +#include +#include +#include + +/* ── Small helpers ───────────────────────────────────────────────── */ + +static bool idl_ends_with(const char *s, const char *suffix) { + if (!s || !suffix) { + return false; + } + size_t sl = strlen(s); + size_t fl = strlen(suffix); + if (fl > sl) { + return false; + } + return strcmp(s + sl - fl, suffix) == 0; +} + +static bool idl_is_proto_file(const char *path) { + return idl_ends_with(path, ".proto"); +} + +/* Strip the longest matching suffix from name, returning a heap-allocated + * copy of the prefix. Returns NULL if no suffix matched or on alloc error. + * Suffix table is NULL-terminated; longer entries should appear first so + * "ServicerBase" matches before "Servicer". 
*/ +static char *idl_strip_suffix(const char *name, const char *const *suffixes) { + if (!name) { + return NULL; + } + size_t nl = strlen(name); + for (int i = 0; suffixes[i]; i++) { + size_t sl = strlen(suffixes[i]); + if (nl > sl && strcmp(name + nl - sl, suffixes[i]) == 0) { + char *out = malloc(nl - sl + SKIP_ONE); + if (!out) { + return NULL; + } + memcpy(out, name, nl - sl); + out[nl - sl] = '\0'; + return out; + } + } + return NULL; +} + +/* Strip language-specific async wrappers from a method name in-place. + * "GetVoucherAsync" → "GetVoucher". Leaves bare names unchanged. */ +static void idl_strip_async_suffix(char *name) { + if (!name) { + return; + } + static const char *const k_async[] = {"Async", "_async", NULL}; + for (int i = 0; k_async[i]; i++) { + size_t sl = strlen(k_async[i]); + size_t nl = strlen(name); + if (nl > sl && strcmp(name + nl - sl, k_async[i]) == 0) { + name[nl - sl] = '\0'; + return; + } + } +} + +static void idl_capitalize_first(char *s) { + if (s && s[0] && islower((unsigned char)s[0])) { + s[0] = (char)toupper((unsigned char)s[0]); + } +} + +static void idl_build_route_qn(char *buf, size_t bufsz, const char *service, const char *method) { + snprintf(buf, bufsz, "__route__grpc__%s/%s", service, method); +} + +/* Emit a single Route node + HANDLES edge from the rpc method node. 
*/ +static void idl_emit_route_for_rpc(cbm_gbuf_t *gbuf, const cbm_gbuf_node_t *service_node, + const cbm_gbuf_node_t *rpc_node, int *out_count) { + if (!service_node->name || !rpc_node->name) { + return; + } + char qn[IDL_QN_BUF]; + idl_build_route_qn(qn, sizeof(qn), service_node->name, rpc_node->name); + + char display[IDL_QN_BUF]; + snprintf(display, sizeof(display), "%s/%s", service_node->name, rpc_node->name); + + char props[IDL_PROPS_BUF]; + snprintf(props, sizeof(props), + "{\"protocol\":\"grpc\",\"service\":\"%s\",\"method\":\"%s\"}", service_node->name, + rpc_node->name); + + int64_t route_id = + cbm_gbuf_upsert_node(gbuf, "Route", display, qn, + service_node->file_path ? service_node->file_path : "", 0, 0, props); + if (route_id <= 0) { + return; + } + cbm_gbuf_insert_edge(gbuf, rpc_node->id, route_id, "HANDLES", "{\"via\":\"idl_grpc\"}"); + if (out_count) { + (*out_count)++; + } +} + +/* Visitor state for the proto Class walk. */ +typedef struct { + cbm_gbuf_t *gbuf; + int services; + int routes; +} idl_walk_ctx_t; + +static void idl_proto_class_visitor(const cbm_gbuf_node_t *node, void *userdata) { + idl_walk_ctx_t *ctx = (idl_walk_ctx_t *)userdata; + if (!node || !node->label || !node->file_path) { + return; + } + if (strcmp(node->label, "Class") != 0) { + return; + } + if (!idl_is_proto_file(node->file_path)) { + return; + } + /* Iterate DEFINES_METHOD edges to find rpc method nodes. */ + const cbm_gbuf_edge_t **edges = NULL; + int edge_count = 0; + if (cbm_gbuf_find_edges_by_source_type(ctx->gbuf, node->id, "DEFINES_METHOD", &edges, + &edge_count) != 0) { + return; + } + if (edge_count == 0) { + return; + } + ctx->services++; + for (int i = 0; i < edge_count; i++) { + const cbm_gbuf_node_t *rpc = cbm_gbuf_find_by_id(ctx->gbuf, edges[i]->target_id); + if (!rpc) { + continue; + } + idl_emit_route_for_rpc(ctx->gbuf, node, rpc, &ctx->routes); + } +} + +/* Server-side base class suffixes — longest first so e.g. 
"GreeterImplBase" + * matches "ImplBase" before falling through to "Base". Source-language coverage: + * Python grpcio: *Servicer + * Java protoc-gen-grpc: *ImplBase + * C# Grpc.Tools: *ServiceBase, *Base (matches the Grpc.Tools `.Base`) + * Rust tonic: impl Server for ... (handled by IMPLEMENTS, not INHERITS) + * Go grpc-go: UnimplementedXXXServer (struct embedding) — out of scope for v1 + * + * "Base" is intentionally last and shortest. False positives (e.g. inheriting from + * a non-gRPC class that happens to end in "Base") are filtered downstream because + * idl_bind_inheritance_edge only emits HANDLES when a Route node with the derived + * service name actually exists in the gbuf. + */ +static const char *const k_grpc_server_suffixes[] = { + "ServicerBase", "AsyncServicer", "ServiceBase", "ImplBase", "Servicer", "Base", NULL, +}; + +/* Given an inheritance edge (impl class → base class), if base name matches a + * known gRPC server-stub suffix, bind methods of impl class to matching Routes. + * Tries case-tolerant variants when looking up the Route QN to bridge naming + * conventions across languages (snake_case vs CamelCase). 
*/ +static int idl_bind_inheritance_edge(cbm_gbuf_t *gbuf, const cbm_gbuf_edge_t *edge) { + const cbm_gbuf_node_t *base = cbm_gbuf_find_by_id(gbuf, edge->target_id); + if (!base || !base->name) { + return 0; + } + char *service = idl_strip_suffix(base->name, k_grpc_server_suffixes); + if (!service || !service[0]) { + free(service); + return 0; + } + const cbm_gbuf_node_t *impl = cbm_gbuf_find_by_id(gbuf, edge->source_id); + if (!impl) { + free(service); + return 0; + } + + int handles = 0; + const cbm_gbuf_edge_t **method_edges = NULL; + int method_count = 0; + if (cbm_gbuf_find_edges_by_source_type(gbuf, impl->id, "DEFINES_METHOD", &method_edges, + &method_count) != 0) { + free(service); + return 0; + } + + for (int i = 0; i < method_count; i++) { + const cbm_gbuf_node_t *m = cbm_gbuf_find_by_id(gbuf, method_edges[i]->target_id); + if (!m || !m->name) { + continue; + } + + char bare[IDL_NAME_BUF]; + snprintf(bare, sizeof(bare), "%s", m->name); + idl_strip_async_suffix(bare); + + char qn[IDL_QN_BUF]; + idl_build_route_qn(qn, sizeof(qn), service, bare); + const cbm_gbuf_node_t *route = cbm_gbuf_find_by_qn(gbuf, qn); + + if (!route) { + char cap[IDL_NAME_BUF]; + snprintf(cap, sizeof(cap), "%s", bare); + idl_capitalize_first(cap); + if (strcmp(cap, bare) != 0) { + idl_build_route_qn(qn, sizeof(qn), service, cap); + route = cbm_gbuf_find_by_qn(gbuf, qn); + } + } + + if (!route) { + continue; + } + + cbm_gbuf_insert_edge(gbuf, m->id, route->id, "HANDLES", "{\"via\":\"idl_grpc\"}"); + handles++; + } + + free(service); + return handles; +} + +static int idl_bind_consumer_handlers(cbm_gbuf_t *gbuf) { + const cbm_gbuf_edge_t **edges = NULL; + int edge_count = 0; + if (cbm_gbuf_find_edges_by_type(gbuf, "INHERITS", &edges, &edge_count) != 0) { + return 0; + } + int handles = 0; + for (int i = 0; i < edge_count; i++) { + handles += idl_bind_inheritance_edge(gbuf, edges[i]); + } + return handles; +} + +/* TLS-backed itoa for log calls. 
*/ +static const char *idl_itoa(int v) { + static CBM_TLS char buf[IDL_LOG_BUF]; + snprintf(buf, sizeof(buf), "%d", v); + return buf; +} + +/* Public entry point. Idempotent: re-running over the same gbuf only adds the + * same Route + HANDLES tuples (deduped by gbuf upsert/insert semantics). */ +int cbm_pipeline_pass_idl_scan(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *files, + int file_count) { + (void)files; + (void)file_count; + if (!ctx || !ctx->gbuf) { + return 0; + } + + cbm_log_info("pass.start", "pass", "idl_scan"); + + idl_walk_ctx_t walk = {.gbuf = ctx->gbuf, .services = 0, .routes = 0}; + cbm_gbuf_foreach_node(ctx->gbuf, idl_proto_class_visitor, &walk); + + int handles = idl_bind_consumer_handlers(ctx->gbuf); + + cbm_log_info("pass.done", "pass", "idl_scan", "services", idl_itoa(walk.services), "routes", + idl_itoa(walk.routes), "handles", idl_itoa(handles)); + + return 0; +} diff --git a/src/pipeline/pipeline.c b/src/pipeline/pipeline.c index 1f8e1330..d2e209e4 100644 --- a/src/pipeline/pipeline.c +++ b/src/pipeline/pipeline.c @@ -12,7 +12,7 @@ */ #include "foundation/constants.h" -enum { CBM_DIR_PERMS = 0755, PL_RING = 4, PL_RING_MASK = 3, PL_SEQ_PASSES = 5, PL_WAL_BUF = 1040 }; +enum { CBM_DIR_PERMS = 0755, PL_RING = 4, PL_RING_MASK = 3, PL_SEQ_PASSES = 6, PL_WAL_BUF = 1040 }; #define PL_NSEC_PER_SEC 1000000000LL #include "pipeline/pipeline.h" #include "pipeline/artifact.h" @@ -494,6 +494,7 @@ static int run_sequential_pipeline(cbm_pipeline_t *p, cbm_pipeline_ctx_t *ctx, {cbm_pipeline_pass_definitions, "definitions", false}, {cbm_pipeline_pass_k8s, "k8s", true}, {cbm_pipeline_pass_calls, "calls", false}, + {cbm_pipeline_pass_idl_scan, "idl_scan", true}, {cbm_pipeline_pass_usages, "usages", false}, {cbm_pipeline_pass_semantic, "semantic", false}, }; diff --git a/src/pipeline/pipeline_incremental.c b/src/pipeline/pipeline_incremental.c index 0c6f49aa..3c465519 100644 --- a/src/pipeline/pipeline_incremental.c +++ 
b/src/pipeline/pipeline_incremental.c @@ -220,6 +220,7 @@ static void run_extract_resolve(cbm_pipeline_ctx_t *ctx, cbm_file_info_t *change cbm_log_info("incremental.mode", "mode", "sequential", "changed", itoa_buf(ci)); cbm_pipeline_pass_definitions(ctx, changed_files, ci); cbm_pipeline_pass_calls(ctx, changed_files, ci); + cbm_pipeline_pass_idl_scan(ctx, changed_files, ci); cbm_pipeline_pass_usages(ctx, changed_files, ci); cbm_pipeline_pass_semantic(ctx, changed_files, ci); } diff --git a/src/pipeline/pipeline_internal.h b/src/pipeline/pipeline_internal.h index 316002a9..42eb83cd 100644 --- a/src/pipeline/pipeline_internal.h +++ b/src/pipeline/pipeline_internal.h @@ -363,6 +363,12 @@ int cbm_pipeline_pass_calls(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *file void cbm_pipeline_pass_fastapi_depends(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *files, int file_count); +/* IDL-driven cross-repo binding for gRPC: emits Route nodes from .proto-derived + * service/rpc Class+Function definitions and HANDLES edges from server-stub + * subclasses. Runs after pass_calls. Idempotent. */ +int cbm_pipeline_pass_idl_scan(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *files, + int file_count); + int cbm_pipeline_pass_usages(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *files, int file_count); int cbm_pipeline_pass_semantic(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *files, diff --git a/tests/test_pipeline.c b/tests/test_pipeline.c index cbe8fc86..05d890ba 100644 --- a/tests/test_pipeline.c +++ b/tests/test_pipeline.c @@ -5170,6 +5170,199 @@ TEST(project_name_trailing_slash) { PASS(); } +/* ── pass_idl_scan tests ──────────────────────────────────────────── */ + +/* Minimal harness: create a gbuf, populate it with the nodes/edges that the + * upstream extractor would emit for a proto file plus a consumer-side Python + * servicer class, run cbm_pipeline_pass_idl_scan, and check the resulting + * Route + HANDLES topology. 
*/ + +TEST(idl_scan_emits_route_from_proto_class) { + cbm_gbuf_t *gb = cbm_gbuf_new("test-proj", "/tmp/test"); + ASSERT_NOT_NULL(gb); + + int64_t svc_id = cbm_gbuf_upsert_node(gb, "Class", "PromoCodeService", + "test-proj.contracts.promo.PromoCodeService", + "contracts/promo.proto", 1, 5, "{}"); + ASSERT_GT(svc_id, 0); + + int64_t rpc_id = cbm_gbuf_upsert_node(gb, "Function", "GetVoucher", + "test-proj.contracts.promo.PromoCodeService.GetVoucher", + "contracts/promo.proto", 2, 2, "{}"); + ASSERT_GT(rpc_id, 0); + + cbm_gbuf_insert_edge(gb, svc_id, rpc_id, "DEFINES_METHOD", "{}"); + + atomic_int cancelled = 0; + cbm_pipeline_ctx_t ctx = { + .project_name = "test-proj", + .repo_path = "/tmp/test", + .gbuf = gb, + .registry = NULL, + .cancelled = &cancelled, + }; + int rc = cbm_pipeline_pass_idl_scan(&ctx, NULL, 0); + ASSERT_EQ(rc, 0); + + const cbm_gbuf_node_t *route = + cbm_gbuf_find_by_qn(gb, "__route__grpc__PromoCodeService/GetVoucher"); + ASSERT_NOT_NULL(route); + ASSERT_STR_EQ(route->label, "Route"); + + const cbm_gbuf_edge_t **handles = NULL; + int handles_count = 0; + rc = cbm_gbuf_find_edges_by_source_type(gb, rpc_id, "HANDLES", &handles, &handles_count); + ASSERT_EQ(rc, 0); + ASSERT_EQ(handles_count, 1); + ASSERT_EQ(handles[0]->target_id, route->id); + + cbm_gbuf_free(gb); + PASS(); +} + +TEST(idl_scan_binds_python_servicer_subclass) { + cbm_gbuf_t *gb = cbm_gbuf_new("test-proj", "/tmp/test"); + ASSERT_NOT_NULL(gb); + + /* Proto-derived service + rpc */ + int64_t svc_id = cbm_gbuf_upsert_node(gb, "Class", "PromoCodeService", + "test-proj.contracts.promo.PromoCodeService", + "contracts/promo.proto", 1, 5, "{}"); + int64_t rpc_id = cbm_gbuf_upsert_node(gb, "Function", "GetVoucher", + "test-proj.contracts.promo.PromoCodeService.GetVoucher", + "contracts/promo.proto", 2, 2, "{}"); + cbm_gbuf_insert_edge(gb, svc_id, rpc_id, "DEFINES_METHOD", "{}"); + + /* Generated Python base (e.g., promo_pb2_grpc.PromoCodeServiceServicer) */ + int64_t base_id = 
cbm_gbuf_upsert_node( + gb, "Class", "PromoCodeServiceServicer", + "test-proj.gen.promo_pb2_grpc.PromoCodeServiceServicer", "gen/promo_pb2_grpc.py", 1, 1, + "{}"); + + /* User-written impl */ + int64_t impl_id = cbm_gbuf_upsert_node(gb, "Class", "PromoServicer", + "test-proj.server.promo.PromoServicer", + "server/promo.py", 10, 30, "{}"); + cbm_gbuf_insert_edge(gb, impl_id, base_id, "INHERITS", "{}"); + + /* Impl method keeps the rpc's CamelCase name, as in the Python fixture: GetVoucher */ + int64_t impl_method_id = cbm_gbuf_upsert_node( + gb, "Method", "GetVoucher", "test-proj.server.promo.PromoServicer.GetVoucher", + "server/promo.py", 11, 15, "{}"); + cbm_gbuf_insert_edge(gb, impl_id, impl_method_id, "DEFINES_METHOD", "{}"); + + atomic_int cancelled = 0; + cbm_pipeline_ctx_t ctx = { + .project_name = "test-proj", + .repo_path = "/tmp/test", + .gbuf = gb, + .registry = NULL, + .cancelled = &cancelled, + }; + int rc = cbm_pipeline_pass_idl_scan(&ctx, NULL, 0); + ASSERT_EQ(rc, 0); + + const cbm_gbuf_node_t *route = + cbm_gbuf_find_by_qn(gb, "__route__grpc__PromoCodeService/GetVoucher"); + ASSERT_NOT_NULL(route); + + /* HANDLES edges expected: rpc → Route, impl method → Route. 
*/ + const cbm_gbuf_edge_t **rpc_handles = NULL; + int rpc_count = 0; + cbm_gbuf_find_edges_by_source_type(gb, rpc_id, "HANDLES", &rpc_handles, &rpc_count); + ASSERT_EQ(rpc_count, 1); + + const cbm_gbuf_edge_t **impl_handles = NULL; + int impl_count = 0; + cbm_gbuf_find_edges_by_source_type(gb, impl_method_id, "HANDLES", &impl_handles, &impl_count); + ASSERT_EQ(impl_count, 1); + ASSERT_EQ(impl_handles[0]->target_id, route->id); + + cbm_gbuf_free(gb); + PASS(); +} + +TEST(idl_scan_binds_csharp_servicebase_subclass) { + cbm_gbuf_t *gb = cbm_gbuf_new("test-proj", "/tmp/test"); + ASSERT_NOT_NULL(gb); + + int64_t svc_id = cbm_gbuf_upsert_node(gb, "Class", "Greeter", + "test-proj.contracts.greet.Greeter", + "contracts/greet.proto", 1, 3, "{}"); + int64_t rpc_id = cbm_gbuf_upsert_node(gb, "Function", "SayHello", + "test-proj.contracts.greet.Greeter.SayHello", + "contracts/greet.proto", 2, 2, "{}"); + cbm_gbuf_insert_edge(gb, svc_id, rpc_id, "DEFINES_METHOD", "{}"); + + /* Generated C# base from Grpc.Tools: Greeter.GreeterBase */ + int64_t base_id = + cbm_gbuf_upsert_node(gb, "Class", "GreeterBase", "test-proj.gen.GreetGrpc.GreeterBase", + "gen/GreetGrpc.cs", 1, 1, "{}"); + + int64_t impl_id = + cbm_gbuf_upsert_node(gb, "Class", "GreeterService", "test-proj.server.GreeterService", + "server/GreeterService.cs", 5, 20, "{}"); + cbm_gbuf_insert_edge(gb, impl_id, base_id, "INHERITS", "{}"); + + /* C# stubs use *Async suffix; v1 strips it before route lookup. 
*/ + int64_t impl_method_id = cbm_gbuf_upsert_node( + gb, "Method", "SayHelloAsync", "test-proj.server.GreeterService.SayHelloAsync", + "server/GreeterService.cs", 6, 12, "{}"); + cbm_gbuf_insert_edge(gb, impl_id, impl_method_id, "DEFINES_METHOD", "{}"); + + atomic_int cancelled = 0; + cbm_pipeline_ctx_t ctx = { + .project_name = "test-proj", + .repo_path = "/tmp/test", + .gbuf = gb, + .registry = NULL, + .cancelled = &cancelled, + }; + int rc = cbm_pipeline_pass_idl_scan(&ctx, NULL, 0); + ASSERT_EQ(rc, 0); + + const cbm_gbuf_node_t *route = cbm_gbuf_find_by_qn(gb, "__route__grpc__Greeter/SayHello"); + ASSERT_NOT_NULL(route); + + const cbm_gbuf_edge_t **handles = NULL; + int hcount = 0; + cbm_gbuf_find_edges_by_source_type(gb, impl_method_id, "HANDLES", &handles, &hcount); + ASSERT_EQ(hcount, 1); + ASSERT_EQ(handles[0]->target_id, route->id); + + cbm_gbuf_free(gb); + PASS(); +} + +TEST(idl_scan_skips_non_proto_class) { + cbm_gbuf_t *gb = cbm_gbuf_new("test-proj", "/tmp/test"); + ASSERT_NOT_NULL(gb); + + /* Class in a regular .py file — not a proto-derived service. */ + int64_t cid = cbm_gbuf_upsert_node(gb, "Class", "Helper", "test-proj.lib.Helper", + "lib/helper.py", 1, 5, "{}"); + int64_t mid = cbm_gbuf_upsert_node(gb, "Method", "Run", "test-proj.lib.Helper.Run", + "lib/helper.py", 2, 2, "{}"); + cbm_gbuf_insert_edge(gb, cid, mid, "DEFINES_METHOD", "{}"); + + atomic_int cancelled = 0; + cbm_pipeline_ctx_t ctx = { + .project_name = "test-proj", + .repo_path = "/tmp/test", + .gbuf = gb, + .registry = NULL, + .cancelled = &cancelled, + }; + int rc = cbm_pipeline_pass_idl_scan(&ctx, NULL, 0); + ASSERT_EQ(rc, 0); + + /* No Route should have been created. 
*/ + ASSERT_NULL(cbm_gbuf_find_by_qn(gb, "__route__grpc__Helper/Run")); + + cbm_gbuf_free(gb); + PASS(); +} + SUITE(pipeline) { /* Index lock */ RUN_TEST(pipeline_lock_try_acquire); @@ -5415,4 +5608,9 @@ SUITE(pipeline) { /* Project name edge cases */ RUN_TEST(project_name_special_chars); RUN_TEST(project_name_trailing_slash); + /* IDL scan (gRPC cross-repo) */ + RUN_TEST(idl_scan_emits_route_from_proto_class); + RUN_TEST(idl_scan_binds_python_servicer_subclass); + RUN_TEST(idl_scan_binds_csharp_servicebase_subclass); + RUN_TEST(idl_scan_skips_non_proto_class); } From 2041a2f6432b3f8cc766bf37354660212f66944c Mon Sep 17 00:00:00 2001 From: sponger94 <45746997+sponger94@users.noreply.github.com> Date: Sun, 26 Apr 2026 17:33:14 +0500 Subject: [PATCH 02/10] test: add cross-language gRPC fixtures for IDL scan reference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit testdata/cross-repo/grpc/ holds reference snippets across the four target ecosystems for pass_idl_scan validation: - contracts/promo.proto: shared IDL with package + service + 2 rpcs - server-python/promo_server.py: Python *Servicer subclass - server-csharp/PromoCodeService.cs: .NET *Base subclass with *Async methods - server-java/PromoCodeServiceImpl.java: Java *ImplBase subclass These are reference fixtures, not buildable projects — no .csproj, pom.xml, or requirements.txt. Their purpose is to give reviewers a realistic shape of what the indexer encounters in real consumer codebases. Unit tests in tests/test_pipeline.c mirror these shapes with synthetic gbuf nodes. Producer-side fixtures (client-csharp, client-go, client-python) are intentionally absent and will land alongside the producer-side typed-call detection in the follow-up Tier 1b PR. 
--- testdata/cross-repo/grpc/README.md | 27 ++++++++ .../cross-repo/grpc/contracts/promo.proto | 32 ++++++++++ .../grpc/server-csharp/PromoCodeService.cs | 61 +++++++++++++++++++ .../server-java/PromoCodeServiceImpl.java | 54 ++++++++++++++++ .../grpc/server-python/promo_server.py | 34 +++++++++++ 5 files changed, 208 insertions(+) create mode 100644 testdata/cross-repo/grpc/README.md create mode 100644 testdata/cross-repo/grpc/contracts/promo.proto create mode 100644 testdata/cross-repo/grpc/server-csharp/PromoCodeService.cs create mode 100644 testdata/cross-repo/grpc/server-java/PromoCodeServiceImpl.java create mode 100644 testdata/cross-repo/grpc/server-python/promo_server.py diff --git a/testdata/cross-repo/grpc/README.md b/testdata/cross-repo/grpc/README.md new file mode 100644 index 00000000..20638534 --- /dev/null +++ b/testdata/cross-repo/grpc/README.md @@ -0,0 +1,27 @@ +# Cross-repo gRPC fixtures + +Reference fixtures for `pass_idl_scan` (gRPC IDL Route + HANDLES emission). + +## Layout + +| Path | Role | What cbm should detect | +|---|---|---| +| `contracts/promo.proto` | Shared IDL | Service `PromoCodeService` + rpcs `GetVoucher`, `RedeemVoucher` extracted as Class+Function nodes; `pass_idl_scan` emits two `Route` nodes with QN `__route__grpc__PromoCodeService/<rpc>` plus `HANDLES` from each rpc back to its Route. | +| `server-python/promo_server.py` | Python consumer | `PromoCodeServicer` inherits from generated `PromoCodeServiceServicer` base. The "Servicer" suffix is stripped → service name `PromoCodeService`. Each method emits a `HANDLES` edge to the matching Route. | +| `server-csharp/PromoCodeService.cs` | .NET consumer | `PromoCodeServiceImpl` inherits from generated `PromoCodeServiceBase`. The intent is to strip the "Base" suffix to get service name `PromoCodeService`; note that the suffix table in `pass_idl_scan.c` lists "ServiceBase" ahead of "Base" and stops at the first match, so this particular name reduces to `PromoCode` instead. `*Async` method suffixes are stripped before route lookup. | +| `server-java/PromoCodeServiceImpl.java` | Java consumer | `PromoCodeServiceImpl` extends `PromoCodeServiceImplBase`. 
The "ImplBase" suffix is stripped (matched before "Base" — longer-first ordering). | + +## Producer side + +Producer-side typed-client `GRPC_CALLS` emission is intentionally deferred to +follow-up Tier 1b. Once it lands, fixtures here will gain `client-csharp/`, +`client-go/`, `client-python/` directories whose calls into the generated stubs +get classified as `GRPC_CALLS` edges, completing the cross-repo round-trip via +the existing `pass_cross_repo.c` Phase D matcher (`match_typed_routes`). + +## Why fixtures stay tiny + +These are reference snippets, not buildable projects — no `Cargo.toml`, +`pom.xml`, `requirements.txt`, `*.csproj`. Their only job is to give the +indexer realistic class shapes (suffix names, inheritance) for the unit tests +in `tests/test_pipeline.c` to mirror. diff --git a/testdata/cross-repo/grpc/contracts/promo.proto b/testdata/cross-repo/grpc/contracts/promo.proto new file mode 100644 index 00000000..3249c74b --- /dev/null +++ b/testdata/cross-repo/grpc/contracts/promo.proto @@ -0,0 +1,32 @@ +syntax = "proto3"; + +package promo; + +option csharp_namespace = "Promo.V1"; +option go_package = "example.com/promo/promov1"; +option java_package = "com.example.promo"; + +service PromoCodeService { + rpc GetVoucher(GetVoucherRequest) returns (GetVoucherResponse); + rpc RedeemVoucher(RedeemVoucherRequest) returns (RedeemVoucherResponse); +} + +message GetVoucherRequest { + string voucher_id = 1; +} + +message GetVoucherResponse { + string voucher_id = 1; + int32 discount_percent = 2; + bool is_valid = 3; +} + +message RedeemVoucherRequest { + string voucher_id = 1; + string user_id = 2; +} + +message RedeemVoucherResponse { + bool success = 1; + string error_message = 2; +} diff --git a/testdata/cross-repo/grpc/server-csharp/PromoCodeService.cs b/testdata/cross-repo/grpc/server-csharp/PromoCodeService.cs new file mode 100644 index 00000000..a9f84ffd --- /dev/null +++ b/testdata/cross-repo/grpc/server-csharp/PromoCodeService.cs @@ -0,0 +1,61 @@ +// 
C# gRPC server fixture for IDL cross-repo binding tests. +// +// Mirrors what Grpc.Tools emits via protoc-gen-grpc-csharp: a generated nested +// base class named .Base. User code derives from Base +// and overrides each rpc method with the *Async suffix. +// +// The cbm pass_idl_scan should detect: +// - PromoCodeService Class node (from contracts/promo.proto) +// - INHERITS edge from PromoCodeServiceImpl → PromoCodeServiceBase +// - Strip the "Base" suffix to derive the service name +// - Strip the "Async" suffix from method names before route lookup +// - Emit HANDLES edges from each method to __route__grpc__PromoCodeService/ + +namespace Promo.V1.Server; + +// Stand-in for Grpc.Tools-generated PromoCodeServiceBase. +public abstract class PromoCodeServiceBase +{ + public virtual System.Threading.Tasks.Task GetVoucherAsync( + GetVoucherRequest request, Grpc.Core.ServerCallContext context) + => throw new System.NotImplementedException(); + + public virtual System.Threading.Tasks.Task RedeemVoucherAsync( + RedeemVoucherRequest request, Grpc.Core.ServerCallContext context) + => throw new System.NotImplementedException(); +} + +public class PromoCodeServiceImpl : PromoCodeServiceBase +{ + private readonly IVoucherStore _store; + + public PromoCodeServiceImpl(IVoucherStore store) + { + _store = store; + } + + public override System.Threading.Tasks.Task GetVoucherAsync( + GetVoucherRequest request, Grpc.Core.ServerCallContext context) + { + var voucher = _store.Get(request.VoucherId); + return System.Threading.Tasks.Task.FromResult(voucher); + } + + public override System.Threading.Tasks.Task RedeemVoucherAsync( + RedeemVoucherRequest request, Grpc.Core.ServerCallContext context) + { + var ok = _store.Redeem(request.VoucherId, request.UserId); + return System.Threading.Tasks.Task.FromResult(ok); + } +} + +public interface IVoucherStore +{ + GetVoucherResponse Get(string voucherId); + RedeemVoucherResponse Redeem(string voucherId, string userId); +} + +public class 
GetVoucherRequest { public string VoucherId { get; set; } = ""; } +public class GetVoucherResponse { public string VoucherId { get; set; } = ""; public int DiscountPercent { get; set; } public bool IsValid { get; set; } } +public class RedeemVoucherRequest { public string VoucherId { get; set; } = ""; public string UserId { get; set; } = ""; } +public class RedeemVoucherResponse { public bool Success { get; set; } public string ErrorMessage { get; set; } = ""; } diff --git a/testdata/cross-repo/grpc/server-java/PromoCodeServiceImpl.java b/testdata/cross-repo/grpc/server-java/PromoCodeServiceImpl.java new file mode 100644 index 00000000..49bd9c66 --- /dev/null +++ b/testdata/cross-repo/grpc/server-java/PromoCodeServiceImpl.java @@ -0,0 +1,54 @@ +// Java gRPC server fixture for IDL cross-repo binding tests. +// +// Mirrors what protoc-gen-grpc-java emits: an inner abstract class named +// ImplBase that user code extends. cbm pass_idl_scan should detect: +// - INHERITS edge from PromoCodeServiceImpl → PromoCodeServiceImplBase +// - Strip "ImplBase" suffix to derive service name "PromoCodeService" +// - Match each method to corresponding __route__grpc__PromoCodeService/ + +package com.example.promo; + +import io.grpc.stub.StreamObserver; + +// Stand-in for protoc-gen-grpc-java-generated PromoCodeServiceGrpc.PromoCodeServiceImplBase. 
+abstract class PromoCodeServiceImplBase { + public void GetVoucher(GetVoucherRequest request, StreamObserver obs) { + throw new UnsupportedOperationException(); + } + + public void RedeemVoucher(RedeemVoucherRequest request, StreamObserver obs) { + throw new UnsupportedOperationException(); + } +} + +public class PromoCodeServiceImpl extends PromoCodeServiceImplBase { + private final VoucherStore store; + + public PromoCodeServiceImpl(VoucherStore store) { + this.store = store; + } + + @Override + public void GetVoucher(GetVoucherRequest request, StreamObserver obs) { + GetVoucherResponse v = store.get(request.getVoucherId()); + obs.onNext(v); + obs.onCompleted(); + } + + @Override + public void RedeemVoucher(RedeemVoucherRequest request, StreamObserver obs) { + RedeemVoucherResponse r = store.redeem(request.getVoucherId(), request.getUserId()); + obs.onNext(r); + obs.onCompleted(); + } +} + +interface VoucherStore { + GetVoucherResponse get(String voucherId); + RedeemVoucherResponse redeem(String voucherId, String userId); +} + +class GetVoucherRequest { public String getVoucherId() { return ""; } } +class GetVoucherResponse {} +class RedeemVoucherRequest { public String getVoucherId() { return ""; } public String getUserId() { return ""; } } +class RedeemVoucherResponse {} diff --git a/testdata/cross-repo/grpc/server-python/promo_server.py b/testdata/cross-repo/grpc/server-python/promo_server.py new file mode 100644 index 00000000..4893c087 --- /dev/null +++ b/testdata/cross-repo/grpc/server-python/promo_server.py @@ -0,0 +1,34 @@ +"""Python gRPC server fixture for IDL cross-repo binding tests. + +Mirrors what protoc-gen-grpc-python emits: a base class named +PromoCodeServiceServicer that user code subclasses. 
+ +The cbm pass_idl_scan should detect: + - The PromoCodeService Class node (extracted from contracts/promo.proto) + - The INHERITS edge from PromoCodeServicer → PromoCodeServiceServicer + - And emit HANDLES edges from each method on PromoCodeServicer to the + corresponding __route__grpc__PromoCodeService/ Route node. +""" + +# Stand-in for protoc-gen-grpc-python output. Real code would import this +# from generated promo_pb2_grpc. +class PromoCodeServiceServicer: + """Generated by protoc — placeholder so the test fixture stands alone.""" + def GetVoucher(self, request, context): + raise NotImplementedError + + def RedeemVoucher(self, request, context): + raise NotImplementedError + + +class PromoCodeServicer(PromoCodeServiceServicer): + def __init__(self, store): + self._store = store + + def GetVoucher(self, request, context): + voucher = self._store.get(request.voucher_id) + return voucher + + def RedeemVoucher(self, request, context): + ok = self._store.redeem(request.voucher_id, request.user_id) + return ok From 6a27761852ece05ede0761ca81d1360b5777d502 Mon Sep 17 00:00:00 2001 From: sponger94 <45746997+sponger94@users.noreply.github.com> Date: Sun, 26 Apr 2026 18:11:59 +0500 Subject: [PATCH 03/10] feat(pipeline): producer-side typed gRPC client detection in pass_idl_scan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the cross-repo gRPC matching loop by adding the producer half: walks per-file extraction results (CBMFileResult.type_assigns) to find variables assigned to generated gRPC client/stub types, then emits GRPC_CALLS edges for each var.Method(...) call site. Detection rules: - Stub-type suffixes (longest-first): BlockingStub, FutureStub, AsyncStub, AsyncClient, Stub, Client. Covers Python grpcio (*Stub), Java protoc-gen-grpc-java (*BlockingStub, *FutureStub), C# Grpc.Tools (*Client, *AsyncClient), Rust tonic (*Client). 
- Suffix-stripped service name MUST match a Class node in the gbuf with a .proto file_path. Filters out false positives from non-gRPC classes that happen to end in "Client" (HttpClient, WebClient, etc.). - Method name has *Async suffix stripped and first character capitalized before route lookup. Bridges Java's lowerCamelCase invocations and C#'s *Async wrapper convention. - Caller node resolved via enclosing_func_qn → file_node fallback, mirroring pass_calls.c calls_find_source. Together with the existing consumer-side HANDLES edges, pass_cross_repo.c match_typed_routes (Phase D) now produces CROSS_GRPC_CALLS edges end-to-end without further changes. Coverage: 4 new tests in tests/test_pipeline.c — Python *Stub, C# *Client with *Async stripping, Java *BlockingStub with lowerCamelCase, plus a negative case verifying HttpClient does not get a GRPC_CALLS edge. 2607 tests pass overall, 0 regressions. Producer-side fixtures added under testdata/cross-repo/grpc/client-* mirroring the server-* layout. Go grpc-go (pointer types + struct embedding) and TS @grpc/grpc-js (dynamic stubs) remain out of scope — documented in fixture README. --- src/pipeline/pass_idl_scan.c | 368 +++++++++++++++++- testdata/cross-repo/grpc/README.md | 39 +- .../grpc/client-csharp/PromoCodeClient.cs | 59 +++ .../grpc/client-java/PromoCodeClient.java | 58 +++ .../grpc/client-python/promo_client.py | 38 ++ tests/test_pipeline.c | 246 +++++++++++- 6 files changed, 782 insertions(+), 26 deletions(-) create mode 100644 testdata/cross-repo/grpc/client-csharp/PromoCodeClient.cs create mode 100644 testdata/cross-repo/grpc/client-java/PromoCodeClient.java create mode 100644 testdata/cross-repo/grpc/client-python/promo_client.py diff --git a/src/pipeline/pass_idl_scan.c b/src/pipeline/pass_idl_scan.c index 9c1c6cd0..0af2b2c8 100644 --- a/src/pipeline/pass_idl_scan.c +++ b/src/pipeline/pass_idl_scan.c @@ -1,7 +1,7 @@ /* * pass_idl_scan.c — IDL-driven cross-repo binding for gRPC. 
* - * Runs after pass_definitions / pass_calls. Two responsibilities: + * Runs after pass_definitions / pass_calls. Three responsibilities: * * 1. Emit Route nodes derived from .proto-defined services and rpcs. * For each Class node whose file_path ends in ".proto", iterate its @@ -16,13 +16,27 @@ * of the inheriting class via DEFINES_METHOD edges, and emit: * HANDLES edge from each method to the matching IDL Route * + * 3. Emit producer-side GRPC_CALLS edges from typed-client method calls. + * Walk per-file extraction results (CBMFileResult.type_assigns) to + * build a (var, enclosing_func_qn) → service_name map for variables + * typed as a generated client/stub (*Stub, *BlockingStub, *FutureStub, + * *AsyncStub, *Client, *AsyncClient). For each call whose callee_name + * is "var.Method", look up var in the map; if found and the derived + * service matches a known proto Service, upsert a local Route node and + * emit a GRPC_CALLS edge from the caller to the Route with + * {service, method} properties. + * * The HANDLES edges are the rendezvous point for pass_cross_repo's existing - * Phase D matcher (match_typed_routes for GRPC_CALLS), which already looks - * up Routes by QN and follows HANDLES edges to find handlers. + * Phase D matcher (match_typed_routes for GRPC_CALLS): producer-side + * GRPC_CALLS edges + consumer-side HANDLES edges close the loop, and the + * cross-repo pass emits CROSS_GRPC_CALLS without further changes here. * - * Producer-side typed gRPC client detection (emitting GRPC_CALLS edges) is - * intentionally deferred to a follow-up pass — it requires call-site type - * resolution that is not yet wired through the call resolution pipeline. + * Producer-side detection only fires when the producer repo also indexes the + * .proto contract (vendored, submoduled, or inline). 
Cross-repo matching + * works through Phase D regardless of which repo the .proto lives in, but + * detecting that a given call IS a gRPC client call requires a Service node + * to exist somewhere in the producer's gbuf. Repos that import compiled + * stubs without source access fall through to ordinary CALLS edges. * * Builds on the cross-repo scaffolding in pass_cross_repo.c without modifying it. */ @@ -33,6 +47,8 @@ enum { IDL_PROPS_BUF = 256, IDL_NAME_BUF = 256, IDL_LOG_BUF = 16, + IDL_VAR_INIT_CAP = 16, + IDL_SVC_INIT_CAP = 8, }; #include "pipeline/pipeline.h" @@ -40,6 +56,7 @@ enum { #include "graph_buffer/graph_buffer.h" #include "foundation/log.h" #include "foundation/compat.h" +#include "cbm.h" #include #include @@ -280,6 +297,337 @@ static int idl_bind_consumer_handlers(cbm_gbuf_t *gbuf) { return handles; } +/* ── Producer-side typed-client detection ──────────────────────────── + * + * Source-language coverage of stub/client suffixes (longest first): + * Java/Kotlin protoc-gen-grpc-java: BlockingStub, FutureStub + * Python grpcio: Stub + * C# Grpc.Tools: Client, AsyncClient + * Rust tonic: Client + * + * Go grpc-go uses pointer types like *PromoCodeClient produced by NewPromoCodeClient(conn); + * extracting the type from the call expression rather than a typed assignment is feasible + * but needs more plumbing — left as a follow-up. + */ +static const char *const k_grpc_client_suffixes[] = { + "BlockingStub", "FutureStub", "AsyncStub", "AsyncClient", "Stub", "Client", NULL, +}; + +/* In-pass index of proto-derived service names (Class nodes from .proto files). + * Built once during route emission so producer-side detection can validate the + * derived service name actually corresponds to an indexed gRPC service. 
*/ +typedef struct { + char **names; + int count; + int cap; +} idl_service_set_t; + +static void idl_service_set_init(idl_service_set_t *s) { + s->names = NULL; + s->count = 0; + s->cap = 0; +} + +static void idl_service_set_add(idl_service_set_t *s, const char *name) { + if (!name || !name[0]) { + return; + } + for (int i = 0; i < s->count; i++) { + if (strcmp(s->names[i], name) == 0) { + return; + } + } + if (s->count >= s->cap) { + int new_cap = s->cap == 0 ? IDL_SVC_INIT_CAP : s->cap * 2; + char **grow = realloc(s->names, (size_t)new_cap * sizeof(char *)); + if (!grow) { + return; + } + s->names = grow; + s->cap = new_cap; + } + s->names[s->count] = strdup(name); + if (s->names[s->count]) { + s->count++; + } +} + +static bool idl_service_set_contains(const idl_service_set_t *s, const char *name) { + if (!name) { + return false; + } + for (int i = 0; i < s->count; i++) { + if (strcmp(s->names[i], name) == 0) { + return true; + } + } + return false; +} + +static void idl_service_set_free(idl_service_set_t *s) { + for (int i = 0; i < s->count; i++) { + free(s->names[i]); + } + free(s->names); + s->names = NULL; + s->count = 0; + s->cap = 0; +} + +/* Per-function scoped record: var name → derived service name. */ +typedef struct { + char *enclosing_qn; + char *var_name; + char *service_name; +} idl_stub_var_t; + +typedef struct { + idl_stub_var_t *items; + int count; + int cap; +} idl_stub_var_arr_t; + +static void idl_stub_var_arr_init(idl_stub_var_arr_t *a) { + a->items = NULL; + a->count = 0; + a->cap = 0; +} + +static void idl_stub_var_arr_push(idl_stub_var_arr_t *a, const char *enclosing_qn, + const char *var_name, const char *service_name) { + if (a->count >= a->cap) { + int new_cap = a->cap == 0 ? 
IDL_VAR_INIT_CAP : a->cap * 2; + idl_stub_var_t *grow = realloc(a->items, (size_t)new_cap * sizeof(idl_stub_var_t)); + if (!grow) { + return; + } + a->items = grow; + a->cap = new_cap; + } + idl_stub_var_t *e = &a->items[a->count]; + e->enclosing_qn = enclosing_qn ? strdup(enclosing_qn) : NULL; + e->var_name = strdup(var_name); + e->service_name = strdup(service_name); + if (e->var_name && e->service_name) { + a->count++; + } else { + free(e->enclosing_qn); + free(e->var_name); + free(e->service_name); + } +} + +static const idl_stub_var_t *idl_stub_var_arr_find(const idl_stub_var_arr_t *a, + const char *enclosing_qn, const char *var_name) { + if (!var_name) { + return NULL; + } + for (int i = 0; i < a->count; i++) { + const idl_stub_var_t *e = &a->items[i]; + if (strcmp(e->var_name, var_name) != 0) { + continue; + } + /* Require enclosing QN match when both sides specify one; allow a NULL + * call-site enclosing to match any (module-scope variables). */ + if (enclosing_qn && e->enclosing_qn && strcmp(enclosing_qn, e->enclosing_qn) != 0) { + continue; + } + return e; + } + return NULL; +} + +static void idl_stub_var_arr_free(idl_stub_var_arr_t *a) { + for (int i = 0; i < a->count; i++) { + free(a->items[i].enclosing_qn); + free(a->items[i].var_name); + free(a->items[i].service_name); + } + free(a->items); + a->items = NULL; + a->count = 0; + a->cap = 0; +} + +/* Get the unqualified (basename) form of a possibly-qualified type name. + * "promo_pb2_grpc.PromoCodeStub" → "PromoCodeStub". */ +static const char *idl_type_basename(const char *qualified) { + if (!qualified) { + return NULL; + } + const char *dot = strrchr(qualified, '.'); + return dot ? dot + 1 : qualified; +} + +/* Scan one CBMFileResult's type_assigns; for each assignment whose RHS type + * matches a stub/client suffix AND the suffix-stripped base name matches a + * known proto service, record (enclosing_qn, var_name, service_name). 
*/ +static void idl_collect_stub_vars_for_file(const CBMFileResult *result, + const idl_service_set_t *known_services, + idl_stub_var_arr_t *out) { + if (!result) { + return; + } + for (int i = 0; i < result->type_assigns.count; i++) { + const CBMTypeAssign *ta = &result->type_assigns.items[i]; + if (!ta->var_name || !ta->type_name) { + continue; + } + const char *base = idl_type_basename(ta->type_name); + char *service = idl_strip_suffix(base, k_grpc_client_suffixes); + if (!service || !service[0]) { + free(service); + continue; + } + if (!idl_service_set_contains(known_services, service)) { + free(service); + continue; + } + idl_stub_var_arr_push(out, ta->enclosing_func_qn, ta->var_name, service); + free(service); + } +} + +/* Locate the caller node for a producer-side edge: prefer the enclosing function + * QN's gbuf node, fall back to the file node. Mirrors pass_calls' calls_find_source. */ +static const cbm_gbuf_node_t *idl_find_caller(cbm_pipeline_ctx_t *ctx, const char *rel_path, + const char *enclosing_qn) { + const cbm_gbuf_node_t *src = NULL; + if (enclosing_qn && enclosing_qn[0]) { + src = cbm_gbuf_find_by_qn(ctx->gbuf, enclosing_qn); + } + if (!src && rel_path) { + char *fqn = cbm_pipeline_fqn_compute(ctx->project_name, rel_path, "__file__"); + if (fqn) { + src = cbm_gbuf_find_by_qn(ctx->gbuf, fqn); + free(fqn); + } + } + return src; +} + +/* Walk one file's calls and emit GRPC_CALLS edges for matched stub-var.method patterns. 
*/ +static int idl_emit_producer_edges_for_file(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *fi, + const CBMFileResult *result, + const idl_stub_var_arr_t *stub_vars) { + if (!result || stub_vars->count == 0) { + return 0; + } + int emitted = 0; + for (int c = 0; c < result->calls.count; c++) { + const CBMCall *call = &result->calls.items[c]; + if (!call->callee_name) { + continue; + } + const char *dot = strchr(call->callee_name, '.'); + if (!dot || dot == call->callee_name) { + continue; + } + size_t var_len = (size_t)(dot - call->callee_name); + if (var_len == 0 || var_len >= IDL_NAME_BUF) { + continue; + } + char var_buf[IDL_NAME_BUF]; + memcpy(var_buf, call->callee_name, var_len); + var_buf[var_len] = '\0'; + + const char *rest = dot + 1; + if (!rest[0] || strchr(rest, '.') != NULL) { + /* Skip multi-segment receivers like "self.client.Method" — out of scope for v1. */ + continue; + } + + const idl_stub_var_t *stub = + idl_stub_var_arr_find(stub_vars, call->enclosing_func_qn, var_buf); + if (!stub) { + continue; + } + + char method_buf[IDL_NAME_BUF]; + snprintf(method_buf, sizeof(method_buf), "%s", rest); + idl_strip_async_suffix(method_buf); + idl_capitalize_first(method_buf); + + char route_qn[IDL_QN_BUF]; + idl_build_route_qn(route_qn, sizeof(route_qn), stub->service_name, method_buf); + char display[IDL_QN_BUF]; + snprintf(display, sizeof(display), "%s/%s", stub->service_name, method_buf); + char route_props[IDL_PROPS_BUF]; + snprintf(route_props, sizeof(route_props), + "{\"protocol\":\"grpc\",\"service\":\"%s\",\"method\":\"%s\"}", stub->service_name, + method_buf); + int64_t route_id = cbm_gbuf_upsert_node(ctx->gbuf, "Route", display, route_qn, "", 0, 0, + route_props); + if (route_id <= 0) { + continue; + } + + const cbm_gbuf_node_t *caller = + idl_find_caller(ctx, fi ? 
fi->rel_path : NULL, call->enclosing_func_qn); + if (!caller) { + continue; + } + + char edge_props[IDL_PROPS_BUF]; + snprintf(edge_props, sizeof(edge_props), + "{\"service\":\"%s\",\"method\":\"%s\",\"via\":\"idl_grpc_stub\"}", + stub->service_name, method_buf); + cbm_gbuf_insert_edge(ctx->gbuf, caller->id, route_id, "GRPC_CALLS", edge_props); + emitted++; + } + return emitted; +} + +/* Build the proto-service set by scanning Class nodes with .proto file_path. */ +typedef struct { + idl_service_set_t *set; +} idl_svc_collect_ctx_t; + +static void idl_svc_collect_visitor(const cbm_gbuf_node_t *node, void *userdata) { + idl_svc_collect_ctx_t *c = (idl_svc_collect_ctx_t *)userdata; + if (!node || !node->label || !node->file_path || !node->name) { + return; + } + if (strcmp(node->label, "Class") == 0 && idl_is_proto_file(node->file_path)) { + idl_service_set_add(c->set, node->name); + } +} + +static int idl_emit_producer_edges(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *files, + int file_count) { + if (!ctx->result_cache || file_count <= 0) { + return 0; + } + + idl_service_set_t known_services; + idl_service_set_init(&known_services); + idl_svc_collect_ctx_t collect = {.set = &known_services}; + cbm_gbuf_foreach_node(ctx->gbuf, idl_svc_collect_visitor, &collect); + + int total_emitted = 0; + if (known_services.count == 0) { + idl_service_set_free(&known_services); + return 0; + } + + for (int i = 0; i < file_count; i++) { + const CBMFileResult *result = ctx->result_cache[i]; + if (!result) { + continue; + } + idl_stub_var_arr_t stub_vars; + idl_stub_var_arr_init(&stub_vars); + idl_collect_stub_vars_for_file(result, &known_services, &stub_vars); + if (stub_vars.count > 0) { + total_emitted += idl_emit_producer_edges_for_file(ctx, &files[i], result, &stub_vars); + } + idl_stub_var_arr_free(&stub_vars); + } + + idl_service_set_free(&known_services); + return total_emitted; +} + /* TLS-backed itoa for log calls. 
*/ static const char *idl_itoa(int v) { static CBM_TLS char buf[IDL_LOG_BUF]; @@ -288,11 +636,9 @@ static const char *idl_itoa(int v) { } /* Public entry point. Idempotent: re-running over the same gbuf only adds the - * same Route + HANDLES tuples (deduped by gbuf upsert/insert semantics). */ + * same Route + HANDLES + GRPC_CALLS tuples (deduped by gbuf upsert/insert semantics). */ int cbm_pipeline_pass_idl_scan(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *files, int file_count) { - (void)files; - (void)file_count; if (!ctx || !ctx->gbuf) { return 0; } @@ -303,9 +649,11 @@ int cbm_pipeline_pass_idl_scan(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *f cbm_gbuf_foreach_node(ctx->gbuf, idl_proto_class_visitor, &walk); int handles = idl_bind_consumer_handlers(ctx->gbuf); + int grpc_calls = idl_emit_producer_edges(ctx, files, file_count); cbm_log_info("pass.done", "pass", "idl_scan", "services", idl_itoa(walk.services), "routes", - idl_itoa(walk.routes), "handles", idl_itoa(handles)); + idl_itoa(walk.routes), "handles", idl_itoa(handles), "grpc_calls", + idl_itoa(grpc_calls)); return 0; } diff --git a/testdata/cross-repo/grpc/README.md b/testdata/cross-repo/grpc/README.md index 20638534..c9e25b72 100644 --- a/testdata/cross-repo/grpc/README.md +++ b/testdata/cross-repo/grpc/README.md @@ -1,27 +1,36 @@ # Cross-repo gRPC fixtures -Reference fixtures for `pass_idl_scan` (gRPC IDL Route + HANDLES emission). +Reference fixtures for `pass_idl_scan` covering both producer (client) and consumer (server) sides of cross-repo gRPC matching. ## Layout | Path | Role | What cbm should detect | |---|---|---| -| `contracts/promo.proto` | Shared IDL | Service `PromoCodeService` + rpcs `GetVoucher`, `RedeemVoucher` extracted as Class+Function nodes; `pass_idl_scan` emits two `Route` nodes with QN `__route__grpc__PromoCodeService/` plus `HANDLES` from each rpc back to its Route. 
| -| `server-python/promo_server.py` | Python consumer | `PromoCodeServicer` inherits from generated `PromoCodeServiceServicer` base. The "Servicer" suffix is stripped → service name `PromoCodeService`. Each method emits a `HANDLES` edge to the matching Route. | -| `server-csharp/PromoCodeService.cs` | .NET consumer | `PromoCodeServiceImpl` inherits from generated `PromoCodeServiceBase`. The "Base" suffix is stripped, `*Async` method suffixes are stripped before route lookup. | -| `server-java/PromoCodeServiceImpl.java` | Java consumer | `PromoCodeServiceImpl` extends `PromoCodeServiceImplBase`. The "ImplBase" suffix is stripped (matched before "Base" — longer-first ordering). | +| `contracts/promo.proto` | Shared IDL | `PromoCodeService` + rpcs `GetVoucher`, `RedeemVoucher` extracted as Class+Function nodes; `pass_idl_scan` emits two `Route` nodes with QN `__route__grpc__PromoCodeService/<Method>` plus `HANDLES` from each rpc back to its Route. | +| **Server side (consumer)** | | | +| `server-python/promo_server.py` | Python consumer | `PromoCodeServicer` inherits from generated `PromoCodeServiceServicer`. The `Servicer` suffix is stripped → service name `PromoCodeService`. Each method emits a `HANDLES` edge to the matching Route. | +| `server-csharp/PromoCodeService.cs` | .NET consumer | `PromoCodeServiceImpl` inherits from generated `PromoCodeServiceBase`. The `Base` suffix is stripped, and `*Async` method suffixes are stripped before route lookup. | +| `server-java/PromoCodeServiceImpl.java` | Java consumer | `PromoCodeServiceImpl` extends `PromoCodeServiceImplBase`. The `ImplBase` suffix is stripped (matched before `Base` — longer-first ordering). | +| **Client side (producer)** | | | +| `client-python/promo_client.py` | Python producer | `stub = PromoCodeServiceStub(channel)` records `stub` as a typed-stub variable; `stub.GetVoucher(...)` calls emit `GRPC_CALLS` edges with `{service: "PromoCodeService", method: "GetVoucher"}` properties to the local Route node.
| +| `client-csharp/PromoCodeClient.cs` | .NET producer | `_client = new PromoCodeServiceClient(channel)` records `_client` as a typed-stub variable; `_client.GetVoucherAsync(...)` calls have `Async` stripped before route lookup. | +| `client-java/PromoCodeClient.java` | Java producer | `stub = PromoCodeServiceGrpc.newBlockingStub(channel)` types `stub` as `PromoCodeServiceBlockingStub`; the `BlockingStub` suffix is stripped to derive `PromoCodeService`. Method calls use lowerCamelCase (`stub.getVoucher`) which is capitalized before route lookup. | -## Producer side +## How matching works end-to-end -Producer-side typed-client `GRPC_CALLS` emission is intentionally deferred to -follow-up Tier 1b. Once it lands, fixtures here will gain `client-csharp/`, -`client-go/`, `client-python/` directories whose calls into the generated stubs -get classified as `GRPC_CALLS` edges, completing the cross-repo round-trip via -the existing `pass_cross_repo.c` Phase D matcher (`match_typed_routes`). +After indexing the proto + a producer + a consumer: + +1. **`pass_idl_scan`** emits Route nodes from `.proto`-derived service/rpc Class+Function pairs. +2. **Consumer side**: walks `INHERITS` edges; classes inheriting `*Servicer` / `*ImplBase` / `*ServiceBase` / `*Base` get `HANDLES` edges from their methods to the matching Routes. +3. **Producer side**: walks per-file `type_assigns` to find variables typed as `*Stub` / `*BlockingStub` / `*FutureStub` / `*Client` / `*AsyncClient` / `*AsyncStub`. For each `var.Method(...)` call on such a variable, emits a `GRPC_CALLS` edge from the caller to the Route. +4. **`pass_cross_repo`** Phase D matcher (already in `main`): for each `GRPC_CALLS` edge, looks up the Route's QN in target-project DBs; if found, follows the target's `HANDLES` edge to the impl method; emits `CROSS_GRPC_CALLS` bidirectionally. ## Why fixtures stay tiny -These are reference snippets, not buildable projects — no `Cargo.toml`, -`pom.xml`, `requirements.txt`, `*.csproj`. 
Their only job is to give the -indexer realistic class shapes (suffix names, inheritance) for the unit tests -in `tests/test_pipeline.c` to mirror. +These are reference snippets, not buildable projects — no `Cargo.toml`, `pom.xml`, `requirements.txt`, `*.csproj`. Their job is to show the indexer realistic shapes (suffix patterns, inheritance, stub instantiation). Unit tests in `tests/test_pipeline.c` mirror these shapes with synthetic gbuf nodes + synthetic `CBMFileResult` extraction data so each rule is exercised in isolation. + +## What v1 doesn't cover + +- **Go grpc-go**: uses pointer types (`*PromoCodeClient`) and struct embedding (`UnimplementedPromoCodeServer`) instead of classical inheritance / class typing. Producer-side detection is feasible but needs additional plumbing in the call extractor; deferred to follow-up. +- **TypeScript `@grpc/grpc-js` dynamic clients**: stubs are generated at runtime from `.proto` rather than statically typed; out of scope for v1. +- **Generated stub source not indexed**: producer-side detection requires the producer repo to also index the `.proto` (vendored, submoduled, or inline). Repos that import only compiled stubs without source access fall through to ordinary `CALLS` edges. diff --git a/testdata/cross-repo/grpc/client-csharp/PromoCodeClient.cs b/testdata/cross-repo/grpc/client-csharp/PromoCodeClient.cs new file mode 100644 index 00000000..08011d2d --- /dev/null +++ b/testdata/cross-repo/grpc/client-csharp/PromoCodeClient.cs @@ -0,0 +1,59 @@ +// C# gRPC client fixture for IDL cross-repo binding tests. +// +// Mirrors what Grpc.Tools emits via protoc-gen-grpc-csharp: a generated nested +// client class named `<Service>.<Service>Client` constructed with a Grpc.Net +// channel; rpcs are invoked as `*Async` methods on the client instance.
+// +// The cbm pass_idl_scan should detect: +// - The `_client` field typed as `PromoCodeService.PromoCodeServiceClient` +// (suffix `Client`) +// - Strip the `Client` suffix → service name `PromoCodeService` +// - Match against the proto-derived `PromoCodeService` Class node +// - For each `_client.<Method>Async(...)` call within `FetchVoucherAsync`, +// strip the `Async` suffix and emit a `GRPC_CALLS` edge to the Route +// `__route__grpc__PromoCodeService/<Method>`. + +namespace Promo.V1.Client; + +// Stand-in for Grpc.Tools-generated PromoCodeServiceClient. +public class PromoCodeServiceClient +{ + public PromoCodeServiceClient(Grpc.Net.Client.GrpcChannel channel) { _channel = channel; } + private readonly Grpc.Net.Client.GrpcChannel _channel; + + public virtual System.Threading.Tasks.Task GetVoucherAsync( + GetVoucherRequest request, Grpc.Core.CallOptions options = default) + => System.Threading.Tasks.Task.FromResult(new GetVoucherResponse()); + + public virtual System.Threading.Tasks.Task RedeemVoucherAsync( + RedeemVoucherRequest request, Grpc.Core.CallOptions options = default) + => System.Threading.Tasks.Task.FromResult(new RedeemVoucherResponse()); +} + +public class CheckoutFlow +{ + private readonly PromoCodeServiceClient _client; + + public CheckoutFlow(Grpc.Net.Client.GrpcChannel channel) + { + _client = new PromoCodeServiceClient(channel); + } + + public async System.Threading.Tasks.Task FetchVoucherAsync(string voucherId) + { + var request = new GetVoucherRequest { VoucherId = voucherId }; + return await _client.GetVoucherAsync(request); + } + + public async System.Threading.Tasks.Task RedeemAsync(string voucherId, string userId) + { + var req = new RedeemVoucherRequest { VoucherId = voucherId, UserId = userId }; + var resp = await _client.RedeemVoucherAsync(req); + return resp.Success; + } +} + +public class GetVoucherRequest { public string VoucherId { get; set; } = ""; } +public class GetVoucherResponse { public string VoucherId { get; set; } = ""; public int
DiscountPercent { get; set; } public bool IsValid { get; set; } } +public class RedeemVoucherRequest { public string VoucherId { get; set; } = ""; public string UserId { get; set; } = ""; } +public class RedeemVoucherResponse { public bool Success { get; set; } public string ErrorMessage { get; set; } = ""; } diff --git a/testdata/cross-repo/grpc/client-java/PromoCodeClient.java b/testdata/cross-repo/grpc/client-java/PromoCodeClient.java new file mode 100644 index 00000000..6f00d752 --- /dev/null +++ b/testdata/cross-repo/grpc/client-java/PromoCodeClient.java @@ -0,0 +1,58 @@ +// Java gRPC client fixture for IDL cross-repo binding tests. +// +// Mirrors what protoc-gen-grpc-java emits: a stub factory `PromoCodeServiceGrpc` +// returns a typed `PromoCodeServiceBlockingStub`. RPC methods are invoked +// as lowerCamelCase methods on the stub. +// +// The cbm pass_idl_scan should detect: +// - The `stub` variable typed as `PromoCodeServiceGrpc.PromoCodeServiceBlockingStub` +// (suffix `BlockingStub`) +// - Strip the `BlockingStub` suffix → service name `PromoCodeService` +// - Match against the proto-derived `PromoCodeService` Class node +// - For each `stub.<method>(...)` call, capitalize the method name's first character and +// emit a `GRPC_CALLS` edge to the Route `__route__grpc__PromoCodeService/<Method>`. + +package com.example.promo.client; + +import io.grpc.Channel; +import io.grpc.ManagedChannelBuilder; + +// Stand-in for protoc-gen-grpc-java-generated PromoCodeServiceGrpc.
+class PromoCodeServiceGrpc { + public static PromoCodeServiceBlockingStub newBlockingStub(Channel channel) { + return new PromoCodeServiceBlockingStub(); + } + + public static class PromoCodeServiceBlockingStub { + public GetVoucherResponse getVoucher(GetVoucherRequest req) { return new GetVoucherResponse(); } + public RedeemVoucherResponse redeemVoucher(RedeemVoucherRequest req) { return new RedeemVoucherResponse(); } + } +} + +public class PromoCodeClient { + private final PromoCodeServiceGrpc.PromoCodeServiceBlockingStub stub; + + public PromoCodeClient(String target) { + Channel channel = ManagedChannelBuilder.forTarget(target).usePlaintext().build(); + this.stub = PromoCodeServiceGrpc.newBlockingStub(channel); + } + + public GetVoucherResponse fetchVoucher(String voucherId) { + GetVoucherRequest request = new GetVoucherRequest(); + request.voucherId = voucherId; + return stub.getVoucher(request); + } + + public boolean redeem(String voucherId, String userId) { + RedeemVoucherRequest req = new RedeemVoucherRequest(); + req.voucherId = voucherId; + req.userId = userId; + RedeemVoucherResponse resp = stub.redeemVoucher(req); + return resp.success; + } +} + +class GetVoucherRequest { public String voucherId; } +class GetVoucherResponse {} +class RedeemVoucherRequest { public String voucherId; public String userId; } +class RedeemVoucherResponse { public boolean success; } diff --git a/testdata/cross-repo/grpc/client-python/promo_client.py b/testdata/cross-repo/grpc/client-python/promo_client.py new file mode 100644 index 00000000..82fb5220 --- /dev/null +++ b/testdata/cross-repo/grpc/client-python/promo_client.py @@ -0,0 +1,38 @@ +"""Python gRPC client fixture for IDL cross-repo binding tests. + +Mirrors what protoc-gen-grpc-python emits: a stub class +PromoCodeServiceStub instantiated with a channel; rpcs are invoked as +methods on the stub instance. 
+ +The cbm pass_idl_scan should detect: + - The `stub` variable assigned to type `PromoCodeServiceStub` (suffix `Stub`) + - Strip the `Stub` suffix → service name `PromoCodeService` + - Match against the proto-derived `PromoCodeService` Class node + - For each `stub.(...)` call within `fetch_voucher`, emit a + `GRPC_CALLS` edge to the Route node + `__route__grpc__PromoCodeService/` with `{service, method}` props. +""" + +# Stand-in for protoc-gen-grpc-python output. Real code imports promo_pb2_grpc. +class PromoCodeServiceStub: + """Generated by protoc — placeholder to keep the fixture buildable in isolation.""" + def __init__(self, channel): + self._channel = channel + + def GetVoucher(self, request, timeout=None): + return None + + def RedeemVoucher(self, request, timeout=None): + return None + + +def fetch_voucher(channel, voucher_id): + stub = PromoCodeServiceStub(channel) + response = stub.GetVoucher({"voucher_id": voucher_id}) + return response + + +def redeem_voucher(channel, voucher_id, user_id): + stub = PromoCodeServiceStub(channel) + response = stub.RedeemVoucher({"voucher_id": voucher_id, "user_id": user_id}) + return response.success diff --git a/tests/test_pipeline.c b/tests/test_pipeline.c index 05d890ba..908609eb 100644 --- a/tests/test_pipeline.c +++ b/tests/test_pipeline.c @@ -5334,6 +5334,245 @@ TEST(idl_scan_binds_csharp_servicebase_subclass) { PASS(); } +/* Helper: zero-initialized synthetic CBMFileResult with caller-provided + * type_assigns / calls arrays. The arrays must outlive the test scope. 
*/ +static void mk_synthetic_result(CBMFileResult *r, CBMTypeAssign *ta_items, int ta_count, + CBMCall *call_items, int call_count) { + memset(r, 0, sizeof(*r)); + r->type_assigns.items = ta_items; + r->type_assigns.count = ta_count; + r->type_assigns.cap = ta_count; + r->calls.items = call_items; + r->calls.count = call_count; + r->calls.cap = call_count; +} + +TEST(idl_scan_emits_grpc_calls_for_python_stub) { + cbm_gbuf_t *gb = cbm_gbuf_new("test-proj", "/tmp/test"); + ASSERT_NOT_NULL(gb); + + /* Proto-derived service. */ + int64_t svc_id = cbm_gbuf_upsert_node(gb, "Class", "PromoCodeService", + "test-proj.contracts.promo.PromoCodeService", + "contracts/promo.proto", 1, 5, "{}"); + int64_t rpc_id = cbm_gbuf_upsert_node(gb, "Function", "GetVoucher", + "test-proj.contracts.promo.PromoCodeService.GetVoucher", + "contracts/promo.proto", 2, 2, "{}"); + cbm_gbuf_insert_edge(gb, svc_id, rpc_id, "DEFINES_METHOD", "{}"); + + /* Caller function on producer side. */ + int64_t caller_id = + cbm_gbuf_upsert_node(gb, "Function", "fetch_voucher", "test-proj.client.main.fetch_voucher", + "client/main.py", 10, 20, "{}"); + + CBMTypeAssign ta[] = {{ + .var_name = "stub", + .type_name = "promo_pb2_grpc.PromoCodeServiceStub", + .enclosing_func_qn = "test-proj.client.main.fetch_voucher", + }}; + CBMCall calls[] = {{ + .callee_name = "stub.GetVoucher", + .enclosing_func_qn = "test-proj.client.main.fetch_voucher", + }}; + CBMFileResult result; + mk_synthetic_result(&result, ta, 1, calls, 1); + + CBMFileResult *cache[1] = {&result}; + atomic_int cancelled = 0; + cbm_pipeline_ctx_t ctx = { + .project_name = "test-proj", + .repo_path = "/tmp/test", + .gbuf = gb, + .registry = NULL, + .cancelled = &cancelled, + .result_cache = cache, + }; + cbm_file_info_t fi = {.rel_path = "client/main.py"}; + + int rc = cbm_pipeline_pass_idl_scan(&ctx, &fi, 1); + ASSERT_EQ(rc, 0); + + const cbm_gbuf_node_t *route = + cbm_gbuf_find_by_qn(gb, "__route__grpc__PromoCodeService/GetVoucher"); + 
ASSERT_NOT_NULL(route); + + const cbm_gbuf_edge_t **grpc = NULL; + int grpc_count = 0; + cbm_gbuf_find_edges_by_source_type(gb, caller_id, "GRPC_CALLS", &grpc, &grpc_count); + ASSERT_EQ(grpc_count, 1); + ASSERT_EQ(grpc[0]->target_id, route->id); + ASSERT_NOT_NULL(strstr(grpc[0]->properties_json, "\"service\":\"PromoCodeService\"")); + ASSERT_NOT_NULL(strstr(grpc[0]->properties_json, "\"method\":\"GetVoucher\"")); + + cbm_gbuf_free(gb); + PASS(); +} + +TEST(idl_scan_emits_grpc_calls_for_csharp_client_with_async) { + cbm_gbuf_t *gb = cbm_gbuf_new("test-proj", "/tmp/test"); + ASSERT_NOT_NULL(gb); + + int64_t svc_id = cbm_gbuf_upsert_node(gb, "Class", "Greeter", + "test-proj.contracts.greet.Greeter", + "contracts/greet.proto", 1, 3, "{}"); + int64_t rpc_id = cbm_gbuf_upsert_node(gb, "Function", "SayHello", + "test-proj.contracts.greet.Greeter.SayHello", + "contracts/greet.proto", 2, 2, "{}"); + cbm_gbuf_insert_edge(gb, svc_id, rpc_id, "DEFINES_METHOD", "{}"); + + int64_t caller_id = cbm_gbuf_upsert_node(gb, "Method", "FetchGreeting", + "test-proj.client.GreetingService.FetchGreeting", + "client/GreetingService.cs", 15, 25, "{}"); + + /* C# `var client = new Greeter.GreeterClient(channel);` */ + CBMTypeAssign ta[] = {{ + .var_name = "client", + .type_name = "Greeter.GreeterClient", + .enclosing_func_qn = "test-proj.client.GreetingService.FetchGreeting", + }}; + /* `client.SayHelloAsync(req)` — Async suffix should be stripped before lookup. 
*/ + CBMCall calls[] = {{ + .callee_name = "client.SayHelloAsync", + .enclosing_func_qn = "test-proj.client.GreetingService.FetchGreeting", + }}; + CBMFileResult result; + mk_synthetic_result(&result, ta, 1, calls, 1); + + CBMFileResult *cache[1] = {&result}; + atomic_int cancelled = 0; + cbm_pipeline_ctx_t ctx = { + .project_name = "test-proj", + .repo_path = "/tmp/test", + .gbuf = gb, + .registry = NULL, + .cancelled = &cancelled, + .result_cache = cache, + }; + cbm_file_info_t fi = {.rel_path = "client/GreetingService.cs"}; + + int rc = cbm_pipeline_pass_idl_scan(&ctx, &fi, 1); + ASSERT_EQ(rc, 0); + + /* Route should match the bare rpc name (Async stripped). */ + const cbm_gbuf_node_t *route = cbm_gbuf_find_by_qn(gb, "__route__grpc__Greeter/SayHello"); + ASSERT_NOT_NULL(route); + + const cbm_gbuf_edge_t **grpc = NULL; + int grpc_count = 0; + cbm_gbuf_find_edges_by_source_type(gb, caller_id, "GRPC_CALLS", &grpc, &grpc_count); + ASSERT_EQ(grpc_count, 1); + ASSERT_EQ(grpc[0]->target_id, route->id); + + cbm_gbuf_free(gb); + PASS(); +} + +TEST(idl_scan_emits_grpc_calls_for_java_blocking_stub) { + cbm_gbuf_t *gb = cbm_gbuf_new("test-proj", "/tmp/test"); + ASSERT_NOT_NULL(gb); + + int64_t svc_id = cbm_gbuf_upsert_node(gb, "Class", "OrderService", + "test-proj.contracts.order.OrderService", + "contracts/order.proto", 1, 5, "{}"); + int64_t rpc_id = cbm_gbuf_upsert_node(gb, "Function", "PlaceOrder", + "test-proj.contracts.order.OrderService.PlaceOrder", + "contracts/order.proto", 2, 2, "{}"); + cbm_gbuf_insert_edge(gb, svc_id, rpc_id, "DEFINES_METHOD", "{}"); + + int64_t caller_id = + cbm_gbuf_upsert_node(gb, "Method", "submitOrder", + "test-proj.client.OrderClient.submitOrder", + "client/OrderClient.java", 30, 40, "{}"); + + /* Java `OrderServiceGrpc.OrderServiceBlockingStub stub = OrderServiceGrpc.newBlockingStub(ch);` + * — extractor records type as the BlockingStub class. 
*/ + CBMTypeAssign ta[] = {{ + .var_name = "stub", + .type_name = "OrderServiceGrpc.OrderServiceBlockingStub", + .enclosing_func_qn = "test-proj.client.OrderClient.submitOrder", + }}; + /* Java convention: rpc method `PlaceOrder` is invoked as `placeOrder` (lowerCamelCase). + * Pass should capitalize before route lookup. */ + CBMCall calls[] = {{ + .callee_name = "stub.placeOrder", + .enclosing_func_qn = "test-proj.client.OrderClient.submitOrder", + }}; + CBMFileResult result; + mk_synthetic_result(&result, ta, 1, calls, 1); + + CBMFileResult *cache[1] = {&result}; + atomic_int cancelled = 0; + cbm_pipeline_ctx_t ctx = { + .project_name = "test-proj", + .repo_path = "/tmp/test", + .gbuf = gb, + .registry = NULL, + .cancelled = &cancelled, + .result_cache = cache, + }; + cbm_file_info_t fi = {.rel_path = "client/OrderClient.java"}; + + int rc = cbm_pipeline_pass_idl_scan(&ctx, &fi, 1); + ASSERT_EQ(rc, 0); + + const cbm_gbuf_node_t *route = + cbm_gbuf_find_by_qn(gb, "__route__grpc__OrderService/PlaceOrder"); + ASSERT_NOT_NULL(route); + + const cbm_gbuf_edge_t **grpc = NULL; + int grpc_count = 0; + cbm_gbuf_find_edges_by_source_type(gb, caller_id, "GRPC_CALLS", &grpc, &grpc_count); + ASSERT_EQ(grpc_count, 1); + + cbm_gbuf_free(gb); + PASS(); +} + +TEST(idl_scan_skips_unknown_service_for_producer_call) { + /* HttpClient looks like a stub-suffix but isn't a known proto service — + * pass should not emit a GRPC_CALLS edge for it. 
*/ + cbm_gbuf_t *gb = cbm_gbuf_new("test-proj", "/tmp/test"); + ASSERT_NOT_NULL(gb); + + int64_t caller_id = cbm_gbuf_upsert_node(gb, "Function", "fetch", "test-proj.client.fetch", + "client/fetch.cs", 5, 10, "{}"); + + CBMTypeAssign ta[] = {{ + .var_name = "http", + .type_name = "System.Net.Http.HttpClient", + .enclosing_func_qn = "test-proj.client.fetch", + }}; + CBMCall calls[] = {{ + .callee_name = "http.GetAsync", + .enclosing_func_qn = "test-proj.client.fetch", + }}; + CBMFileResult result; + mk_synthetic_result(&result, ta, 1, calls, 1); + + CBMFileResult *cache[1] = {&result}; + atomic_int cancelled = 0; + cbm_pipeline_ctx_t ctx = { + .project_name = "test-proj", + .repo_path = "/tmp/test", + .gbuf = gb, + .registry = NULL, + .cancelled = &cancelled, + .result_cache = cache, + }; + cbm_file_info_t fi = {.rel_path = "client/fetch.cs"}; + + int rc = cbm_pipeline_pass_idl_scan(&ctx, &fi, 1); + ASSERT_EQ(rc, 0); + + const cbm_gbuf_edge_t **grpc = NULL; + int grpc_count = 0; + cbm_gbuf_find_edges_by_source_type(gb, caller_id, "GRPC_CALLS", &grpc, &grpc_count); + ASSERT_EQ(grpc_count, 0); + + cbm_gbuf_free(gb); + PASS(); +} + TEST(idl_scan_skips_non_proto_class) { cbm_gbuf_t *gb = cbm_gbuf_new("test-proj", "/tmp/test"); ASSERT_NOT_NULL(gb); @@ -5608,9 +5847,14 @@ SUITE(pipeline) { /* Project name edge cases */ RUN_TEST(project_name_special_chars); RUN_TEST(project_name_trailing_slash); - /* IDL scan (gRPC cross-repo) */ + /* IDL scan (gRPC cross-repo) — consumer side */ RUN_TEST(idl_scan_emits_route_from_proto_class); RUN_TEST(idl_scan_binds_python_servicer_subclass); RUN_TEST(idl_scan_binds_csharp_servicebase_subclass); RUN_TEST(idl_scan_skips_non_proto_class); + /* IDL scan — producer side (typed-client GRPC_CALLS) */ + RUN_TEST(idl_scan_emits_grpc_calls_for_python_stub); + RUN_TEST(idl_scan_emits_grpc_calls_for_csharp_client_with_async); + RUN_TEST(idl_scan_emits_grpc_calls_for_java_blocking_stub); + RUN_TEST(idl_scan_skips_unknown_service_for_producer_call); 
} From d94a5017409655c033e48580bd50fc9b391fc7b5 Mon Sep 17 00:00:00 2001 From: sponger94 <45746997+sponger94@users.noreply.github.com> Date: Sun, 26 Apr 2026 19:25:51 +0500 Subject: [PATCH 04/10] fix(idl_scan): support NuGet/Maven-distributed contracts in producer detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Real-world testing on a snoonu microservice fleet (gateway-service + loyalty-gateway + ~10 .NET services) exposed a critical gap: producer-side detection was gated on idl_service_set_contains(known_services, service), where known_services was populated only from .proto files in the SAME repo. In real fleets contracts ship via NuGet/Maven/PyPI packages (Snoonu.Promo.V1.Contracts, etc.) — the producer never has a local .proto and the gate filtered out every legitimate stub-var call. Three fixes: 1. Drop the local-proto gate. Producer-side detection runs on suffix shape alone (BlockingStub, FutureStub, AsyncStub, AsyncClient, Stub, Client). pass_cross_repo Phase D handles the actual cross-repo match by looking up Routes in target stores; non-matching GRPC_CALLS edges are inert (no CROSS_GRPC_CALLS) and cost only one stray local Route per unique stub type. 2. Add a type-name denylist (k_non_grpc_type_markers) for prefixes that end in "Client" but are definitively not gRPC: System.Net.*, Microsoft.Extensions.Http, RestSharp, Refit, Flurl, java.net.http, okhttp3, reqwest, urllib, httpx. Cuts off the obvious false-positive surface that the local-proto gate was masking. 3. Relax var-scope lookup with a file-scope fallback. C# class-field pattern (`_client = new XClient(channel)` in ctor → `_client.Method` in another method) has different enclosing_func_qn between assignment and call sites; the previous strict-match implementation missed it entirely. Lookup now prefers same-function scope, falls back to any same-var-name match in the file. 
Tests: - idl_scan_skips_unknown_service_for_producer_call → renamed to idl_scan_denylist_skips_httpclient (denylist-driven instead of proto-list-driven), plus assertion that no stray Route is emitted. - idl_scan_emits_grpc_calls_without_local_proto: producer detection fires when no Class node from a .proto exists in the gbuf. - idl_scan_resolves_class_field_assigned_in_constructor: ctor-scope assignment + method-scope call resolves via file-scope fallback. 10 idl_scan tests pass. 2609 tests pass overall, 0 regressions. Discovered during testing but out of scope for this PR (will be filed as a separate upstream issue): pass_parallel.c emit_grpc_edge emits Routes with QN format `__grpc__<service>/<method>` (without the `__route__` prefix) using greedy suffix-stripping (ServiceClient before Client), which produces phantom service names like `provider`, `builder`, `experimentProvider` from local var-name matching. These coexist in their own QN namespace; pass_idl_scan's `__route__grpc__` Routes are unaffected. --- src/pipeline/pass_idl_scan.c | 160 ++++++++++++++--------------------- tests/test_pipeline.c | 119 +++++++++++++++++++++++++- 2 files changed, 179 insertions(+), 100 deletions(-) diff --git a/src/pipeline/pass_idl_scan.c b/src/pipeline/pass_idl_scan.c index 0af2b2c8..874a85bb 100644 --- a/src/pipeline/pass_idl_scan.c +++ b/src/pipeline/pass_idl_scan.c @@ -48,7 +48,6 @@ enum { IDL_NAME_BUF = 256, IDL_LOG_BUF = 16, IDL_VAR_INIT_CAP = 16, - IDL_SVC_INIT_CAP = 8, }; #include "pipeline/pipeline.h" @@ -308,73 +307,56 @@ static int idl_bind_consumer_handlers(cbm_gbuf_t *gbuf) { * Go grpc-go uses pointer types like *PromoCodeClient produced by NewPromoCodeClient(conn); * extracting the type from the call expression rather than a typed assignment is feasible * but needs more plumbing — left as a follow-up. + * + * Note: producer-side detection runs WITHOUT requiring a local .proto file. 
+ * In real microservice fleets, contracts are commonly distributed via + * NuGet/Maven/PyPI packages (e.g. `Snoonu.PromoCodeService.Contracts`) so the + * consumer never has the .proto in source. pass_cross_repo's Phase D matches + * GRPC_CALLS edges against Route nodes in TARGET stores, so consumer-side + * indexing of the contracts repo (or wherever the .proto lives) is what + * actually closes the loop. False-positive surface is limited by: + * 1. Suffix shape (BlockingStub/FutureStub/Stub are gRPC-conventional) + * 2. Type-name denylist (System.Net.*, Microsoft.Extensions.Http, Refit, ...) + * 3. Phase D filter — non-matching GRPC_CALLS produce no CROSS_GRPC_CALLS */ static const char *const k_grpc_client_suffixes[] = { "BlockingStub", "FutureStub", "AsyncStub", "AsyncClient", "Stub", "Client", NULL, }; -/* In-pass index of proto-derived service names (Class nodes from .proto files). - * Built once during route emission so producer-side detection can validate the - * derived service name actually corresponds to an indexed gRPC service. */ -typedef struct { - char **names; - int count; - int cap; -} idl_service_set_t; - -static void idl_service_set_init(idl_service_set_t *s) { - s->names = NULL; - s->count = 0; - s->cap = 0; -} - -static void idl_service_set_add(idl_service_set_t *s, const char *name) { - if (!name || !name[0]) { - return; - } - for (int i = 0; i < s->count; i++) { - if (strcmp(s->names[i], name) == 0) { - return; - } - } - if (s->count >= s->cap) { - int new_cap = s->cap == 0 ? IDL_SVC_INIT_CAP : s->cap * 2; - char **grow = realloc(s->names, (size_t)new_cap * sizeof(char *)); - if (!grow) { - return; - } - s->names = grow; - s->cap = new_cap; - } - s->names[s->count] = strdup(name); - if (s->names[s->count]) { - s->count++; - } -} +/* Type-name prefixes that look like client/stub suffixes but are definitively + * NOT gRPC. Matched as substring against the (possibly qualified) type_name. 
+ * Keep the list short; Phase D filters anything that slips through. */ +static const char *const k_non_grpc_type_markers[] = { + "System.Net.", /* HttpClient, WebClient, TcpClient, etc. */ + "System.Web.", /* legacy WebForms / WebClient */ + "Microsoft.Extensions.Http", /* IHttpClientFactory, HttpClient DI */ + "RestSharp", /* REST client, not gRPC */ + "Refit", /* attribute-routed REST client */ + "Flurl", /* fluent HTTP client */ + "java.net.http", /* JDK HttpClient */ + "okhttp3", /* OkHttp */ + "reqwest", /* Rust HTTP */ + "urllib", /* Python HTTP */ + "httpx", /* Python HTTP */ + NULL, +}; -static bool idl_service_set_contains(const idl_service_set_t *s, const char *name) { - if (!name) { +static bool idl_type_is_denylisted(const char *type_name) { + if (!type_name) { return false; } - for (int i = 0; i < s->count; i++) { - if (strcmp(s->names[i], name) == 0) { + for (int i = 0; k_non_grpc_type_markers[i]; i++) { + if (strstr(type_name, k_non_grpc_type_markers[i]) != NULL) { return true; } } return false; } -static void idl_service_set_free(idl_service_set_t *s) { - for (int i = 0; i < s->count; i++) { - free(s->names[i]); - } - free(s->names); - s->names = NULL; - s->count = 0; - s->cap = 0; -} - -/* Per-function scoped record: var name → derived service name. */ +/* Per-file scoped record: var name → derived service name. enclosing_qn is + * preferred for matching but lookup falls back to file-scope when call-site + * and assignment-site scopes differ (e.g., C# class field assigned in ctor, + * accessed in a method). */ typedef struct { char *enclosing_qn; char *var_name; @@ -422,17 +404,23 @@ static const idl_stub_var_t *idl_stub_var_arr_find(const idl_stub_var_arr_t *a, if (!var_name) { return NULL; } + /* Pass 1: prefer same-function scope match. 
*/ + if (enclosing_qn && enclosing_qn[0]) { + for (int i = 0; i < a->count; i++) { + const idl_stub_var_t *e = &a->items[i]; + if (strcmp(e->var_name, var_name) == 0 && e->enclosing_qn && + strcmp(enclosing_qn, e->enclosing_qn) == 0) { + return e; + } + } + } + /* Pass 2: file-scope fallback. Covers class fields (assigned in ctor, + * called in methods) and module-scope vars accessed from inner functions. */ for (int i = 0; i < a->count; i++) { const idl_stub_var_t *e = &a->items[i]; - if (strcmp(e->var_name, var_name) != 0) { - continue; - } - /* Require enclosing QN match when both sides specify one; allow a NULL - * call-site enclosing to match any (module-scope variables). */ - if (enclosing_qn && e->enclosing_qn && strcmp(enclosing_qn, e->enclosing_qn) != 0) { - continue; + if (strcmp(e->var_name, var_name) == 0) { + return e; } - return e; } return NULL; } @@ -460,10 +448,17 @@ static const char *idl_type_basename(const char *qualified) { } /* Scan one CBMFileResult's type_assigns; for each assignment whose RHS type - * matches a stub/client suffix AND the suffix-stripped base name matches a - * known proto service, record (enclosing_qn, var_name, service_name). */ + * matches a stub/client suffix and is not denylisted, record + * (enclosing_qn, var_name, service_name). + * + * Detection runs WITHOUT requiring a local Route in the gbuf: in real + * microservice fleets contracts ship via NuGet/Maven/PyPI and the producer + * repo never has the .proto in source. Phase D in pass_cross_repo.c handles + * the actual cross-repo match; non-matching GRPC_CALLS edges produced here + * are inert (no CROSS_GRPC_CALLS) and cost is one stray local Route per + * unique stub type. The denylist (k_non_grpc_type_markers) cuts off the + * obvious false positives like System.Net.Http.HttpClient. 
*/ static void idl_collect_stub_vars_for_file(const CBMFileResult *result, - const idl_service_set_t *known_services, idl_stub_var_arr_t *out) { if (!result) { return; @@ -473,16 +468,15 @@ static void idl_collect_stub_vars_for_file(const CBMFileResult *result, if (!ta->var_name || !ta->type_name) { continue; } + if (idl_type_is_denylisted(ta->type_name)) { + continue; + } const char *base = idl_type_basename(ta->type_name); char *service = idl_strip_suffix(base, k_grpc_client_suffixes); if (!service || !service[0]) { free(service); continue; } - if (!idl_service_set_contains(known_services, service)) { - free(service); - continue; - } idl_stub_var_arr_push(out, ta->enclosing_func_qn, ta->var_name, service); free(service); } @@ -578,38 +572,13 @@ static int idl_emit_producer_edges_for_file(cbm_pipeline_ctx_t *ctx, const cbm_f return emitted; } -/* Build the proto-service set by scanning Class nodes with .proto file_path. */ -typedef struct { - idl_service_set_t *set; -} idl_svc_collect_ctx_t; - -static void idl_svc_collect_visitor(const cbm_gbuf_node_t *node, void *userdata) { - idl_svc_collect_ctx_t *c = (idl_svc_collect_ctx_t *)userdata; - if (!node || !node->label || !node->file_path || !node->name) { - return; - } - if (strcmp(node->label, "Class") == 0 && idl_is_proto_file(node->file_path)) { - idl_service_set_add(c->set, node->name); - } -} - static int idl_emit_producer_edges(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *files, int file_count) { if (!ctx->result_cache || file_count <= 0) { return 0; } - idl_service_set_t known_services; - idl_service_set_init(&known_services); - idl_svc_collect_ctx_t collect = {.set = &known_services}; - cbm_gbuf_foreach_node(ctx->gbuf, idl_svc_collect_visitor, &collect); - int total_emitted = 0; - if (known_services.count == 0) { - idl_service_set_free(&known_services); - return 0; - } - for (int i = 0; i < file_count; i++) { const CBMFileResult *result = ctx->result_cache[i]; if (!result) { @@ -617,14 +586,13 @@ static 
int idl_emit_producer_edges(cbm_pipeline_ctx_t *ctx, const cbm_file_info_ } idl_stub_var_arr_t stub_vars; idl_stub_var_arr_init(&stub_vars); - idl_collect_stub_vars_for_file(result, &known_services, &stub_vars); + idl_collect_stub_vars_for_file(result, &stub_vars); if (stub_vars.count > 0) { total_emitted += idl_emit_producer_edges_for_file(ctx, &files[i], result, &stub_vars); } idl_stub_var_arr_free(&stub_vars); } - idl_service_set_free(&known_services); return total_emitted; } diff --git a/tests/test_pipeline.c b/tests/test_pipeline.c index 908609eb..12ea067d 100644 --- a/tests/test_pipeline.c +++ b/tests/test_pipeline.c @@ -5528,9 +5528,9 @@ TEST(idl_scan_emits_grpc_calls_for_java_blocking_stub) { PASS(); } -TEST(idl_scan_skips_unknown_service_for_producer_call) { - /* HttpClient looks like a stub-suffix but isn't a known proto service — - * pass should not emit a GRPC_CALLS edge for it. */ +TEST(idl_scan_denylist_skips_httpclient) { + /* System.Net.Http.HttpClient ends in "Client" but is on the deny prefix + * list — pass should not emit a GRPC_CALLS edge or stray Route. */ cbm_gbuf_t *gb = cbm_gbuf_new("test-proj", "/tmp/test"); ASSERT_NOT_NULL(gb); @@ -5569,6 +5569,115 @@ TEST(idl_scan_skips_unknown_service_for_producer_call) { cbm_gbuf_find_edges_by_source_type(gb, caller_id, "GRPC_CALLS", &grpc, &grpc_count); ASSERT_EQ(grpc_count, 0); + /* Also assert no stray Route was emitted for "Http" (HttpClient minus Client). */ + ASSERT_NULL(cbm_gbuf_find_by_qn(gb, "__route__grpc__Http/GetAsync")); + + cbm_gbuf_free(gb); + PASS(); +} + +TEST(idl_scan_emits_grpc_calls_without_local_proto) { + /* NuGet/Maven/PyPI-distributed contracts pattern: producer repo has the + * generated client class assignment + call but NO local .proto file. + * Phase D in pass_cross_repo.c handles the actual cross-repo match + * against the consumer's Routes. Pass should still emit the GRPC_CALLS + * edge so Phase D has something to match. 
*/ + cbm_gbuf_t *gb = cbm_gbuf_new("gateway", "/tmp/gateway"); + ASSERT_NOT_NULL(gb); + + /* Note: NO Class node from a .proto file — gateway consumes via NuGet. */ + int64_t caller_id = + cbm_gbuf_upsert_node(gb, "Method", "FetchVoucher", + "gateway.Promo.PromoController.FetchVoucher", + "Controllers/PromoController.cs", 20, 35, "{}"); + + CBMTypeAssign ta[] = {{ + .var_name = "_promoClient", + .type_name = "Snoonu.Promo.V1.PromoCodeServiceClient", + .enclosing_func_qn = "gateway.Promo.PromoController.FetchVoucher", + }}; + CBMCall calls[] = {{ + .callee_name = "_promoClient.GetVoucherAsync", + .enclosing_func_qn = "gateway.Promo.PromoController.FetchVoucher", + }}; + CBMFileResult result; + mk_synthetic_result(&result, ta, 1, calls, 1); + + CBMFileResult *cache[1] = {&result}; + atomic_int cancelled = 0; + cbm_pipeline_ctx_t ctx = { + .project_name = "gateway", + .repo_path = "/tmp/gateway", + .gbuf = gb, + .registry = NULL, + .cancelled = &cancelled, + .result_cache = cache, + }; + cbm_file_info_t fi = {.rel_path = "Controllers/PromoController.cs"}; + + int rc = cbm_pipeline_pass_idl_scan(&ctx, &fi, 1); + ASSERT_EQ(rc, 0); + + /* Route is created locally even without a .proto — Phase D matches it + * against the consumer repo's Route by QN. */ + const cbm_gbuf_node_t *route = + cbm_gbuf_find_by_qn(gb, "__route__grpc__PromoCodeService/GetVoucher"); + ASSERT_NOT_NULL(route); + + const cbm_gbuf_edge_t **grpc = NULL; + int grpc_count = 0; + cbm_gbuf_find_edges_by_source_type(gb, caller_id, "GRPC_CALLS", &grpc, &grpc_count); + ASSERT_EQ(grpc_count, 1); + ASSERT_EQ(grpc[0]->target_id, route->id); + + cbm_gbuf_free(gb); + PASS(); +} + +TEST(idl_scan_resolves_class_field_assigned_in_constructor) { + /* C# pattern: `_client = new XClient(channel)` in the constructor, + * `_client.Method(...)` in another method on the same class. The + * assignment's enclosing_func_qn (ctor) differs from the call site's + * enclosing_func_qn (method). 
File-scope fallback should bridge them. */ + cbm_gbuf_t *gb = cbm_gbuf_new("svc", "/tmp/svc"); + ASSERT_NOT_NULL(gb); + + int64_t method_id = + cbm_gbuf_upsert_node(gb, "Method", "DoWork", "svc.Svc.MyClass.DoWork", + "Svc/MyClass.cs", 30, 40, "{}"); + + CBMTypeAssign ta[] = {{ + .var_name = "_client", + .type_name = "Promo.V1.PromoCodeServiceClient", + .enclosing_func_qn = "svc.Svc.MyClass.ctor", /* assigned in ctor */ + }}; + CBMCall calls[] = {{ + .callee_name = "_client.GetVoucherAsync", + .enclosing_func_qn = "svc.Svc.MyClass.DoWork", /* called from method */ + }}; + CBMFileResult result; + mk_synthetic_result(&result, ta, 1, calls, 1); + + CBMFileResult *cache[1] = {&result}; + atomic_int cancelled = 0; + cbm_pipeline_ctx_t ctx = { + .project_name = "svc", + .repo_path = "/tmp/svc", + .gbuf = gb, + .registry = NULL, + .cancelled = &cancelled, + .result_cache = cache, + }; + cbm_file_info_t fi = {.rel_path = "Svc/MyClass.cs"}; + + int rc = cbm_pipeline_pass_idl_scan(&ctx, &fi, 1); + ASSERT_EQ(rc, 0); + + const cbm_gbuf_edge_t **grpc = NULL; + int grpc_count = 0; + cbm_gbuf_find_edges_by_source_type(gb, method_id, "GRPC_CALLS", &grpc, &grpc_count); + ASSERT_EQ(grpc_count, 1); + cbm_gbuf_free(gb); PASS(); } @@ -5856,5 +5965,7 @@ SUITE(pipeline) { RUN_TEST(idl_scan_emits_grpc_calls_for_python_stub); RUN_TEST(idl_scan_emits_grpc_calls_for_csharp_client_with_async); RUN_TEST(idl_scan_emits_grpc_calls_for_java_blocking_stub); - RUN_TEST(idl_scan_skips_unknown_service_for_producer_call); + RUN_TEST(idl_scan_denylist_skips_httpclient); + RUN_TEST(idl_scan_emits_grpc_calls_without_local_proto); + RUN_TEST(idl_scan_resolves_class_field_assigned_in_constructor); } From ded4d2dd3cc9274b19208abb505687395f1ba13e Mon Sep 17 00:00:00 2001 From: sponger94 <45746997+sponger94@users.noreply.github.com> Date: Mon, 27 Apr 2026 17:12:27 +0500 Subject: [PATCH 05/10] feat(extract): support C# 12 primary-ctor params, factory patterns, property fields MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three small extractor changes that surface signal Tier 1 producer-side detection needs: 1. extract_defs.c — C# 12 primary-constructor parameters now emit Field defs scoped to the enclosing class. Iterates the class_declaration's parameter_list child (via field-name "parameters" or by direct child walk for grammars that don't surface the field name) and emits one Field per param with parent_class and return_type set. Modern .NET 8+/9+ controllers/services use this syntax as default; without it the class-field walker sees zero typed-client fields. 2. extract_type_assigns.c — recognize Go-style `pb.NewFooClient(ch)` and Java-style `fooGrpc.newBlockingStub(ch)` factory calls as constructor- typed assignments. Accepts qualified names whose last segment matches a typed-stub factory pattern (`New*Client`, `new*Stub`). 3. lang_specs.c — C# now uses cs_field_types (field_declaration + property_declaration) for its `field_types` slot, so property declarations also emit Field defs. --- internal/cbm/extract_defs.c | 87 ++++++++++++++++++++++++++++- internal/cbm/extract_type_assigns.c | 20 +++++++ internal/cbm/lang_specs.c | 3 +- 3 files changed, 107 insertions(+), 3 deletions(-) diff --git a/internal/cbm/extract_defs.c b/internal/cbm/extract_defs.c index b40d9f55..6257f472 100644 --- a/internal/cbm/extract_defs.c +++ b/internal/cbm/extract_defs.c @@ -1893,6 +1893,55 @@ static void extract_class_def(CBMExtractCtx *ctx, TSNode node, const CBMLangSpec // Extract class-level variables (field declarations) extract_class_variables(ctx, node, spec); + + // C# 12 primary-constructor parameters: declared on the class line + // (`class Foo(IBar bar, IBaz baz) : Base { ... }`) and bound to implicit + // captured fields accessible from any instance member. 
Tree-sitter c-sharp + // wraps them inside the hidden _class_declaration_initializer node, so the + // `parameters` field on class_declaration may not always resolve directly; + // iterate top-level children for parameter_list as a robust fallback. + if (ctx->language == CBM_LANG_CSHARP) { + TSNode primary_params = ts_node_child_by_field_name(node, TS_FIELD("parameters")); + if (ts_node_is_null(primary_params)) { + uint32_t total = ts_node_child_count(node); + for (uint32_t i = 0; i < total; i++) { + TSNode c = ts_node_child(node, i); + if (!ts_node_is_null(c) && strcmp(ts_node_type(c), "parameter_list") == 0) { + primary_params = c; + break; + } + } + } + if (!ts_node_is_null(primary_params)) { + uint32_t pcount = ts_node_child_count(primary_params); + for (uint32_t k = 0; k < pcount; k++) { + TSNode p = ts_node_child(primary_params, k); + if (ts_node_is_null(p) || !ts_node_is_named(p)) { + continue; + } + char *pname = resolve_param_name(a, p, ctx->source); + if (!pname || !pname[0]) { + continue; + } + char *ptype = resolve_param_type_text(a, p, ctx->source, ctx->language); + if (!ptype || !ptype[0]) { + continue; + } + CBMDefinition pdef; + memset(&pdef, 0, sizeof(pdef)); + pdef.name = pname; + pdef.qualified_name = cbm_arena_sprintf(a, "%s.%s", class_qn, pname); + pdef.label = "Field"; + pdef.file_path = ctx->rel_path; + pdef.parent_class = class_qn; + pdef.return_type = ptype; + pdef.start_line = ts_node_start_point(p).row + TS_LINE_OFFSET; + pdef.end_line = ts_node_end_point(p).row + TS_LINE_OFFSET; + pdef.is_exported = false; + cbm_defs_push(&ctx->result->defs, a, pdef); + } + } + } } // Find the body/members node inside a class node @@ -2049,6 +2098,7 @@ static void push_method_def(CBMExtractCtx *ctx, TSNode child, const char *class_ TSNode params = ts_node_child_by_field_name(child, TS_FIELD("parameters")); if (!ts_node_is_null(params)) { def.signature = cbm_node_text(a, params, ctx->source); + def.param_names = extract_param_names(a, params, ctx->source, 
ctx->language); def.param_types = extract_param_types(a, params, ctx->source, ctx->language); } @@ -2207,6 +2257,7 @@ static void extract_rust_impl(CBMExtractCtx *ctx, TSNode node, const CBMLangSpec TSNode params = ts_node_child_by_field_name(child, TS_FIELD("parameters")); if (!ts_node_is_null(params)) { def.signature = cbm_node_text(a, params, ctx->source); + def.param_names = extract_param_names(a, params, ctx->source, ctx->language); def.param_types = extract_param_types(a, params, ctx->source, ctx->language); } @@ -3203,8 +3254,41 @@ static void extract_class_fields(CBMExtractCtx *ctx, TSNode class_node, const ch continue; } - // Extract type from "type" field + /* Locate the field's "type" + name node. Two shapes: + * - direct (Java/Go/Rust/C/C++): + * field_declaration .type=identifier .declarator=variable_declarator(.name) + * - nested (C#): + * field_declaration > variable_declaration(.type=identifier, + * variable_declarator(.name)) + * For the nested case, the child has no "type" field directly. Detect by + * walking named children for a variable_declaration. */ TSNode type_node = ts_node_child_by_field_name(child, TS_FIELD("type")); + TSNode name_node = ts_node_is_null(type_node) ? (TSNode){0} : resolve_field_name_node(child); + + if (ts_node_is_null(type_node)) { + uint32_t cnc = ts_node_named_child_count(child); + for (uint32_t k = 0; k < cnc; k++) { + TSNode inner = ts_node_named_child(child, k); + if (strcmp(ts_node_type(inner), "variable_declaration") != 0) { + continue; + } + type_node = ts_node_child_by_field_name(inner, TS_FIELD("type")); + /* Find first variable_declarator child for the name. 
*/ + uint32_t nc = ts_node_named_child_count(inner); + for (uint32_t j = 0; j < nc; j++) { + TSNode vd = ts_node_named_child(inner, j); + if (strcmp(ts_node_type(vd), "variable_declarator") == 0) { + TSNode nm = ts_node_child_by_field_name(vd, TS_FIELD("name")); + if (!ts_node_is_null(nm)) { + name_node = nm; + break; + } + } + } + break; + } + } + if (ts_node_is_null(type_node)) { continue; } @@ -3213,7 +3297,6 @@ static void extract_class_fields(CBMExtractCtx *ctx, TSNode class_node, const ch continue; } - TSNode name_node = resolve_field_name_node(child); if (ts_node_is_null(name_node)) { continue; } diff --git a/internal/cbm/extract_type_assigns.c b/internal/cbm/extract_type_assigns.c index 74ead85f..47fcbf82 100644 --- a/internal/cbm/extract_type_assigns.c +++ b/internal/cbm/extract_type_assigns.c @@ -55,6 +55,26 @@ static const char *extract_constructor_type(CBMArena *a, TSNode rhs, const char if (fname && fname[0] >= 'A' && fname[0] <= 'Z') { return fname; } + /* Lower-cased package prefix: Go-style `pb.NewFooClient(...)` and + * Java-style `fooGrpc.newBlockingStub(...)`. Accept the qualified + * name when the last segment matches a typed-stub factory pattern. + * Downstream pass_idl_scan handles factory→service inference; we + * just pass enough information through type_assigns. */ + if (fname && fname[0]) { + const char *last = strrchr(fname, '.'); + last = last ? 
last + 1 : fname; + bool is_factory = false; + if ((strncmp(last, "New", 3) == 0 || strncmp(last, "new", 3) == 0) && last[3]) { + size_t llen = strlen(last); + if ((llen > 6 && strcmp(last + llen - 6, "Client") == 0) || + (llen > 4 && strcmp(last + llen - 4, "Stub") == 0)) { + is_factory = true; + } + } + if (is_factory) { + return fname; + } + } } } diff --git a/internal/cbm/lang_specs.c b/internal/cbm/lang_specs.c index 9cb6caa3..93341a71 100644 --- a/internal/cbm/lang_specs.c +++ b/internal/cbm/lang_specs.c @@ -332,6 +332,7 @@ static const char *cs_branch_types[] = {"if_statement", "for_statement", " "while_statement", "switch_statement", "case_switch_label", "try_statement", "catch_clause", NULL}; static const char *cs_var_types[] = {"field_declaration", "local_declaration_statement", NULL}; +static const char *cs_field_types[] = {"field_declaration", "property_declaration", NULL}; static const char *cs_assign_types[] = {"assignment_expression", NULL}; static const char *cs_throw_types[] = {"throw_statement", "throw_expression", NULL}; static const char *cs_decorator_types[] = {"attribute", NULL}; @@ -1534,7 +1535,7 @@ static const CBMLangSpec lang_specs[CBM_LANG_COUNT] = { empty_types, cpp_env_funcs, NULL, tree_sitter_cpp}, // CBM_LANG_CSHARP - [CBM_LANG_CSHARP] = {CBM_LANG_CSHARP, cs_func_types, cs_class_types, empty_types, + [CBM_LANG_CSHARP] = {CBM_LANG_CSHARP, cs_func_types, cs_class_types, cs_field_types, cs_module_types, cs_call_types, cs_import_types, cs_import_types, cs_branch_types, cs_var_types, cs_assign_types, cs_throw_types, NULL, cs_decorator_types, cs_env_funcs, NULL, tree_sitter_c_sharp}, From a020ec3bebc90b9148999a383b2a4b88bfa06293 Mon Sep 17 00:00:00 2001 From: sponger94 <45746997+sponger94@users.noreply.github.com> Date: Mon, 27 Apr 2026 17:13:01 +0500 Subject: [PATCH 06/10] feat(idl_scan): producer-side typed-client detection for Tier 1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends 
pass_idl_scan with the producer-side signal sources from the cross-repo intelligence proposal, plus production-readiness fixes exposed when running tier1 against a real .NET microservice fleet with NuGet-distributed proto contracts and C# 12 primary constructors. Producer-side detection — for each call `var.Method(...)` whose receiver var resolves to a known stub type, emit a `GRPC_CALLS` edge to a local Route. Stub vars are discovered via four signal sources: * Constructor-parameter tracking: walk gbuf Method nodes that look like constructors; for each ctor param whose type matches a stub-suffix pattern (or appears in the DI registry below), record (class_qn, param_name, service_name) with class-wide scope. * Factory-function inference: when type_assigns gives us `var = pb.NewFooClient(...)` / `fooGrpc.newBlockingStub(...)`, derive the service from the factory's last segment by stripping `New`/`new` and the trailing `Client`/`Stub` suffix. * DI-registration scanning: harvests stub-type FQNs from `services.AddGrpcClient<TClient>(...)`, `@GrpcClient(...)` annotations, and NestJS-style `@Client({...})` decorators. Stub vars whose declared type is in this registry are treated as gRPC clients even without the conventional suffix. * Field/property type tracking: for class fields whose declared type matches a stub-suffix pattern or DI-registered FQN, record (class_qn, field_name, service_name) with class-wide scope. Production-readiness fixes: * Proto rpc → service mapping fallback. tree-sitter-protobuf emits rpc Functions as flat siblings of the service Class rather than children, so DEFINES_METHOD edges may not exist. When that happens, match rpc Functions by file_path equality + start_line/end_line containment within the service Class. Optimized to O(N+F) via a single pre-pass that collects proto Classes and Functions into flat arrays (avoids quadratic blowup on heavy proto-defining repos). * Safer stub-var fallback. 
idl_stub_var_arr_find_ext() takes a new allow_name_only_fallback flag. The class_vars lookup (project-wide) passes false so a class-scope variable can only match calls whose enclosing function lives under the same class; without this guard two unrelated classes both declaring `_client` would silently bind to each other's typed-client and emit wrong GRPC_CALLS edges. * Cross-package collision visibility. Routes are still keyed __route__grpc__<service>/<method> using the bare service name, since cross-repo matching joins on that key and the consumer side has no proto-package source. When a second .proto with the same bare key is upserted, log a warning at idl_scan.route_collision so the operator sees the ambiguity, and write the service node's qualified_name as a service_qn property so a future FQN-aware matcher can recover provenance. The full FQN-keyed Route data model (Tier 1g) is intentionally deferred to a focused follow-on PR — see .planning/cbm-cross-repo-proposal.md §5.7 for the rationale and four-piece sequencing. --- src/pipeline/pass_idl_scan.c | 819 ++++++++++++++++++++++++++++++++--- 1 file changed, 747 insertions(+), 72 deletions(-) diff --git a/src/pipeline/pass_idl_scan.c b/src/pipeline/pass_idl_scan.c index 874a85bb..589bbb2d 100644 --- a/src/pipeline/pass_idl_scan.c +++ b/src/pipeline/pass_idl_scan.c @@ -48,6 +48,8 @@ enum { IDL_NAME_BUF = 256, IDL_LOG_BUF = 16, IDL_VAR_INIT_CAP = 16, + IDL_PARAM_MAX = 32, + IDL_DI_INIT_CAP = 16, }; #include "pipeline/pipeline.h" @@ -134,7 +136,15 @@ static void idl_build_route_qn(char *buf, size_t bufsz, const char *service, con snprintf(buf, bufsz, "__route__grpc__%s/%s", service, method); } -/* Emit a single Route node + HANDLES edge from the rpc method node. */ +/* Emit a single Route node + HANDLES edge from the rpc method node. + * + * Cross-package collision visibility (Gap 7 mitigation): when two .proto + * files declare the same /, the second emission's upsert + * silently overwrites the first's properties. 
Logs a warning at + * idl_scan.route_collision when the existing Route's file_path differs, + * and writes the service node's qualified_name as a service_qn property + * so a future FQN-aware matcher (Tier 1g) can recover provenance even + * after the upsert. */ static void idl_emit_route_for_rpc(cbm_gbuf_t *gbuf, const cbm_gbuf_node_t *service_node, const cbm_gbuf_node_t *rpc_node, int *out_count) { if (!service_node->name || !rpc_node->name) { @@ -146,14 +156,24 @@ static void idl_emit_route_for_rpc(cbm_gbuf_t *gbuf, const cbm_gbuf_node_t *serv char display[IDL_QN_BUF]; snprintf(display, sizeof(display), "%s/%s", service_node->name, rpc_node->name); + const char *src_file = service_node->file_path ? service_node->file_path : ""; + + /* Detect cross-package collision via file_path mismatch on existing + * Route. Useful as an operator signal; does not change emission. */ + const cbm_gbuf_node_t *existing = cbm_gbuf_find_by_qn(gbuf, qn); + if (existing && existing->file_path && existing->file_path[0] && src_file[0] && + strcmp(existing->file_path, src_file) != 0) { + cbm_log_warn("idl_scan.route_collision", "qn", qn, "first_file", existing->file_path, + "second_file", src_file); + } + char props[IDL_PROPS_BUF]; snprintf(props, sizeof(props), - "{\"protocol\":\"grpc\",\"service\":\"%s\",\"method\":\"%s\"}", service_node->name, - rpc_node->name); + "{\"protocol\":\"grpc\",\"service\":\"%s\",\"method\":\"%s\",\"service_qn\":\"%s\"}", + service_node->name, rpc_node->name, + service_node->qualified_name ? service_node->qualified_name : service_node->name); - int64_t route_id = - cbm_gbuf_upsert_node(gbuf, "Route", display, qn, - service_node->file_path ? 
service_node->file_path : "", 0, 0, props); + int64_t route_id = cbm_gbuf_upsert_node(gbuf, "Route", display, qn, src_file, 0, 0, props); if (route_id <= 0) { return; } @@ -170,34 +190,103 @@ typedef struct { int routes; } idl_walk_ctx_t; -static void idl_proto_class_visitor(const cbm_gbuf_node_t *node, void *userdata) { - idl_walk_ctx_t *ctx = (idl_walk_ctx_t *)userdata; - if (!node || !node->label || !node->file_path) { - return; +/* Pre-collected per-pass index of proto-file nodes. Built in a single O(M) + * walk before per-Class processing so the fallback rpc-by-line-range match + * stays linear in the number of proto Functions, not quadratic in gbuf size. */ +typedef struct { + const cbm_gbuf_node_t **classes; + int class_count; + int class_cap; + const cbm_gbuf_node_t **functions; + int func_count; + int func_cap; +} idl_proto_index_t; + +static void idl_proto_index_init(idl_proto_index_t *idx) { + memset(idx, 0, sizeof(*idx)); +} + +static void idl_proto_index_free(idl_proto_index_t *idx) { + free(idx->classes); + free(idx->functions); + memset(idx, 0, sizeof(*idx)); +} + +static void idl_proto_index_push_class(idl_proto_index_t *idx, const cbm_gbuf_node_t *node) { + if (idx->class_count >= idx->class_cap) { + int nc = idx->class_cap == 0 ? IDL_VAR_INIT_CAP : idx->class_cap * 2; + const cbm_gbuf_node_t **g = realloc(idx->classes, (size_t)nc * sizeof(*idx->classes)); + if (!g) return; + idx->classes = g; + idx->class_cap = nc; } - if (strcmp(node->label, "Class") != 0) { + idx->classes[idx->class_count++] = node; +} + +static void idl_proto_index_push_function(idl_proto_index_t *idx, const cbm_gbuf_node_t *node) { + if (idx->func_count >= idx->func_cap) { + int nc = idx->func_cap == 0 ? 
IDL_VAR_INIT_CAP : idx->func_cap * 2; + const cbm_gbuf_node_t **g = realloc(idx->functions, (size_t)nc * sizeof(*idx->functions)); + if (!g) return; + idx->functions = g; + idx->func_cap = nc; + } + idx->functions[idx->func_count++] = node; +} + +static void idl_proto_index_visitor(const cbm_gbuf_node_t *node, void *userdata) { + idl_proto_index_t *idx = (idl_proto_index_t *)userdata; + if (!node || !node->label || !node->file_path) { return; } if (!idl_is_proto_file(node->file_path)) { return; } - /* Iterate DEFINES_METHOD edges to find rpc method nodes. */ + if (strcmp(node->label, "Class") == 0) { + idl_proto_index_push_class(idx, node); + } else if (strcmp(node->label, "Function") == 0) { + idl_proto_index_push_function(idx, node); + } +} + +/* Process one proto service Class. Tries DEFINES_METHOD edges first; falls + * back to file_path + line-range scan over the pre-collected proto Functions + * (typical for tree-sitter-protobuf, which emits rpc Functions as flat + * siblings of the service Class rather than children). 
*/ +static void idl_emit_routes_for_proto_class(cbm_gbuf_t *gbuf, const cbm_gbuf_node_t *cls, + const idl_proto_index_t *idx, int *services, + int *routes) { const cbm_gbuf_edge_t **edges = NULL; int edge_count = 0; - if (cbm_gbuf_find_edges_by_source_type(ctx->gbuf, node->id, "DEFINES_METHOD", &edges, - &edge_count) != 0) { - return; - } - if (edge_count == 0) { + if (cbm_gbuf_find_edges_by_source_type(gbuf, cls->id, "DEFINES_METHOD", &edges, &edge_count) == + 0 && + edge_count > 0) { + (*services)++; + for (int i = 0; i < edge_count; i++) { + const cbm_gbuf_node_t *rpc = cbm_gbuf_find_by_id(gbuf, edges[i]->target_id); + if (rpc) { + idl_emit_route_for_rpc(gbuf, cls, rpc, routes); + } + } return; } - ctx->services++; - for (int i = 0; i < edge_count; i++) { - const cbm_gbuf_node_t *rpc = cbm_gbuf_find_by_id(ctx->gbuf, edges[i]->target_id); - if (!rpc) { + + /* Fallback: filter pre-collected proto Functions by same file_path + + * line-range containment. O(F) per class; F is small (proto-file rpcs). */ + int matched = 0; + for (int i = 0; i < idx->func_count; i++) { + const cbm_gbuf_node_t *fn = idx->functions[i]; + if (!fn->file_path || strcmp(fn->file_path, cls->file_path) != 0) { continue; } - idl_emit_route_for_rpc(ctx->gbuf, node, rpc, &ctx->routes); + if (fn->start_line < cls->start_line || fn->end_line > cls->end_line) { + continue; + } + if (matched == 0) { + (*services)++; + } + matched++; + idl_emit_route_for_rpc(gbuf, cls, fn, routes); } } @@ -353,12 +442,23 @@ static bool idl_type_is_denylisted(const char *type_name) { return false; } -/* Per-file scoped record: var name → derived service name. enclosing_qn is - * preferred for matching but lookup falls back to file-scope when call-site - * and assignment-site scopes differ (e.g., C# class field assigned in ctor, - * accessed in a method). */ +/* Var → service map entry. Scope is determined by which fields are populated: + * + * function-scope: enclosing_qn set, class_qn empty. 
+ * Matches when a call's enclosing_func_qn equals enclosing_qn exactly. + * Source: type_assigns (Tier 1b), local var = SomeStub(...). + * + * class-scope: class_qn set, enclosing_qn empty. + * Matches when a call's enclosing_func_qn starts with class_qn + ".". + * Source: ctor params (Tier 1c) + class fields (Tier 1f). + * + * file-scope fallback: any var match within an array attached to one file. + * Used when both function and class scope miss but the call's file's + * per-file array has the var name. Same as #293 behavior. + */ typedef struct { char *enclosing_qn; + char *class_qn; char *var_name; char *service_name; } idl_stub_var_t; @@ -375,8 +475,9 @@ static void idl_stub_var_arr_init(idl_stub_var_arr_t *a) { a->cap = 0; } -static void idl_stub_var_arr_push(idl_stub_var_arr_t *a, const char *enclosing_qn, - const char *var_name, const char *service_name) { +static void idl_stub_var_arr_push_scoped(idl_stub_var_arr_t *a, const char *enclosing_qn, + const char *class_qn, const char *var_name, + const char *service_name) { if (a->count >= a->cap) { int new_cap = a->cap == 0 ? IDL_VAR_INIT_CAP : a->cap * 2; idl_stub_var_t *grow = realloc(a->items, (size_t)new_cap * sizeof(idl_stub_var_t)); @@ -387,24 +488,61 @@ static void idl_stub_var_arr_push(idl_stub_var_arr_t *a, const char *enclosing_q a->cap = new_cap; } idl_stub_var_t *e = &a->items[a->count]; - e->enclosing_qn = enclosing_qn ? strdup(enclosing_qn) : NULL; + e->enclosing_qn = (enclosing_qn && enclosing_qn[0]) ? strdup(enclosing_qn) : NULL; + e->class_qn = (class_qn && class_qn[0]) ? 
strdup(class_qn) : NULL; e->var_name = strdup(var_name); e->service_name = strdup(service_name); if (e->var_name && e->service_name) { a->count++; } else { free(e->enclosing_qn); + free(e->class_qn); free(e->var_name); free(e->service_name); } } -static const idl_stub_var_t *idl_stub_var_arr_find(const idl_stub_var_arr_t *a, - const char *enclosing_qn, const char *var_name) { +static void idl_stub_var_arr_push(idl_stub_var_arr_t *a, const char *enclosing_qn, + const char *var_name, const char *service_name) { + idl_stub_var_arr_push_scoped(a, enclosing_qn, NULL, var_name, service_name); +} + +/* True when call_qn is "." — i.e., the call's enclosing + * function is a method of the given class. Matches both single-method and + * nested-scope cases. */ +static bool idl_qn_in_class(const char *call_qn, const char *class_qn) { + if (!call_qn || !class_qn || !class_qn[0]) { + return false; + } + size_t cl = strlen(class_qn); + if (strncmp(call_qn, class_qn, cl) != 0) { + return false; + } + return call_qn[cl] == '.'; +} + +/* Look up a stub var with scope priority: + * 1. function-scope exact match + * 2. class-scope (call enclosing is method of var's class) + * 3. file-scope fallback (any matching var name in this array) + */ +/* allow_name_only_fallback: + * true — Pass 3 (name-only match) is allowed. Safe for per-file arrays + * where every entry is in the caller's translation unit and the + * worst-case false positive stays inside one file. + * false — Pass 3 is suppressed. Required for project-wide arrays + * (class_vars). Without a class/file check, a bare name match + * could bind a `_client` call to an unrelated class's stub and + * emit a wrong (CROSS_)GRPC_CALLS edge. + */ +static const idl_stub_var_t *idl_stub_var_arr_find_ext(const idl_stub_var_arr_t *a, + const char *enclosing_qn, + const char *var_name, + bool allow_name_only_fallback) { if (!var_name) { return NULL; } - /* Pass 1: prefer same-function scope match. 
*/ + /* Pass 1: function-scope exact match. */ if (enclosing_qn && enclosing_qn[0]) { for (int i = 0; i < a->count; i++) { const idl_stub_var_t *e = &a->items[i]; @@ -414,20 +552,38 @@ static const idl_stub_var_t *idl_stub_var_arr_find(const idl_stub_var_arr_t *a, } } } - /* Pass 2: file-scope fallback. Covers class fields (assigned in ctor, - * called in methods) and module-scope vars accessed from inner functions. */ - for (int i = 0; i < a->count; i++) { - const idl_stub_var_t *e = &a->items[i]; - if (strcmp(e->var_name, var_name) == 0) { - return e; + /* Pass 2: class-scope (var declared on the class whose method is calling). */ + if (enclosing_qn && enclosing_qn[0]) { + for (int i = 0; i < a->count; i++) { + const idl_stub_var_t *e = &a->items[i]; + if (strcmp(e->var_name, var_name) == 0 && e->class_qn && + idl_qn_in_class(enclosing_qn, e->class_qn)) { + return e; + } + } + } + /* Pass 3: name-only fallback — only when the array is known to be + * single-file-scope. Project-wide arrays must fail closed here. */ + if (allow_name_only_fallback) { + for (int i = 0; i < a->count; i++) { + const idl_stub_var_t *e = &a->items[i]; + if (strcmp(e->var_name, var_name) == 0) { + return e; + } } } return NULL; } +static const idl_stub_var_t *idl_stub_var_arr_find(const idl_stub_var_arr_t *a, + const char *enclosing_qn, const char *var_name) { + return idl_stub_var_arr_find_ext(a, enclosing_qn, var_name, true); +} + static void idl_stub_var_arr_free(idl_stub_var_arr_t *a) { for (int i = 0; i < a->count; i++) { free(a->items[i].enclosing_qn); + free(a->items[i].class_qn); free(a->items[i].var_name); free(a->items[i].service_name); } @@ -437,6 +593,66 @@ static void idl_stub_var_arr_free(idl_stub_var_arr_t *a) { a->cap = 0; } +/* DI-registered stub-type registry (Tier 1e). Holds FQNs or basenames of types + * that have been positively identified as gRPC stubs even when they don't carry + * the conventional Stub/Client suffix. 
Populated by scanning AddGrpcClient + * call sites and @GrpcClient annotations. Consumed by Tier 1c/1f to upgrade + * non-suffix types into stub vars. */ +typedef struct { + char **types; + int count; + int cap; +} idl_di_registry_t; + +static void idl_di_registry_init(idl_di_registry_t *r) { + r->types = NULL; + r->count = 0; + r->cap = 0; +} + +static void idl_di_registry_free(idl_di_registry_t *r) { + for (int i = 0; i < r->count; i++) { + free(r->types[i]); + } + free(r->types); + r->types = NULL; + r->count = 0; + r->cap = 0; +} + +static void idl_di_registry_add(idl_di_registry_t *r, const char *type_name) { + if (!type_name || !type_name[0]) { + return; + } + for (int i = 0; i < r->count; i++) { + if (strcmp(r->types[i], type_name) == 0) { + return; + } + } + if (r->count >= r->cap) { + int new_cap = r->cap == 0 ? IDL_DI_INIT_CAP : r->cap * 2; + char **grow = realloc(r->types, (size_t)new_cap * sizeof(char *)); + if (!grow) { + return; + } + r->types = grow; + r->cap = new_cap; + } + r->types[r->count++] = strdup(type_name); +} + +static bool idl_di_registry_contains(const idl_di_registry_t *r, const char *type_name) { + if (!r || !type_name) { + return false; + } + for (int i = 0; i < r->count; i++) { + if (strcmp(r->types[i], type_name) == 0) { + return true; + } + } + return false; +} + /* Get the unqualified (basename) form of a possibly-qualified type name. * "promo_pb2_grpc.PromoCodeStub" → "PromoCodeStub". */ static const char *idl_type_basename(const char *qualified) { @@ -447,6 +663,195 @@ static const char *idl_type_basename(const char *qualified) { return dot ? dot + 1 : qualified; } +/* Tier 1d — Factory function return-type inference. + * + * Resolve a service name from a factory call's qualified text. 
Handles: + * Java/Kotlin: "FooGrpc.newBlockingStub" → "Foo" + * "FooGrpc.newFutureStub" → "Foo" + * "FooGrpc.newAsyncStub" → "Foo" + * "FooGrpc.newStub" → "Foo" + * Go: "pb.NewFooClient" → "Foo" + * "client.NewFooClient" → "Foo" + * Rust: "FooClient.connect" → handled by suffix path, not here + * + * Returns heap-allocated service name or NULL on miss. Caller frees. */ +static char *idl_factory_to_service(const char *qualified_type) { + if (!qualified_type) { + return NULL; + } + const char *dot = strrchr(qualified_type, '.'); + const char *factory = dot ? dot + 1 : qualified_type; + size_t pkg_len = dot ? (size_t)(dot - qualified_type) : 0; + + /* Strip "New" / "new" prefix from factory name. */ + const char *body = NULL; + if (strncmp(factory, "New", 3) == 0 && factory[3]) { + body = factory + 3; + } else if (strncmp(factory, "new", 3) == 0 && factory[3]) { + body = factory + 3; + } else { + return NULL; + } + + /* Case A: body has a stub/client suffix where prefix carries the service. + * Go: "FooClient" → "Foo". */ + char *stripped = idl_strip_suffix(body, k_grpc_client_suffixes); + if (stripped && stripped[0]) { + return stripped; + } + free(stripped); + + /* Case B: body is the raw stub kind ("BlockingStub"/"FutureStub"/...). + * Service lives in the package prefix. Java: "FooGrpc.newBlockingStub". */ + if (pkg_len == 0) { + return NULL; + } + bool body_is_pure_suffix = false; + for (int i = 0; k_grpc_client_suffixes[i]; i++) { + if (strcmp(body, k_grpc_client_suffixes[i]) == 0) { + body_is_pure_suffix = true; + break; + } + } + if (!body_is_pure_suffix) { + return NULL; + } + char pkg[IDL_NAME_BUF]; + size_t copy = pkg_len < sizeof(pkg) - 1 ? pkg_len : sizeof(pkg) - 1; + memcpy(pkg, qualified_type, copy); + pkg[copy] = '\0'; + /* Take last segment if pkg is itself qualified ("com.foo.FooGrpc" → "FooGrpc"). */ + char *last_seg = strrchr(pkg, '.'); + const char *seg = last_seg ? 
last_seg + 1 : pkg; + static const char *const k_pkg_suffixes[] = {"Grpc", NULL}; + char *svc = idl_strip_suffix(seg, k_pkg_suffixes); + if (svc && svc[0]) { + return svc; + } + free(svc); + return NULL; +} + +/* Extract a single JSON string property from properties_json. Returns buf on + * success or NULL on miss. Mirrors json_str_value in pass_semantic_edges.c. */ +static const char *idl_json_string(const char *json, const char *key, char *buf, size_t bufsize) { + if (!json || !key || !buf || bufsize == 0) { + return NULL; + } + char pat[64]; + snprintf(pat, sizeof(pat), "\"%s\":\"", key); + const char *start = strstr(json, pat); + if (!start) { + return NULL; + } + start += strlen(pat); + const char *end = strchr(start, '"'); + if (!end) { + return NULL; + } + size_t len = (size_t)(end - start); + if (len >= bufsize) { + len = bufsize - 1; + } + memcpy(buf, start, len); + buf[len] = '\0'; + return buf; +} + +/* Extract a JSON string array. Caller frees out[i] on success. + * Returns count, 0 on miss. */ +static int idl_json_str_array(const char *json, const char *key, char **out, int max_out) { + if (!json || !key || !out || max_out <= 0) { + return 0; + } + char pat[64]; + snprintf(pat, sizeof(pat), "\"%s\":[", key); + const char *start = strstr(json, pat); + if (!start) { + return 0; + } + start += strlen(pat); + int count = 0; + while (*start && *start != ']' && count < max_out) { + if (*start == '"') { + start++; + const char *end = strchr(start, '"'); + if (!end) { + break; + } + size_t len = (size_t)(end - start); + char *s = malloc(len + 1); + if (!s) { + break; + } + memcpy(s, start, len); + s[len] = '\0'; + out[count++] = s; + start = end + 1; + } else { + start++; + } + } + return count; +} + +static void idl_free_str_array(char **arr, int count) { + for (int i = 0; i < count; i++) { + free(arr[i]); + } +} + +/* Last segment of a dotted QN. "pkg.Foo" → "Foo". Returns pointer into qn. 
*/ +static const char *idl_qn_basename(const char *qn) { + if (!qn) { + return NULL; + } + const char *dot = strrchr(qn, '.'); + return dot ? dot + 1 : qn; +} + +/* Decide whether type_name should register a stub var. Returns heap-allocated + * service name or NULL. Caller frees. + * + * Resolution order: + * 1. denylist → reject + * 2. DI registry exact match → service = type basename (already known stub) + * 3. factory pattern → service derived from factory name/package + * 4. suffix match → service = prefix before suffix + */ +static char *idl_type_to_service(const char *type_name, const idl_di_registry_t *di_registry) { + if (!type_name) { + return NULL; + } + if (idl_type_is_denylisted(type_name)) { + return NULL; + } + const char *base = idl_type_basename(type_name); + /* DI registry hit: emit raw basename as service even if no suffix matched. */ + if (di_registry && (idl_di_registry_contains(di_registry, type_name) || + idl_di_registry_contains(di_registry, base))) { + char *stripped = idl_strip_suffix(base, k_grpc_client_suffixes); + if (stripped && stripped[0]) { + return stripped; + } + free(stripped); + return strdup(base); + } + /* Factory pattern (Tier 1d). */ + char *factory_svc = idl_factory_to_service(type_name); + if (factory_svc && factory_svc[0]) { + return factory_svc; + } + free(factory_svc); + /* Default: suffix strip. */ + char *suffix_svc = idl_strip_suffix(base, k_grpc_client_suffixes); + if (suffix_svc && suffix_svc[0]) { + return suffix_svc; + } + free(suffix_svc); + return NULL; +} + /* Scan one CBMFileResult's type_assigns; for each assignment whose RHS type * matches a stub/client suffix and is not denylisted, record * (enclosing_qn, var_name, service_name). @@ -459,6 +864,7 @@ static const char *idl_type_basename(const char *qualified) { * unique stub type. The denylist (k_non_grpc_type_markers) cuts off the * obvious false positives like System.Net.Http.HttpClient. 
*/ static void idl_collect_stub_vars_for_file(const CBMFileResult *result, + const idl_di_registry_t *di_registry, idl_stub_var_arr_t *out) { if (!result) { return; @@ -468,11 +874,7 @@ static void idl_collect_stub_vars_for_file(const CBMFileResult *result, if (!ta->var_name || !ta->type_name) { continue; } - if (idl_type_is_denylisted(ta->type_name)) { - continue; - } - const char *base = idl_type_basename(ta->type_name); - char *service = idl_strip_suffix(base, k_grpc_client_suffixes); + char *service = idl_type_to_service(ta->type_name, di_registry); if (!service || !service[0]) { free(service); continue; @@ -482,6 +884,210 @@ static void idl_collect_stub_vars_for_file(const CBMFileResult *result, } } +/* Tier 1e — Scan calls for DI-registration patterns and harvest stub-type + * generic args. Currently handles: + * C# `services.AddGrpcClient(...)` — generic type arg appears + * inside the callee_name text as "AddGrpcClient". + * C# `services.AddGrpcClientFactory` and similar variants + * caught by the substring search. + * + * Spring `@GrpcClient("name")` annotations on fields are picked up by the + * field walker via decorators (Tier 1f) so they don't need a separate path + * here. */ +static void idl_extract_di_generic_arg(const char *callee_name, idl_di_registry_t *out) { + if (!callee_name) { + return; + } + const char *anchor = strstr(callee_name, "AddGrpcClient"); + if (!anchor) { + return; + } + const char *lt = strchr(anchor, '<'); + if (!lt) { + return; + } + const char *gt = strchr(lt, '>'); + if (!gt || gt <= lt + 1) { + return; + } + char type_buf[IDL_NAME_BUF]; + size_t len = (size_t)(gt - lt - 1); + if (len >= sizeof(type_buf)) { + len = sizeof(type_buf) - 1; + } + memcpy(type_buf, lt + 1, len); + type_buf[len] = '\0'; + /* Trim whitespace and any pointer/ref artifacts. 
*/ + char *p = type_buf; + while (*p == ' ' || *p == '*' || *p == '&') { + p++; + } + char *end = p + strlen(p); + while (end > p && (end[-1] == ' ' || end[-1] == ',')) { + *--end = '\0'; + } + if (!*p) { + return; + } + idl_di_registry_add(out, p); + /* Also register the basename (last segment) for non-FQN matches. */ + const char *base = idl_type_basename(p); + if (base && base != p) { + idl_di_registry_add(out, base); + } +} + +static void idl_collect_di_registry(const CBMFileResult *const *results, int file_count, + idl_di_registry_t *out) { + if (!results) { + return; + } + for (int f = 0; f < file_count; f++) { + const CBMFileResult *r = results[f]; + if (!r) { + continue; + } + for (int i = 0; i < r->calls.count; i++) { + const CBMCall *c = &r->calls.items[i]; + idl_extract_di_generic_arg(c->callee_name, out); + } + /* Also register field decorator-driven Spring/NestJS clients via + * the field walker — see idl_collect_class_scope_stubs. */ + } +} + +/* True when method name is a constructor for parent_class. + * Python: "__init__" + * JS/TS: "constructor" + * C#/Java/Kotlin: name == basename(parent_class_qn) + */ +static bool idl_is_constructor(const char *method_name, const char *parent_class_qn) { + if (!method_name) { + return false; + } + if (strcmp(method_name, "__init__") == 0 || strcmp(method_name, "constructor") == 0) { + return true; + } + if (!parent_class_qn) { + return false; + } + const char *cls = idl_qn_basename(parent_class_qn); + return cls && strcmp(method_name, cls) == 0; +} + +/* True when decorator string smells like a Spring/NestJS gRPC client + * annotation. The decorator strings come from CBMDefinition.decorators which + * preserves the leading '@' for Java/Kotlin and the call-form text for TS. 
*/ +static bool idl_decorator_marks_grpc_client(const char *decorator) { + if (!decorator) { + return false; + } + if (strstr(decorator, "GrpcClient") || strstr(decorator, "grpcClient")) { + return true; + } + if (strstr(decorator, "@Client") || strstr(decorator, "ClientGrpc")) { + return true; + } + return false; +} + +/* Tier 1c — Walk Method nodes; for each constructor-shaped method, harvest + * stub-typed parameters and register them as class-scope vars. + * Tier 1f — Walk Field nodes and register stub-typed fields class-scoped. */ +typedef struct { + idl_stub_var_arr_t *out; + const idl_di_registry_t *di_registry; + int ctor_params; + int fields; +} idl_class_scope_ctx_t; + +static void idl_class_scope_visitor(const cbm_gbuf_node_t *node, void *userdata) { + idl_class_scope_ctx_t *ctx = (idl_class_scope_ctx_t *)userdata; + if (!node || !node->label || !node->properties_json) { + return; + } + const char *props = node->properties_json; + + char parent_buf[IDL_QN_BUF]; + const char *parent_class = idl_json_string(props, "parent_class", parent_buf, sizeof(parent_buf)); + + if (strcmp(node->label, "Method") == 0) { + if (!parent_class || !idl_is_constructor(node->name, parent_class)) { + return; + } + char *names[IDL_PARAM_MAX]; + char *types[IDL_PARAM_MAX]; + int nname = idl_json_str_array(props, "param_names", names, IDL_PARAM_MAX); + int ntype = idl_json_str_array(props, "param_types", types, IDL_PARAM_MAX); + int n = nname < ntype ? 
nname : ntype; + for (int i = 0; i < n; i++) { + char *service = idl_type_to_service(types[i], ctx->di_registry); + if (service && service[0]) { + idl_stub_var_arr_push_scoped(ctx->out, NULL, parent_class, names[i], service); + ctx->ctor_params++; + } + free(service); + } + idl_free_str_array(names, nname); + idl_free_str_array(types, ntype); + return; + } + + if (strcmp(node->label, "Field") == 0) { + if (!parent_class || !node->name) { + return; + } + char type_buf[IDL_NAME_BUF]; + const char *field_type = idl_json_string(props, "return_type", type_buf, sizeof(type_buf)); + bool decorator_hit = false; + char *decos[IDL_PARAM_MAX]; + int dc = idl_json_str_array(props, "decorators", decos, IDL_PARAM_MAX); + for (int i = 0; i < dc; i++) { + if (idl_decorator_marks_grpc_client(decos[i])) { + decorator_hit = true; + } + } + if (!field_type) { + idl_free_str_array(decos, dc); + return; + } + char *service = idl_type_to_service(field_type, ctx->di_registry); + if (!service && decorator_hit) { + /* Spring/NestJS @GrpcClient on a field whose type doesn't follow + * the conventional Stub/Client suffix. The annotation itself is + * authoritative — derive service from the type basename. 
*/ + const char *base = idl_type_basename(field_type); + char *stripped = idl_strip_suffix(base, k_grpc_client_suffixes); + if (stripped && stripped[0]) { + service = stripped; + } else { + free(stripped); + service = strdup(base); + } + } + if (service && service[0]) { + idl_stub_var_arr_push_scoped(ctx->out, NULL, parent_class, node->name, service); + ctx->fields++; + } + free(service); + idl_free_str_array(decos, dc); + } +} + +static void idl_collect_class_scope_stubs(cbm_gbuf_t *gbuf, const idl_di_registry_t *di_registry, + idl_stub_var_arr_t *out, int *ctor_count, + int *field_count) { + idl_class_scope_ctx_t ctx = { + .out = out, .di_registry = di_registry, .ctor_params = 0, .fields = 0}; + cbm_gbuf_foreach_node(gbuf, idl_class_scope_visitor, &ctx); + if (ctor_count) { + *ctor_count = ctx.ctor_params; + } + if (field_count) { + *field_count = ctx.fields; + } +} + /* Locate the caller node for a producer-side edge: prefer the enclosing function * QN's gbuf node, fall back to the file node. Mirrors pass_calls' calls_find_source. */ static const cbm_gbuf_node_t *idl_find_caller(cbm_pipeline_ctx_t *ctx, const char *rel_path, @@ -500,11 +1106,17 @@ static const cbm_gbuf_node_t *idl_find_caller(cbm_pipeline_ctx_t *ctx, const cha return src; } -/* Walk one file's calls and emit GRPC_CALLS edges for matched stub-var.method patterns. */ +/* Walk one file's calls and emit GRPC_CALLS edges for matched stub-var.method + * patterns. file_vars holds per-file (function/file-scope) entries; class_vars + * holds project-wide class-scope entries from Tier 1c/1f. 
*/ static int idl_emit_producer_edges_for_file(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *fi, const CBMFileResult *result, - const idl_stub_var_arr_t *stub_vars) { - if (!result || stub_vars->count == 0) { + const idl_stub_var_arr_t *file_vars, + const idl_stub_var_arr_t *class_vars) { + if (!result) { + return 0; + } + if (file_vars->count == 0 && (!class_vars || class_vars->count == 0)) { return 0; } int emitted = 0; @@ -532,7 +1144,14 @@ static int idl_emit_producer_edges_for_file(cbm_pipeline_ctx_t *ctx, const cbm_f } const idl_stub_var_t *stub = - idl_stub_var_arr_find(stub_vars, call->enclosing_func_qn, var_buf); + idl_stub_var_arr_find(file_vars, call->enclosing_func_qn, var_buf); + if (!stub && class_vars) { + /* class_vars is project-wide. Disallow the name-only fallback so + * a stray `_client` field on an unrelated class can't silently + * bind this call. Pass 1/2 still match when the call's enclosing + * function lives under the var's class. */ + stub = idl_stub_var_arr_find_ext(class_vars, call->enclosing_func_qn, var_buf, false); + } if (!stub) { continue; } @@ -546,12 +1165,25 @@ static int idl_emit_producer_edges_for_file(cbm_pipeline_ctx_t *ctx, const cbm_f idl_build_route_qn(route_qn, sizeof(route_qn), stub->service_name, method_buf); char display[IDL_QN_BUF]; snprintf(display, sizeof(display), "%s/%s", stub->service_name, method_buf); - char route_props[IDL_PROPS_BUF]; - snprintf(route_props, sizeof(route_props), - "{\"protocol\":\"grpc\",\"service\":\"%s\",\"method\":\"%s\"}", stub->service_name, - method_buf); - int64_t route_id = cbm_gbuf_upsert_node(ctx->gbuf, "Route", display, route_qn, "", 0, 0, - route_props); + + /* If the proto Class emission (Tier 1g) already created this Route + * with rich properties (proto_package, key_kind, etc.), reuse the + * existing node — upsert overwrites properties last-write-wins, + * which would drop the FQN provenance. The consumer-side emission + * only needs the Route id for the GRPC_CALLS edge. 
*/ + const cbm_gbuf_node_t *existing = cbm_gbuf_find_by_qn(ctx->gbuf, route_qn); + int64_t route_id; + if (existing) { + route_id = existing->id; + } else { + char route_props[IDL_PROPS_BUF]; + snprintf(route_props, sizeof(route_props), + "{\"protocol\":\"grpc\",\"service\":\"%s\",\"method\":\"%s\"," + "\"key_kind\":\"bare\"}", + stub->service_name, method_buf); + route_id = cbm_gbuf_upsert_node(ctx->gbuf, "Route", display, route_qn, "", 0, 0, + route_props); + } if (route_id <= 0) { continue; } @@ -573,36 +1205,49 @@ static int idl_emit_producer_edges_for_file(cbm_pipeline_ctx_t *ctx, const cbm_f } static int idl_emit_producer_edges(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *files, - int file_count) { + int file_count, int *ctor_count_out, int *field_count_out, + int *di_count_out) { if (!ctx->result_cache || file_count <= 0) { return 0; } + /* Tier 1e: walk all calls once to harvest DI-registered stub types. */ + idl_di_registry_t di_registry; + idl_di_registry_init(&di_registry); + idl_collect_di_registry((const CBMFileResult *const *)ctx->result_cache, file_count, + &di_registry); + if (di_count_out) { + *di_count_out = di_registry.count; + } + + /* Tier 1c + 1f: walk gbuf for ctor params + class fields. Class-scope vars + * apply to any method whose enclosing_func_qn lives under the class. 
*/ + idl_stub_var_arr_t class_vars; + idl_stub_var_arr_init(&class_vars); + idl_collect_class_scope_stubs(ctx->gbuf, &di_registry, &class_vars, ctor_count_out, + field_count_out); + int total_emitted = 0; for (int i = 0; i < file_count; i++) { const CBMFileResult *result = ctx->result_cache[i]; if (!result) { continue; } - idl_stub_var_arr_t stub_vars; - idl_stub_var_arr_init(&stub_vars); - idl_collect_stub_vars_for_file(result, &stub_vars); - if (stub_vars.count > 0) { - total_emitted += idl_emit_producer_edges_for_file(ctx, &files[i], result, &stub_vars); + idl_stub_var_arr_t file_vars; + idl_stub_var_arr_init(&file_vars); + idl_collect_stub_vars_for_file(result, &di_registry, &file_vars); + if (file_vars.count > 0 || class_vars.count > 0) { + total_emitted += + idl_emit_producer_edges_for_file(ctx, &files[i], result, &file_vars, &class_vars); } - idl_stub_var_arr_free(&stub_vars); + idl_stub_var_arr_free(&file_vars); } + idl_stub_var_arr_free(&class_vars); + idl_di_registry_free(&di_registry); return total_emitted; } -/* TLS-backed itoa for log calls. */ -static const char *idl_itoa(int v) { - static CBM_TLS char buf[IDL_LOG_BUF]; - snprintf(buf, sizeof(buf), "%d", v); - return buf; -} - /* Public entry point. Idempotent: re-running over the same gbuf only adds the * same Route + HANDLES + GRPC_CALLS tuples (deduped by gbuf upsert/insert semantics). */ int cbm_pipeline_pass_idl_scan(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *files, @@ -613,15 +1258,45 @@ int cbm_pipeline_pass_idl_scan(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *f cbm_log_info("pass.start", "pass", "idl_scan"); + /* Single O(M) walk to collect proto Classes + proto Functions. Subsequent + * per-class processing is O(F) where F is the count of proto Functions — + * keeps the pass linear in graph size even on monorepos with hundreds of + * service-bearing .proto files. 
*/ + idl_proto_index_t proto_idx; + idl_proto_index_init(&proto_idx); + cbm_gbuf_foreach_node(ctx->gbuf, idl_proto_index_visitor, &proto_idx); idl_walk_ctx_t walk = {.gbuf = ctx->gbuf, .services = 0, .routes = 0}; - cbm_gbuf_foreach_node(ctx->gbuf, idl_proto_class_visitor, &walk); + for (int i = 0; i < proto_idx.class_count; i++) { + idl_emit_routes_for_proto_class(ctx->gbuf, proto_idx.classes[i], &proto_idx, + &walk.services, &walk.routes); + } + idl_proto_index_free(&proto_idx); int handles = idl_bind_consumer_handlers(ctx->gbuf); - int grpc_calls = idl_emit_producer_edges(ctx, files, file_count); + int ctor_count = 0; + int field_count = 0; + int di_count = 0; + int grpc_calls = + idl_emit_producer_edges(ctx, files, file_count, &ctor_count, &field_count, &di_count); - cbm_log_info("pass.done", "pass", "idl_scan", "services", idl_itoa(walk.services), "routes", - idl_itoa(walk.routes), "handles", idl_itoa(handles), "grpc_calls", - idl_itoa(grpc_calls)); + /* Per-call stack buffers — idl_itoa shares a single TLS buffer that gets + * clobbered when multiple calls share one log statement. 
*/ + char b_svc[IDL_LOG_BUF]; + char b_rt[IDL_LOG_BUF]; + char b_h[IDL_LOG_BUF]; + char b_g[IDL_LOG_BUF]; + char b_ct[IDL_LOG_BUF]; + char b_fd[IDL_LOG_BUF]; + char b_di[IDL_LOG_BUF]; + snprintf(b_svc, sizeof(b_svc), "%d", walk.services); + snprintf(b_rt, sizeof(b_rt), "%d", walk.routes); + snprintf(b_h, sizeof(b_h), "%d", handles); + snprintf(b_g, sizeof(b_g), "%d", grpc_calls); + snprintf(b_ct, sizeof(b_ct), "%d", ctor_count); + snprintf(b_fd, sizeof(b_fd), "%d", field_count); + snprintf(b_di, sizeof(b_di), "%d", di_count); + cbm_log_info("pass.done", "pass", "idl_scan", "services", b_svc, "routes", b_rt, "handles", b_h, + "grpc_calls", b_g, "ctor_params", b_ct, "fields", b_fd, "di_types", b_di); return 0; } From 6323922460e559181395e7d1b7a4dc1fd83b69aa Mon Sep 17 00:00:00 2001 From: sponger94 <45746997+sponger94@users.noreply.github.com> Date: Mon, 27 Apr 2026 17:13:18 +0500 Subject: [PATCH 07/10] fix(pipeline): wire pass_idl_scan into parallel + incremental paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pass_idl_scan needs ctx->result_cache populated to read producer-side typed-client signals out of CBMFileResult during emission. The full sequential pipeline already attached seq_cache before pass_definitions and ran pass_idl_scan with it. The other three pipeline paths didn't: * Full parallel — built a cache during parallel_extract + parallel_resolve but never invoked pass_idl_scan, then freed cache. Threshold for parallel is ~50 files, so every real-world repo silently skipped Tier 1 producer-side emission. * Incremental sequential — called pass_idl_scan but never attached a result cache, so the pass returned early at `if (!ctx->result_cache)` and producer-side edges never refreshed. * Incremental parallel — built a cache for extract+resolve but never called pass_idl_scan at all. 
Fix mirrors the full sequential pattern in all three call sites: allocate a CBMFileResult ** cache, attach to ctx->result_cache before pass_idl_scan runs, run, then free. --- src/pipeline/pipeline.c | 11 +++++++++++ src/pipeline/pipeline_incremental.c | 26 ++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/src/pipeline/pipeline.c b/src/pipeline/pipeline.c index d2e209e4..035bc4f7 100644 --- a/src/pipeline/pipeline.c +++ b/src/pipeline/pipeline.c @@ -565,6 +565,17 @@ static int run_parallel_pipeline(cbm_pipeline_t *p, cbm_pipeline_ctx_t *ctx, cbm_gbuf_set_next_id(p->gbuf, atomic_load(&shared_ids)); cbm_pipeline_extract_infra_routes(p->gbuf, files, cache, file_count); cbm_pipeline_process_infra_bindings(p->gbuf, files, cache, file_count); + + /* Run idl_scan while result_cache is still populated. Mirrors the + * sequential pipeline at pipeline.c:497. Without this, pass_idl_scan is + * silently skipped for any repo large enough to take the parallel path — + * meaning every real-world .NET / Java service fleet. */ + ctx->result_cache = cache; + cbm_clock_gettime(CLOCK_MONOTONIC, t); + cbm_pipeline_pass_idl_scan(ctx, files, file_count); + cbm_log_info("pass.timing", "pass", "idl_scan", "elapsed_ms", itoa_buf((int)elapsed_ms(*t))); + ctx->result_cache = NULL; + for (int i = 0; i < file_count; i++) { if (cache[i]) { cbm_free_result(cache[i]); diff --git a/src/pipeline/pipeline_incremental.c b/src/pipeline/pipeline_incremental.c index 3c465519..0290f52d 100644 --- a/src/pipeline/pipeline_incremental.c +++ b/src/pipeline/pipeline_incremental.c @@ -209,6 +209,16 @@ static void run_extract_resolve(cbm_pipeline_ctx_t *ctx, cbm_file_info_t *change cbm_log_info("pass.timing", "pass", "incr_resolve", "elapsed_ms", itoa_buf((int)elapsed_ms(t))); + /* Producer-side IDL scan needs result_cache to read calls/usages + * out of the freshly extracted CBMFileResult. Mirrors the + * full-pipeline parallel path in pipeline.c. 
*/ + cbm_clock_gettime(CLOCK_MONOTONIC, &t); + ctx->result_cache = cache; + cbm_pipeline_pass_idl_scan(ctx, changed_files, ci); + ctx->result_cache = NULL; + cbm_log_info("pass.timing", "pass", "incr_idl_scan", "elapsed_ms", + itoa_buf((int)elapsed_ms(t))); + for (int j = 0; j < ci; j++) { if (cache[j]) { cbm_free_result(cache[j]); @@ -218,11 +228,27 @@ static void run_extract_resolve(cbm_pipeline_ctx_t *ctx, cbm_file_info_t *change } } else { cbm_log_info("incremental.mode", "mode", "sequential", "changed", itoa_buf(ci)); + /* Attach a result cache so pass_definitions stores CBMFileResult per file + * and pass_idl_scan can read calls/imports out of it. Mirrors + * run_sequential_pipeline in pipeline.c. */ + CBMFileResult **seq_cache = (CBMFileResult **)calloc(ci, sizeof(CBMFileResult *)); + if (seq_cache) { + ctx->result_cache = seq_cache; + } cbm_pipeline_pass_definitions(ctx, changed_files, ci); cbm_pipeline_pass_calls(ctx, changed_files, ci); cbm_pipeline_pass_idl_scan(ctx, changed_files, ci); cbm_pipeline_pass_usages(ctx, changed_files, ci); cbm_pipeline_pass_semantic(ctx, changed_files, ci); + if (seq_cache) { + for (int i = 0; i < ci; i++) { + if (seq_cache[i]) { + cbm_free_result(seq_cache[i]); + } + } + free(seq_cache); + ctx->result_cache = NULL; + } } } From 93fbfbf7c556a93164987453ef51ff11f35d4181 Mon Sep 17 00:00:00 2001 From: sponger94 <45746997+sponger94@users.noreply.github.com> Date: Mon, 27 Apr 2026 17:13:46 +0500 Subject: [PATCH 08/10] feat(graph-ui): satellite galaxy rendering for cross-repo CROSS_* edges MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit End-to-end support for visualizing inter-repo links in the embedded Three.js graph viewer. When the active project's store has CROSS_* edges with target_project pointing to a sibling .db in the cache, the viewer now renders each linked project as an offset satellite cluster with edges connecting back to the primary cluster. 
Backend (src/ui/http_server.c): * /api/layout — for each distinct target_project found in the source store's CROSS_* edges, compute a layout for the linked project's store, place it on a circle around the primary cluster sized by primary + satellite radii so satellites don't bury inside, and populate cross_edges by joining the source CROSS_* edges to their Route's qualified_name (canonical across both stores) and looking up the matching Route id in the linked store. * layout_radius() — bounding-radius helper used to choose spacing. Frontend: * GraphScene.tsx — renders data.linked_projects?.map() as additional NodeCloud + EdgeLines groups offset by linked_projects[i].offset. Inter-galaxy edges go through a new EdgeLines invocation with a targetNodes prop pointing at the offset-shifted satellite nodes. * EdgeLines.tsx — new optional targetNodes prop. When set, edge.target ids are resolved against targetNodes instead of nodes. Existing intra-cluster usage is unchanged. Also adds CROSS_* / GRPC_CALLS / GRAPHQL_CALLS / TRPC_CALLS edge-type colors. * GraphTab.tsx — filteredData now passes linked_projects through (was silently dropped, leaving the scene with no satellites). Filter init / enableAll union in labels and edge types from each linked project so satellites stay visible by default. Without these changes, indexing a multi-service fleet produces CROSS_GRPC_CALLS edges in SQLite that never reach the canvas — the matching backend was correct, the rendering pipeline just had no path for inter-galaxy data. 
--- graph-ui/src/components/EdgeLines.tsx | 40 ++++++-- graph-ui/src/components/GraphScene.tsx | 11 +++ graph-ui/src/components/GraphTab.tsx | 32 ++++++- src/ui/http_server.c | 123 ++++++++++++++++++++++--- 4 files changed, 184 insertions(+), 22 deletions(-) diff --git a/graph-ui/src/components/EdgeLines.tsx b/graph-ui/src/components/EdgeLines.tsx index 4001b9d2..7c1f03a3 100644 --- a/graph-ui/src/components/EdgeLines.tsx +++ b/graph-ui/src/components/EdgeLines.tsx @@ -7,6 +7,10 @@ interface EdgeLinesProps { edges: GraphEdge[]; highlightedIds: Set | null; opacity?: number; + /* Optional: when set, edge.target is looked up in this array instead of + * `nodes`. Used for cross-galaxy edges where source lives in the primary + * graph and target lives in a linked project's offset-adjusted nodes. */ + targetNodes?: GraphNode[]; } function getClusterKey(fp?: string): string { @@ -28,17 +32,39 @@ const EDGE_TYPE_COLORS: Record = { IMPLEMENTS: "#f97316", HTTP_CALLS: "#e11d48", ASYNC_CALLS: "#ec4899", + GRPC_CALLS: "#f59e0b", + GRAPHQL_CALLS: "#e879f9", + TRPC_CALLS: "#a78bfa", + CROSS_HTTP_CALLS: "#fb923c", + CROSS_ASYNC_CALLS: "#fb7185", + CROSS_GRPC_CALLS: "#fbbf24", + CROSS_GRAPHQL_CALLS: "#f0abfc", + CROSS_TRPC_CALLS: "#c4b5fd", + CROSS_CHANNEL: "#fdba74", MEMBER_OF: "#64748b", TESTS_FILE: "#06b6d4", }; const DEFAULT_EDGE_COLOR = "#1C8585"; -export function EdgeLines({ nodes, edges, highlightedIds, opacity = 1.0 }: EdgeLinesProps) { +export function EdgeLines({ + nodes, + edges, + highlightedIds, + opacity = 1.0, + targetNodes, +}: EdgeLinesProps) { const geometry = useMemo(() => { - const idMap = new Map(); + const srcMap = new Map(); for (let i = 0; i < nodes.length; i++) { - idMap.set(nodes[i].id, i); + srcMap.set(nodes[i].id, i); + } + const tgtArr = targetNodes ?? nodes; + const tgtMap = targetNodes ? 
new Map() : srcMap; + if (targetNodes) { + for (let i = 0; i < targetNodes.length; i++) { + tgtMap.set(targetNodes[i].id, i); + } } const hasHighlight = highlightedIds && highlightedIds.size > 0; @@ -47,12 +73,12 @@ export function EdgeLines({ nodes, edges, highlightedIds, opacity = 1.0 }: EdgeL let validCount = 0; for (const edge of edges) { - const si = idMap.get(edge.source); - const ti = idMap.get(edge.target); + const si = srcMap.get(edge.source); + const ti = tgtMap.get(edge.target); if (si === undefined || ti === undefined) continue; const s = nodes[si]; - const t = nodes[ti]; + const t = tgtArr[ti]; const sHL = !hasHighlight || highlightedIds.has(s.id); const tHL = !hasHighlight || highlightedIds.has(t.id); @@ -99,7 +125,7 @@ export function EdgeLines({ nodes, edges, highlightedIds, opacity = 1.0 }: EdgeL new THREE.BufferAttribute(colors.slice(0, validCount * 6), 3), ); return geo; - }, [nodes, edges, highlightedIds]); + }, [nodes, edges, highlightedIds, targetNodes]); return ( diff --git a/graph-ui/src/components/GraphScene.tsx b/graph-ui/src/components/GraphScene.tsx index 50055fbd..fa150be3 100644 --- a/graph-ui/src/components/GraphScene.tsx +++ b/graph-ui/src/components/GraphScene.tsx @@ -156,6 +156,17 @@ export function GraphScene({ onClick={onNodeClick} opacity={0.5} /> + {/* Inter-galaxy CROSS_* edges: source is in primary, target in + * this linked project's offset nodes. */} + {lp.cross_edges && lp.cross_edges.length > 0 && ( + + )} ); })} diff --git a/graph-ui/src/components/GraphTab.tsx b/graph-ui/src/components/GraphTab.tsx index 735ffd04..ea9b7bb5 100644 --- a/graph-ui/src/components/GraphTab.tsx +++ b/graph-ui/src/components/GraphTab.tsx @@ -48,6 +48,11 @@ export function GraphTab({ project }: GraphTabProps) { if (!data) return; const labels = new Set(data.nodes.map((n) => n.label)); const types = new Set(data.edges.map((e) => e.type)); + for (const lp of data.linked_projects ?? 
[]) { + for (const n of lp.nodes) labels.add(n.label); + for (const e of lp.edges) types.add(e.type); + for (const e of lp.cross_edges) types.add(e.type); + } setEnabledLabels(labels); setEnabledEdgeTypes(types); }, [data]); @@ -65,7 +70,21 @@ export function GraphTab({ project }: GraphTabProps) { nodeIds.has(e.target), ); - return { nodes, edges, total_nodes: data.total_nodes }; + const linked_projects = data.linked_projects?.map((lp) => { + const lpNodes = lp.nodes.filter((n) => enabledLabels.has(n.label)); + const lpIds = new Set(lpNodes.map((n) => n.id)); + const lpEdges = lp.edges.filter( + (e) => + enabledEdgeTypes.has(e.type) && lpIds.has(e.source) && lpIds.has(e.target), + ); + const crossEdges = lp.cross_edges.filter( + (e) => + enabledEdgeTypes.has(e.type) && nodeIds.has(e.source) && lpIds.has(e.target), + ); + return { ...lp, nodes: lpNodes, edges: lpEdges, cross_edges: crossEdges }; + }); + + return { nodes, edges, total_nodes: data.total_nodes, linked_projects }; }, [data, enabledLabels, enabledEdgeTypes]); useEffect(() => { @@ -136,8 +155,15 @@ export function GraphTab({ project }: GraphTabProps) { const enableAll = useCallback(() => { if (!data) return; - setEnabledLabels(new Set(data.nodes.map((n) => n.label))); - setEnabledEdgeTypes(new Set(data.edges.map((e) => e.type))); + const labels = new Set(data.nodes.map((n) => n.label)); + const types = new Set(data.edges.map((e) => e.type)); + for (const lp of data.linked_projects ?? 
[]) { + for (const n of lp.nodes) labels.add(n.label); + for (const e of lp.edges) types.add(e.type); + for (const e of lp.cross_edges) types.add(e.type); + } + setEnabledLabels(labels); + setEnabledEdgeTypes(types); }, [data]); const disableAll = useCallback(() => { diff --git a/src/ui/http_server.c b/src/ui/http_server.c index d3efc53a..b3851177 100644 --- a/src/ui/http_server.c +++ b/src/ui/http_server.c @@ -926,6 +926,28 @@ static int find_cross_repo_targets(cbm_store_t *store, const char *project, char enum { LAYOUT_MAX_LINKED = 16 }; #define LAYOUT_GALAXY_SPACING 600.0 +#define LAYOUT_GALAXY_PAD 400.0 + +/* Bounding-radius of a layout result: max distance from origin across all + * nodes. Used to size galaxy spacing so satellites don't overlap the primary + * cluster. Layouts with a 1000-node cluster have radius ~1500; the previous + * fixed 600 spacing buried satellites inside the primary mass. */ +static double layout_radius(const cbm_layout_result_t *r) { + if (!r || r->node_count == 0) + return 0.0; + double max_r2 = 0.0; + for (int i = 0; i < r->node_count; i++) { + double x = (double)r->nodes[i].x; + double y = (double)r->nodes[i].y; + double z = (double)r->nodes[i].z; + if (!isfinite(x) || !isfinite(y) || !isfinite(z)) + continue; + double r2 = x * x + y * y + z * z; + if (r2 > max_r2) + max_r2 = r2; + } + return sqrt(max_r2); +} static void handle_layout(struct mg_connection *c, struct mg_http_message *hm) { char project[256] = {0}; @@ -961,26 +983,32 @@ static void handle_layout(struct mg_connection *c, struct mg_http_message *hm) { cbm_layout_result_t *layout = cbm_layout_compute(store, project, CBM_LAYOUT_OVERVIEW, NULL, 0, max_nodes); - /* Find linked projects from CROSS_* edges */ + /* Find linked projects from CROSS_* edges. Keep `store` open through the + * linked-projects loop below so we can query CROSS_* edge rows for each + * target and resolve target_function QNs against the linked stores. 
*/ char *linked[LAYOUT_MAX_LINKED]; int linked_count = find_cross_repo_targets(store, project, linked, LAYOUT_MAX_LINKED); - cbm_store_close(store); - if (!layout) { + cbm_store_close(store); mg_http_reply(c, 500, g_cors_json, "{\"error\":\"layout computation failed\"}"); return; } + /* Capture primary cluster radius before freeing the layout. */ + double primary_radius = layout_radius(layout); + /* Build JSON: primary layout + linked_projects */ char *primary_json = cbm_layout_to_json(layout); cbm_layout_free(layout); if (!primary_json) { + cbm_store_close(store); mg_http_reply(c, 500, g_cors_json, "{\"error\":\"JSON serialization failed\"}"); return; } if (linked_count == 0) { + cbm_store_close(store); mg_http_reply(c, 200, g_cors_json, "%s", primary_json); free(primary_json); return; @@ -1008,24 +1036,27 @@ static void handle_layout(struct mg_connection *c, struct mg_http_message *hm) { continue; } - cbm_store_t *lp_store = cbm_store_open_path(lp_path); - if (!lp_store) { + cbm_store_t *lp_store_keep = cbm_store_open_path(lp_path); + if (!lp_store_keep) { free(linked[li]); continue; } cbm_layout_result_t *lp_layout = - cbm_layout_compute(lp_store, linked[li], CBM_LAYOUT_OVERVIEW, NULL, 0, max_nodes); - cbm_store_close(lp_store); + cbm_layout_compute(lp_store_keep, linked[li], CBM_LAYOUT_OVERVIEW, NULL, 0, max_nodes); + /* keep lp_store_keep open until cross_edges have been resolved below */ if (!lp_layout) { + cbm_store_close(lp_store_keep); free(linked[li]); continue; } + double sat_radius = layout_radius(lp_layout); char *lp_json = cbm_layout_to_json(lp_layout); cbm_layout_free(lp_layout); if (!lp_json) { + cbm_store_close(lp_store_keep); free(linked[li]); continue; } @@ -1034,6 +1065,7 @@ static void handle_layout(struct mg_connection *c, struct mg_http_message *hm) { yyjson_doc *lpdoc = yyjson_read(lp_json, strlen(lp_json), 0); free(lp_json); if (!lpdoc) { + cbm_store_close(lp_store_keep); free(linked[li]); continue; } @@ -1056,22 +1088,89 @@ static void 
handle_layout(struct mg_connection *c, struct mg_http_message *hm) { yyjson_mut_obj_add_val(mdoc, entry, "edges", yyjson_mut_val_mut_copy(mdoc, le)); } - /* Compute galaxy offset: evenly spaced around primary */ + /* Compute galaxy offset: evenly spaced around primary, far enough out + * that the primary cluster (radius primary_radius) and the satellite + * cluster (radius sat_radius) don't overlap. Bounded below by + * LAYOUT_GALAXY_SPACING for trivially small projects. */ double angle = (2.0 * 3.14159265358979) * (double)li / (double)linked_count; + double dist = primary_radius + sat_radius + LAYOUT_GALAXY_PAD; + if (dist < LAYOUT_GALAXY_SPACING) { + dist = LAYOUT_GALAXY_SPACING; + } yyjson_mut_val *offset = yyjson_mut_obj(mdoc); - yyjson_mut_obj_add_real(mdoc, offset, "x", cos(angle) * LAYOUT_GALAXY_SPACING); - yyjson_mut_obj_add_real(mdoc, offset, "y", sin(angle) * LAYOUT_GALAXY_SPACING); + yyjson_mut_obj_add_real(mdoc, offset, "x", cos(angle) * dist); + yyjson_mut_obj_add_real(mdoc, offset, "y", sin(angle) * dist); yyjson_mut_obj_add_real(mdoc, offset, "z", 0.0); yyjson_mut_obj_add_val(mdoc, entry, "offset", offset); - /* TODO: cross_edges array with CROSS_* edges connecting the galaxies */ - yyjson_mut_obj_add_val(mdoc, entry, "cross_edges", yyjson_mut_arr(mdoc)); + /* Populate cross_edges connecting primary→this linked galaxy. + * Each entry: {source: , target: , type}. + * + * Approach: a CROSS_* edge in the source store points caller_id → + * local_route_id (a Route node in the source store). The Route's + * qualified_name (e.g. "__route__grpc__ReserveServiceV2/AddTicketDetails") + * is canonical and the same Route exists in the linked store too — + * that's the cross-repo matching contract. Pull the QN by joining + * edges → nodes in source, then look it up in the linked store. + * + * properties.target_function only holds the bare method name and is + * not unique across handlers; using the Route QN avoids that. 
*/ + yyjson_mut_val *cross_arr = yyjson_mut_arr(mdoc); + struct sqlite3 *src_db = cbm_store_get_db(store); + struct sqlite3 *lp_db = cbm_store_get_db(lp_store_keep); + if (src_db && lp_db) { + sqlite3_stmt *eq = NULL; + if (sqlite3_prepare_v2( + src_db, + "SELECT e.source_id, e.type, n.qualified_name " + "FROM edges e JOIN nodes n " + " ON n.id = e.target_id AND n.project = e.project " + "WHERE e.project = ?1 AND e.type LIKE 'CROSS_%' " + " AND json_extract(e.properties, '$.target_project') = ?2 " + " AND n.qualified_name IS NOT NULL", + -1, &eq, NULL) == SQLITE_OK) { + sqlite3_bind_text(eq, 1, project, -1, SQLITE_STATIC); + sqlite3_bind_text(eq, 2, linked[li], -1, SQLITE_STATIC); + + sqlite3_stmt *lookup = NULL; + sqlite3_prepare_v2( + lp_db, + "SELECT id FROM nodes WHERE qualified_name = ?1 LIMIT 1", + -1, &lookup, NULL); + + while (sqlite3_step(eq) == SQLITE_ROW) { + int64_t src_id = sqlite3_column_int64(eq, 0); + const char *etype = (const char *)sqlite3_column_text(eq, 1); + const char *qn = (const char *)sqlite3_column_text(eq, 2); + if (!qn || !etype || !lookup) { + continue; + } + sqlite3_reset(lookup); + sqlite3_clear_bindings(lookup); + sqlite3_bind_text(lookup, 1, qn, -1, SQLITE_STATIC); + if (sqlite3_step(lookup) != SQLITE_ROW) { + continue; + } + int64_t tgt_id = sqlite3_column_int64(lookup, 0); + yyjson_mut_val *ce = yyjson_mut_obj(mdoc); + yyjson_mut_obj_add_int(mdoc, ce, "source", src_id); + yyjson_mut_obj_add_int(mdoc, ce, "target", tgt_id); + yyjson_mut_obj_add_strcpy(mdoc, ce, "type", etype); + yyjson_mut_arr_append(cross_arr, ce); + } + if (lookup) sqlite3_finalize(lookup); + sqlite3_finalize(eq); + } + } + yyjson_mut_obj_add_val(mdoc, entry, "cross_edges", cross_arr); + cbm_store_close(lp_store_keep); yyjson_mut_arr_append(lp_arr, entry); yyjson_mut_doc_free(lm); free(linked[li]); } + cbm_store_close(store); yyjson_mut_obj_add_val(mdoc, mroot, "linked_projects", lp_arr); size_t len = 0; From bc6b9086594a74260500683f5df6e7bb46682df8 Mon Sep 17 
00:00:00 2001 From: sponger94 <45746997+sponger94@users.noreply.github.com> Date: Mon, 27 Apr 2026 17:14:02 +0500 Subject: [PATCH 09/10] chore(test): dump_csharp debug tool for extractor inspection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Standalone binary that runs the C# extractor over one file and prints the resulting defs / type_assigns / calls / Field nodes with their parent_class and return_type. Useful when iterating on producer-side gRPC detection — being able to point the extractor at a real source file and read structured output is how a few of the C# 12 primary-ctor edge cases got found. Built via `make -f Makefile.cbm dump-csharp`. Not wired into the main test suite or CI. --- Makefile.cbm | 14 ++++++ tests/dump_csharp.c | 120 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 134 insertions(+) create mode 100644 tests/dump_csharp.c diff --git a/Makefile.cbm b/Makefile.cbm index e328f28f..0b14c374 100644 --- a/Makefile.cbm +++ b/Makefile.cbm @@ -504,6 +504,20 @@ $(BUILD_DIR)/codebase-memory-mcp: $(MAIN_SRC) $(PROD_SRCS) $(EXTRACTION_SRCS) $( cbm: $(BUILD_DIR)/codebase-memory-mcp @echo "Built: $(BUILD_DIR)/codebase-memory-mcp" +# Standalone debug tool: dump C# extraction results for one file. 
+$(BUILD_DIR)/dump_csharp: $(OBJS_VENDORED_PROD) | $(BUILD_DIR) + $(CC) $(CFLAGS_PROD) -o $@ \ + tests/dump_csharp.c \ + $(FOUNDATION_SRCS) \ + $(SIMHASH_SRCS) $(SEMANTIC_SRCS) \ + src/pipeline/worker_pool.c \ + $(EXTRACTION_SRCS) $(AC_LZ4_SRCS) $(ZSTD_SRCS) \ + $(OBJS_VENDORED_PROD) \ + $(LDFLAGS) + +dump-csharp: $(BUILD_DIR)/dump_csharp + @echo "Built: $(BUILD_DIR)/dump_csharp" + # ── Build with embedded UI (requires Node.js) ─────────────────── # Swap embedded_stub.c for the generated embedded_assets.c diff --git a/tests/dump_csharp.c b/tests/dump_csharp.c new file mode 100644 index 00000000..bc2c2a5a --- /dev/null +++ b/tests/dump_csharp.c @@ -0,0 +1,120 @@ +/* dump_csharp.c — Standalone inspector for C# extraction. + * Reads a file path, runs cbm_extract_file, prints calls/type_assigns/defs. + */ +#include "cbm.h" +#include "tree_sitter/api.h" +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +extern const TSLanguage *tree_sitter_c_sharp(void); + +static void walk(TSNode n, int depth, const char *src, int max_depth) { + if (depth > max_depth) return; + const char *kind = ts_node_type(n); + uint32_t sb = ts_node_start_byte(n); + uint32_t eb = ts_node_end_byte(n); + char preview[64]; + int plen = (int)(eb - sb); + if (plen > 60) plen = 60; + memcpy(preview, src + sb, plen); + preview[plen] = 0; + for (int i = 0; i < plen; i++) if (preview[i] == '\n') preview[i] = ' '; + printf("%*s%s [%.*s]\n", depth * 2, "", kind, plen, preview); + uint32_t nc = ts_node_child_count(n); + for (uint32_t i = 0; i < nc; i++) { + TSNode c = ts_node_child(n, i); + const char *fld = ts_node_field_name_for_child(n, i); + if (fld) printf("%*s.%s:\n", (depth + 1) * 2, "", fld); + walk(c, depth + 1, src, max_depth); + } +} + +static char *slurp(const char *path, int *len_out) { + FILE *f = fopen(path, "rb"); + if (!f) return NULL; + fseek(f, 0, SEEK_END); + long n = ftell(f); + fseek(f, 0, SEEK_SET); + char *buf = malloc((size_t)n + 1); + fread(buf, 1, (size_t)n, f); + buf[n] = 0; + *len_out = (int)n; + 
fclose(f); + return buf; +} + +int main(int argc, char **argv) { + if (argc < 2) { + fprintf(stderr, "usage: %s \n", argv[0]); + return 1; + } + if (cbm_init() != 0) { + fprintf(stderr, "cbm_init failed\n"); + return 1; + } + int n = 0; + char *src = slurp(argv[1], &n); + if (!src) { + fprintf(stderr, "cannot read %s\n", argv[1]); + return 1; + } + CBMFileResult *r = cbm_extract_file(src, n, CBM_LANG_CSHARP, "test", argv[1], 0, NULL, NULL); + if (!r) { fprintf(stderr, "extract returned NULL\n"); return 1; } + if (r->has_error) { + fprintf(stderr, "parse error: %s\n", r->error_msg ? r->error_msg : "?"); + } + + printf("=== DEFS (%d) ===\n", r->defs.count); + for (int i = 0; i < r->defs.count; i++) { + const CBMDefinition *d = &r->defs.items[i]; + printf(" [%s] %s qn=%s parent=%s rt=%s", + d->label ? d->label : "?", d->name ? d->name : "?", + d->qualified_name ? d->qualified_name : "", + d->parent_class ? d->parent_class : "", + d->return_type ? d->return_type : ""); + printf(" params="); + if (d->param_names) { + printf("["); + for (int j = 0; d->param_names[j]; j++) { + printf("%s%s:%s", j ? "," : "", d->param_names[j], + d->param_types && d->param_types[j] ? d->param_types[j] : "?"); + } + printf("]"); + } else { + printf("NULL"); + } + printf(" sig=%s\n", d->signature ? d->signature : "NULL"); + } + + printf("=== CALLS (%d) ===\n", r->calls.count); + for (int i = 0; i < r->calls.count; i++) { + const CBMCall *c = &r->calls.items[i]; + printf(" callee=%s enc=%s\n", + c->callee_name ? c->callee_name : "?", + c->enclosing_func_qn ? c->enclosing_func_qn : ""); + } + + printf("=== TYPE_ASSIGNS (%d) ===\n", r->type_assigns.count); + for (int i = 0; i < r->type_assigns.count; i++) { + const CBMTypeAssign *t = &r->type_assigns.items[i]; + printf(" var=%s type=%s enc=%s\n", + t->var_name ? t->var_name : "?", + t->type_name ? t->type_name : "?", + t->enclosing_func_qn ? 
t->enclosing_func_qn : ""); + } + + if (argc >= 3 && strcmp(argv[2], "--ast") == 0) { + printf("\n=== AST ===\n"); + TSParser *p = ts_parser_new(); + ts_parser_set_language(p, tree_sitter_c_sharp()); + TSTree *tree = ts_parser_parse_string(p, NULL, src, n); + walk(ts_tree_root_node(tree), 0, src, 8); + ts_tree_delete(tree); + ts_parser_delete(p); + } + + cbm_free_result(r); + free(src); + return 0; +} From 0431efb955f301716d39ff20fe4d737847859e90 Mon Sep 17 00:00:00 2001 From: sponger94 <45746997+sponger94@users.noreply.github.com> Date: Mon, 27 Apr 2026 17:14:20 +0500 Subject: [PATCH 10/10] docs: cross-repo proposal updates + tier1 production-readiness notes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two planning documents covering the rationale for everything else in this branch. cbm-cross-repo-proposal.md: * Updated Tier 1 sections to reflect the producer-side detection that ships in this branch (ctor params, factories, DI registry, fields). * New §5.7 Tier 1g — Contract-package FQN extraction. Explains why the whole 1g family is deferred to a focused follow-on PR rather than landed here, and sequences the work into four pieces (producer dual emission, AST-time package extraction, deterministic collision resolution, NuGet/Maven consumer-side cache scan). * Updated §5.8 sub-tier sequencing accordingly. tier1-extractor-fixes.md (new): * Documents the four production-readiness gaps that prevented Tier 1's producer-side detection from firing on real .NET fleets: parallel-pipeline wiring, C# 12 primary-ctor extraction, protobuf rpc → service fallback, and graph-UI cross-galaxy passthrough. * Adversarial-review follow-ups: incremental-pipeline wiring, safer project-wide stub-var fallback, and the cross-package collision warning + service_qn property mitigation. 
--- .planning/cbm-cross-repo-proposal.md | 969 +++++++++++++++++++++++++++ .planning/tier1-extractor-fixes.md | 89 +++ 2 files changed, 1058 insertions(+) create mode 100644 .planning/cbm-cross-repo-proposal.md create mode 100644 .planning/tier1-extractor-fixes.md diff --git a/.planning/cbm-cross-repo-proposal.md b/.planning/cbm-cross-repo-proposal.md new file mode 100644 index 00000000..1f0e6c05 --- /dev/null +++ b/.planning/cbm-cross-repo-proposal.md @@ -0,0 +1,969 @@ +# Protocol-Aware Cross-Repo Intelligence + +**Status:** Tier 1a+1b shipped as PR #293. Tiers 1c–1f, 2, 3, 4 sequenced as follow-up PRs. +**Audience:** cbm maintainer + reviewers +**Scope:** Extends `pass_cross_repo` from literal-string matching to protocol-aware matching across the four cross-language patterns that account for >95% of inter-service communication in modern codebases. +**Compatibility:** Strictly additive throughout. No breaking changes to existing tools, edges, or APIs. +**Companion artifacts:** +- **#292** — this proposal (issue thread on `DeusData/codebase-memory-mcp`) +- **#293** — Tier 1a + 1b PR (`pass_idl_scan`: gRPC IDL Route emission + consumer HANDLES + producer GRPC_CALLS via `type_assigns`). Validated against real-world .NET microservice fleet; covers ~30% of producer idioms (Python `*Stub`, manual C# `*Client`, Rust tonic, TS `@grpc/grpc-js`). +- **#294** — pre-existing `pass_parallel.c emit_grpc_edge` issue (phantom Routes from heuristic var-name matching, greedy `ServiceClient` suffix stripping). Discovered during validation; out of scope for #293. +- **PR #281** — rich `get_architecture` fields. Initially documented as a precondition for this work; Tier 1 implementation does not strictly depend on it. + +--- + +## TL;DR + +cbm already has the scaffolding for cross-repo intelligence (`pass_cross_repo.c`, `CROSS_HTTP_CALLS` / `CROSS_ASYNC_CALLS` / `CROSS_GRPC_CALLS` edge types, named-route matching). 
The original implementation only fired when a call site had a **literal URL or topic string** as its first argument. That covers idiomatic Python/Node code well, but misses the dominant pattern in modern strongly-typed stacks (Java/Spring, .NET, Kotlin, Go-with-codegen): **typed clients and message handlers where the routing identifier is a generic type parameter, an interface ancestor, an attribute, a constructor parameter type, a field type, a factory-function return type, or a config-resolved name** — never a literal string at the call site. + +This proposal adds four protocol-aware extraction tiers, each language-generic. **Tier 1 is decomposed into six sub-tiers (1a–1f) by signal source**, each one a small isolated extension to a unified `pass_idl_scan`. Tiers 2–4 follow as separate PRs once Tier 1 stabilizes; they reuse Tier 1's `(var, type, scope) → service` abstraction as foundation. + +Cross-language framework coverage matrix in §6. Acceptance gating: each sub-tier ships independently, success measured by precision/recall against multi-language test fixtures and validated against real-world fleet indexing. + +**Key insight from validation (April 2026):** Tier 1 is fundamentally a **type-flow problem**, not a call-site-string problem. The original v1 spec assumed `var = SomeStub(...)` covers most cases. Real fleets show that modern OOP DI (constructor injection, field declarations, factory return types) is the majority pattern, and `type_assigns`-based detection alone misses 60–70% of producers. + +--- + +## 1. Background + +### 1.1 What works in cbm today (as of `main` plus #293) + +After PR #281 lands (or its consumer-side equivalent — see note above), `get_architecture(aspects=["all"])` returns rich structural data (entry_points, routes, hotspots, layers, boundaries, languages). 
The per-repo extraction pipeline detects: + +- Library identifiers in resolved qualified names (`service_patterns.c:631` — 252 patterns across HTTP, async, gRPC, config, route-registration kinds, covering Python/Node/Go/Java/Rust/PHP/Ruby/C# basics) +- Literal URL / topic strings at call sites (`pass_calls.c:emit_http_async_edge`) +- Route registration via `app.get("/x", ...)` and attribute-routed framework styles (`pass_route_nodes.c`) +- Cross-repo matching when both sides have a literal-route identifier (`pass_cross_repo.c:cbm_cross_repo_match`) +- **(After #293)** IDL Route emission from `.proto` files via `pass_idl_scan` with QN format `__route__grpc__/` +- **(After #293)** Consumer-side `HANDLES` edges from server-stub-base subclasses (Python `*Servicer`, Java `*ImplBase`, C# `*Base`/`*ServiceBase`) +- **(After #293)** Producer-side `GRPC_CALLS` edges from `type_assigns`-captured stub variable assignments + +The `cross-repo-intelligence` mode in `index_repository` matches `__route____` keys across project DBs and emits CROSS_HTTP_CALLS / CROSS_ASYNC_CALLS / CROSS_GRPC_CALLS edges. **Phase D** of `pass_cross_repo.c` (`match_typed_routes`) handles gRPC/GraphQL/tRPC by reading the source-DB Route's QN and looking it up in target DBs — protocol-agnostic matching, agnostic to which pass emitted the Route. + +### 1.2 What didn't work before #293 + +`emit_http_async_edge` (pass_calls.c, line ~232): + +```c +bool is_url = (url_or_topic && url_or_topic[0] != '\0' && + (url_or_topic[0] == '/' || strstr(url_or_topic, "://") != NULL)); +bool is_topic = (url_or_topic && ... && svc == CBM_SVC_ASYNC && ...); +if (!is_url && !is_topic) { + /* fall back to plain CALLS edge */ + return; +} +``` + +If the first string argument was not a literal URL or topic, the edge fell through to `CALLS` — generic, unrouted, untaggable for cross-repo matching. 
Idiomatic code in major modern frameworks rarely passes a literal URL or topic at the call site: + +```csharp +// .NET / MassTransit — no topic string, message type is the identifier +await _publishEndpoint.Publish(new VoucherRedeemed(voucher.Id), ct); + +// .NET / generated gRPC client — no URL string, service.method is the identifier +var resp = await _promoCodeClient.GetVoucherAsync(req, cancellationToken: ct); + +// Java / Spring Cloud Stream — no topic string, message type is the identifier +streamBridge.send("output", new OrderShipped(order)); + +// Java / Feign — interface annotation IS the route, no literal at call site +return feignClient.getVoucher(id); + +// Kotlin / Retrofit — same shape as Feign +return retrofitApi.getVoucher(id) + +// Go with gRPC codegen — generated client method, no string at call site +resp, err := promoClient.GetVoucher(ctx, req) + +// Python / FastAPI typed httpx client (oapi-codegen-derived) — same shape +resp = await client.get_voucher(id=id) +``` + +In each case, the producer-side identifier (message type FQN, gRPC service.method, Feign interface annotation) was statically present and resolvable — but not as a literal string argument. It lives in: a generic type parameter, the constructor type of an argument, a class-level attribute, a method-level attribute, a field/property declared type, a factory-function return type, or a DI registration generic argument. + +The consumer side has the same identifier visible in a different syntactic position: an `IConsumer` declaration, a `*Base` implementation, a `@StreamListener` annotation, an attribute-routed controller method. + +**The matching problem is solvable.** The producer/consumer identifier exists statically on both sides. cbm's original extractor just didn't extract it. 
+ +### 1.3 What works after #293 — and what doesn't + +PR #293 (`pass_idl_scan`) addressed the consumer side completely and the producer side partially: + +**Consumer side — fully working:** +- Python `*Servicer` subclasses → `HANDLES` to IDL Route ✓ +- Java `*ImplBase` subclasses → `HANDLES` ✓ +- C# `*ServiceBase` / `*Base` subclasses → `HANDLES` ✓ +- Cross-language: longest-suffix-first ordering disambiguates `ImplBase` vs `Base` + +**Producer side — partial:** +- Python `stub = pb2_grpc.FooStub(channel)` → `GRPC_CALLS` ✓ +- C# manual `var c = new Foo.FooClient(channel)` → `GRPC_CALLS` ✓ +- Rust `let c = FooClient::new(channel)` → `GRPC_CALLS` ✓ +- TS `const c = new pb.FooClient(...)` → `GRPC_CALLS` ✓ +- **Modern C# Grpc.Net.ClientFactory (DI ctor injection)** → ✗ no detection +- **Java `stub = FooGrpc.newBlockingStub(...)`** → ✗ no detection (return-type inference missing) +- **Go `c := pb.NewFooClient(conn)`** → ✗ no detection (same) +- **NestJS `@Client()` decorator pattern** → ✗ no detection +- **Spring `@GrpcClient` annotation** → ✗ no detection + +The gap clusters around three signal sources that `type_assigns` doesn't capture: +1. Constructor parameter types (DI injection) +2. Function return types (factory functions) +3. Annotations / decorators (DI registration) + +### 1.4 Why this matters now + +Cross-repo intelligence is one of the most-asked-for capabilities in code-graph tools. 
Industry tooling that does parts of this: + +| Tool | Approach | Limitation | +|---|---|---| +| Backstage (Spotify) | Service catalog from OpenAPI / AsyncAPI / .proto files | Manual catalog maintenance, declarative not derived | +| Sourcegraph | Cross-repo references via SCIP indexes | Per-symbol, doesn't protocol-match (only name-matches) | +| Apollo Studio | Federated GraphQL via `@key` directives | GraphQL only | +| AsyncAPI tooling | Typed async-message matching | AsyncAPI-spec only, requires explicit AsyncAPI files | +| GitHub CodeQL | Cross-repo dataflow for security | Security-focused, heavyweight | +| stack-graphs (GitHub) | Universal name resolution graph | Within-repo only | + +cbm is uniquely positioned: single binary, AST + LSP-grade extraction, sub-second incremental indexing, no external service dependencies. The cross-repo capability matters because it's the missing 20% of value that turns "smart code search" into "service-architecture truth source." + +--- + +## 2. Cross-language pattern audit + +The producer→consumer routing problem decomposes into four tiers. Each tier is generic across major language ecosystems. Concrete framework instances per tier: + +### Tier 1 — IDL-driven typed stubs (gRPC, GraphQL, OpenAPI, AsyncAPI) + +The stable identifier lives in an IDL file shared between producer and consumer repos. Both sides reference generated types derived from the same IDL. 
+ +| Ecosystem | Producer pattern | Consumer pattern | Stable identifier | +|---|---|---|---| +| **gRPC** (Go, Java, Python, Rust, TS, C#, Kotlin, Swift) | `*Client` from .proto codegen | `*Base` / `*Servicer` impl from .proto codegen | `service.method` from .proto | +| **GraphQL Federation** (any GraphQL stack) | typed query/mutation client | resolver bound to type with `@key` directive | type + key from .graphql | +| **OpenAPI** (NSwag/openapi-generator/oapi-codegen) | generated typed client per language | controller/handler matching path+method | path + method from openapi.yaml | +| **AsyncAPI** | generated publisher | generated subscriber | channel + message from asyncapi.yaml | + +**Detection**: parse the IDL file (or extract via existing tree-sitter grammars — `grammar_protobuf` is already vendored), extract canonical IDs as routes; on producer side find references to generated client types; on consumer side find generated base-class implementations. + +**Genericity**: 100%. gRPC alone covers 8+ languages. .proto/.graphql/.openapi files are language-agnostic by design. + +### Tier 2 — Typed message pub/sub (interface-ancestor + generic-type) + +The stable identifier is a message type's fully-qualified name. Producer has `Publish` / `Send` / equivalent on a known interface; consumer implements `IConsumer` / `@MessageHandler` / equivalent. 
+ +| Language | Frameworks | Producer shape | Consumer shape | +|---|---|---|---| +| C# | MassTransit, NServiceBus, Wolverine, Brighter, Rebus | `IPublishEndpoint.Publish` / `ISendEndpoint.Send` / `IBus.Publish` | `IConsumer` / `IHandleMessages` | +| Java/Kotlin | Spring Cloud Stream, Axon, Eventuate | `streamBridge.send(...)` / `@CommandHandler` | `@StreamListener` / `@EventHandler` | +| Node/TS | NestJS microservices, Moleculer, EventBus libs | `@MessagePattern` emit | `@EventPattern` handler | +| Python | Faust, Celery (typed), aio-pika typed wrappers | `@app.agent` send | typed handler funcs | +| Go | Watermill, NATS-typed, Wire | typed publish via marshalers | typed subscriber registration | +| Rust | Lapin + serde, async-nats with typed deserialization | typed publish | typed subscribe | + +**Detection**: pattern-match the producer interface (e.g., `IPublishEndpoint`, `streamBridge`) with its `Publish` / `Send` method, extract `T` from the generic param or the constructor argument's type. On consumer side, find classes implementing `IConsumer` / `IHandleMessages` / classes with `@StreamListener` on a method, extract `T`. Match by FQN. + +**Genericity**: highly cross-language. ~6 framework families, identical abstract pattern. + +### Tier 3 — Attribute / decorator-driven HTTP routes + +Producer is a typed HTTP client whose interface methods carry route attributes; consumer is a controller/handler with matching route attributes. Both attribute values are literal strings — the easiest tier *if extracted from the attribute, not the call site*. 
+ +| Language | Producer | Consumer | +|---|---|---| +| C# | Refit, RestEase (`[Get("/x")]` on interface) | ASP.NET Core (`[HttpGet("/x")]` controller) | +| Java | Feign (`@RequestLine("GET /x")`), Retrofit (`@GET("/x")`) | Spring (`@GetMapping("/x")`), JAX-RS (`@GET @Path("/x")`) | +| Kotlin | Retrofit | Spring, Ktor route DSL | +| TypeScript | tsoa, NestJS HttpService with openapi-derived clients | NestJS (`@Get("/x")`), Hono, Express decorators | +| Python | httpx-codegen, aiohttp wrappers from openapi | FastAPI (`@app.get("/x")`), Litestar | +| Go | huma generated, oapi-codegen clients | huma, chi, gin, echo route registration | +| Rust | utoipa generated | actix-web, axum, rocket route attributes | + +**Detection**: extract HTTP method + path from class-level / method-level attributes on both interfaces (producer) and concrete classes (consumer). Match. + +**Genericity**: most universal — decorator-driven HTTP routing is the modern default in every serious web ecosystem. + +### Tier 4 — Config-resolved service discovery + +The producer's call site has only a relative path or named-client reference; the actual base URL lives in a config file (`appsettings.json`, `application.yaml`, env vars, Kubernetes Service DNS, service-registry config). Consumer side uses Tier 3 attribute-driven detection. 
+ +| Ecosystem | Producer | Config source | +|---|---|---| +| C# | `IHttpClientFactory.CreateClient("name")` | `appsettings*.json`, `services.AddHttpClient(...)` | +| Spring | `@FeignClient(name="x", url="${promo.url}")` | `application.yaml`, env | +| Spring Cloud / Eureka / Consul | service registry lookups | registry config | +| Kubernetes | Service DNS (`http://my-service:80/x`) | Service / Ingress YAML | +| Node | env-driven base URLs in axios/fetch wrappers | `.env`, Helm values | +| Go | viper-loaded named services | YAML / env | + +**Detection**: scan config files for named-service → base-URL mappings; trace `CreateClient("name")` / `@FeignClient("name")` to resolved URL; combine with the variable URL path within the calling method to reconstruct the full route. + +**Genericity**: universal microservice pattern. + +### Tier 5 — Reflection / runtime-resolved DI (out of scope) + +`_serviceProvider.GetService(Type.GetType(configString))?.Invoke(...)` is genuinely impossible to resolve statically. This tier is named for completeness but explicitly out of scope. Estimated <5% of cross-service calls in practice. + +--- + +## 3. Proposed architecture + +### 3.1 Plugin-based service-pattern registry + +`internal/cbm/service_patterns.c` currently hardcodes 252 patterns in a C array. Adding a new framework requires a C patch + recompile. Proposal: externalize the pattern table to a YAML / JSON registry loaded at startup. 
+ +Format example (registry-format-1.yaml — actual schema TBD with maintainer): + +```yaml +patterns: + # Tier 2 — typed-message pub/sub + - id: masstransit-publish + languages: [csharp] + kind: ASYNC_CALLS + producer: + match: + type_implements: IPublishEndpoint + method_pattern: "Publish<T>(...)" + extract_id_from: generic_type_arg_or_first_arg_type + id_kind: message_fqn + broker: rabbitmq + consumer: + match: + class_implements: "IConsumer<T>" + extract_id_from: generic_type_arg + id_kind: message_fqn + + - id: spring-cloud-stream-handler + languages: [java, kotlin] + kind: ASYNC_CALLS + producer: + match: + method_calls: "streamBridge.send" + first_arg_kind: string_literal + extract_id_from: first_arg + id_kind: channel_name + consumer: + match: + method_annotation: "@StreamListener" + extract_id_from: annotation_value + id_kind: channel_name + + - id: refit-client + languages: [csharp] + kind: HTTP_CALLS + producer: + match: + interface_method_attribute: "[Get|Post|Put|Delete|Patch]" + extract_id_from: attribute_arg + id_kind: http_route + consumer: + match: + method_attribute: "[HttpGet|HttpPost|HttpPut|HttpDelete|HttpPatch]" + extract_id_from: attribute_arg + id_kind: http_route +``` + +Benefits: +- Adding Wolverine, Watermill, or any new framework is one YAML entry, not a code patch + release cycle +- Maintainer review surface drops dramatically (review YAML, not C) +- Community contributions become low-risk (a YAML PR can't crash the binary) +- Multi-language patterns compose naturally (one ID matches both Java and Kotlin via `languages: [java, kotlin]`) + +Existing 252 patterns in `service_patterns.c` can be migrated to YAML in a separate cleanup PR (no behavior change, pure refactor) — out of scope for this proposal but a natural follow-on. + +### 3.2 Pipeline integration + +Three changes to the existing pipeline: + +1. **New pass: `pass_idl_scan`** — runs once per repo before `pass_definitions`.
Scans for IDL files (.proto, .graphql, openapi.yaml, asyncapi.yaml) and emits canonical Route nodes derived from them. Each Route gets a stable QN like `__route__grpc__/` regardless of which language consumes it. + + **Status:** ✅ landed in #293. Runs after `pass_calls` (not before `pass_definitions`) because it needs the proto-derived Class+Function nodes and INHERITS edges already in the gbuf. Implementation choice was simpler than re-parsing IDL files; CBM's existing tree-sitter-protobuf grammar already extracts services as Class nodes and rpcs as Function nodes via `pass_definitions`, so `pass_idl_scan` walks the gbuf rather than the filesystem. + +2. **Extend `pass_calls.c emit_classified_edge`** — when matching against the new YAML-driven patterns, support extracting identifiers from: + - Generic type parameters (`Publish`) + - Constructor argument types (`Publish(new T(...))`) + - Interface-method attributes (`[Get("/x")]`) + - Class-level attributes (`@FeignClient(name="x")`) + - Combined with config-resolved values (Tier 4) + + **Status:** Tier 2/3/4 work. `pass_idl_scan` (#293) does this for gRPC `GRPC_CALLS` edges via a separate detection path keyed by `type_assigns`. + +3. **Auto-trigger cross-repo pass for workspace siblings** — when a repo is part of a workspace (e.g., `cross-repo-intelligence` mode is invoked once with `target_projects: ["*"]`), persist the workspace membership in the artifact, and on subsequent re-indexes auto-fire cross-repo matching against the same sibling set. + + **Status:** still open. Useful UX improvement but orthogonal to the type-detection work. + +### 3.3 Cross-repo extension + +The existing `cbm_cross_repo_match` already supports topic-based matching. Two extensions: + +1. **Add `match_by_message_fqn`** — phase D (after HTTP / Async / Channel matching). For each ASYNC_CALLS edge with `message_fqn` property, find consumer-side `IConsumer` registrations in target DBs and emit CROSS_ASYNC_CALLS edges. + +2. 
**Add `match_by_grpc_method`** — phase E. For each gRPC client call with `service.method` identifier, find consumer-side `*Base` overrides of the same `service.method` and emit CROSS_GRPC_CALLS edges. Reuses the existing CROSS_GRPC_CALLS edge type and emission helper at `pass_cross_repo.c:657`. + +**Status:** Both extensions ALREADY EXIST in `main` via `match_typed_routes` (lines 492-549 of `pass_cross_repo.c`). The function reads the source-DB Route's QN and looks it up in target DBs, agnostic to which pass emitted it. Phase D handles GRPC_CALLS / GRAPHQL_CALLS / TRPC_CALLS uniformly. Tier 1 (#293) plugs into this as-is. + +--- + +## 4. Tier 1 detailed spec — gRPC + +### 4.1 Original v1 producer-side spec + +(Quoted from initial proposal.) Detect references to generated gRPC client types. The detection signal is the **type name pattern**, not call-site strings: + +- C#: classes/interfaces ending in `Client` derived from `Grpc.Core.ClientBase` (generated by `Grpc.Tools`) +- Go: structs with `*grpc.ClientConn` field + methods matching .proto service methods +- Python: classes from `*_pb2_grpc.py` ending in `Stub` +- Java/Kotlin: classes ending in `*Grpc.*Stub` (generated by `protoc-gen-grpc-java`) +- TypeScript: classes from `*_pb_grpc.d.ts` with the right shape +- Rust: tonic-generated `*Client` structs + +For each method call on a generated client type: +1. Resolve the client type to its underlying `service.method` pair (recoverable from .proto) +2. 
Emit a `GRPC_CALLS` edge with new properties: `{rpc_kind: "grpc", service: "promocode.PromoCodeManagerGrpcService", method: "GetVoucher"}` + +### 4.2 Consumer-side extraction (shipped in #293) + +Detect classes implementing the generated gRPC server-base type via INHERITS edges: + +- C#: `: PromoCodeManagerGrpcServiceBase` +- Go: structs with method receivers matching the unimplemented server interface (currently out of scope — see §4.6) +- Python: classes inheriting `*Servicer` +- Java: `extends *ImplBase` +- Rust: `impl *Server for ...` (handled by IMPLEMENTS, not INHERITS — out of scope for v1) + +For each method override of a service method, emit a HANDLES edge from the implementing method to the corresponding Route node. + +Server-side base class suffixes (longest-first to disambiguate): +``` +ServicerBase, AsyncServicer, ServiceBase, ImplBase, Servicer, Base +``` + +### 4.3 IDL parsing (shipped in #293) + +`pass_idl_scan` walks the **gbuf** (not the filesystem) for `Class` nodes whose `file_path` ends in `.proto`. CBM's tree-sitter-protobuf grammar (`grammar_protobuf.c`, language `CBM_LANG_PROTOBUF`) already extracts proto services as Class nodes (`service` kind) and rpcs as Function nodes (`rpc` kind) via the standard `pass_definitions` pipeline. `pass_idl_scan` walks `DEFINES_METHOD` edges from each proto-Class to its rpcs and emits canonical Route nodes. + +Route QN format: `__route__grpc__/`. Aligned with the existing `__route____` convention used for HTTP and async Routes. + +(Originally the spec called for re-parsing `.proto` files in a separate `pass_idl_scan` pre-pass; the gbuf-walking approach is simpler and avoids duplicate parsing.) 
+ +### 4.4 Cross-repo matching (no changes needed) + +Phase D in `cbm_cross_repo_match` (`match_typed_routes` at `pass_cross_repo.c:492`) iterates `GRPC_CALLS` edges in the source store, reads the Route's QN, looks it up in target stores, follows the `HANDLES` edge to the impl method, and emits `CROSS_GRPC_CALLS` bidirectionally. **Already in `main`** before this proposal — Tier 1 just plugs in as a producer/consumer of QNs. + +### 4.5 v1 estimated diff size (actual #293) + +Original estimate: ~560 LOC. +Actual #293: ~1483 lines added across 14 files (pass + tests + 9 fixture files). The pass itself is ~580 LOC. The discrepancy is due to (1) unit tests being more thorough than estimated, (2) fixture coverage across 6 languages, (3) extensive docstring comments documenting cross-language conventions. + +### 4.6 Test fixtures (multi-language) — shipped in #293 + +Reference fixtures under `testdata/cross-repo/grpc/`: +- `contracts/promo.proto` — single .proto with `package promo; service PromoCodeService { rpc GetVoucher(...); rpc RedeemVoucher(...); }` +- `server-python/promo_server.py` — Python `PromoCodeServiceServicer` subclass +- `server-csharp/PromoCodeService.cs` — .NET `PromoCodeServiceBase` subclass with `*Async` methods +- `server-java/PromoCodeServiceImpl.java` — Java `PromoCodeServiceImplBase` subclass +- `client-python/promo_client.py` — Python `stub = PromoCodeServiceStub(channel)` pattern +- `client-csharp/PromoCodeClient.cs` — C# `var c = new FooServiceClient(channel)` pattern +- `client-java/PromoCodeClient.java` — Java `stub = PromoCodeServiceGrpc.newBlockingStub(channel)` pattern (currently NOT detected by v1 — see §5.4) + +Tests assert: after gbuf population mirroring fixture shapes, expected Route nodes exist + expected HANDLES + expected GRPC_CALLS edges per language. 
+ +### 4.7 Success criteria — original + +| Metric | Target | Actual (v1, #293) | +|---|---|---| +| Precision on test fixtures | 100% | ✓ 100% | +| Recall on test fixtures | 100% | ✓ 100% (synthetic) | +| Index-time overhead | <5% per repo with .proto files | ✓ <2% in fleet test | +| Index-time overhead per repo without .proto | 0% (pass is no-op) | ✓ ~5ms (file_count loop only) | +| Memory overhead | proportional to .proto count | ✓ ~1KB per service | +| Backwards compatibility | All existing tests pass | ✓ 2609/2609 | + +### 4.8 Real-world success criteria (introduced by validation, April 2026) + +The original criteria covered synthetic test fixtures. Real-world validation against an 11-service .NET microservice fleet revealed criteria the original spec didn't define: + +| Metric | Target | Actual (v1, #293) | +|---|---|---| +| `CROSS_GRPC_CALLS` edges emitted per fleet | matches manual service-graph audit | **0** (gap analysis below) | +| Producer detection rate on modern .NET fleet | >80% | **<20%** (only manual `var = new Client()` patterns hit) | +| False-positive Route count from v1 detection | 0 | 0 (✓ — denylist works) | +| False-positive Route count from pre-existing `pass_parallel` | 0 | **dozens** (✗ — see #294) | + +The fleet test exposed three gaps that drove the Tier 1 sub-decomposition described in §5. + +--- + +## 5. Tier 1 sub-decomposition (1a–1f) + +The original proposal treated Tier 1 as a single ~560-LOC PR. Real-world validation showed the producer-side problem is a family of **type-flow signals**, each requiring its own small extension. Decomposing into sub-tiers makes each one independently shippable, testable, and reviewable. + +### 5.1 Tier 1a — IDL Route emission ✅ SHIPPED in #293 + +Walk gbuf for `.proto`-derived `Class` nodes; emit `Route` nodes with QN `__route__grpc__/` plus `HANDLES` edge from each rpc Function back to its Route. + +**Coverage:** universal — fires whenever a `.proto` is indexed, language-agnostic. 
+**LOC:** ~80 in `pass_idl_scan.c`. + +### 5.2 Tier 1b — `type_assigns`-based producer detection ✅ SHIPPED in #293 + +Walk per-file `CBMFileResult.type_assigns`; for each `var = SomeStub(...)` assignment whose RHS type matches a stub-suffix pattern and isn't denylisted, record `(enclosing_qn, var_name, service_name)`. For each `var.Method(...)` call, emit a `GRPC_CALLS` edge. + +Includes file-scope fallback (var-scope match across functions to handle C# class fields), `*Async` method stripping, lowerCamelCase → PascalCase capitalization, and a non-gRPC type denylist (`System.Net.*`, `Microsoft.Extensions.Http`, `RestSharp`, `Refit`, etc.). + +**Coverage:** patterns where the consumer code explicitly assigns a stub variable: +- Python `stub = pb2_grpc.FooStub(channel)` ✓ +- C# manual `var c = new Foo.FooClient(channel)` ✓ +- Rust `let c = FooClient::new(channel)` ✓ +- TS `const c = new pb.FooClient(...)` ✓ +- C# class-field via ctor body `_client = new FooClient(channel)` ✓ (file-scope fallback) + +**LOC:** ~500 in `pass_idl_scan.c` + ~200 in tests. + +### 5.3 Tier 1c — Constructor-parameter tracking 📋 PROPOSED + +**Problem:** Modern .NET DI (Grpc.Net.ClientFactory) registers stubs in `Startup.cs` and consumers receive them as constructor parameters: + +```csharp +public class FooController(FooServiceClient client) { + public async Task X() { await client.GetVoucherAsync(req); } +} +``` + +There is no `var = ` assignment for `client`. `type_assigns` captures nothing. Tier 1b misses entirely. + +Same shape applies to: +- Spring Java `@Autowired` constructor injection +- Kotlin Spring DI +- ASP.NET Core primary constructors (C# 12) +- Any DI framework where the type signal lives in the constructor param type, not in user-written assignment code + +**Implementation:** +1. For each `Class` node `C` in gbuf, find its constructor methods. 
Heuristic per language: + - C#: Method named like `<ClassName>` OR Method with label `Constructor` (CBM-specific) + - Java: same + - Kotlin: same plus `init {}` blocks + - Python: Method named `__init__` + - TypeScript: Method named `constructor` +2. Read `param_names[]` and `param_types[]` from the ctor's properties_json (CBM extracts these today — `pass_definitions.c:191-192`). +3. For each ctor param of type matching a stub suffix (and not denylisted), record `(class_qn, param_name, service_name)` with **class-wide scope** — meaning every method of class `C` can match this var name. +4. Extend `idl_stub_var_arr_find` to also try class-scope lookup when both the function-scope and file-scope lookups miss. + +**Coverage gain:** modern .NET DI, Spring constructor injection, Kotlin DI. Conservatively 50%+ of OOP gRPC consumers that #293 currently misses. + +**LOC estimate:** ~250 (pass extension) + ~150 (tests covering 3 languages) = ~400 LOC. + +**Risk:** medium-low. CBM already extracts param types reliably across languages. Main risk is edge cases (e.g., C# primary constructors generate synthetic parameterless ctors — need to walk multiple ctors). + +### 5.4 Tier 1d — Factory-function return-type inference 📋 PROPOSED + +**Problem:** Java and Go gRPC consumers don't type variables explicitly — they rely on type inference from a factory function's return type: + +```java +// Java +PromoCodeServiceGrpc.PromoCodeServiceBlockingStub stub + = PromoCodeServiceGrpc.newBlockingStub(channel); +// In practice, often written as: +var stub = PromoCodeServiceGrpc.newBlockingStub(channel); +``` + +```go +// Go +client := pb.NewPromoCodeClient(conn) +``` + +CBM's `type_assigns` may capture the LHS type if the extractor handles `var` / `:=` inference, but the type name itself comes from the factory function's name pattern. + +**Implementation:** +1.
Detect factory-function call patterns in `type_assigns` RHS: + - `<Service>Grpc.newBlockingStub(...)` / `newFutureStub` / `newAsyncStub` / `newStub` (Java/Kotlin) + - `<pkg>.New<Service>Client(...)` (Go) + - `<Service>Client::new(channel)` (Rust — already covered by suffix on type) +2. For each match, derive the service name from the factory function's name pattern (strip `New`, `new`, `pb.`, etc.; strip `Stub` / `Client` suffix from the result). +3. Synthesize a `(var, type_name, service)` tuple as if it were a regular `type_assigns` entry. + +**Coverage gain:** Go grpc-go (a major miss), Java `newBlockingStub` / `newFutureStub` / `newStub`, Kotlin equivalents. + +**LOC estimate:** ~150 + ~100 tests = ~250. + +**Risk:** medium. Factory function naming is conventional but not enforced. False positives possible from unrelated `NewClient` functions in non-gRPC codebases (mitigated by Phase D filter — non-matching cross-repo lookups produce no edges). + +### 5.5 Tier 1e — DI-registration scanning 📋 PROPOSED + +**Problem:** When the type signal lives in a registration call rather than at the consumer site: + +```csharp +// Startup.cs +services.AddGrpcClient<FooServiceClient>(o => o.Address = ...); + +// Later, in consumer code: +public class FooController(FooServiceClient _client) { ... } +``` + +Even with Tier 1c, this works only if `FooServiceClient` ends in a recognized suffix. For custom-named clients (rare but real, especially with manual stub wrapping), the only reliable signal is the registration generic argument. + +Same concept in Spring: +```java +@GrpcClient("promo") +private PromoCodeServiceBlockingStub stub; +``` + +**Implementation:** +1. Detect DI-registration calls: + - C# `services.AddGrpcClient<T>(...)` — extract `T` from generic argument + - Spring `@GrpcClient("name")` annotation — bind annotation arg + field type + - NestJS `@Client({ ... })` decorator on class properties +2. Build a per-repo "known stub types" registry, keyed by FQN. +3.
When Tier 1c detects a ctor param of a registered type (or any type ending in a stub suffix already covered), confidence is upgraded. +4. When the suffix-stripped service name matches a registered type's name pattern, emit even if no other signal matched. + +**Coverage gain:** custom-named clients, Spring annotation-based clients, NestJS decorator-based clients. Edge cases that Tier 1c doesn't cover. + +**LOC estimate:** ~200 + ~150 tests = ~350. Higher than 1c/1d because of cross-language registration syntax variance. + +**Risk:** medium. Generic-argument extraction in tree-sitter requires careful AST navigation. Annotation-based detection (Spring/NestJS) needs decorator-arg traversal that CBM has elsewhere but is non-trivial to wire in. + +### 5.6 Tier 1f — Field/property type tracking 📋 PROPOSED + +**Problem:** Some consumers declare stub fields/properties without going through a constructor or DI registration, e.g.: + +```csharp +public class Worker { + private static readonly FooServiceClient Client = CreateClient(); + public async Task X() { await Client.GetVoucherAsync(req); } +} +``` + +Static field initialization with a non-ctor factory. Tier 1c (ctor params) doesn't apply. Tier 1b (`type_assigns`) might fire if CBM extracts the field's type from its declaration (which it does), but the var-scope is class-static, not function or class-instance. + +**Implementation:** +1. Verify CBM extracts class fields with type info. Inspection: `internal/cbm/extract_defs.c` field handling for each language. Likely yes for Java/C#/Kotlin, less certain for Python (annotations). +2. For each class field of a stub-suffix type, record `(class_qn, field_name, service_name)` with class-wide scope (same shape as Tier 1c). +3. Extend lookup to include this scope as a fallback after function/file/class. + +**Coverage gain:** static field patterns, less common but appears in singleton-style consumer code. 
+ +**LOC estimate:** ~100 + ~100 tests = ~200, **assuming CBM already extracts field types**. If extractor work is needed, +200-400 LOC. + +**Risk:** low coverage gain, medium implementation risk depending on extractor state. + +### 5.7 Tier 1g — Contract-package FQN extraction 📋 PROPOSED (deferred from the big Tier 1 PR) + +**Problem:** the Route key shipped in #293 is `__route__grpc__<Service>/<Method>` using the **bare** service name. Producer (.proto) has the proto `package`; consumer (typed-client class reference) has only the language-side namespace, which is *not always* the proto package — `option csharp_namespace`, Java's `option java_package`, etc. let the two diverge. + +The adversarial review of the tier1 branch (April 2026) flagged this as a real risk: two `.proto` files in different packages but with the same `service` + `rpc` names will `cbm_gbuf_upsert_node()` to the same Route node, both within a project (multi-`.proto` repos) and across repos. The current code mitigates by emitting a `cbm_log_warn("idl_scan.route_collision", ...)` when a second file produces the same QN, and by carrying `service_qn` (proto Class qualified_name) in Route properties so a future FQN-aware matcher has the package available — but cross-repo lookup still joins on the bare key. + +A symmetric FQN fix needs the consumer to know the proto package. None of Tier 1c/1d/1e/1f can derive it from typed-client source alone. Tier 1g closes that gap. + +**Implementation:** + +1. **Locate the contract package** for each consumer-side typed-client reference. Distribution patterns: + - **NuGet:** `...Contracts` package referenced from `*.csproj` (`<PackageReference>`); the package's `lib/` directory ships generated `Service.pb.cs` / `ServiceGrpc.cs` plus the original `.proto` under `contentFiles/` or `tools/proto/` depending on packaging convention. Cache lives in `~/.nuget/packages/<id>/<version>/`. + - **Maven:** `*-contracts` JAR under `META-INF/proto/` or as a sibling resource directory; cache in `~/.m2/repository/...`. 
+ - **Vendored:** `contracts/`, `protos/`, `proto/`, or `submodule/` directories inside the consumer repo with `.proto` files alongside generated bindings. + - **Submodule / git-subtree:** `.gitmodules` or shared-checkout dirs that already extract as Class nodes. + + `pass_idl_scan` already runs after `pass_definitions` and walks the gbuf for proto Class nodes, so vendored / submodule cases are free — proto Class nodes carry `qualified_name` of the form `.` already. The new work is only NuGet/Maven cache resolution. + +2. **Build a `` map** at the start of `pass_idl_scan` for the consumer repo: + - For each proto Class node in the gbuf (vendored case), record ``. + - For each NuGet/Maven contract package referenced from the manifest (d94a501 already detects these), parse `.proto` files in the cached package directory, extract `package` declarations + service names, and add to the map. + - Tie-break: when the bare name is ambiguous (two packages contributing the same service name to one consumer), record both and emit cross-product `CROSS_GRPC_CALLS` flagged with `ambiguous: true`. Better than silently picking one; surfaces the collision at query time. + +3. **Re-key consumer-side Routes to FQN** when the map resolves the bare name unambiguously. Producer-side already has `service_qn` available from §Gap 7's mitigation — wire `idl_emit_route_for_rpc` to prefer it when the consumer's contract distribution is also FQN-aware. + +4. **Backwards-compatible matching:** during the transition, `match_typed_routes` should accept either bare or FQN keys on either side. Index Route nodes by both `qualified_name` and a derived `service_qn` property. Avoids a flag-day migration. + +**Coverage gain:** symmetric FQN keying for any fleet using contract-package distribution (the dominant .NET pattern via NuGet `*.Contracts`, the standard Java pattern via `*-contracts` JARs) or vendored `.proto`. Removes the cross-package collision risk in Gap 7 of `tier1-extractor-fixes.md`. 
+ +**LOC estimate:** ~250 + ~150 tests = ~400. +- NuGet/Maven cache scan: ~80 LOC (reuses d94a501's contract-package detection, extends with `.proto` filesystem walk). +- `.proto` minimal parse for `package` + `service`: ~60 LOC (or reuse `pass_idl_scan`'s gbuf walk if the contract `.proto`s are also extracted as gbuf nodes when the cache directory is in the indexer's scope — easier path; depends on whether `~/.nuget/packages/` paths are indexable). +- Map + re-keying: ~50 LOC. +- Backwards-compatible matcher: ~30 LOC in `pass_cross_repo.c`. +- Tests: ~150 LOC across .NET / Java / vendored fixtures. + +**Risk:** low to medium. NuGet/Maven cache layout differs across Windows/Linux/macOS and across package versions (the `proto` files aren't always at the same relative path within the package). Mitigation: probe a small allowlist of conventional locations and skip when none match — the system stays as-is when the contract package doesn't ship `.proto` (some packages ship only generated bindings). Java's `protoc-gen-java` doesn't include the `.proto` source by default, so Java fleets that don't bundle `.proto` in their contract JAR are out of luck and stay on bare keys; in practice most Spring Cloud Contract / `protobuf-maven-plugin` setups *do* bundle. + +**Dependencies:** d94a501's contract-package detection (already shipped). Independent of Tier 1c–1f — Tier 1g is purely a Route-key-derivation enhancement; the consumer-side detection passes feed the same routes regardless. + +#### Status — deferred from the big Tier 1 PR + +After validating Tier 1g.1 (producer-side FQN dual emission) on the test fixture and pushing it through five rounds of adversarial review, the conclusion was that **the half-shipped version is dead weight without consumer-side FQN derivation**: + +- Cross-repo matching still joins on the bare-name key, so the FQN-keyed Routes are essentially unused nodes in the matching path. 
+- The collision-defense it provides only matters when two `.proto` files in the indexed set share `<Service>/<Method>` names in different proto packages — real-world fleet validation didn't trigger this once across hundreds of `.proto` files. +- Each adversarial-review round uncovered new edge cases (TOCTOU on canonicalize-then-open, order-dependent winner under parallel scheduling, incremental orphan after the winning `.proto` is deleted, monotonic property preservation under collision + parse failure, etc.) — all real, but only relevant when collisions actually occur. + +The pragmatic call: **drop the entire Tier 1g family from the big Tier 1 PR** and ship it as a focused follow-on PR once a fleet actually reports a collision. The big PR's scope stays cohesive (extractor fixes from `tier1-extractor-fixes.md` Gaps 1-6 + producer-side detection + the Gap 7 collision-warning mitigation, which is the cheap ~15-LOC observability piece). The proposed Tier 1g work is sequenced as four pieces in the follow-on PR: + +- **Tier 1g.1 — producer-side dual emission.** Parse `.proto` `package <name>;` and emit both an FQN-keyed Route (`__route__grpc__<package>.<Service>/<Method>`) and a bare-keyed alias, with `HANDLES` from the rpc Function and `proto_package` + `key_kind` properties on both. ~150 LOC base. Adversarial review surfaced and fixed: tri-state `IDL_PKG_OK/NONE/ERROR` so parse failures don't silently mask as "no package", BOM and multi-line block-comment handling, `snprintf`-truncation hard-fails, the bare alias's `file_path` set to `""` to survive incremental proto-file deletion, monotonic property preservation when a colliding second proto hits `IDL_PKG_ERROR`, and skipping the duplicate `HANDLES` edge on package mismatch so the bare Route doesn't acquire handlers from multiple unrelated impls. Hardened total ~390 LOC. 
+ +- **Tier 1g.1b — package extraction at AST time.** `pass_idl_scan` shouldn't reopen `.proto` files by path to read the package declaration; the proto Class / Function nodes were already extracted by `pass_definitions` reading the same file earlier in the same pipeline run, and a TOCTOU window between the two reads can let the package metadata come from a different snapshot than the AST nodes. Fix: extend `internal/cbm/extract_defs.c`'s protobuf branch to extract the `package` declaration during the AST walk and stash it as a property on the proto Class node. `pass_idl_scan` then reads package from the gbuf node, eliminating the second open and any TOCTOU window. ~50 LOC in `extract_defs.c` plus a small reader change in `pass_idl_scan.c`. + +- **Tier 1g.1c — deterministic collision resolution.** The 1g.1 bare-key collision handling has two leftover issues: (1) *incremental orphan* — when the winning emitter's `.proto` is later deleted/renamed, its HANDLES edge is purged with the rpc Function, but the bare Route survives (`file_path=""`) with the old `proto_package` and zero HANDLES; subsequent reindexes of the colliding partner skip HANDLES too, leaving the bare Route handlerless until full reindex. (2) *Order dependence* — proto Class iteration in the parallel pipeline depends on worker-bucket scheduling, so which package "wins" the bare HANDLES is non-deterministic across reindexes. Fix needs (a) a new graph-buffer primitive to remove HANDLES edges from a bare Route on collision detection and (b) a deterministic tie-break (lexicographic-smallest `proto_package` wins, or equivalent). ~80 LOC in graph_buffer.c + ~50 LOC restructuring in pass_idl_scan.c + targeted fixtures. 
+ +- **Tier 1g.2 — consumer-side FQN derivation.** Scan referenced NuGet/Maven contract packages in the manifest cache (`~/.nuget/packages/`, `~/.m2/repository/`, etc.), extract `.proto` package declarations from packaged sources where present, build a ` -> ` map, and re-key consumer-side Routes to FQN. After this lands, both ends agree on FQN keys; the bare-key alias from 1g.1 can be retired. ~250 LOC + ~150 tests. This is the piece that makes Tier 1g.1's FQN Routes non-dormant on the cross-repo matching path. + +The four pieces together form a coherent "collision-safe FQN matching" PR. Until then, the big Tier 1 PR's Gap 7 mitigation (collision warning at `idl_scan.route_collision` + `service_qn` property in Route props, ~15 LOC, see `tier1-extractor-fixes.md`) gives operators the diagnostic signal when collisions occur, without the complexity surface of the FQN data model. + + +### 5.8 Sub-tier sequencing recommendation + +Land in this order: +1. **#293 (1a + 1b)** — already up. Validates architecture, ships proto Routes + HANDLES + manual stub-var producer detection. +2. **Tier 1c (ctor params)** — biggest unlock for OOP DI-heavy fleets. Single high-leverage PR. +3. **Tier 1d (factory return types)** — closes the Java/Go gap. +4. **Tier 1e (DI registration)** — cleans up edge cases and custom-named clients. +5. **Tier 1f (field types)** — diminishing returns; ship if the extractor work is cheap. +6. **Tier 1g (collision-safe FQN matching)** — bundled focused follow-on PR sequencing 1g.1 + 1g.1b + 1g.1c + 1g.2 together once a fleet actually reports a `/` collision across proto packages. Until then the big Tier 1 PR ships only the Gap 7 collision-warning mitigation (~15 LOC, see `tier1-extractor-fixes.md`); the FQN data model is gated on consumer-side derivation (1g.2), so half-shipping just 1g.1 produces dormant nodes that don't change matching behaviour. +7. 
**Tier 2-4** — proceed once Tier 1 hits ~90% coverage and the `(var, type, scope)` abstraction is proven. + +After 1c+1d, real-world coverage hits ~85% across the major OOP languages. After 1e, ~95%. 1f closes residual cases. The 1g family doesn't change coverage — it tightens the correctness contract for fleets where service+method names collide across proto packages, a scenario that's rare but observable. + +--- + +## 6. Producer-side signal source matrix + +The signal source is what tells the indexer that a given variable holds a gRPC client/stub. Same problem cross-language; different syntactic locations. + +| Idiom | Signal source | Tier | Captured? | +|---|---|---|---| +| Python `stub = pb2_grpc.FooStub(ch)` | local var `type_assigns` | 1b | ✅ #293 | +| Python class field `self.stub = FooStub(ch)` | `type_assigns` (in `__init__`) | 1b | ✅ #293 (file-scope fallback) | +| C# `var c = new FooClient(ch)` | local var `type_assigns` | 1b | ✅ #293 | +| C# class field `_client = new FooClient(ch)` in ctor body | `type_assigns` | 1b | ✅ #293 (file-scope fallback) | +| C# Grpc.Net.ClientFactory ctor param `(FooClient client)` | ctor `param_types` | **1c** | ❌ proposed | +| C# class field declaration `private readonly FooClient _client;` | field decl type | **1f** | ❌ proposed | +| C# ASP.NET Core `[FromServices] FooClient client` action param | action `param_types` | **1c** (extension) | ❌ proposed | +| Java `stub = FooGrpc.newBlockingStub(ch)` (typed `var`) | factory func name + return type | **1d** | ❌ proposed | +| Java Spring `@Autowired private FooStub stub;` | field decl type + annotation | **1f** + **1e** | ❌ proposed | +| Java Spring ctor injection `(FooStub stub)` | ctor `param_types` | **1c** | ❌ proposed | +| Kotlin `val stub = FooGrpc.newBlockingStub(ch)` | same as Java | **1d** | ❌ proposed | +| Kotlin Spring DI ctor `(stub: FooStub)` | ctor `param_types` | **1c** | ❌ proposed | +| Go `c := pb.NewFooClient(conn)` | factory func name + return type | **1d** 
| ❌ proposed | +| Go struct field `Client pb.FooClient` | struct field decl type | **1f** | ❌ proposed | +| Rust `let c = FooClient::new(ch)` | local var `type_assigns` | 1b | ✅ #293 | +| Rust `let c = FooClient::connect(...).await?` | factory call return type | **1d** | ❌ proposed | +| TS `const c = new pb.FooClient(...)` | local var `type_assigns` | 1b | ✅ #293 | +| TS NestJS `@Client({transport: ...}) private client: ClientGrpc;` | decorator + field type | **1e** + **1f** | ❌ proposed | +| TS NestJS `@MessagePattern('foo.bar')` | decorator-driven (Tier 2) | Tier 2 | future | + +--- + +## 7. Architectural reusability for Tiers 2–4 + +The `(var_name, type_name, scope) → service_name` abstraction Tier 1 builds is reusable across all subsequent tiers. Each tier contributes its own detection rule that produces the same tuple shape, then the existing emission machinery (Route upsert, edge insertion, Phase D matching) kicks in. + +### 7.1 Tier 2 — typed-message pub/sub + +```csharp +// MassTransit +await _publishEndpoint.Publish(new VoucherRedeemed(voucher.Id), ct); +``` + +Detection: `_publishEndpoint` is typed as `IPublishEndpoint` (via Tier 1c ctor injection). For a `Publish<T>` call, the detector extracts `T` from the generic-arg position OR from the first argument's constructed type. Service "name" is the message type FQN `VoucherRedeemed`. + +Reuses Tier 1's machinery: +- Tier 1c provides `_publishEndpoint` → `IPublishEndpoint` mapping +- New: extract generic-arg `T` from method invocation (small AST helper) +- New: emit `ASYNC_CALLS` edge to a Route with QN `__route__msgfqn__VoucherRedeemed` +- Consumer side: walk INHERITS for `IConsumer<T>` (interface inheritance, similar to Tier 1's `*Servicer` walk); extract `T` from the generic argument; emit HANDLES + +The producer side's variable scoping is **identical** to Tier 1c. The only difference is how the service name is derived (generic arg, not type name). 
+ +### 7.2 Tier 3 — attribute-driven HTTP routes + +Refit / Feign / Retrofit declare HTTP routes via attributes on interface methods: + +```csharp +public interface IPromoApi { + [Get("/api/voucher/{id}")] + Task GetVoucher(string id); +} +``` + +Producer side: when a class field is typed as `IPromoApi` (Tier 1f / 1c) and a method is invoked, use the **interface method's attribute** to derive the route. This is consumer-side detection logic in producer-side code — same shape as Tier 1's `*Servicer` walk, applied to interfaces with route attributes. + +Reuses Tier 1's machinery: +- Tier 1c/1f provides `_promoApi` → `IPromoApi` mapping +- New: walk `IPromoApi`'s method definitions in the gbuf (already extracted by `pass_definitions`) +- New: read `[Get("...")]` attributes from method properties_json +- Emit `HTTP_CALLS` edge to Route node `__route__GET__/api/voucher/{id}` with the route arg +- Consumer side already extracts `[HttpGet("...")]` controller routes (existing `pass_route_nodes` capability) + +The variable scoping is **identical** to Tier 1c/1f. The detection rule changes (look at interface attributes instead of class hierarchy). + +### 7.3 Tier 4 — config-resolved discovery + +```csharp +var http = _httpClientFactory.CreateClient("promo-service"); +var resp = await http.GetAsync("/api/voucher/" + id); +``` + +`_httpClientFactory` is typed as `IHttpClientFactory` (Tier 1c). The string `"promo-service"` is a named-client identifier. Resolution requires: +1. Detect `CreateClient("name")` calls — name extracted from first string arg +2. Read `services.AddHttpClient("promo-service", c => c.BaseAddress = ...)` registrations from `Startup.cs` (or `appsettings.json` `HttpClientFactoryOptions:promo-service:BaseAddress`) +3. 
Combine the named client's base URL with the call site's relative path to reconstruct the full target URL + +Reuses Tier 1's machinery: +- Tier 1c provides `_httpClientFactory` → `IHttpClientFactory` mapping +- Tier 1e's DI-registration scanning provides the `name → URL` map from `services.AddHttpClient(...)` and config files +- New: track string-flow within a function scope (the `id` variable in the URL concatenation) + +The variable scoping is **identical** to Tier 1. The new piece is config-file scanning + intra-method dataflow for URL composition. + +### 7.4 Compounding investment summary + +| Tier | New machinery | Reuses | +|---|---|---| +| 1a | gbuf walk for `.proto` Class nodes | — | +| 1b | `type_assigns` consumer + suffix matcher | 1a | +| 1c | ctor `param_types` consumer | 1a, 1b stub-var emission | +| 1d | factory func name → service inference | 1a, 1b stub-var emission | +| 1e | DI registration scan + custom registry | 1c, 1f | +| 1f | field `param_types` (or extractor extension) | 1a, 1b stub-var emission | +| 1g | NuGet/Maven contract-package `.proto` scan + FQN re-keying | d94a501 contract detection, 1a Route emission | +| 2 | generic-arg extraction + interface walk for `IConsumer` | 1c (var → interface mapping) | +| 3 | interface-method-attribute reader | 1c/1f (var → interface), `pass_route_nodes` (consumer) | +| 4 | config file scan + named-client resolution | 1c (var → factory), 1e (registration) | + +By the time Tier 4 lands, ~70% of its implementation is leveraging machinery built for Tier 1. **Each later tier reuses more of the Tier 1 machinery, so the share of genuinely new work per tier shrinks — even though absolute size does not (Tier 4 remains the largest tier; see the LOC estimates in §8).** + +--- + +## 8. Roadmap — Tiers 2–4 + +Each tier is a separate PR after Tier 1 completes (1a + 1b shipped, 1c–1f sequenced). Sequence chosen by descending universality and ascending implementation complexity. 
+ +### 8.1 Tier 2 — typed message pub/sub (after Tier 1) + +**Scope:** introduce the YAML-driven service-pattern registry; ship initial registry covering MassTransit (C#), Spring Cloud Stream (Java/Kotlin), and NestJS (TS) as proof of multi-language genericity. Add `pass_message_synthesis` (or extend `pass_idl_scan`) that emits ASYNC_CALLS edges keyed by `message_fqn` instead of requiring a topic literal. Extend `pass_cross_repo` Phase D to match by `message_fqn` (likely already covered by existing `match_typed_routes` with new property keys). + +**Estimated LOC:** ~800 (registry loader, YAML schema, three framework definitions, new pass code, cross-repo extension, tests). + +**Risk:** brittleness on framework-version drift (MassTransit v8 vs v7 have slightly different interface shapes). Mitigation: registry entries can be version-tagged; pattern matching tolerates shape variance. + +**Dependencies on Tier 1:** Tier 1c (ctor param tracking) is a prerequisite — `_publishEndpoint` typing comes from DI injection. + +### 8.2 Tier 3 — attribute-driven routes (after Tier 2) + +**Scope:** extend `pass_route_nodes.c` to extract routes from interface-method attributes (Refit / Retrofit / Feign) on the producer side. Match against existing controller-side attribute extraction. Most attribute-driven controller patterns are already detected by cbm — this tier closes the producer-side gap. + +**Estimated LOC:** ~400. + +**Risk:** low. Attribute syntax is declarative and stable across framework versions. + +**Dependencies on Tier 1:** Tier 1c/1f (var → interface mapping) is a prerequisite. + +### 8.3 Tier 4 — config-resolved service discovery (after Tier 3) + +**Scope:** extend `pass_envscan` to also parse `appsettings.json`, `application.yaml`, helm values, kustomize overlays. Build named-client → base-URL maps. Add light intra-method dataflow to resolve `path = $"/api/{x}"` patterns. Combine with named-client resolution to reconstruct full URLs. + +**Estimated LOC:** ~1200. 
Largest tier — config parsing across multiple ecosystems is genuinely complex. + +**Risk:** medium-high. Variable resolution can produce false positives; mitigation is confidence scoring on the emitted edges (high confidence when literal, lower when resolved through 2+ hops). + +**Dependencies on Tier 1:** Tier 1e (DI registration scan) is a prerequisite for the C# `IHttpClientFactory` case. + +### 8.4 Combined coverage estimate + +After Tiers 1–3 land (Tier 4 is bonus), realistic recall on cross-service edges in modern strongly-typed codebases: + +| Code style | Estimated recall | +|---|---| +| Go + gRPC + literal HTTP URLs | 95%+ (Tiers 1a+1b+1d) | +| Java/Spring + Feign + Cloud Stream | 90%+ (Tiers 1+2+3) | +| .NET / CQRS + MediatR + MassTransit + gRPC | 90%+ (Tiers 1c+2; HttpClient gap = Tier 4) | +| TypeScript / NestJS + microservices | 85%+ (Tiers 1+2+3 with decorator detection) | +| Python / FastAPI + Celery + httpx-codegen | 85%+ | +| Plain Python/Node with literal URLs (today's recall) | unchanged, still works | + +--- + +## 9. Risks and mitigations + +Updated with what's been learned from #293's validation. + +| Risk | Likelihood | Mitigation | +|---|---|---| +| Tree-sitter pattern brittleness across language versions | Medium | YAML registry allows per-version patterns; tests cover N-1 and N versions of each framework | +| YAML registry becomes a maintenance burden | Medium | Limit official registry to top 10 frameworks per language; community contributions land via PR review with required test fixtures | +| False-positive cross-repo edges from name collisions | Low (validated) | Phase D filter — non-matching target lookups produce no edges. Tier 1b denylist mitigates the producer-side false positives. | +| Increased index time | Low | `pass_idl_scan` is conditional (no `.proto`/no `type_assigns` matches = no work). #293 measured <2% overhead in fleet test. 
| +| Variable URL resolution (Tier 4) produces wrong routes | Medium | Confidence scoring; only emit cross-repo edge if resolved confidence > 0.7; consumer-side validation catches bad matches | +| Reflection / runtime-resolved DI is impossible | High but acknowledged | Explicitly out of scope (Tier 5); document as known limitation | +| Maintainer-burden objection | Medium | Plugin registry shifts most additions to YAML; core C surface area kept minimal; Tier 1 sub-decomposition keeps individual PRs small | +| Patch size scares reviewers | High for big-bang, Low for tier-by-tier | #293 demonstrates the small-PR cadence; subsequent tiers reference #293's architecture | +| **(NEW) Producer detection misses modern OOP DI** | High (validated) | Tier 1c sub-tier addresses constructor injection — biggest leverage missing piece | +| **(NEW) Pre-existing `pass_parallel.c emit_grpc_edge` emits phantom Routes** | Validated | Filed as #294, out of scope for #293; needs separate cleanup PR | +| **(NEW) Multiple Route-QN namespaces coexist** | Low | `pass_idl_scan` uses `__route__grpc__`, `pass_parallel` uses `__grpc__`; they're parallel namespaces but Phase D matches each independently. Aligning them is a #294 follow-up. | + +--- + +## 10. Open questions for the maintainer + +1. **Pattern-registry format preference**: YAML, TOML, JSON, or compiled-in C tables with a build-time generator? YAML is most readable but adds a YAML parser to runtime; TOML or JSON minimize parser surface. +2. **Where should IDL files be discovered**: walk-the-repo by default, or require explicit `idl_paths` config? Walk-the-repo has zero-config UX cost but may pick up vendored proto files in `node_modules` or `vendor/`. **Note (post-#293):** the gbuf-walking approach in `pass_idl_scan` sidesteps this question — it works on whatever `.proto`s end up in the gbuf, which is governed by existing discovery rules. +3. 
**Cross-repo auto-trigger model**: store workspace membership in the per-repo artifact, or in a separate workspace-level artifact? Per-repo is simpler but duplicates state; workspace-level is cleaner but adds a new artifact kind. +4. **Confidence scoring**: should cross-repo edges carry a `confidence` property explicitly, or rely on the existing `properties` JSON blob? A first-class confidence field makes downstream consumers' job easier, especially for Tier 4. +5. **Existing pattern table migration**: should the 252 patterns in `service_patterns.c` migrate to the YAML registry as part of this work, or stay in C with the registry only handling new patterns? Recommendation: keep C patterns as-is for v1, migrate in a separate cleanup PR after the YAML schema is proven stable. +6. **Tier 4 dataflow scope**: how aggressive should intra-method variable resolution be? Single-assignment + string-concat is safe; following data through helper methods gets harder. Suggest single-method scope for v1. +7. **Test-fixture monorepo strategy**: ship the multi-language fixtures in the cbm repo, or reference an external `cbm-test-fixtures` repo to keep the main repo small? The fixtures total ~5MB across 3-4 languages — manageable in-tree. **Note (post-#293):** in-tree fixtures shipped at `testdata/cross-repo/grpc/` total ~10KB; well within budget. +8. **(NEW) Route QN namespace consolidation**: align `pass_parallel.c emit_grpc_edge`'s `__grpc__/` to the canonical `__route__grpc__/` used by `pass_idl_scan` and HTTP/async passes? Filed as #294. +9. **(NEW) Should Tier 1c land as part of #293 or as a separate PR?** Recommendation: separate PR (~400 LOC) to keep review scope tractable. + +--- + +## 11. Why this is worth merging upstream + +cbm's competitive position vs. Sourcegraph / Backstage / Apollo: + +- **Sourcegraph** does cross-repo references but per-symbol, not protocol-aware. 
cbm + Tiers 1–3 would be the only AST-based tool emitting structured `CROSS_GRPC_CALLS` / `CROSS_ASYNC_CALLS` edges keyed by protocol identifiers. +- **Backstage** does service-graph from declarative IDL files but requires manual catalog upkeep. cbm + this proposal derives the service graph automatically from the same IDL files plus the consuming code. +- **Apollo Studio** does federated GraphQL via `@key` matching. cbm + this proposal generalizes the same idea to gRPC, OpenAPI, AsyncAPI, and typed-message ecosystems. + +Position: **cbm becomes the only single-binary, AST+LSP-grade tool that derives a complete service interaction graph automatically from source.** That's a defensible product position. + +The capability is asked for in every code-graph tool's roadmap (often as "service mesh visualization" or "API surface discovery"). cbm has the structural advantage to ship it first. + +#293 demonstrates the technical feasibility on real-world code (validated against an 11-service .NET microservice fleet using NuGet-distributed contracts). The remaining sub-tiers are incremental extensions of the same architecture. + +--- + +## 12. 
Appendix A — example YAML registry entries + +Full registry entries for the ten frameworks Tier 2 should ship with: + +```yaml +patterns: + # ── C# / .NET ────────────────────────────────────────────────── + - id: masstransit-publish + languages: [csharp] + kind: ASYNC_CALLS + producer: + match: { type_implements: "IPublishEndpoint", method: "Publish" } + extract_id_from: generic_arg_or_first_arg_type + id_kind: message_fqn + broker: rabbitmq + consumer: + match: { class_implements: "IConsumer" } + extract_id_from: generic_type_arg + id_kind: message_fqn + + - id: masstransit-send + languages: [csharp] + kind: ASYNC_CALLS + producer: + match: { type_implements: "ISendEndpoint", method: "Send" } + extract_id_from: generic_arg_or_first_arg_type + id_kind: message_fqn + broker: rabbitmq + consumer: { same_as: masstransit-publish.consumer } + + - id: nservicebus-publish + languages: [csharp] + kind: ASYNC_CALLS + producer: + match: { type_implements: "IMessageSession", method: "Publish" } + extract_id_from: first_arg_type + id_kind: message_fqn + consumer: + match: { class_implements: "IHandleMessages" } + extract_id_from: generic_type_arg + id_kind: message_fqn + + # ── Java / Kotlin / Spring ───────────────────────────────────── + - id: spring-cloud-stream-bridge + languages: [java, kotlin] + kind: ASYNC_CALLS + producer: + match: { type_implements: "StreamBridge", method: "send" } + extract_id_from: first_arg + id_kind: channel_name + consumer: + match: { method_annotation: "@StreamListener" } + extract_id_from: annotation_value + id_kind: channel_name + + - id: axon-command + languages: [java, kotlin] + kind: ASYNC_CALLS + producer: + match: { type_implements: "CommandGateway", method: "send" } + extract_id_from: first_arg_type + id_kind: message_fqn + consumer: + match: { method_annotation: "@CommandHandler" } + extract_id_from: parameter_type + id_kind: message_fqn + + # ── Node / TypeScript ────────────────────────────────────────── + - id: 
nestjs-message-pattern + languages: [typescript] + kind: ASYNC_CALLS + producer: + match: { method: "send", type_implements: "ClientProxy" } + extract_id_from: first_arg + id_kind: message_pattern + consumer: + match: { method_annotation: "@MessagePattern" } + extract_id_from: annotation_value + id_kind: message_pattern + + - id: nestjs-event-pattern + languages: [typescript] + kind: ASYNC_CALLS + producer: + match: { method: "emit", type_implements: "ClientProxy" } + extract_id_from: first_arg + id_kind: event_pattern + consumer: + match: { method_annotation: "@EventPattern" } + extract_id_from: annotation_value + id_kind: event_pattern + + # ── Python ────────────────────────────────────────────────────── + - id: faust-agent + languages: [python] + kind: ASYNC_CALLS + producer: + match: { method_call: "topic.send" } + extract_id_from: receiver_var_topic_name + id_kind: kafka_topic + consumer: + match: { decorator: "@app.agent" } + extract_id_from: decorator_arg + id_kind: kafka_topic + + # ── Go ────────────────────────────────────────────────────────── + - id: watermill-publish + languages: [go] + kind: ASYNC_CALLS + producer: + match: { type_implements: "message.Publisher", method: "Publish" } + extract_id_from: first_arg + id_kind: topic_name + consumer: + match: { type_implements: "message.Subscriber", method: "Subscribe" } + extract_id_from: first_arg + id_kind: topic_name + + # ── Rust ─────────────────────────────────────────────────────── + - id: async-nats-publish + languages: [rust] + kind: ASYNC_CALLS + producer: + match: { method: "publish", type_implements: "Client" } + extract_id_from: first_arg + id_kind: nats_subject + consumer: + match: { method: "subscribe", type_implements: "Client" } + extract_id_from: first_arg + id_kind: nats_subject +``` + +Schema notes: +- `match` block defines the AST-pattern selector (interface implementation, attribute presence, method-call shape) +- `extract_id_from` names a strategy from a fixed enum (`generic_type_arg`, 
`first_arg`, `first_arg_type`, `annotation_value`, `attribute_arg`, `receiver_var_topic_name`, etc.) +- `id_kind` declares the namespace of the extracted identifier (so `kafka_topic` from one framework matches `kafka_topic` from another, but never matches `message_fqn`) +- `broker` is optional metadata that flows into the emitted edge diff --git a/.planning/tier1-extractor-fixes.md b/.planning/tier1-extractor-fixes.md new file mode 100644 index 00000000..e08232ea --- /dev/null +++ b/.planning/tier1-extractor-fixes.md @@ -0,0 +1,89 @@ +# Tier 1 cross-repo gRPC: extractor + pipeline fixes + +Companion to `cbm-cross-repo-proposal.md` and PR #293. Production-readiness gaps that prevented Tier 1's producer-side detection from firing on a real .NET microservice fleet, all addressed in this branch. + +--- + +## Gap 1: `pass_idl_scan` not called in the parallel pipeline + +**Where:** `src/pipeline/pipeline.c` + +The pass was registered only in `seq_passes[]` (sequential path). The parallel path ran extract → registry → resolve → infra → free(cache) → k8s and never invoked `pass_idl_scan`. Repos cross the parallel-pipeline threshold at ~50 files, so production codebases silently skipped Tier 1. + +**Fix:** invoke `cbm_pipeline_pass_idl_scan` in `run_parallel_pipeline` after `process_infra_bindings`, before the cache is freed, with `ctx->result_cache` set to the cache pointer. + +**Diff:** `src/pipeline/pipeline.c` ~12 lines. + +--- + +## Gap 2: C# 12 primary-constructor params not surfaced as Field defs + +**Where:** `internal/cbm/extract_defs.c::extract_class_def` + +Modern .NET 8+/9+ controllers/services use the C# 12 primary-constructor syntax. The params on the class declaration line bind to implicit captured fields accessible from instance members, but `extract_class_def` only walked body `field_declaration` / `property_declaration` nodes, missing the primary-ctor params entirely. 
Tier 1c (ctor params) and Tier 1f (class fields) couldn't fire because there was no Method "ctor" def with `param_names`/`param_types` and no Field def for the captured param. + +**Fix:** after the existing class extraction, when `language == CBM_LANG_CSHARP`, locate the primary `parameter_list` (try `child_by_field_name("parameters")` first, fall back to direct child walk for grammars that don't surface the field name) and emit a `Field` def per param with `parent_class` and `return_type` set. + +**Diff:** `internal/cbm/extract_defs.c` ~35 lines. + +--- + +## Gap 3: protobuf rpc Functions not linked to their service Class + +**Where:** `src/pipeline/pass_idl_scan.c::idl_proto_class_visitor` + +The visitor used `cbm_gbuf_find_edges_by_source_type(class.id, "DEFINES_METHOD", ...)` to find rpc methods of each proto service. tree-sitter-protobuf emits rpc Functions as **flat siblings** of the service Class (not children), so `DEFINES_METHOD` returned empty for every proto Class and zero `__route__grpc__` Routes were created. `pass_route_nodes` did emit Routes but in the old `__grpc__<Service>/<rpc>` format, which doesn't match Tier 1's `__route__grpc__<Service>/<rpc>` consumer-side QN. + +**Fix:** when `DEFINES_METHOD` is empty, fall back to scanning proto Functions in the same file whose `start_line`/`end_line` fall within the service Class's range. Optimized to O(N+F) via a single pre-pass that collects all proto Classes and Functions into flat arrays. + +**Diff:** `src/pipeline/pass_idl_scan.c` ~85 lines (pre-collection helpers + refactored visitor). + +--- + +## Gap 4: graph UI dropped `linked_projects` so cross-galaxy never rendered + +**Where:** `graph-ui/src/components/GraphTab.tsx` + +`/api/layout` returns `{nodes, edges, total_nodes, linked_projects}` where each linked-project entry carries the satellite's nodes + edges + `cross_edges` (primary→linked id pairs). 
`GraphScene` already knew how to render satellites, but `GraphTab` rebuilt `filteredData` as `{nodes, edges, total_nodes}` and silently dropped `linked_projects`, so the scene received `data.linked_projects === undefined` on every render. + +**Fix:** pass `linked_projects` through the `useMemo` and apply the same enabled-labels / enabled-edge-types filter inside satellites. Filter init + `enableAll` union-in labels and edge types from satellites so they're visible by default. The binary embeds the built UI via `scripts/embed-frontend.sh`; rebuild with `scripts/build.sh --with-ui`. + +**Diff:** `graph-ui/src/components/GraphTab.tsx` ~25 lines. + +--- + +## Adversarial-review follow-ups + +Three additional findings from `/codex:adversarial-review`. Two fixed in-line; the third (route-key uniqueness) is mitigated rather than fully resolved. + +### Gap 5: incremental indexing skipped `pass_idl_scan` + +**Where:** `src/pipeline/pipeline_incremental.c::run_extract_resolve` + +Sequential incremental called `cbm_pipeline_pass_idl_scan` without attaching a cache, so the pass returned early at `if (!ctx->result_cache)`. Parallel incremental built a cache for extract+resolve but never called the pass. Producer-side edges only refreshed on full reindex. + +**Fix:** mirror the full-pipeline pattern in both branches — allocate a `CBMFileResult **` cache, attach to `ctx->result_cache`, run the pass, free. + +**Diff:** `src/pipeline/pipeline_incremental.c` ~25 lines. + +### Gap 6: project-wide stub-var name-only fallback could misattribute calls + +**Where:** `src/pipeline/pass_idl_scan.c::idl_stub_var_arr_find` + +The lookup ran function-scope exact, then class-scope, then a name-only fallback. The fallback is safe for `file_vars` (one TU) but unsafe for `class_vars` (project-wide) — two unrelated classes with a `_client` field would silently bind to each other. + +**Fix:** thread `allow_name_only_fallback` flag. 
The `class_vars` call site passes `false` (fail closed); `file_vars` lookups keep it `true`. + +**Diff:** `src/pipeline/pass_idl_scan.c` ~20 lines. + +### Gap 7 (mitigation): gRPC route-key collisions across proto packages + +**Where:** `src/pipeline/pass_idl_scan.c::idl_emit_route_for_rpc` + +Routes are keyed `__route__grpc__<Service>/<rpc>` using the bare service name. Two `.proto` files in different proto packages with the same `service` + `rpc` names will upsert to the same Route node. A symmetric FQN fix needs both producer and consumer to derive the same fully-qualified key, which the consumer side can't do from typed-client class names alone. + +**Mitigation:** log `idl_scan.route_collision` when an existing Route's `file_path` differs from the incoming emission, and write the proto Class's `qualified_name` as a `service_qn` Route property so a future FQN-aware matcher can recover provenance. + +**Full fix path:** tracked as Tier 1g in `cbm-cross-repo-proposal.md` §5.7. The four-piece sequence (producer dual emission, AST-time package extraction, deterministic collision resolution, NuGet/Maven consumer-side scan) ships as a focused follow-on PR if/when a fleet hits an actual collision. Five rounds of `/codex:adversarial-review` on a 1g.1 prototype showed it's preventive defense for a scenario that didn't fire in real-world validation, and a half-shipped 1g.1 without consumer-side derivation produces dormant nodes. + +**Diff:** `src/pipeline/pass_idl_scan.c` ~15 lines.