From a2e83369ee74d20c2b5faac6d91f1e002cf551ed Mon Sep 17 00:00:00 2001 From: John Gemignani Date: Mon, 22 Dec 2025 17:20:35 -0800 Subject: [PATCH] Optimize id() and properties() field access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NOTE: This PR was created with AI tools and a human. Optimized id() and properties() field access on vertices and edges. When accessing id(v) or properties(v) on a vertex, the system was generating inefficient SQL that rebuilt the entire vertex agtype before extracting the field: age_id(_agtype_build_vertex(id, _label_name_from_table_oid(labels), properties)) This forced full vertex reconstruction for every row, even though the data was already available in table columns. Added optimize_vertex_field_access() in cypher_expr.c to detect these patterns and optimize them to direct column access: - age_id(_agtype_build_vertex(id, ...)) → graphid_to_agtype(id) - age_properties(_agtype_build_vertex(..., props)) → props - age_id(_agtype_build_edge(id, ...)) → graphid_to_agtype(id) - age_start_id(_agtype_build_edge(...)) → graphid_to_agtype(start_id) - age_end_id(_agtype_build_edge(...)) → graphid_to_agtype(end_id) - age_properties(_agtype_build_edge(...)) → props Note: age_label() is intentionally not optimized due to complexity of cstring-to-agtype string conversion. Added regression tests in unified_vertex_table.sql to verify the optimization works correctly for both vertices and edges. modified: regress/expected/unified_vertex_table.out modified: regress/sql/unified_vertex_table.sql modified: src/backend/parser/cypher_expr.c --- regress/expected/unified_vertex_table.out | 96 ++++++++++- regress/sql/unified_vertex_table.sql | 61 +++++++ src/backend/parser/cypher_expr.c | 192 ++++++++++++++++++++++ 3 files changed, 348 insertions(+), 1 deletion(-) diff --git a/regress/expected/unified_vertex_table.out b/regress/expected/unified_vertex_table.out index 58a3f1e36..118f4a837 100644 --- a/regress/expected/unified_vertex_table.out +++ b/regress/expected/unified_vertex_table.out @@ -1219,11 +1219,101 @@ SELECT * FROM cypher('unified_test', $$ RETURN e $$) AS (e agtype); ERROR: SET/REMOVE label can only be used on vertices +-- +-- Test 28: Verify id() and properties() optimization +-- +-- The optimization avoids rebuilding the full vertex agtype when accessing +-- id() or properties() on a vertex. Instead of: +-- age_id(_agtype_build_vertex(id, _label_name_from_table_oid(labels), properties)) +-- It generates: +-- graphid_to_agtype(id) +-- +-- And for properties: +-- age_properties(_agtype_build_vertex(...)) +-- It generates: +-- properties (direct column access) +-- +-- Create test data +SELECT * FROM cypher('unified_test', $$ + CREATE (:OptimizeTest {val: 1}), + (:OptimizeTest {val: 2}), + (:OptimizeTest {val: 3}) +$$) AS (v agtype); + v +--- +(0 rows) + +-- Test that id() works correctly with optimization +SELECT * FROM cypher('unified_test', $$ + MATCH (n:OptimizeTest) + RETURN id(n), n.val + ORDER BY n.val +$$) AS (id agtype, val agtype); + id | val +-------------------+----- + 10977524091715585 | 1 + 10977524091715586 | 2 + 10977524091715587 | 3 +(3 rows) + +-- Test that properties() works correctly with optimization +SELECT * FROM cypher('unified_test', $$ + MATCH (n:OptimizeTest) + RETURN properties(n), n.val + ORDER BY n.val +$$) AS (props agtype, val agtype); + props | val +------------+----- + {"val": 1} | 1 + {"val": 2} | 2 + {"val": 3} | 3 +(3 rows) + +-- Test id() in WHERE clause (common optimization target) +SELECT * FROM cypher('unified_test', $$ + MATCH (n:OptimizeTest) + WHERE id(n) % 10 = 0 + RETURN n.val +$$) AS (val agtype); + val +----- +(0 rows) + +-- Test properties() access in expressions +SELECT * FROM cypher('unified_test', $$ + MATCH (n:OptimizeTest) + WHERE properties(n).val > 1 + RETURN n.val + ORDER BY n.val +$$) AS (val agtype); + val +----- + 2 + 3 +(2 rows) + +-- Test edge id/properties optimization +SELECT * FROM cypher('unified_test', $$ + CREATE (:OptStart {x: 1})-[:OPT_EDGE {weight: 10}]->(:OptEnd {y: 2}) +$$) AS (v agtype); + v +--- +(0 rows) + +SELECT * FROM cypher('unified_test', $$ + MATCH (a)-[e:OPT_EDGE]->(b) + RETURN id(e), properties(e), start_id(e), end_id(e) +$$) AS (eid agtype, props agtype, sid agtype, eid2 agtype); + eid | props | sid | eid2 +-------------------+----------------+-------------------+------------------- + 11540474045136897 | {"weight": 10} | 11258999068426241 | 11821949021847553 +(1 row) + -- -- Cleanup -- SELECT drop_graph('unified_test', true); -NOTICE: drop cascades to 38 other objects +NOTICE: drop cascades to 42 other objects DETAIL: drop cascades to table unified_test._ag_label_vertex drop cascades to table unified_test._ag_label_edge drop cascades to table unified_test."Person" @@ -1262,6 +1352,10 @@ drop cascades to table unified_test."SameLabel" drop cascades to table unified_test."EdgeTest1" drop cascades to table unified_test."CONNECTS" drop cascades to table unified_test."EdgeTest2" +drop cascades to table unified_test."OptimizeTest" +drop cascades to table unified_test."OptStart" +drop cascades to table unified_test."OPT_EDGE" +drop cascades to table unified_test."OptEnd" NOTICE: graph "unified_test" has been dropped drop_graph ------------ diff --git a/regress/sql/unified_vertex_table.sql b/regress/sql/unified_vertex_table.sql index 8eebf9c1d..aebf6cd5a 100644 --- a/regress/sql/unified_vertex_table.sql +++ b/regress/sql/unified_vertex_table.sql @@ -748,6 +748,67 @@ SELECT * FROM cypher('unified_test', $$ RETURN e $$) AS (e agtype); +-- +-- Test 28: Verify id() and properties() optimization +-- +-- The optimization avoids rebuilding the full vertex agtype when accessing +-- id() or properties() on a vertex. Instead of: +-- age_id(_agtype_build_vertex(id, _label_name_from_table_oid(labels), properties)) +-- It generates: +-- graphid_to_agtype(id) +-- +-- And for properties: +-- age_properties(_agtype_build_vertex(...)) +-- It generates: +-- properties (direct column access) +-- + +-- Create test data +SELECT * FROM cypher('unified_test', $$ + CREATE (:OptimizeTest {val: 1}), + (:OptimizeTest {val: 2}), + (:OptimizeTest {val: 3}) +$$) AS (v agtype); + +-- Test that id() works correctly with optimization +SELECT * FROM cypher('unified_test', $$ + MATCH (n:OptimizeTest) + RETURN id(n), n.val + ORDER BY n.val +$$) AS (id agtype, val agtype); + +-- Test that properties() works correctly with optimization +SELECT * FROM cypher('unified_test', $$ + MATCH (n:OptimizeTest) + RETURN properties(n), n.val + ORDER BY n.val +$$) AS (props agtype, val agtype); + +-- Test id() in WHERE clause (common optimization target) +SELECT * FROM cypher('unified_test', $$ + MATCH (n:OptimizeTest) + WHERE id(n) % 10 = 0 + RETURN n.val +$$) AS (val agtype); + +-- Test properties() access in expressions +SELECT * FROM cypher('unified_test', $$ + MATCH (n:OptimizeTest) + WHERE properties(n).val > 1 + RETURN n.val + ORDER BY n.val +$$) AS (val agtype); + +-- Test edge id/properties optimization +SELECT * FROM cypher('unified_test', $$ + CREATE (:OptStart {x: 1})-[:OPT_EDGE {weight: 10}]->(:OptEnd {y: 2}) +$$) AS (v agtype); + +SELECT * FROM cypher('unified_test', $$ + MATCH (a)-[e:OPT_EDGE]->(b) + RETURN id(e), properties(e), start_id(e), end_id(e) +$$) AS (eid agtype, props agtype, sid agtype, eid2 agtype); + -- -- Cleanup -- diff --git a/src/backend/parser/cypher_expr.c b/src/backend/parser/cypher_expr.c index 5f4de86b9..cc6bf52e6 100644 --- a/src/backend/parser/cypher_expr.c +++ b/src/backend/parser/cypher_expr.c @@ -116,6 +116,7 @@ static bool function_exists(char *funcname, char *extension); static Node *coerce_expr_flexible(ParseState *pstate, Node *expr, Oid source_oid, Oid target_oid, int32 t_typemod, bool error_out); +static Node *optimize_vertex_field_access(Node *node); /* transform a cypher expression */ Node *transform_cypher_expr(cypher_parsestate *cpstate, Node *expr, @@ -2082,6 +2083,14 @@ static Node *transform_FuncCall(cypher_parsestate *cpstate, FuncCall *fn) retval = ParseFuncOrColumn(pstate, fname, targs, last_srf, fn, false, fn->location); + /* + * Optimize vertex field access patterns. This detects cases like: + * age_id(_agtype_build_vertex(id, label, props)) + * and optimizes them to directly use the underlying column, avoiding + * the expensive reconstruction of the vertex agtype just to extract a field. + */ + retval = optimize_vertex_field_access(retval); + /* flag that an aggregate was found during a transform */ if (retval != NULL && retval->type == T_Aggref) { @@ -2407,3 +2416,186 @@ static Node *transform_SubLink(cypher_parsestate *cpstate, SubLink *sublink) return result; } +/* + * optimize_vertex_field_access + * + * This function optimizes patterns where we're extracting fields from + * a vertex that was just built from its underlying columns. The most + * common case is: + * + * age_id(_agtype_build_vertex(id, label_name, properties)) + * + * Which can be optimized to just use 'id' directly (cast to agtype). + * + * Similar optimizations apply to: + * - age_properties(_agtype_build_vertex(...)) -> properties + * - age_label(_agtype_build_vertex(...)) -> label_name (needs cast) + * + * The same optimizations apply to edges with _agtype_build_edge: + * - age_id(_agtype_build_edge(id, startid, endid, label, props)) -> id + * - age_start_id(_agtype_build_edge(...)) -> startid + * - age_end_id(_agtype_build_edge(...)) -> endid + * - age_properties(_agtype_build_edge(...)) -> props + * - age_label(_agtype_build_edge(...)) -> label + */ +static Node *optimize_vertex_field_access(Node *node) +{ + FuncExpr *outer_func; + FuncExpr *inner_func; + char *outer_func_name; + char *inner_func_name; + Node *arg; + List *inner_args; + int arg_index = -1; + Oid result_type; + bool needs_agtype_cast = false; + + /* Only optimize FuncExpr nodes */ + if (node == NULL || !IsA(node, FuncExpr)) + { + return node; + } + + outer_func = (FuncExpr *)node; + + /* Must have exactly one argument */ + if (list_length(outer_func->args) != 1) + { + return node; + } + + /* Get the function name */ + outer_func_name = get_func_name(outer_func->funcid); + if (outer_func_name == NULL) + { + return node; + } + + /* Check if this is an accessor function we can optimize */ + arg = (Node *)linitial(outer_func->args); + + /* The argument must be a FuncExpr (the build function) */ + if (!IsA(arg, FuncExpr)) + { + return node; + } + + inner_func = (FuncExpr *)arg; + inner_func_name = get_func_name(inner_func->funcid); + + if (inner_func_name == NULL) + { + return node; + } + + inner_args = inner_func->args; + + /* + * Check for _agtype_build_vertex(id, label_name, properties) + * Arguments: 0=id (graphid), 1=label_name (cstring), 2=properties (agtype) + * + * Note: We don't optimize age_label() because the label_name is a cstring + * from _label_name_from_table_oid() and converting it properly to agtype + * string is non-trivial. The id and properties optimizations are the most + * impactful for performance anyway. + */ + if (strcmp(inner_func_name, "_agtype_build_vertex") == 0 && + list_length(inner_args) == 3) + { + if (strcmp(outer_func_name, "age_id") == 0) + { + /* Extract id (arg 0), needs cast from graphid to agtype */ + arg_index = 0; + result_type = GRAPHIDOID; + needs_agtype_cast = true; + } + else if (strcmp(outer_func_name, "age_properties") == 0) + { + /* Extract properties (arg 2), already agtype */ + arg_index = 2; + result_type = AGTYPEOID; + needs_agtype_cast = false; + } + /* age_label() is intentionally not optimized - cstring conversion is complex */ + } + /* + * Check for _agtype_build_edge(id, startid, endid, label_name, properties) + * Arguments: 0=id (graphid), 1=start_id (graphid), 2=end_id (graphid), + * 3=label_name (cstring), 4=properties (agtype) + * + * Note: Same as vertex, age_label() is not optimized for edges. + */ + else if (strcmp(inner_func_name, "_agtype_build_edge") == 0 && + list_length(inner_args) == 5) + { + if (strcmp(outer_func_name, "age_id") == 0) + { + /* Extract id (arg 0), needs cast from graphid to agtype */ + arg_index = 0; + result_type = GRAPHIDOID; + needs_agtype_cast = true; + } + else if (strcmp(outer_func_name, "age_start_id") == 0) + { + /* Extract start_id (arg 1), needs cast from graphid to agtype */ + arg_index = 1; + result_type = GRAPHIDOID; + needs_agtype_cast = true; + } + else if (strcmp(outer_func_name, "age_end_id") == 0) + { + /* Extract end_id (arg 2), needs cast from graphid to agtype */ + arg_index = 2; + result_type = GRAPHIDOID; + needs_agtype_cast = true; + } + /* age_label() is intentionally not optimized - cstring conversion is complex */ + else if (strcmp(outer_func_name, "age_properties") == 0) + { + /* Extract properties (arg 4), already agtype */ + arg_index = 4; + result_type = AGTYPEOID; + needs_agtype_cast = false; + } + } + + /* If we found a pattern to optimize */ + if (arg_index >= 0) + { + Node *extracted_arg = (Node *)list_nth(inner_args, arg_index); + + if (needs_agtype_cast) + { + /* + * For graphid: use graphid_to_agtype() function + * Currently only graphid needs casting - cstring (for labels) + * is intentionally not optimized. + */ + if (result_type == GRAPHIDOID) + { + Oid cast_func_oid; + FuncExpr *cast_expr; + + /* Get the graphid_to_agtype function OID */ + cast_func_oid = get_ag_func_oid("graphid_to_agtype", 1, + GRAPHIDOID); + + cast_expr = makeFuncExpr(cast_func_oid, AGTYPEOID, + list_make1(extracted_arg), + InvalidOid, InvalidOid, + COERCE_EXPLICIT_CALL); + cast_expr->location = outer_func->location; + + return (Node *)cast_expr; + } + } + else + { + /* For properties, just return the extracted argument directly */ + return extracted_arg; + } + } + + /* No optimization possible */ + return node; +} \ No newline at end of file