From ec778a7084a5e567265b361aaf729b89abf7d502 Mon Sep 17 00:00:00 2001 From: John Gemignani Date: Tue, 23 Dec 2025 13:41:44 -0800 Subject: [PATCH] Add GROUP BY and ORDER BY optimization NOTE: This PR was created with AI tools and a human. Add GROUP BY and ORDER BY optimization for vertex/edge field access. Transform expressions like age_id(_agtype_build_vertex(id, label, props)) into graphid_to_agtype(id) in GROUP BY and ORDER BY clauses, avoiding unnecessary vertex/edge reconstruction when only the ID is needed. Implementation: - Add optimize_sortgroupby_vertex_access() in cypher_clause.c - Walk target entries with non-zero ressortgroupref (GROUP BY/ORDER BY refs) - Detect outer accessor functions: age_id, age_start_id, age_end_id, age_properties - Match inner build functions: _agtype_build_vertex, _agtype_build_edge - Extract the relevant field directly and wrap graphid fields with graphid_to_agtype() (properties are already agtype) - Add resjunk target entries to subquery for direct field access Supported patterns: - GROUP BY id(v) -> Group Key: graphid_to_agtype(v.id) - GROUP BY start_id(e) -> Group Key: graphid_to_agtype(e.start_id) - GROUP BY end_id(e) -> Group Key: graphid_to_agtype(e.end_id) - ORDER BY id(v) -> Sort Key: graphid_to_agtype(v.id) - ORDER BY start_id(e) -> Sort Key: graphid_to_agtype(e.start_id) - ORDER BY end_id(e) -> Sort Key: graphid_to_agtype(e.end_id) - Combined ORDER BY + GROUP BY This complements existing optimizations: - cypher_expr.c: optimize_vertex_field_access() for direct FuncExpr patterns - cypher_clause.c: optimize_qual_expr_mutator() for WHERE/join conditions Existing regression tests were not affected, except that the Test 29 EXPLAIN checks were rewritten to use a plan_has_join_optimization() helper instead of comparing raw plan output. Added additional regression tests (Tests 30 and 31). 
modified: regress/expected/unified_vertex_table.out modified: regress/sql/unified_vertex_table.sql modified: src/backend/parser/cypher_clause.c --- regress/expected/unified_vertex_table.out | 406 +++++++++++++++++++--- regress/sql/unified_vertex_table.sql | 271 ++++++++++++++- src/backend/parser/cypher_clause.c | 273 +++++++++++++++ 3 files changed, 883 insertions(+), 67 deletions(-) diff --git a/regress/expected/unified_vertex_table.out b/regress/expected/unified_vertex_table.out index 81655bcaa..119d43564 100644 --- a/regress/expected/unified_vertex_table.out +++ b/regress/expected/unified_vertex_table.out @@ -1320,6 +1320,37 @@ $$) AS (eid agtype, props agtype, sid agtype, eid2 agtype); -- -- This avoids expensive vertex reconstruction in join conditions. -- +-- Helper function to check if join condition optimization is applied. +-- Returns true if the plan uses direct column references (e.g., u.id) +-- and NOT _agtype_build_vertex +CREATE OR REPLACE FUNCTION plan_has_join_optimization(sql text) +RETURNS boolean +LANGUAGE plpgsql AS +$$ +DECLARE + plan_row RECORD; + full_plan text := ''; + has_direct_id boolean; + has_build_vertex boolean; +BEGIN + -- Concatenate all rows of the EXPLAIN output + FOR plan_row IN EXECUTE format('EXPLAIN (FORMAT TEXT) %s', sql) + LOOP + full_plan := full_plan || plan_row."QUERY PLAN" || ' '; + END LOOP; + + -- Check for direct id references in join conditions (e.g., u.id, e.start_id) + has_direct_id := position('.id' in full_plan) > 0 OR + position('start_id' in full_plan) > 0 OR + position('end_id' in full_plan) > 0; + + -- Check for unoptimized pattern + has_build_vertex := position('_agtype_build_vertex' in full_plan) > 0; + + -- Optimization is applied if we see direct id references and no build_vertex + RETURN has_direct_id AND NOT has_build_vertex; +END; +$$; -- Create test data: Users following each other SELECT * FROM cypher('unified_test', $$ CREATE (:JoinOptUser {name: 'Alice'}), @@ -1346,28 +1377,14 @@ $$) AS (e agtype); 
--- (0 rows) --- EXPLAIN showing join conditions use direct column access --- Look for: graphid_to_agtype(id) instead of age_id(_agtype_build_vertex(...)) --- And: direct id comparisons instead of age_id(...)::graphid -EXPLAIN (COSTS OFF) -SELECT * FROM cypher('unified_test', $$ - MATCH (u:JoinOptUser)-[e:JOPT_FOLLOWS]->(v:JoinOptUser) - RETURN u.name, v.name -$$) AS (u_name agtype, v_name agtype); - QUERY PLAN ------------------------------------------------------------- - Nested Loop - Join Filter: (e.start_id = u.id) - -> Nested Loop - -> Seq Scan on _ag_label_vertex u - Filter: (labels = '23814'::oid) - -> Seq Scan on _ag_label_vertex v - Filter: (labels = '23814'::oid) - -> Bitmap Heap Scan on "JOPT_FOLLOWS" e - Recheck Cond: (end_id = v.id) - -> Bitmap Index Scan on "JOPT_FOLLOWS_end_id_idx" - Index Cond: (end_id = v.id) -(11 rows) +-- Test 29a: Simple join - check for direct column access optimization +SELECT plan_has_join_optimization( + 'SELECT * FROM cypher(''unified_test'', $$ MATCH (u:JoinOptUser)-[e:JOPT_FOLLOWS]->(v:JoinOptUser) RETURN u.name, v.name $$) AS (u_name agtype, v_name agtype)' +) AS simple_join_optimized; + simple_join_optimized +----------------------- + t +(1 row) -- Verify the query still returns correct results SELECT * FROM cypher('unified_test', $$ @@ -1381,36 +1398,14 @@ $$) AS (u_name agtype, v_name agtype); "Bob" | "Carol" (2 rows) --- Multi-hop pattern showing optimization across multiple joins -EXPLAIN (COSTS OFF) -SELECT * FROM cypher('unified_test', $$ - MATCH (a:JoinOptUser)-[e1:JOPT_FOLLOWS]->(b:JoinOptUser)-[e2:JOPT_FOLLOWS]->(c:JoinOptUser) - RETURN a.name, b.name, c.name -$$) AS (a_name agtype, b_name agtype, c_name agtype); - QUERY PLAN ------------------------------------------------------------------------- - Nested Loop - Join Filter: (e1.start_id = a.id) - -> Nested Loop - Join Filter: _ag_enforce_edge_uniqueness2(e1.id, e2.id) - -> Nested Loop - Join Filter: (e2.start_id = b.id) - -> Nested Loop - -> Seq Scan on 
_ag_label_vertex b - Filter: (labels = '23814'::oid) - -> Seq Scan on _ag_label_vertex c - Filter: (labels = '23814'::oid) - -> Bitmap Heap Scan on "JOPT_FOLLOWS" e2 - Recheck Cond: (end_id = c.id) - -> Bitmap Index Scan on "JOPT_FOLLOWS_end_id_idx" - Index Cond: (end_id = c.id) - -> Bitmap Heap Scan on "JOPT_FOLLOWS" e1 - Recheck Cond: (end_id = b.id) - -> Bitmap Index Scan on "JOPT_FOLLOWS_end_id_idx" - Index Cond: (end_id = b.id) - -> Seq Scan on _ag_label_vertex a - Filter: (labels = '23814'::oid) -(21 rows) +-- Test 29b: Multi-hop pattern - check for optimization across multiple joins +SELECT plan_has_join_optimization( + 'SELECT * FROM cypher(''unified_test'', $$ MATCH (a:JoinOptUser)-[e1:JOPT_FOLLOWS]->(b:JoinOptUser)-[e2:JOPT_FOLLOWS]->(c:JoinOptUser) RETURN a.name, b.name, c.name $$) AS (a_name agtype, b_name agtype, c_name agtype)' +) AS multihop_join_optimized; + multihop_join_optimized +------------------------- + t +(1 row) -- Verify multi-hop query results SELECT * FROM cypher('unified_test', $$ @@ -1422,11 +1417,313 @@ $$) AS (a_name agtype, b_name agtype, c_name agtype); "Alice" | "Bob" | "Carol" (1 row) +-- Clean up Test 29 helper function +DROP FUNCTION plan_has_join_optimization(text); +-- +-- Test 30: Verify GROUP BY optimization with EXPLAIN +-- +-- When using aggregation with id(vertex) or id(edge) in the GROUP BY key, +-- the optimization should replace patterns like: +-- age_id(_agtype_build_vertex(u.id, ...)) +-- with direct column access: +-- graphid_to_agtype(u.id) +-- +-- This avoids expensive vertex/edge reconstruction during grouping. +-- +-- Helper function to check if GROUP BY optimization is applied. 
+-- Returns true if the plan uses graphid_to_agtype (optimized) +-- and NOT _agtype_build_vertex (unoptimized) +CREATE OR REPLACE FUNCTION plan_has_groupby_optimization(sql text) +RETURNS boolean +LANGUAGE plpgsql AS +$$ +DECLARE + plan_row RECORD; + full_plan text := ''; + has_graphid_to_agtype boolean; + has_build_vertex boolean; + has_build_edge boolean; +BEGIN + -- Concatenate all rows of the EXPLAIN output + FOR plan_row IN EXECUTE format('EXPLAIN (FORMAT TEXT) %s', sql) + LOOP + full_plan := full_plan || plan_row."QUERY PLAN" || ' '; + END LOOP; + + -- Check for optimized pattern + has_graphid_to_agtype := position('graphid_to_agtype' in full_plan) > 0; + + -- Check for unoptimized patterns + has_build_vertex := position('_agtype_build_vertex' in full_plan) > 0; + has_build_edge := position('_agtype_build_edge' in full_plan) > 0; + + -- Optimization is applied if we see graphid_to_agtype + -- and don't see _agtype_build in the Group Key + RETURN has_graphid_to_agtype AND NOT has_build_vertex AND NOT has_build_edge; +END; +$$; +-- Create test data: Users and Books with interactions +SELECT * FROM cypher('unified_test', $$ + CREATE (:GrpOptUser {name: 'Alice'}), + (:GrpOptUser {name: 'Bob'}), + (:GrpOptUser {name: 'Carol'}), + (:GrpOptBook {title: 'Book1'}), + (:GrpOptBook {title: 'Book2'}) +$$) AS (v agtype); + v +--- +(0 rows) + +SELECT * FROM cypher('unified_test', $$ + MATCH (a:GrpOptUser {name: 'Alice'}), (b:GrpOptBook {title: 'Book1'}) + CREATE (a)-[:GRPOPT_READ]->(b) +$$) AS (e agtype); + e +--- +(0 rows) + +SELECT * FROM cypher('unified_test', $$ + MATCH (a:GrpOptUser {name: 'Alice'}), (b:GrpOptBook {title: 'Book2'}) + CREATE (a)-[:GRPOPT_READ]->(b) +$$) AS (e agtype); + e +--- +(0 rows) + +SELECT * FROM cypher('unified_test', $$ + MATCH (a:GrpOptUser {name: 'Bob'}), (b:GrpOptBook {title: 'Book1'}) + CREATE (a)-[:GRPOPT_READ]->(b) +$$) AS (e agtype); + e +--- +(0 rows) + +-- Test 30a: Simple vertex id() GROUP BY optimization +-- Checks that plan uses 
graphid_to_agtype(u.id) instead of age_id(_agtype_build_vertex(...)) +SELECT plan_has_groupby_optimization( + 'SELECT * FROM cypher(''unified_test'', $$ MATCH (u:GrpOptUser) RETURN id(u), count(*) $$) AS (user_id agtype, cnt agtype)' +) AS vertex_id_optimized; + vertex_id_optimized +--------------------- + t +(1 row) + +-- Verify correct results +SELECT * FROM cypher('unified_test', $$ + MATCH (u:GrpOptUser) + RETURN id(u), count(*) AS cnt + ORDER BY cnt DESC +$$) AS (user_id agtype, cnt agtype); + user_id | cnt +-------------------+----- + 12666373951979521 | 1 + 12666373951979522 | 1 + 12666373951979523 | 1 +(3 rows) + +-- Test 30b: Multiple vertex id() GROUP BY keys optimization +-- Checks that plan uses graphid_to_agtype for both u.id and b.id +SELECT plan_has_groupby_optimization( + 'SELECT * FROM cypher(''unified_test'', $$ MATCH (u:GrpOptUser)-[e:GRPOPT_READ]->(b:GrpOptBook) RETURN id(u), id(b), count(*) $$) AS (user_id agtype, book_id agtype, cnt agtype)' +) AS multi_vertex_id_optimized; + multi_vertex_id_optimized +--------------------------- + t +(1 row) + +-- Verify correct results +SELECT * FROM cypher('unified_test', $$ + MATCH (u:GrpOptUser)-[e:GRPOPT_READ]->(b:GrpOptBook) + RETURN id(u), id(b), count(*) AS cnt + ORDER BY cnt DESC +$$) AS (user_id agtype, book_id agtype, cnt agtype); + user_id | book_id | cnt +-------------------+-------------------+----- + 12666373951979521 | 12947848928690177 | 1 + 12666373951979521 | 12947848928690178 | 1 + 12666373951979522 | 12947848928690177 | 1 +(3 rows) + +-- Test 30c: Edge id() GROUP BY optimization +-- Checks that plan uses graphid_to_agtype(e.id) +SELECT plan_has_groupby_optimization( + 'SELECT * FROM cypher(''unified_test'', $$ MATCH (u:GrpOptUser)-[e:GRPOPT_READ]->(b:GrpOptBook) RETURN id(e), count(*) $$) AS (edge_id agtype, cnt agtype)' +) AS edge_id_optimized; + edge_id_optimized +------------------- + t +(1 row) + +-- Verify correct results +SELECT * FROM cypher('unified_test', $$ + MATCH 
(u:GrpOptUser)-[e:GRPOPT_READ]->(b:GrpOptBook) + RETURN id(e), count(*) AS cnt + ORDER BY cnt DESC +$$) AS (edge_id agtype, cnt agtype); + edge_id | cnt +-------------------+----- + 13229323905400833 | 1 + 13229323905400834 | 1 + 13229323905400835 | 1 +(3 rows) + +-- Test 30d: Edge start_id() and end_id() GROUP BY optimization +-- Checks that plan uses graphid_to_agtype(e.start_id) and graphid_to_agtype(e.end_id) +SELECT plan_has_groupby_optimization( + 'SELECT * FROM cypher(''unified_test'', $$ MATCH (u:GrpOptUser)-[e:GRPOPT_READ]->(b:GrpOptBook) RETURN start_id(e), end_id(e), count(*) $$) AS (start_id agtype, end_id agtype, cnt agtype)' +) AS edge_start_end_id_optimized; + edge_start_end_id_optimized +----------------------------- + t +(1 row) + +-- Verify correct results +SELECT * FROM cypher('unified_test', $$ + MATCH (u:GrpOptUser)-[e:GRPOPT_READ]->(b:GrpOptBook) + RETURN start_id(e), end_id(e), count(*) AS cnt + ORDER BY cnt DESC +$$) AS (start_id agtype, end_id agtype, cnt agtype); + start_id | end_id | cnt +-------------------+-------------------+----- + 12666373951979521 | 12947848928690177 | 1 + 12666373951979521 | 12947848928690178 | 1 + 12666373951979522 | 12947848928690177 | 1 +(3 rows) + +-- Cleanup the GROUP BY helper function +DROP FUNCTION plan_has_groupby_optimization(text); +-- +-- Test 31: ORDER BY optimization tests +-- Test that ORDER BY id(v) uses graphid_to_agtype instead of rebuilding vertex +-- +-- Create a helper function to check for ORDER BY optimization +CREATE OR REPLACE FUNCTION plan_has_orderby_optimization(query_text text) +RETURNS boolean AS $$ +DECLARE + plan_row record; + full_plan text := ''; + has_graphid_to_agtype boolean; + has_build_vertex boolean; + has_build_edge boolean; +BEGIN + -- Load AGE extension and set search path for the session + EXECUTE 'LOAD ''age'''; + EXECUTE 'SET search_path = ag_catalog, public'; + + -- Get the query plan + FOR plan_row IN EXECUTE 'EXPLAIN (COSTS OFF) ' || query_text + LOOP + full_plan := 
full_plan || plan_row."QUERY PLAN" || ' '; + END LOOP; + + -- Check for optimized pattern + has_graphid_to_agtype := position('graphid_to_agtype' in full_plan) > 0; + + -- Check for unoptimized patterns + has_build_vertex := position('_agtype_build_vertex' in full_plan) > 0; + has_build_edge := position('_agtype_build_edge' in full_plan) > 0; + + -- Optimization is applied if we see graphid_to_agtype + -- and don't see _agtype_build in the Sort Key + RETURN has_graphid_to_agtype AND NOT has_build_vertex AND NOT has_build_edge; +END; +$$ +LANGUAGE plpgsql; +-- Test 31a: Simple vertex id() ORDER BY optimization +-- Checks that Sort Key uses graphid_to_agtype(u.id) instead of age_id(_agtype_build_vertex(...)) +SELECT plan_has_orderby_optimization( + 'SELECT * FROM cypher(''unified_test'', $$ MATCH (u:GrpOptUser) RETURN id(u) ORDER BY id(u) $$) AS (user_id agtype)' +) AS vertex_id_orderby_optimized; + vertex_id_orderby_optimized +----------------------------- + t +(1 row) + +-- Verify correct results +SELECT * FROM cypher('unified_test', $$ + MATCH (u:GrpOptUser) + RETURN id(u) ORDER BY id(u) +$$) AS (user_id agtype); + user_id +------------------- + 12666373951979521 + 12666373951979522 + 12666373951979523 +(3 rows) + +-- Test 31b: ORDER BY with GROUP BY (both should be optimized) +-- Checks that both Sort Key and Group Key use graphid_to_agtype +SELECT plan_has_orderby_optimization( + 'SELECT * FROM cypher(''unified_test'', $$ MATCH (u:GrpOptUser) RETURN id(u), count(*) ORDER BY id(u) $$) AS (user_id agtype, cnt agtype)' +) AS orderby_with_groupby_optimized; + orderby_with_groupby_optimized +-------------------------------- + t +(1 row) + +-- Verify correct results +SELECT * FROM cypher('unified_test', $$ + MATCH (u:GrpOptUser) + RETURN id(u), count(*) AS cnt + ORDER BY id(u) +$$) AS (user_id agtype, cnt agtype); + user_id | cnt +-------------------+----- + 12666373951979521 | 1 + 12666373951979522 | 1 + 12666373951979523 | 1 +(3 rows) + +-- Test 31c: Edge id() ORDER 
BY optimization +-- Checks that Sort Key uses graphid_to_agtype(e.id) +SELECT plan_has_orderby_optimization( + 'SELECT * FROM cypher(''unified_test'', $$ MATCH (u:GrpOptUser)-[e:GRPOPT_READ]->(b:GrpOptBook) RETURN id(e) ORDER BY id(e) $$) AS (edge_id agtype)' +) AS edge_id_orderby_optimized; + edge_id_orderby_optimized +--------------------------- + t +(1 row) + +-- Verify correct results +SELECT * FROM cypher('unified_test', $$ + MATCH (u:GrpOptUser)-[e:GRPOPT_READ]->(b:GrpOptBook) + RETURN id(e) ORDER BY id(e) +$$) AS (edge_id agtype); + edge_id +------------------- + 13229323905400833 + 13229323905400834 + 13229323905400835 +(3 rows) + +-- Test 31d: Edge start_id() ORDER BY optimization +SELECT plan_has_orderby_optimization( + 'SELECT * FROM cypher(''unified_test'', $$ MATCH (u:GrpOptUser)-[e:GRPOPT_READ]->(b:GrpOptBook) RETURN start_id(e) ORDER BY start_id(e) $$) AS (start_id agtype)' +) AS edge_start_id_orderby_optimized; + edge_start_id_orderby_optimized +--------------------------------- + t +(1 row) + +-- Verify correct results +SELECT * FROM cypher('unified_test', $$ + MATCH (u:GrpOptUser)-[e:GRPOPT_READ]->(b:GrpOptBook) + RETURN start_id(e) ORDER BY start_id(e) +$$) AS (start_id agtype); + start_id +------------------- + 12666373951979521 + 12666373951979521 + 12666373951979522 +(3 rows) + +-- Cleanup the ORDER BY helper function +DROP FUNCTION plan_has_orderby_optimization(text); -- -- Cleanup -- SELECT drop_graph('unified_test', true); -NOTICE: drop cascades to 44 other objects +NOTICE: drop cascades to 47 other objects DETAIL: drop cascades to table unified_test._ag_label_vertex drop cascades to table unified_test._ag_label_edge drop cascades to table unified_test."Person" @@ -1471,6 +1768,9 @@ drop cascades to table unified_test."OPT_EDGE" drop cascades to table unified_test."OptEnd" drop cascades to table unified_test."JoinOptUser" drop cascades to table unified_test."JOPT_FOLLOWS" +drop cascades to table unified_test."GrpOptUser" +drop cascades to 
table unified_test."GrpOptBook" +drop cascades to table unified_test."GRPOPT_READ" NOTICE: graph "unified_test" has been dropped drop_graph ------------ diff --git a/regress/sql/unified_vertex_table.sql b/regress/sql/unified_vertex_table.sql index 605fd176d..436d2a1a9 100644 --- a/regress/sql/unified_vertex_table.sql +++ b/regress/sql/unified_vertex_table.sql @@ -821,6 +821,38 @@ $$) AS (eid agtype, props agtype, sid agtype, eid2 agtype); -- This avoids expensive vertex reconstruction in join conditions. -- +-- Helper function to check if join condition optimization is applied. +-- Returns true if the plan uses direct column references (e.g., u.id) +-- and NOT _agtype_build_vertex +CREATE OR REPLACE FUNCTION plan_has_join_optimization(sql text) +RETURNS boolean +LANGUAGE plpgsql AS +$$ +DECLARE + plan_row RECORD; + full_plan text := ''; + has_direct_id boolean; + has_build_vertex boolean; +BEGIN + -- Concatenate all rows of the EXPLAIN output + FOR plan_row IN EXECUTE format('EXPLAIN (FORMAT TEXT) %s', sql) + LOOP + full_plan := full_plan || plan_row."QUERY PLAN" || ' '; + END LOOP; + + -- Check for direct id references in join conditions (e.g., u.id, e.start_id) + has_direct_id := position('.id' in full_plan) > 0 OR + position('start_id' in full_plan) > 0 OR + position('end_id' in full_plan) > 0; + + -- Check for unoptimized pattern + has_build_vertex := position('_agtype_build_vertex' in full_plan) > 0; + + -- Optimization is applied if we see direct id references and no build_vertex + RETURN has_direct_id AND NOT has_build_vertex; +END; +$$; + -- Create test data: Users following each other SELECT * FROM cypher('unified_test', $$ CREATE (:JoinOptUser {name: 'Alice'}), @@ -838,14 +870,10 @@ SELECT * FROM cypher('unified_test', $$ CREATE (b)-[:JOPT_FOLLOWS]->(c) $$) AS (e agtype); --- EXPLAIN showing join conditions use direct column access --- Look for: graphid_to_agtype(id) instead of age_id(_agtype_build_vertex(...)) --- And: direct id comparisons instead of 
age_id(...)::graphid -EXPLAIN (COSTS OFF) -SELECT * FROM cypher('unified_test', $$ - MATCH (u:JoinOptUser)-[e:JOPT_FOLLOWS]->(v:JoinOptUser) - RETURN u.name, v.name -$$) AS (u_name agtype, v_name agtype); +-- Test 29a: Simple join - check for direct column access optimization +SELECT plan_has_join_optimization( + 'SELECT * FROM cypher(''unified_test'', $$ MATCH (u:JoinOptUser)-[e:JOPT_FOLLOWS]->(v:JoinOptUser) RETURN u.name, v.name $$) AS (u_name agtype, v_name agtype)' +) AS simple_join_optimized; -- Verify the query still returns correct results SELECT * FROM cypher('unified_test', $$ @@ -854,12 +882,10 @@ SELECT * FROM cypher('unified_test', $$ ORDER BY u.name $$) AS (u_name agtype, v_name agtype); --- Multi-hop pattern showing optimization across multiple joins -EXPLAIN (COSTS OFF) -SELECT * FROM cypher('unified_test', $$ - MATCH (a:JoinOptUser)-[e1:JOPT_FOLLOWS]->(b:JoinOptUser)-[e2:JOPT_FOLLOWS]->(c:JoinOptUser) - RETURN a.name, b.name, c.name -$$) AS (a_name agtype, b_name agtype, c_name agtype); +-- Test 29b: Multi-hop pattern - check for optimization across multiple joins +SELECT plan_has_join_optimization( + 'SELECT * FROM cypher(''unified_test'', $$ MATCH (a:JoinOptUser)-[e1:JOPT_FOLLOWS]->(b:JoinOptUser)-[e2:JOPT_FOLLOWS]->(c:JoinOptUser) RETURN a.name, b.name, c.name $$) AS (a_name agtype, b_name agtype, c_name agtype)' +) AS multihop_join_optimized; -- Verify multi-hop query results SELECT * FROM cypher('unified_test', $$ @@ -867,6 +893,223 @@ SELECT * FROM cypher('unified_test', $$ RETURN a.name, b.name, c.name $$) AS (a_name agtype, b_name agtype, c_name agtype); +-- Clean up Test 29 helper function +DROP FUNCTION plan_has_join_optimization(text); + +-- +-- Test 30: Verify GROUP BY optimization with EXPLAIN +-- +-- When using aggregation with id(vertex) or id(edge) in the GROUP BY key, +-- the optimization should replace patterns like: +-- age_id(_agtype_build_vertex(u.id, ...)) +-- with direct column access: +-- graphid_to_agtype(u.id) +-- +-- This 
avoids expensive vertex/edge reconstruction during grouping. +-- + +-- Helper function to check if GROUP BY optimization is applied. +-- Returns true if the plan uses graphid_to_agtype (optimized) +-- and NOT _agtype_build_vertex (unoptimized) +CREATE OR REPLACE FUNCTION plan_has_groupby_optimization(sql text) +RETURNS boolean +LANGUAGE plpgsql AS +$$ +DECLARE + plan_row RECORD; + full_plan text := ''; + has_graphid_to_agtype boolean; + has_build_vertex boolean; + has_build_edge boolean; +BEGIN + -- Concatenate all rows of the EXPLAIN output + FOR plan_row IN EXECUTE format('EXPLAIN (FORMAT TEXT) %s', sql) + LOOP + full_plan := full_plan || plan_row."QUERY PLAN" || ' '; + END LOOP; + + -- Check for optimized pattern + has_graphid_to_agtype := position('graphid_to_agtype' in full_plan) > 0; + + -- Check for unoptimized patterns + has_build_vertex := position('_agtype_build_vertex' in full_plan) > 0; + has_build_edge := position('_agtype_build_edge' in full_plan) > 0; + + -- Optimization is applied if we see graphid_to_agtype + -- and don't see _agtype_build in the Group Key + RETURN has_graphid_to_agtype AND NOT has_build_vertex AND NOT has_build_edge; +END; +$$; + +-- Create test data: Users and Books with interactions +SELECT * FROM cypher('unified_test', $$ + CREATE (:GrpOptUser {name: 'Alice'}), + (:GrpOptUser {name: 'Bob'}), + (:GrpOptUser {name: 'Carol'}), + (:GrpOptBook {title: 'Book1'}), + (:GrpOptBook {title: 'Book2'}) +$$) AS (v agtype); + +SELECT * FROM cypher('unified_test', $$ + MATCH (a:GrpOptUser {name: 'Alice'}), (b:GrpOptBook {title: 'Book1'}) + CREATE (a)-[:GRPOPT_READ]->(b) +$$) AS (e agtype); + +SELECT * FROM cypher('unified_test', $$ + MATCH (a:GrpOptUser {name: 'Alice'}), (b:GrpOptBook {title: 'Book2'}) + CREATE (a)-[:GRPOPT_READ]->(b) +$$) AS (e agtype); + +SELECT * FROM cypher('unified_test', $$ + MATCH (a:GrpOptUser {name: 'Bob'}), (b:GrpOptBook {title: 'Book1'}) + CREATE (a)-[:GRPOPT_READ]->(b) +$$) AS (e agtype); + +-- Test 30a: Simple 
vertex id() GROUP BY optimization +-- Checks that plan uses graphid_to_agtype(u.id) instead of age_id(_agtype_build_vertex(...)) +SELECT plan_has_groupby_optimization( + 'SELECT * FROM cypher(''unified_test'', $$ MATCH (u:GrpOptUser) RETURN id(u), count(*) $$) AS (user_id agtype, cnt agtype)' +) AS vertex_id_optimized; + +-- Verify correct results +SELECT * FROM cypher('unified_test', $$ + MATCH (u:GrpOptUser) + RETURN id(u), count(*) AS cnt + ORDER BY cnt DESC +$$) AS (user_id agtype, cnt agtype); + +-- Test 30b: Multiple vertex id() GROUP BY keys optimization +-- Checks that plan uses graphid_to_agtype for both u.id and b.id +SELECT plan_has_groupby_optimization( + 'SELECT * FROM cypher(''unified_test'', $$ MATCH (u:GrpOptUser)-[e:GRPOPT_READ]->(b:GrpOptBook) RETURN id(u), id(b), count(*) $$) AS (user_id agtype, book_id agtype, cnt agtype)' +) AS multi_vertex_id_optimized; + +-- Verify correct results +SELECT * FROM cypher('unified_test', $$ + MATCH (u:GrpOptUser)-[e:GRPOPT_READ]->(b:GrpOptBook) + RETURN id(u), id(b), count(*) AS cnt + ORDER BY cnt DESC +$$) AS (user_id agtype, book_id agtype, cnt agtype); + +-- Test 30c: Edge id() GROUP BY optimization +-- Checks that plan uses graphid_to_agtype(e.id) +SELECT plan_has_groupby_optimization( + 'SELECT * FROM cypher(''unified_test'', $$ MATCH (u:GrpOptUser)-[e:GRPOPT_READ]->(b:GrpOptBook) RETURN id(e), count(*) $$) AS (edge_id agtype, cnt agtype)' +) AS edge_id_optimized; + +-- Verify correct results +SELECT * FROM cypher('unified_test', $$ + MATCH (u:GrpOptUser)-[e:GRPOPT_READ]->(b:GrpOptBook) + RETURN id(e), count(*) AS cnt + ORDER BY cnt DESC +$$) AS (edge_id agtype, cnt agtype); + +-- Test 30d: Edge start_id() and end_id() GROUP BY optimization +-- Checks that plan uses graphid_to_agtype(e.start_id) and graphid_to_agtype(e.end_id) +SELECT plan_has_groupby_optimization( + 'SELECT * FROM cypher(''unified_test'', $$ MATCH (u:GrpOptUser)-[e:GRPOPT_READ]->(b:GrpOptBook) RETURN start_id(e), end_id(e), count(*) $$) AS 
(start_id agtype, end_id agtype, cnt agtype)' +) AS edge_start_end_id_optimized; + +-- Verify correct results +SELECT * FROM cypher('unified_test', $$ + MATCH (u:GrpOptUser)-[e:GRPOPT_READ]->(b:GrpOptBook) + RETURN start_id(e), end_id(e), count(*) AS cnt + ORDER BY cnt DESC +$$) AS (start_id agtype, end_id agtype, cnt agtype); + +-- Cleanup the GROUP BY helper function +DROP FUNCTION plan_has_groupby_optimization(text); + +-- +-- Test 31: ORDER BY optimization tests +-- Test that ORDER BY id(v) uses graphid_to_agtype instead of rebuilding vertex +-- + +-- Create a helper function to check for ORDER BY optimization +CREATE OR REPLACE FUNCTION plan_has_orderby_optimization(query_text text) +RETURNS boolean AS $$ +DECLARE + plan_row record; + full_plan text := ''; + has_graphid_to_agtype boolean; + has_build_vertex boolean; + has_build_edge boolean; +BEGIN + -- Load AGE extension and set search path for the session + EXECUTE 'LOAD ''age'''; + EXECUTE 'SET search_path = ag_catalog, public'; + + -- Get the query plan + FOR plan_row IN EXECUTE 'EXPLAIN (COSTS OFF) ' || query_text + LOOP + full_plan := full_plan || plan_row."QUERY PLAN" || ' '; + END LOOP; + + -- Check for optimized pattern + has_graphid_to_agtype := position('graphid_to_agtype' in full_plan) > 0; + + -- Check for unoptimized patterns + has_build_vertex := position('_agtype_build_vertex' in full_plan) > 0; + has_build_edge := position('_agtype_build_edge' in full_plan) > 0; + + -- Optimization is applied if we see graphid_to_agtype + -- and don't see _agtype_build in the Sort Key + RETURN has_graphid_to_agtype AND NOT has_build_vertex AND NOT has_build_edge; +END; +$$ +LANGUAGE plpgsql; + +-- Test 31a: Simple vertex id() ORDER BY optimization +-- Checks that Sort Key uses graphid_to_agtype(u.id) instead of age_id(_agtype_build_vertex(...)) +SELECT plan_has_orderby_optimization( + 'SELECT * FROM cypher(''unified_test'', $$ MATCH (u:GrpOptUser) RETURN id(u) ORDER BY id(u) $$) AS (user_id agtype)' +) AS 
vertex_id_orderby_optimized; + +-- Verify correct results +SELECT * FROM cypher('unified_test', $$ + MATCH (u:GrpOptUser) + RETURN id(u) ORDER BY id(u) +$$) AS (user_id agtype); + +-- Test 31b: ORDER BY with GROUP BY (both should be optimized) +-- Checks that both Sort Key and Group Key use graphid_to_agtype +SELECT plan_has_orderby_optimization( + 'SELECT * FROM cypher(''unified_test'', $$ MATCH (u:GrpOptUser) RETURN id(u), count(*) ORDER BY id(u) $$) AS (user_id agtype, cnt agtype)' +) AS orderby_with_groupby_optimized; + +-- Verify correct results +SELECT * FROM cypher('unified_test', $$ + MATCH (u:GrpOptUser) + RETURN id(u), count(*) AS cnt + ORDER BY id(u) +$$) AS (user_id agtype, cnt agtype); + +-- Test 31c: Edge id() ORDER BY optimization +-- Checks that Sort Key uses graphid_to_agtype(e.id) +SELECT plan_has_orderby_optimization( + 'SELECT * FROM cypher(''unified_test'', $$ MATCH (u:GrpOptUser)-[e:GRPOPT_READ]->(b:GrpOptBook) RETURN id(e) ORDER BY id(e) $$) AS (edge_id agtype)' +) AS edge_id_orderby_optimized; + +-- Verify correct results +SELECT * FROM cypher('unified_test', $$ + MATCH (u:GrpOptUser)-[e:GRPOPT_READ]->(b:GrpOptBook) + RETURN id(e) ORDER BY id(e) +$$) AS (edge_id agtype); + +-- Test 31d: Edge start_id() ORDER BY optimization +SELECT plan_has_orderby_optimization( + 'SELECT * FROM cypher(''unified_test'', $$ MATCH (u:GrpOptUser)-[e:GRPOPT_READ]->(b:GrpOptBook) RETURN start_id(e) ORDER BY start_id(e) $$) AS (start_id agtype)' +) AS edge_start_id_orderby_optimized; + +-- Verify correct results +SELECT * FROM cypher('unified_test', $$ + MATCH (u:GrpOptUser)-[e:GRPOPT_READ]->(b:GrpOptBook) + RETURN start_id(e) ORDER BY start_id(e) +$$) AS (start_id agtype); + +-- Cleanup the ORDER BY helper function +DROP FUNCTION plan_has_orderby_optimization(text); + -- -- Cleanup -- diff --git a/src/backend/parser/cypher_clause.c b/src/backend/parser/cypher_clause.c index e56ac8acb..5b5bbee96 100644 --- a/src/backend/parser/cypher_clause.c +++ 
b/src/backend/parser/cypher_clause.c @@ -322,6 +322,10 @@ static Index transform_group_clause_expr(List **flatresult, static List *add_target_to_group_list(cypher_parsestate *cpstate, TargetEntry *tle, List *grouplist, List *targetlist, int location); +static void optimize_sortgroupby_vertex_access(ParseState *pstate, + List *targetList, + List *sortClause, + List *groupClause); static void advance_transform_entities_to_next_clause(List *entities); static ParseNamespaceItem *get_namespace_item(ParseState *pstate, @@ -2281,6 +2285,265 @@ static List * transform_group_clause(cypher_parsestate *cpstate, return result; } +/* + * optimize_sortgroupby_vertex_access + * + * Optimizes GROUP BY and ORDER BY expressions that access vertex/edge fields + * through Var references to subqueries. When we have: + * + * MATCH (u:User) RETURN id(u), count(*) -- GROUP BY case + * MATCH (u:User) RETURN id(u) ORDER BY id(u) -- ORDER BY case + * + * The sort/group key is `age_id(Var)` where Var references the subquery's + * target entry containing `_agtype_build_vertex(id, label, props)`. + * + * This function walks the target list and for any expression matching + * this pattern, replaces it with the optimized form: + * graphid_to_agtype(id) + * + * This avoids reconstructing the entire vertex just to extract the id. 
+ */
+static void optimize_sortgroupby_vertex_access(ParseState *pstate,
+                                               List *targetList,
+                                               List *sortClause,
+                                               List *groupClause)
+{
+    ListCell *lc;
+
+    /* Only optimize if we have a GROUP BY or ORDER BY clause */
+    if (groupClause == NIL && sortClause == NIL)
+    {
+        return;
+    }
+
+    /* Walk through target entries involved in GROUP BY or ORDER BY */
+    foreach(lc, targetList)
+    {
+        TargetEntry *tle = (TargetEntry *)lfirst(lc);
+        FuncExpr *outer_func;
+        FuncExpr *inner_func;
+        Var *var;
+        Var *new_var;
+        RangeTblEntry *rte;
+        Query *subquery;
+        TargetEntry *sub_tle;
+        TargetEntry *new_sub_tle;
+        AttrNumber new_attno;
+        char *outer_func_name;
+        char *inner_func_name;
+        List *inner_args;
+        Node *inner_arg;
+        int arg_index = -1;
+        Oid result_type = InvalidOid;
+        bool needs_cast = false;
+
+        /* Skip if not part of GROUP BY or ORDER BY */
+        if (tle->ressortgroupref == 0)
+        {
+            continue;
+        }
+
+        /* Must be a FuncExpr (like age_id) */
+        if (tle->expr == NULL || !IsA(tle->expr, FuncExpr))
+        {
+            continue;
+        }
+
+        outer_func = (FuncExpr *)tle->expr;
+
+        /* Must have exactly one argument */
+        if (list_length(outer_func->args) != 1)
+        {
+            continue;
+        }
+
+        /* Get outer function name */
+        outer_func_name = get_func_name(outer_func->funcid);
+        if (outer_func_name == NULL)
+        {
+            continue;
+        }
+
+        /* Check if it's an accessor function we can optimize */
+        if (strcmp(outer_func_name, "age_id") != 0 &&
+            strcmp(outer_func_name, "age_start_id") != 0 &&
+            strcmp(outer_func_name, "age_end_id") != 0 &&
+            strcmp(outer_func_name, "age_properties") != 0)
+        {
+            continue;
+        }
+
+        /* The argument must be a Var */
+        if (!IsA(linitial(outer_func->args), Var))
+        {
+            continue;
+        }
+
+        var = (Var *)linitial(outer_func->args);
+
+        /*
+         * Only Vars of the current query level can be resolved against
+         * pstate->p_rtable. A Var with varlevelsup > 0 indexes the range
+         * table of an enclosing query, so looking its varno up here could
+         * select an unrelated RTE and append the junk column to the wrong
+         * subquery. Leave such expressions untouched.
+         */
+        if (var->varlevelsup != 0)
+        {
+            continue;
+        }
+
+        /* The Var must reference a subquery RTE */
+        if (var->varno < 1 ||
+            var->varno > list_length(pstate->p_rtable))
+        {
+            continue;
+        }
+
+        rte = rt_fetch(var->varno, pstate->p_rtable);
+        if (rte->rtekind != RTE_SUBQUERY || rte->subquery == NULL)
+        {
+            continue;
+        }
+
+        subquery = rte->subquery;
+
+        /* Get the target entry in the subquery that this Var references */
+        if (var->varattno < 1 ||
+            var->varattno > list_length(subquery->targetList))
+        {
+            continue;
+        }
+
+        sub_tle = (TargetEntry *)list_nth(subquery->targetList,
+                                          var->varattno - 1);
+
+        /* The subquery target must be a FuncExpr (the build function) */
+        if (sub_tle->expr == NULL || !IsA(sub_tle->expr, FuncExpr))
+        {
+            continue;
+        }
+
+        inner_func = (FuncExpr *)sub_tle->expr;
+        inner_func_name = get_func_name(inner_func->funcid);
+
+        if (inner_func_name == NULL)
+        {
+            continue;
+        }
+
+        inner_args = inner_func->args;
+
+        /*
+         * Check for _agtype_build_vertex(id, label_name, properties)
+         * Arguments: 0=id (graphid), 1=label_name (cstring), 2=properties (agtype)
+         */
+        if (strcmp(inner_func_name, "_agtype_build_vertex") == 0 &&
+            list_length(inner_args) == 3)
+        {
+            if (strcmp(outer_func_name, "age_id") == 0)
+            {
+                arg_index = 0;
+                result_type = GRAPHIDOID;
+                needs_cast = true;
+            }
+            else if (strcmp(outer_func_name, "age_properties") == 0)
+            {
+                arg_index = 2;
+                result_type = AGTYPEOID;
+                needs_cast = false;
+            }
+        }
+        /*
+         * Check for _agtype_build_edge(id, startid, endid, label_name, properties)
+         * Arguments: 0=id, 1=start_id, 2=end_id, 3=label_name, 4=properties
+         */
+        else if (strcmp(inner_func_name, "_agtype_build_edge") == 0 &&
+                 list_length(inner_args) == 5)
+        {
+            if (strcmp(outer_func_name, "age_id") == 0)
+            {
+                arg_index = 0;
+                result_type = GRAPHIDOID;
+                needs_cast = true;
+            }
+            else if (strcmp(outer_func_name, "age_start_id") == 0)
+            {
+                arg_index = 1;
+                result_type = GRAPHIDOID;
+                needs_cast = true;
+            }
+            else if (strcmp(outer_func_name, "age_end_id") == 0)
+            {
+                arg_index = 2;
+                result_type = GRAPHIDOID;
+                needs_cast = true;
+            }
+            else if (strcmp(outer_func_name, "age_properties") == 0)
+            {
+                arg_index = 4;
+                result_type = AGTYPEOID;
+                needs_cast = false;
+            }
+        }
+
+        /* No accessor/build-function pair we know how to optimize */
+        if (arg_index < 0)
+        {
+            continue;
+        }
+
+        inner_arg = (Node *)list_nth(inner_args, arg_index);
+
+        /*
+         * The inner argument is an expression in the subquery's context, so
+         * it cannot be used directly in the outer query. Expose it by
+         * appending a copy to the subquery as a new resjunk target entry,
+         * then reference that entry from the outer query through a new Var.
+         */
+        new_attno = list_length(subquery->targetList) + 1;
+        new_sub_tle = makeTargetEntry((Expr *)copyObject(inner_arg),
+                                      new_attno,
+                                      NULL, /* no name - internal */
+                                      true); /* resjunk */
+        subquery->targetList = lappend(subquery->targetList, new_sub_tle);
+
+        new_var = makeVar(var->varno, new_attno, result_type,
+                          -1, InvalidOid, var->varlevelsup);
+
+        if (needs_cast)
+        {
+            Oid cast_func_oid;
+            FuncExpr *cast_expr;
+
+            /*
+             * The exposed field is a graphid; wrap it in graphid_to_agtype
+             * so the target entry still yields agtype like the accessor
+             * call it replaces.
+             */
+            cast_func_oid = get_ag_func_oid("graphid_to_agtype", 1,
+                                            GRAPHIDOID);
+            cast_expr = makeFuncExpr(cast_func_oid, AGTYPEOID,
+                                     list_make1(new_var),
+                                     InvalidOid, InvalidOid,
+                                     COERCE_EXPLICIT_CALL);
+            cast_expr->location = outer_func->location;
+
+            tle->expr = (Expr *)cast_expr;
+        }
+        else
+        {
+            /* For properties, the exposed field is already agtype */
+            tle->expr = (Expr *)new_var;
+        }
+    }
+}
+
 static Query *transform_cypher_return(cypher_parsestate *cpstate,
                                       cypher_clause *clause)
 {
@@ -2315,6 +2578,16 @@ static Query
*transform_cypher_return(cypher_parsestate *cpstate, query->sortClause, EXPR_KIND_GROUP_BY); + /* + * Optimize GROUP BY and ORDER BY expressions that reference vertex/edge + * fields through subquery Vars. This allows: + * MATCH (u:User) RETURN id(u), count(*) -- GROUP BY + * MATCH (u:User) RETURN id(u) ORDER BY id(u) -- ORDER BY + * to use graphid_to_agtype(id) instead of rebuilding the entire vertex. + */ + optimize_sortgroupby_vertex_access(pstate, query->targetList, + query->sortClause, query->groupClause); + /* DISTINCT */ if (self->distinct) {