Add aggregation exception for left join constraint

claude · claude · commit c69b446f9f1f · 2025-12-25T02:25:24.000Z
Aggregation with keep_all_rows=True uses a left join internally but has
the opposite requirement (B → A) compared to direct left joins (A → B).
This is valid because the GROUP BY clause resets the primary key to PK(A),
ensuring non-NULL primary key values.

Changes:
- Add _aggregation parameter to heading.join() and expression.join()
- Aggregation.create() passes _aggregation=True to bypass validation
- Document aggregation exception in spec
- Add tests for aggregation with keep_all_rows=True

Co-authored-by: dimitri-yatsenko <dimitri@datajoint.com>
diff --git a/docs/src/design/semantic-matching-spec.md b/docs/src/design/semantic-matching-spec.md
@@ -341,6 +341,32 @@ The following attributes from the right operand's primary key are not determined
 the left operand: ['z']. Use an inner join or restructure the query.
 ```
 
+### Aggregation Exception
+
+`A.aggr(B, keep_all_rows=True)` uses a left join internally but has the **opposite requirement**: **B → A** (the group expression B must have all of A's primary key attributes).
+
+This apparent contradiction is resolved by the `GROUP BY` clause:
+
+1. Aggregation requires B → A so that B can be grouped by A's primary key
+2. The intermediate left join `A LEFT JOIN B` would have an invalid PK under the normal left join rules (in the B → A case the join's PK would be PK(B), whose attributes can be NULL for rows of A without a match)
+3. However, aggregation's `GROUP BY PK(A)` clause **resets** the primary key to PK(A)
+4. The final result has PK(A), which consists entirely of non-NULL values from A
+
+**Example:**
+```
+Session: session_id*, date
+Trial: session_id*, trial_num*, response_time    (references Session)
+
+# Aggregation with keep_all_rows=True
+Session.aggr(Trial, keep_all_rows=True, avg_rt='avg(response_time)')
+
+# Internally: Session LEFT JOIN Trial (B → A, would normally be invalid)
+# But GROUP BY session_id resets PK to {session_id}
+# Result: All sessions, with avg_rt=NULL for sessions without trials
+```
+
+The left join constraint validation is bypassed internally for aggregation because the `GROUP BY` clause guarantees a valid primary key in the final result.
+
 ## Universal Set `dj.U`
 
 `dj.U()` or `dj.U('attr1', 'attr2', ...)` represents the universal set of all possible values and lineages.
diff --git a/src/datajoint/expression.py b/src/datajoint/expression.py
@@ -282,7 +282,7 @@ def __matmul__(self, other):
             "The @ operator has been removed in DataJoint 2.0. " "Use .join(other, semantic_check=False) for permissive joins."
         )
 
-    def join(self, other, semantic_check=True, left=False):
+    def join(self, other, semantic_check=True, left=False, _aggregation=False):
         """
         Create the joined QueryExpression.
 
@@ -293,6 +293,7 @@ def join(self, other, semantic_check=True, left=False):
         :param semantic_check: If True (default), raise error on non-homologous namesakes.
             If False, bypass semantic check (use for legacy compatibility).
         :param left: If True, perform a left join retaining all rows from self.
+        :param _aggregation: Internal flag to bypass left join validation for aggregation.
 
         Examples:
             a * b  is short for a.join(b)
@@ -336,10 +337,10 @@ def join(self, other, semantic_check=True, left=False):
         result._connection = self.connection
         result._support = self.support + other.support
         result._left = self._left + [left] + other._left
-        result._heading = self.heading.join(other.heading, left=left)
+        result._heading = self.heading.join(other.heading, left=left, _aggregation=_aggregation)
         result._restriction = AndList(self.restriction)
         result._restriction.append(other.restriction)
-        result._original_heading = self.original_heading.join(other.original_heading, left=left)
+        result._original_heading = self.original_heading.join(other.original_heading, left=left, _aggregation=_aggregation)
         assert len(result.support) == len(result._left) + 1
         return result
 
@@ -683,7 +684,8 @@ def create(cls, arg, group, keep_all_rows=False):
 
         if keep_all_rows and len(group.support) > 1 or group.heading.new_attributes:
             group = group.make_subquery()  # subquery if left joining a join
-        join = arg.join(group, left=keep_all_rows)  # reuse the join logic
+        # Pass _aggregation=True to bypass left join validation (aggregation resets PK via GROUP BY)
+        join = arg.join(group, left=keep_all_rows, _aggregation=True)
         result = cls()
         result._connection = join.connection
         result._heading = join.heading.set_primary_key(arg.primary_key)  # use left operand's primary key
diff --git a/src/datajoint/heading.py b/src/datajoint/heading.py
@@ -468,7 +468,7 @@ def select(self, select_list, rename_map=None, compute_map=None):
         )
         return Heading(chain(copy_attrs, compute_attrs))
 
-    def join(self, other, left=False):
+    def join(self, other, left=False, _aggregation=False):
         """
         Join two headings into a new one.
 
@@ -486,11 +486,16 @@ def join(self, other, left=False):
         - If B → A or Neither, the PK would include B's attributes, which could be NULL
         - Only when A → B does PK(A) uniquely identify all result rows
 
+        Exception: Aggregation (A.aggr(B, keep_all_rows=True)) uses a left join internally
+        but requires B → A instead. This is valid because the GROUP BY clause resets the
+        primary key to PK(A), which consists of non-NULL values from the left operand.
+
         It assumes that self and other are headings that share no common dependent attributes.
 
         :param other: The other heading to join with
-        :param left: If True, this is a left join (requires A → B)
-        :raises DataJointError: If left=True and A does not determine B
+        :param left: If True, this is a left join (requires A → B unless _aggregation=True)
+        :param _aggregation: If True, skip left join validation (used by Aggregation.create)
+        :raises DataJointError: If left=True and A does not determine B (unless _aggregation)
         """
         from .errors import DataJointError
 
@@ -502,8 +507,8 @@ def join(self, other, left=False):
             name in other.primary_key or name in other.secondary_attributes for name in self.primary_key
         )
 
-        # For left joins, require A → B
-        if left and not self_determines_other:
+        # For left joins, require A → B (unless this is an aggregation context)
+        if left and not _aggregation and not self_determines_other:
             missing = [
                 name for name in other.primary_key if name not in self.primary_key and name not in self.secondary_attributes
             ]
diff --git a/tests/test_semantic_matching.py b/tests/test_semantic_matching.py
@@ -754,3 +754,63 @@ def test_inner_join_still_works_when_b_determines_a(self, schema_pk_rules):
 
         # PK should be {x, z} (B's PK)
         assert set(result.primary_key) == {"x", "z"}
+
+
+class TestAggregationWithKeepAllRows:
+    """
+    Test that aggregation with keep_all_rows=True works correctly.
+
+    Aggregation uses a left join internally but has the opposite requirement (B → A)
+    compared to direct left joins (which require A → B). This is valid because the
+    GROUP BY clause resets the PK to PK(A).
+    """
+
+    def test_aggregation_keep_all_rows_works_with_b_determines_a(self, schema_pk_rules):
+        """
+        Aggregation with keep_all_rows=True should work when B → A.
+
+        A: x*               PK(A) = {x}
+        B: x*, y*           PK(B) = {x, y}
+
+        B → A? x in PK(B) → Yes (aggregation requirement met)
+
+        The internal left join would normally fail (B → A, not A → B), but
+        aggregation bypasses this because GROUP BY resets PK to {x}.
+        """
+        TableX = schema_pk_rules["TableX"]
+        TableXY = schema_pk_rules["TableXY"]
+
+        # This should work - aggregation with keep_all_rows=True
+        result = TableX.aggr(TableXY, keep_all_rows=True, count="count(*)")
+
+        # PK should be PK(A) = {x} (reset by GROUP BY)
+        assert set(result.primary_key) == {"x"}
+
+    def test_aggregation_keep_all_rows_produces_correct_pk(self, schema_pk_rules):
+        """
+        Aggregation result should always have PK(A), regardless of functional dependencies.
+        """
+        TableXY = schema_pk_rules["TableXY"]
+        TableXZwithY = schema_pk_rules["TableXZwithY"]
+
+        # TableXY (A): PK = {x, y}
+        # TableXZwithY (B): PK = {x, z}, y is secondary
+        # B → A (y secondary in B), so left join would use PK(B) = {x, z}
+        # But aggregation resets to PK(A) = {x, y}
+        result = TableXY.aggr(TableXZwithY, keep_all_rows=True, count="count(*)")
+
+        # PK should be PK(A) = {x, y}
+        assert set(result.primary_key) == {"x", "y"}
+
+    def test_aggregation_without_keep_all_rows_also_works(self, schema_pk_rules):
+        """
+        Normal aggregation (keep_all_rows=False) should continue to work.
+        """
+        TableX = schema_pk_rules["TableX"]
+        TableXY = schema_pk_rules["TableXY"]
+
+        # Normal aggregation (inner join behavior)
+        result = TableX.aggr(TableXY, count="count(*)")
+
+        # PK should be PK(A) = {x}
+        assert set(result.primary_key) == {"x"}