Skip to content

Commit eeb31f0

Browse files
committed
Implement semantic matching for joins
Phase 2-4: Core semantic matching implementation

- Load lineage into Heading when initializing from table_info
- Add get_homologous_namesakes() to find attributes with the same name and lineage
- Update assert_join_compatibility() for semantic matching:
  - Homologous namesakes (same lineage) -> allowed
  - Non-homologous namesakes (different lineage) -> error with guidance
- Modify join() to use homologous namesakes instead of all namesakes
- Remove the @ operator (it now raises an error directing to .join(semantic_check=False))

Joins now require attributes to have matching lineage, not just matching names. This prevents accidental joins on coincidentally-named attributes.
1 parent 5452fb1 commit eeb31f0

File tree

3 files changed

+95
-25
lines changed

3 files changed

+95
-25
lines changed

datajoint/condition.py

Lines changed: 69 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -101,15 +101,45 @@ def __init__(self, restriction):
101101
self.restriction = restriction
102102

103103

104+
def get_homologous_namesakes(expr1, expr2):
    """
    Return the names of attributes shared by both operands that also share a lineage.

    A "namesake" is an attribute name present in the headings of both operands.
    A namesake is "homologous" when both headings report a lineage for it
    (neither is None) and the two lineages compare equal.

    When either operand is a ``U`` instance, lineage is not consulted and every
    namesake is returned.

    :param expr1: A QueryExpression object
    :param expr2: A QueryExpression object
    :return: set of attribute names that are homologous namesakes
    """
    from .expression import U

    shared_names = set(expr1.heading.names).intersection(expr2.heading.names)

    # U operands fall back to plain name matching.
    if any(isinstance(operand, U) for operand in (expr1, expr2)):
        return shared_names

    # Look up the lineage of every shared name on both sides.
    lineage_pairs = {
        name: (expr1.heading.get_lineage(name), expr2.heading.get_lineage(name))
        for name in shared_names
    }

    # Keep only names whose lineage is known on both sides and identical.
    return {
        name
        for name, (left, right) in lineage_pairs.items()
        if left is not None and right is not None and left == right
    }
130+
131+
104132
def assert_join_compatibility(expr1, expr2):
105133
"""
106-
Determine if expressions expr1 and expr2 are join-compatible. To be join-compatible,
107-
the matching attributes in the two expressions must be in the primary key of one or the
108-
other expression.
109-
Raises an exception if not compatible.
134+
Determine if expressions expr1 and expr2 are join-compatible using semantic matching.
135+
136+
For each namesake attribute (same name in both expressions):
137+
- If they have the same lineage (homologous), they can be joined
138+
- If they have different lineage (non-homologous), raise an error
110139
111140
:param expr1: A QueryExpression object
112141
:param expr2: A QueryExpression object
142+
:raises DataJointError: if non-homologous namesakes are found
113143
"""
114144
from .expression import QueryExpression, U
115145

@@ -118,21 +148,42 @@ def assert_join_compatibility(expr1, expr2):
118148
raise DataJointError(
119149
"Object %r is not a QueryExpression and cannot be joined." % rel
120150
)
121-
if not isinstance(expr1, U) and not isinstance(
122-
expr2, U
123-
): # dj.U is always compatible
124-
try:
125-
raise DataJointError(
126-
"Cannot join query expressions on dependent attribute `%s`"
127-
% next(
128-
r
129-
for r in set(expr1.heading.secondary_attributes).intersection(
130-
expr2.heading.secondary_attributes
131-
)
151+
152+
if isinstance(expr1, U) or isinstance(expr2, U):
153+
return # dj.U is always compatible
154+
155+
# Find namesake attributes (same name in both expressions)
156+
namesakes = set(expr1.heading.names) & set(expr2.heading.names)
157+
158+
for attr in namesakes:
159+
lineage1 = expr1.heading.get_lineage(attr)
160+
lineage2 = expr2.heading.get_lineage(attr)
161+
162+
# Check if they are homologous (same lineage)
163+
# None lineages are never homologous (not even with each other)
164+
if lineage1 is None or lineage2 is None or lineage1 != lineage2:
165+
# Non-homologous namesakes - error
166+
if lineage1 is None and lineage2 is None:
167+
msg = (
168+
f"Cannot join: attribute '{attr}' has no lineage in both operands "
169+
f"(native secondary attributes). Use .proj() to rename one."
132170
)
133-
)
134-
except StopIteration:
135-
pass # all ok
171+
elif lineage1 is None:
172+
msg = (
173+
f"Cannot join: attribute '{attr}' has lineage '{lineage2}' in one operand "
174+
f"but no lineage in the other. Use .proj() to rename one."
175+
)
176+
elif lineage2 is None:
177+
msg = (
178+
f"Cannot join: attribute '{attr}' has lineage '{lineage1}' in one operand "
179+
f"but no lineage in the other. Use .proj() to rename one."
180+
)
181+
else:
182+
msg = (
183+
f"Cannot join: attribute '{attr}' has different lineages "
184+
f"('{lineage1}' vs '{lineage2}'). Use .proj() to rename one."
185+
)
186+
raise DataJointError(msg)
136187

137188

138189
def make_condition(query_expression, condition, columns):

datajoint/expression.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
Top,
1212
assert_join_compatibility,
1313
extract_column_names,
14+
get_homologous_namesakes,
1415
make_condition,
1516
translate_attribute,
1617
)
@@ -292,19 +293,22 @@ def __mul__(self, other):
292293

293294
def __matmul__(self, other):
294295
"""
295-
Permissive join of query expressions `self` and `other` ignoring compatibility check
296-
e.g. ``q1 @ q2``.
296+
The @ operator has been removed. Use .join(semantic_check=False) instead.
297297
"""
298-
if inspect.isclass(other) and issubclass(other, QueryExpression):
299-
other = other() # instantiate
300-
return self.join(other, semantic_check=False)
298+
raise DataJointError(
299+
"The @ operator has been removed in DataJoint 2.0. "
300+
"Use .join(other, semantic_check=False) instead."
301+
)
301302

302303
def join(self, other, semantic_check=True, left=False):
303304
"""
304305
create the joined QueryExpression.
305306
a * b is short for A.join(B)
306-
a @ b is short for A.join(B, semantic_check=False)
307+
a @ b is no longer supported (the @ operator now raises DataJointError); call A.join(B, semantic_check=False) instead
307308
Additionally, left=True will retain the rows of self, effectively performing a left join.
309+
310+
With semantic matching (semantic_check=True), joins are performed only on
311+
homologous namesakes (attributes with same name AND same lineage).
308312
"""
309313
# trigger subqueries if joining on renamed attributes
310314
if isinstance(other, U):
@@ -315,7 +319,11 @@ def join(self, other, semantic_check=True, left=False):
315319
raise DataJointError("The argument of join must be a QueryExpression")
316320
if semantic_check:
317321
assert_join_compatibility(self, other)
318-
join_attributes = set(n for n in self.heading.names if n in other.heading.names)
322+
# Use semantic matching: only join on homologous namesakes
323+
join_attributes = get_homologous_namesakes(self, other)
324+
else:
325+
# Permissive mode: join on all namesakes regardless of lineage
326+
join_attributes = set(n for n in self.heading.names if n in other.heading.names)
319327
# needs subquery if self's FROM clause has common attributes with other's FROM clause
320328
need_subquery1 = need_subquery2 = bool(
321329
(set(self.original_heading.names) & set(other.original_heading.names))

datajoint/heading.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,17 @@ def _init_from_database(self):
433433
# restore adapted type name
434434
attr["type"] = adapter_name
435435

436+
# Load lineage information for semantic matching
437+
try:
438+
from .lineage import get_lineage_for_heading
439+
440+
lineage_map = get_lineage_for_heading(conn, database, table_name, None)
441+
for attr in attributes:
442+
attr["lineage"] = lineage_map.get(attr["name"])
443+
except Exception as e:
444+
# If lineage loading fails, continue without it (backward compatibility)
445+
logger.debug(f"Could not load lineage for {database}.{table_name}: {e}")
446+
436447
self._attributes = dict(((q["name"], Attribute(**q)) for q in attributes))
437448

438449
# Read and tabulate secondary indexes

0 commit comments

Comments
 (0)