@@ -214,8 +214,8 @@ class SchemaGraph:
214214### Phase 1: Add Lineage Infrastructure
215215
2162161 . ** Add ` lineage ` and ` lineage_hash ` fields to ` Attribute ` ** (` heading.py ` )
217- - ` lineage ` : tuple ` (origin_schema, origin_table, origin_attribute) ` or ` None `
218- - ` lineage_hash ` : short hash (e.g., 8 bytes) for fast comparison
217+ - ` lineage ` : string ` "schema.table.attribute" ` or ` None `
218+ - ` lineage_hash ` : 8-byte hash for fast comparison
219219 - Add both to ` default_attribute_properties ` with default ` None `
220220
221221 ``` python
@@ -224,14 +224,13 @@ class SchemaGraph:
224224 if lineage is None :
225225 return None
226226 # Use first 8 bytes of SHA-256 for compact representation
227- canonical = f " { lineage[0 ]} . { lineage[1 ]} . { lineage[2 ]} "
228- return hashlib.sha256(canonical.encode()).digest()[:8 ]
227+ return hashlib.sha256(lineage.encode()).digest()[:8 ]
229228 ```
230229
231230 **Comparison strategy**:
232- - Fast path: compare ` lineage_hash ` (8-byte comparison)
233- - On hash match: verify full ` lineage ` tuple (collision protection)
234- - ` None ` lineage never matches anything (computed attributes)
231+ - Compare ` lineage_hash ` only (8-byte comparison)
232+ - Hash collisions (about 1 in 2^64 per comparison) are acceptable given their low probability and low cost
233+ - `None` lineage never matches anything (computed attributes and native secondary attributes)
235234
2362352 . ** Create ` ~lineage ` table management** (new file: ` datajoint/lineage.py ` )
237236 - ` LineageTable ` class (similar to ` ExternalTable ` )
@@ -340,10 +339,12 @@ This approach:
340339 When the `~lineage` table does not exist (e.g., external databases, legacy schemas), lineage is computed **in-memory** from the FK graph using the existing `Dependencies` class:
341340
342341``` python
343- def compute_lineage_from_dependencies (connection , table_name , attribute_name ):
342+ def compute_lineage_from_dependencies (connection , schema , table_name , attribute_name ):
344343 """
345344 Compute lineage by traversing the FK graph.
346345 Uses connection.dependencies which already loads FK info from INFORMATION_SCHEMA.
346+
347+ Returns lineage string "schema.table.attribute" or None for native secondary attrs.
347348 """
348349 connection.dependencies.load(force = False ) # ensure dependencies are loaded
349350
@@ -353,16 +354,18 @@ def compute_lineage_from_dependencies(connection, table_name, attribute_name):
353354 for parent, props in connection.dependencies.parents(full_table_name).items():
354355 attr_map = props.get(' attr_map' , {})
355356 if attribute_name in attr_map:
356- # This attribute is inherited from parent
357+ # This attribute is inherited from parent - recurse to find origin
357358 parent_attr = attr_map[attribute_name]
358359 parent_schema, parent_table = parse_full_table_name(parent)
359- # Recurse to find ultimate origin
360360 return compute_lineage_from_dependencies(
361- connection, parent_table, parent_attr, parent_schema
361+ connection, parent_schema, parent_table, parent_attr
362362 )
363363
364- # Not inherited - this table is the origin
365- return (schema, table_name, attribute_name)
364+ # Not inherited - origin is this table (for PK attrs) or None (for native secondary)
365+ if is_primary_key(connection, schema, table_name, attribute_name):
366+ return f " { schema} . { table_name} . { attribute_name} "
367+ else :
368+ return None # native secondary attribute
366369```
367370
368371#### Integration with Dependencies Loading
@@ -380,12 +383,10 @@ def compute_lineage_from_dependencies(connection, table_name, attribute_name):
380383CREATE TABLE ` ~lineage` (
381384 table_name VARCHAR (64 ) NOT NULL ,
382385 attribute_name VARCHAR (64 ) NOT NULL ,
383- origin_schema VARCHAR (64 ) NOT NULL ,
384- origin_table VARCHAR (64 ) NOT NULL ,
385- origin_attribute VARCHAR (64 ) NOT NULL ,
386+ lineage VARCHAR (200 ) NOT NULL , -- "schema.table.attribute"
386387 lineage_hash BINARY(8 ) NOT NULL , -- fast comparison hash
387388 PRIMARY KEY (table_name, attribute_name),
388- INDEX idx_lineage_hash (lineage_hash) -- enables hash-based lookups
389+ INDEX idx_lineage_hash (lineage_hash)
389390) ENGINE= InnoDB;
390391```
391392
@@ -394,9 +395,7 @@ For PostgreSQL:
394395CREATE TABLE " ~lineage" (
395396 table_name VARCHAR (64 ) NOT NULL ,
396397 attribute_name VARCHAR (64 ) NOT NULL ,
397- origin_schema VARCHAR (64 ) NOT NULL ,
398- origin_table VARCHAR (64 ) NOT NULL ,
399- origin_attribute VARCHAR (64 ) NOT NULL ,
398+ lineage VARCHAR (200 ) NOT NULL , -- "schema.table.attribute"
400399 lineage_hash BYTEA NOT NULL , -- 8 bytes
401400 PRIMARY KEY (table_name, attribute_name)
402401);
@@ -411,7 +410,7 @@ When a `Heading` is initialized from a table, query the `~lineage` table:
411410def _load_lineage (self , connection , database , table_name ):
412411 """ Load lineage information from the ~lineage metadata table."""
413412 query = """
414- SELECT attribute_name, origin_schema, origin_table, origin_attribute
413+ SELECT attribute_name, lineage, lineage_hash
415414 FROM `{database} `.`~lineage`
416415 WHERE table_name = %s
417416 """ .format(database = database)
@@ -529,22 +528,25 @@ def migrate_schema_lineage(schema):
529528#### Algorithm for Computing Lineage
530529
531530``` python
532- def compute_attribute_lineage (schema , table , attribute ):
531+ def compute_attribute_lineage (schema , table , attribute , is_pk ):
533532 """
534533 Trace an attribute to its original definition.
535534
536- Returns (origin_schema, origin_table, origin_attribute)
535+ Returns lineage string "schema.table.attribute" or None for native secondary.
537536 """
538537 # Check if this attribute is part of a foreign key
539538 fk_info = get_foreign_key_for_attribute(schema, table, attribute)
540539
541540 if fk_info is None :
542- # Native attribute - origin is this table
543- return (schema, table, attribute)
541+ # Native attribute
542+ if is_pk:
543+ return f " { schema} . { table} . { attribute} " # PK has lineage
544+ else :
545+ return None # native secondary has no lineage
544546
545547 # Inherited via FK - recurse to referenced table
546548 ref_schema, ref_table, ref_attribute = fk_info
547- return compute_attribute_lineage(ref_schema, ref_table, ref_attribute)
549+ return compute_attribute_lineage(ref_schema, ref_table, ref_attribute, is_pk = True )
548550```
549551
550552#### MySQL Query for FK Analysis
@@ -590,16 +592,16 @@ WHERE c.contype = 'f'
590592## Performance Considerations
591593
5925941 . ** Memory** : Two additional fields per attribute
593- - ` lineage ` : tuple of 3 strings ( ~ 100-200 bytes typical)
595+ - `lineage`: string `"schema.table.attribute"` (~50-100 bytes typical) or `None`
594596 - ` lineage_hash ` : 8 bytes (fixed)
595597
596- 2 . ** Comparison** : Two-phase strategy for optimal performance
597- - ** Fast path ** : Compare 8-byte ` lineage_hash ` values (single comparison)
598- - ** Verification ** : On hash match, verify full tuple (collision protection)
599- - Hash collisions are astronomically rare (1 in 2^64) but we verify anyway
598+ 2. **Comparison**: Hash-only comparison
599+ - Compare 8-byte ` lineage_hash ` values (single integer comparison)
600+ - No fallback verification needed - collision probability (1 in 2^64) is negligible
601+ - ` None ` hashes never match
600602
6016033 . ** Storage** : Small overhead in ` ~lineage ` table
602- - ~ 200 bytes per attribute (table_name + attribute_name + origin tuple + hash)
604+ - ~ 150 bytes per attribute (table_name + attribute_name + lineage string + hash)
603605 - Indexed by (table_name, attribute_name) for fast lookup
604606 - Secondary index on ` lineage_hash ` for potential future optimizations
605607
0 commit comments