Skip to content

Commit 635c777

Browse files
author
miranov25
committed
feat(RDF): Modular API with collision handling
Phase 4 complete: - setup_rdf_with_friends() - returns (rdf, file_handle) - setup_chain_with_friends() - TChain for multiple files - add_defines_to_rdf(on_collision='warn') - handles existing columns - get_join_columns_for_snapshot() - index columns helper - cache_to_snapshot() - convenience function - export_tree(columns=...) - snapshot mode 20 RDF tests passing. Co-authored-by: Claude (Architect) Reviewed-by: GPT, Gemini, Claude
1 parent b7e5761 commit 635c777

2 files changed

Lines changed: 111 additions & 3 deletions

File tree

UTILS/dfextensions/AliasDataFrame/AliasDataFrameRDF.py

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -912,7 +912,7 @@ def setup_chain_with_friends(adf, file_patterns, treename="tree"):
912912
return rdf, chain, file_handles
913913

914914

915-
def add_defines_to_rdf(rdf, adf, target_aliases):
915+
def add_defines_to_rdf(rdf, adf, target_aliases, on_collision='warn'):
916916
"""
917917
Add Define() chain to RDataFrame for requested aliases.
918918
@@ -927,6 +927,12 @@ def add_defines_to_rdf(rdf, adf, target_aliases):
927927
AliasDataFrame with alias definitions
928928
target_aliases : list of str
929929
Alias names to define (dependencies are auto-resolved)
930+
on_collision : str
931+
How to handle when alias name matches existing column:
932+
- 'error': Raise ValueError
933+
- 'skip': Skip silently
934+
- 'warn': Skip with UserWarning (default)
935+
- 'redefine': Use Redefine() to overwrite
930936
931937
Returns
932938
-------
@@ -946,16 +952,50 @@ def add_defines_to_rdf(rdf, adf, target_aliases):
946952
- Resolves alias dependencies using get_ordered_defines()
947953
- Converts Python expressions to C++ using to_cpp_expr()
948954
- Applies Define() calls in correct dependency order
955+
- Handles column name collisions based on on_collision parameter
949956
"""
957+
import warnings
958+
959+
# Get existing columns in RDataFrame
960+
existing_columns = set(str(c) for c in rdf.GetColumnNames())
961+
950962
# Get ordered defines with C++ expressions
951963
defines = get_ordered_defines(target_aliases, aDF=adf)
964+
skipped = []
952965

953966
# Apply Define() chain
954967
for d in defines:
955968
name = d['name']
956969
cpp_expr = d['cpp_expr']
970+
971+
if name in existing_columns:
972+
if on_collision == 'error':
973+
raise ValueError(
974+
f"add_defines_to_rdf: column '{name}' already exists. "
975+
f"Use on_collision='skip', 'warn', or 'redefine'."
976+
)
977+
elif on_collision == 'skip':
978+
skipped.append(name)
979+
continue
980+
elif on_collision == 'warn':
981+
skipped.append(name)
982+
continue
983+
elif on_collision == 'redefine':
984+
rdf = rdf.Redefine(name, cpp_expr)
985+
continue
986+
else:
987+
raise ValueError(f"Unknown on_collision: {on_collision!r}")
988+
957989
rdf = rdf.Define(name, cpp_expr)
958990

991+
# Single warning for all skipped columns
992+
if skipped and on_collision == 'warn':
993+
warnings.warn(
994+
f"add_defines_to_rdf: {len(skipped)} column(s) already exist, "
995+
f"using existing branch data: {skipped[:5]}{'...' if len(skipped) > 5 else ''}",
996+
UserWarning
997+
)
998+
959999
return rdf
9601000

9611001

@@ -1038,7 +1078,7 @@ def get_join_columns_for_snapshot(adf, target_aliases=None):
10381078

10391079
def cache_to_snapshot(adf, input_file, output_file, target_aliases,
10401080
treename="tree", output_treename="cache",
1041-
include_join_columns=True):
1081+
include_join_columns=True, on_collision='warn'):
10421082
"""
10431083
Convenience function: compute aliases and save to ROOT file via RDataFrame.
10441084
@@ -1061,6 +1101,9 @@ def cache_to_snapshot(adf, input_file, output_file, target_aliases,
10611101
Output tree name (default: "cache")
10621102
include_join_columns : bool
10631103
If True, include index columns for rejoining (default: True)
1104+
on_collision : str
1105+
How to handle column collisions (default: 'warn')
1106+
See add_defines_to_rdf() for options.
10641107
10651108
Returns
10661109
-------
@@ -1091,7 +1134,7 @@ def cache_to_snapshot(adf, input_file, output_file, target_aliases,
10911134
rdf, file_handle = setup_rdf_with_friends(adf, input_file, treename)
10921135

10931136
# Add defines
1094-
rdf = add_defines_to_rdf(rdf, adf, target_aliases)
1137+
rdf = add_defines_to_rdf(rdf, adf, target_aliases, on_collision=on_collision)
10951138

10961139
# Determine columns to save
10971140
columns_to_save = list(target_aliases)

UTILS/dfextensions/AliasDataFrame/tests/test_AliasDataFrameRDF.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -745,6 +745,71 @@ def test_cache_creates_file(self, tmp_path):
745745
assert result['time_s'] > 0
746746

747747

748+
@pytest.mark.skipif(not HAS_ROOT, reason="ROOT not available")
749+
class TestAddDefinesCollision:
750+
"""Test add_defines_to_rdf collision handling."""
751+
752+
@pytest.fixture
753+
def adf_with_collision(self, tmp_path):
754+
"""Create ADF with physical column that matches an alias name."""
755+
df = pd.DataFrame({
756+
'x': np.array([1, 2, 3], dtype=np.float32),
757+
'y': np.array([4, 5, 6], dtype=np.float32),
758+
})
759+
adf = AliasDataFrame(df)
760+
761+
# Export FIRST (so 'x' is a physical branch in tree)
762+
filepath = str(tmp_path / "collision.root")
763+
adf.export_tree(filepath, 'tree')
764+
765+
# THEN add alias 'x' (collides with physical branch 'x')
766+
adf.add_alias('x', 'y * 2')
767+
768+
return adf, filepath
769+
770+
def test_collision_error(self, adf_with_collision):
771+
"""Test on_collision='error' raises ValueError."""
772+
from AliasDataFrameRDF import setup_rdf_with_friends, add_defines_to_rdf
773+
adf, filepath = adf_with_collision
774+
775+
rdf, f = setup_rdf_with_friends(adf, filepath)
776+
with pytest.raises(ValueError, match="already exists"):
777+
add_defines_to_rdf(rdf, adf, ['x'], on_collision='error')
778+
779+
def test_collision_skip(self, adf_with_collision):
780+
"""Test on_collision='skip' skips silently."""
781+
from AliasDataFrameRDF import setup_rdf_with_friends, add_defines_to_rdf
782+
adf, filepath = adf_with_collision
783+
784+
rdf, f = setup_rdf_with_friends(adf, filepath)
785+
rdf = add_defines_to_rdf(rdf, adf, ['x'], on_collision='skip')
786+
787+
# Should use original branch value (1,2,3), not alias (8,10,12)
788+
mean = rdf.Mean('x').GetValue()
789+
assert abs(mean - 2.0) < 0.1 # Original data mean
790+
791+
def test_collision_warn(self, adf_with_collision):
792+
"""Test on_collision='warn' skips with warning."""
793+
from AliasDataFrameRDF import setup_rdf_with_friends, add_defines_to_rdf
794+
adf, filepath = adf_with_collision
795+
796+
rdf, f = setup_rdf_with_friends(adf, filepath)
797+
with pytest.warns(UserWarning, match="already exist"):
798+
rdf = add_defines_to_rdf(rdf, adf, ['x'], on_collision='warn')
799+
800+
def test_collision_redefine(self, adf_with_collision):
801+
"""Test on_collision='redefine' overwrites with alias."""
802+
from AliasDataFrameRDF import setup_rdf_with_friends, add_defines_to_rdf
803+
adf, filepath = adf_with_collision
804+
805+
rdf, f = setup_rdf_with_friends(adf, filepath)
806+
rdf = add_defines_to_rdf(rdf, adf, ['x'], on_collision='redefine')
807+
808+
# Should use alias value (y*2 = 8,10,12), not original (1,2,3)
809+
mean = rdf.Mean('x').GetValue()
810+
assert abs(mean - 10.0) < 0.1 # Alias computation mean
811+
812+
748813
# =============================================================================
749814
# Summary Report
750815
# =============================================================================

0 commit comments

Comments
 (0)