From 59f2a53a485ebea2afb3467785cca5eb1f5f0b98 Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Thu, 14 May 2026 10:17:18 -0600 Subject: [PATCH 1/3] test: add failing test for exponential CaseExpr hashing (#22173) Nested self-referential CASE chains cause exponential hash work during physical planning because CaseExpr derives Hash over both the original CaseBody and the derived ProjectedCaseBody in EvalMethod. Co-Authored-By: Claude Opus 4.6 --- .../physical-expr/src/expressions/case.rs | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/datafusion/physical-expr/src/expressions/case.rs b/datafusion/physical-expr/src/expressions/case.rs index 20d0a9e97e833..92830779966e4 100644 --- a/datafusion/physical-expr/src/expressions/case.rs +++ b/datafusion/physical-expr/src/expressions/case.rs @@ -3120,4 +3120,58 @@ mod tests { Arc::new(expected_with_else), ); } + + /// Reproduces https://github.com/apache/datafusion/issues/22173 + /// + /// Nested self-referential CASE chains (common in rewrite-style projections) + /// should not cause exponential hashing work during physical planning. + #[test] + fn nested_self_referential_case_hash_stays_bounded() -> Result<()> { + use std::hash::Hasher; + + #[derive(Default)] + struct CountingHasher { + write_calls: usize, + bytes_written: usize, + } + + impl Hasher for CountingHasher { + fn finish(&self) -> u64 { + 0 + } + + fn write(&mut self, bytes: &[u8]) { + self.write_calls += 1; + self.bytes_written += bytes.len(); + } + } + + let schema = + Arc::new(Schema::new(vec![Field::new("kind", DataType::Utf8, true)])); + + let kind = col("kind", &schema)?; + let mut label = Arc::clone(&kind); + + for idx in 0..18 { + let predicate = Arc::new(BinaryExpr::new( + Arc::clone(&kind), + Operator::Eq, + lit(idx.to_string()), + )) as Arc; + + label = case(None, vec![(predicate, lit("label"))], Some(label))?; + } + + let mut hasher = CountingHasher::default(); + label.hash(&mut hasher); + + assert!( + hasher.write_calls < 50_000, + "hashing nested CASE expression took {} hasher writes and {} bytes", + hasher.write_calls, + hasher.bytes_written + ); + + Ok(()) + } } From 0a2e1ca9bf0ef45f5d1abae8532fc7eaa16530ae Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Thu, 14 May 2026 10:25:29 -0600 Subject: [PATCH 2/3] perf: exclude derived EvalMethod from CaseExpr Hash/Eq (#22173) EvalMethod (including ProjectedCaseBody) is deterministically derived from CaseBody in try_new(), so hashing it is redundant. With the derived Hash, nested CASE chains caused exponential hash work during ProjectionExec::compute_properties because each level doubled the tree traversal. Manual Hash/Eq on body-only fixes this. Co-Authored-By: Claude Opus 4.6 --- .../physical-expr/src/expressions/case.rs | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/case.rs b/datafusion/physical-expr/src/expressions/case.rs index 92830779966e4..1f23616793242 100644 --- a/datafusion/physical-expr/src/expressions/case.rs +++ b/datafusion/physical-expr/src/expressions/case.rs @@ -268,7 +268,7 @@ struct ProjectedCaseBody { /// [WHEN ...] /// [ELSE result] /// END -#[derive(Debug, Hash, PartialEq, Eq)] +#[derive(Debug)] pub struct CaseExpr { /// The case expression body body: CaseBody, @@ -276,6 +276,23 @@ pub struct CaseExpr { eval_method: EvalMethod, } +// eval_method is functionally derived from body, so excluding it from +// Hash/Eq avoids redundantly hashing the expression tree twice. For +// nested CASE chains this prevents exponential blowup (see #22173). +impl Hash for CaseExpr { + fn hash(&self, state: &mut H) { + self.body.hash(state); + } +} + +impl PartialEq for CaseExpr { + fn eq(&self, other: &Self) -> bool { + self.body == other.body + } +} + +impl Eq for CaseExpr {} + impl std::fmt::Display for CaseExpr { fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { write!(f, "CASE ")?; @@ -3152,7 +3169,8 @@ mod tests { let kind = col("kind", &schema)?; let mut label = Arc::clone(&kind); - for idx in 0..18 { + let num_levels = 18; + for idx in 0..num_levels { let predicate = Arc::new(BinaryExpr::new( Arc::clone(&kind), Operator::Eq, @@ -3165,6 +3183,7 @@ mod tests { let mut hasher = CountingHasher::default(); label.hash(&mut hasher); + println!("{num_levels} level CASE did {} hashes", hasher.write_calls); assert!( hasher.write_calls < 50_000, "hashing nested CASE expression took {} hasher writes and {} bytes", From f63cf01f15f296ab3e57a72f8eaa59e1a28a643b Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Thu, 14 May 2026 10:34:39 -0600 Subject: [PATCH 3/3] no println --- datafusion/physical-expr/src/expressions/case.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion/physical-expr/src/expressions/case.rs b/datafusion/physical-expr/src/expressions/case.rs index 1f23616793242..568ecb9cf336b 100644 --- a/datafusion/physical-expr/src/expressions/case.rs +++ b/datafusion/physical-expr/src/expressions/case.rs @@ -3183,7 +3183,6 @@ mod tests { let mut hasher = CountingHasher::default(); label.hash(&mut hasher); - println!("{num_levels} level CASE did {} hashes", hasher.write_calls); assert!( hasher.write_calls < 50_000, "hashing nested CASE expression took {} hasher writes and {} bytes",