From 6ecfc4ebc02cd50faf4a71ee361a9e56c1552121 Mon Sep 17 00:00:00 2001
From: Christian Bush
Date: Tue, 27 Jan 2026 02:19:32 -0800
Subject: [PATCH 1/2] [C++] Add ORC stripe statistics extraction foundation

Add internal utilities for extracting min/max statistics from ORC stripe
metadata. This establishes the foundation for statistics-based stripe
filtering in predicate pushdown.

Changes:
- Add MinMaxStats struct to hold extracted statistics
- Add ExtractStripeStatistics() function for INT64 columns
- Statistics extraction returns std::nullopt for missing/invalid data
- Rejects a null field type and out-of-range column ids with std::nullopt
- Validates statistics integrity (min <= max)

This is an internal-only change with no public API modifications.
Part of incremental ORC predicate pushdown implementation (PR1/15).
---
NOTE(review): text inside angle brackets (template arguments, #include
targets) was lost in extraction and has been restored; please verify the
context lines and restored types against the tree before applying.

 cpp/src/arrow/adapters/orc/adapter.cc | 61 +++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc
index 51cca497485..1ef149c9b67 100644
--- a/cpp/src/arrow/adapters/orc/adapter.cc
+++ b/cpp/src/arrow/adapters/orc/adapter.cc
@@ -20,6 +20,7 @@
 #include <algorithm>
 #include <list>
 #include <memory>
+#include <optional>
 #include <sstream>
 #include <string>
 #include <utility>
@@ -100,6 +101,66 @@ constexpr uint64_t kOrcNaturalWriteSize = 128 * 1024;
 
 using internal::checked_cast;
 
+// Min/max summary of one column in one stripe, taken from ORC statistics
+struct MinMaxStats {
+  int64_t min;    // minimum reported by the statistics
+  int64_t max;    // maximum reported by the statistics
+  bool has_null;  // whether the statistics report any null
+
+  MinMaxStats(int64_t min_val, int64_t max_val, bool null_flag)
+      : min(min_val), max(max_val), has_null(null_flag) {}
+};
+
+// Extract the stripe-level min/max statistics of column `orc_column_id`.
+// Only Arrow INT64 fields are handled in this initial implementation.
+// Returns std::nullopt when the inputs are unusable or the statistics are
+// missing, incomplete, of an unexpected kind, or inconsistent (min > max).
+std::optional<MinMaxStats> ExtractStripeStatistics(
+    const std::unique_ptr<liborc::StripeStatistics>& stripe_stats,
+    uint32_t orc_column_id,
+    const std::shared_ptr<DataType>& field_type) {
+  // No statistics, or a field type this implementation does not support.
+  // A null field_type is rejected here rather than dereferenced.
+  if (!stripe_stats || !field_type || field_type->id() != Type::INT64) {
+    return std::nullopt;
+  }
+
+  // Reject out-of-range column ids before asking for their statistics.
+  if (orc_column_id >= stripe_stats->getNumberOfColumns()) {
+    return std::nullopt;
+  }
+
+  const liborc::ColumnStatistics* col_stats =
+      stripe_stats->getColumnStatistics(orc_column_id);
+  if (!col_stats) {
+    return std::nullopt;  // Column statistics missing
+  }
+
+  // Only integer statistics can back an INT64 range; the cast yields null
+  // for any other statistics kind.
+  const auto* int_stats =
+      dynamic_cast<const liborc::IntegerColumnStatistics*>(col_stats);
+  if (!int_stats) {
+    return std::nullopt;  // Wrong statistics type
+  }
+
+  // Both bounds are required to describe a range.
+  if (!int_stats->hasMinimum() || !int_stats->hasMaximum()) {
+    return std::nullopt;  // Statistics incomplete
+  }
+
+  const int64_t min_value = int_stats->getMinimum();
+  const int64_t max_value = int_stats->getMaximum();
+  const bool has_null = col_stats->hasNull();
+
+  // Sanity check: an inverted range means the statistics cannot be trusted.
+  if (min_value > max_value) {
+    return std::nullopt;  // Invalid statistics
+  }
+
+  return MinMaxStats(min_value, max_value, has_null);
+}
+
 class ArrowInputFile : public liborc::InputStream {
  public:
   explicit ArrowInputFile(const std::shared_ptr<io::RandomAccessFile>& file)
From aeb48bbd59604e4577bed4ef6aaa974b18988c78 Mon Sep 17 00:00:00 2001
From: Christian Bush
Date: Tue, 27 Jan 2026 02:20:12 -0800
Subject: [PATCH 2/2] [C++] Add Arrow expression builder for ORC statistics

Add utility functions to convert ORC stripe statistics into Arrow compute
expressions. These expressions represent guarantees about what values could
exist in a stripe, enabling predicate pushdown via Arrow's
SimplifyWithGuarantee() API.

Changes:
- Add BuildMinMaxExpression() for creating range expressions
- Support null handling with OR is_null(field) when nulls present
- Add convenience overload accepting MinMaxStats directly
- Expression format: (field >= min AND field <= max) [OR is_null(field)]

This is an internal-only utility with no public API changes.
Part of incremental ORC predicate pushdown implementation (PR2/15).
---
NOTE(review): text inside angle brackets (template arguments) was lost in
extraction and has been restored; please verify against the tree.

 cpp/src/arrow/adapters/orc/adapter.cc | 55 +++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc
index 1ef149c9b67..af42d90c054 100644
--- a/cpp/src/arrow/adapters/orc/adapter.cc
+++ b/cpp/src/arrow/adapters/orc/adapter.cc
@@ -31,9 +31,11 @@
 
 #include "arrow/adapters/orc/util.h"
 #include "arrow/builder.h"
+#include "arrow/compute/expression.h"
 #include "arrow/io/interfaces.h"
 #include "arrow/memory_pool.h"
 #include "arrow/record_batch.h"
+#include "arrow/scalar.h"
 #include "arrow/status.h"
 #include "arrow/table.h"
 #include "arrow/table_builder.h"
@@ -161,6 +163,59 @@ std::optional<MinMaxStats> ExtractStripeStatistics(
   return MinMaxStats(min_value, max_value, has_null);
 }
 
+// Build an Arrow expression describing the guarantee implied by stripe
+// statistics. Returns (field >= min AND field <= max); when `has_null` is
+// true the result is additionally OR-ed with is_null(field).
+//
+// This expression describes what values COULD exist in the stripe.
+// Arrow's SimplifyWithGuarantee() will use this to determine if
+// a predicate could be satisfied by this stripe.
+//
+// Example: If stripe has min=0, max=100 and no nulls, the guarantee is:
+//   (field >= 0 AND field <= 100)
+//
+// Then for predicate "field > 200", SimplifyWithGuarantee returns literal(false),
+// indicating the stripe can be skipped.
+//
+// `field_type` is not consulted by the current implementation.
+compute::Expression BuildMinMaxExpression(
+    const FieldRef& field_ref,
+    const std::shared_ptr<DataType>& field_type,
+    const Scalar& min_value,
+    const Scalar& max_value,
+    bool has_null) {
+  auto field_expr = compute::field_ref(field_ref);
+
+  // Build range expression: field >= min AND field <= max
+  auto min_expr = compute::greater_equal(field_expr, compute::literal(min_value));
+  auto max_expr = compute::less_equal(field_expr, compute::literal(max_value));
+  auto range_expr = compute::and_(std::move(min_expr), std::move(max_expr));
+
+  // If the stripe contains nulls, widen the guarantee so that stripes are
+  // not skipped when the predicate could match null values.
+  if (has_null) {
+    auto null_expr = compute::is_null(field_expr);
+    return compute::or_(std::move(range_expr), std::move(null_expr));
+  }
+
+  return range_expr;
+}
+
+// Convenience overload that takes MinMaxStats directly and forwards to the
+// Scalar-based overload above.
+compute::Expression BuildMinMaxExpression(
+    const FieldRef& field_ref,
+    const std::shared_ptr<DataType>& field_type,
+    const MinMaxStats& stats) {
+  // Wrap the int64 bounds in Arrow scalars for the Scalar-based overload
+  auto min_scalar = std::make_shared<Int64Scalar>(stats.min);
+  auto max_scalar = std::make_shared<Int64Scalar>(stats.max);
+
+  return BuildMinMaxExpression(field_ref, field_type,
+                               *min_scalar, *max_scalar,
+                               stats.has_null);
+}
+
 class ArrowInputFile : public liborc::InputStream {
  public:
   explicit ArrowInputFile(const std::shared_ptr<io::RandomAccessFile>& file)