
Commit 7873e80

teach row group pruning about struct fields
1 parent b61aee7 commit 7873e80

File tree

10 files changed: 823 additions, 235 deletions

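The commit message says pruning now understands struct fields: row group pruning can consult Parquet min/max statistics for fields nested inside struct columns, not only top-level columns, as the `PruningColumn` changes in the diffs below suggest. A minimal sketch of the kind of query that should benefit; the file name, column names, and bracket-style field access are illustrative assumptions, not taken from this commit:

use datafusion::error::Result;
use datafusion::prelude::{ParquetReadOptions, SessionContext};

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    // Hypothetical Parquet file whose `props` column is a struct
    // containing an integer field `x`.
    ctx.register_parquet("t", "data.parquet", ParquetReadOptions::default())
        .await?;
    // With struct-aware pruning, row groups whose min/max statistics for
    // `props.x` rule out the predicate can be skipped before decoding.
    let df = ctx.sql("SELECT * FROM t WHERE props['x'] > 100").await?;
    df.show().await?;
    Ok(())
}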

Cargo.lock

Lines changed: 17 additions & 32 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 7 additions & 7 deletions
@@ -91,19 +91,19 @@ ahash = { version = "0.8", default-features = false, features = [
     "runtime-rng",
 ] }
 apache-avro = { version = "0.21", default-features = false }
-arrow = { version = "58.0.0", features = [
+arrow = { git = "https://github.com/pydantic/arrow-rs.git", branch = "friendlymatthew/statistics-converter-from-col-index", features = [
     "prettyprint",
     "chrono-tz",
 ] }
-arrow-buffer = { version = "58.0.0", default-features = false }
-arrow-flight = { version = "58.0.0", features = [
+arrow-buffer = { git = "https://github.com/pydantic/arrow-rs.git", branch = "friendlymatthew/statistics-converter-from-col-index", default-features = false }
+arrow-flight = { git = "https://github.com/pydantic/arrow-rs.git", branch = "friendlymatthew/statistics-converter-from-col-index", features = [
     "flight-sql-experimental",
 ] }
-arrow-ipc = { version = "58.0.0", default-features = false, features = [
+arrow-ipc = { git = "https://github.com/pydantic/arrow-rs.git", branch = "friendlymatthew/statistics-converter-from-col-index", default-features = false, features = [
     "lz4",
 ] }
-arrow-ord = { version = "58.0.0", default-features = false }
-arrow-schema = { version = "58.0.0", default-features = false }
+arrow-ord = { git = "https://github.com/pydantic/arrow-rs.git", branch = "friendlymatthew/statistics-converter-from-col-index", default-features = false }
+arrow-schema = { git = "https://github.com/pydantic/arrow-rs.git", branch = "friendlymatthew/statistics-converter-from-col-index", default-features = false }
 async-trait = "0.1.89"
 bigdecimal = "0.4.8"
 bytes = "1.11"
@@ -168,7 +168,7 @@ memchr = "2.8.0"
 num-traits = { version = "0.2" }
 object_store = { version = "0.13.1", default-features = false }
 parking_lot = "0.12"
-parquet = { version = "58.0.0", default-features = false, features = [
+parquet = { git = "https://github.com/pydantic/arrow-rs.git", branch = "friendlymatthew/statistics-converter-from-col-index", default-features = false, features = [
     "arrow",
     "async",
     "object_store",

datafusion-examples/examples/data_io/parquet_index.rs

Lines changed: 14 additions & 16 deletions
@@ -25,7 +25,7 @@ use arrow::datatypes::{Int32Type, SchemaRef};
 use arrow::util::pretty::pretty_format_batches;
 use async_trait::async_trait;
 use datafusion::catalog::Session;
-use datafusion::common::pruning::PruningStatistics;
+use datafusion::common::pruning::{PruningColumn, PruningStatistics};
 use datafusion::common::{
     DFSchema, DataFusionError, Result, ScalarValue, internal_datafusion_err,
 };
@@ -432,21 +432,19 @@ impl ParquetMetadataIndex {
 /// the required statistics via the [`PruningStatistics`] trait
 impl PruningStatistics for ParquetMetadataIndex {
     /// return the minimum values for the value column
-    fn min_values(&self, column: &Column) -> Option<ArrayRef> {
-        if column.name.eq("value") {
-            Some(self.value_column_mins().clone())
-        } else {
-            None
-        }
+    fn min_values(&self, column: &PruningColumn) -> Option<ArrayRef> {
+        column
+            .name()
+            .eq("value")
+            .then_some(self.value_column_mins().clone())
     }
 
     /// return the maximum values for the value column
-    fn max_values(&self, column: &Column) -> Option<ArrayRef> {
-        if column.name.eq("value") {
-            Some(self.value_column_maxes().clone())
-        } else {
-            None
-        }
+    fn max_values(&self, column: &PruningColumn) -> Option<ArrayRef> {
+        column
+            .name()
+            .eq("value")
+            .then_some(self.value_column_maxes().clone())
     }
 
     /// return the number of "containers". In this example, each "container" is
@@ -457,20 +455,20 @@ impl PruningStatistics for ParquetMetadataIndex {
 
     /// Return `None` to signal we don't have any information about null
     /// counts in the index,
-    fn null_counts(&self, _column: &Column) -> Option<ArrayRef> {
+    fn null_counts(&self, _column: &PruningColumn) -> Option<ArrayRef> {
         None
     }
 
     /// return the row counts for each file
-    fn row_counts(&self, _column: &Column) -> Option<ArrayRef> {
+    fn row_counts(&self, _column: &PruningColumn) -> Option<ArrayRef> {
         Some(self.row_counts_ref().clone())
     }
 
     /// The `contained` API can be used with structures such as Bloom filters,
     /// but is not used in this example, so return `None`
     fn contained(
         &self,
-        _column: &Column,
+        _column: &PruningColumn,
         _values: &HashSet<ScalarValue>,
     ) -> Option<BooleanArray> {
         None
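The accessor rewrite above swaps an explicit if/else for `bool::then_some` from the Rust standard library. The two forms return the same `Option`, as this standalone comparison shows; note that `then_some` evaluates its argument eagerly (here, the clone runs even on a name mismatch), while `bool::then` takes a closure and defers the work:

fn main() {
    let name = "value";

    // if/else form, as in the old code.
    let a: Option<i32> = if name.eq("value") { Some(42) } else { None };

    // `bool::then_some` form, as in the new code: Some(42) when true, None when false.
    let b: Option<i32> = name.eq("value").then_some(42);

    // `bool::then` only runs the closure when the condition is true.
    let c: Option<i32> = name.eq("value").then(|| 42);

    assert_eq!(a, b);
    assert_eq!(b, c);
}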

datafusion-examples/examples/query_planning/pruning.rs

Lines changed: 8 additions & 8 deletions
@@ -22,7 +22,7 @@ use std::sync::Arc;
 
 use arrow::array::{ArrayRef, BooleanArray, Int32Array};
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
-use datafusion::common::pruning::PruningStatistics;
+use datafusion::common::pruning::{PruningColumn, PruningStatistics};
 use datafusion::common::{DFSchema, ScalarValue};
 use datafusion::error::Result;
 use datafusion::execution::context::ExecutionProps;
@@ -148,40 +148,40 @@ impl PruningStatistics for MyCatalog {
         3
     }
 
-    fn min_values(&self, column: &Column) -> Option<ArrayRef> {
+    fn min_values(&self, column: &PruningColumn) -> Option<ArrayRef> {
         // The pruning predicate evaluates the bounds for multiple expressions
         // at once, so return an array with an element for the minimum value in
         // each file
-        match column.name.as_str() {
+        match column.name() {
             "x" => Some(i32_array(self.x_values.iter().map(|(min, _)| min))),
             "y" => Some(i32_array(self.y_values.iter().map(|(min, _)| min))),
             name => panic!("unknown column name: {name}"),
         }
     }
 
-    fn max_values(&self, column: &Column) -> Option<ArrayRef> {
+    fn max_values(&self, column: &PruningColumn) -> Option<ArrayRef> {
         // similarly to min_values, return an array with an element for the
         // maximum value in each file
-        match column.name.as_str() {
+        match column.name() {
             "x" => Some(i32_array(self.x_values.iter().map(|(_, max)| max))),
             "y" => Some(i32_array(self.y_values.iter().map(|(_, max)| max))),
             name => panic!("unknown column name: {name}"),
         }
     }
 
-    fn null_counts(&self, _column: &Column) -> Option<ArrayRef> {
+    fn null_counts(&self, _column: &PruningColumn) -> Option<ArrayRef> {
        // In this example, we know nothing about the number of nulls
        None
    }
 
-    fn row_counts(&self, _column: &Column) -> Option<ArrayRef> {
+    fn row_counts(&self, _column: &PruningColumn) -> Option<ArrayRef> {
        // In this example, we know nothing about the number of rows in each file
        None
    }
 
     fn contained(
         &self,
-        _column: &Column,
+        _column: &PruningColumn,
         _values: &HashSet<ScalarValue>,
     ) -> Option<BooleanArray> {
         // this method can be used to implement Bloom filter like filtering
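For context on how a `PruningStatistics` implementation like `MyCatalog` is consumed: the caller builds a `PruningPredicate` from a physical filter expression plus the schema, then asks which containers can possibly match. A minimal sketch, assuming the `PruningPredicate` API at `datafusion::physical_optimizer::pruning` as published in recent DataFusion releases; exact paths and signatures may differ on the branch this commit targets:

use std::sync::Arc;

use arrow::datatypes::{DataType, Field, Schema};
use datafusion::common::DFSchema;
use datafusion::error::Result;
use datafusion::execution::context::ExecutionProps;
use datafusion::physical_expr::create_physical_expr;
use datafusion::physical_optimizer::pruning::PruningPredicate;
use datafusion::prelude::{col, lit};

fn main() -> Result<()> {
    // Single Int32 column `x`, mirroring the `MyCatalog` example above.
    let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int32, false)]));
    let df_schema = DFSchema::try_from(schema.clone())?;

    // Logical predicate `x > 100`, lowered to a physical expression.
    let expr = col("x").gt(lit(100));
    let physical = create_physical_expr(&expr, &df_schema, &ExecutionProps::new())?;

    // `prune` consults a `PruningStatistics` impl and returns one bool per
    // container: true = may contain matching rows, false = provably cannot.
    let predicate = PruningPredicate::try_new(physical, schema)?;
    // let keep: Vec<bool> = predicate.prune(&my_catalog)?;
    let _ = predicate;
    Ok(())
}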
