Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 60 additions & 4 deletions datafusion/core/src/dataframe/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ use datafusion_common::{
plan_datafusion_err, plan_err, unqualified_field_not_found,
};
use datafusion_expr::select_expr::SelectExpr;
use datafusion_expr::utils::find_aggregate_exprs;
use datafusion_expr::{
ExplainOption, SortExpr, TableProviderFilterPushDown, UNNAMED_TABLE, case,
dml::InsertOp,
Expand Down Expand Up @@ -410,21 +411,76 @@ impl DataFrame {
expr_list: impl IntoIterator<Item = impl Into<SelectExpr>>,
) -> Result<DataFrame> {
let expr_list: Vec<SelectExpr> =
expr_list.into_iter().map(|e| e.into()).collect::<Vec<_>>();
expr_list.into_iter().map(|e| e.into()).collect();

// Extract plain expressions
let expressions = expr_list.iter().filter_map(|e| match e {
SelectExpr::Expression(expr) => Some(expr),
_ => None,
});

let window_func_exprs = find_window_exprs(expressions);
let plan = if window_func_exprs.is_empty() {
// Apply window functions first
let window_func_exprs = find_window_exprs(expressions.clone());

let mut plan = if window_func_exprs.is_empty() {
self.plan
} else {
LogicalPlanBuilder::window_plan(self.plan, window_func_exprs)?
};

let project_plan = LogicalPlanBuilder::from(plan).project(expr_list)?.build()?;
// Collect aggregate expressions
let aggr_exprs = find_aggregate_exprs(expressions.clone());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`find_aggregate_exprs()` deduplicates the expressions.
A test like:

let res = df.select(vec![
        count(col("c9")).alias("count_c9"),
        count(col("c9")).alias("count_c9_str"),
    ])?;

fails with:


failures:

---- dataframe::test_dataframe_api_aggregate_fn_in_select2 stdout ----
Error: SchemaError(FieldNotFound { field: Column { relation: None, name: "__agg_1" }, valid_fields: [Column { relation: None, name: "__agg_0" }, Column { relation: Some(Bare { table: "aggregate_test_100" }), name: "c1" }, Column { relation: Some(Bare { table: "aggregate_test_100" }), name: "c2" }, Column { relation: Some(Bare { table: "aggregate_test_100" }), name: "c3" }, Column { relation: Some(Bare { table: "aggregate_test_100" }), name: "c4" }, Column { relation: Some(Bare { table: "aggregate_test_100" }), name: "c5" }, Column { relation: Some(Bare { table: "aggregate_test_100" }), name: "c6" }, Column { relation: Some(Bare { table: "aggregate_test_100" }), name: "c7" }, Column { relation: Some(Bare { table: "aggregate_test_100" }), name: "c8" }, Column { relation: Some(Bare { table: "aggregate_test_100" }), name: "c9" }, Column { relation: Some(Bare { table: "aggregate_test_100" }), name: "c10" }, Column { relation: Some(Bare { table: "aggregate_test_100" }), name: "c11" }, Column { relation: Some(Bare { table: "aggregate_test_100" }), name: "c12" }, Column { relation: Some(Bare { table: "aggregate_test_100" }), name: "c13" }] }, Some(""))

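`__agg_1` is lost: both select expressions contain the same `count(col("c9"))` aggregate, so after deduplication only `__agg_0` is created, while the rewritten projection still references `__agg_1` by select-list index.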


// Check if any expression is non-aggregate
let has_non_aggregate_expr = expressions
.clone()
.any(|expr| find_aggregate_exprs(std::iter::once(expr)).is_empty());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about an expression that combines an aggregate with a non-aggregate operand?
E.g.:

  let res = df.select(vec![
        count(col("c9")).alias("count_c9") + lit(1)
    ])?;

I'd expect 101, but it returns 100.


// Fallback to projection:
// - already aggregated
// - contains non-aggregate expressions
// - no aggregates at all
if matches!(plan, LogicalPlan::Aggregate(_))
|| has_non_aggregate_expr
|| aggr_exprs.is_empty()
{
let project_plan =
LogicalPlanBuilder::from(plan).project(expr_list)?.build()?;

return Ok(DataFrame {
session_state: self.session_state,
plan: project_plan,
projection_requires_validation: false,
});
}

// Build Aggregate node
let aggr_exprs: Vec<Expr> = aggr_exprs
.into_iter()
.enumerate()
.map(|(i, expr)| expr.alias(format!("__agg_{i}")))
.collect();

plan = LogicalPlanBuilder::from(plan)
.aggregate(Vec::<Expr>::new(), aggr_exprs)?
.build()?;

// Replace aggregates with their aliases
let mut rewritten_exprs = Vec::with_capacity(expr_list.len());
for (i, select_expr) in expr_list.into_iter().enumerate() {
match select_expr {
SelectExpr::Expression(expr) => {
let column = Expr::Column(Column::from_name(format!("__agg_{i}")));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`__agg_0` could collide with a real column. Is this how it is done elsewhere?

let alias = expr.name_for_alias()?;
rewritten_exprs.push(SelectExpr::Expression(column.alias(alias)));
}
other => rewritten_exprs.push(other),
}
}

let project_plan = LogicalPlanBuilder::from(plan)
.project(rewritten_exprs)?
.build()?;

Ok(DataFrame {
session_state: self.session_state,
Expand Down
23 changes: 23 additions & 0 deletions datafusion/core/tests/dataframe/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6854,3 +6854,26 @@ async fn test_duplicate_state_fields_for_dfschema_construct() -> Result<()> {

Ok(())
}

/// Checks that aggregate functions passed directly to `DataFrame::select`
/// (without a preceding `aggregate` call) are evaluated and returned under
/// their aliases.
///
/// The select list holds two different aggregates over column `c9`: a plain
/// `count` and a `count` over `c9` cast to `Utf8View`. The expected output is
/// a single row with the value 100 in each column.
#[tokio::test]
async fn test_dataframe_api_aggregate_fn_in_select() -> Result<()> {
    let df = test_table().await?;

    let res = df.select(vec![
        count(col("c9")).alias("count_c9"),
        count(cast(col("c9"), DataType::Utf8View)).alias("count_c9_str"),
    ])?;

    // The data row is padded to the same column widths as the header and
    // border rows, so that every expected line has the same table layout.
    assert_batches_eq!(
        &[
            "+----------+--------------+",
            "| count_c9 | count_c9_str |",
            "+----------+--------------+",
            "| 100      | 100          |",
            "+----------+--------------+",
        ],
        &res.collect().await?
    );

    Ok(())
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add tests for some more complex queries, e.g. df.select([sum(col("a")) + count(col("b"))]) and something with (qualified and non-qualified) wildcards too.

Loading