Skip to content

Commit 618a289

Browse files
committed
refactor: unify SQL planning for ORDER BY, HAVING, DISTINCT, etc
1 parent 5e79c5e commit 618a289

3 files changed

Lines changed: 208 additions & 114 deletions

File tree

datafusion/sql/src/expr/order_by.rs

Lines changed: 138 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,18 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
use std::collections::HashMap;
19+
1820
use crate::planner::{ContextProvider, PlannerContext, SqlToRel};
21+
use datafusion_common::tree_node::{
22+
Transformed, TransformedResult, TreeNode, TreeNodeRecursion,
23+
};
1924
use datafusion_common::{
20-
Column, DFSchema, Result, not_impl_err, plan_datafusion_err, plan_err,
25+
Column, DFSchema, DFSchemaRef, Result, not_impl_err, plan_datafusion_err, plan_err,
2126
};
2227
use datafusion_expr::expr::Sort;
2328
use datafusion_expr::{Expr, SortExpr};
29+
use indexmap::IndexSet;
2430
use sqlparser::ast::{
2531
Expr as SQLExpr, OrderByExpr, OrderByOptions, Value, ValueWithSpan,
2632
};
@@ -117,4 +123,135 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
117123

118124
Ok(sort_expr_vec)
119125
}
126+
127+
/// Add missing ORDER BY expressions to the SELECT list.
128+
///
129+
/// This function handles the case where ORDER BY expressions reference columns
130+
/// or expressions that are not present in the SELECT list. Instead of traversing
131+
/// the plan tree to find projection nodes, it directly adds the missing
132+
/// expressions to the SELECT list.
133+
///
134+
/// # Behavior
135+
///
136+
/// - For aggregate functions (e.g., `SUM(x)`) and window functions, the original
137+
/// expression is added to the SELECT list, and the ORDER BY expression is
138+
/// replaced with a column reference to that expression's output name.
139+
///
140+
/// - For column references that don't exist in the current schema, the column
141+
/// reference itself is added to the SELECT list.
142+
///
143+
/// - If the query uses `SELECT DISTINCT` and there are missing ORDER BY
144+
/// expressions, an error is returned, as this would make the DISTINCT
145+
/// operation ambiguous.
146+
///
147+
/// - Aliases defined in the SELECT list are recognized and used to replace
148+
/// the corresponding expressions in ORDER BY with column references.
149+
///
150+
/// - When `strict` is true (e.g., when GROUP BY is present), ORDER BY
151+
/// expressions must already be in the SELECT list, be an alias, or be an
152+
/// aggregate/window function. Missing expressions will cause an error instead
153+
/// of being added to the SELECT list. This preserves the error message
154+
/// "Column in ORDER BY must be in GROUP BY" for invalid queries.
155+
///
156+
/// # Arguments
157+
///
158+
/// * `select_exprs` - Mutable reference to the SELECT expressions list. Missing
159+
/// expressions will be added to this list (unless strict is true).
160+
/// * `schema` - The schema of the projected plan, used to check if column
161+
/// references exist.
162+
/// * `distinct` - Whether the query uses `SELECT DISTINCT`. If true, missing
163+
/// ORDER BY expressions will cause an error.
164+
/// * `strict` - Whether to strictly validate ORDER BY expressions. If true,
165+
/// missing expressions will cause an error instead of being added.
166+
/// * `order_by` - Mutable slice of ORDER BY expressions. The expressions will
167+
/// be rewritten to use column references where appropriate.
168+
///
169+
/// # Returns
170+
///
171+
/// * `Ok(true)` - If expressions were added to the SELECT list.
172+
/// * `Ok(false)` - If no expressions needed to be added.
173+
/// * `Err(...)` - If there's an error (e.g., DISTINCT with missing ORDER BY
174+
/// expressions).
175+
///
176+
/// # Example
177+
///
178+
/// ```text
179+
/// Input: SELECT x FROM foo ORDER BY y
180+
///
181+
/// Before: select_exprs = [x]
182+
/// order_by = [Sort { expr: Column(y), ... }]
183+
///
184+
/// After: select_exprs = [x, y]
185+
/// order_by = [Sort { expr: Column(y), ... }]
186+
/// returns Ok(true)
187+
/// ```
188+
pub(crate) fn add_missing_order_by_exprs(
189+
select_exprs: &mut Vec<Expr>,
190+
schema: &DFSchemaRef,
191+
distinct: bool,
192+
strict: bool,
193+
order_by: &mut [Sort],
194+
) -> Result<bool> {
195+
let mut missing_exprs: IndexSet<Expr> = IndexSet::new();
196+
197+
let mut aliases = HashMap::new();
198+
for expr in select_exprs.iter() {
199+
if let Expr::Alias(alias) = expr {
200+
aliases.insert(alias.expr.clone(), alias.name.clone());
201+
}
202+
}
203+
204+
let mut rewrite = |expr: Expr| {
205+
if select_exprs.contains(&expr) {
206+
return Ok(Transformed::new(expr, false, TreeNodeRecursion::Jump));
207+
}
208+
if let Some(alias) = aliases.get(&expr) {
209+
return Ok(Transformed::new(
210+
Expr::Column(Column::new_unqualified(alias.clone())),
211+
false,
212+
TreeNodeRecursion::Jump,
213+
));
214+
}
215+
match expr {
216+
Expr::AggregateFunction(_) | Expr::WindowFunction(_) => {
217+
let replaced = Expr::Column(Column::new_unqualified(
218+
expr.schema_name().to_string(),
219+
));
220+
missing_exprs.insert(expr);
221+
Ok(Transformed::new(replaced, true, TreeNodeRecursion::Jump))
222+
}
223+
Expr::Column(ref c) => {
224+
if strict {
225+
// In strict mode (e.g., GROUP BY present), the column must exist in schema
226+
// If it doesn't exist and isn't in select_exprs, we'll error later
227+
// Don't add it to missing_exprs to preserve proper error message
228+
Ok(Transformed::new(expr, false, TreeNodeRecursion::Jump))
229+
} else if !schema.has_column(c) {
230+
missing_exprs.insert(Expr::Column(c.clone()));
231+
Ok(Transformed::new(expr, false, TreeNodeRecursion::Jump))
232+
} else {
233+
Ok(Transformed::new(expr, false, TreeNodeRecursion::Jump))
234+
}
235+
}
236+
_ => Ok(Transformed::no(expr)),
237+
}
238+
};
239+
for sort in order_by.iter_mut() {
240+
let expr = std::mem::take(&mut sort.expr);
241+
sort.expr = expr.transform_down(&mut rewrite).data()?;
242+
}
243+
if !missing_exprs.is_empty() {
244+
if distinct {
245+
plan_err!(
246+
"For SELECT DISTINCT, ORDER BY expressions {} must appear in select list",
247+
missing_exprs[0]
248+
)
249+
} else {
250+
select_exprs.extend(missing_exprs);
251+
Ok(true)
252+
}
253+
} else {
254+
Ok(false)
255+
}
256+
}
120257
}

0 commit comments

Comments
 (0)