From 3937337aedb644210a65aa66de37e6d12f0e7b2d Mon Sep 17 00:00:00 2001 From: Mikhail Kot Date: Mon, 11 May 2026 17:11:45 +0100 Subject: [PATCH] simplify projection expression for SELECT * in duckdb Signed-off-by: Mikhail Kot --- vortex-duckdb/src/datasource.rs | 140 +++++++++++++++--- .../src/duckdb/table_function/init.rs | 2 +- vortex-sqllogictest/slt/duckdb/file_index.slt | 16 +- 3 files changed, 130 insertions(+), 28 deletions(-) diff --git a/vortex-duckdb/src/datasource.rs b/vortex-duckdb/src/datasource.rs index 1f07155e1f6..0d125b4ce4e 100644 --- a/vortex-duckdb/src/datasource.rs +++ b/vortex-duckdb/src/datasource.rs @@ -594,8 +594,6 @@ struct ProjectionWithVirtualColumns { file_row_number_column_pos: Option, } -/// Creates a projection expression from raw projection/column ID slices and -/// column names. fn extract_projection_expr( projection_ids: Option<&[u64]>, column_ids: &[u64], @@ -610,33 +608,53 @@ fn extract_projection_expr( let mut file_index_column_pos = None; let mut file_row_number_column_pos = None; + let mut is_star = true; + let mut real_column_count = 0; + + for (column_pos, &column_id) in ids.iter().enumerate() { + #[expect(clippy::cast_possible_truncation)] + let column_id = if has_projection_ids { + column_ids[column_id as usize] + } else { + column_id + }; - #[expect(clippy::cast_possible_truncation)] - let names = ids - .iter() - .enumerate() - .map(|(column_pos, &column_id)| { - let column_id = if has_projection_ids { - column_ids[column_id as usize] - } else { - column_id - }; + if column_id == FILE_INDEX_COLUMN_IDX { + file_index_column_pos = Some(column_pos); + continue; + } + if column_id == FILE_ROW_NUMBER_COLUMN_IDX { + file_row_number_column_pos = Some(column_pos); + continue; + } - if column_id == FILE_INDEX_COLUMN_IDX { - file_index_column_pos = Some(column_pos); - } - if column_id == FILE_ROW_NUMBER_COLUMN_IDX { - file_row_number_column_pos = Some(column_pos); - } + is_star &= column_id == real_column_count; + real_column_count += 1; + } + is_star &= real_column_count == column_fields.len() as u64; - column_id - }) - .filter(|&col_id| !is_virtual_column(col_id)) - .map(|col_id| Arc::from(column_fields[col_id as usize].name.as_str())) - .collect::(); + let select = if is_star { + root() + } else { + #[expect(clippy::cast_possible_truncation)] + let names = ids + .iter() + .map(|&column_id| { + if has_projection_ids { + column_ids[column_id as usize] + } else { + column_id + } + }) + .filter(|&col_id| !is_virtual_column(col_id)) + .map(|col_id| Arc::from(column_fields[col_id as usize].name.as_str())) + .collect::(); + + select(names, root()) + }; // file_index column will be filled later when exporting the chunk. - let select = select(names, root()); + let projection = if file_row_number_column_pos.is_some() { // row_idx will be rearranged to correct position in scan(), prepend // here @@ -723,7 +741,20 @@ mod tests { use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering::Relaxed; + use vortex::dtype::DType; + use vortex::dtype::PType; + use vortex::expr::cast; + use vortex::expr::merge; + use vortex::expr::pack; + use vortex::expr::root; + use vortex::layout::layouts::row_idx::row_idx; + use super::progress; + use crate::datasource::DuckdbField; + use crate::datasource::FILE_INDEX_COLUMN_IDX; + use crate::datasource::FILE_ROW_NUMBER_COLUMN_IDX; + use crate::datasource::extract_projection_expr; + use crate::duckdb::LogicalType; #[test] fn test_table_scan_progress() { @@ -738,4 +769,65 @@ mod tests { bytes_total.fetch_add(100, Relaxed); assert!((progress(&bytes_read, &bytes_total) - 50.).abs() < f64::EPSILON); } + + #[test] + fn test_select_star() { + let ids = [0, 1, 2]; + let fields = [ + DuckdbField { + name: "".to_owned(), + logical_type: LogicalType::null(), + dtype: DType::Null, + }, + DuckdbField { + name: "".to_owned(), + logical_type: LogicalType::null(), + dtype: DType::Null, + }, + DuckdbField { + name: "".to_owned(), + logical_type: LogicalType::null(), + dtype: DType::Null, + }, + ]; + + assert_eq!( + extract_projection_expr(None, &ids, &fields).projection, + root() + ); + + let ids = [FILE_ROW_NUMBER_COLUMN_IDX, 0, 1, FILE_INDEX_COLUMN_IDX, 2]; + let exprs = extract_projection_expr(None, &ids, &fields); + let row_idx = cast(row_idx(), DType::Primitive(PType::I64, false.into())); + let row_idx_struct = pack([("file_row_number", row_idx)], false.into()); + let root_with_virtual_cols = merge([row_idx_struct, root()]); + + assert_eq!(exprs.projection, root_with_virtual_cols); + assert_eq!(exprs.file_index_column_pos, Some(3)); + assert_eq!(exprs.file_row_number_column_pos, Some(0)); + + // projections can't be set in SELECT *. + assert_ne!( + extract_projection_expr(Some(&[0, 1]), &ids, &fields).projection, + root() + ); + + let ids = [0, 1]; + assert_ne!( + extract_projection_expr(None, &ids, &fields).projection, + root() + ); + + let ids = [0, 2, 2]; + assert_ne!( + extract_projection_expr(None, &ids, &fields).projection, + root() + ); + + let ids = [2, 1, 0]; + assert_ne!( + extract_projection_expr(None, &ids, &fields).projection, + root() + ); + } } diff --git a/vortex-duckdb/src/duckdb/table_function/init.rs b/vortex-duckdb/src/duckdb/table_function/init.rs index 6baa6d6a300..f9274476e36 100644 --- a/vortex-duckdb/src/duckdb/table_function/init.rs +++ b/vortex-duckdb/src/duckdb/table_function/init.rs @@ -84,7 +84,7 @@ impl<'a, T: TableFunction> TableInitInput<'a, T> { } pub fn projection_ids(&self) -> Option<&[u64]> { - if self.input.projection_ids.is_null() { + if self.input.projection_ids.is_null() || self.input.projection_ids_count == 0 { return None; } Some(unsafe { diff --git a/vortex-sqllogictest/slt/duckdb/file_index.slt b/vortex-sqllogictest/slt/duckdb/file_index.slt index ea2693e81a3..ce80ec83a21 100644 --- a/vortex-sqllogictest/slt/duckdb/file_index.slt +++ b/vortex-sqllogictest/slt/duckdb/file_index.slt @@ -27,7 +27,7 @@ SELECT file_index, str FROM '$__TEST_DIR__/file_index_2.vortex'; 0 3 query TB -SELECT *, file_index < 2 FROM '$__TEST_DIR__/*.vortex' +SELECT *, file_index < 2 FROM '$__TEST_DIR__/file_index_*.vortex' ORDER BY str; ---- 1 true @@ -36,14 +36,24 @@ ORDER BY str; Hello true Hi true +query BT +SELECT file_index < 2, * FROM '$__TEST_DIR__/file_index_*.vortex' +ORDER BY str; +---- +true 1 +true 2 +true 3 +true Hello +true Hi + query IB -SELECT count(*) AS cnt, sum(file_index) <= 3 FROM '$__TEST_DIR__/*.vortex' +SELECT count(*) AS cnt, sum(file_index) <= 3 FROM '$__TEST_DIR__/file_index_*.vortex' ORDER BY cnt; ---- 5 true query B -SELECT file_index < 2 FROM '$__TEST_DIR__/*.vortex' +SELECT file_index < 2 FROM '$__TEST_DIR__/file_index_*.vortex' WHERE len(str) > 1; ---- true