diff --git a/datafusion-testing b/datafusion-testing index eccb0e4a42634..3e35598206023 160000 --- a/datafusion-testing +++ b/datafusion-testing @@ -1 +1 @@ -Subproject commit eccb0e4a426344ef3faf534cd60e02e9c3afd3ac +Subproject commit 3e35598206023f438cc66f567fa256f5ad255ef8 diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 0551cbbb15ae1..ff382f1605b1e 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -311,6 +311,11 @@ config_namespace! { /// By default, `nulls_max` is used to follow Postgres's behavior. /// postgres rule: pub default_null_ordering: String, default = "nulls_max".to_string() + + /// When set to true, the SQL types `INTEGER`, `INT`, and `INT4` are mapped + /// to `Int64` (BIGINT) instead of the default `Int32`. This is useful for + /// SQLite compatibility, where `INTEGER` is a 64-bit type. + pub integer_type_is_bigint: bool, default = false } } diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index f0888e01049ad..7b28ef6ea637c 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -539,6 +539,7 @@ impl SessionState { .default_null_ordering .as_str() .into(), + integer_type_is_bigint: sql_parser_options.integer_type_is_bigint, } } diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index e25969903521c..2f3110548dfea 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -3176,7 +3176,7 @@ mod tests { // verify that the plan correctly casts u8 to i64 // the cast from u8 to i64 for literal will be simplified, and get lit(int64(5)) // the cast here is implicit so has CastOptions with safe=true - let expected = r#"BinaryExpr { left: Column { name: "c7", index: 2 }, op: Lt, right: Literal { value: Int64(5), field: Field { name: "lit", data_type: Int64 } }, fail_on_overflow: 
false"#; + let expected = r#"BinaryExpr { left: Column { name: "c7", index: 2 }, op: Lt, right: Literal { value: Int64(5), field: Field { name: "lit", data_type: Int64 } }, fail_on_overflow: true"#; assert_contains!(format!("{exec_plan:?}"), expected); Ok(()) @@ -3578,7 +3578,7 @@ mod tests { let execution_plan = plan(&logical_plan).await?; // verify that the plan correctly adds cast from Int64(1) to Utf8, and the const will be evaluated. - let expected = r#"expr: BinaryExpr { left: BinaryExpr { left: Column { name: "c1", index: 0 }, op: Eq, right: Literal { value: Utf8("a"), field: Field { name: "lit", data_type: Utf8 } }, fail_on_overflow: false }"#; + let expected = r#"expr: BinaryExpr { left: BinaryExpr { left: Column { name: "c1", index: 0 }, op: Eq, right: Literal { value: Utf8("a"), field: Field { name: "lit", data_type: Utf8 } }, fail_on_overflow: true }"#; assert_contains!(format!("{execution_plan:?}"), expected); diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 02628b405ec6c..4dde4c72e94c5 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -88,7 +88,7 @@ impl BinaryExpr { left, op, right, - fail_on_overflow: false, + fail_on_overflow: true, } } diff --git a/datafusion/physical-expr/src/expressions/dynamic_filters.rs b/datafusion/physical-expr/src/expressions/dynamic_filters.rs index d285f8b377eca..5b05a3b1cf969 100644 --- a/datafusion/physical-expr/src/expressions/dynamic_filters.rs +++ b/datafusion/physical-expr/src/expressions/dynamic_filters.rs @@ -498,14 +498,14 @@ mod test { ) .unwrap(); let snap = dynamic_filter_1.snapshot().unwrap().unwrap(); - insta::assert_snapshot!(format!("{snap:?}"), @r#"BinaryExpr { left: Column { name: "a", index: 0 }, op: Eq, right: Literal { value: Int32(42), field: Field { name: "lit", data_type: Int32 } }, fail_on_overflow: false }"#); + 
insta::assert_snapshot!(format!("{snap:?}"), @r#"BinaryExpr { left: Column { name: "a", index: 0 }, op: Eq, right: Literal { value: Int32(42), field: Field { name: "lit", data_type: Int32 } }, fail_on_overflow: true }"#); let dynamic_filter_2 = reassign_expr_columns( Arc::clone(&dynamic_filter) as Arc<dyn PhysicalExpr>, &filter_schema_2, ) .unwrap(); let snap = dynamic_filter_2.snapshot().unwrap().unwrap(); - insta::assert_snapshot!(format!("{snap:?}"), @r#"BinaryExpr { left: Column { name: "a", index: 1 }, op: Eq, right: Literal { value: Int32(42), field: Field { name: "lit", data_type: Int32 } }, fail_on_overflow: false }"#); + insta::assert_snapshot!(format!("{snap:?}"), @r#"BinaryExpr { left: Column { name: "a", index: 1 }, op: Eq, right: Literal { value: Int32(42), field: Field { name: "lit", data_type: Int32 } }, fail_on_overflow: true }"#); // Both filters allow evaluating the same expression let batch_1 = RecordBatch::try_new( Arc::clone(&filter_schema_1), diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index b7e270e4f0570..4d7fd511e4289 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -57,6 +57,9 @@ pub struct ParserOptions { pub map_string_types_to_utf8view: bool, /// Default null ordering for sorting expressions. pub default_null_ordering: NullOrdering, + /// When true, `INTEGER`/`INT`/`INT4` SQL types map to `Int64` instead of `Int32`. + /// Useful for SQLite compatibility where INTEGER is a 64-bit type. + pub integer_type_is_bigint: bool, } impl ParserOptions { @@ -81,6 +84,7 @@ impl ParserOptions { // By default, `nulls_max` is used to follow Postgres's behavior. // postgres rule: https://www.postgresql.org/docs/current/queries-order.html default_null_ordering: NullOrdering::NullsMax, + integer_type_is_bigint: false, } } @@ -141,6 +145,12 @@ impl ParserOptions { self.default_null_ordering = value; self } + + /// Sets the `integer_type_is_bigint` option. 
+ pub fn with_integer_type_is_bigint(mut self, value: bool) -> Self { + self.integer_type_is_bigint = value; + self + } } impl Default for ParserOptions { @@ -160,6 +170,7 @@ impl From<&SqlParserOptions> for ParserOptions { .enable_options_value_normalization, collect_spans: options.collect_spans, default_null_ordering: options.default_null_ordering.as_str().into(), + integer_type_is_bigint: options.integer_type_is_bigint, } } } @@ -663,7 +674,11 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { SQLDataType::TinyInt(_) => Ok(DataType::Int8), SQLDataType::SmallInt(_) | SQLDataType::Int2(_) => Ok(DataType::Int16), SQLDataType::Int(_) | SQLDataType::Integer(_) | SQLDataType::Int4(_) => { - Ok(DataType::Int32) + if self.options.integer_type_is_bigint { + Ok(DataType::Int64) + } else { + Ok(DataType::Int32) + } } SQLDataType::BigInt(_) | SQLDataType::Int8(_) => Ok(DataType::Int64), SQLDataType::TinyIntUnsigned(_) => Ok(DataType::UInt8), diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 950a79ddb0b5e..6f7256d811854 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -3472,6 +3472,7 @@ fn parse_decimals_parser_options() -> ParserOptions { enable_options_value_normalization: false, collect_spans: false, default_null_ordering: NullOrdering::NullsMax, + integer_type_is_bigint: false, } } @@ -3484,6 +3485,7 @@ fn ident_normalization_parser_options_no_ident_normalization() -> ParserOptions enable_options_value_normalization: false, collect_spans: false, default_null_ordering: NullOrdering::NullsMax, + integer_type_is_bigint: false, } } @@ -3496,6 +3498,7 @@ fn ident_normalization_parser_options_ident_normalization() -> ParserOptions { enable_options_value_normalization: false, collect_spans: false, default_null_ordering: NullOrdering::NullsMax, + integer_type_is_bigint: false, } } diff --git a/datafusion/sqllogictest/Cargo.toml b/datafusion/sqllogictest/Cargo.toml index 
d7bb2583c9d8c..2c673bab1c054 100644 --- a/datafusion/sqllogictest/Cargo.toml +++ b/datafusion/sqllogictest/Cargo.toml @@ -58,6 +58,7 @@ object_store = { workspace = true } postgres-types = { version = "0.2.13", features = ["derive", "with-chrono-0_4"], optional = true } # When updating the following dependency verify that sqlite test file regeneration works correctly # by running the regenerate_sqlite_files.sh script. +regex = { workspace = true } sqllogictest = "0.29.1" sqlparser = { workspace = true } tempfile = { workspace = true } @@ -83,7 +84,6 @@ substrait = ["datafusion-substrait"] [dev-dependencies] env_logger = { workspace = true } -regex = { workspace = true } tokio = { workspace = true, features = ["rt-multi-thread"] } [[test]] diff --git a/datafusion/sqllogictest/src/test_context.rs b/datafusion/sqllogictest/src/test_context.rs index 773f61655e41f..33431eac89580 100644 --- a/datafusion/sqllogictest/src/test_context.rs +++ b/datafusion/sqllogictest/src/test_context.rs @@ -49,6 +49,7 @@ use datafusion::{ use datafusion_spark::SessionStateBuilderSpark; use crate::is_spark_path; +use crate::is_sqlite_path; use async_trait::async_trait; use datafusion::common::cast::as_float64_array; use datafusion::execution::SessionStateBuilder; @@ -78,9 +79,17 @@ impl TestContext { /// If `None` is returned (e.g. because some needed feature is not /// enabled), the file should be skipped pub async fn try_new_for_test_file(relative_path: &Path) -> Option<Self> { - let config = SessionConfig::new() + let mut config = SessionConfig::new() // hardcode target partitions so plans are deterministic .with_target_partitions(4); + + // SQLite uses 64-bit integers for the INTEGER type, whereas DataFusion + // defaults to 32-bit. Enable the bigint mapping for SQLite-compatible tests. 
+ if is_sqlite_path(relative_path) { + config = + config.set_bool("datafusion.sql_parser.integer_type_is_bigint", true); + } + let runtime = Arc::new(RuntimeEnv::default()); let mut state_builder = SessionStateBuilder::new() diff --git a/datafusion/sqllogictest/src/util.rs b/datafusion/sqllogictest/src/util.rs index b0cf32266ea31..dfe47a1225674 100644 --- a/datafusion/sqllogictest/src/util.rs +++ b/datafusion/sqllogictest/src/util.rs @@ -141,6 +141,10 @@ pub fn is_spark_path(relative_path: &Path) -> bool { relative_path.starts_with("spark/") } +pub fn is_sqlite_path(relative_path: &Path) -> bool { + relative_path.starts_with("sqlite") +} + #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/sqllogictest/test_files/explain_analyze.slt b/datafusion/sqllogictest/test_files/explain_analyze.slt index 561c1a9dcbc87..ab3bbd8cfe679 100644 --- a/datafusion/sqllogictest/test_files/explain_analyze.slt +++ b/datafusion/sqllogictest/test_files/explain_analyze.slt @@ -152,7 +152,7 @@ SELECT a+1, pow(a,2) FROM generate_series(1, 100) as t1(a); ---- Plan with Metrics -01)ProjectionExec: expr=[a@0 + 1 as t1.a + Int64(1), power(CAST(a@0 AS Float64), 2) as pow(t1.a,Int64(2))], metrics=[output_rows=100, elapsed_compute=, output_bytes=1632.0 B, output_batches=1, expr_0_eval_time=, expr_1_eval_time=] +01)ProjectionExec: expr=[a@0 + 1 as t1.a + Int64(1), power(CAST(a@0 AS Float64), 2) as pow(t1.a,Int64(2))], metrics=[output_rows=100, elapsed_compute=, output_bytes=, output_batches=1, expr_0_eval_time=, expr_1_eval_time=] # common expressions @@ -163,8 +163,8 @@ SELECT a+1, a+1 as another_a_plus_one FROM generate_series(1, 100) as t1(a); ---- Plan with Metrics -01)ProjectionExec: expr=[__common_expr_1@0 as t1.a + Int64(1), __common_expr_1@0 as another_a_plus_one], metrics=[output_rows=100, elapsed_compute=, output_bytes=800.0 B, output_batches=1, expr_0_eval_time=, expr_1_eval_time=] -02)--ProjectionExec: expr=[a@0 + 1 as __common_expr_1], metrics=[output_rows=100, 
elapsed_compute=, output_bytes=800.0 B, output_batches=1, expr_0_eval_time=] +01)ProjectionExec: expr=[__common_expr_1@0 as t1.a + Int64(1), __common_expr_1@0 as another_a_plus_one], metrics=[output_rows=100, elapsed_compute=, output_bytes=, output_batches=1, expr_0_eval_time=, expr_1_eval_time=] +02)--ProjectionExec: expr=[a@0 + 1 as __common_expr_1], metrics=[output_rows=100, elapsed_compute=, output_bytes=, output_batches=1, expr_0_eval_time=] statement ok diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index ba1f1403450a9..3b035565aec01 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -339,6 +339,7 @@ datafusion.sql_parser.default_null_ordering nulls_max datafusion.sql_parser.dialect generic datafusion.sql_parser.enable_ident_normalization true datafusion.sql_parser.enable_options_value_normalization false +datafusion.sql_parser.integer_type_is_bigint false datafusion.sql_parser.map_string_types_to_utf8view true datafusion.sql_parser.parse_float_as_decimal false datafusion.sql_parser.recursion_limit 50 @@ -481,6 +482,7 @@ datafusion.sql_parser.default_null_ordering nulls_max Specifies the default null datafusion.sql_parser.dialect generic Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks. datafusion.sql_parser.enable_ident_normalization true When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted) datafusion.sql_parser.enable_options_value_normalization false When set to true, SQL parser will normalize options value (convert value to lowercase). Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically. 
+datafusion.sql_parser.integer_type_is_bigint false When set to true, the SQL types `INTEGER`, `INT`, and `INT4` are mapped to `Int64` (BIGINT) instead of the default `Int32`. This is useful for SQLite compatibility, where `INTEGER` is a 64-bit type. datafusion.sql_parser.map_string_types_to_utf8view true If true, string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning. If false, they are mapped to `Utf8`. Default is true. datafusion.sql_parser.parse_float_as_decimal false When set to true, SQL parser will parse float as decimal type datafusion.sql_parser.recursion_limit 50 Specifies the recursion depth limit when parsing complex SQL Queries diff --git a/datafusion/sqllogictest/test_files/operator.slt b/datafusion/sqllogictest/test_files/operator.slt index e50fa721c8850..3995f6a035cd4 100644 --- a/datafusion/sqllogictest/test_files/operator.slt +++ b/datafusion/sqllogictest/test_files/operator.slt @@ -363,3 +363,34 @@ set datafusion.explain.physical_plan_only = false; statement ok drop table numeric_types + +############### Integer Overflow Behavior ############### +# SMALLINT (Int16) addition overflow +query error DataFusion error: Arrow error: Arithmetic overflow: Overflow happened on: 32767 \+ 2 +SELECT CAST(32767 AS SMALLINT) + CAST(2 AS SMALLINT); + +# SMALLINT (Int16) subtraction overflow +query error DataFusion error: Arrow error: Arithmetic overflow: Overflow happened on: -32768 - 1 +SELECT CAST(-32768 AS SMALLINT) - CAST(1 AS SMALLINT); + +# SMALLINT (Int16) multiplication overflow +query error DataFusion error: Arrow error: Arithmetic overflow: Overflow happened on: 32767 \* 2 +SELECT CAST(32767 AS SMALLINT) * CAST(2 AS SMALLINT); + +# INT (Int32) addition overflow +query error DataFusion error: Arrow error: Arithmetic overflow: Overflow happened on: 2147483647 \+ 1 +SELECT CAST(2147483647 AS INT) + CAST(1 AS INT); + +# BIGINT (Int64) addition overflow +query error DataFusion error: Arrow error: Arithmetic overflow: 
Overflow happened on: 9223372036854775807 \+ 1 +SELECT CAST(9223372036854775807 AS BIGINT) + CAST(1 AS BIGINT); + +# TINYINT (Int8) addition overflow +query error DataFusion error: Arrow error: Arithmetic overflow: Overflow happened on: 127 \+ 1 +SELECT CAST(127 AS TINYINT) + CAST(1 AS TINYINT); + +# No overflow within range - should succeed +query I +SELECT CAST(32766 AS SMALLINT) + CAST(1 AS SMALLINT); +---- +32767 diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 69627e3cb9148..6f239b1fc6151 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -187,6 +187,7 @@ The following configuration settings are available: | datafusion.sql_parser.collect_spans | false | When set to true, the source locations relative to the original SQL query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected and recorded in the logical plan nodes. | | datafusion.sql_parser.recursion_limit | 50 | Specifies the recursion depth limit when parsing complex SQL Queries | | datafusion.sql_parser.default_null_ordering | nulls_max | Specifies the default null ordering for query results. There are 4 options: - `nulls_max`: Nulls appear last in ascending order. - `nulls_min`: Nulls appear first in ascending order. - `nulls_first`: Nulls always be first in any order. - `nulls_last`: Nulls always be last in any order. By default, `nulls_max` is used to follow Postgres's behavior. postgres rule: | +| datafusion.sql_parser.integer_type_is_bigint | false | When set to true, the SQL types `INTEGER`, `INT`, and `INT4` are mapped to `Int64` (BIGINT) instead of the default `Int32`. This is useful for SQLite compatibility, where `INTEGER` is a 64-bit type. 
| | datafusion.format.safe | true | If set to `true` any formatting errors will be written to the output instead of being converted into a [`std::fmt::Error`] | | datafusion.format.null | | Format string for nulls | | datafusion.format.date_format | %Y-%m-%d | Date format for date arrays |