Commit 2b04888

Merge remote-tracking branch 'origin/v1.4-andium'

2 parents 2ba03ef + 888fa04

17 files changed: 228 additions, 40 deletions

CONTRIBUTING.md

Lines changed: 4 additions & 4 deletions

```diff
@@ -1,5 +1,9 @@
 # Contributing to duckdb-python
 
+## Setting up a development environment
+
+See the [instructions on duckdb.org](https://duckdb.org/docs/stable/dev/building/python).
+
 ## General Guidelines
 
 ### **Did you find a bug?**
@@ -39,7 +43,3 @@
 ### Testing cross-platform and cross-Python
 
 * On your fork you can [run](https://docs.github.com/en/actions/using-workflows/manually-running-a-workflow#running-a-workflow) the Packaging workflow manually for any branch. You can choose whether to build for all platforms or a subset, and to either run the full testsuite, the fast tests only, or no tests at all.
-
-## Setting up a development environment
-
-See the [instructions on duckdb.org](https://duckdb.org/docs/stable/dev/building/python).
```

_duckdb-stubs/__init__.pyi

Lines changed: 5 additions & 6 deletions

```diff
@@ -721,6 +721,7 @@ class DuckDBPyRelation:
         write_partition_columns: bool | None = None,
         append: bool | None = None,
         filename_pattern: str | None = None,
+        file_size_bytes: str | int | None = None,
     ) -> None: ...
     def to_table(self, table_name: str) -> None: ...
     def to_view(self, view_name: str, replace: bool = True) -> DuckDBPyRelation: ...
@@ -774,6 +775,7 @@ class DuckDBPyRelation:
         write_partition_columns: bool | None = None,
         append: bool | None = None,
         filename_pattern: str | None = None,
+        file_size_bytes: str | int | None = None,
     ) -> None: ...
     @property
     def alias(self) -> str: ...
@@ -1034,15 +1036,12 @@ class token_type:
 def CaseExpression(condition: Expression, value: Expression) -> Expression: ...
 def CoalesceOperator(*args: Expression) -> Expression: ...
 def ColumnExpression(*args: str) -> Expression: ...
-def ConstantExpression(value: Expression | str) -> Expression: ...
+def ConstantExpression(value: pytyping.Any) -> Expression: ...
 def DefaultExpression() -> Expression: ...
 def FunctionExpression(function_name: str, *args: Expression) -> Expression: ...
-def LambdaExpression(lhs: Expression | str | tuple[str], rhs: Expression) -> Expression: ...
+def LambdaExpression(lhs: pytyping.Any, rhs: Expression) -> Expression: ...
 def SQLExpression(expression: str) -> Expression: ...
-@pytyping.overload
-def StarExpression(*, exclude: Expression | str | tuple[str]) -> Expression: ...
-@pytyping.overload
-def StarExpression() -> Expression: ...
+def StarExpression(*, exclude: pytyping.Any = None) -> Expression: ...
 def aggregate(
     df: pandas.DataFrame,
     aggr_expr: Expression | list[Expression] | str | list[str],
```
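Taken together, the stub changes mirror two runtime changes: `to_parquet`/`write_parquet` gain a `file_size_bytes` option (exercised below under pyrelation.cpp), and the expression constructors accept arbitrary Python values rather than only `Expression | str`. A minimal sketch of the relaxed signatures (relation and column names are illustrative):

```python
import duckdb
from duckdb import ConstantExpression, StarExpression

# ConstantExpression now accepts any Python value, which is converted
# into a DuckDB constant rather than being restricted to Expression | str.
pi = ConstantExpression(3.14)

rel = duckdb.sql("SELECT 1 AS a, 2 AS b, 3 AS c")
# StarExpression collapses to a single signature: exclude defaults to None
# and takes plain Python values such as a list of column names.
rel.select(StarExpression(exclude=["b"])).show()  # columns a and c
rel.select(pi).show()                             # single constant column
```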

duckdb/experimental/spark/sql/functions.py

Lines changed: 26 additions & 5 deletions

```diff
@@ -30,6 +30,25 @@ def _invoke_function_over_columns(name: str, *cols: "ColumnOrName") -> Column:
     return _invoke_function(name, *cols)
 
 
+def _nan_constant() -> Expression:
+    """Create a NaN constant expression.
+
+    Note: ConstantExpression(float("nan")) returns NULL instead of NaN because
+    TransformPythonValue() in the C++ layer has nan_as_null=true by default.
+    This is intentional for data import scenarios (CSV, Pandas, etc.) where NaN
+    represents missing data.
+
+    For mathematical functions that need to return NaN (not NULL) for out-of-range
+    inputs per PySpark/IEEE 754 semantics, we use SQLExpression as a workaround.
+
+    Returns:
+    -------
+    Expression
+        An expression that evaluates to NaN (not NULL)
+    """
+    return SQLExpression("'NaN'::DOUBLE")
+
+
 def col(column: str) -> Column:  # noqa: D103
     return Column(ColumnExpression(column))
 
@@ -617,11 +636,9 @@ def asin(col: "ColumnOrName") -> Column:
     +--------+
     """
     col = _to_column_expr(col)
-    # TODO: ConstantExpression(float("nan")) gives NULL and not NaN  # noqa: TD002, TD003
+    # asin domain is [-1, 1]; return NaN for out-of-range values per PySpark semantics
     return Column(
-        CaseExpression((col < -1.0) | (col > 1.0), ConstantExpression(float("nan"))).otherwise(
-            FunctionExpression("asin", col)
-        )
+        CaseExpression((col < -1.0) | (col > 1.0), _nan_constant()).otherwise(FunctionExpression("asin", col))
     )
 
 
@@ -4177,7 +4194,11 @@ def acos(col: "ColumnOrName") -> Column:
     | NaN|
     +--------+
     """
-    return _invoke_function_over_columns("acos", col)
+    col = _to_column_expr(col)
+    # acos domain is [-1, 1]; return NaN for out-of-range values per PySpark semantics
+    return Column(
+        CaseExpression((col < -1.0) | (col > 1.0), _nan_constant()).otherwise(FunctionExpression("acos", col))
+    )
 
 
 def call_function(funcName: str, *cols: "ColumnOrName") -> Column:
```
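The practical effect, sketched with the experimental Spark API (assuming its usual `SparkSession` entry point; the column name is illustrative):

```python
from duckdb.experimental.spark.sql import SparkSession
from duckdb.experimental.spark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(2.0,)], ["v"])  # 2.0 lies outside [-1, 1]
# With _nan_constant(), out-of-range inputs produce NaN rather than NULL,
# matching PySpark / IEEE 754 semantics for asin and acos.
df.select(F.asin("v"), F.acos("v")).show()
```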

duckdb/experimental/spark/sql/readwriter.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -125,7 +125,7 @@ def load(  # noqa: D102
         types, names = schema.extract_types_and_names()
         df = df._cast_types(types)
         df = df.toDF(names)
-        raise NotImplementedError
+        return df
 
     def csv(  # noqa: D102
         self,
```
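With the leftover `raise NotImplementedError` replaced by `return df`, `load` with an explicit schema now returns the casted, renamed DataFrame instead of raising after doing all the work. A hedged usage sketch (the file path and schema are illustrative, assuming the PySpark-style type classes from `.types`):

```python
from duckdb.experimental.spark.sql import SparkSession
from duckdb.experimental.spark.sql.types import IntegerType, StringType, StructField, StructType

spark = SparkSession.builder.getOrCreate()
schema = StructType([StructField("a", IntegerType()), StructField("b", StringType())])
# Previously this code path raised NotImplementedError; it now returns df
# with the schema's types cast and names applied.
df = spark.read.load("data.csv", format="csv", schema=schema)
df.show()
```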

duckdb/experimental/spark/sql/type_utils.py

Lines changed: 7 additions & 1 deletion

```diff
@@ -2,6 +2,7 @@
 
 from duckdb.sqltypes import DuckDBPyType
 
+from ..exception import ContributionsAcceptedError
 from .types import (
     ArrayType,
     BinaryType,
@@ -79,7 +80,12 @@ def convert_nested_type(dtype: DuckDBPyType) -> DataType:  # noqa: D103
     if id == "list" or id == "array":
         children = dtype.children
         return ArrayType(convert_type(children[0][1]))
-    # TODO: add support for 'union'  # noqa: TD002, TD003
+    if id == "union":
+        msg = (
+            "Union types are not supported in the PySpark interface. "
+            "DuckDB union types cannot be directly mapped to PySpark types."
+        )
+        raise ContributionsAcceptedError(msg)
     if id == "struct":
         children: list[tuple[str, DuckDBPyType]] = dtype.children
         fields = [StructField(x[0], convert_type(x[1])) for x in children]
```
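A sketch of what now fails loudly (using DuckDB's `union_value` to produce a UNION-typed column; whether the error surfaces at `df.schema` or at first conversion depends on call order):

```python
from duckdb.experimental.spark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# A relation with a UNION-typed column cannot be mapped to PySpark types;
# converting its schema now raises ContributionsAcceptedError instead of
# falling through the old TODO.
df = spark.sql("SELECT union_value(num := 42) AS u")
df.schema  # raises ContributionsAcceptedError
```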

duckdb/polars_io.py

Lines changed: 3 additions & 2 deletions

```diff
@@ -236,8 +236,9 @@ def _pl_tree_to_sql(tree: _ExpressionTree) -> str:
     # String type
     if dtype == "String" or dtype == "StringOwned":
         # Some new formats may store directly under StringOwned
-        string_val: object | None = value.get("StringOwned", value.get("String", None))
-        return f"'{string_val}'"
+        string_val = value.get("StringOwned", value.get("String", None))
+        # the string must be a string constant
+        return str(duckdb.ConstantExpression(string_val))
 
     msg = f"Unsupported scalar type {dtype!s}, with value {value}"
     raise NotImplementedError(msg)
```
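The point of routing the value through `ConstantExpression`: it renders a properly escaped SQL literal, so strings containing quotes no longer yield malformed (or injectable) filter SQL. A sketch (example value made up; the exact escaped rendering is DuckDB's):

```python
import duckdb

val = "O'Brien"
print(f"'{val}'")  # 'O'Brien'  -> malformed SQL literal
print(str(duckdb.ConstantExpression(val)))  # escaped, e.g. 'O''Brien'
```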

external/duckdb

Submodule duckdb updated 49 files

src/duckdb_py/include/duckdb_python/pyrelation.hpp

Lines changed: 2 additions & 1 deletion

```diff
@@ -214,7 +214,8 @@ struct DuckDBPyRelation {
 	                const py::object &row_group_size = py::none(), const py::object &overwrite = py::none(),
 	                const py::object &per_thread_output = py::none(), const py::object &use_tmp_file = py::none(),
 	                const py::object &partition_by = py::none(), const py::object &write_partition_columns = py::none(),
-	                const py::object &append = py::none(), const py::object &filename_pattern = py::none());
+	                const py::object &append = py::none(), const py::object &filename_pattern = py::none(),
+	                const py::object &file_size_bytes = py::none());
 
 	void ToCSV(const string &filename, const py::object &sep = py::none(), const py::object &na_rep = py::none(),
 	           const py::object &header = py::none(), const py::object &quotechar = py::none(),
```

src/duckdb_py/pyconnection.cpp

Lines changed: 2 additions & 1 deletion

```diff
@@ -1776,7 +1776,8 @@ shared_ptr<DuckDBPyConnection> DuckDBPyConnection::UnregisterPythonObject(const
 	D_ASSERT(py::gil_check());
 	py::gil_scoped_release release;
 	// FIXME: DROP TEMPORARY VIEW? doesn't exist?
-	connection.Query("DROP VIEW \"" + name + "\"");
+	const auto quoted_name = KeywordHelper::WriteOptionallyQuoted(name, '\"');
+	connection.Query("DROP VIEW " + quoted_name + "");
 	registered_objects.erase(name);
 	return shared_from_this();
 }
```
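The old code wrapped the raw name in double quotes without escaping, so a registered name containing `"` produced broken SQL; `KeywordHelper::WriteOptionallyQuoted` quotes and escapes the identifier only when needed. From the Python side (a sketch; the view name is deliberately contrived):

```python
import duckdb
import pandas as pd

con = duckdb.connect()
con.register('weird "name"', pd.DataFrame({"x": [1]}))
# The embedded double quote used to break the generated DROP VIEW statement.
con.unregister('weird "name"')  # identifier is now quoted and escaped
```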

src/duckdb_py/pyrelation.cpp

Lines changed: 12 additions & 1 deletion

```diff
@@ -1214,7 +1214,7 @@ void DuckDBPyRelation::ToParquet(const string &filename, const py::object &compr
                                  const py::object &overwrite, const py::object &per_thread_output,
                                  const py::object &use_tmp_file, const py::object &partition_by,
                                  const py::object &write_partition_columns, const py::object &append,
-                                 const py::object &filename_pattern) {
+                                 const py::object &filename_pattern, const py::object &file_size_bytes) {
 	case_insensitive_map_t<vector<Value>> options;
 
 	if (!py::none().is(compression)) {
@@ -1312,6 +1312,17 @@ void DuckDBPyRelation::ToParquet(const string &filename, const py::object &compr
 		options["filename_pattern"] = {Value(py::str(filename_pattern))};
 	}
 
+	if (!py::none().is(file_size_bytes)) {
+		if (py::isinstance<py::int_>(file_size_bytes)) {
+			int64_t file_size_bytes_int = py::int_(file_size_bytes);
+			options["file_size_bytes"] = {Value(file_size_bytes_int)};
+		} else if (py::isinstance<py::str>(file_size_bytes)) {
+			options["file_size_bytes"] = {Value(py::str(file_size_bytes))};
+		} else {
+			throw InvalidInputException("to_parquet only accepts 'file_size_bytes' as an integer or string");
+		}
+	}
+
 	auto write_parquet = rel->WriteParquetRel(filename, std::move(options));
 	PyExecuteRelation(write_parquet);
 }
```
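Usage from Python (a sketch; target paths are illustrative, and `file_size_bytes` feeds DuckDB's `FILE_SIZE_BYTES` copy option, which splits output across multiple Parquet files once the threshold is passed):

```python
import duckdb

rel = duckdb.sql("SELECT * FROM range(1_000_000) AS t(i)")
# Accepts a human-readable size string or an integer byte count.
rel.to_parquet("out1", file_size_bytes="1MB")
rel.to_parquet("out2", file_size_bytes=1_000_000)
# Any other type is rejected:
# rel.to_parquet("out3", file_size_bytes=1.5)  # InvalidInputException
```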
