Commit 2b04888

Merge remote-tracking branch 'origin/v1.4-andium'

2 parents 2ba03ef + 888fa04

17 files changed: 228 additions, 40 deletions

CONTRIBUTING.md

Lines changed: 4 additions & 4 deletions

```diff
@@ -1,5 +1,9 @@
 # Contributing to duckdb-python
 
+## Setting up a development environment
+
+See the [instructions on duckdb.org](https://duckdb.org/docs/stable/dev/building/python).
+
 ## General Guidelines
 
 ### **Did you find a bug?**
@@ -39,7 +43,3 @@
 ### Testing cross-platform and cross-Python
 
 * On your fork you can [run](https://docs.github.com/en/actions/using-workflows/manually-running-a-workflow#running-a-workflow) the Packaging workflow manually for any branch. You can choose whether to build for all platforms or a subset, and to either run the full testsuite, the fast tests only, or no tests at all.
-
-## Setting up a development environment
-
-See the [instructions on duckdb.org](https://duckdb.org/docs/stable/dev/building/python).
```

_duckdb-stubs/__init__.pyi

Lines changed: 5 additions & 6 deletions

```diff
@@ -721,6 +721,7 @@ class DuckDBPyRelation:
         write_partition_columns: bool | None = None,
         append: bool | None = None,
         filename_pattern: str | None = None,
+        file_size_bytes: str | int | None = None,
     ) -> None: ...
     def to_table(self, table_name: str) -> None: ...
     def to_view(self, view_name: str, replace: bool = True) -> DuckDBPyRelation: ...
@@ -774,6 +775,7 @@ class DuckDBPyRelation:
         write_partition_columns: bool | None = None,
         append: bool | None = None,
         filename_pattern: str | None = None,
+        file_size_bytes: str | int | None = None,
     ) -> None: ...
     @property
     def alias(self) -> str: ...
@@ -1034,15 +1036,12 @@ class token_type:
 def CaseExpression(condition: Expression, value: Expression) -> Expression: ...
 def CoalesceOperator(*args: Expression) -> Expression: ...
 def ColumnExpression(*args: str) -> Expression: ...
-def ConstantExpression(value: Expression | str) -> Expression: ...
+def ConstantExpression(value: pytyping.Any) -> Expression: ...
 def DefaultExpression() -> Expression: ...
 def FunctionExpression(function_name: str, *args: Expression) -> Expression: ...
-def LambdaExpression(lhs: Expression | str | tuple[str], rhs: Expression) -> Expression: ...
+def LambdaExpression(lhs: pytyping.Any, rhs: Expression) -> Expression: ...
 def SQLExpression(expression: str) -> Expression: ...
-@pytyping.overload
-def StarExpression(*, exclude: Expression | str | tuple[str]) -> Expression: ...
-@pytyping.overload
-def StarExpression() -> Expression: ...
+def StarExpression(*, exclude: pytyping.Any = None) -> Expression: ...
 def aggregate(
     df: pandas.DataFrame,
     aggr_expr: Expression | list[Expression] | str | list[str],
```
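Taken together, the stub changes mirror two runtime changes: `to_parquet`/`write_parquet` gain a `file_size_bytes` option (exercised below under pyrelation.cpp), and the expression constructors accept arbitrary Python values rather than only `Expression | str`. A minimal sketch of the relaxed signatures (relation and column names are illustrative):

```python
import duckdb
from duckdb import ConstantExpression, StarExpression

# ConstantExpression now accepts any Python value, which is converted
# into a DuckDB constant rather than being restricted to Expression | str.
pi = ConstantExpression(3.14)

rel = duckdb.sql("SELECT 1 AS a, 2 AS b, 3 AS c")
# StarExpression collapses to a single signature: exclude defaults to None
# and takes plain Python values such as a list of column names.
rel.select(StarExpression(exclude=["b"])).show()  # columns a and c
rel.select(pi).show()                             # single constant column
```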

duckdb/experimental/spark/sql/functions.py

Lines changed: 26 additions & 5 deletions

```diff
@@ -30,6 +30,25 @@ def _invoke_function_over_columns(name: str, *cols: "ColumnOrName") -> Column:
     return _invoke_function(name, *cols)
 
 
+def _nan_constant() -> Expression:
+    """Create a NaN constant expression.
+
+    Note: ConstantExpression(float("nan")) returns NULL instead of NaN because
+    TransformPythonValue() in the C++ layer has nan_as_null=true by default.
+    This is intentional for data import scenarios (CSV, Pandas, etc.) where NaN
+    represents missing data.
+
+    For mathematical functions that need to return NaN (not NULL) for out-of-range
+    inputs per PySpark/IEEE 754 semantics, we use SQLExpression as a workaround.
+
+    Returns:
+    -------
+    Expression
+        An expression that evaluates to NaN (not NULL)
+    """
+    return SQLExpression("'NaN'::DOUBLE")
+
+
 def col(column: str) -> Column:  # noqa: D103
     return Column(ColumnExpression(column))
 
@@ -617,11 +636,9 @@ def asin(col: "ColumnOrName") -> Column:
     +--------+
     """
     col = _to_column_expr(col)
-    # TODO: ConstantExpression(float("nan")) gives NULL and not NaN  # noqa: TD002, TD003
+    # asin domain is [-1, 1]; return NaN for out-of-range values per PySpark semantics
     return Column(
-        CaseExpression((col < -1.0) | (col > 1.0), ConstantExpression(float("nan"))).otherwise(
-            FunctionExpression("asin", col)
-        )
+        CaseExpression((col < -1.0) | (col > 1.0), _nan_constant()).otherwise(FunctionExpression("asin", col))
     )
 
 
@@ -4177,7 +4194,11 @@ def acos(col: "ColumnOrName") -> Column:
     | NaN|
     +--------+
     """
-    return _invoke_function_over_columns("acos", col)
+    col = _to_column_expr(col)
+    # acos domain is [-1, 1]; return NaN for out-of-range values per PySpark semantics
+    return Column(
+        CaseExpression((col < -1.0) | (col > 1.0), _nan_constant()).otherwise(FunctionExpression("acos", col))
+    )
 
 
 def call_function(funcName: str, *cols: "ColumnOrName") -> Column:
```
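The practical effect, sketched with the experimental Spark API (assuming its usual `SparkSession` entry point; the column name is illustrative):

```python
from duckdb.experimental.spark.sql import SparkSession
from duckdb.experimental.spark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(2.0,)], ["v"])  # 2.0 lies outside [-1, 1]
# With _nan_constant(), out-of-range inputs produce NaN rather than NULL,
# matching PySpark / IEEE 754 semantics for asin and acos.
df.select(F.asin("v"), F.acos("v")).show()
```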

duckdb/experimental/spark/sql/readwriter.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -125,7 +125,7 @@ def load(  # noqa: D102
         types, names = schema.extract_types_and_names()
         df = df._cast_types(types)
         df = df.toDF(names)
-        raise NotImplementedError
+        return df
 
     def csv(  # noqa: D102
         self,
```
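With the leftover `raise NotImplementedError` replaced by `return df`, `load` with an explicit schema now returns the casted, renamed DataFrame instead of raising after doing all the work. A hedged usage sketch (the file path and schema are illustrative, assuming the PySpark-style type classes from `.types`):

```python
from duckdb.experimental.spark.sql import SparkSession
from duckdb.experimental.spark.sql.types import IntegerType, StringType, StructField, StructType

spark = SparkSession.builder.getOrCreate()
schema = StructType([StructField("a", IntegerType()), StructField("b", StringType())])
# Previously this code path raised NotImplementedError; it now returns df
# with the schema's types cast and names applied.
df = spark.read.load("data.csv", format="csv", schema=schema)
df.show()
```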

duckdb/experimental/spark/sql/type_utils.py

Lines changed: 7 additions & 1 deletion

```diff
@@ -2,6 +2,7 @@
 
 from duckdb.sqltypes import DuckDBPyType
 
+from ..exception import ContributionsAcceptedError
 from .types import (
     ArrayType,
     BinaryType,
@@ -79,7 +80,12 @@ def convert_nested_type(dtype: DuckDBPyType) -> DataType:  # noqa: D103
     if id == "list" or id == "array":
         children = dtype.children
         return ArrayType(convert_type(children[0][1]))
-    # TODO: add support for 'union'  # noqa: TD002, TD003
+    if id == "union":
+        msg = (
+            "Union types are not supported in the PySpark interface. "
+            "DuckDB union types cannot be directly mapped to PySpark types."
+        )
+        raise ContributionsAcceptedError(msg)
     if id == "struct":
         children: list[tuple[str, DuckDBPyType]] = dtype.children
         fields = [StructField(x[0], convert_type(x[1])) for x in children]
```
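A sketch of what now fails loudly (using DuckDB's `union_value` to produce a UNION-typed column; whether the error surfaces at `df.schema` or at first conversion depends on call order):

```python
from duckdb.experimental.spark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# A relation with a UNION-typed column cannot be mapped to PySpark types;
# converting its schema now raises ContributionsAcceptedError instead of
# falling through the old TODO.
df = spark.sql("SELECT union_value(num := 42) AS u")
df.schema  # raises ContributionsAcceptedError
```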

duckdb/polars_io.py

Lines changed: 3 additions & 2 deletions

```diff
@@ -236,8 +236,9 @@ def _pl_tree_to_sql(tree: _ExpressionTree) -> str:
     # String type
     if dtype == "String" or dtype == "StringOwned":
         # Some new formats may store directly under StringOwned
-        string_val: object | None = value.get("StringOwned", value.get("String", None))
-        return f"'{string_val}'"
+        string_val = value.get("StringOwned", value.get("String", None))
+        # the string must be a string constant
+        return str(duckdb.ConstantExpression(string_val))
 
     msg = f"Unsupported scalar type {dtype!s}, with value {value}"
     raise NotImplementedError(msg)
```
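The point of routing the value through `ConstantExpression`: it renders a properly escaped SQL literal, so strings containing quotes no longer yield malformed (or injectable) filter SQL. A sketch (example value made up; the exact escaped rendering is DuckDB's):

```python
import duckdb

val = "O'Brien"
print(f"'{val}'")  # 'O'Brien'  -> malformed SQL literal
print(str(duckdb.ConstantExpression(val)))  # escaped, e.g. 'O''Brien'
```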

external/duckdb

Submodule duckdb updated 49 files

src/duckdb_py/include/duckdb_python/pyrelation.hpp

Lines changed: 2 additions & 1 deletion

```diff
@@ -214,7 +214,8 @@ struct DuckDBPyRelation {
 	                const py::object &row_group_size = py::none(), const py::object &overwrite = py::none(),
 	                const py::object &per_thread_output = py::none(), const py::object &use_tmp_file = py::none(),
 	                const py::object &partition_by = py::none(), const py::object &write_partition_columns = py::none(),
-	                const py::object &append = py::none(), const py::object &filename_pattern = py::none());
+	                const py::object &append = py::none(), const py::object &filename_pattern = py::none(),
+	                const py::object &file_size_bytes = py::none());
 
 	void ToCSV(const string &filename, const py::object &sep = py::none(), const py::object &na_rep = py::none(),
 	           const py::object &header = py::none(), const py::object &quotechar = py::none(),
```

src/duckdb_py/pyconnection.cpp

Lines changed: 2 additions & 1 deletion

```diff
@@ -1776,7 +1776,8 @@ shared_ptr<DuckDBPyConnection> DuckDBPyConnection::UnregisterPythonObject(const
 	D_ASSERT(py::gil_check());
 	py::gil_scoped_release release;
 	// FIXME: DROP TEMPORARY VIEW? doesn't exist?
-	connection.Query("DROP VIEW \"" + name + "\"");
+	const auto quoted_name = KeywordHelper::WriteOptionallyQuoted(name, '\"');
+	connection.Query("DROP VIEW " + quoted_name + "");
 	registered_objects.erase(name);
 	return shared_from_this();
 }
```
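The old code wrapped the raw name in double quotes without escaping, so a registered name containing `"` produced broken SQL; `KeywordHelper::WriteOptionallyQuoted` quotes and escapes the identifier only when needed. From the Python side (a sketch; the view name is deliberately contrived):

```python
import duckdb
import pandas as pd

con = duckdb.connect()
con.register('weird "name"', pd.DataFrame({"x": [1]}))
# The embedded double quote used to break the generated DROP VIEW statement.
con.unregister('weird "name"')  # identifier is now quoted and escaped
```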

src/duckdb_py/pyrelation.cpp

Lines changed: 12 additions & 1 deletion

```diff
@@ -1214,7 +1214,7 @@ void DuckDBPyRelation::ToParquet(const string &filename, const py::object &compr
                                  const py::object &overwrite, const py::object &per_thread_output,
                                  const py::object &use_tmp_file, const py::object &partition_by,
                                  const py::object &write_partition_columns, const py::object &append,
-                                 const py::object &filename_pattern) {
+                                 const py::object &filename_pattern, const py::object &file_size_bytes) {
 	case_insensitive_map_t<vector<Value>> options;
 
 	if (!py::none().is(compression)) {
@@ -1312,6 +1312,17 @@ void DuckDBPyRelation::ToParquet(const string &filename, const py::object &compr
 		options["filename_pattern"] = {Value(py::str(filename_pattern))};
 	}
 
+	if (!py::none().is(file_size_bytes)) {
+		if (py::isinstance<py::int_>(file_size_bytes)) {
+			int64_t file_size_bytes_int = py::int_(file_size_bytes);
+			options["file_size_bytes"] = {Value(file_size_bytes_int)};
+		} else if (py::isinstance<py::str>(file_size_bytes)) {
+			options["file_size_bytes"] = {Value(py::str(file_size_bytes))};
+		} else {
+			throw InvalidInputException("to_parquet only accepts 'file_size_bytes' as an integer or string");
+		}
+	}
+
 	auto write_parquet = rel->WriteParquetRel(filename, std::move(options));
 	PyExecuteRelation(write_parquet);
 }
```
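Usage from Python (a sketch; target paths are illustrative, and `file_size_bytes` feeds DuckDB's `FILE_SIZE_BYTES` copy option, which splits output across multiple Parquet files once the threshold is passed):

```python
import duckdb

rel = duckdb.sql("SELECT * FROM range(1_000_000) AS t(i)")
# Accepts a human-readable size string or an integer byte count.
rel.to_parquet("out1", file_size_bytes="1MB")
rel.to_parquet("out2", file_size_bytes=1_000_000)
# Any other type is rejected:
# rel.to_parquet("out3", file_size_bytes=1.5)  # InvalidInputException
```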
