|
6 | 6 | from datetime import date, datetime |
7 | 7 | from decimal import Decimal |
8 | 8 | from pathlib import Path |
9 | | -from typing import Any, ClassVar, Dict, Set, Union |
| 9 | +from typing import Any, ClassVar, Dict, Generator, Iterator, Set, Union |
10 | 10 | from urllib.parse import urlparse |
11 | 11 |
|
12 | 12 | import duckdb.typing as ddbtyp |
13 | 13 | import numpy as np |
14 | | -import polars as pl # type: ignore |
15 | 14 | from duckdb import DuckDBPyConnection, DuckDBPyRelation |
16 | 15 | from duckdb.typing import DuckDBPyType |
17 | 16 | from pandas import DataFrame |
18 | | -from polars.datatypes.classes import DataTypeClass as PolarsType |
19 | 17 | from pydantic import BaseModel |
20 | 18 | from typing_extensions import Annotated, get_args, get_origin, get_type_hints |
21 | 19 |
|
@@ -91,19 +89,6 @@ def __call__(self): |
91 | 89 | } |
92 | 90 | """A mapping of Python types to the equivalent DuckDB types.""" |
93 | 91 |
|
94 | | -PYTHON_TYPE_TO_POLARS_TYPE: Dict[type, PolarsType] = { |
95 | | - # issue with decimal conversion at the moment... |
96 | | - str: pl.Utf8, # type: ignore |
97 | | - int: pl.Int64, # type: ignore |
98 | | - bool: pl.Boolean, # type: ignore |
99 | | - float: pl.Float64, # type: ignore |
100 | | - bytes: pl.Binary, # type: ignore |
101 | | - date: pl.Date, # type: ignore |
102 | | - datetime: pl.Datetime, # type: ignore |
103 | | - Decimal: pl.Utf8, # type: ignore |
104 | | -} |
105 | | -"""A mapping of Python types to the equivalent Polars types.""" |
106 | | - |
107 | 92 |
|
108 | 93 | def table_exists(connection: DuckDBPyConnection, table_name: str) -> bool: |
109 | 94 | """check if a table exists in a given DuckDBPyConnection""" |
@@ -205,98 +190,6 @@ def get_duckdb_type_from_annotation(type_annotation: Any) -> DuckDBPyType: |
205 | 190 | raise ValueError(f"No equivalent DuckDB type for {type_annotation!r}") |
206 | 191 |
|
207 | 192 |
|
208 | | -def get_polars_type_from_annotation(type_annotation: Any) -> PolarsType: |
209 | | - """Get a polars type from a Python type annotation. |
210 | | -
|
211 | | - Supported types are any of the following (this definition is recursive): |
212 | | - - Supported basic Python types. These are: |
213 | | - * `str`: pl.Utf8 |
214 | | - * `int`: pl.Int64 |
215 | | - * `bool`: pl.Boolean |
216 | | - * `float`: pl.Float64 |
217 | | - * `bytes`: pl.Binary |
218 | | - * `datetime.date`: pl.Date |
219 | | - * `datetime.datetime`: pl.Datetime |
220 | | - * `decimal.Decimal`: pl.Decimal with precision of 38 and scale of 18 |
221 | | - - A list of supported types (e.g. `List[str]` or `typing.List[str]`). |
222 | | - This will return a pl.List type (variable length) |
223 | | - - A `typing.Optional` type or a `typing.Union` of the type and `None` (e.g. |
224 | | - `typing.Optional[str]`, `typing.Union[List[str], None]`). This will remove the |
225 | | - 'optional' wrapper and return the inner type |
226 | | - - A subclass of `typing.TypedDict` with values typed using supported types. This |
227 | | - will parse the value types as Polars types and return a Polars Struct. |
228 | | - - A dataclass or `pydantic.main.ModelMetaClass` with values typed using supported types. |
229 | | - This will parse the field types as Polars types and return a Polars Struct. |
230 | | - - Any supported type, with a `typing_extensions.Annotated` wrapper. |
231 | | - - A `decimal.Decimal` wrapped with `typing_extensions.Annotated` with a `DecimalConfig` |
232 | | - indicating precision and scale. This will return a Polars Decimal |
233 | | - with the specfied scale and precision. |
234 | | - - A `pydantic.types.condecimal` created type. |
235 | | -
|
236 | | - Any `ClassVar` types within `TypedDict`s, dataclasses, or `pydantic` models will be |
237 | | - ignored. |
238 | | -
|
239 | | - """ |
240 | | - type_origin = get_origin(type_annotation) |
241 | | - |
242 | | - # An `Optional` or `Union` type, check to ensure non-heterogenity. |
243 | | - if type_origin is Union: |
244 | | - python_type = _get_non_heterogenous_type(get_args(type_annotation)) |
245 | | - return get_polars_type_from_annotation(python_type) |
246 | | - |
247 | | - # Type hint is e.g. `List[str]`, check to ensure non-heterogenity. |
248 | | - if type_origin is list or (isinstance(type_origin, type) and issubclass(type_origin, list)): |
249 | | - element_type = _get_non_heterogenous_type(get_args(type_annotation)) |
250 | | - return pl.List(get_polars_type_from_annotation(element_type)) # type: ignore |
251 | | - |
252 | | - if type_origin is Annotated: |
253 | | - python_type, *other_args = get_args(type_annotation) # pylint: disable=unused-variable |
254 | | - return get_polars_type_from_annotation(python_type) |
255 | | - # Ensure that we have a concrete type at this point. |
256 | | - if not isinstance(type_annotation, type): |
257 | | - raise ValueError(f"Unsupported type annotation {type_annotation!r}") |
258 | | - |
259 | | - if ( |
260 | | - # Type hint is a dict subclass, but not dict. Possibly a `TypedDict`. |
261 | | - (issubclass(type_annotation, dict) and type_annotation is not dict) |
262 | | - # Type hint is a dataclass. |
263 | | - or is_dataclass(type_annotation) |
264 | | - # Type hint is a `pydantic` model. |
265 | | - or (type_origin is None and issubclass(type_annotation, BaseModel)) |
266 | | - ): |
267 | | - fields: Dict[str, PolarsType] = {} |
268 | | - for field_name, field_annotation in get_type_hints(type_annotation).items(): |
269 | | - # Technically non-string keys are disallowed, but people are bad. |
270 | | - if not isinstance(field_name, str): |
271 | | - raise ValueError( |
272 | | - f"Dictionary/Dataclass keys must be strings, got {type_annotation!r}" |
273 | | - ) # pragma: no cover |
274 | | - if get_origin(field_annotation) is ClassVar: |
275 | | - continue |
276 | | - |
277 | | - fields[field_name] = get_polars_type_from_annotation(field_annotation) |
278 | | - |
279 | | - if not fields: |
280 | | - raise ValueError( |
281 | | - f"No type annotations in dict/dataclass type (got {type_annotation!r})" |
282 | | - ) |
283 | | - |
284 | | - return pl.Struct(fields) # type: ignore |
285 | | - |
286 | | - if type_annotation is list: |
287 | | - raise ValueError( |
288 | | - f"List must have type annotation (e.g. `List[str]`), got {type_annotation!r}" |
289 | | - ) |
290 | | - if type_annotation is dict or type_origin is dict: |
291 | | - raise ValueError(f"Dict must be `typing.TypedDict` subclass, got {type_annotation!r}") |
292 | | - |
293 | | - for type_ in type_annotation.mro(): |
294 | | - polars_type = PYTHON_TYPE_TO_POLARS_TYPE.get(type_) |
295 | | - if polars_type: |
296 | | - return polars_type |
297 | | - raise ValueError(f"No equivalent DuckDB type for {type_annotation!r}") |
298 | | - |
299 | | - |
300 | 193 | def coerce_inferred_numpy_array_to_list(pandas_df: DataFrame) -> DataFrame: |
301 | 194 | """Function to modify numpy inferred array when cnverting from duckdb relation to |
302 | 195 | pandas dataframe - these cause issues with pydantic models |
@@ -331,15 +224,20 @@ def _ddb_read_parquet( |
331 | 224 |
|
332 | 225 |
|
def _ddb_write_parquet(  # pylint: disable=unused-argument
    self, entity: Union[Iterator[Dict[str, Any]], DuckDBPyRelation], target_location: URI, **kwargs
) -> URI:
    """Write type-cast entities to a parquet file after data-contract application.

    Args:
        entity: Either a DuckDB relation, or an iterator of row dictionaries
            (e.g. a generator of type-cast records). Iterators are first
            materialised into a DuckDB relation via an ``unnest`` query.
        target_location: Destination URI for the parquet file.
        **kwargs: Extra options forwarded to ``DuckDBPyRelation.to_parquet``.

    Returns:
        The ``target_location`` that was written.
    """
    # Ensure the parent directory exists when targeting the local filesystem.
    if isinstance(_get_implementation(target_location), LocalFilesystemImplementation):
        Path(target_location).parent.mkdir(parents=True, exist_ok=True)

    # BUG FIX: the signature accepts any Iterator, but the previous check was
    # `isinstance(entity, Generator)`, so a non-generator iterator (e.g.
    # `iter([...])` or `itertools.chain(...)`) fell through to `.to_parquet`
    # and raised AttributeError. Generators ARE Iterators, so widening the
    # check is a strict superset of the old behaviour; DuckDBPyRelation is
    # excluded defensively in case it ever grows the iterator protocol.
    if isinstance(entity, Iterator) and not isinstance(entity, DuckDBPyRelation):
        entity = self._connection.query(
            "select dta.* from (select unnest($data) as dta)", params={"data": list(entity)}
        )

    entity.to_parquet(file_name=target_location, compression="snappy", **kwargs)  # type: ignore
    return target_location
344 | 242 |
|
345 | 243 |
|
|
0 commit comments