Skip to content

Commit 2f2ca5a

Browse files
committed
feat: initial work to add custom error messages to data contract
1 parent 970f0ef commit 2f2ca5a

File tree

20 files changed

+448
-162
lines changed

20 files changed

+448
-162
lines changed
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
from dve.core_engine.backends.readers import register_reader
2+
3+
from .contract import DuckDBDataContract
4+
from .readers import DuckDBCSVReader, DuckDBXMLStreamReader
5+
from .reference_data import DuckDBRefDataLoader
6+
from .rules import DuckDBStepImplementations
7+
8+
register_reader(DuckDBCSVReader)
9+
register_reader(DuckDBXMLStreamReader)
10+
11+
__all__ = [
12+
"DuckDBDataContract",
13+
"DuckDBRefDataLoader",
14+
"DuckDBStepImplementations",
15+
]

src/dve/core_engine/backends/implementations/duckdb/auditing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
)
1414
from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import (
1515
PYTHON_TYPE_TO_DUCKDB_TYPE,
16-
PYTHON_TYPE_TO_POLARS_TYPE,
1716
table_exists,
1817
)
1918
from dve.core_engine.models import (
@@ -23,6 +22,7 @@
2322
SubmissionStatisticsRecord,
2423
TransferRecord,
2524
)
25+
from dve.core_engine.backends.utilities import PYTHON_TYPE_TO_POLARS_TYPE
2626
from dve.core_engine.type_hints import URI, ExecutorType
2727

2828

src/dve/core_engine/backends/implementations/duckdb/contract.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,12 @@
2020
duckdb_read_parquet,
2121
duckdb_write_parquet,
2222
get_duckdb_type_from_annotation,
23-
get_polars_type_from_annotation,
2423
relation_is_empty,
2524
)
2625
from dve.core_engine.backends.implementations.duckdb.types import DuckDBEntities
2726
from dve.core_engine.backends.metadata.contract import DataContractMetadata
2827
from dve.core_engine.backends.types import StageSuccessful
29-
from dve.core_engine.backends.utilities import stringify_model
28+
from dve.core_engine.backends.utilities import stringify_model, get_polars_type_from_annotation
3029
from dve.core_engine.message import FeedbackMessage
3130
from dve.core_engine.type_hints import URI, Messages
3231
from dve.core_engine.validation import RowValidator

src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py

Lines changed: 1 addition & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from duckdb import DuckDBPyConnection, DuckDBPyRelation
1616
from duckdb.typing import DuckDBPyType
1717
from pandas import DataFrame
18-
from polars.datatypes.classes import DataTypeClass as PolarsType
18+
1919
from pydantic import BaseModel
2020
from typing_extensions import Annotated, get_args, get_origin, get_type_hints
2121

@@ -91,20 +91,6 @@ def __call__(self):
9191
}
9292
"""A mapping of Python types to the equivalent DuckDB types."""
9393

94-
PYTHON_TYPE_TO_POLARS_TYPE: Dict[type, PolarsType] = {
95-
# issue with decimal conversion at the moment...
96-
str: pl.Utf8, # type: ignore
97-
int: pl.Int64, # type: ignore
98-
bool: pl.Boolean, # type: ignore
99-
float: pl.Float64, # type: ignore
100-
bytes: pl.Binary, # type: ignore
101-
date: pl.Date, # type: ignore
102-
datetime: pl.Datetime, # type: ignore
103-
Decimal: pl.Utf8, # type: ignore
104-
}
105-
"""A mapping of Python types to the equivalent Polars types."""
106-
107-
10894
def table_exists(connection: DuckDBPyConnection, table_name: str) -> bool:
10995
"""check if a table exists in a given DuckDBPyConnection"""
11096
return table_name in map(lambda x: x[0], connection.sql("SHOW TABLES").fetchall())
@@ -204,99 +190,6 @@ def get_duckdb_type_from_annotation(type_annotation: Any) -> DuckDBPyType:
204190
return duck_type
205191
raise ValueError(f"No equivalent DuckDB type for {type_annotation!r}")
206192

207-
208-
def get_polars_type_from_annotation(type_annotation: Any) -> PolarsType:
209-
"""Get a polars type from a Python type annotation.
210-
211-
Supported types are any of the following (this definition is recursive):
212-
- Supported basic Python types. These are:
213-
* `str`: pl.Utf8
214-
* `int`: pl.Int64
215-
* `bool`: pl.Boolean
216-
* `float`: pl.Float64
217-
* `bytes`: pl.Binary
218-
* `datetime.date`: pl.Date
219-
* `datetime.datetime`: pl.Datetime
220-
* `decimal.Decimal`: pl.Decimal with precision of 38 and scale of 18
221-
- A list of supported types (e.g. `List[str]` or `typing.List[str]`).
222-
This will return a pl.List type (variable length)
223-
- A `typing.Optional` type or a `typing.Union` of the type and `None` (e.g.
224-
`typing.Optional[str]`, `typing.Union[List[str], None]`). This will remove the
225-
'optional' wrapper and return the inner type
226-
- A subclass of `typing.TypedDict` with values typed using supported types. This
227-
will parse the value types as Polars types and return a Polars Struct.
228-
- A dataclass or `pydantic.main.ModelMetaClass` with values typed using supported types.
229-
This will parse the field types as Polars types and return a Polars Struct.
230-
- Any supported type, with a `typing_extensions.Annotated` wrapper.
231-
- A `decimal.Decimal` wrapped with `typing_extensions.Annotated` with a `DecimalConfig`
232-
indicating precision and scale. This will return a Polars Decimal
233-
with the specified scale and precision.
234-
- A `pydantic.types.condecimal` created type.
235-
236-
Any `ClassVar` types within `TypedDict`s, dataclasses, or `pydantic` models will be
237-
ignored.
238-
239-
"""
240-
type_origin = get_origin(type_annotation)
241-
242-
# An `Optional` or `Union` type, check to ensure non-heterogenity.
243-
if type_origin is Union:
244-
python_type = _get_non_heterogenous_type(get_args(type_annotation))
245-
return get_polars_type_from_annotation(python_type)
246-
247-
# Type hint is e.g. `List[str]`, check to ensure non-heterogenity.
248-
if type_origin is list or (isinstance(type_origin, type) and issubclass(type_origin, list)):
249-
element_type = _get_non_heterogenous_type(get_args(type_annotation))
250-
return pl.List(get_polars_type_from_annotation(element_type)) # type: ignore
251-
252-
if type_origin is Annotated:
253-
python_type, *other_args = get_args(type_annotation) # pylint: disable=unused-variable
254-
return get_polars_type_from_annotation(python_type)
255-
# Ensure that we have a concrete type at this point.
256-
if not isinstance(type_annotation, type):
257-
raise ValueError(f"Unsupported type annotation {type_annotation!r}")
258-
259-
if (
260-
# Type hint is a dict subclass, but not dict. Possibly a `TypedDict`.
261-
(issubclass(type_annotation, dict) and type_annotation is not dict)
262-
# Type hint is a dataclass.
263-
or is_dataclass(type_annotation)
264-
# Type hint is a `pydantic` model.
265-
or (type_origin is None and issubclass(type_annotation, BaseModel))
266-
):
267-
fields: Dict[str, PolarsType] = {}
268-
for field_name, field_annotation in get_type_hints(type_annotation).items():
269-
# Technically non-string keys are disallowed, but people are bad.
270-
if not isinstance(field_name, str):
271-
raise ValueError(
272-
f"Dictionary/Dataclass keys must be strings, got {type_annotation!r}"
273-
) # pragma: no cover
274-
if get_origin(field_annotation) is ClassVar:
275-
continue
276-
277-
fields[field_name] = get_polars_type_from_annotation(field_annotation)
278-
279-
if not fields:
280-
raise ValueError(
281-
f"No type annotations in dict/dataclass type (got {type_annotation!r})"
282-
)
283-
284-
return pl.Struct(fields) # type: ignore
285-
286-
if type_annotation is list:
287-
raise ValueError(
288-
f"List must have type annotation (e.g. `List[str]`), got {type_annotation!r}"
289-
)
290-
if type_annotation is dict or type_origin is dict:
291-
raise ValueError(f"Dict must be `typing.TypedDict` subclass, got {type_annotation!r}")
292-
293-
for type_ in type_annotation.mro():
294-
polars_type = PYTHON_TYPE_TO_POLARS_TYPE.get(type_)
295-
if polars_type:
296-
return polars_type
297-
raise ValueError(f"No equivalent DuckDB type for {type_annotation!r}")
298-
299-
300193
def coerce_inferred_numpy_array_to_list(pandas_df: DataFrame) -> DataFrame:
301194
Function to modify numpy inferred array when converting from duckdb relation to
302195
pandas dataframe - these cause issues with pydantic models

src/dve/core_engine/backends/implementations/duckdb/readers/xml.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,9 @@
1010
from dve.core_engine.backends.base.reader import read_function
1111
from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import (
1212
duckdb_write_parquet,
13-
get_polars_type_from_annotation,
1413
)
1514
from dve.core_engine.backends.readers.xml import XMLStreamReader
16-
from dve.core_engine.backends.utilities import stringify_model
15+
from dve.core_engine.backends.utilities import stringify_model, get_polars_type_from_annotation
1716
from dve.core_engine.type_hints import URI
1817

1918

src/dve/core_engine/backends/implementations/duckdb/rules.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@
5757

5858
@duckdb_write_parquet
5959
@duckdb_read_parquet
60-
class DuckDBStepImplemetations(BaseStepImplementations[DuckDBPyRelation]):
60+
class DuckDBStepImplementations(BaseStepImplementations[DuckDBPyRelation]):
6161
"""An implementation of transformation steps in duckdb."""
6262

6363
def __init__(self, connection: DuckDBPyConnection, **kwargs):

src/dve/core_engine/backends/readers/csv.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,7 @@
1414
FieldCountMismatch,
1515
MissingHeaderError,
1616
)
17-
from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import (
18-
get_polars_type_from_annotation,
19-
)
20-
from dve.core_engine.backends.utilities import stringify_model
17+
from dve.core_engine.backends.utilities import stringify_model, get_polars_type_from_annotation
2118
from dve.core_engine.type_hints import EntityName
2219
from dve.parser.file_handling import get_content_length, open_stream
2320
from dve.parser.file_handling.implementations.file import file_uri_to_local_path

src/dve/core_engine/backends/readers/xml.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,7 @@
1111

1212
from dve.core_engine.backends.base.reader import BaseFileReader
1313
from dve.core_engine.backends.exceptions import EmptyFileError
14-
from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import (
15-
get_polars_type_from_annotation,
16-
)
17-
from dve.core_engine.backends.utilities import stringify_model
14+
from dve.core_engine.backends.utilities import stringify_model, get_polars_type_from_annotation
1815
from dve.core_engine.loggers import get_logger
1916
from dve.core_engine.type_hints import URI, EntityName
2017
from dve.parser.file_handling import NonClosingTextIOWrapper, get_content_length, open_stream

src/dve/core_engine/backends/utilities.py

Lines changed: 114 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,41 @@
11
"""Necessary, otherwise uncategorised backend functionality."""
22

3+
from datetime import date, datetime
4+
from decimal import Decimal
35
import sys
46
from typing import Type
57

8+
from dataclasses import is_dataclass
69
from pydantic import BaseModel, create_model
710

811
from dve.core_engine.type_hints import Messages
12+
from dve.core_engine.backends.base.utilities import _get_non_heterogenous_type
13+
14+
import polars as pl # type: ignore
15+
from polars.datatypes.classes import DataTypeClass as PolarsType
16+
from typing import Any, ClassVar, Dict, Set, Union
917

1018
# We need to rely on a Python typing implementation detail in Python <= 3.7.
1119
if sys.version_info[:2] <= (3, 7):
1220
# Crimes against typing.
1321
from typing import _GenericAlias # type: ignore
1422

15-
from typing_extensions import get_args, get_origin
23+
from typing_extensions import Annotated, get_args, get_origin, get_type_hints
1624
else:
17-
from typing import get_args, get_origin
25+
from typing import Annotated, get_args, get_origin, get_type_hints
26+
27+
PYTHON_TYPE_TO_POLARS_TYPE: Dict[type, PolarsType] = {
28+
# issue with decimal conversion at the moment...
29+
str: pl.Utf8, # type: ignore
30+
int: pl.Int64, # type: ignore
31+
bool: pl.Boolean, # type: ignore
32+
float: pl.Float64, # type: ignore
33+
bytes: pl.Binary, # type: ignore
34+
date: pl.Date, # type: ignore
35+
datetime: pl.Datetime, # type: ignore
36+
Decimal: pl.Utf8, # type: ignore
37+
}
38+
"""A mapping of Python types to the equivalent Polars types."""
1839

1940

2041
def stringify_type(type_: type) -> type:
@@ -61,3 +82,94 @@ def dedup_messages(messages: Messages) -> Messages:
6182
6283
"""
6384
return list(dict.fromkeys(messages))
85+
86+
def get_polars_type_from_annotation(type_annotation: Any) -> PolarsType:
87+
"""Get a polars type from a Python type annotation.
88+
89+
Supported types are any of the following (this definition is recursive):
90+
- Supported basic Python types. These are:
91+
* `str`: pl.Utf8
92+
* `int`: pl.Int64
93+
* `bool`: pl.Boolean
94+
* `float`: pl.Float64
95+
* `bytes`: pl.Binary
96+
* `datetime.date`: pl.Date
97+
* `datetime.datetime`: pl.Datetime
98+
* `decimal.Decimal`: pl.Utf8 (Decimal conversion is not currently supported)
99+
- A list of supported types (e.g. `List[str]` or `typing.List[str]`).
100+
This will return a pl.List type (variable length)
101+
- A `typing.Optional` type or a `typing.Union` of the type and `None` (e.g.
102+
`typing.Optional[str]`, `typing.Union[List[str], None]`). This will remove the
103+
'optional' wrapper and return the inner type
104+
- A subclass of `typing.TypedDict` with values typed using supported types. This
105+
will parse the value types as Polars types and return a Polars Struct.
106+
- A dataclass or `pydantic.main.ModelMetaClass` with values typed using supported types.
107+
This will parse the field types as Polars types and return a Polars Struct.
108+
- Any supported type, with a `typing_extensions.Annotated` wrapper.
109+
- A `decimal.Decimal` wrapped with `typing_extensions.Annotated` with a `DecimalConfig`
110+
indicating precision and scale. This will return a Polars Decimal
111+
with the specified scale and precision.
112+
- A `pydantic.types.condecimal` created type.
113+
114+
Any `ClassVar` types within `TypedDict`s, dataclasses, or `pydantic` models will be
115+
ignored.
116+
117+
"""
118+
type_origin = get_origin(type_annotation)
119+
120+
# An `Optional` or `Union` type, check to ensure non-heterogenity.
121+
if type_origin is Union:
122+
python_type = _get_non_heterogenous_type(get_args(type_annotation))
123+
return get_polars_type_from_annotation(python_type)
124+
125+
# Type hint is e.g. `List[str]`, check to ensure non-heterogenity.
126+
if type_origin is list or (isinstance(type_origin, type) and issubclass(type_origin, list)):
127+
element_type = _get_non_heterogenous_type(get_args(type_annotation))
128+
return pl.List(get_polars_type_from_annotation(element_type)) # type: ignore
129+
130+
if type_origin is Annotated:
131+
python_type, *other_args = get_args(type_annotation) # pylint: disable=unused-variable
132+
return get_polars_type_from_annotation(python_type)
133+
# Ensure that we have a concrete type at this point.
134+
if not isinstance(type_annotation, type):
135+
raise ValueError(f"Unsupported type annotation {type_annotation!r}")
136+
137+
if (
138+
# Type hint is a dict subclass, but not dict. Possibly a `TypedDict`.
139+
(issubclass(type_annotation, dict) and type_annotation is not dict)
140+
# Type hint is a dataclass.
141+
or is_dataclass(type_annotation)
142+
# Type hint is a `pydantic` model.
143+
or (type_origin is None and issubclass(type_annotation, BaseModel))
144+
):
145+
fields: Dict[str, PolarsType] = {}
146+
for field_name, field_annotation in get_type_hints(type_annotation).items():
147+
# Technically non-string keys are disallowed, but people are bad.
148+
if not isinstance(field_name, str):
149+
raise ValueError(
150+
f"Dictionary/Dataclass keys must be strings, got {type_annotation!r}"
151+
) # pragma: no cover
152+
if get_origin(field_annotation) is ClassVar:
153+
continue
154+
155+
fields[field_name] = get_polars_type_from_annotation(field_annotation)
156+
157+
if not fields:
158+
raise ValueError(
159+
f"No type annotations in dict/dataclass type (got {type_annotation!r})"
160+
)
161+
162+
return pl.Struct(fields) # type: ignore
163+
164+
if type_annotation is list:
165+
raise ValueError(
166+
f"List must have type annotation (e.g. `List[str]`), got {type_annotation!r}"
167+
)
168+
if type_annotation is dict or type_origin is dict:
169+
raise ValueError(f"Dict must be `typing.TypedDict` subclass, got {type_annotation!r}")
170+
171+
for type_ in type_annotation.mro():
172+
polars_type = PYTHON_TYPE_TO_POLARS_TYPE.get(type_)
173+
if polars_type:
174+
return polars_type
175+
raise ValueError(f"No equivalent DuckDB type for {type_annotation!r}")

0 commit comments

Comments
 (0)