1616 get_duckdb_type_from_annotation ,
1717)
1818from dve .core_engine .backends .implementations .duckdb .types import SQLType
19+ from dve .core_engine .backends .implementations .duckdb .utilities import check_csv_header_expected
1920from dve .core_engine .backends .utilities import get_polars_type_from_annotation
2021from dve .core_engine .message import FeedbackMessage
2122from dve .core_engine .type_hints import URI , EntityName
22- from dve .parser .file_handling import get_content_length
23+ from dve .parser .file_handling import get_content_length , open_stream
2324
2425
2526@duckdb_write_parquet
@@ -35,15 +36,46 @@ def __init__(
3536 delim : str = "," ,
3637 quotechar : str = '"' ,
3738 connection : Optional [DuckDBPyConnection ] = None ,
39+ field_check : bool = False ,
40+ field_check_error_code : Optional [str ] = "ExpectedVsActualFieldMismatch" ,
41+ field_check_error_message : Optional [str ] = "The submitted header does not match what is expected" ,
3842 ** _ ,
3943 ):
4044 self .header = header
4145 self .delim = delim
4246 self .quotechar = quotechar
4347 self ._connection = connection if connection else default_connection
48+ self .field_check = field_check
49+ self .field_check_error_code = field_check_error_code
50+ self .field_check_error_message = field_check_error_message
4451
4552 super ().__init__ ()
4653
54+ def perform_field_check (
55+ self , resource : URI , entity_name : str , expected_schema : type [BaseModel ]
56+ ):
57+ if not self .header :
58+ raise ValueError ("Cannot perform field check without a CSV header" )
59+
60+ if missing := check_csv_header_expected (
61+ resource ,
62+ expected_schema ,
63+ self .delim
64+ ):
65+ raise MessageBearingError (
66+ "The CSV header doesn't match what is expected" ,
67+ messages = [
68+ FeedbackMessage (
69+ entity = entity_name ,
70+ failure_type = "submission" ,
71+ error_location = "Whole File" ,
72+ error_code = self .field_check_error_code ,
73+ error_message = self .field_check_error_message ,
74+ value = f"Missing fields: { missing } " ,
75+ )
76+ ],
77+ )
78+
4779 def read_to_py_iterator (
4880 self , resource : URI , entity_name : EntityName , schema : type [BaseModel ]
4981 ) -> Iterator [dict [str , Any ]]:
@@ -58,6 +90,9 @@ def read_to_relation( # pylint: disable=unused-argument
5890 if get_content_length (resource ) == 0 :
5991 raise EmptyFileError (f"File at { resource } is empty." )
6092
93+ if self .field_check :
94+ self .perform_field_check (resource , entity_name , schema )
95+
6196 reader_options : dict [str , Any ] = {
6297 "header" : self .header ,
6398 "delimiter" : self .delim ,
@@ -89,6 +124,9 @@ def read_to_relation( # pylint: disable=unused-argument
89124 if get_content_length (resource ) == 0 :
90125 raise EmptyFileError (f"File at { resource } is empty." )
91126
127+ if self .field_check :
128+ self .perform_field_check (resource , entity_name , schema )
129+
92130 reader_options : dict [str , Any ] = {
93131 "has_header" : self .header ,
94132 "separator" : self .delim ,
@@ -132,6 +170,12 @@ class DuckDBCSVRepeatingHeaderReader(PolarsToDuckDBCSVReader):
132170 | shop1 | clothes | 2025-01-01 |
133171 """
134172
173+ def __init__ (
174+ self , non_unique_header_error_code : Optional [str ] = "NonUniqueHeader" , * args , ** kwargs
175+ ):
176+ self ._non_unique_header_code = non_unique_header_error_code
177+ super ().__init__ (* args , ** kwargs )
178+
135179 @read_function (DuckDBPyRelation )
136180 def read_to_relation ( # pylint: disable=unused-argument
137181 self , resource : URI , entity_name : EntityName , schema : type [BaseModel ]
@@ -159,7 +203,7 @@ def read_to_relation( # pylint: disable=unused-argument
159203 ),
160204 error_location = entity_name ,
161205 category = "Bad file" ,
162- error_code = "NonUniqueHeader" ,
206+ error_code = self . _non_unique_header_code ,
163207 )
164208 ],
165209 )
0 commit comments