Skip to content

Commit 0191dd3

Browse files
committed
feat: Added a new check to the base reader that sense-checks whether the file being processed is a text file - applied to all reads via read_to_entity_type
1 parent 5dfbafc commit 0191dd3

File tree

2 files changed

+37
-2
lines changed

2 files changed

+37
-2
lines changed

src/dve/core_engine/backends/base/reader.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,11 @@
88
from pydantic import BaseModel
99
from typing_extensions import Protocol
1010

11-
from dve.core_engine.backends.exceptions import ReaderLacksEntityTypeSupport
11+
from dve.core_engine.backends.exceptions import MessageBearingError, ReaderLacksEntityTypeSupport
1212
from dve.core_engine.backends.types import EntityName, EntityType
13+
from dve.core_engine.message import FeedbackMessage
1314
from dve.core_engine.type_hints import URI, ArbitraryFunction, WrapDecorator
15+
from dve.parser.file_handling.service import open_stream
1416

1517
T = TypeVar("T")
1618
ET_co = TypeVar("ET_co", covariant=True)
@@ -115,6 +117,8 @@ def read_to_entity_type(
115117
"""
116118
if entity_name == Iterator[dict[str, Any]]:
117119
return self.read_to_py_iterator(resource, entity_name, schema) # type: ignore
120+
121+
self.raise_if_not_sensible_file(resource, entity_name)
118122

119123
try:
120124
reader_func = self.__read_methods__[entity_type]
@@ -137,3 +141,34 @@ def write_parquet(
137141
138142
"""
139143
raise NotImplementedError(f"write_parquet not implemented in {self.__class__}")
144+
145+
@staticmethod
def _check_likely_text_file(resource: URI) -> bool:
    """Return whether the start of `resource` looks like text.

    This is a quick heuristic, not a guarantee: only the first 4096 bytes
    are inspected. It is not foolproof, but should weed out most non-text
    (binary) files.

    Args:
        resource: Location of the file to inspect; opened in binary mode
            via `open_stream`.

    Returns:
        True if the leading chunk starts with a UTF-16/UTF-32 byte order
        mark, or contains no NUL bytes (this includes an empty file);
        False otherwise.

    Note:
        UTF-16/UTF-32 content *without* a byte order mark will normally
        contain NUL bytes and is therefore still reported as non-text.
    """
    with open_stream(resource, "rb") as fle:
        start_chunk = fle.read(4096)

    # UTF-16 and UTF-32 text legitimately contains NUL bytes, so a byte
    # order mark has to be recognised before the NUL check below.
    # b"\xff\xfe" also matches the UTF-32 LE BOM (b"\xff\xfe\x00\x00").
    # The UTF-32 BE BOM itself begins with NUL bytes, so it must be listed
    # explicitly or such files would be rejected.
    unicode_boms = (b"\xff\xfe", b"\xfe\xff", b"\x00\x00\xfe\xff")
    if start_chunk.startswith(unicode_boms):
        return True

    # A NUL byte anywhere else in the chunk means this is unlikely to be text.
    return b"\x00" not in start_chunk
159+
160+
def raise_if_not_sensible_file(self, resource: URI, entity_name: str) -> None:
    """Raise if `resource` does not look like a text file.

    Delegates the actual inspection to `_check_likely_text_file` and turns
    a negative result into a submission-level error.

    Args:
        resource: Location of the file about to be read.
        entity_name: Name of the entity being read; recorded as the
            `entity` of the feedback message.

    Raises:
        MessageBearingError: If the file does not appear to be text. The
            error carries a single `FeedbackMessage` with failure type
            "submission" and error code "MalformedFile".
    """
    if self._check_likely_text_file(resource):
        return

    raise MessageBearingError(
        "The submitted file doesn't appear to be text",
        messages=[
            FeedbackMessage(
                entity=entity_name,
                record=None,
                failure_type="submission",
                error_location="Whole File",
                error_code="MalformedFile",
                error_message="The submitted resource doesn't seem to be a valid text file",
            )
        ],
    )

src/dve/core_engine/backends/implementations/duckdb/utilities.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ def check_csv_header_expected(
5454
quote_char: str = '"') -> set[str]:
5555
"""Check the header of a CSV matches the expected fields"""
5656
with open_stream(resource) as fle:
57-
header_fields = fle.readline().replace(quote_char,"").split(delimiter)
57+
header_fields = fle.readline().rstrip().replace(quote_char,"").split(delimiter)
5858
expected_fields = expected_schema.__fields__.keys()
5959
return set(expected_fields).difference(header_fields)
6060

0 commit comments

Comments
 (0)