88from pydantic import BaseModel
99from typing_extensions import Protocol
1010
11- from dve .core_engine .backends .exceptions import ReaderLacksEntityTypeSupport
11+ from dve .core_engine .backends .exceptions import MessageBearingError , ReaderLacksEntityTypeSupport
1212from dve .core_engine .backends .types import EntityName , EntityType
13+ from dve .core_engine .message import FeedbackMessage
1314from dve .core_engine .type_hints import URI , ArbitraryFunction , WrapDecorator
15+ from dve .parser .file_handling .service import open_stream
1416
1517T = TypeVar ("T" )
1618ET_co = TypeVar ("ET_co" , covariant = True )
@@ -115,6 +117,8 @@ def read_to_entity_type(
115117 """
116118 if entity_name == Iterator [dict [str , Any ]]:
117119 return self .read_to_py_iterator (resource , entity_name , schema ) # type: ignore
120+
121+ self .raise_if_not_sensible_file (resource , entity_name )
118122
119123 try :
120124 reader_func = self .__read_methods__ [entity_type ]
@@ -137,3 +141,34 @@ def write_parquet(
137141
138142 """
139143 raise NotImplementedError (f"write_parquet not implemented in { self .__class__ } " )
144+
@staticmethod
def _check_likely_text_file(resource: URI) -> bool:
    """Return whether the start of `resource` looks like text.

    This is a quick sense check, not a foolproof one: only the first
    4096 bytes are inspected, which should be enough to weed out most
    non-text files.

    Args:
        resource: URI of the file to inspect; it is opened in binary mode.

    Returns:
        `True` if the leading chunk starts with a UTF-16 byte order mark
        or contains no NUL byte (this includes an empty file), `False`
        if it contains a NUL byte without such a byte order mark.

    """
    with open_stream(resource, "rb") as stream:
        head = stream.read(4096)

    # UTF-16 encoded text legitimately contains NUL bytes, so a UTF-16
    # byte order mark (little- or big-endian) short-circuits the NUL test.
    if head.startswith((b"\xff\xfe", b"\xfe\xff")):
        return True

    # A NUL byte in anything else is a strong hint the content is binary.
    return b"\x00" not in head
159+
def raise_if_not_sensible_file(self, resource: URI, entity_name: str) -> None:
    """Raise if `resource` does not pass the quick text-file sense check.

    Args:
        resource: URI of the submitted file to check.
        entity_name: Name of the entity being read; it is recorded on the
            feedback message attached to the raised error.

    Raises:
        MessageBearingError: If `_check_likely_text_file` reports that the
            resource does not look like text. The error carries a single
            whole-file `FeedbackMessage` with error code "MalformedFile".

    """
    if self._check_likely_text_file(resource):
        return

    # The failure concerns the file as a whole, so no record is attached.
    malformed_file_message = FeedbackMessage(
        entity=entity_name,
        record=None,
        failure_type="submission",
        error_location="Whole File",
        error_code="MalformedFile",
        error_message="The submitted resource doesn't seem to be a valid text file",
    )
    raise MessageBearingError(
        "The submitted file doesn't appear to be text",
        messages=[malformed_file_message],
    )
0 commit comments