diff --git a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py index f55675e0a..a45d082af 100644 --- a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py @@ -6,19 +6,13 @@ import traceback from datetime import datetime from io import BytesIO, IOBase -from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union +from typing import IO, Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union, cast import backoff import dpath import nltk import requests -from unstructured.file_utils.filetype import ( - EXT_TO_FILETYPE, - FILETYPE_TO_MIMETYPE, - STR_TO_FILETYPE, - FileType, - detect_filetype, -) +from unstructured.file_utils.filetype import FileType, detect_filetype from airbyte_cdk.models import FailureType from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig @@ -85,14 +79,23 @@ def _import_unstructured() -> None: global unstructured_partition_docx global unstructured_partition_pptx from unstructured.partition.docx import partition_docx - from unstructured.partition.pdf import partition_pdf from unstructured.partition.pptx import partition_pptx - # separate global variables to properly propagate typing - unstructured_partition_pdf = partition_pdf unstructured_partition_docx = partition_docx unstructured_partition_pptx = partition_pptx + try: + from unstructured.partition.pdf import partition_pdf + + unstructured_partition_pdf = partition_pdf + except (ImportError, ModuleNotFoundError): + # partition_pdf requires the heavy unstructured_inference package; + # PDF support is disabled when it is not installed. + logger = logging.getLogger(__name__) + logger.info( + "Could not import unstructured.partition.pdf (requires unstructured_inference). PDF parsing will be unavailable." + ) + def user_error(e: Exception) -> bool: """ @@ -207,13 +210,6 @@ def _read_file( logger: logging.Logger, ) -> str: _import_unstructured() - if ( - (not unstructured_partition_pdf) - or (not unstructured_partition_docx) - or (not unstructured_partition_pptx) - ): - # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point) - raise Exception("unstructured library is not available") filetype: FileType | None = self._get_filetype(file_handle, remote_file) @@ -335,7 +331,7 @@ def _read_file_remotely( data = self._params_to_dict(format.parameters, strategy) - file_data = {"files": ("filename", file_handle, FILETYPE_TO_MIMETYPE[filetype])} + file_data = {"files": ("filename", file_handle, filetype.mime_type)} response = requests.post( f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data @@ -356,13 +352,6 @@ def _read_file_locally( self, file_handle: IOBase, filetype: FileType, strategy: str, remote_file: RemoteFile ) -> str: _import_unstructured() - if ( - (not unstructured_partition_pdf) - or (not unstructured_partition_docx) - or (not unstructured_partition_pptx) - ): - # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point) - raise Exception("unstructured library is not available") file: Any = file_handle @@ -373,15 +362,29 @@ def _read_file_locally( try: if filetype == FileType.PDF: - # for PDF, read the file into a BytesIO object because some code paths in pdf parsing are doing an instance check on the file object and don't work with file-like objects + if not unstructured_partition_pdf: + raise self._create_parse_error( + remote_file, + "PDF parsing requires the 'unstructured_inference' package. Install it with: pip install unstructured-inference", + ) file_handle.seek(0) with BytesIO(file_handle.read()) as file: file_handle.seek(0) elements = unstructured_partition_pdf(file=file, strategy=strategy) elif filetype == FileType.DOCX: + if not unstructured_partition_docx: + raise self._create_parse_error( + remote_file, "DOCX partition function is not available" + ) elements = unstructured_partition_docx(file=file) elif filetype == FileType.PPTX: + if not unstructured_partition_pptx: + raise self._create_parse_error( + remote_file, "PPTX partition function is not available" + ) elements = unstructured_partition_pptx(file=file) + except RecordParseError: + raise except Exception as e: raise self._create_parse_error(remote_file, str(e)) @@ -405,8 +408,11 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT 2. Use the file name if available 3. Use the file content """ - if remote_file.mime_type and remote_file.mime_type in STR_TO_FILETYPE: - return STR_TO_FILETYPE[remote_file.mime_type] + if remote_file.mime_type: + try: + return FileType.from_mime_type(remote_file.mime_type) + except ValueError: + pass # set name to none, otherwise unstructured will try to get the modified date from the local file system if hasattr(file, "name"): @@ -418,7 +424,7 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT file_type: FileType | None = None try: file_type = detect_filetype( - filename=remote_file.uri, + file_path=remote_file.uri, ) except Exception: # Path doesn't exist locally. Try something else... @@ -427,16 +433,17 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT if file_type and file_type != FileType.UNK: return file_type - type_based_on_content = detect_filetype(file=file) + extension = "." + remote_file.uri.split(".")[-1].lower() + ext_type = FileType.from_extension(extension) + if ext_type is not None: + return ext_type + + type_based_on_content = detect_filetype(file=cast(IO[bytes], file)) file.seek(0) # detect_filetype is reading to read the file content, so we need to reset if type_based_on_content and type_based_on_content != FileType.UNK: return type_based_on_content - extension = "." + remote_file.uri.split(".")[-1].lower() - if extension in EXT_TO_FILETYPE: - return EXT_TO_FILETYPE[extension] - return None def _supported_file_types(self) -> List[Any]: diff --git a/poetry.lock b/poetry.lock index 55c2fa668..09325cbfd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,5 +1,18 @@ # This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand. +[[package]] +name = "aiofiles" +version = "25.1.0" +description = "File support for asyncio." +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "aiofiles-25.1.0-py3-none-any.whl", hash = "sha256:abe311e527c862958650f9438e859c1fa7568a141b22abcd015e120e86a85695"}, + {file = "aiofiles-25.1.0.tar.gz", hash = "sha256:a8d728f0a29de45dc521f18f07297428d56992a742f0cd2701ba86e44d23d5b2"}, +] + [[package]] name = "aiohappyeyeballs" version = "2.4.4" @@ -501,19 +514,6 @@ files = [ [package.dependencies] pycparser = "*" -[[package]] -name = "chardet" -version = "5.2.0" -description = "Universal encoding detector for Python 3" -optional = true -python-versions = ">=3.7" -groups = ["main"] -markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" -files = [ - {file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"}, - {file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"}, -] - [[package]] name = "charset-normalizer" version = "3.4.1" @@ -1283,6 +1283,22 @@ files = [ {file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"}, ] +[[package]] +name = "eval-type-backport" +version = "0.3.1" +description = "Like `typing._eval_type`, but lets older Python versions use newer typing features." +optional = true +python-versions = ">=3.7" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "eval_type_backport-0.3.1-py3-none-any.whl", hash = "sha256:279ab641905e9f11129f56a8a78f493518515b83402b860f6f06dd7c011fdfa8"}, + {file = "eval_type_backport-0.3.1.tar.gz", hash = "sha256:57e993f7b5b69d271e37482e62f74e76a0276c82490cf8e4f0dffeb6b332d5ed"}, +] + +[package.extras] +tests = ["pytest"] + [[package]] name = "exceptiongroup" version = "1.3.0" @@ -2052,6 +2068,29 @@ files = [ [package.extras] tests = ["pytest"] +[[package]] +name = "html5lib" +version = "1.1" +description = "HTML parser based on the WHATWG HTML specification" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "html5lib-1.1-py2.py3-none-any.whl", hash = "sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d"}, + {file = "html5lib-1.1.tar.gz", hash = "sha256:b2e5b40261e20f354d198eae92afc10d750afb487ed5e50f9c4eaf07c184146f"}, +] + +[package.dependencies] +six = ">=1.9" +webencodings = "*" + +[package.extras] +all = ["chardet (>=2.2)", "genshi", "lxml"] +chardet = ["chardet (>=2.2)"] +genshi = ["genshi"] +lxml = ["lxml"] + [[package]] name = "httpcore" version = "1.0.9" @@ -3262,6 +3301,19 @@ files = [ ] markers = {main = "(extra == \"vector-db-based\" or extra == \"file-based\") and (python_version <= \"3.11\" or python_version >= \"3.12.0\")", dev = "python_version <= \"3.11\" or python_version >= \"3.12\""} +[[package]] +name = "nest-asyncio" +version = "1.6.0" +description = "Patch asyncio to allow nested event loops" +optional = true +python-versions = ">=3.5" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"}, + {file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"}, +] + [[package]] name = "nltk" version = "3.9.1" @@ -3421,6 +3473,22 @@ files = [ {file = "numpy-2.3.4.tar.gz", hash = "sha256:a7d018bfedb375a8d979ac758b120ba846a7fe764911a64465fd87b8729f4a6a"}, ] +[[package]] +name = "olefile" +version = "0.47" +description = "Python package to parse, read and write Microsoft OLE2 files (Structured Storage or Compound Document, Microsoft Office)" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "olefile-0.47-py2.py3-none-any.whl", hash = "sha256:543c7da2a7adadf21214938bb79c83ea12b473a4b6ee4ad4bf854e7715e13d1f"}, + {file = "olefile-0.47.zip", hash = "sha256:599383381a0bf3dfbd932ca0ca6515acd174ed48870cbf7fee123d698c192c1c"}, +] + +[package.extras] +tests = ["pytest", "pytest-cov"] + [[package]] name = "openai" version = "0.27.9" @@ -3889,6 +3957,65 @@ Jinja2 = ">=2.11.0" MarkupSafe = ">=1.1.1" pygments = ">=2.12.0" +[[package]] +name = "pi-heif" +version = "1.2.0" +description = "Python interface for libheif library" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "pi_heif-1.2.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:ff8f0e5493f97973b5fef8da892f20a410527ed7a1820de4ff3ba2a0a640d458"}, + {file = "pi_heif-1.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b40a717ed9635186236496c1e27dde60fcec9853786889af0029bafb13626dbf"}, + {file = "pi_heif-1.2.0-cp310-cp310-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3d2f73ef203e19e690d4e6ed336c7970b5fee2a0f1e38885a3c7f465eac6af16"}, + {file = "pi_heif-1.2.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f1f97cc4f842993dc7ac26e3f15748c63aa875ef8d935cf7ab957e02e35ae90d"}, + {file = "pi_heif-1.2.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4e9702e4300655b6063816c55c1bbd044b5cc4215a7fe31f9a0d41451b815b28"}, + {file = "pi_heif-1.2.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:4110b98593aa2bc140f1c74491739db75b2e06f87aad8eaf625aecc1fe8c34ed"}, + {file = "pi_heif-1.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:06e4433337b10e7771aa0f7d22d8611cb24ebd61f71edbb82ea1ca8e087c115a"}, + {file = "pi_heif-1.2.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:680939024f6dd760925f0bffbe24a325845b8a4a6acf380ba6251b34485adc05"}, + {file = "pi_heif-1.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4f2cb2a102175b59acb3d61f93499553609ab07911284b30e6255fbee23c9347"}, + {file = "pi_heif-1.2.0-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d486b40f71a57e401625ae853f7b0b70ce0027c2378a2f69af89aa5f49d96b72"}, + {file = "pi_heif-1.2.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:13e46b0850ef4b66e2fee9a3ad8b3c337ae1a93e963fe2180feb0b6159af0e03"}, + {file = "pi_heif-1.2.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d37c88e3da7c285e58de9b68c778ec241e2ad4722b4cc25e9068eb51e41d6fa2"}, + {file = "pi_heif-1.2.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:dbc53e52f940394351f85c7fa1c7cabc845a18d924245806cafb91c29998804c"}, + {file = "pi_heif-1.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:e26fd46cc0c75c0e44a923c7e7d1407b2a1576f94c8bbf61b9e00516b6d1b1be"}, + {file = "pi_heif-1.2.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:e007c14570acf9e522e9a7e750bcfbd6924b2a9d86dd845857cce203ec5d696f"}, + {file = "pi_heif-1.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7d74f70d60549f7198b1b1954bfceff48f5b527229cf211b83905c458819ed5a"}, + {file = "pi_heif-1.2.0-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:760d3ca420a46514cfd06a440d46c10eb0fbea5cc9c2c8fbd151c520a907a248"}, + {file = "pi_heif-1.2.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:264e8e50835c2e7f835f92508580da5859b63fa61ffe9319b2b97feba2120ccc"}, + {file = "pi_heif-1.2.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:46e934a72f7baed86525d9e0511a234b687b6aa80a764b33b42eedbe3d56d860"}, + {file = "pi_heif-1.2.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:8307d668d40b156b9d19d13158a4a015540061f1694b4c6593a931e823c5959c"}, + {file = "pi_heif-1.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:fd886282231d630af17f0371aeaaec3ec8351c35b5e32fe9ef01b2c60bc176ef"}, + {file = "pi_heif-1.2.0-cp313-cp313-macosx_10_15_x86_64.whl", hash = "sha256:06947b98598026cf71df9ab841a17bd2cf0da704789e4fff73311d79539d6cc5"}, + {file = "pi_heif-1.2.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3b7bdefbf3bec7dd644bc8e17810d1f658db2ee60f35ef3943fcb9b435aca479"}, + {file = "pi_heif-1.2.0-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1ad88e50bcb0aa25f9febde017ce7fce6801d927f032c1983d6846ead106c50e"}, + {file = "pi_heif-1.2.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2c90cf1238c75fb118eaeedf573fca833fef9ace8788c527f76758ac038e262d"}, + {file = "pi_heif-1.2.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:b4c4fc7a877807ef7f156f26ae920074d1f40a96868a5962ead743049b7c96c8"}, + {file = "pi_heif-1.2.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eb7f0fcdbc80ae75b0881ee1e63d1e8b873df72ff2bdd154d500d6d0644d22e2"}, + {file = "pi_heif-1.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:7c49b1411c4e5f08677a5442048d896f6d1bc66469b556f930f0e658ae18800f"}, + {file = "pi_heif-1.2.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:cde6e4ebfdae0044d2852036d7f4f2399f8f89923501eceaecc564efc6a82899"}, + {file = "pi_heif-1.2.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:3e51c6a56868be96534bd04c521637c71ad39b7a65a5aaa297adebe7e2d15ffb"}, + {file = "pi_heif-1.2.0-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b663f82cc3c87e315977577e6d267ecce2a17c96a766aae3fd807abc0ab45900"}, + {file = "pi_heif-1.2.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f5e62c54ba42ca4d74fba84f92668e6bf825f4b827fb182b5e22244a2e2fb1b3"}, + {file = "pi_heif-1.2.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:bc0fa16a0751aba3a3d5fb222fd5587a789dd79a675ddf0532de5ec090e0003a"}, + {file = "pi_heif-1.2.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:fe00abdb62faf1a37ef77d01ed7b0302196897a47da1fc14758ae1522a705733"}, + {file = "pi_heif-1.2.0-cp314-cp314-win_amd64.whl", hash = "sha256:582b6ca24e6cbcee3f322d8dbebd946c52c40f5a8b96b9f4616f9eba4ba76d11"}, + {file = "pi_heif-1.2.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0495913cfd4ddf726fd3dc12cc0af065218f682bfb091feb1641223e7563065b"}, + {file = "pi_heif-1.2.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:5d8f049862694534eced877e438df0687e7cb3e037348ab5792bee8fc86f2633"}, + {file = "pi_heif-1.2.0-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0b0407727fdda6d410481e3b2ccd1c9eb1eb0e762aeb40c87a368f7b1f5d9d44"}, + {file = "pi_heif-1.2.0-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c698bf9bf4e39be88e91f1c12de603fbf321034ba5aee9280b789dae13532a71"}, + {file = "pi_heif-1.2.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:1430a0959d3899eb1aa1919519bb412810a18e457cf24ea8aa2b035a91782654"}, + {file = "pi_heif-1.2.0.tar.gz", hash = "sha256:52bbbc8c30b803288a9f1bb02e4575797940fdc1f5091fce743c699e812418cc"}, +] + +[package.dependencies] +pillow = ">=11.1.0" + +[package.extras] +tests = ["defusedxml", "numpy", "packaging", "pympler", "pytest"] +tests-min = ["defusedxml", "packaging", "pytest"] + [[package]] name = "pillow" version = "11.1.0" @@ -4185,6 +4312,42 @@ files = [ {file = "protobuf-5.29.4.tar.gz", hash = "sha256:4f1dfcd7997b31ef8f53ec82781ff434a28bf71d9102ddde14d076adcfc78c99"}, ] +[[package]] +name = "psutil" +version = "7.2.2" +description = "Cross-platform lib for process and system monitoring." +optional = true +python-versions = ">=3.6" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "psutil-7.2.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:2edccc433cbfa046b980b0df0171cd25bcaeb3a68fe9022db0979e7aa74a826b"}, + {file = "psutil-7.2.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e78c8603dcd9a04c7364f1a3e670cea95d51ee865e4efb3556a3a63adef958ea"}, + {file = "psutil-7.2.2-cp313-cp313t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1a571f2330c966c62aeda00dd24620425d4b0cc86881c89861fbc04549e5dc63"}, + {file = "psutil-7.2.2-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:917e891983ca3c1887b4ef36447b1e0873e70c933afc831c6b6da078ba474312"}, + {file = "psutil-7.2.2-cp313-cp313t-win_amd64.whl", hash = "sha256:ab486563df44c17f5173621c7b198955bd6b613fb87c71c161f827d3fb149a9b"}, + {file = "psutil-7.2.2-cp313-cp313t-win_arm64.whl", hash = "sha256:ae0aefdd8796a7737eccea863f80f81e468a1e4cf14d926bd9b6f5f2d5f90ca9"}, + {file = "psutil-7.2.2-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:eed63d3b4d62449571547b60578c5b2c4bcccc5387148db46e0c2313dad0ee00"}, + {file = "psutil-7.2.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7b6d09433a10592ce39b13d7be5a54fbac1d1228ed29abc880fb23df7cb694c9"}, + {file = "psutil-7.2.2-cp314-cp314t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1fa4ecf83bcdf6e6c8f4449aff98eefb5d0604bf88cb883d7da3d8d2d909546a"}, + {file = "psutil-7.2.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e452c464a02e7dc7822a05d25db4cde564444a67e58539a00f929c51eddda0cf"}, + {file = "psutil-7.2.2-cp314-cp314t-win_amd64.whl", hash = "sha256:c7663d4e37f13e884d13994247449e9f8f574bc4655d509c3b95e9ec9e2b9dc1"}, + {file = "psutil-7.2.2-cp314-cp314t-win_arm64.whl", hash = "sha256:11fe5a4f613759764e79c65cf11ebdf26e33d6dd34336f8a337aa2996d71c841"}, + {file = "psutil-7.2.2-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:ed0cace939114f62738d808fdcecd4c869222507e266e574799e9c0faa17d486"}, + {file = "psutil-7.2.2-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:1a7b04c10f32cc88ab39cbf606e117fd74721c831c98a27dc04578deb0c16979"}, + {file = "psutil-7.2.2-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:076a2d2f923fd4821644f5ba89f059523da90dc9014e85f8e45a5774ca5bc6f9"}, + {file = "psutil-7.2.2-cp36-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b0726cecd84f9474419d67252add4ac0cd9811b04d61123054b9fb6f57df6e9e"}, + {file = "psutil-7.2.2-cp36-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:fd04ef36b4a6d599bbdb225dd1d3f51e00105f6d48a28f006da7f9822f2606d8"}, + {file = "psutil-7.2.2-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b58fabe35e80b264a4e3bb23e6b96f9e45a3df7fb7eed419ac0e5947c61e47cc"}, + {file = "psutil-7.2.2-cp37-abi3-win_amd64.whl", hash = "sha256:eb7e81434c8d223ec4a219b5fc1c47d0417b12be7ea866e24fb5ad6e84b3d988"}, + {file = "psutil-7.2.2-cp37-abi3-win_arm64.whl", hash = "sha256:8c233660f575a5a89e6d4cb65d9f938126312bca76d8fe087b947b3a1aaac9ee"}, + {file = "psutil-7.2.2.tar.gz", hash = "sha256:0746f5f8d406af344fd547f1c8daa5f5c33dbc293bb8d6a16d80b4bb88f59372"}, +] + +[package.extras] +dev = ["abi3audit", "black", "check-manifest", "colorama", "coverage", "packaging", "psleak", "pylint", "pyperf", "pypinfo", "pyreadline3", "pytest", "pytest-cov", "pytest-instafail", "pytest-xdist", "pywin32", "requests", "rstcheck", "ruff", "setuptools", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "validate-pyproject[all]", "virtualenv", "vulture", "wheel", "wheel", "wmi"] +test = ["psleak", "pytest", "pytest-instafail", "pytest-xdist", "pywin32", "setuptools", "wheel", "wmi"] + [[package]] name = "pyarrow" version = "19.0.1" @@ -4537,6 +4700,30 @@ files = [ [package.extras] diagrams = ["jinja2", "railroad-diagrams"] +[[package]] +name = "pypdf" +version = "6.7.0" +description = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "pypdf-6.7.0-py3-none-any.whl", hash = "sha256:62e85036d50839cbdf45b8067c2c1a1b925517514d7cba4cbe8755a6c2829bc9"}, + {file = "pypdf-6.7.0.tar.gz", hash = "sha256:eb95e244d9f434e6cfd157272283339ef586e593be64ee699c620f756d5c3f7e"}, +] + +[package.dependencies] +typing_extensions = {version = ">=4.0", markers = "python_version < \"3.11\""} + +[package.extras] +crypto = ["cryptography"] +cryptodome = ["PyCryptodome"] +dev = ["flit", "pip-tools", "pre-commit", "pytest-cov", "pytest-socket", "pytest-timeout", "pytest-xdist", "wheel"] +docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"] +full = ["Pillow (>=8.0.0)", "cryptography"] +image = ["Pillow (>=8.0.0)"] + [[package]] name = "pyproject-flake8" version = "6.1.0" @@ -4878,21 +5065,41 @@ files = [ {file = "python_magic-0.4.27-py2.py3-none-any.whl", hash = "sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3"}, ] +[[package]] +name = "python-oxmsg" +version = "0.0.2" +description = "Extract attachments from Outlook .msg files." +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "python_oxmsg-0.0.2-py3-none-any.whl", hash = "sha256:22be29b14c46016bcd05e34abddfd8e05ee82082f53b82753d115da3fc7d0355"}, + {file = "python_oxmsg-0.0.2.tar.gz", hash = "sha256:a6aff4deb1b5975d44d49dab1d9384089ffeec819e19c6940bc7ffbc84775fad"}, +] + +[package.dependencies] +click = "*" +olefile = "*" +typing_extensions = ">=4.9.0" + [[package]] name = "python-pptx" -version = "0.6.21" -description = "Generate and manipulate Open XML PowerPoint (.pptx) files" +version = "1.0.2" +description = "Create, read, and update PowerPoint 2007+ (.pptx) files." optional = true -python-versions = "*" +python-versions = ">=3.8" groups = ["main"] markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" files = [ - {file = "python-pptx-0.6.21.tar.gz", hash = "sha256:7798a2aaf89563565b3c7120c0acfe9aff775db0db3580544e3bf4840c2e378f"}, + {file = "python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba"}, + {file = "python_pptx-1.0.2.tar.gz", hash = "sha256:479a8af0eaf0f0d76b6f00b0887732874ad2e3188230315290cd1f9dd9cc7095"}, ] [package.dependencies] lxml = ">=3.1.0" Pillow = ">=3.3.2" +typing-extensions = ">=4.9.0" XlsxWriter = ">=0.5.7" [[package]] @@ -5339,7 +5546,7 @@ description = "A utility belt for advanced users of python-requests" optional = true python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" groups = ["main"] -markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"vector-db-based\"" +markers = "(extra == \"vector-db-based\" or extra == \"file-based\") and (python_version <= \"3.11\" or python_version >= \"3.12.0\")" files = [ {file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"}, {file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"}, @@ -5968,22 +6175,6 @@ typing-extensions = {version = ">=4.10.0", markers = "python_version < \"3.13\"" [package.extras] full = ["httpx (>=0.27.0,<0.29.0)", "itsdangerous", "jinja2", "python-multipart (>=0.0.18)", "pyyaml"] -[[package]] -name = "tabulate" -version = "0.9.0" -description = "Pretty-print tabular data" -optional = true -python-versions = ">=3.7" -groups = ["main"] -markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" -files = [ - {file = "tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"}, - {file = "tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"}, -] - -[package.extras] -widechars = ["wcwidth"] - [[package]] name = "tenacity" version = "8.5.0" @@ -6306,7 +6497,7 @@ description = "Runtime typing introspection tools" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"vector-db-based\"" +markers = "(extra == \"vector-db-based\" or extra == \"file-based\") and (python_version <= \"3.11\" or python_version >= \"3.12.0\")" files = [ {file = "typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7"}, {file = "typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464"}, @@ -6378,85 +6569,87 @@ files = [ [[package]] name = "unstructured" -version = "0.10.27" +version = "0.18.18" description = "A library that prepares raw documents for downstream ML tasks." optional = true -python-versions = ">=3.7.0" +python-versions = ">=3.10.0" groups = ["main"] markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" files = [ - {file = "unstructured-0.10.27-py3-none-any.whl", hash = "sha256:3a8a8e44302388ddc39c184059e8b4458f1cdc58032540b9af7d85f6c3eca3be"}, - {file = "unstructured-0.10.27.tar.gz", hash = "sha256:f567b5c4385993a9ab48db5563dd7b413aac4f2002bb22e6250496ea8f440f5e"}, + {file = "unstructured-0.18.18-py3-none-any.whl", hash = "sha256:d5189bdd5e2a1c5ed3cc289cfb4fb483c6f2dd544b42744bdc5b81d3388ea527"}, + {file = "unstructured-0.18.18.tar.gz", hash = "sha256:cfe6c84a36d374e5767930e13cfc10622357b3b68a5b7c735fdb1eeca08c6b57"}, ] [package.dependencies] backoff = "*" beautifulsoup4 = "*" -chardet = "*" +charset-normalizer = "*" dataclasses-json = "*" emoji = "*" filetype = "*" +html5lib = "*" langdetect = "*" lxml = "*" nltk = "*" numpy = "*" -python-docx = {version = ">=1.0.1", optional = true, markers = "extra == \"docx\""} +psutil = "*" +python-docx = {version = ">=1.1.2", optional = true, markers = "extra == \"docx\""} python-iso639 = "*" python-magic = "*" -python-pptx = {version = "<=0.6.21", optional = true, markers = "extra == \"pptx\""} +python-oxmsg = "*" +python-pptx = {version = ">=1.0.1", optional = true, markers = "extra == \"pptx\""} rapidfuzz = "*" requests = "*" -tabulate = "*" +tqdm = "*" typing-extensions = "*" +unstructured-client = "*" +wrapt = "*" [package.extras] -airtable = ["pyairtable"] -all-docs = ["markdown", "msg-parser", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pypandoc", "python-docx (>=1.0.1)", "python-pptx (<=0.6.21)", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] -azure = ["adlfs", "fsspec (==2023.9.1)"] -azure-cognitive-search = ["azure-search-documents"] -bedrock = ["boto3", "langchain"] -biomed = ["bs4"] -box = ["boxfs", "fsspec (==2023.9.1)"] -confluence = ["atlassian-python-api"] +all-docs = ["effdet", "google-cloud-vision", "markdown", "msoffcrypto-tool", "networkx", "onnx (>=1.17.0)", "onnxruntime (>=1.19.0)", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pi_heif", "pikepdf", "pypandoc", "pypdf", "python-docx (>=1.1.2)", "python-pptx (>=1.0.1)", "unstructured-inference (>=1.0.5)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] csv = ["pandas"] -delta-table = ["deltalake", "fsspec (==2023.9.1)"] -discord = ["discord-py"] -doc = ["python-docx (>=1.0.1)"] -docx = ["python-docx (>=1.0.1)"] -dropbox = ["dropboxdrivefs", "fsspec (==2023.9.1)"] -elasticsearch = ["elasticsearch", "jq"] -embed-huggingface = ["huggingface", "langchain", "sentence-transformers"] +doc = ["python-docx (>=1.1.2)"] +docx = ["python-docx (>=1.1.2)"] epub = ["pypandoc"] -gcs = ["bs4", "fsspec (==2023.9.1)", "gcsfs"] -github = ["pygithub (>1.58.0)"] -gitlab = ["python-gitlab"] -google-drive = ["google-api-python-client"] huggingface = ["langdetect", "sacremoses", "sentencepiece", "torch", "transformers"] -image = ["onnx", "pdf2image", "pdfminer.six", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)"] -jira = ["atlassian-python-api"] -local-inference = ["markdown", "msg-parser", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pypandoc", "python-docx (>=1.0.1)", "python-pptx (<=0.6.21)", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] +image = ["effdet", "google-cloud-vision", "onnx (>=1.17.0)", "onnxruntime (>=1.19.0)", "pdf2image", "pdfminer.six", "pi_heif", "pikepdf", "pypdf", "unstructured-inference (>=1.0.5)", "unstructured.pytesseract (>=0.3.12)"] +local-inference = ["effdet", "google-cloud-vision", "markdown", "msoffcrypto-tool", "networkx", "onnx (>=1.17.0)", "onnxruntime (>=1.19.0)", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pi_heif", "pikepdf", "pypandoc", "pypdf", "python-docx (>=1.1.2)", "python-pptx (>=1.0.1)", "unstructured-inference (>=1.0.5)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] md = ["markdown"] -msg = ["msg-parser"] -notion = ["htmlBuilder", "notion-client"] -odt = ["pypandoc", "python-docx (>=1.0.1)"] -onedrive = ["Office365-REST-Python-Client (<2.4.3)", "bs4", "msal"] -openai = ["langchain", "openai", "tiktoken"] +odt = ["pypandoc", "python-docx (>=1.1.2)"] org = ["pypandoc"] -outlook = ["Office365-REST-Python-Client (<2.4.3)", "msal"] -paddleocr = ["unstructured.paddleocr (==2.6.1.3)"] -pdf = ["onnx", "pdf2image", "pdfminer.six", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)"] -ppt = ["python-pptx (<=0.6.21)"] -pptx = ["python-pptx (<=0.6.21)"] -reddit = ["praw"] +paddleocr = ["paddlepaddle (>=3.0.0b1)", "unstructured.paddleocr (==2.10.0)"] +pdf = ["effdet", "google-cloud-vision", "onnx (>=1.17.0)", "onnxruntime (>=1.19.0)", "pdf2image", "pdfminer.six", "pi_heif", "pikepdf", "pypdf", "unstructured-inference (>=1.0.5)", "unstructured.pytesseract (>=0.3.12)"] +ppt = ["python-pptx (>=1.0.1)"] +pptx = ["python-pptx (>=1.0.1)"] rst = ["pypandoc"] rtf = ["pypandoc"] -s3 = ["fsspec (==2023.9.1)", "s3fs"] -salesforce = ["simple-salesforce"] -sharepoint = ["Office365-REST-Python-Client (<2.4.3)", "msal"] -slack = ["slack-sdk"] tsv = ["pandas"] -wikipedia = ["wikipedia"] -xlsx = ["networkx", "openpyxl", "pandas", "xlrd"] +xlsx = ["msoffcrypto-tool", "networkx", "openpyxl", "pandas", "xlrd"] + +[[package]] +name = "unstructured-client" +version = "0.32.3" +description = "Python Client SDK for Unstructured API" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "unstructured_client-0.32.3-py3-none-any.whl", hash = "sha256:50b8198a3c3f984bdb53d848be7665d352093a99841858976f596cc2105903ec"}, + {file = "unstructured_client-0.32.3.tar.gz", hash = "sha256:1426d03325f7b93daad524ad2b954f1e7cceb0c15e67a4f4e88b49220dd2472c"}, +] + +[package.dependencies] +aiofiles = ">=24.1.0" +cryptography = ">=3.1" +eval-type-backport = ">=0.2.0" +httpx = ">=0.27.0" +nest-asyncio = ">=1.6.0" +pydantic = ">=2.10.3" +pypdf = ">=4.0" +python-dateutil = ">=2.8.2" +requests-toolbelt = ">=1.0.0" +typing-inspection = ">=0.4.0" [[package]] name = "unstructured-pytesseract" @@ -6580,6 +6773,19 @@ files = [ [package.dependencies] bracex = ">=2.1.1" +[[package]] +name = "webencodings" +version = "0.5.1" +description = "Character encoding aliases for legacy web content" +optional = true +python-versions = "*" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"}, + {file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"}, +] + [[package]] name = "werkzeug" version = "3.1.3" @@ -6775,7 +6981,7 @@ files = [ {file = "wrapt-1.17.3-py3-none-any.whl", hash = "sha256:7171ae35d2c33d326ac19dd8facb1e82e5fd04ef8c6c0e394d7af55a55051c22"}, {file = "wrapt-1.17.3.tar.gz", hash = "sha256:f66eb08feaa410fe4eebd17f2a2c8e2e46d3476e9f8c783daa8e09e0faa666d0"}, ] -markers = {main = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"manifest-server\"", dev = "python_version <= \"3.11\" or python_version >= \"3.12\""} +markers = {main = "(extra == \"file-based\" or extra == \"manifest-server\") and (python_version <= \"3.11\" or python_version >= \"3.12.0\")", dev = "python_version <= \"3.11\" or python_version >= \"3.12\""} [[package]] name = "xlsxwriter" @@ -7037,7 +7243,7 @@ cffi = ["cffi (>=1.17,<2.0)", "cffi (>=2.0.0b)"] [extras] dev = ["pytest"] -file-based = ["avro", "fastavro", "markdown", "openpyxl", "pdf2image", "pdfminer.six", "pyarrow", "pytesseract", "python-calamine", "python-snappy", "unstructured", "unstructured.pytesseract"] +file-based = ["avro", "fastavro", "markdown", "openpyxl", "pdf2image", "pdfminer.six", "pi-heif", "pyarrow", "pytesseract", "python-calamine", "python-snappy", "unstructured", "unstructured.pytesseract"] manifest-server = ["ddtrace", "fastapi", "uvicorn"] sql = ["sqlalchemy"] vector-db-based = ["cohere", "langchain_community", "langchain_core", "langchain_text_splitters", "openai", "tiktoken"] @@ -7045,4 +7251,4 @@ vector-db-based = ["cohere", "langchain_community", "langchain_core", "langchain [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.14" -content-hash = "b785d39f246498c8facd7854999dbdbfb78808489a09922dd3a1551be331ea7d" +content-hash = "59c1452c4b6873805b4f8c5c1bcbc6f5aeca3c880df8cdbe1898023f503e2efa" diff --git a/pyproject.toml b/pyproject.toml index bcdab217b..acf8f5792 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,7 +77,8 @@ openpyxl = { version = "^3.1.0", optional = true } python-snappy = { version = "0.7.3", optional = true } # TODO: remove if unused tiktoken = { version = "0.8.0", optional = true } nltk = { version = "3.9.1", optional = true } -unstructured = { version = "0.10.27", extras = ["docx", "pptx"], optional = true } +unstructured = { version = "0.18.18", extras = ["docx", "pptx"], optional = true } +pi-heif = { version = ">=0.16.0", optional = true } "unstructured.pytesseract" = { version = ">=0.3.12", optional = true } pyjwt = "^2.8.0" cryptography = ">=44.0.0,<45.0.0" # Constrained as transitive dependency due to a bug in newer versions @@ -121,7 +122,7 @@ deptry = "^0.23.0" dagger-io = "0.19.0" [tool.poetry.extras] -file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown", "python-calamine", "openpyxl", "python-snappy"] +file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pi-heif", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown", "python-calamine", "openpyxl", "python-snappy"] vector-db-based = ["langchain_community", "langchain_core", "langchain_text_splitters", "openai", "cohere", "tiktoken"] sql = ["sqlalchemy"] dev = ["pytest"] @@ -261,6 +262,7 @@ DEP002 = [ "python-snappy", "tiktoken", "unstructured.pytesseract", + "pi-heif", ] # DEP003: Project should not use transitive dependencies. diff --git a/unit_tests/sources/file_based/file_types/test_unstructured_parser.py b/unit_tests/sources/file_based/file_types/test_unstructured_parser.py index 374a02eed..c5dd334dc 100644 --- a/unit_tests/sources/file_based/file_types/test_unstructured_parser.py +++ b/unit_tests/sources/file_based/file_types/test_unstructured_parser.py @@ -89,8 +89,9 @@ def test_infer_schema(mock_detect_filetype, filetype, format_config, raises): UnstructuredParser().infer_schema(config, fake_file, stream_reader, logger) ) else: + fake_file.mime_type = None schema = loop.run_until_complete( - UnstructuredParser().infer_schema(config, MagicMock(), MagicMock(), MagicMock()) + UnstructuredParser().infer_schema(config, fake_file, stream_reader, logger) ) assert schema == { "content": { @@ -227,15 +228,26 @@ def test_infer_schema(mock_detect_filetype, filetype, format_config, raises): ), ], ) -@patch("unstructured.partition.pdf.partition_pdf") -@patch("unstructured.partition.pptx.partition_pptx") -@patch("unstructured.partition.docx.partition_docx") +@patch("airbyte_cdk.sources.file_based.file_types.unstructured_parser._import_unstructured") +@patch( + "airbyte_cdk.sources.file_based.file_types.unstructured_parser.unstructured_partition_pdf", + new_callable=MagicMock, +) +@patch( + "airbyte_cdk.sources.file_based.file_types.unstructured_parser.unstructured_partition_pptx", + new_callable=MagicMock, +) +@patch( + "airbyte_cdk.sources.file_based.file_types.unstructured_parser.unstructured_partition_docx", + new_callable=MagicMock, +) @patch("airbyte_cdk.sources.file_based.file_types.unstructured_parser.detect_filetype") def test_parse_records( mock_detect_filetype, mock_partition_docx, mock_partition_pptx, mock_partition_pdf, + mock_import_unstructured, filetype, format_config, parse_result, @@ -618,6 +630,19 @@ def test_check_config( ), ], ) +@patch("airbyte_cdk.sources.file_based.file_types.unstructured_parser._import_unstructured") +@patch( + "airbyte_cdk.sources.file_based.file_types.unstructured_parser.unstructured_partition_pdf", + new_callable=MagicMock, +) +@patch( + "airbyte_cdk.sources.file_based.file_types.unstructured_parser.unstructured_partition_pptx", + new_callable=MagicMock, +) +@patch( + "airbyte_cdk.sources.file_based.file_types.unstructured_parser.unstructured_partition_docx", + new_callable=MagicMock, +) @patch("airbyte_cdk.sources.file_based.file_types.unstructured_parser.requests") @patch("airbyte_cdk.sources.file_based.file_types.unstructured_parser.detect_filetype") @patch("time.sleep", side_effect=lambda _: None) @@ -625,6 +650,10 @@ def test_parse_records_remotely( time_mock, mock_detect_filetype, requests_mock, + mock_partition_docx, + mock_partition_pptx, + mock_partition_pdf, + mock_import_unstructured, filetype, format_config, raises_for_status, diff --git a/unit_tests/sources/file_based/scenarios/unstructured_scenarios.py b/unit_tests/sources/file_based/scenarios/unstructured_scenarios.py index c0db46e7a..84ec258d8 100644 --- a/unit_tests/sources/file_based/scenarios/unstructured_scenarios.py +++ b/unit_tests/sources/file_based/scenarios/unstructured_scenarios.py @@ -434,7 +434,7 @@ { "data": { "document_key": "sample.pdf", - "content": "# Hello World", + "_ab_source_file_parse_error": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. Contact Support if you need assistance.\nfilename=sample.pdf message=PDF parsing requires the 'unstructured_inference' package. Install it with: pip install unstructured-inference", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "sample.pdf", }, @@ -443,7 +443,7 @@ { "data": { "document_key": "sample.docx", - "content": "# Content", + "content": "Content", "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", "_ab_source_file_url": "sample.docx", }, @@ -510,7 +510,7 @@ { "data": { "document_key": "sample.pdf", - "_ab_source_file_parse_error": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. Contact Support if you need assistance.\nfilename=sample.pdf message=No /Root object! - Is this really a PDF?", + "_ab_source_file_parse_error": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. Contact Support if you need assistance.\nfilename=sample.pdf message=PDF parsing requires the 'unstructured_inference' package. Install it with: pip install unstructured-inference", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "sample.pdf", }, @@ -578,7 +578,7 @@ { "data": { "document_key": "pdf_without_extension", - "content": "# Hello World", + "_ab_source_file_parse_error": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. Contact Support if you need assistance.\nfilename=pdf_without_extension message=PDF parsing requires the 'unstructured_inference' package. Install it with: pip install unstructured-inference", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "pdf_without_extension", }, @@ -587,7 +587,7 @@ { "data": { "document_key": "docx_without_extension", - "content": "# Content", + "content": "Content", "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", "_ab_source_file_url": "docx_without_extension", },