diff --git a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py index f55675e0a..8a1daac4f 100644 --- a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py @@ -6,19 +6,13 @@ import traceback from datetime import datetime from io import BytesIO, IOBase -from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union +from typing import IO, Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union, cast import backoff import dpath import nltk import requests -from unstructured.file_utils.filetype import ( - EXT_TO_FILETYPE, - FILETYPE_TO_MIMETYPE, - STR_TO_FILETYPE, - FileType, - detect_filetype, -) +from unstructured.file_utils.filetype import FileType, detect_filetype from airbyte_cdk.models import FailureType from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig @@ -85,14 +79,21 @@ def _import_unstructured() -> None: global unstructured_partition_docx global unstructured_partition_pptx from unstructured.partition.docx import partition_docx - from unstructured.partition.pdf import partition_pdf from unstructured.partition.pptx import partition_pptx - # separate global variables to properly propagate typing - unstructured_partition_pdf = partition_pdf unstructured_partition_docx = partition_docx unstructured_partition_pptx = partition_pptx + try: + from unstructured.partition.pdf import partition_pdf + + unstructured_partition_pdf = partition_pdf + except ImportError: + logger = logging.getLogger(__name__) + logger.info( + "Could not import unstructured.partition.pdf (requires unstructured_inference). PDF parsing will be unavailable." + ) + def user_error(e: Exception) -> bool: """ @@ -207,13 +208,6 @@ def _read_file( logger: logging.Logger, ) -> str: _import_unstructured() - if ( - (not unstructured_partition_pdf) - or (not unstructured_partition_docx) - or (not unstructured_partition_pptx) - ): - # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point) - raise Exception("unstructured library is not available") filetype: FileType | None = self._get_filetype(file_handle, remote_file) @@ -335,7 +329,7 @@ def _read_file_remotely( data = self._params_to_dict(format.parameters, strategy) - file_data = {"files": ("filename", file_handle, FILETYPE_TO_MIMETYPE[filetype])} + file_data = {"files": ("filename", file_handle, filetype.mime_type)} response = requests.post( f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data @@ -356,32 +350,38 @@ def _read_file_locally( self, file_handle: IOBase, filetype: FileType, strategy: str, remote_file: RemoteFile ) -> str: _import_unstructured() - if ( - (not unstructured_partition_pdf) - or (not unstructured_partition_docx) - or (not unstructured_partition_pptx) - ): - # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point) - raise Exception("unstructured library is not available") file: Any = file_handle - # before the parsing logic is entered, the file is read completely to make sure it is in local memory file_handle.seek(0) file_handle.read() file_handle.seek(0) try: if filetype == FileType.PDF: - # for PDF, read the file into a BytesIO object because some code paths in pdf parsing are doing an instance check on the file object and don't work with file-like objects + if not unstructured_partition_pdf: + raise self._create_parse_error( + remote_file, + "PDF parsing requires the 'unstructured_inference' package. Install it with: pip install unstructured-inference", + ) file_handle.seek(0) with BytesIO(file_handle.read()) as file: file_handle.seek(0) elements = unstructured_partition_pdf(file=file, strategy=strategy) elif filetype == FileType.DOCX: + if not unstructured_partition_docx: + raise self._create_parse_error( + remote_file, "DOCX partition function is not available" + ) elements = unstructured_partition_docx(file=file) elif filetype == FileType.PPTX: + if not unstructured_partition_pptx: + raise self._create_parse_error( + remote_file, "PPTX partition function is not available" + ) elements = unstructured_partition_pptx(file=file) + except RecordParseError: + raise except Exception as e: raise self._create_parse_error(remote_file, str(e)) @@ -405,8 +405,10 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT 2. Use the file name if available 3. Use the file content """ - if remote_file.mime_type and remote_file.mime_type in STR_TO_FILETYPE: - return STR_TO_FILETYPE[remote_file.mime_type] + if remote_file.mime_type: + ft = FileType.from_mime_type(remote_file.mime_type) + if ft is not None and ft != FileType.UNK: + return ft # set name to none, otherwise unstructured will try to get the modified date from the local file system if hasattr(file, "name"): @@ -418,7 +420,7 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT file_type: FileType | None = None try: file_type = detect_filetype( - filename=remote_file.uri, + file_path=remote_file.uri, ) except Exception: # Path doesn't exist locally. Try something else... @@ -427,16 +429,17 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT if file_type and file_type != FileType.UNK: return file_type - type_based_on_content = detect_filetype(file=file) - file.seek(0) # detect_filetype is reading to read the file content, so we need to reset + extension = "." + remote_file.uri.split(".")[-1].lower() + ext_type = FileType.from_extension(extension) + if ext_type is not None: + return ext_type + + type_based_on_content = detect_filetype(file=cast(IO[bytes], file)) + file.seek(0) if type_based_on_content and type_based_on_content != FileType.UNK: return type_based_on_content - extension = "." + remote_file.uri.split(".")[-1].lower() - if extension in EXT_TO_FILETYPE: - return EXT_TO_FILETYPE[extension] - return None def _supported_file_types(self) -> List[Any]: diff --git a/poetry.lock b/poetry.lock index 55c2fa668..fcf0a216d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,5 +1,18 @@ # This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand. +[[package]] +name = "aiofiles" +version = "25.1.0" +description = "File support for asyncio." +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "aiofiles-25.1.0-py3-none-any.whl", hash = "sha256:abe311e527c862958650f9438e859c1fa7568a141b22abcd015e120e86a85695"}, + {file = "aiofiles-25.1.0.tar.gz", hash = "sha256:a8d728f0a29de45dc521f18f07297428d56992a742f0cd2701ba86e44d23d5b2"}, +] + [[package]] name = "aiohappyeyeballs" version = "2.4.4" @@ -501,19 +514,6 @@ files = [ [package.dependencies] pycparser = "*" -[[package]] -name = "chardet" -version = "5.2.0" -description = "Universal encoding detector for Python 3" -optional = true -python-versions = ">=3.7" -groups = ["main"] -markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" -files = [ - {file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"}, - {file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"}, -] - [[package]] name = "charset-normalizer" version = "3.4.1" @@ -1283,6 +1283,22 @@ files = [ {file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"}, ] +[[package]] +name = "eval-type-backport" +version = "0.3.1" +description = "Like `typing._eval_type`, but lets older Python versions use newer typing features." +optional = true +python-versions = ">=3.7" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "eval_type_backport-0.3.1-py3-none-any.whl", hash = "sha256:279ab641905e9f11129f56a8a78f493518515b83402b860f6f06dd7c011fdfa8"}, + {file = "eval_type_backport-0.3.1.tar.gz", hash = "sha256:57e993f7b5b69d271e37482e62f74e76a0276c82490cf8e4f0dffeb6b332d5ed"}, +] + +[package.extras] +tests = ["pytest"] + [[package]] name = "exceptiongroup" version = "1.3.0" @@ -2052,6 +2068,29 @@ files = [ [package.extras] tests = ["pytest"] +[[package]] +name = "html5lib" +version = "1.1" +description = "HTML parser based on the WHATWG HTML specification" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "html5lib-1.1-py2.py3-none-any.whl", hash = "sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d"}, + {file = "html5lib-1.1.tar.gz", hash = "sha256:b2e5b40261e20f354d198eae92afc10d750afb487ed5e50f9c4eaf07c184146f"}, +] + +[package.dependencies] +six = ">=1.9" +webencodings = "*" + +[package.extras] +all = ["chardet (>=2.2)", "genshi", "lxml"] +chardet = ["chardet (>=2.2)"] +genshi = ["genshi"] +lxml = ["lxml"] + [[package]] name = "httpcore" version = "1.0.9" @@ -2616,6 +2655,38 @@ dev = ["black", "flake8", "isort", "pre-commit", "pyproject-flake8"] doc = ["myst-parser", "sphinx", "sphinx-book-theme"] test = ["coverage", "pytest", "pytest-cov"] +[[package]] +name = "llvmlite" +version = "0.46.0" +description = "lightweight wrapper around basic LLVM functionality" +optional = true +python-versions = ">=3.10" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "llvmlite-0.46.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4323177e936d61ae0f73e653e2e614284d97d14d5dd12579adc92b6c2b0597b0"}, + {file = "llvmlite-0.46.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0a2d461cb89537b7c20feb04c46c32e12d5ad4f0896c9dfc0f60336219ff248e"}, + {file = "llvmlite-0.46.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b1f6595a35b7b39c3518b85a28bf18f45e075264e4b2dce3f0c2a4f232b4a910"}, + {file = "llvmlite-0.46.0-cp310-cp310-win_amd64.whl", hash = "sha256:e7a34d4aa6f9a97ee006b504be6d2b8cb7f755b80ab2f344dda1ef992f828559"}, + {file = "llvmlite-0.46.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:82f3d39b16f19aa1a56d5fe625883a6ab600d5cc9ea8906cca70ce94cabba067"}, + {file = "llvmlite-0.46.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a3df43900119803bbc52720e758c76f316a9a0f34612a886862dfe0a5591a17e"}, + {file = "llvmlite-0.46.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:de183fefc8022d21b0aa37fc3e90410bc3524aed8617f0ff76732fc6c3af5361"}, + {file = "llvmlite-0.46.0-cp311-cp311-win_amd64.whl", hash = "sha256:e8b10bc585c58bdffec9e0c309bb7d51be1f2f15e169a4b4d42f2389e431eb93"}, + {file = "llvmlite-0.46.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6b9588ad4c63b4f0175a3984b85494f0c927c6b001e3a246a3a7fb3920d9a137"}, + {file = "llvmlite-0.46.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3535bd2bb6a2d7ae4012681ac228e5132cdb75fefb1bcb24e33f2f3e0c865ed4"}, + {file = "llvmlite-0.46.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4cbfd366e60ff87ea6cc62f50bc4cd800ebb13ed4c149466f50cf2163a473d1e"}, + {file = "llvmlite-0.46.0-cp312-cp312-win_amd64.whl", hash = "sha256:398b39db462c39563a97b912d4f2866cd37cba60537975a09679b28fbbc0fb38"}, + {file = "llvmlite-0.46.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:30b60892d034bc560e0ec6654737aaa74e5ca327bd8114d82136aa071d611172"}, + {file = "llvmlite-0.46.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6cc19b051753368a9c9f31dc041299059ee91aceec81bd57b0e385e5d5bf1a54"}, + {file = "llvmlite-0.46.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bca185892908f9ede48c0acd547fe4dc1bafefb8a4967d47db6cf664f9332d12"}, + {file = "llvmlite-0.46.0-cp313-cp313-win_amd64.whl", hash = "sha256:67438fd30e12349ebb054d86a5a1a57fd5e87d264d2451bcfafbbbaa25b82a35"}, + {file = "llvmlite-0.46.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:d252edfb9f4ac1fcf20652258e3f102b26b03eef738dc8a6ffdab7d7d341d547"}, + {file = "llvmlite-0.46.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:379fdd1c59badeff8982cb47e4694a6143bec3bb49aa10a466e095410522064d"}, + {file = "llvmlite-0.46.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2e8cbfff7f6db0fa2c771ad24154e2a7e457c2444d7673e6de06b8b698c3b269"}, + {file = "llvmlite-0.46.0-cp314-cp314-win_amd64.whl", hash = "sha256:7821eda3ec1f18050f981819756631d60b6d7ab1a6cf806d9efefbe3f4082d61"}, + {file = "llvmlite-0.46.0.tar.gz", hash = "sha256:227c9fd6d09dce2783c18b754b7cd9d9b3b3515210c46acc2d3c5badd9870ceb"}, +] + [[package]] name = "lxml" version = "5.3.0" @@ -3262,6 +3333,19 @@ files = [ ] markers = {main = "(extra == \"vector-db-based\" or extra == \"file-based\") and (python_version <= \"3.11\" or python_version >= \"3.12.0\")", dev = "python_version <= \"3.11\" or python_version >= \"3.12\""} +[[package]] +name = "nest-asyncio" +version = "1.6.0" +description = "Patch asyncio to allow nested event loops" +optional = true +python-versions = ">=3.5" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"}, + {file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"}, +] + [[package]] name = "nltk" version = "3.9.1" @@ -3289,6 +3373,42 @@ plot = ["matplotlib"] tgrep = ["pyparsing"] twitter = ["twython"] +[[package]] +name = "numba" +version = "0.63.1" +description = "compiling Python code using LLVM" +optional = true +python-versions = ">=3.10" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "numba-0.63.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c6d6bf5bf00f7db629305caaec82a2ffb8abe2bf45eaad0d0738dc7de4113779"}, + {file = "numba-0.63.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:08653d0dfc9cc9c4c9a8fba29ceb1f2d5340c3b86c4a7e5e07e42b643bc6a2f4"}, + {file = "numba-0.63.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f09eebf5650246ce2a4e9a8d38270e2d4b0b0ae978103bafb38ed7adc5ea906e"}, + {file = "numba-0.63.1-cp310-cp310-win_amd64.whl", hash = "sha256:f8bba17421d865d8c0f7be2142754ebce53e009daba41c44cf6909207d1a8d7d"}, + {file = "numba-0.63.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b33db00f18ccc790ee9911ce03fcdfe9d5124637d1ecc266f5ae0df06e02fec3"}, + {file = "numba-0.63.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7d31ea186a78a7c0f6b1b2a3fe68057fdb291b045c52d86232b5383b6cf4fc25"}, + {file = "numba-0.63.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ed3bb2fbdb651d6aac394388130a7001aab6f4541837123a4b4ab8b02716530c"}, + {file = "numba-0.63.1-cp311-cp311-win_amd64.whl", hash = "sha256:1ecbff7688f044b1601be70113e2fb1835367ee0b28ffa8f3adf3a05418c5c87"}, + {file = "numba-0.63.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2819cd52afa5d8d04e057bdfd54367575105f8829350d8fb5e4066fb7591cc71"}, + {file = "numba-0.63.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5cfd45dbd3d409e713b1ccfdc2ee72ca82006860254429f4ef01867fdba5845f"}, + {file = "numba-0.63.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:69a599df6976c03b7ecf15d05302696f79f7e6d10d620367407517943355bcb0"}, + {file = "numba-0.63.1-cp312-cp312-win_amd64.whl", hash = "sha256:bbad8c63e4fc7eb3cdb2c2da52178e180419f7969f9a685f283b313a70b92af3"}, + {file = "numba-0.63.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:0bd4fd820ef7442dcc07da184c3f54bb41d2bdb7b35bacf3448e73d081f730dc"}, + {file = "numba-0.63.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:53de693abe4be3bd4dee38e1c55f01c55ff644a6a3696a3670589e6e4c39cde2"}, + {file = "numba-0.63.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:81227821a72a763c3d4ac290abbb4371d855b59fdf85d5af22a47c0e86bf8c7e"}, + {file = "numba-0.63.1-cp313-cp313-win_amd64.whl", hash = "sha256:eb227b07c2ac37b09432a9bda5142047a2d1055646e089d4a240a2643e508102"}, + {file = "numba-0.63.1-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:f180883e5508940cc83de8a8bea37fc6dd20fbe4e5558d4659b8b9bef5ff4731"}, + {file = "numba-0.63.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f0938764afa82a47c0e895637a6c55547a42c9e1d35cac42285b1fa60a8b02bb"}, + {file = "numba-0.63.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f90a929fa5094e062d4e0368ede1f4497d5e40f800e80aa5222c4734236a2894"}, + {file = "numba-0.63.1-cp314-cp314-win_amd64.whl", hash = "sha256:8d6d5ce85f572ed4e1a135dbb8c0114538f9dd0e3657eeb0bb64ab204cbe2a8f"}, + {file = "numba-0.63.1.tar.gz", hash = "sha256:b320aa675d0e3b17b40364935ea52a7b1c670c9037c39cf92c49502a75902f4b"}, +] + +[package.dependencies] +llvmlite = "==0.46.*" +numpy = ">=1.22,<2.4" + [[package]] name = "numpy" version = "1.26.4" @@ -3421,6 +3541,22 @@ files = [ {file = "numpy-2.3.4.tar.gz", hash = "sha256:a7d018bfedb375a8d979ac758b120ba846a7fe764911a64465fd87b8729f4a6a"}, ] +[[package]] +name = "olefile" +version = "0.47" +description = "Python package to parse, read and write Microsoft OLE2 files (Structured Storage or Compound Document, Microsoft Office)" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "olefile-0.47-py2.py3-none-any.whl", hash = "sha256:543c7da2a7adadf21214938bb79c83ea12b473a4b6ee4ad4bf854e7715e13d1f"}, + {file = "olefile-0.47.zip", hash = "sha256:599383381a0bf3dfbd932ca0ca6515acd174ed48870cbf7fee123d698c192c1c"}, +] + +[package.extras] +tests = ["pytest", "pytest-cov"] + [[package]] name = "openai" version = "0.27.9" @@ -3851,15 +3987,15 @@ pillow = "*" [[package]] name = "pdfminer-six" -version = "20221105" +version = "20260107" description = "PDF parser and analyzer" optional = true -python-versions = ">=3.6" +python-versions = ">=3.10" groups = ["main"] markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" files = [ - {file = "pdfminer.six-20221105-py3-none-any.whl", hash = "sha256:1eaddd712d5b2732f8ac8486824533514f8ba12a0787b3d5fe1e686cd826532d"}, - {file = "pdfminer.six-20221105.tar.gz", hash = "sha256:8448ab7b939d18b64820478ecac5394f482d7a79f5f7eaa7703c6c959c175e1d"}, + {file = "pdfminer_six-20260107-py3-none-any.whl", hash = "sha256:366585ba97e80dffa8f00cebe303d2f381884d8637af4ce422f1df3ef38111a9"}, + {file = "pdfminer_six-20260107.tar.gz", hash = "sha256:96bfd431e3577a55a0efd25676968ca4ce8fd5b53f14565f85716ff363889602"}, ] [package.dependencies] @@ -3867,8 +4003,6 @@ charset-normalizer = ">=2.0.0" cryptography = ">=36.0.0" [package.extras] -dev = ["black", "mypy (==0.931)", "nox", "pytest"] -docs = ["sphinx", "sphinx-argparse"] image = ["Pillow"] [[package]] @@ -4185,6 +4319,42 @@ files = [ {file = "protobuf-5.29.4.tar.gz", hash = "sha256:4f1dfcd7997b31ef8f53ec82781ff434a28bf71d9102ddde14d076adcfc78c99"}, ] +[[package]] +name = "psutil" +version = "7.2.2" +description = "Cross-platform lib for process and system monitoring." +optional = true +python-versions = ">=3.6" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "psutil-7.2.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:2edccc433cbfa046b980b0df0171cd25bcaeb3a68fe9022db0979e7aa74a826b"}, + {file = "psutil-7.2.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e78c8603dcd9a04c7364f1a3e670cea95d51ee865e4efb3556a3a63adef958ea"}, + {file = "psutil-7.2.2-cp313-cp313t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1a571f2330c966c62aeda00dd24620425d4b0cc86881c89861fbc04549e5dc63"}, + {file = "psutil-7.2.2-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:917e891983ca3c1887b4ef36447b1e0873e70c933afc831c6b6da078ba474312"}, + {file = "psutil-7.2.2-cp313-cp313t-win_amd64.whl", hash = "sha256:ab486563df44c17f5173621c7b198955bd6b613fb87c71c161f827d3fb149a9b"}, + {file = "psutil-7.2.2-cp313-cp313t-win_arm64.whl", hash = "sha256:ae0aefdd8796a7737eccea863f80f81e468a1e4cf14d926bd9b6f5f2d5f90ca9"}, + {file = "psutil-7.2.2-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:eed63d3b4d62449571547b60578c5b2c4bcccc5387148db46e0c2313dad0ee00"}, + {file = "psutil-7.2.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7b6d09433a10592ce39b13d7be5a54fbac1d1228ed29abc880fb23df7cb694c9"}, + {file = "psutil-7.2.2-cp314-cp314t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1fa4ecf83bcdf6e6c8f4449aff98eefb5d0604bf88cb883d7da3d8d2d909546a"}, + {file = "psutil-7.2.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e452c464a02e7dc7822a05d25db4cde564444a67e58539a00f929c51eddda0cf"}, + {file = "psutil-7.2.2-cp314-cp314t-win_amd64.whl", hash = "sha256:c7663d4e37f13e884d13994247449e9f8f574bc4655d509c3b95e9ec9e2b9dc1"}, + {file = "psutil-7.2.2-cp314-cp314t-win_arm64.whl", hash = "sha256:11fe5a4f613759764e79c65cf11ebdf26e33d6dd34336f8a337aa2996d71c841"}, + {file = "psutil-7.2.2-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:ed0cace939114f62738d808fdcecd4c869222507e266e574799e9c0faa17d486"}, + {file = "psutil-7.2.2-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:1a7b04c10f32cc88ab39cbf606e117fd74721c831c98a27dc04578deb0c16979"}, + {file = "psutil-7.2.2-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:076a2d2f923fd4821644f5ba89f059523da90dc9014e85f8e45a5774ca5bc6f9"}, + {file = "psutil-7.2.2-cp36-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b0726cecd84f9474419d67252add4ac0cd9811b04d61123054b9fb6f57df6e9e"}, + {file = "psutil-7.2.2-cp36-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:fd04ef36b4a6d599bbdb225dd1d3f51e00105f6d48a28f006da7f9822f2606d8"}, + {file = "psutil-7.2.2-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b58fabe35e80b264a4e3bb23e6b96f9e45a3df7fb7eed419ac0e5947c61e47cc"}, + {file = "psutil-7.2.2-cp37-abi3-win_amd64.whl", hash = "sha256:eb7e81434c8d223ec4a219b5fc1c47d0417b12be7ea866e24fb5ad6e84b3d988"}, + {file = "psutil-7.2.2-cp37-abi3-win_arm64.whl", hash = "sha256:8c233660f575a5a89e6d4cb65d9f938126312bca76d8fe087b947b3a1aaac9ee"}, + {file = "psutil-7.2.2.tar.gz", hash = "sha256:0746f5f8d406af344fd547f1c8daa5f5c33dbc293bb8d6a16d80b4bb88f59372"}, +] + +[package.extras] +dev = ["abi3audit", "black", "check-manifest", "colorama", "coverage", "packaging", "psleak", "pylint", "pyperf", "pypinfo", "pyreadline3", "pytest", "pytest-cov", "pytest-instafail", "pytest-xdist", "pywin32", "requests", "rstcheck", "ruff", "setuptools", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "validate-pyproject[all]", "virtualenv", "vulture", "wheel", "wheel", "wmi"] +test = ["psleak", "pytest", "pytest-instafail", "pytest-xdist", "pywin32", "setuptools", "wheel", "wmi"] + [[package]] name = "pyarrow" version = "19.0.1" @@ -4537,6 +4707,30 @@ files = [ [package.extras] diagrams = ["jinja2", "railroad-diagrams"] +[[package]] +name = "pypdf" +version = "6.7.1" +description = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "pypdf-6.7.1-py3-none-any.whl", hash = "sha256:a02ccbb06463f7c334ce1612e91b3e68a8e827f3cee100b9941771e6066b094e"}, + {file = "pypdf-6.7.1.tar.gz", hash = "sha256:6b7a63be5563a0a35d54c6d6b550d75c00b8ccf36384be96365355e296e6b3b0"}, +] + +[package.dependencies] +typing_extensions = {version = ">=4.0", markers = "python_version < \"3.11\""} + +[package.extras] +crypto = ["cryptography"] +cryptodome = ["PyCryptodome"] +dev = ["flit", "pip-tools", "pre-commit", "pytest-cov", "pytest-socket", "pytest-timeout", "pytest-xdist", "wheel"] +docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"] +full = ["Pillow (>=8.0.0)", "cryptography"] +image = ["Pillow (>=8.0.0)"] + [[package]] name = "pyproject-flake8" version = "6.1.0" @@ -4878,21 +5072,41 @@ files = [ {file = "python_magic-0.4.27-py2.py3-none-any.whl", hash = "sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3"}, ] +[[package]] +name = "python-oxmsg" +version = "0.0.2" +description = "Extract attachments from Outlook .msg files." +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "python_oxmsg-0.0.2-py3-none-any.whl", hash = "sha256:22be29b14c46016bcd05e34abddfd8e05ee82082f53b82753d115da3fc7d0355"}, + {file = "python_oxmsg-0.0.2.tar.gz", hash = "sha256:a6aff4deb1b5975d44d49dab1d9384089ffeec819e19c6940bc7ffbc84775fad"}, +] + +[package.dependencies] +click = "*" +olefile = "*" +typing_extensions = ">=4.9.0" + [[package]] name = "python-pptx" -version = "0.6.21" -description = "Generate and manipulate Open XML PowerPoint (.pptx) files" +version = "1.0.2" +description = "Create, read, and update PowerPoint 2007+ (.pptx) files." optional = true -python-versions = "*" +python-versions = ">=3.8" groups = ["main"] markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" files = [ - {file = "python-pptx-0.6.21.tar.gz", hash = "sha256:7798a2aaf89563565b3c7120c0acfe9aff775db0db3580544e3bf4840c2e378f"}, + {file = "python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba"}, + {file = "python_pptx-1.0.2.tar.gz", hash = "sha256:479a8af0eaf0f0d76b6f00b0887732874ad2e3188230315290cd1f9dd9cc7095"}, ] [package.dependencies] lxml = ">=3.1.0" Pillow = ">=3.3.2" +typing-extensions = ">=4.9.0" XlsxWriter = ">=0.5.7" [[package]] @@ -5339,7 +5553,7 @@ description = "A utility belt for advanced users of python-requests" optional = true python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" groups = ["main"] -markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"vector-db-based\"" +markers = "(extra == \"vector-db-based\" or extra == \"file-based\") and (python_version <= \"3.11\" or python_version >= \"3.12.0\")" files = [ {file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"}, {file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"}, @@ -5968,22 +6182,6 @@ typing-extensions = {version = ">=4.10.0", markers = "python_version < \"3.13\"" [package.extras] full = ["httpx (>=0.27.0,<0.29.0)", "itsdangerous", "jinja2", "python-multipart (>=0.0.18)", "pyyaml"] -[[package]] -name = "tabulate" -version = "0.9.0" -description = "Pretty-print tabular data" -optional = true -python-versions = ">=3.7" -groups = ["main"] -markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" -files = [ - {file = "tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"}, - {file = "tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"}, -] - -[package.extras] -widechars = ["wcwidth"] - [[package]] name = "tenacity" version = "8.5.0" @@ -6306,7 +6504,7 @@ description = "Runtime typing introspection tools" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"vector-db-based\"" +markers = "(extra == \"vector-db-based\" or extra == \"file-based\") and (python_version <= \"3.11\" or python_version >= \"3.12.0\")" files = [ {file = "typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7"}, {file = "typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464"}, @@ -6378,85 +6576,89 @@ files = [ [[package]] name = "unstructured" -version = "0.10.27" +version = "0.18.32" description = "A library that prepares raw documents for downstream ML tasks." optional = true -python-versions = ">=3.7.0" +python-versions = ">=3.10.0" groups = ["main"] markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" files = [ - {file = "unstructured-0.10.27-py3-none-any.whl", hash = "sha256:3a8a8e44302388ddc39c184059e8b4458f1cdc58032540b9af7d85f6c3eca3be"}, - {file = "unstructured-0.10.27.tar.gz", hash = "sha256:f567b5c4385993a9ab48db5563dd7b413aac4f2002bb22e6250496ea8f440f5e"}, + {file = "unstructured-0.18.32-py3-none-any.whl", hash = "sha256:c832ecdf467f5a869cc5e91428459e4b9ed75a16156ce3fab8f41ff64d840bc7"}, + {file = "unstructured-0.18.32.tar.gz", hash = "sha256:40a7cf4a4a7590350bedb8a447e37029d6e74b924692576627b4edb92d70e39d"}, ] [package.dependencies] backoff = "*" beautifulsoup4 = "*" -chardet = "*" +charset-normalizer = "*" dataclasses-json = "*" emoji = "*" filetype = "*" +html5lib = "*" langdetect = "*" lxml = "*" nltk = "*" +numba = "*" numpy = "*" -python-docx = {version = ">=1.0.1", optional = true, markers = "extra == \"docx\""} +psutil = "*" +python-docx = {version = ">=1.1.2", optional = true, markers = "extra == \"docx\""} python-iso639 = "*" python-magic = "*" -python-pptx = {version = "<=0.6.21", optional = true, markers = "extra == \"pptx\""} +python-oxmsg = "*" +python-pptx = {version = ">=1.0.1", optional = true, markers = "extra == \"pptx\""} rapidfuzz = "*" requests = "*" -tabulate = "*" +tqdm = "*" typing-extensions = "*" +unstructured-client = "*" +wrapt = "*" [package.extras] -airtable = ["pyairtable"] -all-docs = ["markdown", "msg-parser", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pypandoc", "python-docx (>=1.0.1)", "python-pptx (<=0.6.21)", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] -azure = ["adlfs", "fsspec (==2023.9.1)"] -azure-cognitive-search = ["azure-search-documents"] -bedrock = ["boto3", "langchain"] -biomed = ["bs4"] -box = ["boxfs", "fsspec (==2023.9.1)"] -confluence = ["atlassian-python-api"] +all-docs = ["effdet", "google-cloud-vision", "markdown", "msoffcrypto-tool", "networkx", "onnx (>=1.17.0)", "onnxruntime (>=1.19.0)", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pi_heif", "pikepdf", "pypandoc", "pypdf", "python-docx (>=1.1.2)", "python-pptx (>=1.0.1)", "unstructured-inference (>=1.1.1)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] +chunking-tokens = ["tiktoken"] csv = ["pandas"] -delta-table = ["deltalake", "fsspec (==2023.9.1)"] -discord = ["discord-py"] -doc = ["python-docx (>=1.0.1)"] -docx = ["python-docx (>=1.0.1)"] -dropbox = ["dropboxdrivefs", "fsspec (==2023.9.1)"] -elasticsearch = ["elasticsearch", "jq"] -embed-huggingface = ["huggingface", "langchain", "sentence-transformers"] +doc = ["python-docx (>=1.1.2)"] +docx = ["python-docx (>=1.1.2)"] epub = ["pypandoc"] -gcs = ["bs4", "fsspec (==2023.9.1)", "gcsfs"] -github = ["pygithub (>1.58.0)"] -gitlab = ["python-gitlab"] -google-drive = ["google-api-python-client"] huggingface = ["langdetect", "sacremoses", "sentencepiece", "torch", "transformers"] -image = ["onnx", "pdf2image", "pdfminer.six", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)"] -jira = ["atlassian-python-api"] -local-inference = ["markdown", "msg-parser", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pypandoc", "python-docx (>=1.0.1)", "python-pptx (<=0.6.21)", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] +image = ["effdet", "google-cloud-vision", "onnx (>=1.17.0)", "onnxruntime (>=1.19.0)", "pdf2image", "pdfminer.six", "pi_heif", "pikepdf", "pypdf", "unstructured-inference (>=1.1.1)", "unstructured.pytesseract (>=0.3.12)"] +local-inference = ["effdet", "google-cloud-vision", "markdown", "msoffcrypto-tool", "networkx", "onnx (>=1.17.0)", "onnxruntime (>=1.19.0)", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pi_heif", "pikepdf", "pypandoc", "pypdf", "python-docx (>=1.1.2)", "python-pptx (>=1.0.1)", "unstructured-inference (>=1.1.1)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] md = ["markdown"] -msg = ["msg-parser"] -notion = ["htmlBuilder", "notion-client"] -odt = ["pypandoc", "python-docx (>=1.0.1)"] -onedrive = ["Office365-REST-Python-Client (<2.4.3)", "bs4", "msal"] -openai = ["langchain", "openai", "tiktoken"] +odt = ["pypandoc", "python-docx (>=1.1.2)"] org = ["pypandoc"] -outlook = ["Office365-REST-Python-Client (<2.4.3)", "msal"] -paddleocr = ["unstructured.paddleocr (==2.6.1.3)"] -pdf = ["onnx", "pdf2image", "pdfminer.six", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)"] -ppt = ["python-pptx (<=0.6.21)"] -pptx = ["python-pptx (<=0.6.21)"] -reddit = ["praw"] +paddleocr = ["paddlepaddle (>=3.0.0b1)", "unstructured.paddleocr (==2.10.0)"] +pdf = ["effdet", "google-cloud-vision", "onnx (>=1.17.0)", "onnxruntime (>=1.19.0)", "pdf2image", "pdfminer.six", "pi_heif", "pikepdf", "pypdf", "unstructured-inference (>=1.1.1)", "unstructured.pytesseract (>=0.3.12)"] +ppt = ["python-pptx (>=1.0.1)"] +pptx = ["python-pptx (>=1.0.1)"] rst = ["pypandoc"] rtf = ["pypandoc"] -s3 = ["fsspec (==2023.9.1)", "s3fs"] -salesforce = ["simple-salesforce"] -sharepoint = ["Office365-REST-Python-Client (<2.4.3)", "msal"] -slack = ["slack-sdk"] tsv = ["pandas"] -wikipedia = ["wikipedia"] -xlsx = ["networkx", "openpyxl", "pandas", "xlrd"] +xlsx = ["msoffcrypto-tool", "networkx", "openpyxl", "pandas", "xlrd"] + +[[package]] +name = "unstructured-client" +version = "0.32.3" +description = "Python Client SDK for Unstructured API" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "unstructured_client-0.32.3-py3-none-any.whl", hash = "sha256:50b8198a3c3f984bdb53d848be7665d352093a99841858976f596cc2105903ec"}, + {file = "unstructured_client-0.32.3.tar.gz", hash = "sha256:1426d03325f7b93daad524ad2b954f1e7cceb0c15e67a4f4e88b49220dd2472c"}, +] + +[package.dependencies] +aiofiles = ">=24.1.0" +cryptography = ">=3.1" +eval-type-backport = ">=0.2.0" +httpx = ">=0.27.0" +nest-asyncio = ">=1.6.0" +pydantic = ">=2.10.3" +pypdf = ">=4.0" +python-dateutil = ">=2.8.2" +requests-toolbelt = ">=1.0.0" +typing-inspection = ">=0.4.0" [[package]] name = "unstructured-pytesseract" @@ -6580,6 +6782,19 @@ files = [ [package.dependencies] bracex = ">=2.1.1" +[[package]] +name = "webencodings" +version = "0.5.1" +description = "Character encoding aliases for legacy web content" +optional = true +python-versions = "*" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"}, + {file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"}, +] + [[package]] name = "werkzeug" version = "3.1.3" @@ -6775,7 +6990,7 @@ files = [ {file = "wrapt-1.17.3-py3-none-any.whl", hash = "sha256:7171ae35d2c33d326ac19dd8facb1e82e5fd04ef8c6c0e394d7af55a55051c22"}, {file = "wrapt-1.17.3.tar.gz", hash = "sha256:f66eb08feaa410fe4eebd17f2a2c8e2e46d3476e9f8c783daa8e09e0faa666d0"}, ] -markers = {main = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"manifest-server\"", dev = "python_version <= \"3.11\" or python_version >= \"3.12\""} +markers = {main = "(extra == \"file-based\" or extra == \"manifest-server\") and (python_version <= \"3.11\" or python_version >= \"3.12.0\")", dev = "python_version <= \"3.11\" or python_version >= \"3.12\""} [[package]] name = "xlsxwriter" @@ -7045,4 +7260,4 @@ vector-db-based = ["cohere", "langchain_community", "langchain_core", "langchain [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.14" -content-hash = "b785d39f246498c8facd7854999dbdbfb78808489a09922dd3a1551be331ea7d" +content-hash = "35b6f772102c0696b04ed4e25da343e75d9b4c7617ba4de2b33b4398878e49ab" diff --git a/pyproject.toml b/pyproject.toml index bcdab217b..6f7498ea0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,7 +69,7 @@ langchain_text_splitters = { version = "^1.0.0", optional = true } markdown = { version = "*", optional = true } # TODO: Remove if unused openai = { version = "0.27.9", extras = ["embeddings"], optional = true } # Used indirectly by langchain library pdf2image = { version = "1.16.3", optional = true } -"pdfminer.six" = { version = "20221105", optional = true } # Used indirectly by unstructured library +"pdfminer.six" = { version = ">=20231228", optional = true } # Used indirectly by unstructured library pyarrow = { version = "^19.0.0", optional = true } pytesseract = { version = "0.3.10", optional = true } # Used indirectly by unstructured library python-calamine = { version = "0.2.3", optional = true } # TODO: Remove if unused @@ -77,7 +77,7 @@ openpyxl = { version = "^3.1.0", optional = true } python-snappy = { version = "0.7.3", optional = true } # TODO: remove if unused tiktoken = { version = "0.8.0", optional = true } nltk = { version = "3.9.1", optional = true } -unstructured = { version = "0.10.27", extras = ["docx", "pptx"], optional = true } +unstructured = { version = "0.18.32", extras = ["docx", "pptx"], optional = true } "unstructured.pytesseract" = { version = ">=0.3.12", optional = true } pyjwt = "^2.8.0" cryptography = ">=44.0.0,<45.0.0" # Constrained as transitive dependency due to a bug in newer versions diff --git a/unit_tests/sources/file_based/file_types/test_unstructured_parser.py b/unit_tests/sources/file_based/file_types/test_unstructured_parser.py index 374a02eed..284cc57dc 100644 --- a/unit_tests/sources/file_based/file_types/test_unstructured_parser.py +++ b/unit_tests/sources/file_based/file_types/test_unstructured_parser.py @@ -227,12 +227,14 @@ def test_infer_schema(mock_detect_filetype, filetype, format_config, raises): ), ], ) -@patch("unstructured.partition.pdf.partition_pdf") -@patch("unstructured.partition.pptx.partition_pptx") -@patch("unstructured.partition.docx.partition_docx") +@patch("airbyte_cdk.sources.file_based.file_types.unstructured_parser.unstructured_partition_pdf") +@patch("airbyte_cdk.sources.file_based.file_types.unstructured_parser.unstructured_partition_pptx") +@patch("airbyte_cdk.sources.file_based.file_types.unstructured_parser.unstructured_partition_docx") +@patch("airbyte_cdk.sources.file_based.file_types.unstructured_parser._import_unstructured") @patch("airbyte_cdk.sources.file_based.file_types.unstructured_parser.detect_filetype") def test_parse_records( mock_detect_filetype, + mock_import_unstructured, mock_partition_docx, mock_partition_pptx, mock_partition_pdf, diff --git a/unit_tests/sources/file_based/scenarios/unstructured_scenarios.py b/unit_tests/sources/file_based/scenarios/unstructured_scenarios.py index c0db46e7a..84ec258d8 100644 --- a/unit_tests/sources/file_based/scenarios/unstructured_scenarios.py +++ b/unit_tests/sources/file_based/scenarios/unstructured_scenarios.py @@ -434,7 +434,7 @@ { "data": { "document_key": "sample.pdf", - "content": "# Hello World", + "_ab_source_file_parse_error": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. Contact Support if you need assistance.\nfilename=sample.pdf message=PDF parsing requires the 'unstructured_inference' package. Install it with: pip install unstructured-inference", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "sample.pdf", }, @@ -443,7 +443,7 @@ { "data": { "document_key": "sample.docx", - "content": "# Content", + "content": "Content", "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", "_ab_source_file_url": "sample.docx", }, @@ -510,7 +510,7 @@ { "data": { "document_key": "sample.pdf", - "_ab_source_file_parse_error": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. Contact Support if you need assistance.\nfilename=sample.pdf message=No /Root object! - Is this really a PDF?", + "_ab_source_file_parse_error": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. Contact Support if you need assistance.\nfilename=sample.pdf message=PDF parsing requires the 'unstructured_inference' package. Install it with: pip install unstructured-inference", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "sample.pdf", }, @@ -578,7 +578,7 @@ { "data": { "document_key": "pdf_without_extension", - "content": "# Hello World", + "_ab_source_file_parse_error": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. Contact Support if you need assistance.\nfilename=pdf_without_extension message=PDF parsing requires the 'unstructured_inference' package. Install it with: pip install unstructured-inference", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "pdf_without_extension", }, @@ -587,7 +587,7 @@ { "data": { "document_key": "docx_without_extension", - "content": "# Content", + "content": "Content", "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", "_ab_source_file_url": "docx_without_extension", },