From 31b3ab143b2a60dafd05074dce44c3f008858450 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 18 Feb 2026 14:40:51 +0000 Subject: [PATCH 1/7] fix(cdk): upgrade unstructured from 0.10.27 to 0.18.32 Co-Authored-By: Ryan Waskewich --- .../file_types/unstructured_parser.py | 78 +-- poetry.lock | 446 ++++++++++++++---- pyproject.toml | 6 +- 3 files changed, 406 insertions(+), 124 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py index f55675e0a..e050bc91e 100644 --- a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py @@ -6,19 +6,13 @@ import traceback from datetime import datetime from io import BytesIO, IOBase -from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union +from typing import IO, Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union, cast import backoff import dpath import nltk import requests -from unstructured.file_utils.filetype import ( - EXT_TO_FILETYPE, - FILETYPE_TO_MIMETYPE, - STR_TO_FILETYPE, - FileType, - detect_filetype, -) +from unstructured.file_utils.filetype import FileType, detect_filetype from airbyte_cdk.models import FailureType from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig @@ -85,14 +79,21 @@ def _import_unstructured() -> None: global unstructured_partition_docx global unstructured_partition_pptx from unstructured.partition.docx import partition_docx - from unstructured.partition.pdf import partition_pdf from unstructured.partition.pptx import partition_pptx - # separate global variables to properly propagate typing - unstructured_partition_pdf = partition_pdf unstructured_partition_docx = partition_docx unstructured_partition_pptx = partition_pptx + try: + from unstructured.partition.pdf import partition_pdf + + unstructured_partition_pdf = partition_pdf + except (ImportError, ModuleNotFoundError): + logger = logging.getLogger(__name__) + logger.info( + "Could not import unstructured.partition.pdf (requires unstructured_inference). PDF parsing will be unavailable." + ) + def user_error(e: Exception) -> bool: """ @@ -207,13 +208,6 @@ def _read_file( logger: logging.Logger, ) -> str: _import_unstructured() - if ( - (not unstructured_partition_pdf) - or (not unstructured_partition_docx) - or (not unstructured_partition_pptx) - ): - # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point) - raise Exception("unstructured library is not available") filetype: FileType | None = self._get_filetype(file_handle, remote_file) @@ -335,7 +329,7 @@ def _read_file_remotely( data = self._params_to_dict(format.parameters, strategy) - file_data = {"files": ("filename", file_handle, FILETYPE_TO_MIMETYPE[filetype])} + file_data = {"files": ("filename", file_handle, filetype.mime_type)} response = requests.post( f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data @@ -356,32 +350,38 @@ def _read_file_locally( self, file_handle: IOBase, filetype: FileType, strategy: str, remote_file: RemoteFile ) -> str: _import_unstructured() - if ( - (not unstructured_partition_pdf) - or (not unstructured_partition_docx) - or (not unstructured_partition_pptx) - ): - # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point) - raise Exception("unstructured library is not available") file: Any = file_handle - # before the parsing logic is entered, the file is read completely to make sure it is in local memory file_handle.seek(0) file_handle.read() file_handle.seek(0) try: if filetype == FileType.PDF: - # for PDF, read the file into a BytesIO object because some code paths in pdf parsing are doing an instance check on the file object and don't work with file-like objects + if not unstructured_partition_pdf: + raise self._create_parse_error( + remote_file, + "PDF parsing requires the 'unstructured_inference' package. Install it with: pip install unstructured-inference", + ) file_handle.seek(0) with BytesIO(file_handle.read()) as file: file_handle.seek(0) elements = unstructured_partition_pdf(file=file, strategy=strategy) elif filetype == FileType.DOCX: + if not unstructured_partition_docx: + raise self._create_parse_error( + remote_file, "DOCX partition function is not available" + ) elements = unstructured_partition_docx(file=file) elif filetype == FileType.PPTX: + if not unstructured_partition_pptx: + raise self._create_parse_error( + remote_file, "PPTX partition function is not available" + ) elements = unstructured_partition_pptx(file=file) + except RecordParseError: + raise except Exception as e: raise self._create_parse_error(remote_file, str(e)) @@ -405,8 +405,11 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT 2. Use the file name if available 3. Use the file content """ - if remote_file.mime_type and remote_file.mime_type in STR_TO_FILETYPE: - return STR_TO_FILETYPE[remote_file.mime_type] + if remote_file.mime_type: + try: + return FileType.from_mime_type(remote_file.mime_type) + except ValueError: + pass # set name to none, otherwise unstructured will try to get the modified date from the local file system if hasattr(file, "name"): @@ -418,7 +421,7 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT file_type: FileType | None = None try: file_type = detect_filetype( - filename=remote_file.uri, + file_path=remote_file.uri, ) except Exception: # Path doesn't exist locally. Try something else... @@ -427,16 +430,17 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT if file_type and file_type != FileType.UNK: return file_type - type_based_on_content = detect_filetype(file=file) - file.seek(0) # detect_filetype is reading to read the file content, so we need to reset + extension = "." + remote_file.uri.split(".")[-1].lower() + ext_type = FileType.from_extension(extension) + if ext_type is not None: + return ext_type + + type_based_on_content = detect_filetype(file=cast(IO[bytes], file)) + file.seek(0) if type_based_on_content and type_based_on_content != FileType.UNK: return type_based_on_content - extension = "." + remote_file.uri.split(".")[-1].lower() - if extension in EXT_TO_FILETYPE: - return EXT_TO_FILETYPE[extension] - return None def _supported_file_types(self) -> List[Any]: diff --git a/poetry.lock b/poetry.lock index 55c2fa668..bcffb41c9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,5 +1,18 @@ # This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand. +[[package]] +name = "aiofiles" +version = "25.1.0" +description = "File support for asyncio." +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "aiofiles-25.1.0-py3-none-any.whl", hash = "sha256:abe311e527c862958650f9438e859c1fa7568a141b22abcd015e120e86a85695"}, + {file = "aiofiles-25.1.0.tar.gz", hash = "sha256:a8d728f0a29de45dc521f18f07297428d56992a742f0cd2701ba86e44d23d5b2"}, +] + [[package]] name = "aiohappyeyeballs" version = "2.4.4" @@ -501,19 +514,6 @@ files = [ [package.dependencies] pycparser = "*" -[[package]] -name = "chardet" -version = "5.2.0" -description = "Universal encoding detector for Python 3" -optional = true -python-versions = ">=3.7" -groups = ["main"] -markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" -files = [ - {file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"}, - {file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"}, -] - [[package]] name = "charset-normalizer" version = "3.4.1" @@ -1283,6 +1283,22 @@ files = [ {file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"}, ] +[[package]] +name = "eval-type-backport" +version = "0.3.1" +description = "Like `typing._eval_type`, but lets older Python versions use newer typing features." +optional = true +python-versions = ">=3.7" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "eval_type_backport-0.3.1-py3-none-any.whl", hash = "sha256:279ab641905e9f11129f56a8a78f493518515b83402b860f6f06dd7c011fdfa8"}, + {file = "eval_type_backport-0.3.1.tar.gz", hash = "sha256:57e993f7b5b69d271e37482e62f74e76a0276c82490cf8e4f0dffeb6b332d5ed"}, +] + +[package.extras] +tests = ["pytest"] + [[package]] name = "exceptiongroup" version = "1.3.0" @@ -2052,6 +2068,29 @@ files = [ [package.extras] tests = ["pytest"] +[[package]] +name = "html5lib" +version = "1.1" +description = "HTML parser based on the WHATWG HTML specification" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "html5lib-1.1-py2.py3-none-any.whl", hash = "sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d"}, + {file = "html5lib-1.1.tar.gz", hash = "sha256:b2e5b40261e20f354d198eae92afc10d750afb487ed5e50f9c4eaf07c184146f"}, +] + +[package.dependencies] +six = ">=1.9" +webencodings = "*" + +[package.extras] +all = ["chardet (>=2.2)", "genshi", "lxml"] +chardet = ["chardet (>=2.2)"] +genshi = ["genshi"] +lxml = ["lxml"] + [[package]] name = "httpcore" version = "1.0.9" @@ -2616,6 +2655,38 @@ dev = ["black", "flake8", "isort", "pre-commit", "pyproject-flake8"] doc = ["myst-parser", "sphinx", "sphinx-book-theme"] test = ["coverage", "pytest", "pytest-cov"] +[[package]] +name = "llvmlite" +version = "0.46.0" +description = "lightweight wrapper around basic LLVM functionality" +optional = true +python-versions = ">=3.10" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "llvmlite-0.46.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4323177e936d61ae0f73e653e2e614284d97d14d5dd12579adc92b6c2b0597b0"}, + {file = "llvmlite-0.46.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0a2d461cb89537b7c20feb04c46c32e12d5ad4f0896c9dfc0f60336219ff248e"}, + {file = "llvmlite-0.46.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b1f6595a35b7b39c3518b85a28bf18f45e075264e4b2dce3f0c2a4f232b4a910"}, + {file = "llvmlite-0.46.0-cp310-cp310-win_amd64.whl", hash = "sha256:e7a34d4aa6f9a97ee006b504be6d2b8cb7f755b80ab2f344dda1ef992f828559"}, + {file = "llvmlite-0.46.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:82f3d39b16f19aa1a56d5fe625883a6ab600d5cc9ea8906cca70ce94cabba067"}, + {file = "llvmlite-0.46.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a3df43900119803bbc52720e758c76f316a9a0f34612a886862dfe0a5591a17e"}, + {file = "llvmlite-0.46.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:de183fefc8022d21b0aa37fc3e90410bc3524aed8617f0ff76732fc6c3af5361"}, + {file = "llvmlite-0.46.0-cp311-cp311-win_amd64.whl", hash = "sha256:e8b10bc585c58bdffec9e0c309bb7d51be1f2f15e169a4b4d42f2389e431eb93"}, + {file = "llvmlite-0.46.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6b9588ad4c63b4f0175a3984b85494f0c927c6b001e3a246a3a7fb3920d9a137"}, + {file = "llvmlite-0.46.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3535bd2bb6a2d7ae4012681ac228e5132cdb75fefb1bcb24e33f2f3e0c865ed4"}, + {file = "llvmlite-0.46.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4cbfd366e60ff87ea6cc62f50bc4cd800ebb13ed4c149466f50cf2163a473d1e"}, + {file = "llvmlite-0.46.0-cp312-cp312-win_amd64.whl", hash = "sha256:398b39db462c39563a97b912d4f2866cd37cba60537975a09679b28fbbc0fb38"}, + {file = "llvmlite-0.46.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:30b60892d034bc560e0ec6654737aaa74e5ca327bd8114d82136aa071d611172"}, + {file = "llvmlite-0.46.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6cc19b051753368a9c9f31dc041299059ee91aceec81bd57b0e385e5d5bf1a54"}, + {file = "llvmlite-0.46.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bca185892908f9ede48c0acd547fe4dc1bafefb8a4967d47db6cf664f9332d12"}, + {file = "llvmlite-0.46.0-cp313-cp313-win_amd64.whl", hash = "sha256:67438fd30e12349ebb054d86a5a1a57fd5e87d264d2451bcfafbbbaa25b82a35"}, + {file = "llvmlite-0.46.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:d252edfb9f4ac1fcf20652258e3f102b26b03eef738dc8a6ffdab7d7d341d547"}, + {file = "llvmlite-0.46.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:379fdd1c59badeff8982cb47e4694a6143bec3bb49aa10a466e095410522064d"}, + {file = "llvmlite-0.46.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2e8cbfff7f6db0fa2c771ad24154e2a7e457c2444d7673e6de06b8b698c3b269"}, + {file = "llvmlite-0.46.0-cp314-cp314-win_amd64.whl", hash = "sha256:7821eda3ec1f18050f981819756631d60b6d7ab1a6cf806d9efefbe3f4082d61"}, + {file = "llvmlite-0.46.0.tar.gz", hash = "sha256:227c9fd6d09dce2783c18b754b7cd9d9b3b3515210c46acc2d3c5badd9870ceb"}, +] + [[package]] name = "lxml" version = "5.3.0" @@ -3262,6 +3333,19 @@ files = [ ] markers = {main = "(extra == \"vector-db-based\" or extra == \"file-based\") and (python_version <= \"3.11\" or python_version >= \"3.12.0\")", dev = "python_version <= \"3.11\" or python_version >= \"3.12\""} +[[package]] +name = "nest-asyncio" +version = "1.6.0" +description = "Patch asyncio to allow nested event loops" +optional = true +python-versions = ">=3.5" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"}, + {file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"}, +] + [[package]] name = "nltk" version = "3.9.1" @@ -3289,6 +3373,42 @@ plot = ["matplotlib"] tgrep = ["pyparsing"] twitter = ["twython"] +[[package]] +name = "numba" +version = "0.63.1" +description = "compiling Python code using LLVM" +optional = true +python-versions = ">=3.10" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "numba-0.63.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c6d6bf5bf00f7db629305caaec82a2ffb8abe2bf45eaad0d0738dc7de4113779"}, + {file = "numba-0.63.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:08653d0dfc9cc9c4c9a8fba29ceb1f2d5340c3b86c4a7e5e07e42b643bc6a2f4"}, + {file = "numba-0.63.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f09eebf5650246ce2a4e9a8d38270e2d4b0b0ae978103bafb38ed7adc5ea906e"}, + {file = "numba-0.63.1-cp310-cp310-win_amd64.whl", hash = "sha256:f8bba17421d865d8c0f7be2142754ebce53e009daba41c44cf6909207d1a8d7d"}, + {file = "numba-0.63.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b33db00f18ccc790ee9911ce03fcdfe9d5124637d1ecc266f5ae0df06e02fec3"}, + {file = "numba-0.63.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7d31ea186a78a7c0f6b1b2a3fe68057fdb291b045c52d86232b5383b6cf4fc25"}, + {file = "numba-0.63.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ed3bb2fbdb651d6aac394388130a7001aab6f4541837123a4b4ab8b02716530c"}, + {file = "numba-0.63.1-cp311-cp311-win_amd64.whl", hash = "sha256:1ecbff7688f044b1601be70113e2fb1835367ee0b28ffa8f3adf3a05418c5c87"}, + {file = "numba-0.63.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2819cd52afa5d8d04e057bdfd54367575105f8829350d8fb5e4066fb7591cc71"}, + {file = "numba-0.63.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5cfd45dbd3d409e713b1ccfdc2ee72ca82006860254429f4ef01867fdba5845f"}, + {file = "numba-0.63.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:69a599df6976c03b7ecf15d05302696f79f7e6d10d620367407517943355bcb0"}, + {file = "numba-0.63.1-cp312-cp312-win_amd64.whl", hash = "sha256:bbad8c63e4fc7eb3cdb2c2da52178e180419f7969f9a685f283b313a70b92af3"}, + {file = "numba-0.63.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:0bd4fd820ef7442dcc07da184c3f54bb41d2bdb7b35bacf3448e73d081f730dc"}, + {file = "numba-0.63.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:53de693abe4be3bd4dee38e1c55f01c55ff644a6a3696a3670589e6e4c39cde2"}, + {file = "numba-0.63.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:81227821a72a763c3d4ac290abbb4371d855b59fdf85d5af22a47c0e86bf8c7e"}, + {file = "numba-0.63.1-cp313-cp313-win_amd64.whl", hash = "sha256:eb227b07c2ac37b09432a9bda5142047a2d1055646e089d4a240a2643e508102"}, + {file = "numba-0.63.1-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:f180883e5508940cc83de8a8bea37fc6dd20fbe4e5558d4659b8b9bef5ff4731"}, + {file = "numba-0.63.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f0938764afa82a47c0e895637a6c55547a42c9e1d35cac42285b1fa60a8b02bb"}, + {file = "numba-0.63.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f90a929fa5094e062d4e0368ede1f4497d5e40f800e80aa5222c4734236a2894"}, + {file = "numba-0.63.1-cp314-cp314-win_amd64.whl", hash = "sha256:8d6d5ce85f572ed4e1a135dbb8c0114538f9dd0e3657eeb0bb64ab204cbe2a8f"}, + {file = "numba-0.63.1.tar.gz", hash = "sha256:b320aa675d0e3b17b40364935ea52a7b1c670c9037c39cf92c49502a75902f4b"}, +] + +[package.dependencies] +llvmlite = "==0.46.*" +numpy = ">=1.22,<2.4" + [[package]] name = "numpy" version = "1.26.4" @@ -3421,6 +3541,22 @@ files = [ {file = "numpy-2.3.4.tar.gz", hash = "sha256:a7d018bfedb375a8d979ac758b120ba846a7fe764911a64465fd87b8729f4a6a"}, ] +[[package]] +name = "olefile" +version = "0.47" +description = "Python package to parse, read and write Microsoft OLE2 files (Structured Storage or Compound Document, Microsoft Office)" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "olefile-0.47-py2.py3-none-any.whl", hash = "sha256:543c7da2a7adadf21214938bb79c83ea12b473a4b6ee4ad4bf854e7715e13d1f"}, + {file = "olefile-0.47.zip", hash = "sha256:599383381a0bf3dfbd932ca0ca6515acd174ed48870cbf7fee123d698c192c1c"}, +] + +[package.extras] +tests = ["pytest", "pytest-cov"] + [[package]] name = "openai" version = "0.27.9" @@ -3889,6 +4025,65 @@ Jinja2 = ">=2.11.0" MarkupSafe = ">=1.1.1" pygments = ">=2.12.0" +[[package]] +name = "pi-heif" +version = "1.2.1" +description = "Python interface for libheif library" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "pi_heif-1.2.1-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:ae60ba8bd0904c70ddd66f5b2ac416fa54f7db88e02f0d8a56cdda9c600329e7"}, + {file = "pi_heif-1.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cfd42065bb03b7c9405833a64cf3e354fe00285675f8df9383c1f57f3b04913a"}, + {file = "pi_heif-1.2.1-cp310-cp310-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9cdaa388e2637f7500fb77b9ffe8b49e90277cd12ea3ce8f4c40854eb3860ac0"}, + {file = "pi_heif-1.2.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:843de92f26cfccd1acfe74165b41e63d29586ca66faa83f59fbee3bebeda8788"}, + {file = "pi_heif-1.2.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:609bf44651c61d5ba89be12d63dbe58d5c59de4a65baa770320a371ffcc280c8"}, + {file = "pi_heif-1.2.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3ebba3c13808a32e26b3746e08e55db8306b8a47e448d03f004cb7d351072bc0"}, + {file = "pi_heif-1.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:0f5eca27e14bc56f6f45d73172d93efbb7c202ebd9f1070781b03ed54bc1f63e"}, + {file = "pi_heif-1.2.1-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:ab0db01fb98c0ea4c03f423968404abd0d1e0bddf0d42a1d08ccf43835f63c3c"}, + {file = "pi_heif-1.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:314ac18efea35926e2ee3d98c5729376ff60a9330faeae88c4e335f7edc26925"}, + {file = "pi_heif-1.2.1-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1f17cdfefe929b2e174836dc30a10285b2de753d1444661c4fb399167473f8d7"}, + {file = "pi_heif-1.2.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1eccabd3e33d95484b4e519c9ef018c7af1b9c6187aa83dea94aea3c880cfdaa"}, + {file = "pi_heif-1.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a08f5b665a6b089982a7c9b5e5233a7e92e8a1911c68077aa65dbc93d830c9a4"}, + {file = "pi_heif-1.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ea7174bffcccf373d3c13858f864d2f106a5c339d8729dc67ddcabd3adc61941"}, + {file = "pi_heif-1.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:ee80a3e9fba56c56196e9e5acb9e6667961d4d3316facd490cda32348370cd53"}, + {file = "pi_heif-1.2.1-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:5d63ce7f2e1ab27d21b81b149c18d362bfadf2f5a19c613433ddeacaa7c321aa"}, + {file = "pi_heif-1.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:23a223fd9d061ad3ddeca81c782381b56aa7bc349845587318637c6194f25f80"}, + {file = "pi_heif-1.2.1-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6847f0b4153118aa87e1c41195dc7aaa1215e56eccbe9c53ee14bb33af6c00b1"}, + {file = "pi_heif-1.2.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0869b4b1d2094539fc991533e20af5612ead2e60cdeca19a169584d68d79bd19"}, + {file = "pi_heif-1.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1cf305582ce3a5a62c0ee937d07b8b03b13e064cf2214173df0f215ed6f5665f"}, + {file = "pi_heif-1.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e224d88482710304e991b39daa935d6370bcb2c11ea6b53f4bd6d29e6bc977f7"}, + {file = "pi_heif-1.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:ee287a354c707a46c15b01ae8d906f68276ad6deb86407fec4e06051983bddcc"}, + {file = "pi_heif-1.2.1-cp313-cp313-macosx_10_15_x86_64.whl", hash = "sha256:c0d0107d0b98496d7b67c7134fa43142e402282ebb2b63cb80de9ae73f7fafc4"}, + {file = "pi_heif-1.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ec6544650400b0748761a69996d364db580f2ce2a3499b6caa3efe688fbe95a1"}, + {file = "pi_heif-1.2.1-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:67bf2c9d57481285f95a6bab412ac3a1d5d92cb4ff551e40dd4066b10bfe8f5a"}, + {file = "pi_heif-1.2.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:af86b98e599866ef18333ac18f3086fde00573c3e9a62c15f01157b0e209e841"}, + {file = "pi_heif-1.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:641066aeee2cf866fb885ae64fd0589ca019b4d419072cec3f4d2a51fa2ee4ab"}, + {file = "pi_heif-1.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:83d0ec074d7738fcf838d1472b342936add38233b8c0b6675005af97a11a2e27"}, + {file = "pi_heif-1.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:31ccfc4acc4804775dd035fefffd5d78c6030f01e9facc13bb14e27d8575983b"}, + {file = "pi_heif-1.2.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:9fa93b453c612ec14973b690f6a24cf4a0e406d7f9fc7fd8cde5554258794596"}, + {file = "pi_heif-1.2.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1b486eb0c6affe65df90bc1e2d34998f11f3ca6a0e7fd40f0360781f98ae1fa9"}, + {file = "pi_heif-1.2.1-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:115495664b22140c7137506a5cee4447c1e42f004867a3bf1a302feb8dc72187"}, + {file = "pi_heif-1.2.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3ddc145307274ff7d84f5fa6930ba0c6f42e80012c4423bc61f39b8ea282ce0a"}, + {file = "pi_heif-1.2.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:22137bad2aa0a745deb8851a24b0d1acb89cd0feb149be2147a5f0b371521990"}, + {file = "pi_heif-1.2.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:a1e2d14df2064812beef97cd37931c6431e9387aa363c0aea6bb9f45a9595f14"}, + {file = "pi_heif-1.2.1-cp314-cp314-win_amd64.whl", hash = "sha256:a06ba19fea5b868c0d55ee79adda8d04b4d0985594c51705b8e0cb3087401407"}, + {file = "pi_heif-1.2.1-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:cc9c470bf64cde82de45354e80eb643e1d26daf40e8669e79639f498b0f2395a"}, + {file = "pi_heif-1.2.1-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:ea23057fed02f7f2723f8b284d667ecf5b7ba7a46fce3ddfe670297cd49c8bbc"}, + {file = "pi_heif-1.2.1-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:85c14434c1b34c70b2100b79b2b7dfe162e21fa0d46132c22d178d15270b25b7"}, + {file = "pi_heif-1.2.1-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8b4bcc0a76597528b50d9d50dc7f9e6a33b7c130653e857381728b93ee568ab5"}, + {file = "pi_heif-1.2.1-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:fc4161300b570d36690f3cfc76eb60a5df13319fbeca34353f02ca3da826b5f3"}, + {file = "pi_heif-1.2.1.tar.gz", hash = "sha256:a5c5fd4d92b4f0541d8629eaadd95403ccdfd1b7f2ddced52844fa610713685d"}, +] + +[package.dependencies] +pillow = ">=11.1.0" + +[package.extras] +tests = ["defusedxml", "numpy", "packaging", "pympler", "pytest"] +tests-min = ["defusedxml", "packaging", "pytest"] + [[package]] name = "pillow" version = "11.1.0" @@ -4185,6 +4380,42 @@ files = [ {file = "protobuf-5.29.4.tar.gz", hash = "sha256:4f1dfcd7997b31ef8f53ec82781ff434a28bf71d9102ddde14d076adcfc78c99"}, ] +[[package]] +name = "psutil" +version = "7.2.2" +description = "Cross-platform lib for process and system monitoring." +optional = true +python-versions = ">=3.6" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "psutil-7.2.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:2edccc433cbfa046b980b0df0171cd25bcaeb3a68fe9022db0979e7aa74a826b"}, + {file = "psutil-7.2.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e78c8603dcd9a04c7364f1a3e670cea95d51ee865e4efb3556a3a63adef958ea"}, + {file = "psutil-7.2.2-cp313-cp313t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1a571f2330c966c62aeda00dd24620425d4b0cc86881c89861fbc04549e5dc63"}, + {file = "psutil-7.2.2-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:917e891983ca3c1887b4ef36447b1e0873e70c933afc831c6b6da078ba474312"}, + {file = "psutil-7.2.2-cp313-cp313t-win_amd64.whl", hash = "sha256:ab486563df44c17f5173621c7b198955bd6b613fb87c71c161f827d3fb149a9b"}, + {file = "psutil-7.2.2-cp313-cp313t-win_arm64.whl", hash = "sha256:ae0aefdd8796a7737eccea863f80f81e468a1e4cf14d926bd9b6f5f2d5f90ca9"}, + {file = "psutil-7.2.2-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:eed63d3b4d62449571547b60578c5b2c4bcccc5387148db46e0c2313dad0ee00"}, + {file = "psutil-7.2.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7b6d09433a10592ce39b13d7be5a54fbac1d1228ed29abc880fb23df7cb694c9"}, + {file = "psutil-7.2.2-cp314-cp314t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1fa4ecf83bcdf6e6c8f4449aff98eefb5d0604bf88cb883d7da3d8d2d909546a"}, + {file = "psutil-7.2.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e452c464a02e7dc7822a05d25db4cde564444a67e58539a00f929c51eddda0cf"}, + {file = "psutil-7.2.2-cp314-cp314t-win_amd64.whl", hash = "sha256:c7663d4e37f13e884d13994247449e9f8f574bc4655d509c3b95e9ec9e2b9dc1"}, + {file = "psutil-7.2.2-cp314-cp314t-win_arm64.whl", hash = "sha256:11fe5a4f613759764e79c65cf11ebdf26e33d6dd34336f8a337aa2996d71c841"}, + {file = "psutil-7.2.2-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:ed0cace939114f62738d808fdcecd4c869222507e266e574799e9c0faa17d486"}, + {file = "psutil-7.2.2-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:1a7b04c10f32cc88ab39cbf606e117fd74721c831c98a27dc04578deb0c16979"}, + {file = "psutil-7.2.2-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:076a2d2f923fd4821644f5ba89f059523da90dc9014e85f8e45a5774ca5bc6f9"}, + {file = "psutil-7.2.2-cp36-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b0726cecd84f9474419d67252add4ac0cd9811b04d61123054b9fb6f57df6e9e"}, + {file = "psutil-7.2.2-cp36-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:fd04ef36b4a6d599bbdb225dd1d3f51e00105f6d48a28f006da7f9822f2606d8"}, + {file = "psutil-7.2.2-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b58fabe35e80b264a4e3bb23e6b96f9e45a3df7fb7eed419ac0e5947c61e47cc"}, + {file = "psutil-7.2.2-cp37-abi3-win_amd64.whl", hash = "sha256:eb7e81434c8d223ec4a219b5fc1c47d0417b12be7ea866e24fb5ad6e84b3d988"}, + {file = "psutil-7.2.2-cp37-abi3-win_arm64.whl", hash = "sha256:8c233660f575a5a89e6d4cb65d9f938126312bca76d8fe087b947b3a1aaac9ee"}, + {file = "psutil-7.2.2.tar.gz", hash = "sha256:0746f5f8d406af344fd547f1c8daa5f5c33dbc293bb8d6a16d80b4bb88f59372"}, +] + +[package.extras] +dev = ["abi3audit", "black", "check-manifest", "colorama", "coverage", "packaging", "psleak", "pylint", "pyperf", "pypinfo", "pyreadline3", "pytest", "pytest-cov", "pytest-instafail", "pytest-xdist", "pywin32", "requests", "rstcheck", "ruff", "setuptools", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "validate-pyproject[all]", "virtualenv", "vulture", "wheel", "wheel", "wmi"] +test = ["psleak", "pytest", "pytest-instafail", "pytest-xdist", "pywin32", "setuptools", "wheel", "wmi"] + [[package]] name = "pyarrow" version = "19.0.1" @@ -4537,6 +4768,30 @@ files = [ [package.extras] diagrams = ["jinja2", "railroad-diagrams"] +[[package]] +name = "pypdf" +version = "6.7.1" +description = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "pypdf-6.7.1-py3-none-any.whl", hash = "sha256:a02ccbb06463f7c334ce1612e91b3e68a8e827f3cee100b9941771e6066b094e"}, + {file = "pypdf-6.7.1.tar.gz", hash = "sha256:6b7a63be5563a0a35d54c6d6b550d75c00b8ccf36384be96365355e296e6b3b0"}, +] + +[package.dependencies] +typing_extensions = {version = ">=4.0", markers = "python_version < \"3.11\""} + +[package.extras] +crypto = ["cryptography"] +cryptodome = ["PyCryptodome"] +dev = ["flit", "pip-tools", "pre-commit", "pytest-cov", "pytest-socket", "pytest-timeout", "pytest-xdist", "wheel"] +docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"] +full = ["Pillow (>=8.0.0)", "cryptography"] +image = ["Pillow (>=8.0.0)"] + [[package]] name = "pyproject-flake8" version = "6.1.0" @@ -4878,21 +5133,41 @@ files = [ {file = "python_magic-0.4.27-py2.py3-none-any.whl", hash = "sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3"}, ] +[[package]] +name = "python-oxmsg" +version = "0.0.2" +description = "Extract attachments from Outlook .msg files." +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "python_oxmsg-0.0.2-py3-none-any.whl", hash = "sha256:22be29b14c46016bcd05e34abddfd8e05ee82082f53b82753d115da3fc7d0355"}, + {file = "python_oxmsg-0.0.2.tar.gz", hash = "sha256:a6aff4deb1b5975d44d49dab1d9384089ffeec819e19c6940bc7ffbc84775fad"}, +] + +[package.dependencies] +click = "*" +olefile = "*" +typing_extensions = ">=4.9.0" + [[package]] name = "python-pptx" -version = "0.6.21" -description = "Generate and manipulate Open XML PowerPoint (.pptx) files" +version = "1.0.2" +description = "Create, read, and update PowerPoint 2007+ (.pptx) files." optional = true -python-versions = "*" +python-versions = ">=3.8" groups = ["main"] markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" files = [ - {file = "python-pptx-0.6.21.tar.gz", hash = "sha256:7798a2aaf89563565b3c7120c0acfe9aff775db0db3580544e3bf4840c2e378f"}, + {file = "python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba"}, + {file = "python_pptx-1.0.2.tar.gz", hash = "sha256:479a8af0eaf0f0d76b6f00b0887732874ad2e3188230315290cd1f9dd9cc7095"}, ] [package.dependencies] lxml = ">=3.1.0" Pillow = ">=3.3.2" +typing-extensions = ">=4.9.0" XlsxWriter = ">=0.5.7" [[package]] @@ -5339,7 +5614,7 @@ description = "A utility belt for advanced users of python-requests" optional = true python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" groups = ["main"] -markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"vector-db-based\"" +markers = "(extra == \"vector-db-based\" or extra == \"file-based\") and (python_version <= \"3.11\" or python_version >= \"3.12.0\")" files = [ {file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"}, {file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"}, @@ -5968,22 +6243,6 @@ typing-extensions = {version = ">=4.10.0", markers = "python_version < \"3.13\"" [package.extras] full = ["httpx (>=0.27.0,<0.29.0)", "itsdangerous", "jinja2", "python-multipart (>=0.0.18)", "pyyaml"] -[[package]] -name = "tabulate" -version = "0.9.0" -description = "Pretty-print tabular data" -optional = true -python-versions = ">=3.7" -groups = ["main"] -markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" -files = [ - {file = "tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"}, - {file = "tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"}, -] - -[package.extras] -widechars = ["wcwidth"] - [[package]] name = "tenacity" version = "8.5.0" @@ -6306,7 +6565,7 @@ description = "Runtime typing introspection tools" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"vector-db-based\"" +markers = "(extra == \"vector-db-based\" or extra == \"file-based\") and (python_version <= \"3.11\" or python_version >= \"3.12.0\")" files = [ {file = "typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7"}, {file = "typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464"}, @@ -6378,85 +6637,89 @@ files = [ [[package]] name = "unstructured" -version = "0.10.27" +version = "0.18.32" description = "A library that prepares raw documents for downstream ML tasks." optional = true -python-versions = ">=3.7.0" +python-versions = ">=3.10.0" groups = ["main"] markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" files = [ - {file = "unstructured-0.10.27-py3-none-any.whl", hash = "sha256:3a8a8e44302388ddc39c184059e8b4458f1cdc58032540b9af7d85f6c3eca3be"}, - {file = "unstructured-0.10.27.tar.gz", hash = "sha256:f567b5c4385993a9ab48db5563dd7b413aac4f2002bb22e6250496ea8f440f5e"}, + {file = "unstructured-0.18.32-py3-none-any.whl", hash = "sha256:c832ecdf467f5a869cc5e91428459e4b9ed75a16156ce3fab8f41ff64d840bc7"}, + {file = "unstructured-0.18.32.tar.gz", hash = "sha256:40a7cf4a4a7590350bedb8a447e37029d6e74b924692576627b4edb92d70e39d"}, ] [package.dependencies] backoff = "*" beautifulsoup4 = "*" -chardet = "*" +charset-normalizer = "*" dataclasses-json = "*" emoji = "*" filetype = "*" +html5lib = "*" langdetect = "*" lxml = "*" nltk = "*" +numba = "*" numpy = "*" -python-docx = {version = ">=1.0.1", optional = true, markers = "extra == \"docx\""} +psutil = "*" +python-docx = {version = ">=1.1.2", optional = true, markers = "extra == \"docx\""} python-iso639 = "*" python-magic = "*" -python-pptx = {version = "<=0.6.21", optional = true, markers = "extra == \"pptx\""} +python-oxmsg = "*" +python-pptx = {version = ">=1.0.1", optional = true, markers = "extra == \"pptx\""} rapidfuzz = "*" requests = "*" -tabulate = "*" +tqdm = "*" typing-extensions = "*" +unstructured-client = "*" +wrapt = "*" [package.extras] -airtable = ["pyairtable"] -all-docs = ["markdown", "msg-parser", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pypandoc", "python-docx (>=1.0.1)", "python-pptx (<=0.6.21)", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] -azure = ["adlfs", "fsspec (==2023.9.1)"] -azure-cognitive-search = ["azure-search-documents"] -bedrock = ["boto3", "langchain"] -biomed = ["bs4"] -box = ["boxfs", "fsspec (==2023.9.1)"] -confluence = ["atlassian-python-api"] +all-docs = ["effdet", "google-cloud-vision", "markdown", "msoffcrypto-tool", "networkx", "onnx (>=1.17.0)", "onnxruntime (>=1.19.0)", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pi_heif", "pikepdf", "pypandoc", "pypdf", "python-docx (>=1.1.2)", "python-pptx (>=1.0.1)", "unstructured-inference (>=1.1.1)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] +chunking-tokens = ["tiktoken"] csv = ["pandas"] -delta-table = ["deltalake", "fsspec (==2023.9.1)"] -discord = ["discord-py"] -doc = ["python-docx (>=1.0.1)"] -docx = ["python-docx (>=1.0.1)"] -dropbox = ["dropboxdrivefs", "fsspec (==2023.9.1)"] -elasticsearch = ["elasticsearch", "jq"] -embed-huggingface = ["huggingface", "langchain", "sentence-transformers"] +doc = ["python-docx (>=1.1.2)"] +docx = ["python-docx (>=1.1.2)"] epub = ["pypandoc"] -gcs = ["bs4", "fsspec (==2023.9.1)", "gcsfs"] -github = ["pygithub (>1.58.0)"] -gitlab = ["python-gitlab"] -google-drive = ["google-api-python-client"] huggingface = ["langdetect", "sacremoses", "sentencepiece", "torch", "transformers"] -image = ["onnx", "pdf2image", "pdfminer.six", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)"] -jira = ["atlassian-python-api"] -local-inference = ["markdown", "msg-parser", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pypandoc", "python-docx (>=1.0.1)", "python-pptx (<=0.6.21)", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] +image = ["effdet", "google-cloud-vision", "onnx (>=1.17.0)", "onnxruntime (>=1.19.0)", "pdf2image", "pdfminer.six", "pi_heif", "pikepdf", "pypdf", "unstructured-inference (>=1.1.1)", "unstructured.pytesseract (>=0.3.12)"] +local-inference = ["effdet", "google-cloud-vision", "markdown", "msoffcrypto-tool", "networkx", "onnx (>=1.17.0)", "onnxruntime (>=1.19.0)", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pi_heif", "pikepdf", "pypandoc", "pypdf", "python-docx (>=1.1.2)", "python-pptx (>=1.0.1)", "unstructured-inference (>=1.1.1)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] md = ["markdown"] -msg = ["msg-parser"] -notion = ["htmlBuilder", "notion-client"] -odt = ["pypandoc", "python-docx (>=1.0.1)"] -onedrive = ["Office365-REST-Python-Client (<2.4.3)", "bs4", "msal"] -openai = ["langchain", "openai", "tiktoken"] +odt = ["pypandoc", "python-docx (>=1.1.2)"] org = ["pypandoc"] -outlook = ["Office365-REST-Python-Client (<2.4.3)", "msal"] -paddleocr = ["unstructured.paddleocr (==2.6.1.3)"] -pdf = ["onnx", "pdf2image", "pdfminer.six", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)"] -ppt = ["python-pptx (<=0.6.21)"] -pptx = ["python-pptx (<=0.6.21)"] -reddit = ["praw"] +paddleocr = ["paddlepaddle (>=3.0.0b1)", "unstructured.paddleocr (==2.10.0)"] +pdf = ["effdet", "google-cloud-vision", "onnx (>=1.17.0)", "onnxruntime (>=1.19.0)", "pdf2image", "pdfminer.six", "pi_heif", "pikepdf", "pypdf", "unstructured-inference (>=1.1.1)", "unstructured.pytesseract (>=0.3.12)"] +ppt = ["python-pptx (>=1.0.1)"] +pptx = ["python-pptx (>=1.0.1)"] rst = ["pypandoc"] rtf = ["pypandoc"] -s3 = ["fsspec (==2023.9.1)", "s3fs"] -salesforce = ["simple-salesforce"] -sharepoint = ["Office365-REST-Python-Client (<2.4.3)", "msal"] -slack = ["slack-sdk"] tsv = ["pandas"] -wikipedia = ["wikipedia"] -xlsx = ["networkx", "openpyxl", "pandas", "xlrd"] +xlsx = ["msoffcrypto-tool", "networkx", "openpyxl", "pandas", "xlrd"] + +[[package]] +name = "unstructured-client" +version = "0.32.3" +description = "Python Client SDK for Unstructured API" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "unstructured_client-0.32.3-py3-none-any.whl", hash = "sha256:50b8198a3c3f984bdb53d848be7665d352093a99841858976f596cc2105903ec"}, + {file = "unstructured_client-0.32.3.tar.gz", hash = "sha256:1426d03325f7b93daad524ad2b954f1e7cceb0c15e67a4f4e88b49220dd2472c"}, +] + +[package.dependencies] +aiofiles = ">=24.1.0" +cryptography = ">=3.1" +eval-type-backport = ">=0.2.0" +httpx = ">=0.27.0" +nest-asyncio = ">=1.6.0" +pydantic = ">=2.10.3" +pypdf = ">=4.0" +python-dateutil = ">=2.8.2" +requests-toolbelt = ">=1.0.0" +typing-inspection = ">=0.4.0" [[package]] name = "unstructured-pytesseract" @@ -6580,6 +6843,19 @@ files = [ [package.dependencies] bracex = ">=2.1.1" +[[package]] +name = "webencodings" +version = "0.5.1" +description = "Character encoding aliases for legacy web content" +optional = true +python-versions = "*" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" +files = [ + {file = "webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"}, + {file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"}, +] + [[package]] name = "werkzeug" version = "3.1.3" @@ -6775,7 +7051,7 @@ files = [ {file = "wrapt-1.17.3-py3-none-any.whl", hash = "sha256:7171ae35d2c33d326ac19dd8facb1e82e5fd04ef8c6c0e394d7af55a55051c22"}, {file = "wrapt-1.17.3.tar.gz", hash = "sha256:f66eb08feaa410fe4eebd17f2a2c8e2e46d3476e9f8c783daa8e09e0faa666d0"}, ] -markers = {main = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"manifest-server\"", dev = "python_version <= \"3.11\" or python_version >= \"3.12\""} +markers = {main = "(extra == \"file-based\" or extra == \"manifest-server\") and (python_version <= \"3.11\" or python_version >= \"3.12.0\")", dev = "python_version <= \"3.11\" or python_version >= \"3.12\""} [[package]] name = "xlsxwriter" @@ -7037,7 +7313,7 @@ cffi = ["cffi (>=1.17,<2.0)", "cffi (>=2.0.0b)"] [extras] dev = ["pytest"] -file-based = ["avro", "fastavro", "markdown", "openpyxl", "pdf2image", "pdfminer.six", "pyarrow", "pytesseract", "python-calamine", "python-snappy", "unstructured", "unstructured.pytesseract"] +file-based = ["avro", "fastavro", "markdown", "openpyxl", "pdf2image", "pdfminer.six", "pi-heif", "pyarrow", "pytesseract", "python-calamine", "python-snappy", "unstructured", "unstructured.pytesseract"] manifest-server = ["ddtrace", "fastapi", "uvicorn"] sql = ["sqlalchemy"] vector-db-based = ["cohere", "langchain_community", "langchain_core", "langchain_text_splitters", "openai", "tiktoken"] @@ -7045,4 +7321,4 @@ vector-db-based = ["cohere", "langchain_community", "langchain_core", "langchain [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.14" -content-hash = "b785d39f246498c8facd7854999dbdbfb78808489a09922dd3a1551be331ea7d" +content-hash = "a0992639f32e94cbabd42c1abcc80c4b986462a1eac64d890728b5fe943c573a" diff --git a/pyproject.toml b/pyproject.toml index bcdab217b..1678f2446 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,7 +77,8 @@ openpyxl = { version = "^3.1.0", optional = true } python-snappy = { version = "0.7.3", optional = true } # TODO: remove if unused tiktoken = { version = "0.8.0", optional = true } nltk = { version = "3.9.1", optional = true } -unstructured = { version = "0.10.27", extras = ["docx", "pptx"], optional = true } +unstructured = { version = "0.18.32", extras = ["docx", "pptx"], optional = true } +pi-heif = { version = ">=0.16.0", optional = true } "unstructured.pytesseract" = { version = ">=0.3.12", optional = true } pyjwt = "^2.8.0" cryptography = ">=44.0.0,<45.0.0" # Constrained as transitive dependency due to a bug in newer versions @@ -121,7 +122,7 @@ deptry = "^0.23.0" dagger-io = "0.19.0" [tool.poetry.extras] -file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown", "python-calamine", "openpyxl", "python-snappy"] +file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pi-heif", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown", "python-calamine", "openpyxl", "python-snappy"] vector-db-based = ["langchain_community", "langchain_core", "langchain_text_splitters", "openai", "cohere", "tiktoken"] sql = ["sqlalchemy"] dev = ["pytest"] @@ -261,6 +262,7 @@ DEP002 = [ "python-snappy", "tiktoken", "unstructured.pytesseract", + "pi-heif", ] # DEP003: Project should not use transitive dependencies. From f3ac153a09960e23ddfc51feca81b9399c95940f Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 18 Feb 2026 14:47:41 +0000 Subject: [PATCH 2/7] fix: handle unstructured 0.18.32 API changes in tests and parser Co-Authored-By: Ryan Waskewich --- .../sources/file_based/file_types/unstructured_parser.py | 7 +++---- .../file_based/file_types/test_unstructured_parser.py | 8 +++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py index e050bc91e..ae67c045c 100644 --- a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py @@ -406,10 +406,9 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT 3. Use the file content """ if remote_file.mime_type: - try: - return FileType.from_mime_type(remote_file.mime_type) - except ValueError: - pass + ft = FileType.from_mime_type(remote_file.mime_type) + if ft is not None: + return ft # set name to none, otherwise unstructured will try to get the modified date from the local file system if hasattr(file, "name"): diff --git a/unit_tests/sources/file_based/file_types/test_unstructured_parser.py b/unit_tests/sources/file_based/file_types/test_unstructured_parser.py index 374a02eed..284cc57dc 100644 --- a/unit_tests/sources/file_based/file_types/test_unstructured_parser.py +++ b/unit_tests/sources/file_based/file_types/test_unstructured_parser.py @@ -227,12 +227,14 @@ def test_infer_schema(mock_detect_filetype, filetype, format_config, raises): ), ], ) -@patch("unstructured.partition.pdf.partition_pdf") -@patch("unstructured.partition.pptx.partition_pptx") -@patch("unstructured.partition.docx.partition_docx") +@patch("airbyte_cdk.sources.file_based.file_types.unstructured_parser.unstructured_partition_pdf") +@patch("airbyte_cdk.sources.file_based.file_types.unstructured_parser.unstructured_partition_pptx") +@patch("airbyte_cdk.sources.file_based.file_types.unstructured_parser.unstructured_partition_docx") +@patch("airbyte_cdk.sources.file_based.file_types.unstructured_parser._import_unstructured") @patch("airbyte_cdk.sources.file_based.file_types.unstructured_parser.detect_filetype") def test_parse_records( mock_detect_filetype, + mock_import_unstructured, mock_partition_docx, mock_partition_pptx, mock_partition_pdf, From 4b907beddd78ebfa27e8a7a0fdb45d0da51a39e4 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 18 Feb 2026 14:52:03 +0000 Subject: [PATCH 3/7] fix: also skip FileType.UNK in mime_type detection to preserve fallback behavior Co-Authored-By: Ryan Waskewich --- .../sources/file_based/file_types/unstructured_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py index ae67c045c..d752795da 100644 --- a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py @@ -407,7 +407,7 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT """ if remote_file.mime_type: ft = FileType.from_mime_type(remote_file.mime_type) - if ft is not None: + if ft is not None and ft != FileType.UNK: return ft # set name to none, otherwise unstructured will try to get the modified date from the local file system From eb167efafe05e50418983ddf39db89747837ad90 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 18 Feb 2026 14:57:17 +0000 Subject: [PATCH 4/7] fix: remove unnecessary pi-heif dependency, simplify ImportError catch Co-Authored-By: Ryan Waskewich --- .../file_types/unstructured_parser.py | 2 +- poetry.lock | 63 +------------------ pyproject.toml | 4 +- 3 files changed, 4 insertions(+), 65 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py index d752795da..8a1daac4f 100644 --- a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py @@ -88,7 +88,7 @@ def _import_unstructured() -> None: from unstructured.partition.pdf import partition_pdf unstructured_partition_pdf = partition_pdf - except (ImportError, ModuleNotFoundError): + except ImportError: logger = logging.getLogger(__name__) logger.info( "Could not import unstructured.partition.pdf (requires unstructured_inference). PDF parsing will be unavailable." diff --git a/poetry.lock b/poetry.lock index bcffb41c9..e5408a09d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4025,65 +4025,6 @@ Jinja2 = ">=2.11.0" MarkupSafe = ">=1.1.1" pygments = ">=2.12.0" -[[package]] -name = "pi-heif" -version = "1.2.1" -description = "Python interface for libheif library" -optional = true -python-versions = ">=3.9" -groups = ["main"] -markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" -files = [ - {file = "pi_heif-1.2.1-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:ae60ba8bd0904c70ddd66f5b2ac416fa54f7db88e02f0d8a56cdda9c600329e7"}, - {file = "pi_heif-1.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cfd42065bb03b7c9405833a64cf3e354fe00285675f8df9383c1f57f3b04913a"}, - {file = "pi_heif-1.2.1-cp310-cp310-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9cdaa388e2637f7500fb77b9ffe8b49e90277cd12ea3ce8f4c40854eb3860ac0"}, - {file = "pi_heif-1.2.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:843de92f26cfccd1acfe74165b41e63d29586ca66faa83f59fbee3bebeda8788"}, - {file = "pi_heif-1.2.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:609bf44651c61d5ba89be12d63dbe58d5c59de4a65baa770320a371ffcc280c8"}, - {file = "pi_heif-1.2.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3ebba3c13808a32e26b3746e08e55db8306b8a47e448d03f004cb7d351072bc0"}, - {file = "pi_heif-1.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:0f5eca27e14bc56f6f45d73172d93efbb7c202ebd9f1070781b03ed54bc1f63e"}, - {file = "pi_heif-1.2.1-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:ab0db01fb98c0ea4c03f423968404abd0d1e0bddf0d42a1d08ccf43835f63c3c"}, - {file = "pi_heif-1.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:314ac18efea35926e2ee3d98c5729376ff60a9330faeae88c4e335f7edc26925"}, - {file = "pi_heif-1.2.1-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1f17cdfefe929b2e174836dc30a10285b2de753d1444661c4fb399167473f8d7"}, - {file = "pi_heif-1.2.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1eccabd3e33d95484b4e519c9ef018c7af1b9c6187aa83dea94aea3c880cfdaa"}, - {file = "pi_heif-1.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a08f5b665a6b089982a7c9b5e5233a7e92e8a1911c68077aa65dbc93d830c9a4"}, - {file = "pi_heif-1.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ea7174bffcccf373d3c13858f864d2f106a5c339d8729dc67ddcabd3adc61941"}, - {file = "pi_heif-1.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:ee80a3e9fba56c56196e9e5acb9e6667961d4d3316facd490cda32348370cd53"}, - {file = "pi_heif-1.2.1-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:5d63ce7f2e1ab27d21b81b149c18d362bfadf2f5a19c613433ddeacaa7c321aa"}, - {file = "pi_heif-1.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:23a223fd9d061ad3ddeca81c782381b56aa7bc349845587318637c6194f25f80"}, - {file = "pi_heif-1.2.1-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6847f0b4153118aa87e1c41195dc7aaa1215e56eccbe9c53ee14bb33af6c00b1"}, - {file = "pi_heif-1.2.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0869b4b1d2094539fc991533e20af5612ead2e60cdeca19a169584d68d79bd19"}, - {file = "pi_heif-1.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1cf305582ce3a5a62c0ee937d07b8b03b13e064cf2214173df0f215ed6f5665f"}, - {file = "pi_heif-1.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e224d88482710304e991b39daa935d6370bcb2c11ea6b53f4bd6d29e6bc977f7"}, - {file = "pi_heif-1.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:ee287a354c707a46c15b01ae8d906f68276ad6deb86407fec4e06051983bddcc"}, - {file = "pi_heif-1.2.1-cp313-cp313-macosx_10_15_x86_64.whl", hash = "sha256:c0d0107d0b98496d7b67c7134fa43142e402282ebb2b63cb80de9ae73f7fafc4"}, - {file = "pi_heif-1.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ec6544650400b0748761a69996d364db580f2ce2a3499b6caa3efe688fbe95a1"}, - {file = "pi_heif-1.2.1-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:67bf2c9d57481285f95a6bab412ac3a1d5d92cb4ff551e40dd4066b10bfe8f5a"}, - {file = "pi_heif-1.2.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:af86b98e599866ef18333ac18f3086fde00573c3e9a62c15f01157b0e209e841"}, - {file = "pi_heif-1.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:641066aeee2cf866fb885ae64fd0589ca019b4d419072cec3f4d2a51fa2ee4ab"}, - {file = "pi_heif-1.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:83d0ec074d7738fcf838d1472b342936add38233b8c0b6675005af97a11a2e27"}, - {file = "pi_heif-1.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:31ccfc4acc4804775dd035fefffd5d78c6030f01e9facc13bb14e27d8575983b"}, - {file = "pi_heif-1.2.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:9fa93b453c612ec14973b690f6a24cf4a0e406d7f9fc7fd8cde5554258794596"}, - {file = "pi_heif-1.2.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1b486eb0c6affe65df90bc1e2d34998f11f3ca6a0e7fd40f0360781f98ae1fa9"}, - {file = "pi_heif-1.2.1-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:115495664b22140c7137506a5cee4447c1e42f004867a3bf1a302feb8dc72187"}, - {file = "pi_heif-1.2.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3ddc145307274ff7d84f5fa6930ba0c6f42e80012c4423bc61f39b8ea282ce0a"}, - {file = "pi_heif-1.2.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:22137bad2aa0a745deb8851a24b0d1acb89cd0feb149be2147a5f0b371521990"}, - {file = "pi_heif-1.2.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:a1e2d14df2064812beef97cd37931c6431e9387aa363c0aea6bb9f45a9595f14"}, - {file = "pi_heif-1.2.1-cp314-cp314-win_amd64.whl", hash = "sha256:a06ba19fea5b868c0d55ee79adda8d04b4d0985594c51705b8e0cb3087401407"}, - {file = "pi_heif-1.2.1-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:cc9c470bf64cde82de45354e80eb643e1d26daf40e8669e79639f498b0f2395a"}, - {file = "pi_heif-1.2.1-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:ea23057fed02f7f2723f8b284d667ecf5b7ba7a46fce3ddfe670297cd49c8bbc"}, - {file = "pi_heif-1.2.1-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:85c14434c1b34c70b2100b79b2b7dfe162e21fa0d46132c22d178d15270b25b7"}, - {file = "pi_heif-1.2.1-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8b4bcc0a76597528b50d9d50dc7f9e6a33b7c130653e857381728b93ee568ab5"}, - {file = "pi_heif-1.2.1-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:fc4161300b570d36690f3cfc76eb60a5df13319fbeca34353f02ca3da826b5f3"}, - {file = "pi_heif-1.2.1.tar.gz", hash = "sha256:a5c5fd4d92b4f0541d8629eaadd95403ccdfd1b7f2ddced52844fa610713685d"}, -] - -[package.dependencies] -pillow = ">=11.1.0" - -[package.extras] -tests = ["defusedxml", "numpy", "packaging", "pympler", "pytest"] -tests-min = ["defusedxml", "packaging", "pytest"] - [[package]] name = "pillow" version = "11.1.0" @@ -7313,7 +7254,7 @@ cffi = ["cffi (>=1.17,<2.0)", "cffi (>=2.0.0b)"] [extras] dev = ["pytest"] -file-based = ["avro", "fastavro", "markdown", "openpyxl", "pdf2image", "pdfminer.six", "pi-heif", "pyarrow", "pytesseract", "python-calamine", "python-snappy", "unstructured", "unstructured.pytesseract"] +file-based = ["avro", "fastavro", "markdown", "openpyxl", "pdf2image", "pdfminer.six", "pyarrow", "pytesseract", "python-calamine", "python-snappy", "unstructured", "unstructured.pytesseract"] manifest-server = ["ddtrace", "fastapi", "uvicorn"] sql = ["sqlalchemy"] vector-db-based = ["cohere", "langchain_community", "langchain_core", "langchain_text_splitters", "openai", "tiktoken"] @@ -7321,4 +7262,4 @@ vector-db-based = ["cohere", "langchain_community", "langchain_core", "langchain [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.14" -content-hash = "a0992639f32e94cbabd42c1abcc80c4b986462a1eac64d890728b5fe943c573a" +content-hash = "b1490bed9f204a54b19a679a040f714fdeec9078880e6fc7ab0e6bde5da43098" diff --git a/pyproject.toml b/pyproject.toml index 1678f2446..b039d655f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,7 +78,6 @@ python-snappy = { version = "0.7.3", optional = true } # TODO: remove if unused tiktoken = { version = "0.8.0", optional = true } nltk = { version = "3.9.1", optional = true } unstructured = { version = "0.18.32", extras = ["docx", "pptx"], optional = true } -pi-heif = { version = ">=0.16.0", optional = true } "unstructured.pytesseract" = { version = ">=0.3.12", optional = true } pyjwt = "^2.8.0" cryptography = ">=44.0.0,<45.0.0" # Constrained as transitive dependency due to a bug in newer versions @@ -122,7 +121,7 @@ deptry = "^0.23.0" dagger-io = "0.19.0" [tool.poetry.extras] -file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pi-heif", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown", "python-calamine", "openpyxl", "python-snappy"] +file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown", "python-calamine", "openpyxl", "python-snappy"] vector-db-based = ["langchain_community", "langchain_core", "langchain_text_splitters", "openai", "cohere", "tiktoken"] sql = ["sqlalchemy"] dev = ["pytest"] @@ -262,7 +261,6 @@ DEP002 = [ "python-snappy", "tiktoken", "unstructured.pytesseract", - "pi-heif", ] # DEP003: Project should not use transitive dependencies. From 74612b842fecaab71aa747be51a24bf95d6b3426 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 18 Feb 2026 15:18:39 +0000 Subject: [PATCH 5/7] fix: update scenario test expectations for unstructured 0.18.32 output changes Co-Authored-By: Ryan Waskewich --- .../file_based/scenarios/unstructured_scenarios.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/unit_tests/sources/file_based/scenarios/unstructured_scenarios.py b/unit_tests/sources/file_based/scenarios/unstructured_scenarios.py index c0db46e7a..84ec258d8 100644 --- a/unit_tests/sources/file_based/scenarios/unstructured_scenarios.py +++ b/unit_tests/sources/file_based/scenarios/unstructured_scenarios.py @@ -434,7 +434,7 @@ { "data": { "document_key": "sample.pdf", - "content": "# Hello World", + "_ab_source_file_parse_error": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. Contact Support if you need assistance.\nfilename=sample.pdf message=PDF parsing requires the 'unstructured_inference' package. Install it with: pip install unstructured-inference", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "sample.pdf", }, @@ -443,7 +443,7 @@ { "data": { "document_key": "sample.docx", - "content": "# Content", + "content": "Content", "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", "_ab_source_file_url": "sample.docx", }, @@ -510,7 +510,7 @@ { "data": { "document_key": "sample.pdf", - "_ab_source_file_parse_error": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. Contact Support if you need assistance.\nfilename=sample.pdf message=No /Root object! - Is this really a PDF?", + "_ab_source_file_parse_error": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. Contact Support if you need assistance.\nfilename=sample.pdf message=PDF parsing requires the 'unstructured_inference' package. Install it with: pip install unstructured-inference", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "sample.pdf", }, @@ -578,7 +578,7 @@ { "data": { "document_key": "pdf_without_extension", - "content": "# Hello World", + "_ab_source_file_parse_error": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. Contact Support if you need assistance.\nfilename=pdf_without_extension message=PDF parsing requires the 'unstructured_inference' package. Install it with: pip install unstructured-inference", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "pdf_without_extension", }, @@ -587,7 +587,7 @@ { "data": { "document_key": "docx_without_extension", - "content": "# Content", + "content": "Content", "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", "_ab_source_file_url": "docx_without_extension", }, From c7fed0f12acef91a792ee6cc8a4b2ad62cdbaae9 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 18 Feb 2026 22:07:09 +0000 Subject: [PATCH 6/7] fix: update pdfminer.six pin to >=20231228 for unstructured 0.18.32 compatibility Co-Authored-By: Ryan Waskewich --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b039d655f..6f7498ea0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,7 +69,7 @@ langchain_text_splitters = { version = "^1.0.0", optional = true } markdown = { version = "*", optional = true } # TODO: Remove if unused openai = { version = "0.27.9", extras = ["embeddings"], optional = true } # Used indirectly by langchain library pdf2image = { version = "1.16.3", optional = true } -"pdfminer.six" = { version = "20221105", optional = true } # Used indirectly by unstructured library +"pdfminer.six" = { version = ">=20231228", optional = true } # Used indirectly by unstructured library pyarrow = { version = "^19.0.0", optional = true } pytesseract = { version = "0.3.10", optional = true } # Used indirectly by unstructured library python-calamine = { version = "0.2.3", optional = true } # TODO: Remove if unused From 400b27836ed8e2bf43c2213e0fcefdd6a56d5b8f Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 18 Feb 2026 22:07:16 +0000 Subject: [PATCH 7/7] chore: update poetry.lock for pdfminer.six version change Co-Authored-By: Ryan Waskewich --- poetry.lock | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/poetry.lock b/poetry.lock index e5408a09d..fcf0a216d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3987,15 +3987,15 @@ pillow = "*" [[package]] name = "pdfminer-six" -version = "20221105" +version = "20260107" description = "PDF parser and analyzer" optional = true -python-versions = ">=3.6" +python-versions = ">=3.10" groups = ["main"] markers = "(python_version <= \"3.11\" or python_version >= \"3.12.0\") and extra == \"file-based\"" files = [ - {file = "pdfminer.six-20221105-py3-none-any.whl", hash = "sha256:1eaddd712d5b2732f8ac8486824533514f8ba12a0787b3d5fe1e686cd826532d"}, - {file = "pdfminer.six-20221105.tar.gz", hash = "sha256:8448ab7b939d18b64820478ecac5394f482d7a79f5f7eaa7703c6c959c175e1d"}, + {file = "pdfminer_six-20260107-py3-none-any.whl", hash = "sha256:366585ba97e80dffa8f00cebe303d2f381884d8637af4ce422f1df3ef38111a9"}, + {file = "pdfminer_six-20260107.tar.gz", hash = "sha256:96bfd431e3577a55a0efd25676968ca4ce8fd5b53f14565f85716ff363889602"}, ] [package.dependencies] @@ -4003,8 +4003,6 @@ charset-normalizer = ">=2.0.0" cryptography = ">=36.0.0" [package.extras] -dev = ["black", "mypy (==0.931)", "nox", "pytest"] -docs = ["sphinx", "sphinx-argparse"] image = ["Pillow"] [[package]] @@ -7262,4 +7260,4 @@ vector-db-based = ["cohere", "langchain_community", "langchain_core", "langchain [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.14" -content-hash = "b1490bed9f204a54b19a679a040f714fdeec9078880e6fc7ab0e6bde5da43098" +content-hash = "35b6f772102c0696b04ed4e25da343e75d9b4c7617ba4de2b33b4398878e49ab"