Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2,259 changes: 1,187 additions & 1,072 deletions CHANGELOG.md

Large diffs are not rendered by default.

Binary file added example-docs/rotated-page-90.pdf
Binary file not shown.
17 changes: 6 additions & 11 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ dependencies = [
[project.optional-dependencies]
# Document type extras
csv = [
"pandas>=2.0.0, <4.0.0",
"pandas>=2.0.0, <3.0.0",
]
doc = [
"unstructured[docx]",
Expand All @@ -68,9 +68,9 @@ image = [
"pi-heif>=1.2.0, <2.0.0",
"pikepdf>=10.3.0, <11.0.0",
"pypdf>=6.6.2, <7.0.0",
"unstructured-inference>=1.6.2, <2.0.0; platform_system != 'Windows' and python_version >= '3.12'",
"unstructured-inference>=1.6.6, <2.0.0; platform_system != 'Windows' and python_version >= '3.12'",
"unstructured-inference>=1.2.0, <2.0.0; platform_system != 'Windows' and python_version < '3.12'",
"unstructured-inference>=1.6.2, <2.0.0; platform_system == 'Windows' and python_version >= '3.12' and python_version < '3.13'",
"unstructured-inference>=1.6.6, <2.0.0; platform_system == 'Windows' and python_version >= '3.12' and python_version < '3.13'",
"unstructured-pytesseract>=0.3.15, <1.0.0",
]
md = [
Expand Down Expand Up @@ -109,7 +109,7 @@ xlsx = [
"msoffcrypto-tool>=6.0.0, <7.0.0",
"networkx>=3.2.0, <4.0.0",
"openpyxl>=3.1.5, <4.0.0",
"pandas>=2.0.0, <4.0.0",
"pandas>=2.0.0, <3.0.0",
"xlrd>=2.0.1, <3.0.0",
]
# Speech-to-text for partition_audio (multimodal: audio -> elements)
Expand Down Expand Up @@ -195,14 +195,9 @@ required-environments = [
"sys_platform == 'darwin' and platform_machine == 'arm64'",
"sys_platform == 'win32'",
]
override-dependencies = [
# unstructured-inference 1.6.2 has unnecessarily aggressive numpy/pandas floors
# that conflict with kdbai-client (via pykx). The inference codebase only uses
# basic APIs available since numpy 1.26 / pandas 1.5.
"numpy>=1.26.0",
"pandas>=1.5.0",
]
constraint-dependencies = [
# Temporary pin for Azure public-container ingest regression in adlfs 2026.4.0 stack
"adlfs==2026.2.0",
# deltalake 1.3.0 is missing Linux ARM64 wheels, causing Docker ARM64 builds to fail
"deltalake<1.3.0",
"fonttools>=4.60.2",
Expand Down
15 changes: 8 additions & 7 deletions scripts/check-licenses.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,14 @@ Python-2.0"
# upstream source repository.
IGNORED_PACKAGES=(
# Metadata missing -- verified permissive on GitHub
arro3-core # MIT / Apache-2.0 (geoarrow/geoarrow-rs)
chroma-hnswlib # Apache-2.0 (chroma-core/hnswlib)
google-crc32c # Apache-2.0 (googleapis/python-crc32c)
iopath # MIT (facebookresearch/iopath)
pypdfium2 # BSD-3-Clause (PDFium/PDFium)
sentencepiece # Apache-2.0 (google/sentencepiece)
voyageai # MIT (voyage-ai/voyageai-python)
arro3-core # MIT / Apache-2.0 (geoarrow/geoarrow-rs)
chroma-hnswlib # Apache-2.0 (chroma-core/hnswlib)
google-crc32c # Apache-2.0 (googleapis/python-crc32c)
iopath # MIT (facebookresearch/iopath)
pypdfium2 # BSD-3-Clause (PDFium/PDFium)
sentencepiece # Apache-2.0 (google/sentencepiece)
voyageai # MIT (voyage-ai/voyageai-python)
matplotlib-inline # BSD-3-Clause (ipython/matplotlib-inline)

# Permissive but non-standard classifier
lmdb # OpenLDAP Public License (BSD-style, jnwatson/py-lmdb)
Expand Down
1 change: 1 addition & 0 deletions test_unstructured/partition/pdf_image/test_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ class MockPageLayout(layout.PageLayout):
def __init__(self, number: int, image: Image):
self.number = number
self.image = image
self.image_metadata = {"pdf_rotation": 0}
self.elements = [
layout.LayoutElement.from_coords(
type="Title",
Expand Down
17 changes: 17 additions & 0 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1824,3 +1824,20 @@ def test_reproductible_pdf_loader():
assert e1.text == e2.text, f"load two time {f=} return differents results"
else:
break


def test_hi_res_groups_rotated_page_text_into_words():
    """Hi-res partitioning of a rotated page yields grouped text, not lone characters.

    Partitions ``rotated-page-90.pdf`` with the HI_RES strategy and checks that
    "Hello World" appears inside some element longer than five characters and that
    no element consists of a single character.
    """
    rotated_pdf = example_doc_path("rotated-page-90.pdf")
    elements = pdf.partition_pdf(filename=rotated_pdf, strategy=PartitionStrategy.HI_RES)

    # Bucket the non-empty element texts by length: long ones are candidates for
    # the grouped phrase, one-character ones indicate text that was not grouped.
    texts = []
    single_chars = []
    for element in elements:
        content = element.text
        if not content:
            continue
        if len(content) > 5:
            texts.append(content)
        elif len(content) == 1:
            single_chars.append(content)

    phrase_found = any("Hello World" in t for t in texts)
    assert phrase_found, (
        f"Expected 'Hello World' as grouped text from rotated page, got: {texts[:5]}"
    )

    assert not single_chars, (
        f"Rotated page produced {len(single_chars)} single-char elements: {single_chars[:10]}"
    )
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@
<h1 class="Title" id="d3be9e3d661e2a79f37257caa5b54d8c">
LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis
</h1>
<p class="NarrativeText" id="4dfee7e352ae892814e46bb220094b0f">
Zejiang Shen! (4), Ruochen Zhang”, Melissa Dell?, Benjamin Charles Germain Lee*, Jacob Carlson®, and Weining Li&gt;
<p class="NarrativeText" id="607ee712429ac9cf3540dbdc5e55e143">
Zejiang Shen! (4), Ruochen Zhang”, Melissa Dell?, Benjamin Charles Germain Lee*, Jacob Carlson?, and Weining Li®
</p>
<p class="NarrativeText" id="23b8def20ce16f929d4f558b2a19f200">
1 Allen Institute for AI shannons@allenai.org 2 Brown University ruochen zhang@brown.edu 3 Harvard University {melissadell,jacob carlson}@fas.harvard.edu 4 University of Washington bcgl@cs.washington.edu 5 University of Waterloo w422li@uwaterloo.ca
Expand Down Expand Up @@ -561,8 +561,8 @@ <h1 class="Title" id="54ee49eac3f4e6098811cda1f9dd0306">
<li class="ListItem" id="184a3abfd34e7aa04632979ee3c2de36">
17 This measures the number of edits from the ground-truth text to the predicted text, and lower is better.
</li>
<li class="ListItem" id="2b7101f39954d5301166b82906202ea9">
LayoutParser: A Unified Toolkit for DL-Based DIA
<li class="ListItem" id="f1b03448874d9c98a0a59a20b134c513">
LayoutParser: A Unified Toolkit for DL-Based DIA 13
</li>
<img alt="ra (a) Partial table at the bottom (b) Full page table (c) Partial table at the top (d) Mis-detected text line" class="Image" id="d7ab3da5ec0adb1b2b4fb5f800a545a0"/>
<p class="FigureCaption" id="d35d253341e8b8d837f384ecd6ac410a">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ X
r
a
# LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis
Zejiang Shen! (4), Ruochen Zhang”, Melissa Dell?, Benjamin Charles Germain Lee*, Jacob Carlson®, and Weining Li>
Zejiang Shen! (4), Ruochen Zhang”, Melissa Dell?, Benjamin Charles Germain Lee*, Jacob Carlson?, and Weining Li®
1 Allen Institute for AI shannons@allenai.org 2 Brown University ruochen zhang@brown.edu 3 Harvard University {melissadell,jacob carlson}@fas.harvard.edu 4 University of Washington bcgl@cs.washington.edu 5 University of Waterloo w422li@uwaterloo.ca
Abstract. Recent advances in document image analysis (DIA) have been primarily driven by the application of neural networks. Ideally, research outcomes could be easily deployed in production and extended for further investigation. However, various factors like loosely organized codebases and sophisticated model configurations complicate the easy reuse of im- portant innovations by a wide audience. Though there have been on-going efforts to improve reusability and simplify deep learning (DL) model development in disciplines like natural language processing and computer vision, none of them are optimized for challenges in the domain of DIA. This represents a major gap in the existing toolkit, as DIA is central to academic research across a wide range of disciplines in the social sciences and humanities. This paper introduces LayoutParser, an open-source library for streamlining the usage of DL in DIA research and applica- tions. The core LayoutParser library comes with a set of simple and intuitive interfaces for applying and customizing DL models for layout de- tection, character recognition, and many other document processing tasks. To promote extensibility, LayoutParser also incorporates a community platform for sharing both pre-trained models and full document digiti- zation pipelines. We demonstrate that LayoutParser is helpful for both lightweight and large-scale digitization pipelines in real-word use cases. The library is publicly available at https://layout-parser.github.io.
Keywords: Document Image Analysis · Deep Learning · Layout Analysis · Character Recognition · Open Source library · Toolkit.
Expand Down Expand Up @@ -120,7 +120,7 @@ Additionally, it is common for historical documents to use unique fonts with di
Overall, it is possible to create an intricate and highly accurate digitization pipeline for large-scale digitization using LayoutParser. The pipeline avoids specifying the complicated rules used in traditional methods, is straightforward to develop, and is robust to outliers. The DL models also generate fine-grained results that enable creative approaches like page reorganization for OCR.
16 This measures the overlap between the detected and ground-truth characters, and the maximum is 1.
17 This measures the number of edits from the ground-truth text to the predicted text, and lower is better.
LayoutParser: A Unified Toolkit for DL-Based DIA
LayoutParser: A Unified Toolkit for DL-Based DIA 13
ra (a) Partial table at the bottom (b) Full page table (c) Partial table at the top (d) Mis-detected text line
Fig.6: This lightweight table detector can identify tables (outlined in red) and cells (shaded in blue) in different locations on a page. In very few cases (d), it might generate minor error predictions, e.g, failing to capture the top text line of a table.
5.2 A light-weight Visual Table Extractor
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -267,8 +267,8 @@
},
{
"type": "NarrativeText",
"element_id": "4dfee7e352ae892814e46bb220094b0f",
"text": "Zejiang Shen! (4), Ruochen Zhang”, Melissa Dell?, Benjamin Charles Germain Lee*, Jacob Carlson®, and Weining Li>",
"element_id": "607ee712429ac9cf3540dbdc5e55e143",
"text": "Zejiang Shen! (4), Ruochen Zhang”, Melissa Dell?, Benjamin Charles Germain Lee*, Jacob Carlson?, and Weining Li®",
"metadata": {
"is_extracted": "true",
"filetype": "application/pdf",
Expand Down Expand Up @@ -3292,8 +3292,8 @@
},
{
"type": "ListItem",
"element_id": "2b7101f39954d5301166b82906202ea9",
"text": "LayoutParser: A Unified Toolkit for DL-Based DIA",
"element_id": "f1b03448874d9c98a0a59a20b134c513",
"text": "LayoutParser: A Unified Toolkit for DL-Based DIA 13",
"metadata": {
"is_extracted": "true",
"filetype": "application/pdf",
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.22.19" # pragma: no cover
__version__ = "0.22.20" # pragma: no cover
22 changes: 22 additions & 0 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -732,6 +732,18 @@ def is_pdf_too_complex(
return False


def _enable_detect_vertical_if_rotated(
inferred_document_layout,
pdfminer_config: Optional["PDFMinerConfig"],
) -> Optional["PDFMinerConfig"]:
"""Enable detect_vertical in pdfminer when the PDF has rotated pages."""
if any((p.image_metadata or {}).get("pdf_rotation", 0) for p in inferred_document_layout.pages):
pdfminer_config = pdfminer_config or PDFMinerConfig()
pdfminer_config.detect_vertical = True

return pdfminer_config


@requires_dependencies("unstructured_inference")
def _partition_pdf_or_image_local(
filename: str = "",
Expand Down Expand Up @@ -815,6 +827,11 @@ def _partition_pdf_or_image_local(
password=password,
)

pdfminer_config = _enable_detect_vertical_if_rotated(
inferred_document_layout,
pdfminer_config,
)

extracted_layout, layouts_links = (
process_file_with_pdfminer(
filename=filename,
Expand Down Expand Up @@ -877,6 +894,11 @@ def _partition_pdf_or_image_local(
if hasattr(file, "seek"):
file.seek(0)

pdfminer_config = _enable_detect_vertical_if_rotated(
inferred_document_layout,
pdfminer_config,
)

extracted_layout, layouts_links = (
process_data_with_pdfminer(
file=file, dpi=pdf_image_dpi, password=password, pdfminer_config=pdfminer_config
Expand Down
1 change: 1 addition & 0 deletions unstructured/partition/pdf_image/pdfminer_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,7 @@ class PDFMinerConfig(BaseModel):
word_margin: Optional[float] = None
line_margin: Optional[float] = None
char_margin: Optional[float] = None
detect_vertical: Optional[bool] = None


def init_pdfminer(pdfminer_config: Optional[PDFMinerConfig] = None):
Expand Down
Loading
Loading