Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
- **Chunk PDF rendering during OCR and image extraction**: `process_file_with_ocr()` now renders multi-page PDFs in configurable page ranges (`PDFIUM_CHUNK_SIZE`, default `8`) instead of one full-document render, and `save_elements()` renders only the page ranges actually needed for extracted images/tables instead of rasterizing the entire document.
- **Harden `PDFIUM_CHUNK_SIZE` configuration**: Invalid `PDFIUM_CHUNK_SIZE` values now fall back safely to the default with a warning instead of raising a request-path `ValueError`.

### Fixes
- **Preserve CSV semantics for single-column files**: Keep using CSV parsing rules when delimiter detection falls back to a single-column shape so quoted commas, escaped quotes, and quoted multiline cells decode correctly instead of being split as raw lines.

## 0.22.16

### Enhancements
Expand Down
93 changes: 93 additions & 0 deletions test_unstructured/partition/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,80 @@ def test_partition_csv_with_encoding():
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT


def test_partition_single_column_csv():
    """A delimiter-less (single-column) CSV still partitions into readable text."""
    expected_text = (
        "Lorem, ipsum dolor sit amet consectetur adipiscing, elit sed, do eiusmod "
        "tempor incididunt ut labore et dolore; magna aliqua Ut enim, ad minim, veniam"
    )

    elements = partition_csv(example_doc_path("single-column.csv"))

    assert clean_extra_whitespace(elements[0].text) == expected_text


def test_partition_single_column_csv_with_header():
    """A single-column CSV with `include_header=True` renders the real header row."""
    elements = partition_csv(example_doc_path("single-column.csv"), include_header=True)

    assert clean_extra_whitespace(elements[0].text) == (
        "Lorem, ipsum dolor sit amet consectetur adipiscing, elit sed, do eiusmod "
        "tempor incididunt ut labore et dolore; magna aliqua Ut enim, ad minim, veniam"
    )
    assert elements[0].metadata.text_as_html is not None
    # -- pandas' default integer column label must not leak into the rendered HTML header --
    assert "<td>0</td>" not in elements[0].metadata.text_as_html


def test_partition_csv_with_quoted_commas():
    """Commas inside a quoted field must not split the row into extra columns."""
    # -- the review-text field is quoted and contains several commas; naive splitting on
    # -- "," would shear it into many bogus cells --
    csv_data = (
        b"_id,title,reviewid,creationdate,criticname,originalscore,reviewstate,reviewtext\r\n"
        b"60297eea-73d7-4fca-a97e-ea73d7cfca62,City Hunter: Shinjuku Private Eyes,2590987,"
        b'2019-05-28,Reuben Baron,,fresh,"The choreography is so precise and lifelike at '
        b"points one might wonder whether the movie was rotoscoped, but no live-action "
        b"reference footage was used. The quality is due to the skill of the animators and "
        b"Kodama's love for professional wrestling.\"\r\n"
    )

    elements = partition_csv(file=io.BytesIO(csv_data))

    assert clean_extra_whitespace(elements[0].text).startswith(
        "_id title reviewid creationdate criticname originalscore reviewstate reviewtext"
    )
    assert "<td>reviewtext</td>" in elements[0].metadata.text_as_html


def test_partition_csv_keeps_multicolumn_shape_when_first_row_exceeds_sniff_window():
    """A first row longer than the 64KiB sniff window must not degrade to single-column."""
    # -- 40000 "a," pairs is ~80KB, so the delimiter-sniffing sample ends mid-row --
    long_first_row = b"a," * 40000 + b"aa\n"
    csv_data = long_first_row + b"left,right\n1,2\n"

    elements = partition_csv(file=io.BytesIO(csv_data))

    assert "left right" in elements[0].text
    assert elements[0].metadata.text_as_html is not None
    assert "<td>left</td>" in elements[0].metadata.text_as_html
    assert "<td>right</td>" in elements[0].metadata.text_as_html


def test_partition_tsv_when_delimiter_sniffing_falls_back():
    """Tab-delimited content keeps its two-column shape in the rendered HTML table."""
    elements = partition_csv(file=io.BytesIO(b"left\tright\n1\t2\n"))

    html = elements[0].metadata.text_as_html
    assert html is not None
    for cell in ("left", "right", "1", "2"):
        assert "<td>%s</td>" % cell in html


def test_partition_single_column_csv_preserves_quoted_fields():
    """Quoted commas, escaped quotes, and multiline cells survive the single-column path."""
    csv_data = b'notes\r\n"hello, world"\r\n"a ""quote"""\r\n"line 1\nline 2"\r\n'

    elements = partition_csv(file=io.BytesIO(csv_data), include_header=True)

    # -- csv.reader decodes the quoted multiline cell into a string containing a real
    # -- newline character, and HtmlTable.from_html_text() normalizes all whitespace via
    # -- " ".join(text.split()); the newline therefore becomes a single space, not a
    # -- literal backslash-n sequence --
    assert elements[0].text == 'notes hello, world a "quote" line 1 line 2'
    assert elements[0].metadata.text_as_html is not None
    assert "<td>hello, world</td>" in elements[0].metadata.text_as_html
    assert '<td>a "quote"</td>' in elements[0].metadata.text_as_html
    assert "<td>line 1 line 2</td>" in elements[0].metadata.text_as_html
    # -- the surrounding quotes and doubled escape quotes are consumed by the CSV parser,
    # -- never echoed as raw source text --
    assert '"hello, world"' not in elements[0].metadata.text_as_html
    assert '""quote""' not in elements[0].metadata.text_as_html


@pytest.mark.parametrize(
("filename", "expected_text", "expected_table"),
[
Expand Down Expand Up @@ -260,6 +334,25 @@ def and_it_auto_detects_the_delimiter_for_a_semicolon_delimited_CSV_file(self):
ctx = _CsvPartitioningContext(example_doc_path("semicolon-delimited.csv"))
assert ctx.delimiter == ";"

def and_it_auto_detects_the_delimiter_for_a_tab_delimited_CSV_file(self):
    # -- "\t" is included in SNIFFABLE_DELIMITERS, so TSV content is detected directly --
    ctx = _CsvPartitioningContext(file=io.BytesIO(b"a\tb\n1\t2\n"))
    assert ctx.delimiter == "\t"

def and_it_auto_detects_the_delimiter_for_a_small_file_without_a_trailing_newline(self):
    # -- a file smaller than the sniff window is not truncated, so its final
    # -- newline-less line stays in the sample and the comma is still found --
    ctx = _CsvPartitioningContext(file=io.BytesIO(b"a,b"))
    assert ctx.delimiter == ","

def and_it_auto_detects_the_delimiter_for_an_exact_size_file_without_a_trailing_newline(self):
    # -- a file exactly the size of the 64KiB sniff window must not be mistaken for a
    # -- truncated sample (truncation is detected by reading one byte past the window) --
    line = ("a," * 32767) + "aa"
    assert len(line) == 65536
    ctx = _CsvPartitioningContext(file=io.BytesIO(line.encode()))
    assert ctx.delimiter == ","

def and_it_keeps_the_delimiter_when_the_first_line_exceeds_the_sniff_window(self):
    # -- ~80KB first line means the sample ends mid-line; sniffing should still find the
    # -- comma rather than falling back to the single-column (None) shape --
    line = (b"a," * 40000) + b"aa\n1,2\n"
    ctx = _CsvPartitioningContext(file=io.BytesIO(line))
    assert ctx.delimiter == ","

def but_it_returns_None_as_the_delimiter_for_a_single_column_CSV_file(self):
    # -- no delimiter can be assumed for single-column data; None selects the
    # -- single-column parsing path in partition_csv() --
    ctx = _CsvPartitioningContext(example_doc_path("single-column.csv"))
    assert ctx.delimiter is None
Expand Down
56 changes: 50 additions & 6 deletions unstructured/partition/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import contextlib
import csv
import io
from functools import cached_property
from typing import IO, Any, Iterator

Expand All @@ -16,6 +17,7 @@

DETECTION_ORIGIN: str = "csv"
CSV_FIELD_LIMIT = 10 * 1048576 # 10MiB
SNIFFABLE_DELIMITERS = ",;\t|"


@apply_metadata(FileType.CSV)
Expand Down Expand Up @@ -58,7 +60,15 @@ def partition_csv(

csv.field_size_limit(CSV_FIELD_LIMIT)
with ctx.open() as file:
dataframe = pd.read_csv(file, header=ctx.header, sep=ctx.delimiter, encoding=ctx.encoding)
if ctx.delimiter is None:
dataframe = ctx.single_column_dataframe(file)
else:
dataframe = pd.read_csv(
file,
header=ctx.header,
sep=ctx.delimiter,
encoding=ctx.encoding,
)

html_table = HtmlTable.from_html_text(
dataframe.to_html(index=False, header=include_header, na_rep="")
Expand Down Expand Up @@ -122,14 +132,31 @@ def delimiter(self) -> str | None:
num_bytes = 65536

with self.open() as file:
# -- read whole lines, sniffer can be confused by a trailing partial line --
data = "\n".join(
ln.decode(self._encoding or "utf-8") for ln in file.readlines(num_bytes)
)
sample = file.read(num_bytes + 1)

is_truncated = len(sample) > num_bytes
if is_truncated:
sample = sample[:num_bytes]

data = sample.decode(self._encoding or "utf-8", errors="ignore")
if is_truncated and not data.endswith(("\n", "\r")):
last_newline = max(data.rfind("\n"), data.rfind("\r"))
if last_newline != -1:
data = data[:last_newline]

try:
return sniffer.sniff(data, delimiters=",;|").delimiter
return sniffer.sniff(data, delimiters=SNIFFABLE_DELIMITERS).delimiter
except csv.Error:
# -- `csv.Sniffer` can fail on small files with quoted delimiters. Fall back to
# -- testing candidate delimiters and accept only those that produce a consistent
# -- multi-column shape.
candidate_delimiters = (",", ";", "\t", "|")
for delimiter in candidate_delimiters:
rows = list(csv.reader(io.StringIO(data), delimiter=delimiter))
row_lengths = [len(row) for row in rows if row]
if row_lengths and min(row_lengths) > 1 and len(set(row_lengths)) == 1:
return delimiter

# -- sniffing will fail on single-column csv as no default can be assumed --
return None

Expand All @@ -143,6 +170,23 @@ def encoding(self) -> str | None:
"""The encoding to use for reading the file."""
return self._encoding

def single_column_dataframe(self, file: IO[bytes]) -> pd.DataFrame:
"""Parse a delimiter-less CSV while still honoring CSV quoting semantics.

These files are treated as a single logical column, so commas remain literal text, but
quoted commas, escaped quotes, and quoted multiline fields are still decoded by the CSV
parser instead of being left as raw source text.
"""
text = file.read().decode(self.encoding or "utf-8")
rows = list(csv.reader(io.StringIO(text), delimiter="\0"))

if self.header == 0:
if not rows:
return pd.DataFrame()
return pd.DataFrame(rows[1:], columns=rows[0])

return pd.DataFrame(rows)

@cached_property
def last_modified(self) -> str | None:
"""The best last-modified date available, None if no sources are available."""
Expand Down
Loading