Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
- **Chunk PDF rendering during OCR and image extraction**: `process_file_with_ocr()` now renders multi-page PDFs in configurable page ranges (`PDFIUM_CHUNK_SIZE`, default `8`) instead of one full-document render, and `save_elements()` renders only the page ranges actually needed for extracted images/tables instead of rasterizing the entire document.
- **Harden `PDFIUM_CHUNK_SIZE` configuration**: Invalid `PDFIUM_CHUNK_SIZE` values now fall back safely to the default with a warning instead of raising a request-path `ValueError`.

### Fixes
- **Preserve CSV semantics for single-column files**: Keep using CSV parsing rules when delimiter detection falls back to a single-column shape so quoted commas, escaped quotes, and quoted multiline cells decode correctly instead of being split as raw lines.

## 0.22.16

### Enhancements
Expand Down
93 changes: 93 additions & 0 deletions test_unstructured/partition/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,80 @@ def test_partition_csv_with_encoding():
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT


def test_partition_single_column_csv():
    """A delimiter-less (single-column) CSV still partitions into readable text."""
    expected_text = (
        "Lorem, ipsum dolor sit amet consectetur adipiscing, elit sed, do eiusmod "
        "tempor incididunt ut labore et dolore; magna aliqua Ut enim, ad minim, veniam"
    )

    elements = partition_csv(example_doc_path("single-column.csv"))

    assert clean_extra_whitespace(elements[0].text) == expected_text


def test_partition_single_column_csv_with_header():
    """A single-column CSV with `include_header=True` renders the real header row."""
    elements = partition_csv(example_doc_path("single-column.csv"), include_header=True)

    assert clean_extra_whitespace(elements[0].text) == (
        "Lorem, ipsum dolor sit amet consectetur adipiscing, elit sed, do eiusmod "
        "tempor incididunt ut labore et dolore; magna aliqua Ut enim, ad minim, veniam"
    )
    assert elements[0].metadata.text_as_html is not None
    # -- pandas' default integer column label must not leak into the rendered HTML header --
    assert "<td>0</td>" not in elements[0].metadata.text_as_html


def test_partition_csv_with_quoted_commas():
    """Commas inside a quoted field must not split the row into extra columns."""
    # -- the review-text field is quoted and contains several commas; naive splitting on
    # -- "," would shear it into many bogus cells --
    csv_data = (
        b"_id,title,reviewid,creationdate,criticname,originalscore,reviewstate,reviewtext\r\n"
        b"60297eea-73d7-4fca-a97e-ea73d7cfca62,City Hunter: Shinjuku Private Eyes,2590987,"
        b'2019-05-28,Reuben Baron,,fresh,"The choreography is so precise and lifelike at '
        b"points one might wonder whether the movie was rotoscoped, but no live-action "
        b"reference footage was used. The quality is due to the skill of the animators and "
        b"Kodama's love for professional wrestling.\"\r\n"
    )

    elements = partition_csv(file=io.BytesIO(csv_data))

    assert clean_extra_whitespace(elements[0].text).startswith(
        "_id title reviewid creationdate criticname originalscore reviewstate reviewtext"
    )
    assert "<td>reviewtext</td>" in elements[0].metadata.text_as_html


def test_partition_csv_keeps_multicolumn_shape_when_first_row_exceeds_sniff_window():
    """A first row longer than the 64KiB sniff window must not degrade to single-column."""
    # -- 40000 "a," pairs is ~80KB, so the delimiter-sniffing sample ends mid-row --
    long_first_row = b"a," * 40000 + b"aa\n"
    csv_data = long_first_row + b"left,right\n1,2\n"

    elements = partition_csv(file=io.BytesIO(csv_data))

    assert "left right" in elements[0].text
    assert elements[0].metadata.text_as_html is not None
    assert "<td>left</td>" in elements[0].metadata.text_as_html
    assert "<td>right</td>" in elements[0].metadata.text_as_html


def test_partition_tsv_when_delimiter_sniffing_falls_back():
    """Tab-delimited content keeps its two-column shape in the rendered HTML table."""
    elements = partition_csv(file=io.BytesIO(b"left\tright\n1\t2\n"))

    html = elements[0].metadata.text_as_html
    assert html is not None
    for cell in ("left", "right", "1", "2"):
        assert "<td>%s</td>" % cell in html


def test_partition_single_column_csv_preserves_quoted_fields():
    """Quoted commas, escaped quotes, and multiline cells survive the single-column path."""
    csv_data = b'notes\r\n"hello, world"\r\n"a ""quote"""\r\n"line 1\nline 2"\r\n'

    elements = partition_csv(file=io.BytesIO(csv_data), include_header=True)

    # -- csv.reader decodes the quoted multiline cell into a string containing a real
    # -- newline character, and HtmlTable.from_html_text() normalizes all whitespace via
    # -- " ".join(text.split()); the newline therefore becomes a single space, not a
    # -- literal backslash-n sequence --
    assert elements[0].text == 'notes hello, world a "quote" line 1 line 2'
    assert elements[0].metadata.text_as_html is not None
    assert "<td>hello, world</td>" in elements[0].metadata.text_as_html
    assert '<td>a "quote"</td>' in elements[0].metadata.text_as_html
    assert "<td>line 1 line 2</td>" in elements[0].metadata.text_as_html
    # -- the surrounding quotes and doubled escape quotes are consumed by the CSV parser,
    # -- never echoed as raw source text --
    assert '"hello, world"' not in elements[0].metadata.text_as_html
    assert '""quote""' not in elements[0].metadata.text_as_html


@pytest.mark.parametrize(
("filename", "expected_text", "expected_table"),
[
Expand Down Expand Up @@ -260,6 +334,25 @@ def and_it_auto_detects_the_delimiter_for_a_semicolon_delimited_CSV_file(self):
ctx = _CsvPartitioningContext(example_doc_path("semicolon-delimited.csv"))
assert ctx.delimiter == ";"

def and_it_auto_detects_the_delimiter_for_a_tab_delimited_CSV_file(self):
    # -- "\t" is included in SNIFFABLE_DELIMITERS, so TSV content is detected directly --
    ctx = _CsvPartitioningContext(file=io.BytesIO(b"a\tb\n1\t2\n"))
    assert ctx.delimiter == "\t"

def and_it_auto_detects_the_delimiter_for_a_small_file_without_a_trailing_newline(self):
    # -- a file smaller than the sniff window is not truncated, so its final
    # -- newline-less line stays in the sample and the comma is still found --
    ctx = _CsvPartitioningContext(file=io.BytesIO(b"a,b"))
    assert ctx.delimiter == ","

def and_it_auto_detects_the_delimiter_for_an_exact_size_file_without_a_trailing_newline(self):
    # -- a file exactly the size of the 64KiB sniff window must not be mistaken for a
    # -- truncated sample (truncation is detected by reading one byte past the window) --
    line = ("a," * 32767) + "aa"
    assert len(line) == 65536
    ctx = _CsvPartitioningContext(file=io.BytesIO(line.encode()))
    assert ctx.delimiter == ","

def and_it_keeps_the_delimiter_when_the_first_line_exceeds_the_sniff_window(self):
    # -- ~80KB first line means the sample ends mid-line; sniffing should still find the
    # -- comma rather than falling back to the single-column (None) shape --
    line = (b"a," * 40000) + b"aa\n1,2\n"
    ctx = _CsvPartitioningContext(file=io.BytesIO(line))
    assert ctx.delimiter == ","

def but_it_returns_None_as_the_delimiter_for_a_single_column_CSV_file(self):
    # -- no delimiter can be assumed for single-column data; None selects the
    # -- single-column parsing path in partition_csv() --
    ctx = _CsvPartitioningContext(example_doc_path("single-column.csv"))
    assert ctx.delimiter is None
Expand Down
56 changes: 50 additions & 6 deletions unstructured/partition/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import contextlib
import csv
import io
from functools import cached_property
from typing import IO, Any, Iterator

Expand All @@ -16,6 +17,7 @@

DETECTION_ORIGIN: str = "csv"
CSV_FIELD_LIMIT = 10 * 1048576 # 10MiB
SNIFFABLE_DELIMITERS = ",;\t|"


@apply_metadata(FileType.CSV)
Expand Down Expand Up @@ -58,7 +60,15 @@ def partition_csv(

csv.field_size_limit(CSV_FIELD_LIMIT)
with ctx.open() as file:
dataframe = pd.read_csv(file, header=ctx.header, sep=ctx.delimiter, encoding=ctx.encoding)
if ctx.delimiter is None:
dataframe = ctx.single_column_dataframe(file)
else:
dataframe = pd.read_csv(
file,
header=ctx.header,
sep=ctx.delimiter,
encoding=ctx.encoding,
)

html_table = HtmlTable.from_html_text(
dataframe.to_html(index=False, header=include_header, na_rep="")
Expand Down Expand Up @@ -122,14 +132,31 @@ def delimiter(self) -> str | None:
num_bytes = 65536

with self.open() as file:
# -- read whole lines, sniffer can be confused by a trailing partial line --
data = "\n".join(
ln.decode(self._encoding or "utf-8") for ln in file.readlines(num_bytes)
)
sample = file.read(num_bytes + 1)

is_truncated = len(sample) > num_bytes
if is_truncated:
sample = sample[:num_bytes]

data = sample.decode(self._encoding or "utf-8", errors="ignore")
if is_truncated and not data.endswith(("\n", "\r")):
last_newline = max(data.rfind("\n"), data.rfind("\r"))
if last_newline != -1:
data = data[:last_newline]

try:
return sniffer.sniff(data, delimiters=",;|").delimiter
return sniffer.sniff(data, delimiters=SNIFFABLE_DELIMITERS).delimiter
except csv.Error:
# -- `csv.Sniffer` can fail on small files with quoted delimiters. Fall back to
# -- testing candidate delimiters and accept only those that produce a consistent
# -- multi-column shape.
candidate_delimiters = (",", ";", "\t", "|")
for delimiter in candidate_delimiters:
rows = list(csv.reader(io.StringIO(data), delimiter=delimiter))
row_lengths = [len(row) for row in rows if row]
if row_lengths and min(row_lengths) > 1 and len(set(row_lengths)) == 1:
return delimiter

# -- sniffing will fail on single-column csv as no default can be assumed --
return None

Expand All @@ -143,6 +170,23 @@ def encoding(self) -> str | None:
"""The encoding to use for reading the file."""
return self._encoding

def single_column_dataframe(self, file: IO[bytes]) -> pd.DataFrame:
"""Parse a delimiter-less CSV while still honoring CSV quoting semantics.

These files are treated as a single logical column, so commas remain literal text, but
quoted commas, escaped quotes, and quoted multiline fields are still decoded by the CSV
parser instead of being left as raw source text.
"""
text = file.read().decode(self.encoding or "utf-8")
rows = list(csv.reader(io.StringIO(text), delimiter="\0"))

if self.header == 0:
if not rows:
return pd.DataFrame()
return pd.DataFrame(rows[1:], columns=rows[0])

return pd.DataFrame(rows)

@cached_property
def last_modified(self) -> str | None:
"""The best last-modified date available, None if no sources are available."""
Expand Down
Loading