From cf38e6b7ad69d35e2fd681414ab68436bef9f618 Mon Sep 17 00:00:00 2001
From: laughingman7743 <laughingman7743@gmail.com>
Date: Tue, 24 Mar 2026 08:39:10 +0900
Subject: [PATCH 1/3] Always use C engine for CSV parsing, fix incorrect
 hallucinated claims
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Python engine selection for large files (>50MB) introduced in PR #594 was
based on incorrect AI-generated claims that pandas' C parser has 32-bit integer
limitations. This is factually wrong — pandas defaults to int64 and the C parser
has no such limits.

The original "signed integer is greater than maximum" error (Issue #414) was
caused by an OpenSSL SSL_read() 2GB buffer limit in Python 3.8 (bpo-42853),
which was fixed in Python 3.10 and is unrelated to CSV parsing.

The forced Python engine causes significant performance degradation (up to 28%
slower on pandas 2.3.3 per Issue #696 benchmarks).

Changes:
- _get_optimal_csv_engine() now always returns "c"
- Remove misleading "C parser limitations" error handler
- Update tests to expect C engine for all file sizes

Closes #696

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 pyathena/pandas/result_set.py        | 25 ++-----------------------
 tests/pyathena/pandas/test_cursor.py |  6 +++---
 2 files changed, 5 insertions(+), 26 deletions(-)

diff --git a/pyathena/pandas/result_set.py b/pyathena/pandas/result_set.py
index 592fdb2d..9ed678ce 100644
--- a/pyathena/pandas/result_set.py
+++ b/pyathena/pandas/result_set.py
@@ -389,10 +389,9 @@ def _get_optimal_csv_engine(self, file_size_bytes: int | None = None) -> str:
             file_size_bytes: Size of the CSV file in bytes.
 
         Returns:
-            'python' for large files (>50MB) to avoid C parser limits, otherwise 'c'.
+            Always returns 'c' as the C engine is faster and the previously
+            assumed 32-bit integer limitation does not exist in modern pandas.
         """
-        if file_size_bytes and file_size_bytes > self.LARGE_FILE_THRESHOLD_BYTES:
-            return "python"
         return "c"
 
     def _auto_determine_chunksize(self, file_size_bytes: int) -> int | None:
@@ -594,26 +593,6 @@ def _read_csv(self) -> TextFileReader | DataFrame:
 
         except Exception as e:
             _logger.exception("Failed to read %s.", self.output_location)
-            error_msg = str(e).lower()
-            if any(
-                phrase in error_msg
-                for phrase in ["signed integer", "maximum", "overflow", "int32", "c parser"]
-            ):
-                # Enhanced error message with specific recommendations
-                file_mb = (length or 0) // (1024 * 1024)
-                detailed_msg = (
-                    f"Large dataset processing error ({file_mb}MB file): {e}. "
-                    "This is likely due to pandas C parser limitations. "
-                    "Recommended solutions:\n"
-                    "1. Set chunksize: cursor = connection.cursor(PandasCursor, chunksize=50000)\n"
-                    "2. Enable auto-optimization: "
-                    "cursor = connection.cursor(PandasCursor, auto_optimize_chunksize=True)\n"
-                    "3. Use PyArrow engine: "
-                    "cursor = connection.cursor(PandasCursor, engine='pyarrow')\n"
-                    "4. Use Python engine: "
-                    "cursor = connection.cursor(PandasCursor, engine='python')"
-                )
-                raise OperationalError(detailed_msg) from e
             raise OperationalError(*e.args) from e
 
     def _read_parquet(self, engine) -> DataFrame:
diff --git a/tests/pyathena/pandas/test_cursor.py b/tests/pyathena/pandas/test_cursor.py
index 944dc025..fd5273e6 100644
--- a/tests/pyathena/pandas/test_cursor.py
+++ b/tests/pyathena/pandas/test_cursor.py
@@ -667,13 +667,13 @@ def test_get_optimal_csv_engine(self):
             result_set._engine = "auto"
             result_set._chunksize = None  # No chunking by default
 
-            # Small file should prefer C engine (compatibility-first approach)
+            # Small file should use C engine
             engine = result_set._get_optimal_csv_engine(1024)  # 1KB
             assert engine == "c"
 
-            # Large file should prefer Python engine (avoid C parser int32 limits)
+            # Large file should also use C engine
             engine = result_set._get_optimal_csv_engine(100 * 1024 * 1024)  # 100MB
-            assert engine == "python"
+            assert engine == "c"
 
     def test_auto_determine_chunksize(self):
         """Test _auto_determine_chunksize method behavior."""

From 0116ca33a5dc42865fee92ef39a0fbc1fd4e865f Mon Sep 17 00:00:00 2001
From: laughingman7743 <laughingman7743@gmail.com>
Date: Tue, 24 Mar 2026 08:59:24 +0900
Subject: [PATCH 2/3] Simplify CSV engine selection: remove dead methods and
 inline pyarrow logic

Delete _get_optimal_csv_engine (always returned "c") and _get_pyarrow_engine,
inlining the pyarrow compatibility checks directly into _get_csv_engine.
This reduces the method chain from 4 methods to 2 (keeping _get_available_engine
which is shared with _get_parquet_engine).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 pyathena/pandas/result_set.py        | 46 ++++++++--------------------
 tests/pyathena/pandas/test_cursor.py | 31 -------------------
 2 files changed, 12 insertions(+), 65 deletions(-)

diff --git a/pyathena/pandas/result_set.py b/pyathena/pandas/result_set.py
index 9ed678ce..5611e31c 100644
--- a/pyathena/pandas/result_set.py
+++ b/pyathena/pandas/result_set.py
@@ -326,32 +326,22 @@ def _get_csv_engine(
         Returns:
             CSV engine name ('pyarrow', 'c', or 'python').
         """
-        effective_chunksize = chunksize if chunksize is not None else self._chunksize
-
-        if self._engine == "pyarrow":
-            return self._get_pyarrow_engine(file_size_bytes, effective_chunksize)
-
         if self._engine in ("c", "python"):
             return self._engine
 
-        # Auto-selection for "auto" or unknown engine values
-        return self._get_optimal_csv_engine(file_size_bytes)
-
-    def _get_pyarrow_engine(self, file_size_bytes: int | None, chunksize: int | None) -> str:
-        """Get PyArrow engine if compatible, otherwise return optimal engine."""
-        # Check parameter compatibility
-        if chunksize is not None or self._quoting != 1 or self.converters:
-            return self._get_optimal_csv_engine(file_size_bytes)
-
-        # Check file size compatibility
-        if file_size_bytes is not None and file_size_bytes < self.PYARROW_MIN_FILE_SIZE_BYTES:
-            return self._get_optimal_csv_engine(file_size_bytes)
+        if self._engine == "pyarrow":
+            effective_chunksize = chunksize if chunksize is not None else self._chunksize
+            if effective_chunksize is not None or self._quoting != 1 or self.converters:
+                return "c"
+            if file_size_bytes is not None and file_size_bytes < self.PYARROW_MIN_FILE_SIZE_BYTES:
+                return "c"
+            try:
+                return self._get_available_engine(["pyarrow"])
+            except ImportError:
+                return "c"
 
-        # Check availability
-        try:
-            return self._get_available_engine(["pyarrow"])
-        except ImportError:
-            return self._get_optimal_csv_engine(file_size_bytes)
+        # "auto" or unknown → C engine (same as pandas default)
+        return "c"
 
     def _get_available_engine(self, engine_candidates: list[str]) -> str:
         """Get the first available engine from a list of candidates.
@@ -382,18 +372,6 @@ def _get_available_engine(self, engine_candidates: list[str]) -> str:
             f"{error_msgs}"
         )
 
-    def _get_optimal_csv_engine(self, file_size_bytes: int | None = None) -> str:
-        """Get the optimal CSV engine based on file size.
-
-        Args:
-            file_size_bytes: Size of the CSV file in bytes.
-
-        Returns:
-            Always returns 'c' as the C engine is faster and the previously
-            assumed 32-bit integer limitation does not exist in modern pandas.
-        """
-        return "c"
-
     def _auto_determine_chunksize(self, file_size_bytes: int) -> int | None:
         """Determine appropriate chunksize for large files based on file size.
 
diff --git a/tests/pyathena/pandas/test_cursor.py b/tests/pyathena/pandas/test_cursor.py
index fd5273e6..d5e6d51c 100644
--- a/tests/pyathena/pandas/test_cursor.py
+++ b/tests/pyathena/pandas/test_cursor.py
@@ -658,23 +658,6 @@ def test_no_ops(self):
         cursor.close()
         conn.close()
 
-    def test_get_optimal_csv_engine(self):
-        """Test _get_optimal_csv_engine method behavior."""
-
-        # Mock the parent class initialization
-        with patch("pyathena.pandas.result_set.AthenaResultSet.__init__"):
-            result_set = AthenaPandasResultSet.__new__(AthenaPandasResultSet)
-            result_set._engine = "auto"
-            result_set._chunksize = None  # No chunking by default
-
-            # Small file should use C engine
-            engine = result_set._get_optimal_csv_engine(1024)  # 1KB
-            assert engine == "c"
-
-            # Large file should also use C engine
-            engine = result_set._get_optimal_csv_engine(100 * 1024 * 1024)  # 100MB
-            assert engine == "c"
-
     def test_auto_determine_chunksize(self):
         """Test _auto_determine_chunksize method behavior."""
 
@@ -747,58 +730,46 @@ def test_get_csv_engine_explicit_specification(self):
 
             # Test PyArrow with incompatible chunksize (via parameter)
             with (
-                patch.object(result_set, "_get_available_engine", return_value="pyarrow"),
                 patch.object(
                     type(result_set), "converters", new_callable=PropertyMock, return_value={}
                 ),
-                patch.object(result_set, "_get_optimal_csv_engine", return_value="c") as mock_opt,
             ):
                 engine = result_set._get_csv_engine(chunksize=1000)
                 assert engine == "c"
-                mock_opt.assert_called_once()
 
             # Test PyArrow with incompatible chunksize (via instance variable)
             result_set._chunksize = 1000
             with (
-                patch.object(result_set, "_get_available_engine", return_value="pyarrow"),
                 patch.object(
                     type(result_set), "converters", new_callable=PropertyMock, return_value={}
                 ),
-                patch.object(result_set, "_get_optimal_csv_engine", return_value="c") as mock_opt,
             ):
                 engine = result_set._get_csv_engine()
                 assert engine == "c"
-                mock_opt.assert_called_once()
 
             # Test PyArrow with incompatible quoting
             result_set._chunksize = None
             result_set._quoting = 0  # Non-default quoting
             with (
-                patch.object(result_set, "_get_available_engine", return_value="pyarrow"),
                 patch.object(
                     type(result_set), "converters", new_callable=PropertyMock, return_value={}
                 ),
-                patch.object(result_set, "_get_optimal_csv_engine", return_value="c") as mock_opt,
             ):
                 engine = result_set._get_csv_engine()
                 assert engine == "c"
-                mock_opt.assert_called_once()
 
             # Test PyArrow with incompatible converters
             result_set._quoting = 1  # Reset to default
             with (
-                patch.object(result_set, "_get_available_engine", return_value="pyarrow"),
                 patch.object(
                     type(result_set),
                     "converters",
                     new_callable=PropertyMock,
                     return_value={"col1": str},
                 ),
-                patch.object(result_set, "_get_optimal_csv_engine", return_value="c") as mock_opt,
             ):
                 engine = result_set._get_csv_engine()
                 assert engine == "c"
-                mock_opt.assert_called_once()
 
             # Test PyArrow specification (when unavailable)
             with (
@@ -806,11 +777,9 @@ def test_get_csv_engine_explicit_specification(self):
                 patch.object(
                     type(result_set), "converters", new_callable=PropertyMock, return_value={}
                 ),
-                patch.object(result_set, "_get_optimal_csv_engine", return_value="c") as mock_opt,
             ):
                 engine = result_set._get_csv_engine()
                 assert engine == "c"
-                mock_opt.assert_called_once()
 
     @pytest.mark.parametrize(
         ("pandas_cursor", "parquet_engine", "chunksize"),

From 16599b5d2be98f0f3476a53f30a1850603f99472 Mon Sep 17 00:00:00 2001
From: laughingman7743 <laughingman7743@gmail.com>
Date: Tue, 24 Mar 2026 09:12:25 +0900
Subject: [PATCH 3/3] Refactor _get_csv_engine: single return point for C
 engine default

Consolidate multiple "return c" branches into a single fallthrough by
expressing the pyarrow compatibility check as a positive condition
(is_compatible) instead of multiple negative early returns.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 pyathena/pandas/result_set.py | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/pyathena/pandas/result_set.py b/pyathena/pandas/result_set.py
index 5611e31c..3e49383b 100644
--- a/pyathena/pandas/result_set.py
+++ b/pyathena/pandas/result_set.py
@@ -320,27 +320,32 @@ def _get_csv_engine(
         """Determine the appropriate CSV engine based on configuration and compatibility.
 
         Args:
-            file_size_bytes: Size of the CSV file in bytes.
+            file_size_bytes: Size of the CSV file in bytes. Only used for PyArrow
+                compatibility checks (minimum file size threshold).
             chunksize: Chunksize parameter (overrides self._chunksize if provided).
 
         Returns:
             CSV engine name ('pyarrow', 'c', or 'python').
         """
-        if self._engine in ("c", "python"):
-            return self._engine
+        if self._engine == "python":
+            return "python"
 
+        # Use PyArrow only when explicitly requested and all compatibility
+        # checks pass; otherwise fall through to the C engine default.
         if self._engine == "pyarrow":
             effective_chunksize = chunksize if chunksize is not None else self._chunksize
-            if effective_chunksize is not None or self._quoting != 1 or self.converters:
-                return "c"
-            if file_size_bytes is not None and file_size_bytes < self.PYARROW_MIN_FILE_SIZE_BYTES:
-                return "c"
-            try:
-                return self._get_available_engine(["pyarrow"])
-            except ImportError:
-                return "c"
+            is_compatible = (
+                effective_chunksize is None
+                and self._quoting == 1
+                and not self.converters
+                and (file_size_bytes is None or file_size_bytes >= self.PYARROW_MIN_FILE_SIZE_BYTES)
+            )
+            if is_compatible:
+                try:
+                    return self._get_available_engine(["pyarrow"])
+                except ImportError:
+                    pass
 
-        # "auto" or unknown → C engine (same as pandas default)
         return "c"
 
     def _get_available_engine(self, engine_candidates: list[str]) -> str: