From c5b345be416ba2562303e2542cc2ed9d9bbd592f Mon Sep 17 00:00:00 2001 From: Ernest Provo Date: Sun, 22 Feb 2026 14:10:02 -0500 Subject: [PATCH 1/2] GH-41365: [Python] Fix S3 URI resolution with whitespace falling back to LocalFileSystem Port the C++ `IsLikelyUri()` heuristic to Python and use it in `_resolve_filesystem_and_path()` to distinguish between local paths and malformed URIs before suppressing `ValueError`. Previously, URIs like `s3://bucket/path with space/file` would silently fall back to `LocalFileSystem` because the "Cannot parse URI" error was unconditionally swallowed. Now, paths that look like URIs (valid scheme prefix) propagate the parsing error, while local paths continue to fall back to `LocalFileSystem` as expected. --- python/pyarrow/fs.py | 40 ++++++++++++++++++--- python/pyarrow/tests/test_fs.py | 61 +++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py index 670ccaaf245..41308cd60c1 100644 --- a/python/pyarrow/fs.py +++ b/python/pyarrow/fs.py @@ -82,6 +82,32 @@ def __getattr__(name): ) +def _is_likely_uri(path): + """ + Check if a string looks like a URI (has a valid scheme followed by ':'). + + This is a Python port of the C++ ``IsLikelyUri()`` heuristic in + ``cpp/src/arrow/filesystem/path_util.cc``. It is intentionally + conservative: single-letter schemes are excluded (could be a Windows + drive letter) and the scheme must conform to RFC 3986. + """ + if not path or path[0] == '/': + return False + colon_pos = path.find(':') + if colon_pos < 0: + return False + # One-letter URI schemes don't officially exist; may be a Windows drive + if colon_pos < 2: + return False + # The largest IANA-registered scheme is 36 characters + if colon_pos > 36: + return False + scheme = path[:colon_pos] + if not scheme[0].isalpha(): + return False + return all(c.isalnum() or c in ('+', '-', '.') for c in scheme[1:]) + + def _ensure_filesystem(filesystem, *, use_mmap=False): if isinstance(filesystem, FileSystem): return filesystem @@ -174,13 +200,17 @@ def _resolve_filesystem_and_path(path, filesystem=None, *, memory_map=False): filesystem, path = FileSystem.from_uri(path) except ValueError as e: msg = str(e) - if "empty scheme" in msg or "Cannot parse URI" in msg: - # neither an URI nor a locally existing path, so assume that - # local path was given and propagate a nicer file not found - # error instead of a more confusing scheme parsing error + if "empty scheme" in msg: + # No scheme at all — treat as a local path and propagate + # a nicer "file not found" error later. + pass + elif "Cannot parse URI" in msg and not _is_likely_uri(path): + # Path doesn't look like a URI (no valid scheme prefix), + # so treat it as a local path rather than surfacing a + # confusing URI-parsing error. pass else: - raise e + raise else: path = filesystem.normalize_path(path) diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 376398baa07..537ceb99997 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -1695,6 +1695,67 @@ def test_filesystem_from_path_object(path): assert path == p.resolve().absolute().as_posix() +def test_is_likely_uri(): + """Unit tests for the _is_likely_uri() heuristic.""" + from pyarrow.fs import _is_likely_uri + + # Valid URI schemes + assert _is_likely_uri("s3://bucket/key") + assert _is_likely_uri("gs://bucket/key") + assert _is_likely_uri("hdfs://host/path") + assert _is_likely_uri("file:///local/path") + assert _is_likely_uri("abfss://container@account/path") + assert _is_likely_uri("grpc+https://host:443") + + # Not URIs — local paths, Windows drives, empty, etc. + assert not _is_likely_uri("") + assert not _is_likely_uri("/absolute/path") + assert not _is_likely_uri("relative/path") + assert not _is_likely_uri("C:\\Users\\foo") # single-letter → drive + assert not _is_likely_uri("C:/Users/foo") + assert not _is_likely_uri("3bucket://key") # scheme starts with digit + assert not _is_likely_uri("-scheme://key") # scheme starts with dash + + +def test_resolve_filesystem_and_path_uri_with_spaces(): + """ + URIs with a recognised scheme but un-encoded spaces must raise + ValueError — NOT silently fall back to LocalFileSystem. + (GH-41365) + """ + from pyarrow.fs import _resolve_filesystem_and_path + + # S3 URI with spaces should raise, not return LocalFileSystem + with pytest.raises(ValueError, match="Cannot parse URI"): + _resolve_filesystem_and_path("s3://bucket/path with space/file.parquet") + + # GCS URI with spaces should also raise + with pytest.raises(ValueError, match="Cannot parse URI"): + _resolve_filesystem_and_path("gs://bucket/path with space/file.csv") + + # abfss URI with spaces + with pytest.raises(ValueError, match="Cannot parse URI"): + _resolve_filesystem_and_path( + "abfss://container@account/dir with space/file" + ) + + +def test_resolve_filesystem_and_path_local_with_spaces(): + """ + Local paths (no scheme) with spaces should still resolve to + LocalFileSystem — they must NOT be confused with malformed URIs. + """ + from pyarrow.fs import _resolve_filesystem_and_path + + # Absolute local path with spaces → LocalFileSystem + fs, path = _resolve_filesystem_and_path("/tmp/path with spaces/data") + assert isinstance(fs, LocalFileSystem) + + # Non-existent absolute path → still LocalFileSystem + fs, path = _resolve_filesystem_and_path("/nonexistent/path") + assert isinstance(fs, LocalFileSystem) + + @pytest.mark.s3 def test_filesystem_from_uri_s3(s3_server): from pyarrow.fs import S3FileSystem From ab5292db58023989323329ae71f294d979fd35e6 Mon Sep 17 00:00:00 2001 From: Ernest Provo Date: Fri, 6 Mar 2026 22:52:49 -0500 Subject: [PATCH 2/2] GH-41365: [Python] Refactor _is_likely_uri to use C++ via Cython Replace the pure-Python _is_likely_uri() with a Cython wrapper that calls the C++ arrow::fs::IsLikelyUri(), as requested in PR review. - Move IsLikelyUri from arrow::fs::internal to arrow::fs namespace - Add public declaration in filesystem.h - Keep inline backward-compat wrapper in path_util.h - Add Cython binding in libarrow_fs.pxd and wrapper in _fs.pyx - Import from _fs module in fs.py, delete Python implementation - Add non-ASCII scheme test cases --- cpp/src/arrow/filesystem/filesystem.h | 7 +++++++ cpp/src/arrow/filesystem/path_util.cc | 4 ++++ cpp/src/arrow/filesystem/path_util.h | 7 +++++-- python/pyarrow/_fs.pyx | 8 ++++++++ python/pyarrow/fs.py | 27 +------------------------ python/pyarrow/includes/libarrow_fs.pxd | 2 ++ python/pyarrow/tests/test_fs.py | 2 ++ 7 files changed, 29 insertions(+), 28 deletions(-) diff --git a/cpp/src/arrow/filesystem/filesystem.h b/cpp/src/arrow/filesystem/filesystem.h index 3a47eb62f52..85ee3ff1146 100644 --- a/cpp/src/arrow/filesystem/filesystem.h +++ b/cpp/src/arrow/filesystem/filesystem.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -529,6 +530,12 @@ class ARROW_EXPORT SlowFileSystem : public FileSystem { /// The user is responsible for synchronization of calls to this function. void EnsureFinalized(); +/// \brief Return whether a path string is likely a URI. +/// +/// This heuristic is conservative and may return false for malformed URIs. +ARROW_EXPORT +bool IsLikelyUri(std::string_view path); + /// \defgroup filesystem-factories Functions for creating FileSystem instances /// /// @{ diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc index dc82afd07e7..e5678813731 100644 --- a/cpp/src/arrow/filesystem/path_util.cc +++ b/cpp/src/arrow/filesystem/path_util.cc @@ -337,6 +337,8 @@ bool IsEmptyPath(std::string_view v) { return true; } +} // namespace internal + bool IsLikelyUri(std::string_view v) { if (v.empty() || v[0] == '/') { return false; @@ -357,6 +359,8 @@ bool IsLikelyUri(std::string_view v) { return ::arrow::util::IsValidUriScheme(v.substr(0, pos)); } +namespace internal { + struct Globber::Impl { std::regex pattern_; diff --git a/cpp/src/arrow/filesystem/path_util.h b/cpp/src/arrow/filesystem/path_util.h index d49d9d2efa7..d146a57fbe1 100644 --- a/cpp/src/arrow/filesystem/path_util.h +++ b/cpp/src/arrow/filesystem/path_util.h @@ -27,6 +27,10 @@ namespace arrow { namespace fs { + +// Declared in arrow/filesystem/filesystem.h, defined in path_util.cc. +ARROW_EXPORT bool IsLikelyUri(std::string_view path); + namespace internal { constexpr char kSep = '/'; @@ -159,8 +163,7 @@ std::string ToSlashes(std::string_view s); ARROW_EXPORT bool IsEmptyPath(std::string_view s); -ARROW_EXPORT -bool IsLikelyUri(std::string_view s); +inline bool IsLikelyUri(std::string_view s) { return ::arrow::fs::IsLikelyUri(s); } class ARROW_EXPORT Globber { public: diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index 0739b6acba3..0b8ab7480f8 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -84,6 +84,14 @@ def _file_type_to_string(ty): return f"{ty.__class__.__name__}.{ty._name_}" +def _is_likely_uri(path): + cdef c_string c_path + if not isinstance(path, str): + raise TypeError("Path must be a string") + c_path = tobytes(path) + return CIsLikelyUri(cpp_string_view(c_path)) + + cdef class FileInfo(_Weakrefable): """ FileSystem entry info. diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py index 41308cd60c1..bc68d3f2d7c 100644 --- a/python/pyarrow/fs.py +++ b/python/pyarrow/fs.py @@ -33,6 +33,7 @@ PyFileSystem, _copy_files, _copy_files_selector, + _is_likely_uri, ) # For backward compatibility. @@ -82,32 +83,6 @@ def __getattr__(name): ) -def _is_likely_uri(path): - """ - Check if a string looks like a URI (has a valid scheme followed by ':'). - - This is a Python port of the C++ ``IsLikelyUri()`` heuristic in - ``cpp/src/arrow/filesystem/path_util.cc``. It is intentionally - conservative: single-letter schemes are excluded (could be a Windows - drive letter) and the scheme must conform to RFC 3986. - """ - if not path or path[0] == '/': - return False - colon_pos = path.find(':') - if colon_pos < 0: - return False - # One-letter URI schemes don't officially exist; may be a Windows drive - if colon_pos < 2: - return False - # The largest IANA-registered scheme is 36 characters - if colon_pos > 36: - return False - scheme = path[:colon_pos] - if not scheme[0].isalpha(): - return False - return all(c.isalnum() or c in ('+', '-', '.') for c in scheme[1:]) - - def _ensure_filesystem(filesystem, *, use_mmap=False): if isinstance(filesystem, FileSystem): return filesystem diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd index af01c47c8c7..178fa7650bb 100644 --- a/python/pyarrow/includes/libarrow_fs.pxd +++ b/python/pyarrow/includes/libarrow_fs.pxd @@ -93,6 +93,8 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: "arrow::fs::FileSystemFromUriOrPath"(const c_string& uri, c_string* out_path) + c_bool CIsLikelyUri "arrow::fs::IsLikelyUri"(cpp_string_view path) + cdef cppclass CFileSystemGlobalOptions \ "arrow::fs::FileSystemGlobalOptions": c_string tls_ca_file_path diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 537ceb99997..31f3f8a8cab 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -1715,6 +1715,8 @@ def test_is_likely_uri(): assert not _is_likely_uri("C:/Users/foo") assert not _is_likely_uri("3bucket://key") # scheme starts with digit assert not _is_likely_uri("-scheme://key") # scheme starts with dash + assert not _is_likely_uri("schéme://bucket/key") # non-ASCII in scheme + assert not _is_likely_uri("漢字://bucket/key") # non-ASCII in scheme def test_resolve_filesystem_and_path_uri_with_spaces():