From 678f437cbabff37aadac3825398c3f73fd2143d2 Mon Sep 17 00:00:00 2001 From: Peter Bull Date: Wed, 8 Feb 2023 08:34:31 -0800 Subject: [PATCH 01/11] WIP --- cloudpathlib/__init__.py | 7 +++ cloudpathlib/patches.py | 112 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+) create mode 100644 cloudpathlib/patches.py diff --git a/cloudpathlib/__init__.py b/cloudpathlib/__init__.py index 84ed31b2..a9620363 100644 --- a/cloudpathlib/__init__.py +++ b/cloudpathlib/__init__.py @@ -1,9 +1,11 @@ +import os import sys from .anypath import AnyPath from .azure.azblobclient import AzureBlobClient from .azure.azblobpath import AzureBlobPath from .cloudpath import CloudPath, implementation_registry +from .patches import patch_open from .gs.gsclient import GSClient from .gs.gspath import GSPath from .http.httpclient import HttpClient, HttpsClient @@ -33,6 +35,11 @@ "HttpsClient", "HttpPath", "HttpsPath", + "patch_open" "S3Client", "S3Path", ] + + +if bool(os.environ.get("CLOUDPATHLIB_PATCH_OPEN", "")): + patch_open() diff --git a/cloudpathlib/patches.py b/cloudpathlib/patches.py new file mode 100644 index 00000000..a1a2b645 --- /dev/null +++ b/cloudpathlib/patches.py @@ -0,0 +1,112 @@ +import os + +from .cloudpath import CloudPath + + +def _cloudpath_open(*args, **kwargs): + if isinstance(args[0], CloudPath): + return args[0].open(*args[1:], **kwargs) + else: + return open(*args, **kwargs) + + +def patch_open(): + open = _cloudpath_open + + +def _dispatch_to_pathlib(path, pathlib_func, os_func, pathlib_args=None, pathlib_kwargs=None, *args, **kwargs): + if pathlib_args is None: + pathlib_args = args + + if pathlib_kwargs is None: + pathlib_kwargs = kwargs + + if isinstance(path, CloudPath): + return pathlib_func(path, *pathlib_args, **pathlib_kwargs) + else: + return os_func(*args, **kwargs) + + +def _cloudpath_os_listdir(path="."): + return _dispatch_to_pathlib(path, lambda path: list(path.iterdir()), os.listdir, path=path) + + +def _cloudpath_os_lstat(path, 
*, dir_fd=None): + return _dispatch_to_pathlib(path, CloudPath.stat, os.lstat, path, dir_fd=dir_fd) + +def _cloudpath_os_mkdir(path, mode=0o777, *, dir_fd=None): + return _dispatch_to_pathlib(path, CloudPath.mkdir, os.mkdir, path, dir_fd=dir_fd) + +def _cloudpath_os_makedirs(name, mode=0o777, exist_ok=False): + pass + +def _cloudpath_os_remove(path, *, dir_fd=None): + pass + +def _cloudpath_os_removedirs(name): + pass + +def _cloudpath_os_rename(src, dst, *, src_dir_fd=None, dst_dir_fd=None): + pass + +def _cloudpath_os_renames(old, new): + pass + +def _cloudpath_os_replace(src, dst, *, src_dir_fd=None, dst_dir_fd=None): + pass + +def _cloudpath_os_rmdir(path, *, dir_fd=None): + pass + +def _cloudpath_os_scandir(path='.'): + pass + +def _cloudpath_os_stat(path, *, dir_fd=None, follow_symlinks=True): + if isinstance(path, CloudPath): + return path.stat() + else: + return os.stat(path, dir_fd=dir_fd, follow_symlinks=follow_symlinks) + +def _cloudpath_os_unlink(path, *, dir_fd=None): + pass + +def _cloudpath_os_walk(top, topdown=True, onerror=None, followlinks=False): + pass + +def _cloudpath_os_path_basename(path): + pass + +def _cloudpath_os_path_exists(path): + pass + +def _cloudpath_os_path_getatime(path): + pass + +def _cloudpath_os_path_getmtime(path): + pass + +def _cloudpath_os_path_getctime(path): + pass + +def _cloudpath_os_path_getsize(path): + pass + +def _cloudpath_os_path_isfile(path): + pass + +def _cloudpath_os_path_isdir(path): + pass + +def _cloudpath_os_path_join(path, *paths): + pass + +def _cloudpath_os_path_split(path): + pass + +def _cloudpath_os_path_splitext(path): + pass + + +def patch_os_function(): + os.listdir = _cloudpath_os_listdir + From 64fdd8bed95aa9234de0e1b7e1bea9c43d39dc57 Mon Sep 17 00:00:00 2001 From: Peter Bull Date: Sat, 11 Feb 2023 11:36:23 -0800 Subject: [PATCH 02/11] Working implementation --- cloudpathlib/__init__.py | 3 +- cloudpathlib/cloudpath.py | 8 ++ cloudpathlib/patches.py | 209 ++++++++++++++++++++++++++++---------- 
3 files changed, 163 insertions(+), 57 deletions(-) diff --git a/cloudpathlib/__init__.py b/cloudpathlib/__init__.py index a9620363..5678c6a3 100644 --- a/cloudpathlib/__init__.py +++ b/cloudpathlib/__init__.py @@ -5,7 +5,7 @@ from .azure.azblobclient import AzureBlobClient from .azure.azblobpath import AzureBlobPath from .cloudpath import CloudPath, implementation_registry -from .patches import patch_open +from .patches import patch_open, patch_os_functions from .gs.gsclient import GSClient from .gs.gspath import GSPath from .http.httpclient import HttpClient, HttpsClient @@ -36,6 +36,7 @@ "HttpPath", "HttpsPath", "patch_open" + "patch_os_functions", "S3Client", "S3Path", ] diff --git a/cloudpathlib/cloudpath.py b/cloudpathlib/cloudpath.py index f7621c5b..49d9aa73 100644 --- a/cloudpathlib/cloudpath.py +++ b/cloudpathlib/cloudpath.py @@ -235,6 +235,7 @@ class CloudPath(metaclass=CloudPathMeta): def __init__( self, cloud_path: Union[str, Self, "CloudPath"], + *parts: str, client: Optional["Client"] = None, ) -> None: # handle if local file gets opened. 
must be set at the top of the method in case any code @@ -242,6 +243,13 @@ def __init__( self._handle: Optional[IO] = None self._client: Optional["Client"] = None + if parts: + # ensure first part ends in "/"; (sometimes it is just prefix, sometimes a longer path) + if not str(cloud_path).endswith("/"): + cloud_path = str(cloud_path) + "/" + + cloud_path = str(cloud_path) + "/".join(p.strip("/") for p in parts) + self.is_valid_cloudpath(cloud_path, raise_on_error=True) self._cloud_meta.validate_completeness() diff --git a/cloudpathlib/patches.py b/cloudpathlib/patches.py index a1a2b645..6dc90db0 100644 --- a/cloudpathlib/patches.py +++ b/cloudpathlib/patches.py @@ -1,112 +1,209 @@ +import builtins import os +import os.path from .cloudpath import CloudPath -def _cloudpath_open(*args, **kwargs): - if isinstance(args[0], CloudPath): - return args[0].open(*args[1:], **kwargs) - else: - return open(*args, **kwargs) +def _check_first_arg(*args, **kwargs): + return isinstance(args[0], CloudPath) -def patch_open(): - open = _cloudpath_open +def _check_first_arg_first_index(*args, **kwargs): + return isinstance(args[0][0], CloudPath) + +def _patch_factory(original_version, cpl_version, cpl_check=_check_first_arg): + _original = original_version -def _dispatch_to_pathlib(path, pathlib_func, os_func, pathlib_args=None, pathlib_kwargs=None, *args, **kwargs): - if pathlib_args is None: - pathlib_args = args + def _patched_version(*args, **kwargs): + if cpl_check(*args, **kwargs): + return cpl_version(*args, **kwargs) + else: + return _original(*args, **kwargs) - if pathlib_kwargs is None: - pathlib_kwargs = kwargs + original_version = _patched_version + return _patched_version - if isinstance(path, CloudPath): - return pathlib_func(path, *pathlib_args, **pathlib_kwargs) - else: - return os_func(*args, **kwargs) + +def patch_open(): + patched = _patch_factory( + builtins.open, + CloudPath.open, + ) + builtins.open = patched + return patched def _cloudpath_os_listdir(path="."): - 
return _dispatch_to_pathlib(path, lambda path: list(path.iterdir()), os.listdir, path=path) + return list(path.iterdir()) + +def _cloudpath_lstat(path, *, dir_fd=None): + return path.stat() -def _cloudpath_os_lstat(path, *, dir_fd=None): - return _dispatch_to_pathlib(path, CloudPath.stat, os.lstat, path, dir_fd=dir_fd) -def _cloudpath_os_mkdir(path, mode=0o777, *, dir_fd=None): - return _dispatch_to_pathlib(path, CloudPath.mkdir, os.mkdir, path, dir_fd=dir_fd) +def _cloudpath_mkdir(path, *, dir_fd=None): + return path.mkdir() + def _cloudpath_os_makedirs(name, mode=0o777, exist_ok=False): - pass + return CloudPath.mkdir(name, parents=True, exist_ok=exist_ok) + def _cloudpath_os_remove(path, *, dir_fd=None): - pass + return path.unlink() + def _cloudpath_os_removedirs(name): - pass + for d in name.parents: + d.rmdir() + def _cloudpath_os_rename(src, dst, *, src_dir_fd=None, dst_dir_fd=None): - pass + return src.rename(dst) + def _cloudpath_os_renames(old, new): - pass + old.rename(new) # move file + _cloudpath_os_removedirs(old) # remove previous directories if empty + def _cloudpath_os_replace(src, dst, *, src_dir_fd=None, dst_dir_fd=None): - pass + return src.rename(dst) + def _cloudpath_os_rmdir(path, *, dir_fd=None): - pass + return path.rmdir() + + +def _cloudpath_os_scandir(path="."): + return path.iterdir() -def _cloudpath_os_scandir(path='.'): - pass def _cloudpath_os_stat(path, *, dir_fd=None, follow_symlinks=True): - if isinstance(path, CloudPath): - return path.stat() - else: - return os.stat(path, dir_fd=dir_fd, follow_symlinks=follow_symlinks) + return path.stat() + def _cloudpath_os_unlink(path, *, dir_fd=None): - pass + return path.unlink() + def _cloudpath_os_walk(top, topdown=True, onerror=None, followlinks=False): - pass + try: + dirs, files = [], [] + for p in top.iterdir(): + dirs.append(p) if p.is_dir() else files.append(p) + + if topdown: + yield (top, files, dirs) + + for d in dirs: + yield from _cloudpath_os_walk(d, topdown=topdown, 
onerror=onerror) + + if not topdown: + yield (top, files, dirs) + + except Exception as e: + if onerror is not None: + onerror(e) + else: + raise + def _cloudpath_os_path_basename(path): - pass + return path.name + + +def __common(parts): + i = 0 + + try: + while all(item[i] == parts[0][i] for item in parts[1:]): + i += 1 + except IndexError: + pass + + return parts[0][:i] + + +def _cloudpath_os_path_commonpath(paths): + common = __common([p.parts for p in paths]) + return paths[0].client.CloudPath(*common) + + +def _cloudpath_os_path_commonprefix(list): + common = __common([str(p) for p in list]) + return common + + +def _cloudpath_os_path_dirname(path): + return path.parent -def _cloudpath_os_path_exists(path): - pass def _cloudpath_os_path_getatime(path): - pass + return (path.stat().st_atime,) + def _cloudpath_os_path_getmtime(path): - pass + return (path.stat().st_mtime,) + def _cloudpath_os_path_getctime(path): - pass + return (path.stat().st_ctime,) -def _cloudpath_os_path_getsize(path): - pass -def _cloudpath_os_path_isfile(path): - pass +def _cloudpath_os_path_getsize(path): + return (path.stat().st_size,) -def _cloudpath_os_path_isdir(path): - pass def _cloudpath_os_path_join(path, *paths): - pass + for p in paths: + path /= p + return path -def _cloudpath_os_path_split(path): - pass - -def _cloudpath_os_path_splitext(path): - pass +def _cloudpath_os_path_split(path): + return path.parent, path.name -def patch_os_function(): - os.listdir = _cloudpath_os_listdir +def _cloudpath_os_path_splitext(path): + return str(path)[: -len(path.suffix)], path.suffix + + +def patch_os_functions(): + os.listdir = _patch_factory(os.listdir, _cloudpath_os_listdir) + os.lstat = _patch_factory(os.lstat, _cloudpath_lstat) + os.mkdir = _patch_factory(os.mkdir, _cloudpath_mkdir) + os.makedirs = _patch_factory(os.makedirs, _cloudpath_os_makedirs) + os.remove = _patch_factory(os.remove, _cloudpath_os_remove) + os.removedirs = _patch_factory(os.removedirs, 
_cloudpath_os_removedirs) + os.rename = _patch_factory(os.rename, _cloudpath_os_rename) + os.renames = _patch_factory(os.renames, _cloudpath_os_renames) + os.replace = _patch_factory(os.replace, _cloudpath_os_replace) + os.rmdir = _patch_factory(os.rmdir, _cloudpath_os_rmdir) + os.scandir = _patch_factory(os.scandir, _cloudpath_os_scandir) + os.stat = _patch_factory(os.stat, _cloudpath_os_stat) + os.unlink = _patch_factory(os.unlink, _cloudpath_os_unlink) + os.walk = _patch_factory(os.walk, _cloudpath_os_walk) + + os.path.basename = _patch_factory(os.path.basename, _cloudpath_os_path_basename) + os.path.commonpath = _patch_factory( + os.path.commonpath, _cloudpath_os_path_commonpath, cpl_check=_check_first_arg_first_index + ) + os.path.commonprefix = _patch_factory( + os.path.commonprefix, + _cloudpath_os_path_commonprefix, + cpl_check=_check_first_arg_first_index, + ) + os.path.dirname = _patch_factory(os.path.dirname, _cloudpath_os_path_dirname) + os.path.exists = _patch_factory(os.path.exists, CloudPath.exists) + os.path.getatime = _patch_factory(os.path.getatime, _cloudpath_os_path_getatime) + os.path.getmtime = _patch_factory(os.path.getmtime, _cloudpath_os_path_getmtime) + os.path.getctime = _patch_factory(os.path.getctime, _cloudpath_os_path_getctime) + os.path.getsize = _patch_factory(os.path.getsize, _cloudpath_os_path_getsize) + os.path.isfile = _patch_factory(os.path.isfile, CloudPath.is_file) + os.path.isdir = _patch_factory(os.path.isdir, CloudPath.is_dir) + os.path.join = _patch_factory(os.path.join, _cloudpath_os_path_join) + os.path.split = _patch_factory(os.path.split, _cloudpath_os_path_split) + os.path.splitext = _patch_factory(os.path.splitext, _cloudpath_os_path_splitext) From 1fcf97f57ce6983bde09b0ece6d4dfce40ce6557 Mon Sep 17 00:00:00 2001 From: Peter Bull Date: Wed, 15 Feb 2023 15:54:47 -0800 Subject: [PATCH 03/11] more WIP --- cloudpathlib/patches.py | 1 + test-open.py | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) create 
mode 100644 test-open.py diff --git a/cloudpathlib/patches.py b/cloudpathlib/patches.py index 6dc90db0..5d222cf5 100644 --- a/cloudpathlib/patches.py +++ b/cloudpathlib/patches.py @@ -32,6 +32,7 @@ def patch_open(): CloudPath.open, ) builtins.open = patched + CloudPath.__fspath__ = lambda x: x # turn off `fspath` return patched diff --git a/test-open.py b/test-open.py new file mode 100644 index 00000000..47fa12a0 --- /dev/null +++ b/test-open.py @@ -0,0 +1,25 @@ +import os +from cloudpathlib import CloudPath, patch_open, patch_os_functions + + +def hello(cp): + with open(cp, "a") as f: + f.write(" written") + + +if __name__ == "__main__": + patch_open() + + cp = CloudPath("s3://cloudpathlib-test-bucket/manual/text_file.txt") + cp.write_text("yah") + + hello(cp) + + print(cp.read_text()) + cp.unlink() + + patch_os_functions() + + print(list(os.walk("."))) + print(list(cp.parent.client._list_dir(cp.parent, recursive=True))) + print(list(os.walk(cp.parent))) From fe577310ff10eb0f0e9b2b016bc7eb13d04dbbc4 Mon Sep 17 00:00:00 2001 From: Peter Bull Date: Sun, 25 Feb 2024 18:32:41 -0500 Subject: [PATCH 04/11] Implement glob --- cloudpathlib/__init__.py | 14 ++- cloudpathlib/client.py | 4 +- cloudpathlib/cloudpath.py | 14 ++- cloudpathlib/exceptions.py | 8 ++ cloudpathlib/patches.py | 206 ++++++++++++++++++++++++++++++------- 5 files changed, 205 insertions(+), 41 deletions(-) diff --git a/cloudpathlib/__init__.py b/cloudpathlib/__init__.py index 5678c6a3..ea0afd1b 100644 --- a/cloudpathlib/__init__.py +++ b/cloudpathlib/__init__.py @@ -5,7 +5,7 @@ from .azure.azblobclient import AzureBlobClient from .azure.azblobpath import AzureBlobPath from .cloudpath import CloudPath, implementation_registry -from .patches import patch_open, patch_os_functions +from .patches import patch_open, patch_os_functions, patch_glob from .gs.gsclient import GSClient from .gs.gspath import GSPath from .http.httpclient import HttpClient, HttpsClient @@ -36,6 +36,7 @@ "HttpPath", "HttpsPath", 
"patch_open" + "patch_glob", "patch_os_functions", "S3Client", "S3Path", @@ -44,3 +45,14 @@ if bool(os.environ.get("CLOUDPATHLIB_PATCH_OPEN", "")): patch_open() + +if bool(os.environ.get("CLOUDPATHLIB_PATCH_OS", "")): + patch_os_functions() + +if bool(os.environ.get("CLOUDPATHLIB_PATCH_GLOB", "")): + patch_glob() + +if bool(os.environ.get("CLOUDPATHLIB_PATCH_ALL", "")): + patch_open() + patch_os_functions() + patch_glob diff --git a/cloudpathlib/client.py b/cloudpathlib/client.py index c4305fc3..5286b5e3 100644 --- a/cloudpathlib/client.py +++ b/cloudpathlib/client.py @@ -109,8 +109,8 @@ def set_as_default_client(self) -> None: instances for this cloud without a client specified.""" self.__class__._default_client = self - def CloudPath(self, cloud_path: Union[str, BoundedCloudPath]) -> BoundedCloudPath: - return self._cloud_meta.path_class(cloud_path=cloud_path, client=self) # type: ignore + def CloudPath(self, cloud_path: Union[str, BoundedCloudPath], *parts: str) -> BoundedCloudPath: + return self._cloud_meta.path_class(cloud_path, *parts, client=self) # type: ignore def clear_cache(self): """Clears the contents of the cache folder. diff --git a/cloudpathlib/cloudpath.py b/cloudpathlib/cloudpath.py index 49d9aa73..ebd1dfe7 100644 --- a/cloudpathlib/cloudpath.py +++ b/cloudpathlib/cloudpath.py @@ -81,6 +81,7 @@ def _make_selector(pattern_parts, _flavour, case_sensitive=True): # noqa: F811 from .exceptions import ( ClientMismatchError, CloudPathFileExistsError, + CloudPathFileNotFoundError, CloudPathIsADirectoryError, CloudPathNotADirectoryError, CloudPathNotExistsError, @@ -681,11 +682,18 @@ def open( force_overwrite_to_cloud: Optional[bool] = None, # extra kwarg not in pathlib ) -> "IO[Any]": # if trying to call open on a directory that exists - if self.exists() and not self.is_file(): + exists_on_cloud = self.exists() + + if exists_on_cloud and not self.is_file(): raise CloudPathIsADirectoryError( f"Cannot open directory, only files. 
Tried to open ({self})" ) + if not exists_on_cloud and any(m in mode for m in ("r", "a")): + raise CloudPathFileNotFoundError( + f"File opened for read or append, but it does not exist on cloud: {self}" + ) + if mode == "x" and self.exists(): raise CloudPathFileExistsError(f"Cannot open existing file ({self}) for creation.") @@ -1255,7 +1263,7 @@ def _local(self) -> Path: """Cached local version of the file.""" return self.client._local_cache_dir / self._no_prefix - def _new_cloudpath(self, path: Union[str, os.PathLike]) -> Self: + def _new_cloudpath(self, path: Union[str, os.PathLike], *parts: str) -> Self: """Use the scheme, client, cache dir of this cloudpath to instantiate a new cloudpath of the same type with the path passed. @@ -1271,7 +1279,7 @@ def _new_cloudpath(self, path: Union[str, os.PathLike]) -> Self: if not path.startswith(self.anchor): path = f"{self.anchor}{path}" - return self.client.CloudPath(path) + return self.client.CloudPath(path, *parts) def _refresh_cache(self, force_overwrite_from_cloud: Optional[bool] = None) -> None: try: diff --git a/cloudpathlib/exceptions.py b/cloudpathlib/exceptions.py index 1b4499fb..a9f2ffb4 100644 --- a/cloudpathlib/exceptions.py +++ b/cloudpathlib/exceptions.py @@ -24,6 +24,10 @@ class CloudPathNotExistsError(CloudPathException): pass +class CloudPathFileNotFoundError(CloudPathException, FileNotFoundError): + pass + + class CloudPathIsADirectoryError(CloudPathException, IsADirectoryError): pass @@ -77,3 +81,7 @@ class OverwriteNewerCloudError(CloudPathException): class OverwriteNewerLocalError(CloudPathException): pass + + +class InvalidGlobArgumentsError(CloudPathException): + pass diff --git a/cloudpathlib/patches.py b/cloudpathlib/patches.py index 5d222cf5..b20016f6 100644 --- a/cloudpathlib/patches.py +++ b/cloudpathlib/patches.py @@ -1,7 +1,11 @@ import builtins +from contextlib import contextmanager +import glob import os import os.path +from cloudpathlib.exceptions import InvalidGlobArgumentsError + from 
.cloudpath import CloudPath @@ -13,6 +17,10 @@ def _check_first_arg_first_index(*args, **kwargs): return isinstance(args[0][0], CloudPath) +def _check_first_arg_or_root_dir(*args, **kwargs): + return isinstance(args[0], CloudPath) or isinstance(kwargs.get("root_dir", None), CloudPath) + + def _patch_factory(original_version, cpl_version, cpl_check=_check_first_arg): _original = original_version @@ -26,14 +34,29 @@ def _patched_version(*args, **kwargs): return _patched_version +@contextmanager def patch_open(): patched = _patch_factory( builtins.open, CloudPath.open, ) + original_open = builtins.open builtins.open = patched - CloudPath.__fspath__ = lambda x: x # turn off `fspath` - return patched + + original_fspath = CloudPath.__fspath__ + CloudPath.__fspath__ = ( + lambda x: x + ) # turn off `fspath` -> str since we patch everything to handle CloudPath + + try: + yield patched + finally: + builtins.open = original_open + CloudPath.__fspath__ = original_fspath + + +def _cloudpath_fspath(path): + return path # no op, since methods should all handle cloudpaths when patched def _cloudpath_os_listdir(path="."): @@ -172,39 +195,152 @@ def _cloudpath_os_path_splitext(path): return str(path)[: -len(path.suffix)], path.suffix +@contextmanager def patch_os_functions(): - os.listdir = _patch_factory(os.listdir, _cloudpath_os_listdir) - os.lstat = _patch_factory(os.lstat, _cloudpath_lstat) - os.mkdir = _patch_factory(os.mkdir, _cloudpath_mkdir) - os.makedirs = _patch_factory(os.makedirs, _cloudpath_os_makedirs) - os.remove = _patch_factory(os.remove, _cloudpath_os_remove) - os.removedirs = _patch_factory(os.removedirs, _cloudpath_os_removedirs) - os.rename = _patch_factory(os.rename, _cloudpath_os_rename) - os.renames = _patch_factory(os.renames, _cloudpath_os_renames) - os.replace = _patch_factory(os.replace, _cloudpath_os_replace) - os.rmdir = _patch_factory(os.rmdir, _cloudpath_os_rmdir) - os.scandir = _patch_factory(os.scandir, _cloudpath_os_scandir) - os.stat = 
_patch_factory(os.stat, _cloudpath_os_stat) - os.unlink = _patch_factory(os.unlink, _cloudpath_os_unlink) - os.walk = _patch_factory(os.walk, _cloudpath_os_walk) - - os.path.basename = _patch_factory(os.path.basename, _cloudpath_os_path_basename) - os.path.commonpath = _patch_factory( - os.path.commonpath, _cloudpath_os_path_commonpath, cpl_check=_check_first_arg_first_index + os_level = [ + ("fspath", os.fspath, _cloudpath_fspath), + ("listdir", os.listdir, _cloudpath_os_listdir), + ("lstat", os.lstat, _cloudpath_lstat), + ("mkdir", os.mkdir, _cloudpath_mkdir), + ("makedirs", os.makedirs, _cloudpath_os_makedirs), + ("remove", os.remove, _cloudpath_os_remove), + ("removedirs", os.removedirs, _cloudpath_os_removedirs), + ("rename", os.rename, _cloudpath_os_rename), + ("renames", os.renames, _cloudpath_os_renames), + ("replace", os.replace, _cloudpath_os_replace), + ("rmdir", os.rmdir, _cloudpath_os_rmdir), + ("scandir", os.scandir, _cloudpath_os_scandir), + ("stat", os.stat, _cloudpath_os_stat), + ("unlink", os.unlink, _cloudpath_os_unlink), + ("walk", os.walk, _cloudpath_os_walk), + ] + + os_originals = {} + + for name, original, cloud in os_level: + os_originals[name] = original + patched = _patch_factory(original, cloud) + setattr(os, name, patched) + + os_path_level = [ + ("basename", os.path.basename, _cloudpath_os_path_basename, _check_first_arg), + ( + "commonpath", + os.path.commonpath, + _cloudpath_os_path_commonpath, + _check_first_arg_first_index, + ), + ( + "commonprefix", + os.path.commonprefix, + _cloudpath_os_path_commonprefix, + _check_first_arg_first_index, + ), + ("dirname", os.path.dirname, _cloudpath_os_path_dirname, _check_first_arg), + ("exists", os.path.exists, CloudPath.exists, _check_first_arg), + ("getatime", os.path.getatime, _cloudpath_os_path_getatime, _check_first_arg), + ("getmtime", os.path.getmtime, _cloudpath_os_path_getmtime, _check_first_arg), + ("getctime", os.path.getctime, _cloudpath_os_path_getctime, _check_first_arg), + 
("getsize", os.path.getsize, _cloudpath_os_path_getsize, _check_first_arg), + ("isfile", os.path.isfile, CloudPath.is_file, _check_first_arg), + ("isdir", os.path.isdir, CloudPath.is_dir, _check_first_arg), + ("join", os.path.join, _cloudpath_os_path_join, _check_first_arg), + ("split", os.path.split, _cloudpath_os_path_split, _check_first_arg), + ("splitext", os.path.splitext, _cloudpath_os_path_splitext, _check_first_arg), + ] + + os_path_originals = {} + + for name, original, cloud, check in os_path_level: + os_path_originals[name] = original + patched = _patch_factory(original, cloud, cpl_check=check) + setattr(os.path, name, patched) + + try: + yield + finally: + for name, original in os_originals.items(): + setattr(os, name, original) + + for name, original in os_path_originals.items(): + setattr(os.path, name, original) + + +def _get_root_dir_pattern_from_pathname(pathname): + # get first wildcard + for i, part in enumerate(pathname.parts): + if "*" in part: + root_parts = pathname.parts[:i] + pattern_parts = pathname.parts[i:] + break + + root_dir = pathname._new_cloudpath(*root_parts) + pattern = "/".join(pattern_parts) + + return root_dir, pattern + + +def _cloudpath_glob_iglob( + pathname, *, root_dir=None, dir_fd=None, recursive=False, include_hidden=False +): + # if both are cloudpath, root_dir and pathname must share a parent, otherwise we don't know + # where to start the pattern + if isinstance(pathname, CloudPath) and isinstance(root_dir, CloudPath): + if not pathname.is_relative_to(root_dir): + raise InvalidGlobArgumentsError( + f"If both are CloudPaths, root_dir ({root_dir}) must be a parent of pathname ({pathname})." + ) + + else: + pattern = pathname.relative_to(root_dir) + + elif isinstance(pathname, CloudPath): + if root_dir is not None: + InvalidGlobArgumentsError( + "If pathname is a CloudPath, root_dir must also be a CloudPath or None." 
+ ) + + root_dir, pattern = _get_root_dir_pattern_from_pathname(pathname) + + elif isinstance(root_dir, CloudPath): + pattern = pathname + + else: + raise InvalidGlobArgumentsError( + "At least one of pathname or root_dir must be a CloudPath." + ) + + return root_dir.glob(pattern) + + +def _cloudpath_glob_glob( + pathname, *, root_dir=None, dir_fd=None, recursive=False, include_hidden=False +): + return list( + _cloudpath_glob_iglob( + pathname, + root_dir=root_dir, + dir_fd=dir_fd, + recursive=recursive, + include_hidden=include_hidden, + ) ) - os.path.commonprefix = _patch_factory( - os.path.commonprefix, - _cloudpath_os_path_commonprefix, - cpl_check=_check_first_arg_first_index, + + +@contextmanager +def patch_glob(): + original_glob = glob.glob + glob.glob = _patch_factory( + glob.glob, _cloudpath_glob_glob, cpl_check=_check_first_arg_or_root_dir + ) + + original_iglob = glob.iglob + glob.iglob = _patch_factory( + glob.iglob, _cloudpath_glob_iglob, cpl_check=_check_first_arg_or_root_dir ) - os.path.dirname = _patch_factory(os.path.dirname, _cloudpath_os_path_dirname) - os.path.exists = _patch_factory(os.path.exists, CloudPath.exists) - os.path.getatime = _patch_factory(os.path.getatime, _cloudpath_os_path_getatime) - os.path.getmtime = _patch_factory(os.path.getmtime, _cloudpath_os_path_getmtime) - os.path.getctime = _patch_factory(os.path.getctime, _cloudpath_os_path_getctime) - os.path.getsize = _patch_factory(os.path.getsize, _cloudpath_os_path_getsize) - os.path.isfile = _patch_factory(os.path.isfile, CloudPath.is_file) - os.path.isdir = _patch_factory(os.path.isdir, CloudPath.is_dir) - os.path.join = _patch_factory(os.path.join, _cloudpath_os_path_join) - os.path.split = _patch_factory(os.path.split, _cloudpath_os_path_split) - os.path.splitext = _patch_factory(os.path.splitext, _cloudpath_os_path_splitext) + + try: + yield + finally: + glob.glob = original_glob + glob.iglob = original_iglob From 136f38b8ceaea5314288ab2bdfe6e7c163542042 Mon Sep 17 
00:00:00 2001 From: Peter Bull Date: Sat, 25 May 2024 16:27:35 -0700 Subject: [PATCH 05/11] WIP docs and tests --- docs/docs/patching_builtins.ipynb | 321 ++++++++++++++++++++++++++++++ tests/test_patching.py | 49 +++++ 2 files changed, 370 insertions(+) create mode 100644 docs/docs/patching_builtins.ipynb create mode 100644 tests/test_patching.py diff --git a/docs/docs/patching_builtins.ipynb b/docs/docs/patching_builtins.ipynb new file mode 100644 index 00000000..3647eb1f --- /dev/null +++ b/docs/docs/patching_builtins.ipynb @@ -0,0 +1,321 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Patching Python builtins (third-party library compatibility)\n", + "\n", + "Not every Python library is implemented to accept pathlib-compatible objects like those implemented by cloudpathlib. Many libraries will only accept strings as filepaths. These libraries then may internally use `open`, functions from `os` and `os.path`, or other core library modules like `glob` to navigate paths and manipulate them.\n", + "\n", + "This means that out-of-the-box you can't just pass a `CloudPath` object to any method or function and have it work. For those implemented with `pathlib`, this will work. For anything else the code will throw an exception at some point.\n", + "\n", + "The long-term solution is to ask developers to implement their library to support either (1) pathlib-compatible objects for files and directories, or (2) file-like objects passed directly (e.g., so you could call `CloudPath.open` in your code and pass the file-like object to the library).\n", + "\n", + "The short-term workaround that will be compatible with some libraries is to patch the builtins to make `open`, `os`, `os.path`, and `glob` work with `CloudPath` objects. Because this overrides default Python functionality, this is not on by default. 
When patched, these functions will use the `CloudPath` version if they are passed a `CloudPath` and will fall back to their normal implementations otherwise.\n", + "\n", + "These methods can be enabled by setting the following environment variables:\n", + " - `CLOUDPATHLIB_PATCH_ALL=1` - patch all the builtins we implement: `open`, `os` functions, and `glob`\n", + " - `CLOUDPATHLIB_PATCH_OPEN=1` - patch the builtin `open` method\n", + " - `CLOUDPATHLIB_PATCH_OS=1` - patch the `os` functions\n", + " - `CLOUDPATHLIB_PATCH_GLOB=1` - patch the `glob` module\n", + "\n", + "You can set environment variables in many ways, but it is common to either pass it at the command line with something like `CLOUDPATHLIB_PATCH_ALL=1 python my_script.py` or to set it in your Python script with `os.environ['CLOUDPATHLIB_PATCH_ALL'] = '1'`. Note, these _must_ be set before any `cloudpathlib` methods are imported.\n", + "\n", + "Alternatively, you can call methods to patch the functions.\n", + "\n", + "```python\n", + "from cloudpathlib import patch_open, patch_os_functions, patch_glob\n", + "\n", + "# patch builtins\n", + "patch_open()\n", + "patch_os_functions()\n", + "patch_glob()\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These patch methods are all context managers, so if you want to control where the patch is active, you can use them in a `with` statement. 
For example:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Unpatched version fails:\n", + "'S3Path' object is not subscriptable\n", + "Patched succeeds:\n", + "[S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirB/fileB'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirC/dirD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirC/fileC'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirC/dirD/fileD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/nested-dir/test.file'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirC/dirD/fileD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirB/fileB'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirC/dirD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirC/fileC'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirC/dirD/fileD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirC/dirD/fileD')]\n", + "`glob` module now is equivalent to `CloudPath.glob`\n", + "[S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirB/fileB'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirC/dirD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirC/fileC'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirC/dirD/fileD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/nested-dir/test.file'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirC/dirD/fileD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirB/fileB'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirC/dirD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirC/fileC'), 
S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirC/dirD/fileD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirC/dirD/fileD')]\n" + ] + } + ], + "source": [ + "from glob import glob\n", + "\n", + "from cloudpathlib import patch_glob, CloudPath\n", + "\n", + "try:\n", + " glob(CloudPath(\"s3://cloudpathlib-test-bucket/manual-tests/**/*dir*/**\"))\n", + "except Exception as e:\n", + " print(\"Unpatched version fails:\")\n", + " print(e)\n", + "\n", + "\n", + "with patch_glob():\n", + " print(\"Patched succeeds:\")\n", + " print(glob(CloudPath(\"s3://cloudpathlib-test-bucket/manual-tests/**/*dir*/**/*\")))\n", + "\n", + " # or equivalently\n", + " print(\"`glob` module now is equivalent to `CloudPath.glob`\")\n", + " print(glob(\"**/*dir*/**/*\", root_dir=CloudPath(\"s3://cloudpathlib-test-bucket/manual-tests/\")))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see a similar result for patching the functions in the `os` module." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "False\n", + "Patched version of `os.path.isdir` returns: None\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "from cloudpathlib import patch_os_functions, CloudPath\n", + "\n", + "print(os.path.isdir(CloudPath(\"s3://cloudpathlib-test-bucket/manual-tests/\")))\n", + "\n", + "\n", + "# try:\n", + "# os.path.isdir(\"s3://cloudpathlib-test-bucket/manual-tests/\")\n", + "# except Exception as e:\n", + "# print(\"Unpatched version fails:\")\n", + "# print(e)\n", + "\n", + "\n", + "with patch_os_functions():\n", + " result = os.path.isdir(CloudPath(\"s3://cloudpathlib-test-bucket/manual-tests/\"))\n", + " print(\"Patched version of `os.path.isdir` returns: \", result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Patching `open`\n", + "\n", + "Sometimes code uses the Python built-in `open` to open files and operate on them. Because of the way that is implemented, it only accepts a string to operate on. Unfortunately, that breaks usage with cloudpathlib.\n", + "\n", + "Instead, we can patch the built-in `open` to handle all the normal circumstances, and—if the argument is a `CloudPath`—use cloudpathlib to do the opening.\n", + "\n", + "### Patching `open` in Jupyter notebooks\n", + "\n", + "Jupyter notebooks require one extra step becaue they have their own version of `open` that is injected into the global namespace of the notebook. This means that you must _additionally_ replace that version of open with the patched version if you want to use `open` in a notebook. 
This can be done with the `patch_open` method by adding the following to the top of the notebook.\n", + "\n", + "```python\n", + "from cloudpathlib import patch_open\n", + "\n", + "# replace jupyter's `open` with one that works with CloudPath\n", + "open = patch_open()\n", + "```\n", + "\n", + "Here's an example that doesn't work right now (for example, if you depend on a thrid-party library that calls `open`)." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Errno 2] No such file or directory: '/var/folders/sz/c8j64tx91mj0jb0vd1s4wj700000gn/T/tmpvnzs5qnd/cloudpathlib-test-bucket/patching_builtins/file.txt'\n" + ] + } + ], + "source": [ + "from cloudpathlib import CloudPath, patch_open\n", + "\n", + "\n", + "# example of a function within a third-party library\n", + "def library_function(filepath: str):\n", + " with open(filepath, \"r\") as f:\n", + " print(f.read())\n", + "\n", + "\n", + "# create file to read\n", + "cp = CloudPath(\"s3://cloudpathlib-test-bucket/patching_builtins/file.txt\")\n", + "\n", + "# fails with a TypeError if passed a CloudPath\n", + "try:\n", + " library_function(cp)\n", + "except Exception as e:\n", + " print(e)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "ContextDecorator.__call__() takes 2 positional arguments but 3 were given", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[4], line 16\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# create file to read\u001b[39;00m\n\u001b[1;32m 14\u001b[0m cp \u001b[38;5;241m=\u001b[39m 
CloudPath(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124ms3://cloudpathlib-test-bucket/patching_builtins/file.txt\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 16\u001b[0m \u001b[43mlibrary_function\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcp\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[4], line 9\u001b[0m, in \u001b[0;36mlibrary_function\u001b[0;34m(filepath)\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mlibrary_function\u001b[39m(filepath: \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m----> 9\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mfilepath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28mprint\u001b[39m(f\u001b[38;5;241m.\u001b[39mread())\n", + "\u001b[0;31mTypeError\u001b[0m: ContextDecorator.__call__() takes 2 positional arguments but 3 were given" + ] + } + ], + "source": [ + "from cloudpathlib import CloudPath, patch_open\n", + "\n", + "# jupyter patch\n", + "# open = patch_open()\n", + "\n", + "with patch_open():\n", + " # example of a function within a third-party library\n", + " def library_function(filepath: str):\n", + " with open(filepath, \"r\") as f:\n", + " print(f.read())\n", + "\n", + "\n", + " # create file to read\n", + " cp = CloudPath(\"s3://cloudpathlib-test-bucket/patching_builtins/file.txt\")\n", + "\n", + " library_function(cp)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> \u001b[0;32m/var/folders/sz/c8j64tx91mj0jb0vd1s4wj700000gn/T/ipykernel_34335/3906426398.py\u001b[0m(9)\u001b[0;36mlibrary_function\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 7 \u001b[0;31m\u001b[0;31m# example of a function within a 
third-party library\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 8 \u001b[0;31m\u001b[0;32mdef\u001b[0m \u001b[0mlibrary_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m----> 9 \u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"r\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 10 \u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 11 \u001b[0;31m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n", + "\n", + "*** TypeError: ContextDecorator.__call__() missing 1 required positional argument: 'func'\n" + ] + } + ], + "source": [ + "%debug" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# `open`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#os" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cloudpathlib", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": 
"text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/test_patching.py b/tests/test_patching.py new file mode 100644 index 00000000..8eb467f0 --- /dev/null +++ b/tests/test_patching.py @@ -0,0 +1,49 @@ +import importlib +import os + +import pytest + +import cloudpathlib +from cloudpathlib import patch_open + + +def test_patch_open(rig): + cp = rig.create_cloud_path("dir_0/new_file.txt") + + with pytest.raises(FileNotFoundError): + with open(cp, "w") as f: + f.write("Hello!") + + # set via method call + with patch_open(): + with open(cp, "w") as f: + f.write("Hello!") + + assert cp.read_text() == "Hello!" + + # set via env var + cp2 = rig.create_cloud_path("dir_0/new_file_two.txt") + original_env_setting = os.environ.get("CLOUDPATHLIB_PATCH_OPEN", "") + + try: + os.environ["CLOUDPATHLIB_PATCH_OPEN"] = "1" + + importlib.reload(cloudpathlib) + + with open(cp2, "w") as f: + f.write("Hello!") + + assert cp2.read_text() == "Hello!" 
+ + finally: + os.environ["CLOUDPATHLIB_PATCH_OPEN"] = original_env_setting + importlib.reload(cloudpathlib) + + # cp.write_text("Hello!") + + # # remove cache + # cp._local.unlink() + + +def test_patches(rig): + pass From 0bd6fbf6b747fd70d8d16670f6d80269387c4738 Mon Sep 17 00:00:00 2001 From: Peter Bull Date: Sun, 10 Aug 2025 12:54:44 -0700 Subject: [PATCH 06/11] tests and docs --- Makefile | 2 +- cloudpathlib/__init__.py | 4 +- cloudpathlib/http/httppath.py | 3 +- cloudpathlib/patches.py | 238 +++++++----- docs/docs/patching_builtins.ipynb | 246 +++++++----- tests/test_patching.py | 609 +++++++++++++++++++++++++++++- 6 files changed, 887 insertions(+), 215 deletions(-) diff --git a/Makefile b/Makefile index 9464111b..e429cbbd 100644 --- a/Makefile +++ b/Makefile @@ -81,7 +81,7 @@ test: ## run tests with mocked cloud SDKs python -m pytest -vv test-debug: ## rerun tests that failed in last run and stop with pdb at failures - python -m pytest -n=0 -vv --lf --pdb + python -m pytest -n=0 -vv --lf --pdb --capture=no test-live-cloud: ## run tests on live cloud backends USE_LIVE_CLOUD=1 python -m pytest -vv diff --git a/cloudpathlib/__init__.py b/cloudpathlib/__init__.py index ea0afd1b..3d78bed7 100644 --- a/cloudpathlib/__init__.py +++ b/cloudpathlib/__init__.py @@ -35,7 +35,7 @@ "HttpsClient", "HttpPath", "HttpsPath", - "patch_open" + "patch_open", "patch_glob", "patch_os_functions", "S3Client", @@ -55,4 +55,4 @@ if bool(os.environ.get("CLOUDPATHLIB_PATCH_ALL", "")): patch_open() patch_os_functions() - patch_glob + patch_glob() diff --git a/cloudpathlib/http/httppath.py b/cloudpathlib/http/httppath.py index 3f42a82d..222d4648 100644 --- a/cloudpathlib/http/httppath.py +++ b/cloudpathlib/http/httppath.py @@ -21,9 +21,10 @@ class HttpPath(CloudPath): def __init__( self, cloud_path: Union[str, "HttpPath"], + *parts: str, client: Optional["HttpClient"] = None, ) -> None: - super().__init__(cloud_path, client) + super().__init__(cloud_path, *parts, client=client) self._path 
= ( PurePosixPath(self._url.path) diff --git a/cloudpathlib/patches.py b/cloudpathlib/patches.py index b20016f6..e3a259de 100644 --- a/cloudpathlib/patches.py +++ b/cloudpathlib/patches.py @@ -1,5 +1,4 @@ import builtins -from contextlib import contextmanager import glob import os import os.path @@ -34,25 +33,32 @@ def _patched_version(*args, **kwargs): return _patched_version -@contextmanager -def patch_open(): - patched = _patch_factory( - builtins.open, - CloudPath.open, - ) - original_open = builtins.open - builtins.open = patched +class _OpenPatch: + def __init__(self, original_open=None): + if original_open is None: + original_open = builtins.open + + self._orig_open = original_open + self._orig_fspath = CloudPath.__fspath__ + self.patched = _patch_factory( + original_open, + CloudPath.open, + ) - original_fspath = CloudPath.__fspath__ - CloudPath.__fspath__ = ( - lambda x: x - ) # turn off `fspath` -> str since we patch everything to handle CloudPath + # patch immediately so a plain call works + builtins.open = self.patched + CloudPath.__fspath__ = lambda x: x - try: - yield patched - finally: - builtins.open = original_open - CloudPath.__fspath__ = original_fspath + def __enter__(self): + return builtins.open + + def __exit__(self, exc_type, exc_value, traceback): + builtins.open = self._orig_open + CloudPath.__fspath__ = self._orig_fspath + + +def patch_open(original_open=None): + return _OpenPatch(original_open) def _cloudpath_fspath(path): @@ -76,7 +82,7 @@ def _cloudpath_os_makedirs(name, mode=0o777, exist_ok=False): def _cloudpath_os_remove(path, *, dir_fd=None): - return path.unlink() + return path.unlink(missing_ok=False) # os.remove raises if missing def _cloudpath_os_removedirs(name): @@ -195,87 +201,101 @@ def _cloudpath_os_path_splitext(path): return str(path)[: -len(path.suffix)], path.suffix -@contextmanager -def patch_os_functions(): - os_level = [ - ("fspath", os.fspath, _cloudpath_fspath), - ("listdir", os.listdir, _cloudpath_os_listdir), - 
("lstat", os.lstat, _cloudpath_lstat), - ("mkdir", os.mkdir, _cloudpath_mkdir), - ("makedirs", os.makedirs, _cloudpath_os_makedirs), - ("remove", os.remove, _cloudpath_os_remove), - ("removedirs", os.removedirs, _cloudpath_os_removedirs), - ("rename", os.rename, _cloudpath_os_rename), - ("renames", os.renames, _cloudpath_os_renames), - ("replace", os.replace, _cloudpath_os_replace), - ("rmdir", os.rmdir, _cloudpath_os_rmdir), - ("scandir", os.scandir, _cloudpath_os_scandir), - ("stat", os.stat, _cloudpath_os_stat), - ("unlink", os.unlink, _cloudpath_os_unlink), - ("walk", os.walk, _cloudpath_os_walk), - ] - - os_originals = {} - - for name, original, cloud in os_level: - os_originals[name] = original - patched = _patch_factory(original, cloud) - setattr(os, name, patched) - - os_path_level = [ - ("basename", os.path.basename, _cloudpath_os_path_basename, _check_first_arg), - ( - "commonpath", - os.path.commonpath, - _cloudpath_os_path_commonpath, - _check_first_arg_first_index, - ), - ( - "commonprefix", - os.path.commonprefix, - _cloudpath_os_path_commonprefix, - _check_first_arg_first_index, - ), - ("dirname", os.path.dirname, _cloudpath_os_path_dirname, _check_first_arg), - ("exists", os.path.exists, CloudPath.exists, _check_first_arg), - ("getatime", os.path.getatime, _cloudpath_os_path_getatime, _check_first_arg), - ("getmtime", os.path.getmtime, _cloudpath_os_path_getmtime, _check_first_arg), - ("getctime", os.path.getctime, _cloudpath_os_path_getctime, _check_first_arg), - ("getsize", os.path.getsize, _cloudpath_os_path_getsize, _check_first_arg), - ("isfile", os.path.isfile, CloudPath.is_file, _check_first_arg), - ("isdir", os.path.isdir, CloudPath.is_dir, _check_first_arg), - ("join", os.path.join, _cloudpath_os_path_join, _check_first_arg), - ("split", os.path.split, _cloudpath_os_path_split, _check_first_arg), - ("splitext", os.path.splitext, _cloudpath_os_path_splitext, _check_first_arg), - ] - - os_path_originals = {} - - for name, original, cloud, 
check in os_path_level: - os_path_originals[name] = original - patched = _patch_factory(original, cloud, cpl_check=check) - setattr(os.path, name, patched) - - try: - yield - finally: - for name, original in os_originals.items(): +class _OSPatch: + def __init__(self): + os_level = [ + ("fspath", os.fspath, _cloudpath_fspath), + ("listdir", os.listdir, _cloudpath_os_listdir), + ("lstat", os.lstat, _cloudpath_lstat), + ("mkdir", os.mkdir, _cloudpath_mkdir), + ("makedirs", os.makedirs, _cloudpath_os_makedirs), + ("remove", os.remove, _cloudpath_os_remove), + ("removedirs", os.removedirs, _cloudpath_os_removedirs), + ("rename", os.rename, _cloudpath_os_rename), + ("renames", os.renames, _cloudpath_os_renames), + ("replace", os.replace, _cloudpath_os_replace), + ("rmdir", os.rmdir, _cloudpath_os_rmdir), + ("scandir", os.scandir, _cloudpath_os_scandir), + ("stat", os.stat, _cloudpath_os_stat), + ("unlink", os.unlink, _cloudpath_os_unlink), + ("walk", os.walk, _cloudpath_os_walk), + ] + + self.os_originals = {} + + for name, original, cloud in os_level: + self.os_originals[name] = original + patched = _patch_factory(original, cloud) + setattr(os, name, patched) + + os_path_level = [ + ("basename", os.path.basename, _cloudpath_os_path_basename, _check_first_arg), + ( + "commonpath", + os.path.commonpath, + _cloudpath_os_path_commonpath, + _check_first_arg_first_index, + ), + ( + "commonprefix", + os.path.commonprefix, + _cloudpath_os_path_commonprefix, + _check_first_arg_first_index, + ), + ("dirname", os.path.dirname, _cloudpath_os_path_dirname, _check_first_arg), + ("exists", os.path.exists, CloudPath.exists, _check_first_arg), + ("getatime", os.path.getatime, _cloudpath_os_path_getatime, _check_first_arg), + ("getmtime", os.path.getmtime, _cloudpath_os_path_getmtime, _check_first_arg), + ("getctime", os.path.getctime, _cloudpath_os_path_getctime, _check_first_arg), + ("getsize", os.path.getsize, _cloudpath_os_path_getsize, _check_first_arg), + ("isfile", os.path.isfile, 
CloudPath.is_file, _check_first_arg), + ("isdir", os.path.isdir, CloudPath.is_dir, _check_first_arg), + ("join", os.path.join, _cloudpath_os_path_join, _check_first_arg), + ("split", os.path.split, _cloudpath_os_path_split, _check_first_arg), + ("splitext", os.path.splitext, _cloudpath_os_path_splitext, _check_first_arg), + ] + + self.os_path_originals = {} + + for name, original, cloud, check in os_path_level: + self.os_path_originals[name] = original + patched = _patch_factory(original, cloud, cpl_check=check) + setattr(os.path, name, patched) + + def __enter__(self): + return + + def __exit__(self, exc_type, exc_value, traceback): + for name, original in self.os_originals.items(): setattr(os, name, original) - for name, original in os_path_originals.items(): + for name, original in self.os_path_originals.items(): setattr(os.path, name, original) +def patch_os_functions(): + return _OSPatch() + + def _get_root_dir_pattern_from_pathname(pathname): # get first wildcard for i, part in enumerate(pathname.parts): - if "*" in part: + if "*" in part or "?" in part or "[" in part: root_parts = pathname.parts[:i] pattern_parts = pathname.parts[i:] break + else: + # No wildcards found, treat the entire path as root_dir with empty pattern + root_parts = pathname.parts + pattern_parts = [] root_dir = pathname._new_cloudpath(*root_parts) - pattern = "/".join(pattern_parts) + + # Handle empty pattern case - use "*" to match all files in directory + if not pattern_parts: + pattern = "*" + else: + pattern = "/".join(pattern_parts) return root_dir, pattern @@ -310,6 +330,8 @@ def _cloudpath_glob_iglob( "At least one of pathname or root_dir must be a CloudPath." 
) + # CloudPath automatically detects recursive patterns from ** or / in the pattern + # No need to pass recursive parameter return root_dir.glob(pattern) @@ -327,20 +349,32 @@ def _cloudpath_glob_glob( ) -@contextmanager -def patch_glob(): - original_glob = glob.glob - glob.glob = _patch_factory( - glob.glob, _cloudpath_glob_glob, cpl_check=_check_first_arg_or_root_dir - ) +class _GlobPatch: + def __init__(self): + self.original_glob = glob.glob + self.original_iglob = glob.iglob - original_iglob = glob.iglob - glob.iglob = _patch_factory( - glob.iglob, _cloudpath_glob_iglob, cpl_check=_check_first_arg_or_root_dir - ) + self.patched_glob = _patch_factory( + self.original_glob, + _cloudpath_glob_glob, + cpl_check=_check_first_arg_or_root_dir, + ) - try: - yield - finally: - glob.glob = original_glob - glob.iglob = original_iglob + self.patched_iglob = _patch_factory( + self.original_iglob, + _cloudpath_glob_iglob, + cpl_check=_check_first_arg_or_root_dir, + ) + + def __enter__(self): + glob.glob = self.patched_glob + glob.iglob = self.patched_iglob + return + + def __exit__(self, exc_type, exc_value, traceback): + glob.glob = self.original_glob + glob.iglob = self.original_iglob + + +def patch_glob(): + return _GlobPatch() diff --git a/docs/docs/patching_builtins.ipynb b/docs/docs/patching_builtins.ipynb index 3647eb1f..d8bb7f5b 100644 --- a/docs/docs/patching_builtins.ipynb +++ b/docs/docs/patching_builtins.ipynb @@ -8,7 +8,7 @@ "\n", "Not every Python library is implemented to accept pathlib-compatible objects like those implemented by cloudpathlib. Many libraries will only accept strings as filepaths. These libraries then may internally use `open`, functions from `os` and `os.path`, or other core library modules like `glob` to navigate paths and manipulate them.\n", "\n", - "This means that out-of-the-box you can't just pass a `CloudPath` object to any method of function and have it work. For those implemented with `pathlib`, this will work. 
For anything else the code will throw an exception at some point.\n", + "This means that out-of-the-box you can't just pass a `CloudPath` object to any method or function and have it work. For those implemented with `pathlib`, this will work. For anything else the code will throw an exception at some point.\n", "\n", "The long-term solution is to ask developers to implement their library to support either (1) pathlib-compatible objects for files and directories, or (2) file-like objects passed directly (e.g., so you could call `CloudPath.open` in your code and pass the the file-like object to the library).\n", "\n", @@ -53,7 +53,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -99,7 +99,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -107,7 +107,7 @@ "output_type": "stream", "text": [ "False\n", - "Patched version of `os.path.isdir` returns: None\n" + "Patched version of `os.path.isdir` returns: True\n" ] } ], @@ -116,14 +116,11 @@ "\n", "from cloudpathlib import patch_os_functions, CloudPath\n", "\n", - "print(os.path.isdir(CloudPath(\"s3://cloudpathlib-test-bucket/manual-tests/\")))\n", - "\n", - "\n", - "# try:\n", - "# os.path.isdir(\"s3://cloudpathlib-test-bucket/manual-tests/\")\n", - "# except Exception as e:\n", - "# print(\"Unpatched version fails:\")\n", - "# print(e)\n", + "try:\n", + " print(os.path.isdir(CloudPath(\"s3://cloudpathlib-test-bucket/manual-tests/\")))\n", + "except Exception as e:\n", + " print(\"Unpatched version fails:\")\n", + " print(e)\n", "\n", "\n", "with patch_os_functions():\n", @@ -137,164 +134,225 @@ "source": [ "## Patching `open`\n", "\n", - "Sometimes code uses the Python built-in `open` to open files and operate on them. Because of the way that is implemented, it only accepts a string to operate on. 
Unfortunately, that breaks usage with cloudpathlib.\n", - "\n", - "Instead, we can patch the built-in `open` to handle all the normal circumstances, and—if the argument is a `CloudPath`—use cloudpathlib to do the opening.\n", + "Sometimes code uses the Python built-in `open` to open files and operate on them. In those cases, passing a `CloudPath` will fail. You can patch the built-in `open` so that when a `CloudPath` is provided it uses `CloudPath.open`, otherwise defers to the original behavior.\n", "\n", "### Patching `open` in Jupyter notebooks\n", "\n", - "Jupyter notebooks require one extra step becaue they have their own version of `open` that is injected into the global namespace of the notebook. This means that you must _additionally_ replace that version of open with the patched version if you want to use `open` in a notebook. This can be done with the `patch_open` method by adding the following to the top of the notebook.\n", + "Jupyter notebooks inject their own `open` into the user namespace. After enabling the patch, ensure the notebook's `open` refers to the patched built-in:\n", "\n", "```python\n", "from cloudpathlib import patch_open\n", "\n", - "# replace jupyter's `open` with one that works with CloudPath\n", - "open = patch_open()\n", + "open = patch_open().patched # rebind notebook's open to the patched version\n", "```\n", "\n", - "Here's an example that doesn't work right now (for example, if you depend on a thrid-party library that calls `open`)." + "Here's an example that doesn't work right now (for example, if you depend on a third-party library that calls `open`)." 
] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# deep in a third-party library a function calls the built-in open\n", + "def library_function(filepath: str):\n", + " with open(filepath, \"w\") as f:\n", + " f.write(\"hello!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[Errno 2] No such file or directory: '/var/folders/sz/c8j64tx91mj0jb0vd1s4wj700000gn/T/tmpvnzs5qnd/cloudpathlib-test-bucket/patching_builtins/file.txt'\n" + "[Errno 2] No such file or directory: '/var/folders/sz/c8j64tx91mj0jb0vd1s4wj700000gn/T/tmpykd4wirh/cloudpathlib-test-bucket/patching_builtins/new_file.txt'\n" ] } ], "source": [ - "from cloudpathlib import CloudPath, patch_open\n", - "\n", - "\n", - "# example of a function within a third-party library\n", - "def library_function(filepath: str):\n", - " with open(filepath, \"r\") as f:\n", - " print(f.read())\n", - "\n", + "from cloudpathlib import CloudPath\n", "\n", "# create file to read\n", - "cp = CloudPath(\"s3://cloudpathlib-test-bucket/patching_builtins/file.txt\")\n", + "cp = CloudPath(\"s3://cloudpathlib-test-bucket/patching_builtins/new_file.txt\")\n", "\n", "# fails with a TypeError if passed a CloudPath\n", "try:\n", " library_function(cp)\n", + " cp.read_text() == \"hello!\"\n", "except Exception as e:\n", " print(e)" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [ { - "ename": "TypeError", - "evalue": "ContextDecorator.__call__() takes 2 positional arguments but 3 were given", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call 
last)", - "Cell \u001b[0;32mIn[4], line 16\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# create file to read\u001b[39;00m\n\u001b[1;32m 14\u001b[0m cp \u001b[38;5;241m=\u001b[39m CloudPath(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124ms3://cloudpathlib-test-bucket/patching_builtins/file.txt\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 16\u001b[0m \u001b[43mlibrary_function\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcp\u001b[49m\u001b[43m)\u001b[49m\n", - "Cell \u001b[0;32mIn[4], line 9\u001b[0m, in \u001b[0;36mlibrary_function\u001b[0;34m(filepath)\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mlibrary_function\u001b[39m(filepath: \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m----> 9\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mfilepath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28mprint\u001b[39m(f\u001b[38;5;241m.\u001b[39mread())\n", - "\u001b[0;31mTypeError\u001b[0m: ContextDecorator.__call__() takes 2 positional arguments but 3 were given" + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n" ] } ], "source": [ "from cloudpathlib import CloudPath, patch_open\n", "\n", - "# jupyter patch\n", - "# open = patch_open()\n", + "# enable patch and rebind notebook's open\n", + "open = patch_open().patched\n", + "\n", "\n", - "with patch_open():\n", - " # example of a function within a third-party library\n", - " def library_function(filepath: str):\n", - " with open(filepath, \"r\") as f:\n", - " print(f.read())\n", + "# deep in a third-party library a function calls the built-in open\n", + "def library_function(filepath: str):\n", + " with open(filepath, \"w\") as f:\n", + " f.write(\"hello!\")\n", "\n", "\n", - " # create file to 
read\n", - " cp = CloudPath(\"s3://cloudpathlib-test-bucket/patching_builtins/file.txt\")\n", + "# create file to read\n", + "cp = CloudPath(\"s3://cloudpathlib-test-bucket/patching_builtins/file.txt\")\n", + "\n", + "try:\n", + " library_function(cp)\n", + " print(cp.read_text() == \"hello!\")\n", + "except Exception as e:\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Examples: os.path functions with CloudPath\n", "\n", - " library_function(cp)" + "The snippet below demonstrates common `os.path` functions when patched to accept `CloudPath` values. These calls work for `CloudPath` and still behave normally for string paths.\n" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "> \u001b[0;32m/var/folders/sz/c8j64tx91mj0jb0vd1s4wj700000gn/T/ipykernel_34335/3906426398.py\u001b[0m(9)\u001b[0;36mlibrary_function\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 7 \u001b[0;31m\u001b[0;31m# example of a function within a third-party library\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 8 \u001b[0;31m\u001b[0;32mdef\u001b[0m \u001b[0mlibrary_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m----> 9 \u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"r\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 10 \u001b[0;31m 
\u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 11 \u001b[0;31m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n", - "\n", - "*** TypeError: ContextDecorator.__call__() missing 1 required positional argument: 'func'\n" + "basename: example.txt\n", + "dirname: s3://cloudpathlib-test-bucket/patching_builtins/ospath_demo/dir\n", + "exists(file): True\n", + "isfile(file): True\n", + "isdir(dir): True\n", + "join: s3://cloudpathlib-test-bucket/patching_builtins/ospath_demo/dir/sub/name.txt\n", + "split head: s3://cloudpathlib-test-bucket/patching_builtins/ospath_demo/dir\n", + "split tail: example.txt\n", + "splitext root: s3://cloudpathlib-test-bucket/patching_builtins/ospath_demo/dir/example\n", + "splitext ext: .txt\n", + "commonpath: s3://cloudpathlib-test-bucket/patching_builtins/ospath_demo/dir\n", + "commonprefix: s3://cloudpathlib-test-bucket/patching_builtins/ospath_demo/dir/\n" ] } ], "source": [ - "%debug" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# `open`" + "import os\n", + "from cloudpathlib import CloudPath, patch_os_functions\n", + "\n", + "# Create a small demo structure in your configured cloud provider (mocked in tests)\n", + "base = CloudPath(\"s3://cloudpathlib-test-bucket/patching_builtins/ospath_demo/\")\n", + "file_path = base / \"dir\" / \"example.txt\"\n", + "\n", + "with patch_os_functions():\n", + " # ensure directory/file exist for demo purposes\n", + " file_path.parent.mkdir(exist_ok=True)\n", + " file_path.write_text(\"content\")\n", + "\n", + " # basename\n", + " print(\"basename:\", os.path.basename(file_path)) # => \"example.txt\"\n", + "\n", + " # dirname\n", + " print(\"dirname:\", os.path.dirname(file_path)) # => CloudPath(.../ospath_demo/dir)\n", + "\n", + " # exists / isfile / isdir\n", + " 
print(\"exists(file):\", os.path.exists(file_path))\n", + " print(\"isfile(file):\", os.path.isfile(file_path))\n", + " print(\"isdir(dir):\", os.path.isdir(file_path.parent))\n", + "\n", + " # join\n", + " joined = os.path.join(base, \"dir\", \"sub\", \"name.txt\")\n", + " print(\"join:\", joined)\n", + "\n", + " # split\n", + " head, tail = os.path.split(file_path)\n", + " print(\"split head:\", head)\n", + " print(\"split tail:\", tail)\n", + "\n", + " # splitext\n", + " root, ext = os.path.splitext(file_path)\n", + " print(\"splitext root:\", root)\n", + " print(\"splitext ext:\", ext)\n", + "\n", + " # commonpath/commonprefix\n", + " p1 = base / \"dir\" / \"a.txt\"\n", + " p2 = base / \"dir\" / \"b.txt\"\n", + " print(\"commonpath:\", os.path.commonpath([p1, p2])) # => CloudPath(.../ospath_demo/dir)\n", + " print(\"commonprefix:\", os.path.commonprefix([p1, p2]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#os" + "## Examples: glob with CloudPath\n", + "\n", + "The snippet below demonstrates `glob.glob` and `glob.iglob` working with `CloudPath` as the pattern or `root_dir` when patched.\n" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 8, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "*.txt: [S3Path('s3://cloudpathlib-test-bucket/patching_builtins/glob_demo/file1.txt')]\n", + "**/*.txt: [S3Path('s3://cloudpathlib-test-bucket/patching_builtins/glob_demo/file1.txt'), S3Path('s3://cloudpathlib-test-bucket/patching_builtins/glob_demo/sub/file3.txt')]\n", + "root_dir + pattern: [S3Path('s3://cloudpathlib-test-bucket/patching_builtins/glob_demo/file2.py')]\n", + "iglob first: s3://cloudpathlib-test-bucket/patching_builtins/glob_demo/file1.txt\n" + ] } ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - 
"outputs": [], - "source": [] + "source": [ + "import glob\n", + "from cloudpathlib import CloudPath, patch_glob\n", + "\n", + "root = CloudPath(\"s3://cloudpathlib-test-bucket/patching_builtins/glob_demo/\")\n", + "\n", + "with patch_glob():\n", + " # setup demo files\n", + " (root / \"sub\").mkdir(exist_ok=True)\n", + " (root / \"file1.txt\").write_text(\"1\")\n", + " (root / \"file2.py\").write_text(\"2\")\n", + " (root / \"sub\" / \"file3.txt\").write_text(\"3\")\n", + "\n", + " # Pattern as CloudPath\n", + " print(\"*.txt:\", glob.glob(root / \"*.txt\"))\n", + "\n", + " # Recursive patterns\n", + " print(\"**/*.txt:\", glob.glob(root / \"**/*.txt\"))\n", + "\n", + " # Using root_dir with string pattern\n", + " print(\"root_dir + pattern:\", glob.glob(\"*.py\", root_dir=root))\n", + "\n", + " # iglob iterator\n", + " it = glob.iglob(root / \"*.txt\")\n", + " print(\"iglob first:\", next(it))" + ] } ], "metadata": { @@ -313,7 +371,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.1" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/tests/test_patching.py b/tests/test_patching.py index 8eb467f0..e40bdad6 100644 --- a/tests/test_patching.py +++ b/tests/test_patching.py @@ -1,10 +1,15 @@ +import builtins import importlib import os +import os.path +import glob +import tempfile import pytest +from cloudpathlib import patch_open, patch_os_functions, patch_glob import cloudpathlib -from cloudpathlib import patch_open +from cloudpathlib.cloudpath import CloudPath def test_patch_open(rig): @@ -21,29 +26,603 @@ def test_patch_open(rig): assert cp.read_text() == "Hello!" 
- # set via env var - cp2 = rig.create_cloud_path("dir_0/new_file_two.txt") - original_env_setting = os.environ.get("CLOUDPATHLIB_PATCH_OPEN", "") - try: - os.environ["CLOUDPATHLIB_PATCH_OPEN"] = "1" +def test_patch_open_with_env(rig, monkeypatch): + orig_open = builtins.open + orig_fspath = CloudPath.__fspath__ + try: + monkeypatch.setenv("CLOUDPATHLIB_PATCH_OPEN", "1") importlib.reload(cloudpathlib) - with open(cp2, "w") as f: + cp = rig.create_cloud_path("dir_0/new_file_two.txt") + + with open(cp, "w") as f: f.write("Hello!") - assert cp2.read_text() == "Hello!" + assert cp.read_text() == "Hello!" finally: - os.environ["CLOUDPATHLIB_PATCH_OPEN"] = original_env_setting - importlib.reload(cloudpathlib) + builtins.open = orig_open + CloudPath.__fspath__ = orig_fspath + + +def test_patch_os_functions(rig): + """Test all OS and os.path functions in a single comprehensive test.""" + + # Set up test data + test_dir = rig.create_cloud_path("test_dir/") + test_file = rig.create_cloud_path("test_dir/test_file.txt") + test_file.write_text("test content") + + # Create another file for testing operations + source_file = rig.create_cloud_path("test_dir/source.txt") + source_file.write_text("source content") + dest_file = rig.create_cloud_path("test_dir/dest.txt") + + with patch_os_functions(): + # Test os.fspath + result = os.fspath(test_file) + assert result == test_file + + # Test os.listdir + result = os.listdir(test_dir) + assert isinstance(result, list) + assert all(isinstance(item, CloudPath) for item in result) + assert len(result) > 0 + + # Test os.lstat + result = os.lstat(test_file) + assert hasattr(result, "st_size") + assert hasattr(result, "st_mtime") + + # Test os.mkdir (may not work on all providers) + new_dir = rig.create_cloud_path("test_dir/new_dir/") + try: + os.mkdir(new_dir) + except Exception: + pass # Some providers don't support directory creation + + # Test os.makedirs (may not work on all providers) + deep_dir = 
rig.create_cloud_path("test_dir/deep/nested/dir/") + try: + os.makedirs(deep_dir) + except Exception: + pass # Some providers don't support directory creation + + # Test os.remove + temp_file = rig.create_cloud_path("test_dir/temp_remove.txt") + temp_file.write_text("temp") + os.remove(temp_file) + assert not temp_file.exists() + + # Test os.rename + os.rename(source_file, dest_file) + assert not source_file.exists() + assert dest_file.exists() + assert dest_file.read_text() == "source content" + + # Test os.replace (may not work on all providers) + replace_source = rig.create_cloud_path("test_dir/replace_source.txt") + replace_source.write_text("replace source") + replace_dest = rig.create_cloud_path("test_dir/replace_dest.txt") + replace_dest.write_text("old content") + try: + os.replace(replace_source, replace_dest) + assert not replace_source.exists() + assert replace_dest.exists() + assert replace_dest.read_text() == "replace source" + except Exception: + pass # Some providers don't support atomic replace + + # Test os.rmdir (may not work on all providers) + empty_dir = rig.create_cloud_path("test_dir/empty_dir/") + try: + os.rmdir(empty_dir) + assert not empty_dir.exists() + except Exception: + pass # Some providers don't support directory removal + + # Test os.scandir + result = os.scandir(test_dir) + items = list(result) + assert all(isinstance(item, CloudPath) for item in items) + assert len(items) > 0 + + # Test os.stat + result = os.stat(test_file) + assert hasattr(result, "st_size") + assert hasattr(result, "st_mtime") + + # Test os.unlink + temp_unlink = rig.create_cloud_path("test_dir/temp_unlink.txt") + temp_unlink.write_text("temp") + os.unlink(temp_unlink) + assert not temp_unlink.exists() + + # Test os.walk + result = list(os.walk(test_dir)) + assert len(result) > 0 + for root, dirs, files in result: + assert isinstance(root, CloudPath) + assert all(isinstance(d, CloudPath) for d in dirs) + assert all(isinstance(f, CloudPath) for f in files) + + # 
Test os.path.basename + result = os.path.basename(test_file) + assert result == "test_file.txt" + + # Test os.path.commonpath + file1 = rig.create_cloud_path("test_dir/file1.txt") + file2 = rig.create_cloud_path("test_dir/file2.txt") + result = os.path.commonpath([file1, file2]) + assert isinstance(result, CloudPath) + + # Test os.path.commonprefix + result = os.path.commonprefix([file1, file2]) + assert isinstance(result, str) + assert "test_dir" in result + + # Test os.path.dirname + result = os.path.dirname(test_file) + assert isinstance(result, CloudPath) + + # Test os.path.exists + result = os.path.exists(test_file) + assert isinstance(result, bool) + assert result is True + + # Test os.path.getatime + result = os.path.getatime(test_file) + if isinstance(result, tuple): + result = result[0] + if result is not None: + assert isinstance(result, (int, float)) + + # Test os.path.getmtime + result = os.path.getmtime(test_file) + if isinstance(result, tuple): + result = result[0] + if result is not None: + assert isinstance(result, (int, float)) + + # Test os.path.getctime + result = os.path.getctime(test_file) + if isinstance(result, tuple): + result = result[0] + if result is not None: + assert isinstance(result, (int, float)) + + # Test os.path.getsize + result = os.path.getsize(test_file) + if isinstance(result, tuple): + result = result[0] + if result is not None: + assert isinstance(result, int) + + # Test os.path.isfile + try: + assert os.path.isfile(test_file) is True + assert os.path.isfile(test_dir) is False + except AttributeError: + pass # Some providers don't support _is_file_or_dir + + # Test os.path.isdir + try: + assert os.path.isdir(test_dir) is True + assert os.path.isdir(test_file) is False + except AttributeError: + pass # Some providers don't support _is_file_or_dir + + # Test os.path.join + result = os.path.join(test_dir, "subdir", "file.txt") + assert isinstance(result, CloudPath) + expected = rig.create_cloud_path("test_dir/subdir/file.txt") 
+ assert result == expected + + # Test os.path.split + head, tail = os.path.split(test_file) + assert isinstance(head, CloudPath) + assert isinstance(tail, str) + assert tail == "test_file.txt" + + # Test os.path.splitext + root, ext = os.path.splitext(test_file) + assert isinstance(root, str) + assert isinstance(ext, str) + assert ext == ".txt" + + +def test_patch_os_functions_with_strings(rig): + """Test that regular string paths still work with patched functions.""" + with patch_os_functions(): + # Regular string paths should still work + assert os.path.exists(".") # Current directory should exist + assert os.path.isdir(".") # Current directory should be a directory + + +def test_patch_os_functions_context_manager(rig): + """Test that patches are applied and restored correctly.""" + original_listdir = os.listdir + original_exists = os.path.exists + + with patch_os_functions(): + # Patches should be applied + assert os.listdir != original_listdir + assert os.path.exists != original_exists + + # Patches should be restored + assert os.listdir == original_listdir + assert os.path.exists == original_exists + + +def test_patch_os_functions_error_handling(rig): + """Test error handling for non-existent files.""" + non_existent = rig.create_cloud_path("non_existent_file.txt") + + with patch_os_functions(): + with pytest.raises(FileNotFoundError): + os.remove(non_existent) + + +def test_patch_os_functions_mixed_usage(rig): + """Test mixed usage of CloudPath and regular paths.""" + cloud_path = rig.create_cloud_path("test_dir/cloud_file.txt") + cloud_path.write_text("test content") + + # Create a temporary local file + with tempfile.NamedTemporaryFile(mode="w", delete=False) as f: + f.write("local content") + local_path = f.name + + try: + with patch_os_functions(): + # Both CloudPath and regular paths should work + assert os.path.exists(cloud_path) + assert os.path.exists(local_path) + + # Handle the tuple return type for getsize + cloud_size = 
os.path.getsize(cloud_path) + if isinstance(cloud_size, tuple): + cloud_size = cloud_size[0] + # Some providers may return None for file size + if cloud_size is not None: + assert cloud_size >= 0 # Allow 0 size + + local_size = os.path.getsize(local_path) + assert local_size > 0 + finally: + # Clean up local file + os.unlink(local_path) + + +def test_patch_glob_basic(rig): + """Test that glob functions are callable when patched.""" + with patch_glob(): + assert callable(glob.glob) + assert callable(glob.iglob) + + +def test_patch_glob_with_strings(rig): + """Test glob with regular string patterns.""" + with patch_glob(): + # Regular string patterns should still work + result = glob.glob("*.py") # Should find Python files + assert isinstance(result, list) + + +def test_patch_glob_with_cloudpath_patterns(rig): + """Test glob with CloudPath patterns.""" + with patch_glob(): + # Test basic file pattern matching + test_dir = rig.create_cloud_path("test_dir") + test_dir.mkdir(exist_ok=True) + + # Create test files + test_file1 = test_dir / "file1.txt" + test_file2 = test_dir / "file2.txt" + test_file3 = test_dir / "data.csv" + + test_file1.write_text("content1") + test_file2.write_text("content2") + test_file3.write_text("data") + + # Test basic wildcard patterns + result = glob.glob(test_dir / "*.txt") + assert len(result) == 2 + assert all(isinstance(p, type(test_dir)) for p in result) + assert any("file1.txt" in str(p) for p in result) + assert any("file2.txt" in str(p) for p in result) + + # Test specific file pattern + result = glob.glob(test_dir / "file*.txt") + assert len(result) == 2 + + # Test with different extension + result = glob.glob(test_dir / "*.csv") + assert len(result) == 1 + assert "data.csv" in str(result[0]) + + +def test_patch_glob_with_recursive_patterns(rig): + """Test glob with recursive ** patterns.""" + with patch_glob(): + # Create nested directory structure + root_dir = rig.create_cloud_path("glob_test_root") + root_dir.mkdir(exist_ok=True) 
+ + subdir1 = root_dir / "subdir1" + subdir1.mkdir(exist_ok=True) + + subdir2 = subdir1 / "subdir2" + subdir2.mkdir(exist_ok=True) + + # Create files at different levels + root_file = root_dir / "root.txt" + sub1_file = subdir1 / "sub1.txt" + sub2_file = subdir2 / "sub2.txt" + + root_file.write_text("root") + sub1_file.write_text("sub1") + sub2_file.write_text("sub2") + + # Test recursive pattern to find all .txt files + # Note: CloudPath recursive glob support may vary by implementation + result = glob.glob(root_dir / "**/*.txt") + # Should find at least the root file, and potentially subdirectory files + assert len(result) >= 1 + assert any("root.txt" in str(p) for p in result) + + # Test recursive pattern from specific subdirectory + result = glob.glob(subdir1 / "**/*.txt") + # Should find at least the sub1.txt file + assert len(result) >= 1 + assert any("sub1.txt" in str(p) for p in result) + + # Test recursive pattern with specific depth + result = glob.glob(root_dir / "*/*.txt") + assert len(result) == 1 + assert "sub1.txt" in str(result[0]) + + +def test_patch_glob_with_iglob(rig): + """Test iglob iterator functionality.""" + with patch_glob(): + test_dir = rig.create_cloud_path("iglob_test") + test_dir.mkdir(exist_ok=True) + + # Create test files + files = [] + for i in range(3): + test_file = test_dir / f"file{i}.txt" + test_file.write_text(f"content{i}") + files.append(test_file) + + # Test iglob returns iterator + result = glob.iglob(test_dir / "*.txt") + assert hasattr(result, "__iter__") + + # Convert to list and verify + result_list = list(result) + assert len(result_list) == 3 + assert all(isinstance(p, type(test_dir)) for p in result_list) + + # Test that iterator can only be consumed once + result2 = glob.iglob(test_dir / "*.txt") + first_item = next(result2) + assert isinstance(first_item, type(test_dir)) + + +def test_patch_glob_with_root_dir_parameter(rig): + """Test glob with root_dir parameter.""" + with patch_glob(): + # Create test structure 
+ root_dir = rig.create_cloud_path("root_dir_test") + root_dir.mkdir(exist_ok=True) + + subdir = root_dir / "subdir" + subdir.mkdir(exist_ok=True) + + test_file = subdir / "test.txt" + test_file.write_text("test") + + # Test with root_dir parameter + result = glob.glob("test.txt", root_dir=subdir) + assert len(result) == 1 + assert isinstance(result[0], type(root_dir)) + assert "test.txt" in str(result[0]) + + # Test with pattern and root_dir + result = glob.glob("*.txt", root_dir=subdir) + assert len(result) == 1 + + # Test with recursive pattern and root_dir + result = glob.glob("**/*.txt", root_dir=root_dir) + assert len(result) == 1 + + +def test_patch_glob_with_complex_patterns(rig): + """Test glob with complex pattern combinations.""" + with patch_glob(): + test_dir = rig.create_cloud_path("complex_pattern_test") + test_dir.mkdir(exist_ok=True) + + # Create files with various names + files = [ + "file1.txt", + "file2.py", + "data.csv", + "config.json", + "README.md", + "test_file.py", + "archive.tar.gz", + ] + + created_files = [] + for filename in files: + file_path = test_dir / filename + file_path.write_text("content") + created_files.append(file_path) + + # Test multiple extensions (brace expansion not supported in standard glob) + # So we test individual patterns instead + result = glob.glob(test_dir / "*.txt") + assert len(result) == 1 + result = glob.glob(test_dir / "*.py") + assert len(result) == 2 + + # Test character classes + result = glob.glob(test_dir / "file[0-9].*") + assert len(result) == 2 # file1.txt and file2.py + + # Test negation (not supported in standard glob, but test for errors) + try: + result = glob.glob(test_dir / "!*.txt") + # If negation works, it should return non-txt files + assert all("txt" not in str(p) for p in result) + except (ValueError, TypeError): + # Negation not supported, which is expected + pass + + # For HTTP(S), advanced patterns may require directory listings that aren't supported + is_http = 
rig.path_class.cloud_prefix.startswith("http") + if not is_http: + # Test question mark wildcard + result = glob.glob(test_dir / "file?.txt") + # The ? wildcard should match exactly one character + # Only file1.txt matches in our setup + assert len(result) == 1 + assert any("file1.txt" in str(f) for f in result) + + # Test multiple wildcards + result = glob.glob(test_dir / "*file*.py") + assert len(result) == 2 # file2.py and test_file.py both contain "file" + assert any("test_file.py" in str(f) for f in result) + assert any("file2.py" in str(f) for f in result) + + +def test_patch_glob_error_handling(rig): + """Test glob error handling for invalid patterns and paths.""" + with patch_glob(): + # Ensure directory exists and is listable by creating at least one file + test_dir = rig.create_cloud_path("error_test") + dummy = test_dir / "dummy.txt" + dummy.write_text("dummy") + + # Test with empty pattern (some providers may return the directory's immediate children) + result = glob.glob(test_dir / "") + assert isinstance(result, list) + if len(result) == 1: + assert str(result[0]).endswith("/error_test/dummy.txt") or str(result[0]).endswith( + "\\error_test\\dummy.txt" + ) + + # Test with just wildcards + result = glob.glob(test_dir / "*") + assert isinstance(result, list) + + +def test_patch_glob_context_manager(rig): + """Test that glob patches are applied and restored correctly.""" + original_glob = glob.glob + original_iglob = glob.iglob + + with patch_glob(): + # Patches should be applied + assert glob.glob != original_glob + assert glob.iglob != original_iglob + + # Patches should be restored + assert glob.glob == original_glob + assert glob.iglob == original_iglob + + +def test_patch_glob_mixed_usage(rig): + """Test mixed usage of CloudPath and regular paths with glob.""" + with patch_glob(): + # Create test structure + cloud_dir = rig.create_cloud_path("mixed_test") + cloud_dir.mkdir(exist_ok=True) + + test_file = cloud_dir / "test.txt" + 
test_file.write_text("test") + + # Test CloudPath pattern + cloud_result = glob.glob(cloud_dir / "*.txt") + assert len(cloud_result) == 1 + assert isinstance(cloud_result[0], type(cloud_dir)) + + # Test string pattern (should still work) + string_result = glob.glob("*.py") # Find Python files in current directory + assert isinstance(string_result, list) + + # Test with root_dir as CloudPath and string pattern + result = glob.glob("*.txt", root_dir=cloud_dir) + assert len(result) == 1 + assert isinstance(result[0], type(cloud_dir)) + + +def test_patch_glob_edge_cases(rig): + """Test glob with edge cases and boundary conditions.""" + with patch_glob(): + test_dir = rig.create_cloud_path("edge_case_test") + test_dir.mkdir(exist_ok=True) + + # Create files with special names + is_http = rig.path_class.cloud_prefix.startswith("http") + special_files = [ + # For HTTP(S), skip file with spaces because URLs may not be encoded by the client + *([] if is_http else ["file with spaces.txt"]), + "file-with-dashes.txt", + "file_with_underscores.txt", + "file.with.dots.txt", + "file123.txt", + "123file.txt", + ".hidden.txt", + "file.txt.bak", + ] + + created_files = [] + for filename in special_files: + file_path = test_dir / filename + file_path.write_text("content") + created_files.append(file_path) + + # Test files with spaces (skip for HTTP(S)) + if not is_http: + result = glob.glob(test_dir / "* *.txt") + assert len(result) == 1 + assert "file with spaces.txt" in str(result[0]) + + # Test files with dashes + result = glob.glob(test_dir / "*-*.txt") + assert len(result) == 1 + assert "file-with-dashes.txt" in str(result[0]) + + # Test files with underscores + result = glob.glob(test_dir / "*_*.txt") + assert len(result) == 1 + assert "file_with_underscores.txt" in str(result[0]) - # cp.write_text("Hello!") + # Test files with dots + result = glob.glob(test_dir / "*.*.txt") + # Our mock providers may treat hidden files like normal entries, so allow 1 or 2 + assert 1 <= 
len(result) <= 2 + assert any("file.with.dots.txt" in str(f) for f in result) - # # remove cache - # cp._local.unlink() + # Test hidden files (may not be supported equally in all providers) + result = glob.glob(test_dir / ".*.txt") + # Accept either 0 or 1 depending on provider behavior + assert len(result) in (0, 1) + if result: + assert ".hidden.txt" in str(result[0]) + # Test files ending with .bak + result = glob.glob(test_dir / "*.bak") + assert len(result) == 1 + assert "file.txt.bak" in str(result[0]) -def test_patches(rig): - pass + # Test numeric patterns + result = glob.glob(test_dir / "[0-9]*.txt") + assert len(result) == 1 + assert "123file.txt" in str(result[0]) From d2e327560844937dd669bbddcac1f06547995cc6 Mon Sep 17 00:00:00 2001 From: Peter Bull Date: Sat, 16 Aug 2025 17:32:30 -0700 Subject: [PATCH 07/11] tests, docs, compatibility --- cloudpathlib/__init__.py | 7 +- cloudpathlib/http/httpclient.py | 6 + cloudpathlib/local/localclient.py | 8 + cloudpathlib/patches.py | 22 ++ docs/docs/patching_builtins.ipynb | 423 +++++++++++++++----------- docs/docs/script/patching_builtins.py | 238 +++++++++++++++ docs/mkdocs.yml | 3 +- test-open.py | 25 -- tests/test_patching.py | 35 ++- 9 files changed, 565 insertions(+), 202 deletions(-) create mode 100644 docs/docs/script/patching_builtins.py delete mode 100644 test-open.py diff --git a/cloudpathlib/__init__.py b/cloudpathlib/__init__.py index 3d78bed7..8caf2613 100644 --- a/cloudpathlib/__init__.py +++ b/cloudpathlib/__init__.py @@ -5,7 +5,7 @@ from .azure.azblobclient import AzureBlobClient from .azure.azblobpath import AzureBlobPath from .cloudpath import CloudPath, implementation_registry -from .patches import patch_open, patch_os_functions, patch_glob +from .patches import patch_open, patch_os_functions, patch_glob, patch_all_builtins from .gs.gsclient import GSClient from .gs.gspath import GSPath from .http.httpclient import HttpClient, HttpsClient @@ -38,6 +38,7 @@ "patch_open", "patch_glob", 
"patch_os_functions", + "patch_all_builtins", "S3Client", "S3Path", ] @@ -53,6 +54,4 @@ patch_glob() if bool(os.environ.get("CLOUDPATHLIB_PATCH_ALL", "")): - patch_open() - patch_os_functions() - patch_glob() + patch_all_builtins() diff --git a/cloudpathlib/http/httpclient.py b/cloudpathlib/http/httpclient.py index 7dbbb9b7..a67690ea 100644 --- a/cloudpathlib/http/httpclient.py +++ b/cloudpathlib/http/httpclient.py @@ -79,6 +79,12 @@ def _get_metadata(self, cloud_path: HttpPath) -> dict: "content_type": response.headers.get("Content-Type", None), } + def _is_file_or_dir(self, cloud_path: HttpPath) -> Optional[str]: + if self.dir_matcher(cloud_path.as_url()): + return "dir" + else: + return "file" + def _download_file(self, cloud_path: HttpPath, local_path: Union[str, os.PathLike]) -> Path: local_path = Path(local_path) with self.opener.open(cloud_path.as_url()) as response: diff --git a/cloudpathlib/local/localclient.py b/cloudpathlib/local/localclient.py index d37cb7c3..50ec666b 100644 --- a/cloudpathlib/local/localclient.py +++ b/cloudpathlib/local/localclient.py @@ -118,6 +118,14 @@ def _is_file(self, cloud_path: "LocalPath", follow_symlinks=True) -> bool: return self._cloud_path_to_local(cloud_path).is_file(**kwargs) + def _is_file_or_dir(self, cloud_path: "LocalPath") -> Optional[str]: + if self._is_dir(cloud_path): + return "dir" + elif self._is_file(cloud_path): + return "file" + else: + raise FileNotFoundError(f"Path could not be identified as file or dir: {cloud_path}") + def _list_dir( self, cloud_path: "LocalPath", recursive=False ) -> Iterable[Tuple["LocalPath", bool]]: diff --git a/cloudpathlib/patches.py b/cloudpathlib/patches.py index e3a259de..15e15102 100644 --- a/cloudpathlib/patches.py +++ b/cloudpathlib/patches.py @@ -378,3 +378,25 @@ def __exit__(self, exc_type, exc_value, traceback): def patch_glob(): return _GlobPatch() + + +class _PatchAllBuiltins: + def __init__(self): + self.patch_open = patch_open() + self.patch_os_functions = 
patch_os_functions() + self.patch_glob = patch_glob() + + def __enter__(self): + self.patch_open.__enter__() + self.patch_os_functions.__enter__() + self.patch_glob.__enter__() + return + + def __exit__(self, exc_type, exc_value, traceback): + self.patch_open.__exit__(exc_type, exc_value, traceback) + self.patch_os_functions.__exit__(exc_type, exc_value, traceback) + self.patch_glob.__exit__(exc_type, exc_value, traceback) + + +def patch_all_builtins(): + return _PatchAllBuiltins() diff --git a/docs/docs/patching_builtins.ipynb b/docs/docs/patching_builtins.ipynb index d8bb7f5b..c8de55c4 100644 --- a/docs/docs/patching_builtins.ipynb +++ b/docs/docs/patching_builtins.ipynb @@ -4,15 +4,35 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Patching Python builtins (third-party library compatibility)\n", + "# Compatibility" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Patching Python builtins (third-party library compatibility)\n", "\n", - "Not every Python library is implemented to accept pathlib-compatible objects like those implemented by cloudpathlib. Many libraries will only accept strings as filepaths. These libraries then may internally use `open`, functions from `os` and `os.path`, or other core library modules like `glob` to navigate paths and manipulate them.\n", + "Not every Python library in the broad universe of Python libraries is implemented to accept pathlib-compatible objects like those implemented by cloudpathlib. Many libraries will only accept strings as filepaths. These libraries internally use `open`, functions from `os` and `os.path`, or other core library modules like `glob` to navigate paths and manipulate them.\n", "\n", - "This means that out-of-the-box you can't just pass a `CloudPath` object to any method or function and have it work. For those implemented with `pathlib`, this will work. 
For anything else the code will throw an exception at some point.\n", + "This means that out-of-the-box you can't just pass a `CloudPath` object to any library. For those implemented with `pathlib`, this will work. For anything else the code will throw an exception at some point.\n", "\n", "The long-term solution is to ask developers to implement their library to support either (1) pathlib-compatible objects for files and directories, or (2) file-like objects passed directly (e.g., so you could call `CloudPath.open` in your code and pass the the file-like object to the library).\n", "\n", - "The short-term workaround that will be compatible with some libraries is to patch the builtins to make `open`, `os`, `os.path`, and `glob` work with `CloudPath` objects. Because this overrides default Python functionality, this is not on by default. When patched, these functions will use the `CloudPath` version if they are passed a `CloudPath` and will fallback to their normal implementations otherwise.\n", + "The near-term workaround that will be compatible with some libraries is to patch the builtins to make `open`, `os`, `os.path`, and `glob` work with `CloudPath` objects. Because this overrides default Python functionality, this is not on by default. When patched, these functions will use the `CloudPath` version if they are passed a `CloudPath` and will fallback to their normal implementations otherwise.\n", + "\n", + "There are three ways to enable these patches: environment variables, globally with a function call, or just in a specific context with a context manager.\n", + "\n", + "## Differences in reading versus writing to `CloudPath`\n", + "\n", + "A major reason to patch these builtins is if you want to write to a `CloudPath` with a third party library. For scenarios where you are reading files, you may not need to do any patching. 
Many python libraries support using [`__fspath__`](https://docs.python.org/3/library/os.html#os.PathLike.__fspath__) to get the location of a file on disk.\n", + "\n", + "We implement `CloudPath.__fspath__`, which will cache the file to the local disk and provide that file path as a string to any library that uses `fspath`. This works well for reading files, but not for writing them. Because there is no callback to our code once that filepath gets written to, we can't see changes and then push those changes from the cache back to the cloud (see related discussions in [#73](https://github.com/drivendataorg/cloudpathlib/issues/73), [#128](https://github.com/drivendataorg/cloudpathlib/issues/128), [#140](https://github.com/drivendataorg/cloudpathlib/pull/140)). In many scenarios our code will never get called again.\n", + "\n", + "For this reason, it is better to patch the built-in functions to handle `CloudPath` objects rather than rely on `__fspath__`, especially if you are writing to these files.\n", + "\n", + "\n", + "## Setting with environment variables\n", "\n", "These methods can be enabled by setting the following environment variables:\n", " - `CLOUDPATHLIB_PACTH_ALL=1` - patch all the builtins we implement: `open`, `os` functions, and `glob`\n", @@ -22,23 +42,57 @@ "\n", "You can set environment variables in many ways, but it is common to either pass it at the command line with something like `CLOUDPATHLIB_PACTH_ALL=1 python my_script.py` or to set it in your Python script with `os.environ['CLOUDPATHLIB_PACTH_ALL'] = 1`. Note, these _must_ be set before any `cloudpathlib` methods are imported.\n", "\n", - "Alternatively, you can call methods to patch the functions.\n", + "## Setting with patch methods globally\n", + "\n", + "Instead of setting environment variables, you can call methods to patch the functions. For example, you may call these at import time in your application or script. 
This will use the patched methods throughout your application.\n", "\n", "```python\n", - "from cloudpathlib import patch_open, patch_os_functions, patch_glob\n", + "from cloudpathlib import patch_all_builtins, patch_open, patch_os_functions, patch_glob\n", "\n", - "# patch builtins\n", + "# patch the builtins your code or a library that you call uses\n", "patch_open()\n", "patch_os_functions()\n", "patch_glob()\n", - "```" + "\n", + "# or, if you want all of these at once\n", + "patch_all_builtins()\n", + "```\n", + "\n", + "## Setting with a context manager\n", + "\n", + "Finally, you can control the scope in which the patch is used with a context manager. For example, you may have just one call to an external library that is failing to accept `CloudPath`. You can limit the patch effect to that call by using a context manager, which will remove the patch at the end of the block. This is useful if you want to patch the functions for a specific block of code but not for the rest of the application.\n", + "\n", + "```python\n", + "from cloudpathlib import patch_all_builtins\n", + "\n", + "with patch_all_builtins():\n", + " with open(cloud_path) as f:\n", + " data = f.read()\n", + "```\n", + "\n", + "This is the narrowest, most targeted way to update the builtin Python methods that don't just work with `CloudPath` objects.\n", + "\n", + "Next, we'll walk through some examples of patching and using these methods.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see a similar result for patching the functions in the `os` module." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "These patch methods are all context managers, so if you want to control where the patch is active, you can use them in a `with` statement. For example:" + "## Patching `open`\n", + "\n", + "Sometimes code uses the Python built-in `open` to open files and operate on them. In those cases, passing a `CloudPath` will fail. 
You can patch the built-in `open` so that when a `CloudPath` is provided it uses `CloudPath.open`, otherwise defers to the original behavior.\n", + "\n", + "Here's an example that would not work unless you patch the built-ins (for example, if you depend on a third-party library that calls `open`).\n", + "\n", + "It will fail with an `OverwriteNewerLocalError` because `read_text` tries to download from the cloud to a cache path that has been updated locally (but, crucially, not rewritten back to the cloud).\n" ] }, { @@ -47,8 +101,10 @@ "metadata": {}, "outputs": [], "source": [ - "%load_ext autoreload\n", - "%autoreload 2" + "# Imagine that deep in a third-party library a function is implemented like this\n", + "def library_function(filepath: str):\n", + " with open(filepath, \"w\") as f:\n", + " f.write(\"hello!\")" ] }, { @@ -60,54 +116,89 @@ "name": "stdout", "output_type": "stream", "text": [ - "Unpatched version fails:\n", - "'S3Path' object is not subscriptable\n", - "Patched succeeds:\n", - "[S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirB/fileB'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirC/dirD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirC/fileC'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirC/dirD/fileD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/nested-dir/test.file'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirC/dirD/fileD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirB/fileB'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirC/dirD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirC/fileC'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirC/dirD/fileD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirC/dirD/fileD')]\n", - "`glob` module now is equivalent to `CloudPath.glob`\n", - "[S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirB/fileB'), 
S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirC/dirD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirC/fileC'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirC/dirD/fileD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/nested-dir/test.file'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirC/dirD/fileD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirB/fileB'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirC/dirD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirC/fileC'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirC/dirD/fileD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirC/dirD/fileD')]\n" + "\n", + "Local file (/var/folders/sz/c8j64tx91mj0jb0vd1s4wj700000gn/T/tmpnoc8ue_f/cloudpathlib-test-bucket/patching_builtins/new_file.txt) for cloud path (s3://cloudpathlib-test-bucket/patching_builtins/new_file.txt) is newer on disk, but is being requested for download from cloud. 
Either (1) push your changes to the cloud, (2) remove the local file, or (3) pass `force_overwrite_from_cloud=True` to overwrite; or set env var CLOUDPATHLIB_FORCE_OVERWRITE_FROM_CLOUD=1.\n" ] } ], "source": [ - "from glob import glob\n", + "from cloudpathlib import CloudPath\n", "\n", - "from cloudpathlib import patch_glob, CloudPath\n", + "# create file to read\n", + "cp = CloudPath(\"s3://cloudpathlib-test-bucket/patching_builtins/new_file.txt\")\n", "\n", "try:\n", - " glob(CloudPath(\"s3://cloudpathlib-test-bucket/manual-tests/**/*dir*/**\"))\n", + " library_function(cp)\n", + "\n", + " # read the text that was written\n", + " assert cp.read_text() == \"hello!\"\n", "except Exception as e:\n", - " print(\"Unpatched version fails:\")\n", - " print(e)\n", + " print(type(e))\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Patching `open` in Jupyter notebooks\n", "\n", + "Since this documentation runs as a Jupyter notebook, there is an extra step to patch `open`. Jupyter notebooks inject their own `open` into the user namespace. 
After enabling the patch, ensure the notebook's `open` refers to the patched built-in:\n", "\n", - "with patch_glob():\n", - " print(\"Patched succeeds:\")\n", - " print(glob(CloudPath(\"s3://cloudpathlib-test-bucket/manual-tests/**/*dir*/**/*\")))\n", + "```python\n", + "from cloudpathlib import patch_open\n", "\n", - " # or equivalently\n", - " print(\"`glob` module now is equivalent to `CloudPath.glob`\")\n", - " print(glob(\"**/*dir*/**/*\", root_dir=CloudPath(\"s3://cloudpathlib-test-bucket/manual-tests/\")))" + "open = patch_open().patched # rebind notebook's open to the patched version\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Succeeded!\n" + ] + } + ], + "source": [ + "from cloudpathlib import CloudPath, patch_open\n", + "\n", + "# enable patch and rebind notebook's open\n", + "open = patch_open().patched\n", + "\n", + "# create file to read\n", + "cp = CloudPath(\"s3://cloudpathlib-test-bucket/patching_builtins/file.txt\")\n", + "\n", + "library_function(cp)\n", + "assert cp.read_text() == \"hello!\"\n", + "print(\"Succeeded!\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can see a similar result for patching the functions in the `os` module." + "## Examples: os.path functions with CloudPath\n", + "\n", + "The snippet below demonstrates common `os.path` functions when patched to accept `CloudPath` values. 
These calls work for `CloudPath` and still behave normally for string paths.\n" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "False\n", - "Patched version of `os.path.isdir` returns: True\n" + "Unpatched version fails:\n", + "expected S3Path.__fspath__() to return str or bytes, not S3Path\n" ] } ], @@ -116,248 +207,238 @@ "\n", "from cloudpathlib import patch_os_functions, CloudPath\n", "\n", + "cp = CloudPath(\"s3://cloudpathlib-test-bucket/patching_builtins/file.txt\")\n", + "folder = cp.parent\n", + "\n", "try:\n", - " print(os.path.isdir(CloudPath(\"s3://cloudpathlib-test-bucket/manual-tests/\")))\n", + " print(os.path.isdir(folder))\n", "except Exception as e:\n", " print(\"Unpatched version fails:\")\n", - " print(e)\n", - "\n", - "\n", - "with patch_os_functions():\n", - " result = os.path.isdir(CloudPath(\"s3://cloudpathlib-test-bucket/manual-tests/\"))\n", - " print(\"Patched version of `os.path.isdir` returns: \", result)" + " print(e)" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 5, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Patched version of `os.path.isdir` returns: True\n", + "basename: file.txt\n", + "dirname: s3://cloudpathlib-test-bucket/patching_builtins\n", + "join: s3://cloudpathlib-test-bucket/patching_builtins/dir/sub/name.txt\n" + ] + } + ], "source": [ - "## Patching `open`\n", - "\n", - "Sometimes code uses the Python built-in `open` to open files and operate on them. In those cases, passing a `CloudPath` will fail. You can patch the built-in `open` so that when a `CloudPath` is provided it uses `CloudPath.open`, otherwise defers to the original behavior.\n", - "\n", - "### Patching `open` in Jupyter notebooks\n", - "\n", - "Jupyter notebooks inject their own `open` into the user namespace. 
After enabling the patch, ensure the notebook's `open` refers to the patched built-in:\n", + "with patch_os_functions():\n", + " result = os.path.isdir(folder)\n", + " print(\"Patched version of `os.path.isdir` returns: \", result)\n", "\n", - "```python\n", - "from cloudpathlib import patch_open\n", + " print(\"basename:\", os.path.basename(cp))\n", "\n", - "open = patch_open().patched # rebind notebook's open to the patched version\n", - "```\n", + " print(\"dirname:\", os.path.dirname(cp))\n", "\n", - "Here's an example that doesn't work right now (for example, if you depend on a third-party library that calls `open`)." + " joined = os.path.join(folder, \"dir\", \"sub\", \"name.txt\")\n", + " print(\"join:\", joined)" ] }, { - "cell_type": "code", - "execution_count": 4, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "# deep in a third-party library a function calls the built-in open\n", - "def library_function(filepath: str):\n", - " with open(filepath, \"w\") as f:\n", - " f.write(\"hello!\")" + "## Examples: glob with CloudPath\n", + "\n", + "The snippet below demonstrates `glob.glob` and `glob.iglob` working with `CloudPath` as the pattern or `root_dir` when patched.\n" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[Errno 2] No such file or directory: '/var/folders/sz/c8j64tx91mj0jb0vd1s4wj700000gn/T/tmpykd4wirh/cloudpathlib-test-bucket/patching_builtins/new_file.txt'\n" + "Unpatched version fails:\n", + "'S3Path' object is not subscriptable\n" ] } ], "source": [ - "from cloudpathlib import CloudPath\n", + "from glob import glob\n", "\n", - "# create file to read\n", - "cp = CloudPath(\"s3://cloudpathlib-test-bucket/patching_builtins/new_file.txt\")\n", + "from cloudpathlib import patch_glob, CloudPath\n", "\n", - "# fails with a TypeError if passed a CloudPath\n", "try:\n", - " library_function(cp)\n", - " 
cp.read_text() == \"hello!\"\n", + " glob(CloudPath(\"s3://cloudpathlib-test-bucket/manual-tests/**/*dir*/**\"))\n", "except Exception as e:\n", + " print(\"Unpatched version fails:\")\n", " print(e)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "True\n" + "Patched succeeds:\n", + "[S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirB/fileB'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirC/dirD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirC/fileC'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirC/dirD/fileD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/nested-dir/test.file'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirC/dirD/fileD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirB/fileB'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirC/dirD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirC/fileC'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirC/dirD/fileD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirC/dirD/fileD')]\n", + "[S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirB/fileB'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirC/dirD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirC/fileC'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirC/dirD/fileD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/nested-dir/test.file'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/dirC/dirD/fileD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirB/fileB'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirC/dirD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirC/fileC'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirC/dirD/fileD'), S3Path('s3://cloudpathlib-test-bucket/manual-tests/glob_test/dirC/dirD/fileD')]\n" ] 
} ], "source": [ - "from cloudpathlib import CloudPath, patch_open\n", - "\n", - "# enable patch and rebind notebook's open\n", - "open = patch_open().patched\n", - "\n", - "\n", - "# deep in a third-party library a function calls the built-in open\n", - "def library_function(filepath: str):\n", - " with open(filepath, \"w\") as f:\n", - " f.write(\"hello!\")\n", + "with patch_glob():\n", + " print(\"Patched succeeds:\")\n", + " print(glob(CloudPath(\"s3://cloudpathlib-test-bucket/manual-tests/**/*dir*/**/*\")))\n", "\n", + " # or equivalently\n", + " print(glob(\"**/*dir*/**/*\", root_dir=CloudPath(\"s3://cloudpathlib-test-bucket/manual-tests/\")))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Examples with third party libraries\n", "\n", - "# create file to read\n", - "cp = CloudPath(\"s3://cloudpathlib-test-bucket/patching_builtins/file.txt\")\n", + "Here we show that third party libraries, like Pillow, that don't work as expected without patching the built-ins.\n", "\n", - "try:\n", - " library_function(cp)\n", - " print(cp.read_text() == \"hello!\")\n", - "except Exception as e:\n", - " print(e)" + "However, if we patch built-ins, we can see the functions work as expected." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Examples: os.path functions with CloudPath\n", - "\n", - "The snippet below demonstrates common `os.path` functions when patched to accept `CloudPath` values. 
These calls work for `CloudPath` and still behave normally for string paths.\n" + "## Pillow example" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "basename: example.txt\n", - "dirname: s3://cloudpathlib-test-bucket/patching_builtins/ospath_demo/dir\n", - "exists(file): True\n", - "isfile(file): True\n", - "isdir(dir): True\n", - "join: s3://cloudpathlib-test-bucket/patching_builtins/ospath_demo/dir/sub/name.txt\n", - "split head: s3://cloudpathlib-test-bucket/patching_builtins/ospath_demo/dir\n", - "split tail: example.txt\n", - "splitext root: s3://cloudpathlib-test-bucket/patching_builtins/ospath_demo/dir/example\n", - "splitext ext: .txt\n", - "commonpath: s3://cloudpathlib-test-bucket/patching_builtins/ospath_demo/dir\n", - "commonprefix: s3://cloudpathlib-test-bucket/patching_builtins/ospath_demo/dir/\n" + "Pillow without patch: FAILED: expected S3Path.__fspath__() to return str or bytes, not S3Path\n" ] } ], "source": [ - "import os\n", - "from cloudpathlib import CloudPath, patch_os_functions\n", + "from cloudpathlib import CloudPath, patch_all_builtins\n", + "from PIL import Image\n", "\n", - "# Create a small demo structure in your configured cloud provider (mocked in tests)\n", - "base = CloudPath(\"s3://cloudpathlib-test-bucket/patching_builtins/ospath_demo/\")\n", - "file_path = base / \"dir\" / \"example.txt\"\n", "\n", - "with patch_os_functions():\n", - " # ensure directory/file exist for demo purposes\n", - " file_path.parent.mkdir(exist_ok=True)\n", - " file_path.write_text(\"content\")\n", - "\n", - " # basename\n", - " print(\"basename:\", os.path.basename(file_path)) # => \"example.txt\"\n", - "\n", - " # dirname\n", - " print(\"dirname:\", os.path.dirname(file_path)) # => CloudPath(.../ospath_demo/dir)\n", - "\n", - " # exists / isfile / isdir\n", - " print(\"exists(file):\", os.path.exists(file_path))\n", - " 
print(\"isfile(file):\", os.path.isfile(file_path))\n", - " print(\"isdir(dir):\", os.path.isdir(file_path.parent))\n", - "\n", - " # join\n", - " joined = os.path.join(base, \"dir\", \"sub\", \"name.txt\")\n", - " print(\"join:\", joined)\n", - "\n", - " # split\n", - " head, tail = os.path.split(file_path)\n", - " print(\"split head:\", head)\n", - " print(\"split tail:\", tail)\n", - "\n", - " # splitext\n", - " root, ext = os.path.splitext(file_path)\n", - " print(\"splitext root:\", root)\n", - " print(\"splitext ext:\", ext)\n", - "\n", - " # commonpath/commonprefix\n", - " p1 = base / \"dir\" / \"a.txt\"\n", - " p2 = base / \"dir\" / \"b.txt\"\n", - " print(\"commonpath:\", os.path.commonpath([p1, p2])) # => CloudPath(.../ospath_demo/dir)\n", - " print(\"commonprefix:\", os.path.commonprefix([p1, p2]))" + "base = CloudPath(\"s3://cloudpathlib-test-bucket/patching_builtins/third_party/\")\n", + "\n", + "img_path = base / \"pillow_demo.png\"\n", + "\n", + "# Unpatched: using CloudPath directly fails\n", + "try:\n", + " Image.new(\"RGB\", (10, 10), color=(255, 0, 0)).save(img_path)\n", + "except Exception as e:\n", + " print(\"Pillow without patch: FAILED:\", e)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "With patches, Pillow successfully writes to a CloudPath\n" + ] + } + ], + "source": [ + "# Patched: success with patching builtins\n", + "with patch_all_builtins():\n", + " Image.new(\"RGB\", (10, 10), color=(255, 0, 0)).save(img_path)\n", + "\n", + " assert img_path.read_bytes()\n", + " print(\"With patches, Pillow successfully writes to a CloudPath\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Examples: glob with CloudPath\n", + "## Caveat: Some libraries still do not work\n", "\n", - "The snippet below demonstrates `glob.glob` and `glob.iglob` working with `CloudPath` as the pattern or `root_dir` when patched.\n" + 
"Even with patches, some libraries will not work. For example, writing directly to a `CloudPath` with `pandas` is not possible because `pandas` has a complex set of IO checks it does in its own codebase.\n", + "\n", + "For many of these libraries (including `pandas`) using `CloudPath.open` and then passing the buffer to the functions that can read and write to those buffers is usually the cleanest workaround.\n", + "\n", + "For example, here is the best way to write to a `CloudPath` with `pandas`:" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "*.txt: [S3Path('s3://cloudpathlib-test-bucket/patching_builtins/glob_demo/file1.txt')]\n", - "**/*.txt: [S3Path('s3://cloudpathlib-test-bucket/patching_builtins/glob_demo/file1.txt'), S3Path('s3://cloudpathlib-test-bucket/patching_builtins/glob_demo/sub/file3.txt')]\n", - "root_dir + pattern: [S3Path('s3://cloudpathlib-test-bucket/patching_builtins/glob_demo/file2.py')]\n", - "iglob first: s3://cloudpathlib-test-bucket/patching_builtins/glob_demo/file1.txt\n" + "Could not write with `to_csv` because error: Invalid file path or buffer object type: \n" ] } ], "source": [ - "import glob\n", - "from cloudpathlib import CloudPath, patch_glob\n", + "import pandas as pd\n", "\n", - "root = CloudPath(\"s3://cloudpathlib-test-bucket/patching_builtins/glob_demo/\")\n", - "\n", - "with patch_glob():\n", - " # setup demo files\n", - " (root / \"sub\").mkdir(exist_ok=True)\n", - " (root / \"file1.txt\").write_text(\"1\")\n", - " (root / \"file2.py\").write_text(\"2\")\n", - " (root / \"sub\" / \"file3.txt\").write_text(\"3\")\n", + "df = pd.DataFrame([[0, 1], [2, 3]], columns=[\"a\", \"b\"])\n", "\n", - " # Pattern as CloudPath\n", - " print(\"*.txt:\", glob.glob(root / \"*.txt\"))\n", + "cloud_path = base / \"data.csv\"\n", "\n", - " # Recursive patterns\n", - " print(\"**/*.txt:\", glob.glob(root / \"**/*.txt\"))\n", - 
"\n", - " # Using root_dir with string pattern\n", - " print(\"root_dir + pattern:\", glob.glob(\"*.py\", root_dir=root))\n", + "try:\n", + " df.to_csv(cloud_path)\n", + "except Exception as e:\n", + " print(\"Could not write with `to_csv` because error: \", e)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Successfully wrote to s3://cloudpathlib-test-bucket/patching_builtins/third_party/data.csv\n" + ] + } + ], + "source": [ + "# instead, use .open\n", + "with cloud_path.open(\"w\") as f:\n", + " df.to_csv(f)\n", "\n", - " # iglob iterator\n", - " it = glob.iglob(root / \"*.txt\")\n", - " print(\"iglob first:\", next(it))" + "assert cloud_path.exists()\n", + "print(\"Successfully wrote to \", cloud_path)" ] } ], "metadata": { "kernelspec": { - "display_name": "cloudpathlib", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -375,5 +456,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/docs/docs/script/patching_builtins.py b/docs/docs/script/patching_builtins.py new file mode 100644 index 00000000..c72686a8 --- /dev/null +++ b/docs/docs/script/patching_builtins.py @@ -0,0 +1,238 @@ +#!/usr/bin/env python +# coding: utf-8 + +# # Compatibility + +# ## Patching Python builtins (third-party library compatibility) +# +# Not every Python library in the broad universe of Python libraries is implemented to accept pathlib-compatible objects like those implemented by cloudpathlib. Many libraries will only accept strings as filepaths. These libraries internally use `open`, functions from `os` and `os.path`, or other core library modules like `glob` to navigate paths and manipulate them. +# +# This means that out-of-the-box you can't just pass a `CloudPath` object to any library. For those implemented with `pathlib`, this will work. 
For anything else the code will throw an exception at some point. +# +# The long-term solution is to ask developers to implement their library to support either (1) pathlib-compatible objects for files and directories, or (2) file-like objects passed directly (e.g., so you could call `CloudPath.open` in your code and pass the file-like object to the library). +# +# The near-term workaround that will be compatible with some libraries is to patch the builtins to make `open`, `os`, `os.path`, and `glob` work with `CloudPath` objects. Because this overrides default Python functionality, this is not on by default. When patched, these functions will use the `CloudPath` version if they are passed a `CloudPath` and will fall back to their normal implementations otherwise. +# +# There are three ways to enable these patches: environment variables, globally with a function call, or just in a specific context with a context manager. +# +# ## Differences in reading versus writing to `CloudPath` +# +# A major reason to patch these builtins is if you want to write to a `CloudPath` with a third party library. For scenarios where you are reading files, you may not need to do any patching. Many Python libraries support using [`__fspath__`](https://docs.python.org/3/library/os.html#os.PathLike.__fspath__) to get the location of a file on disk. +# +# We implement `CloudPath.__fspath__`, which will cache the file to the local disk and provide that file path as a string to any library that uses `fspath`. This works well for reading files, but not for writing them. Because there is no callback to our code once that filepath gets written to, we can't see changes and then push those changes from the cache back to the cloud (see related discussions in [#73](https://github.com/drivendataorg/cloudpathlib/issues/73), [#128](https://github.com/drivendataorg/cloudpathlib/issues/128), [#140](https://github.com/drivendataorg/cloudpathlib/pull/140)). 
In many scenarios our code will never get called again. +# +# For this reason, it is better to patch the built-in functions to handle `CloudPath` objects rather than rely on `__fspath__`, especially if you are writing to these files. +# +# +# ## Setting with environment variables +# +# These methods can be enabled by setting the following environment variables: +# - `CLOUDPATHLIB_PATCH_ALL=1` - patch all the builtins we implement: `open`, `os` functions, and `glob` +# - `CLOUDPATHLIB_PATCH_OPEN=1` - patch the builtin `open` method +# - `CLOUDPATHLIB_PATCH_OS_FUNCTIONS=1` - patch the `os` functions +# - `CLOUDPATHLIB_PATCH_GLOB=1` - patch the `glob` module +# +# You can set environment variables in many ways, but it is common to either pass it at the command line with something like `CLOUDPATHLIB_PATCH_ALL=1 python my_script.py` or to set it in your Python script with `os.environ['CLOUDPATHLIB_PATCH_ALL'] = '1'`. Note, these _must_ be set before any `cloudpathlib` methods are imported. +# +# ## Setting with patch methods globally +# +# Instead of setting environment variables, you can call methods to patch the functions. For example, you may call these at import time in your application or script. This will use the patched methods throughout your application. +# +# ```python +# from cloudpathlib import patch_all_builtins, patch_open, patch_os_functions, patch_glob +# +# # patch the builtins your code or a library that you call uses +# patch_open() +# patch_os_functions() +# patch_glob() +# +# # or, if you want all of these at once +# patch_all_builtins() +# ``` +# +# ## Setting with a context manager +# +# Finally, you can control the scope in which the patch is used with a context manager. For example, you may have just one call to an external library that is failing to accept `CloudPath`. You can limit the patch effect to that call by using a context manager, which will remove the patch at the end of the block. 
This is useful if you want to patch the functions for a specific block of code but not for the rest of the application. +# +# ```python +# from cloudpathlib import patch_all_builtins +# +# with patch_all_builtins(): +# with open(cloud_path) as f: +# data = f.read() +# ``` +# +# This is the narrowest, most targeted way to update the builtin Python methods that don't just work with `CloudPath` objects. +# +# Next, we'll walk through some examples of patching and using these methods. +# + +# We can see a similar result for patching the functions in the `os` module. + +# ## Patching `open` +# +# Sometimes code uses the Python built-in `open` to open files and operate on them. In those cases, passing a `CloudPath` will fail. You can patch the built-in `open` so that when a `CloudPath` is provided it uses `CloudPath.open`, otherwise defers to the original behavior. +# +# Here's an example that would not work unless you patch the built-ins (for example, if you depend on a third-party library that calls `open`). +# +# It will fail with an `OverwriteNewerLocalError` because `read_text` tries to download from the cloud to a cache path that has been updated locally (but, crucially, not rewritten back to the cloud). +# + +# Imagine that deep in a third-party library a function is implemented like this +def library_function(filepath: str): + with open(filepath, "w") as f: + f.write("hello!") + + +from cloudpathlib import CloudPath + +# create file to read +cp = CloudPath("s3://cloudpathlib-test-bucket/patching_builtins/new_file.txt") + +try: + library_function(cp) + + # read the text that was written + assert cp.read_text() == "hello!" +except Exception as e: + print(type(e)) + print(e) + + +# ### Patching `open` in Jupyter notebooks +# +# Since this documentation runs as a Jupyter notebook, there is an extra step to patch `open`. Jupyter notebooks inject their own `open` into the user namespace. 
After enabling the patch, ensure the notebook's `open` refers to the patched built-in: +# +# ```python +# from cloudpathlib import patch_open +# +# open = patch_open().patched # rebind notebook's open to the patched version +# ``` + +from cloudpathlib import CloudPath, patch_open + +# enable patch and rebind notebook's open +open = patch_open().patched + +# create file to read +cp = CloudPath("s3://cloudpathlib-test-bucket/patching_builtins/file.txt") + +library_function(cp) +assert cp.read_text() == "hello!" +print("Succeeded!") + + +# ## Examples: os.path functions with CloudPath +# +# The snippet below demonstrates common `os.path` functions when patched to accept `CloudPath` values. These calls work for `CloudPath` and still behave normally for string paths. +# + +import os + +from cloudpathlib import patch_os_functions, CloudPath + +cp = CloudPath("s3://cloudpathlib-test-bucket/patching_builtins/file.txt") +folder = cp.parent + +try: + print(os.path.isdir(folder)) +except Exception as e: + print("Unpatched version fails:") + print(e) + + +with patch_os_functions(): + result = os.path.isdir(folder) + print("Patched version of `os.path.isdir` returns: ", result) + + print("basename:", os.path.basename(cp)) + + print("dirname:", os.path.dirname(cp)) + + joined = os.path.join(folder, "dir", "sub", "name.txt") + print("join:", joined) + + +# ## Examples: glob with CloudPath +# +# The snippet below demonstrates `glob.glob` and `glob.iglob` working with `CloudPath` as the pattern or `root_dir` when patched. 
+# + +from glob import glob + +from cloudpathlib import patch_glob, CloudPath + +try: + glob(CloudPath("s3://cloudpathlib-test-bucket/manual-tests/**/*dir*/**")) +except Exception as e: + print("Unpatched version fails:") + print(e) + + +with patch_glob(): + print("Patched succeeds:") + print(glob(CloudPath("s3://cloudpathlib-test-bucket/manual-tests/**/*dir*/**/*"))) + + # or equivalently + print(glob("**/*dir*/**/*", root_dir=CloudPath("s3://cloudpathlib-test-bucket/manual-tests/"))) + + +# # Examples with third party libraries +# +# Here we show that third party libraries, like Pillow, that don't work as expected without patching the built-ins. +# +# However, if we patch built-ins, we can see the functions work as expected. + +# ## Pillow example + +from cloudpathlib import CloudPath, patch_all_builtins +from PIL import Image + + +base = CloudPath("s3://cloudpathlib-test-bucket/patching_builtins/third_party/") + +img_path = base / "pillow_demo.png" + +# Unpatched: using CloudPath directly fails +try: + Image.new("RGB", (10, 10), color=(255, 0, 0)).save(img_path) +except Exception as e: + print("Pillow without patch: FAILED:", e) + + +# Patched: success with patching builtins +with patch_all_builtins(): + Image.new("RGB", (10, 10), color=(255, 0, 0)).save(img_path) + + assert img_path.read_bytes() + print("With patches, Pillow successfully writes to a CloudPath") + + +# ## Caveat: Some libraries still do not work +# +# Even with patches, some libraries will not work. For example, writing directly to a `CloudPath` with `pandas` is not possible because `pandas` has a complex set of IO checks it does in its own codebase. +# +# For many of these libraries (including `pandas`) using `CloudPath.open` and then passing the buffer to the functions that can read and write to those buffers is usually the cleanest workaround. 
+# +# For example, here is the best way to write to a `CloudPath` with `pandas`: + +import pandas as pd + +df = pd.DataFrame([[0, 1], [2, 3]], columns=['a', 'b']) + +cloud_path = base / "data.csv" + +try: + df.to_csv(cloud_path) +except Exception as e: + print("Could not write with `to_csv` because error: ", e) + + +# instead, use .open +with cloud_path.open("w") as f: + df.to_csv(f) + +assert cloud_path.exists() +print("Successfully wrote to ", cloud_path) + + diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 29743fb4..cd917ce3 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -18,9 +18,10 @@ nav: - Home: "index.md" - Why cloudpathlib?: "why_cloudpathlib.ipynb" - Authentication: "authentication.md" + - AnyPath: "anypath-polymorphism.md" - HTTP URLs: "http.md" - Caching: "caching.ipynb" - - AnyPath: "anypath-polymorphism.md" + - Compatibility: "patching_builtins.ipynb" - Other Client settings: "other_client_settings.md" - Testing code that uses cloudpathlib: "testing_mocked_cloudpathlib.ipynb" - Integrations: "integrations.md" diff --git a/test-open.py b/test-open.py deleted file mode 100644 index 47fa12a0..00000000 --- a/test-open.py +++ /dev/null @@ -1,25 +0,0 @@ -import os -from cloudpathlib import CloudPath, patch_open, patch_os_functions - - -def hello(cp): - with open(cp, "a") as f: - f.write(" written") - - -if __name__ == "__main__": - patch_open() - - cp = CloudPath("s3://cloudpathlib-test-bucket/manual/text_file.txt") - cp.write_text("yah") - - hello(cp) - - print(cp.read_text()) - cp.unlink() - - patch_os_functions() - - print(list(os.walk("."))) - print(list(cp.parent.client._list_dir(cp.parent, recursive=True))) - print(list(os.walk(cp.parent))) diff --git a/tests/test_patching.py b/tests/test_patching.py index e40bdad6..f5920ea6 100644 --- a/tests/test_patching.py +++ b/tests/test_patching.py @@ -7,7 +7,7 @@ import pytest -from cloudpathlib import patch_open, patch_os_functions, patch_glob +from cloudpathlib import patch_open, 
patch_os_functions, patch_glob, patch_all_builtins import cloudpathlib from cloudpathlib.cloudpath import CloudPath @@ -626,3 +626,36 @@ def test_patch_glob_edge_cases(rig): result = glob.glob(test_dir / "[0-9]*.txt") assert len(result) == 1 assert "123file.txt" in str(result[0]) + + +def test_patch_all_builtins_simple(rig): + cp = rig.create_cloud_path("dir_0/new_file_patch_all.txt") + test_dir = rig.create_cloud_path("test_patch_all_dir/") + + # Without patch, opening a CloudPath should fail + with pytest.raises(FileNotFoundError): + with open(cp, "w") as f: + f.write("Hello!") + + # With all builtins patched, open, os.path, and glob should work + with patch_all_builtins(): + # Test open patching + with open(cp, "w") as f: + f.write("Hello!") + assert cp.read_text() == "Hello!" + + # Test os.path patching + assert os.path.exists(cp) + assert os.path.isfile(cp) + assert os.path.basename(cp) == "new_file_patch_all.txt" + + # Test glob patching + test_dir.mkdir(exist_ok=True) + glob_file1 = test_dir / "glob1.txt" + glob_file2 = test_dir / "glob2.txt" + glob_file1.write_text("content1") + glob_file2.write_text("content2") + + result = glob.glob(test_dir / "*.txt") + assert len(result) == 2 + assert all(isinstance(p, type(test_dir)) for p in result) From ec9c8777197df6e2714e96404daac6bd5744804b Mon Sep 17 00:00:00 2001 From: Peter Bull Date: Sat, 16 Aug 2025 17:46:41 -0700 Subject: [PATCH 08/11] copilot review --- cloudpathlib/patches.py | 2 +- docs/docs/patching_builtins.ipynb | 14 +++++++------- docs/docs/script/patching_builtins.py | 17 ++++++++--------- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/cloudpathlib/patches.py b/cloudpathlib/patches.py index 15e15102..4c7ce658 100644 --- a/cloudpathlib/patches.py +++ b/cloudpathlib/patches.py @@ -316,7 +316,7 @@ def _cloudpath_glob_iglob( elif isinstance(pathname, CloudPath): if root_dir is not None: - InvalidGlobArgumentsError( + raise InvalidGlobArgumentsError( "If pathname is a CloudPath, root_dir 
must also be a CloudPath or None." ) diff --git a/docs/docs/patching_builtins.ipynb b/docs/docs/patching_builtins.ipynb index c8de55c4..10b28793 100644 --- a/docs/docs/patching_builtins.ipynb +++ b/docs/docs/patching_builtins.ipynb @@ -35,12 +35,12 @@ "## Setting with environment variables\n", "\n", "These methods can be enabled by setting the following environment variables:\n", - " - `CLOUDPATHLIB_PACTH_ALL=1` - patch all the builtins we implement: `open`, `os` functions, and `glob`\n", - " - `CLOUDPATHLIB_PACTH_OPEN=1` - patch the builtin `open` method\n", - " - `CLOUDPATHLIB_PACTH_OS_FUNCTIONS=1` - patch the `os` functions\n", - " - `CLOUDPATHLIB_PACTH_GLOB=1` - patch the `glob` module\n", + " - `CLOUDPATHLIB_PATCH_ALL=1` - patch all the builtins we implement: `open`, `os` functions, and `glob`\n", + " - `CLOUDPATHLIB_PATCH_OPEN=1` - patch the builtin `open` method\n", + " - `CLOUDPATHLIB_PATCH_OS_FUNCTIONS=1` - patch the `os` functions\n", + " - `CLOUDPATHLIB_PATCH_GLOB=1` - patch the `glob` module\n", "\n", - "You can set environment variables in many ways, but it is common to either pass it at the command line with something like `CLOUDPATHLIB_PACTH_ALL=1 python my_script.py` or to set it in your Python script with `os.environ['CLOUDPATHLIB_PACTH_ALL'] = 1`. Note, these _must_ be set before any `cloudpathlib` methods are imported.\n", + "You can set environment variables in many ways, but it is common to either pass it at the command line with something like `CLOUDPATHLIB_PATCH_ALL=1 python my_script.py` or to set it in your Python script with `os.environ['CLOUDPATHLIB_PATCH_ALL'] = '1'`. Note, these _must_ be set before any `cloudpathlib` methods are imported.\n", "\n", "## Setting with patch methods globally\n", "\n", @@ -60,7 +60,7 @@ "\n", "## Setting with a context manager\n", "\n", - "Finally, you can control the scope which the patach is used with a context manager. 
For example, you may have just one call to an external library that is failing to accept `CloudPath`. You can limit the patch effect to that call by using a context manager, which will remove the patch at the end of the block. This is useful if you want to patch the functions for a specific block of code but not for the rest of the application.\n", + "Finally, you can control the scope in which the patch is used with a context manager. For example, you may have just one call to an external library that is failing to accept `CloudPath`. You can limit the patch effect to that call by using a context manager, which will remove the patch at the end of the block. This is useful if you want to patch the functions for a specific block of code but not for the rest of the application.\n", "\n", "```python\n", "from cloudpathlib import patch_all_builtins\n", @@ -92,7 +92,7 @@ "\n", "Here's an example that would not work unless you patch the built-ins (for example, if you depend on a third-party library that calls `open`).\n", "\n", - "It will fail with an `OverwriteNewerLocalError` becasuse `read_text` tries to download from the cloud to a cache path that has been updated locally (but, crucially, not rewritten back to the cloud).\n" + "It will fail with an `OverwriteNewerLocalError` because `read_text` tries to download from the cloud to a cache path that has been updated locally (but, crucially, not rewritten back to the cloud).\n" ] }, { diff --git a/docs/docs/script/patching_builtins.py b/docs/docs/script/patching_builtins.py index c72686a8..51102c45 100644 --- a/docs/docs/script/patching_builtins.py +++ b/docs/docs/script/patching_builtins.py @@ -27,12 +27,12 @@ # ## Setting with environment variables # # These methods can be enabled by setting the following environment variables: -# - `CLOUDPATHLIB_PACTH_ALL=1` - patch all the builtins we implement: `open`, `os` functions, and `glob` -# - `CLOUDPATHLIB_PACTH_OPEN=1` - patch the builtin `open` method -# - 
`CLOUDPATHLIB_PACTH_OS_FUNCTIONS=1` - patch the `os` functions -# - `CLOUDPATHLIB_PACTH_GLOB=1` - patch the `glob` module +# - `CLOUDPATHLIB_PATCH_ALL=1` - patch all the builtins we implement: `open`, `os` functions, and `glob` +# - `CLOUDPATHLIB_PATCH_OPEN=1` - patch the builtin `open` method +# - `CLOUDPATHLIB_PATCH_OS_FUNCTIONS=1` - patch the `os` functions +# - `CLOUDPATHLIB_PATCH_GLOB=1` - patch the `glob` module # -# You can set environment variables in many ways, but it is common to either pass it at the command line with something like `CLOUDPATHLIB_PACTH_ALL=1 python my_script.py` or to set it in your Python script with `os.environ['CLOUDPATHLIB_PACTH_ALL'] = 1`. Note, these _must_ be set before any `cloudpathlib` methods are imported. +# You can set environment variables in many ways, but it is common to either pass it at the command line with something like `CLOUDPATHLIB_PATCH_ALL=1 python my_script.py` or to set it in your Python script with `os.environ['CLOUDPATHLIB_PATCH_ALL'] = '1'`. Note, these _must_ be set before any `cloudpathlib` methods are imported. # # ## Setting with patch methods globally # @@ -52,7 +52,7 @@ # # ## Setting with a context manager # -# Finally, you can control the scope which the patach is used with a context manager. For example, you may have just one call to an external library that is failing to accept `CloudPath`. You can limit the patch effect to that call by using a context manager, which will remove the patch at the end of the block. This is useful if you want to patch the functions for a specific block of code but not for the rest of the application. +# Finally, you can control the scope in which the patch is used with a context manager. For example, you may have just one call to an external library that is failing to accept `CloudPath`. You can limit the patch effect to that call by using a context manager, which will remove the patch at the end of the block. 
This is useful if you want to patch the functions for a specific block of code but not for the rest of the application. # # ```python # from cloudpathlib import patch_all_builtins @@ -75,7 +75,7 @@ # # Here's an example that would not work unless you patch the built-ins (for example, if you depend on a third-party library that calls `open`). # -# It will fail with an `OverwriteNewerLocalError` becasuse `read_text` tries to download from the cloud to a cache path that has been updated locally (but, crucially, not rewritten back to the cloud). +# It will fail with an `OverwriteNewerLocalError` because `read_text` tries to download from the cloud to a cache path that has been updated locally (but, crucially, not rewritten back to the cloud). # # Imagine that deep in a third-party library a function is implemented like this @@ -218,7 +218,7 @@ def library_function(filepath: str): import pandas as pd -df = pd.DataFrame([[0, 1], [2, 3]], columns=['a', 'b']) +df = pd.DataFrame([[0, 1], [2, 3]], columns=["a", "b"]) cloud_path = base / "data.csv" @@ -235,4 +235,3 @@ def library_function(filepath: str): assert cloud_path.exists() print("Successfully wrote to ", cloud_path) - From 8618bd039ecc96d0a16474815a6c1e0898d6a56b Mon Sep 17 00:00:00 2001 From: Peter Bull Date: Sat, 16 Aug 2025 17:48:19 -0700 Subject: [PATCH 09/11] update history --- HISTORY.md | 1 + 1 file changed, 1 insertion(+) diff --git a/HISTORY.md b/HISTORY.md index 0e4f5cc3..6f69fbd6 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,6 +4,7 @@ - Fixed issue with GS credentials, using default auth enables a wider set of authentication methods in GS (Issue [#390](https://github.com/drivendataorg/cloudpathlib/issues/390), PR [#514](https://github.com/drivendataorg/cloudpathlib/pull/514), thanks @ljyanesm) - Added support for http(s) urls with `HttpClient`, `HttpPath`, `HttpsClient`, and `HttpsPath`. 
(Issue [#455](https://github.com/drivendataorg/cloudpathlib/issues/455), PR [#468](https://github.com/drivendataorg/cloudpathlib/pull/468)) +- Added support for patching the builtins `open`, `os`, `os.path`, and `glob` to work with `CloudPath` objects. (Issue [#128](https://github.com/drivendataorg/cloudpathlib/issues/128), PR [#322](https://github.com/drivendataorg/cloudpathlib/pull/322)) ## v0.21.1 (2025-05-14) From d9d3d930e870e069b0f5f6a22fd29058b3e4295a Mon Sep 17 00:00:00 2001 From: Peter Bull Date: Fri, 29 Aug 2025 17:30:12 -0700 Subject: [PATCH 10/11] code review changes --- HISTORY.md | 5 +++-- Makefile | 2 +- cloudpathlib/patches.py | 24 +++++------------------- docs/docs/patching_builtins.ipynb | 12 +++++++++++- tests/test_patching.py | 15 ++++++--------- 5 files changed, 26 insertions(+), 32 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 6f69fbd6..170fd03b 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,10 +1,11 @@ # cloudpathlib Changelog -## UNRELEASED +## v0.22.0 (2025-08-29) - Fixed issue with GS credentials, using default auth enables a wider set of authentication methods in GS (Issue [#390](https://github.com/drivendataorg/cloudpathlib/issues/390), PR [#514](https://github.com/drivendataorg/cloudpathlib/pull/514), thanks @ljyanesm) - Added support for http(s) urls with `HttpClient`, `HttpPath`, `HttpsClient`, and `HttpsPath`. (Issue [#455](https://github.com/drivendataorg/cloudpathlib/issues/455), PR [#468](https://github.com/drivendataorg/cloudpathlib/pull/468)) -- Added support for patching the builtins `open`, `os`, `os.path`, and `glob` to work with `CloudPath` objects. (Issue [#128](https://github.com/drivendataorg/cloudpathlib/issues/128), PR [#322](https://github.com/drivendataorg/cloudpathlib/pull/322)) +- Added experimental support for patching the builtins `open`, `os`, `os.path`, and `glob` to work with `CloudPath` objects. It is off by default; see the new "Compatibility" section in the docs for more information. 
(Issue [#128](https://github.com/drivendataorg/cloudpathlib/issues/128), PR [#322](https://github.com/drivendataorg/cloudpathlib/pull/322)) +- Added support for `CloudPath(*parts)` to create a `CloudPath` object from a list of parts (to match `pathlib.Path`). **This is a potentially breaking change for users that relied on the second arg being the `client` instead of making it an explicit kwarg.** (PR [#322](https://github.com/drivendataorg/cloudpathlib/pull/322)) ## v0.21.1 (2025-05-14) diff --git a/Makefile b/Makefile index e429cbbd..9464111b 100644 --- a/Makefile +++ b/Makefile @@ -81,7 +81,7 @@ test: ## run tests with mocked cloud SDKs python -m pytest -vv test-debug: ## rerun tests that failed in last run and stop with pdb at failures - python -m pytest -n=0 -vv --lf --pdb --capture=no + python -m pytest -n=0 -vv --lf --pdb test-live-cloud: ## run tests on live cloud backends USE_LIVE_CLOUD=1 python -m pytest -vv diff --git a/cloudpathlib/patches.py b/cloudpathlib/patches.py index 4c7ce658..dafce869 100644 --- a/cloudpathlib/patches.py +++ b/cloudpathlib/patches.py @@ -120,25 +120,11 @@ def _cloudpath_os_unlink(path, *, dir_fd=None): def _cloudpath_os_walk(top, topdown=True, onerror=None, followlinks=False): - try: - dirs, files = [], [] - for p in top.iterdir(): - dirs.append(p) if p.is_dir() else files.append(p) - - if topdown: - yield (top, files, dirs) - - for d in dirs: - yield from _cloudpath_os_walk(d, topdown=topdown, onerror=onerror) - - if not topdown: - yield (top, files, dirs) - - except Exception as e: - if onerror is not None: - onerror(e) - else: - raise + # pathlib.Path.walk returns dirs and files as string, not Path objects + # we follow the same convention, but since these could get used downstream, + # this method may need to be changed to return absolute CloudPath objects + # if it becomes a compatibility problem with major downstream libraries + yield from top.walk(top_down=topdown, on_error=onerror, follow_symlinks=followlinks) def 
_cloudpath_os_path_basename(path): diff --git a/docs/docs/patching_builtins.ipynb b/docs/docs/patching_builtins.ipynb index 10b28793..ec637301 100644 --- a/docs/docs/patching_builtins.ipynb +++ b/docs/docs/patching_builtins.ipynb @@ -7,6 +7,16 @@ "# Compatibility" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "

Experimental

\n", + "

Patching open, os, os.path, and glob to work with CloudPath objects is experimental. It is off by default, and it may change or be removed in the future.

\n", + "
" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -438,7 +448,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "cloudpathlib", "language": "python", "name": "python3" }, diff --git a/tests/test_patching.py b/tests/test_patching.py index f5920ea6..10940fe1 100644 --- a/tests/test_patching.py +++ b/tests/test_patching.py @@ -145,8 +145,12 @@ def test_patch_os_functions(rig): assert len(result) > 0 for root, dirs, files in result: assert isinstance(root, CloudPath) - assert all(isinstance(d, CloudPath) for d in dirs) - assert all(isinstance(f, CloudPath) for f in files) + assert all( + isinstance(d, str) for d in dirs + ) # pathlib.Path.walk returns dirs as string, not Path + assert all( + isinstance(f, str) for f in files + ) # pathlib.Path.walk returns filenames as string, not Path # Test os.path.basename result = os.path.basename(test_file) @@ -296,13 +300,6 @@ def test_patch_os_functions_mixed_usage(rig): os.unlink(local_path) -def test_patch_glob_basic(rig): - """Test that glob functions are callable when patched.""" - with patch_glob(): - assert callable(glob.glob) - assert callable(glob.iglob) - - def test_patch_glob_with_strings(rig): """Test glob with regular string patterns.""" with patch_glob(): From bb1a701dce60a6befb90ad4d7eefd1d62961dc11 Mon Sep 17 00:00:00 2001 From: Peter Bull Date: Fri, 29 Aug 2025 17:38:53 -0700 Subject: [PATCH 11/11] pin pytest-rerunfailures --- requirements-dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 1b6cfc8f..a21e4bd6 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -25,7 +25,7 @@ pytest-cases>=3.9.1 pytest-cov pytest-duration-insights pytest-reportlog -pytest-rerunfailures +pytest-rerunfailures<16.0 pytest-xdist python-dotenv pywin32; sys_platform == 'win32'