From 01f365fe6c3a2f60c8b366e41cf26d74e74973f5 Mon Sep 17 00:00:00 2001
From: Luis Yanes <8437968+ljyanesm@users.noreply.github.com>
Date: Sat, 2 Aug 2025 20:58:17 +0100
Subject: [PATCH 1/4] Refactor GS authentication to use default credentials (#514)

* Refactor authentication to use default credentials for Google Cloud Storage client

This enables federated identity

Updates HISTORY.md

* Keeps same functionality for API whilst enhancing the env var alternative

* Simplifies credential handling logic, and updates docstring

* Updates HISTORY.md

---------

Co-authored-by: ljyanesm
---
 HISTORY.md                  |  1 +
 cloudpathlib/gs/gsclient.py | 29 ++++++++++++++++-------------
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 6aeb9e5a..988c6662 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -2,6 +2,7 @@
 
 ## v0.21.1 (2025-05-14)
 
+- Fixed issue with GS credentials, using default auth enables a wider set of authentication methods in GS (Issue [#390](https://github.com/drivendataorg/cloudpathlib/issues/390), PR [#514](https://github.com/drivendataorg/cloudpathlib/pull/514), thanks @ljyanesm)
 - Fixed `rmtree` fail on Azure with no `hns` and more than 256 blobs to drop (Issue [#509](https://github.com/drivendataorg/cloudpathlib/issues/509), PR [#508](https://github.com/drivendataorg/cloudpathlib/pull/508), thanks @alikefia)
 - Added support for http(s) urls with `HttpClient`, `HttpPath`, `HttpsClient`, and `HttpsPath`. (Issue [#455](https://github.com/drivendataorg/cloudpathlib/issues/455 ), PR [#468](https://github.com/drivendataorg/cloudpathlib/pull/468))
 
diff --git a/cloudpathlib/gs/gsclient.py b/cloudpathlib/gs/gsclient.py
index 2816d88d..96705ece 100644
--- a/cloudpathlib/gs/gsclient.py
+++ b/cloudpathlib/gs/gsclient.py
@@ -15,6 +15,7 @@
     from google.auth.credentials import Credentials
     from google.api_core.retry import Retry
 
+    from google.auth import default as google_default_auth
     from google.auth.exceptions import DefaultCredentialsError
 
     from google.cloud.storage import Client as StorageClient
@@ -51,18 +52,14 @@ def __init__(
     ):
         """Class constructor. Sets up a [`Storage
         Client`](https://googleapis.dev/python/storage/latest/client.html).
 
-        Supports the following authentication methods of `Storage Client`.
+        Supports, in this order, the following authentication methods of `Storage Client`.
 
-        - Environment variable `"GOOGLE_APPLICATION_CREDENTIALS"` containing a
-          path to a JSON credentials file for a Google service account. See
-          [Authenticating as a Service
-          Account](https://cloud.google.com/docs/authentication/production).
-        - File path to a JSON credentials file for a Google service account.
-        - OAuth2 Credentials object and a project name.
         - Instantiated and already authenticated `Storage Client`.
+        - OAuth2 Credentials object and a project name.
+        - File path to a JSON credentials file for a Google service account.
+        - Google Cloud SDK default credentials. See [How Application Default Credentials works](https://cloud.google.com/docs/authentication/application-default-credentials)
 
-        If multiple methods are used, priority order is reverse of list above
-        (later in list takes priority). If no authentication methods are used,
+        If no authentication methods are used,
         then the client will be instantiated as anonymous, which will only have
         access to public buckets.
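To make the documented precedence concrete before the implementation hunk below, here is a hedged sketch of what each option looks like from the caller's side (the bucket, file, and credential names are made up; the keyword arguments are the ones documented above):

```python
from cloudpathlib import GSClient

# 1. Highest priority: an already-instantiated, authenticated Storage Client.
# client = GSClient(storage_client=my_storage_client)

# 2. Next: an OAuth2 Credentials object plus a project name.
# client = GSClient(credentials=my_credentials, project="my-project")

# 3. Next: a path to a service account JSON credentials file.
# client = GSClient(application_credentials="path/to/service-account.json")

# 4. With no arguments: application default credentials (environment variable,
#    `gcloud auth application-default login`, attached service account, ...),
#    falling back to an anonymous client if none are found.
client = GSClient()

cp = client.CloudPath("gs://my-bucket/my-file.txt")
```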
@@ -91,18 +88,24 @@ def __init__(
             timeout (Optional[float]): Cloud Storage [timeout value](https://cloud.google.com/python/docs/reference/storage/1.39.0/retry_timeout)
             retry (Optional[google.api_core.retry.Retry]): Cloud Storage [retry configuration](https://cloud.google.com/python/docs/reference/storage/1.39.0/retry_timeout#configuring-retries)
         """
-        if application_credentials is None:
-            application_credentials = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
-
+        # don't check `GOOGLE_APPLICATION_CREDENTIALS` since `google_default_auth` already does that
+        # use explicit client
         if storage_client is not None:
             self.client = storage_client
+        # use explicit credentials
         elif credentials is not None:
             self.client = StorageClient(credentials=credentials, project=project)
+        # use explicit credential file
         elif application_credentials is not None:
             self.client = StorageClient.from_service_account_json(application_credentials)
+        # use default credentials based on SDK precedence
         else:
             try:
-                self.client = StorageClient()
+                # use `google_default_auth` instead of `StorageClient()` since it
+                # handles precedence of creds in different locations properly
+                credentials, default_project = google_default_auth()
+                project = project or default_project  # use explicit project if present
+                self.client = StorageClient(credentials=credentials, project=project)
             except DefaultCredentialsError:
                 self.client = StorageClient.create_anonymous_client()

From 562b3c70b1a08f0270e5aa13785a2e8de010573d Mon Sep 17 00:00:00 2001
From: Peter Bull
Date: Sun, 3 Aug 2025 21:32:54 -0700
Subject: [PATCH 2/4] bonus: test fixes and docs

---
 CONTRIBUTING.md                      | 143 +++++++++++++++++++++++++++
 tests/conftest.py                    |  84 +++++++++++-----
 tests/mock_clients/mock_adls_gen2.py |   6 +-
 3 files changed, 204 insertions(+), 29 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index b37da474..d15f0794 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -110,6 +110,149 @@ When the tests finish, if it is using a live server, the test files will be dele

If you want to speed up your testing during development, you may comment out some of the rigs in [`conftest.py`](tests/conftest.py). Don't commit this change, and make sure you run against all the rigs before submitting a PR.

### Test Fixtures and Rigs

The test suite uses a comprehensive set of fixtures and test rigs to ensure consistent behavior across all cloud providers. Here's a detailed overview of the available fixtures and their properties:

#### Core Fixtures

**`assets_dir`** - Path to the test assets directory containing sample files and directories used across all tests.

**`live_server`** - Boolean indicating whether to use live cloud servers (controlled by the `USE_LIVE_CLOUD=1` environment variable).

**`wait_for_mkdir`** - Fixture that patches `os.mkdir` to wait for directory creation, useful for tests that are sometimes flaky due to filesystem timing.

#### Cloud Provider Test Rigs

The `CloudProviderTestRig` class is the foundation for all cloud provider testing. Each rig provides:

- **`path_class`**: The CloudPath subclass for the provider (e.g., `S3Path`, `AzureBlobPath`)
- **`client_class`**: The Client subclass for the provider (e.g., `S3Client`, `AzureBlobClient`)
- **`drive`**: The bucket/container name for the provider
- **`test_dir`**: Unique test directory name generated from the session UUID, module name, and function name
- **`live_server`**: Whether the rig uses live cloud servers
- **`required_client_kwargs`**: Additional client configuration parameters
- **`cloud_prefix`**: The cloud prefix for the provider (e.g., `s3://`, `az://`)

Each rig provides a `create_cloud_path(path, client=None)` method that constructs cloud paths with the proper prefix and test directory structure.

#### Available Test Rigs

**Azure Rigs:**
- **`azure_rig`**: Tests Azure Blob Storage with mocked or live backend
- **`azure_gen2_rig`**: Tests Azure Data Lake Storage Gen2 with mocked or live backend
  - Has an `is_adls_gen2 = True` flag for tests that need to skip ADLS Gen2-specific behavior
  - Uses the `AZURE_STORAGE_CONNECTION_STRING` or `AZURE_STORAGE_GEN2_CONNECTION_STRING` environment variables

**Google Cloud Storage:**
- **`gs_rig`**: Tests Google Cloud Storage with mocked or live backend
  - Uses the `LIVE_GS_BUCKET` environment variable for live testing

**Amazon S3:**
- **`s3_rig`**: Tests AWS S3 with mocked or live backend
  - Uses the `LIVE_S3_BUCKET` environment variable for live testing
- **`custom_s3_rig`**: Tests S3-compatible services (MinIO, Ceph, etc.)
  - Has an `is_custom_s3 = True` flag for tests that need to skip AWS-specific behavior
  - Uses the `CUSTOM_S3_BUCKET`, `CUSTOM_S3_ENDPOINT`, `CUSTOM_S3_KEY_ID`, and `CUSTOM_S3_SECRET_KEY` environment variables

**Local Storage Rigs:**
- **`local_azure_rig`**: Tests Azure Blob Storage using local filesystem simulation
- **`local_gs_rig`**: Tests Google Cloud Storage using local filesystem simulation
- **`local_s3_rig`**: Tests S3 using local filesystem simulation

**HTTP/HTTPS Rigs:**
- **`http_rig`**: Tests HTTP endpoints with a local test server
- **`https_rig`**: Tests HTTPS endpoints with a local test server and self-signed certificates
  - Both use the `HttpProviderTestRig` subclass, which adds HTTP-specific functionality

#### Fixture Unions

The test suite uses `pytest-cases` fixture unions to run tests against multiple providers:

- **`rig`**: Runs tests against all cloud providers (Azure Blob, Azure ADLS Gen2, GCS, S3, Custom S3, Local Azure, Local S3, Local GCS, HTTP, HTTPS)
- **`azure_rigs`**: Runs tests against both Azure Blob and Azure ADLS Gen2
- **`s3_like_rig`**: Runs tests against AWS S3 and Custom S3 (for S3-compatible services)
- **`http_like_rig`**: Runs tests against HTTP and HTTPS endpoints

#### HTTP Server Fixtures

**`http_server`** and **`https_server`** (from `tests/http_fixtures.py`):
- Start local HTTP/HTTPS test servers with custom request handlers
- Support PUT, DELETE, POST, GET, and HEAD methods for comprehensive testing
- Use self-signed certificates for HTTPS testing
- Automatically clean up server directories after tests

#### Mock Clients

Located in `tests/mock_clients/`, these provide local filesystem-based implementations of the cloud SDKs:

- **`mock_azureblob.py`**: Mock Azure Blob Storage client
- **`mock_adls_gen2.py`**: Mock Azure Data Lake Storage Gen2 client
- **`mock_gs.py`**: Mock Google Cloud Storage client
- **`mock_s3.py`**: Mock AWS S3 client
- **`utils.py`**: Utility functions for mock clients (e.g., `delete_empty_parents_up_to_root`)
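The mocks all follow the same basic shape: stand in for the provider SDK's client while reading and writing under a local temporary directory. A minimal sketch of that pattern (the class and method names here are illustrative, not the actual mock API):

```python
from pathlib import Path


class MockedBucketClient:
    """Illustrative stand-in for a cloud SDK client, backed by a local dir."""

    def __init__(self, root: Path):
        # this directory plays the role of the bucket/container
        self.root = root

    def upload(self, key: str, data: bytes) -> None:
        dest = self.root / key
        dest.parent.mkdir(parents=True, exist_ok=True)
        dest.write_bytes(data)

    def download(self, key: str) -> bytes:
        return (self.root / key).read_bytes()

    def delete(self, key: str) -> None:
        (self.root / key).unlink()
```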

#### Test Assets

Located in `tests/assets/`, the test assets provide a consistent set of files and directories:

```
tests/assets/
├── dir_0/
│   ├── file0_0.txt
│   ├── file0_1.txt
│   └── file0_2.txt
└── dir_1/
    ├── file_1_0.txt
    └── dir_1_0/
        └── file_1_0_0.txt
```

These assets are automatically copied to each test rig's directory and provide a predictable file structure for testing file operations, directory traversal, and other functionality.

#### Utility Fixtures

**`utilities_dir`**: Path to the test utilities directory containing SSL certificates for HTTPS testing.

**`_sync_filesystem()`**: Utility function that forces filesystem synchronization to stabilize tests, especially important on Windows, where `os.sync()` is not available.

#### Environment Variables for Live Testing

When `USE_LIVE_CLOUD=1` is set, the following environment variables control live cloud testing:

- **Azure**: `AZURE_STORAGE_CONNECTION_STRING`, `AZURE_STORAGE_GEN2_CONNECTION_STRING`, `LIVE_AZURE_CONTAINER`
- **Google Cloud**: `LIVE_GS_BUCKET` (requires Google Cloud credentials)
- **AWS S3**: `LIVE_S3_BUCKET` (requires AWS credentials)
- **Custom S3**: `CUSTOM_S3_BUCKET`, `CUSTOM_S3_ENDPOINT`, `CUSTOM_S3_KEY_ID`, `CUSTOM_S3_SECRET_KEY`

#### Using Test Rigs in Your Tests

When writing tests, use the rig's `create_cloud_path()` method to create cloud paths:

```python
def test_file_operations(rig):
    # Create a path to an existing file in the test assets
    cp = rig.create_cloud_path("dir_0/file0_0.txt")

    # Create a path to a non-existent file
    cp2 = rig.create_cloud_path("path/that/does/not/exist.txt")

    # Get a client instance with the rig's required configuration
    client = rig.client_class(**rig.required_client_kwargs)
```

For provider-specific tests, you can check rig properties. Use the `azure_rigs` union so the test runs against both Azure backends:

```python
def test_azure_specific_feature(azure_rigs):
    if azure_rigs.is_adls_gen2:
        # Skip or test ADLS Gen2-specific behavior
        pass
    else:
        # Test Azure Blob Storage-specific behavior
        pass
```

### Authoring tests

We want our test suite coverage to be comprehensive, so PRs that touch functionality need tests: if you are adding a new feature, add tests for it; if you are changing an existing feature, update the tests to match the new behavior.
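Because the `rig` fixture union parametrizes over every provider, a single test body covers all of them. A minimal sketch of what a new test might look like (the file name and contents here are illustrative):

```python
def test_write_read_roundtrip(rig):
    # runs once per provider rig: S3, GCS, Azure, local, HTTP(S), ...
    cp = rig.create_cloud_path("dir_0/roundtrip.txt")
    cp.write_text("hello cloudpathlib")
    assert cp.read_text() == "hello cloudpathlib"
```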
diff --git a/tests/conftest.py b/tests/conftest.py
index 9fc6e5ea..919039f1 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -69,6 +69,12 @@ def assets_dir() -> Path:
     return Path(__file__).parent / "assets"
 
 
+@fixture()
+def live_server() -> bool:
+    """Whether to use a live server."""
+    return os.getenv("USE_LIVE_CLOUD") == "1"
+
+
 class CloudProviderTestRig:
     """Class that holds together the components needed to test a cloud implementation."""
 
@@ -146,11 +152,14 @@ def wrapped_mkdir(path, *args, **kwargs):
     monkeypatch.setattr(os, "mkdir", wrapped_mkdir)
 
 
-def _azure_fixture(conn_str_env_var, adls_gen2, request, monkeypatch, assets_dir):
-    drive = os.getenv("LIVE_AZURE_CONTAINER", DEFAULT_CONTAINER_NAME)
-    test_dir = create_test_dir_name(request)
+def _azure_fixture(conn_str_env_var, adls_gen2, request, monkeypatch, assets_dir, live_server):
+    drive = (
+        os.getenv("LIVE_AZURE_CONTAINER", DEFAULT_CONTAINER_NAME)
+        if live_server
+        else DEFAULT_CONTAINER_NAME
+    )
 
-    live_server = os.getenv("USE_LIVE_CLOUD") == "1"
+    test_dir = create_test_dir_name(request)
 
     connection_kwargs = dict()
     tmpdir = TemporaryDirectory()
@@ -227,26 +236,28 @@ def _azure_fixture(conn_str_env_var, adls_gen2, request, monkeypatch, assets_dir
 
 
 @fixture()
-def azure_rig(request, monkeypatch, assets_dir):
+def azure_rig(request, monkeypatch, assets_dir, live_server):
     yield from _azure_fixture(
-        "AZURE_STORAGE_CONNECTION_STRING", False, request, monkeypatch, assets_dir
+        "AZURE_STORAGE_CONNECTION_STRING", False, request, monkeypatch, assets_dir, live_server
     )
 
 
 @fixture()
-def azure_gen2_rig(request, monkeypatch, assets_dir):
+def azure_gen2_rig(request, monkeypatch, assets_dir, live_server):
     yield from _azure_fixture(
-        "AZURE_STORAGE_GEN2_CONNECTION_STRING", True, request, monkeypatch, assets_dir
+        "AZURE_STORAGE_GEN2_CONNECTION_STRING", True, request, monkeypatch, assets_dir, live_server
     )
 
 
 @fixture()
-def gs_rig(request, monkeypatch, assets_dir):
-    drive = os.getenv("LIVE_GS_BUCKET", DEFAULT_GS_BUCKET_NAME)
+def gs_rig(request, monkeypatch, assets_dir, live_server):
+    drive = (
+        os.getenv("LIVE_GS_BUCKET", DEFAULT_GS_BUCKET_NAME)
+        if live_server
+        else DEFAULT_GS_BUCKET_NAME
+    )
     test_dir = create_test_dir_name(request)
 
-    live_server = os.getenv("USE_LIVE_CLOUD") == "1"
-
     if live_server:
         # Set up test assets
         bucket = google_storage.Client().bucket(drive)
@@ -293,11 +304,14 @@
 
 
 @fixture()
-def s3_rig(request, monkeypatch, assets_dir):
-    drive = os.getenv("LIVE_S3_BUCKET", DEFAULT_S3_BUCKET_NAME)
-    test_dir = create_test_dir_name(request)
+def s3_rig(request, monkeypatch, assets_dir, live_server):
+    drive = (
+        os.getenv("LIVE_S3_BUCKET", DEFAULT_S3_BUCKET_NAME)
+        if live_server
+        else DEFAULT_S3_BUCKET_NAME
+    )
 
-    live_server = os.getenv("USE_LIVE_CLOUD") == "1"
+    test_dir = create_test_dir_name(request)
 
     if live_server:
         # Set up test assets
@@ -339,19 +353,22 @@
 
 
 @fixture()
-def custom_s3_rig(request, monkeypatch, assets_dir):
+def custom_s3_rig(request, monkeypatch, assets_dir, live_server):
     """
     Custom S3 rig used to test the integrations with non-AWS S3-compatible object storages like
 
     - MinIO (https://min.io/)
    - CEPH (https://ceph.io/ceph-storage/object-storage/)
    - others
    """
-    drive = os.getenv("CUSTOM_S3_BUCKET", DEFAULT_S3_BUCKET_NAME)
+    drive = (
+        os.getenv("CUSTOM_S3_BUCKET", DEFAULT_S3_BUCKET_NAME)
+        if live_server
+        else DEFAULT_S3_BUCKET_NAME
+    )
+
     test_dir = create_test_dir_name(request)
     custom_endpoint_url = os.getenv("CUSTOM_S3_ENDPOINT", "https://s3.us-west-1.drivendatabws.com")
 
-    live_server = os.getenv("USE_LIVE_CLOUD") == "1"
-
     if live_server:
         monkeypatch.setenv("AWS_ACCESS_KEY_ID", os.getenv("CUSTOM_S3_KEY_ID"))
         monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", os.getenv("CUSTOM_S3_SECRET_KEY"))
@@ -425,8 +442,13 @@ def _spin_up_bucket():
 
 
 @fixture()
-def local_azure_rig(request, monkeypatch, assets_dir):
-    drive = os.getenv("LIVE_AZURE_CONTAINER", DEFAULT_CONTAINER_NAME)
+def local_azure_rig(request, monkeypatch, assets_dir, live_server):
+    drive = (
+        os.getenv("LIVE_AZURE_CONTAINER", DEFAULT_CONTAINER_NAME)
+        if live_server
+        else DEFAULT_CONTAINER_NAME
+    )
+
     test_dir = create_test_dir_name(request)
 
     # copy test assets
@@ -451,8 +473,13 @@ def local_azure_rig(request, monkeypatch, assets_dir):
 
 
 @fixture()
-def local_gs_rig(request, monkeypatch, assets_dir):
-    drive = os.getenv("LIVE_GS_BUCKET", DEFAULT_GS_BUCKET_NAME)
+def local_gs_rig(request, monkeypatch, assets_dir, live_server):
+    drive = (
+        os.getenv("LIVE_GS_BUCKET", DEFAULT_GS_BUCKET_NAME)
+        if live_server
+        else DEFAULT_GS_BUCKET_NAME
+    )
+
     test_dir = create_test_dir_name(request)
 
     # copy test assets
@@ -476,8 +503,13 @@ def local_gs_rig(request, monkeypatch, assets_dir):
 
 
 @fixture()
-def local_s3_rig(request, monkeypatch, assets_dir):
-    drive = os.getenv("LIVE_S3_BUCKET", DEFAULT_S3_BUCKET_NAME)
+def local_s3_rig(request, monkeypatch, assets_dir, live_server):
+    drive = (
+        os.getenv("LIVE_S3_BUCKET", DEFAULT_S3_BUCKET_NAME)
+        if live_server
+        else DEFAULT_S3_BUCKET_NAME
+    )
+
     test_dir = create_test_dir_name(request)
 
     # copy test assets
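The `mock_adls_gen2.py` change below removes a hardcoded-container assumption from the rename helpers: `new_name` arrives as `<container>/<path>`, so splitting on the first slash strips whatever container is present rather than only the default one. A quick illustration of the difference, using a stand-in value for `DEFAULT_CONTAINER_NAME`:

```python
DEFAULT_CONTAINER_NAME = "container"  # stand-in for what the old code hardcoded

new_name = "other-container/dir/file.txt"

# old approach: only correct when the prefix really is the default container
assert new_name[len(DEFAULT_CONTAINER_NAME + "/"):] == "ainer/dir/file.txt"  # mangled!

# new approach: strips whatever container name is present
assert new_name.split("/", 1)[1] == "dir/file.txt"
```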
diff --git a/tests/mock_clients/mock_adls_gen2.py b/tests/mock_clients/mock_adls_gen2.py
index 8d17b662..aaee7cd1 100644
--- a/tests/mock_clients/mock_adls_gen2.py
+++ b/tests/mock_clients/mock_adls_gen2.py
@@ -4,7 +4,7 @@
 from azure.core.exceptions import ResourceNotFoundError
 from azure.storage.filedatalake import FileProperties
 
-from tests.mock_clients.mock_azureblob import _JsonCache, DEFAULT_CONTAINER_NAME
+from tests.mock_clients.mock_azureblob import _JsonCache
 
 
 class MockedDataLakeServiceClient:
@@ -87,7 +87,7 @@ def get_file_properties(self):
             raise ResourceNotFoundError
 
     def rename_file(self, new_name):
-        new_path = self.root / new_name[len(DEFAULT_CONTAINER_NAME + "/") :]
+        new_path = self.root / new_name.split("/", 1)[1]
         (self.root / self.key).rename(new_path)
 
 
@@ -106,5 +106,5 @@ def create_directory(self):
         (self.root / self.key).mkdir(parents=True, exist_ok=True)
 
     def rename_directory(self, new_name):
-        new_path = self.root / new_name[len(DEFAULT_CONTAINER_NAME + "/") :]
+        new_path = self.root / new_name.split("/", 1)[1]
         (self.root / self.key).rename(new_path)

From f97f82770e3a38a0e7a43215e53c972fa45fc81b Mon Sep 17 00:00:00 2001
From: Peter Bull
Date: Sun, 3 Aug 2025 21:49:21 -0700
Subject: [PATCH 3/4] Mock default auth

---
 tests/conftest.py             | 2 ++
 tests/mock_clients/mock_gs.py | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/tests/conftest.py b/tests/conftest.py
index 919039f1..b5dda1b1 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -46,6 +46,7 @@
     mocked_client_class_factory as mocked_gsclient_class_factory,
     DEFAULT_GS_BUCKET_NAME,
     MockTransferManager,
+    mock_default_auth,
 )
 from .mock_clients.mock_s3 import mocked_session_class_factory, DEFAULT_S3_BUCKET_NAME
 from .utils import _sync_filesystem
@@ -282,6 +283,7 @@ def gs_rig(request, monkeypatch, assets_dir, live_server):
             "transfer_manager",
             MockTransferManager,
         )
+        monkeypatch.setattr(cloudpathlib.gs.gsclient, "google_default_auth", mock_default_auth)
 
     rig = CloudProviderTestRig(
         path_class=GSPath,
diff --git a/tests/mock_clients/mock_gs.py b/tests/mock_clients/mock_gs.py
index 4ecdee1e..26487767 100644
--- a/tests/mock_clients/mock_gs.py
+++ b/tests/mock_clients/mock_gs.py
@@ -228,3 +228,7 @@ def download_chunks_concurrently(
     crc32c_checksum=True,
 ):
     blob.download_to_filename(filename)
+
+
+def mock_default_auth():
+    return "fake-credentials", "fake-default-project"
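One detail of the patch above worth calling out: the test patches `google_default_auth` on `cloudpathlib.gs.gsclient` — the module into which the name was imported — rather than patching `google.auth.default` itself, because `monkeypatch.setattr` must replace the reference that is actually looked up at call time. The same idea in miniature (the module and test names are hypothetical):

```python
# mymodule.py (hypothetical)
from google.auth import default as google_default_auth


def current_project():
    _credentials, project = google_default_auth()
    return project
```

```python
# test_mymodule.py (hypothetical)
import mymodule


def test_current_project(monkeypatch):
    # patch the reference that mymodule holds, not google.auth.default
    monkeypatch.setattr(mymodule, "google_default_auth", lambda: (None, "fake-project"))
    assert mymodule.current_project() == "fake-project"
```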
"google_default_auth", mock_default_auth) rig = CloudProviderTestRig( path_class=GSPath, diff --git a/tests/mock_clients/mock_gs.py b/tests/mock_clients/mock_gs.py index 4ecdee1e..26487767 100644 --- a/tests/mock_clients/mock_gs.py +++ b/tests/mock_clients/mock_gs.py @@ -228,3 +228,7 @@ def download_chunks_concurrently( crc32c_checksum=True, ): blob.download_to_filename(filename) + + +def mock_default_auth(): + return "fake-credentials", "fake-default-project" From 4f438df95c17d92619d82200dc2f83475dea7e2c Mon Sep 17 00:00:00 2001 From: Peter Bull Date: Sun, 3 Aug 2025 22:20:50 -0700 Subject: [PATCH 4/4] fix history --- HISTORY.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 988c6662..0e4f5cc3 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,10 +1,13 @@ # cloudpathlib Changelog -## v0.21.1 (2025-05-14) +## UNRELEASED - Fixed issue with GS credentials, using default auth enables a wider set of authentication methods in GS (Issue [#390](https://github.com/drivendataorg/cloudpathlib/issues/390), PR [#514](https://github.com/drivendataorg/cloudpathlib/pull/514), thanks @ljyanesm) +- Added support for http(s) urls with `HttpClient`, `HttpPath`, `HttpsClient`, and `HttpsPath`. (Issue [#455](https://github.com/drivendataorg/cloudpathlib/issues/455), PR [#468](https://github.com/drivendataorg/cloudpathlib/pull/468)) + +## v0.21.1 (2025-05-14) + - Fixed `rmtree` fail on Azure with no `hns` and more than 256 blobs to drop (Issue [#509](https://github.com/drivendataorg/cloudpathlib/issues/509), PR [#508](https://github.com/drivendataorg/cloudpathlib/pull/508), thanks @alikefia) -- Added support for http(s) urls with `HttpClient`, `HttpPath`, `HttpsClient`, and `HttpsPath`. (Issue [#455](https://github.com/drivendataorg/cloudpathlib/issues/455 ), PR [#468](https://github.com/drivendataorg/cloudpathlib/pull/468)) ## v0.21.0 (2025-03-03)