diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..5207c2eb 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: patch + changes: + fixed: + - Fixed `_get_blob_by_metadata_version` to return the newest blob (highest generation) when multiple blobs match the requested version. diff --git a/policyengine/utils/data/version_aware_storage_client.py b/policyengine/utils/data/version_aware_storage_client.py index 0fdc144d..5f2e76a9 100644 --- a/policyengine/utils/data/version_aware_storage_client.py +++ b/policyengine/utils/data/version_aware_storage_client.py @@ -112,16 +112,23 @@ def _get_blob_by_metadata_version( f"Searching for blob {bucket.name}/{key} with metadata version '{version}'" ) versions = bucket.list_blobs(prefix=key, versions=True) + matching_blobs = [] for blob in versions: if blob.metadata and blob.metadata.get("version") == version: - logger.info( - f"Found blob {bucket.name}/{key} with metadata version '{version}'" - ) - return blob + matching_blobs.append(blob) - raise ValueError( - f"No blob found with version '{version}' for {bucket.name}/{key}" + if not matching_blobs: + raise ValueError( + f"No blob found with version '{version}' for {bucket.name}/{key}" + ) + + # Return the blob with the highest generation number (newest) + newest_blob = max(matching_blobs, key=lambda b: b.generation) + logger.info( + f"Found blob {bucket.name}/{key} with metadata version '{version}' " + f"(generation {newest_blob.generation})" ) + return newest_blob def crc32c( self, diff --git a/tests/utils/data/test_version_aware_storage_client.py b/tests/utils/data/test_version_aware_storage_client.py index 38b6ebac..0ae448e2 100644 --- a/tests/utils/data/test_version_aware_storage_client.py +++ b/tests/utils/data/test_version_aware_storage_client.py @@ -164,10 +164,13 @@ def test_get_blob__metadata_version_searches_blobs( # Create mock blobs with different metadata versions blob1 = MagicMock() blob1.metadata = {"version": "1.0.0"} + blob1.generation = 100 blob2 = MagicMock() blob2.metadata = {"version": "1.2.3"} + blob2.generation = 200 blob3 = MagicMock() blob3.metadata = None + blob3.generation = 150 bucket.list_blobs.return_value = [blob1, blob3, blob2] @@ -177,6 +180,45 @@ def test_get_blob__metadata_version_searches_blobs( assert result == blob2 bucket.list_blobs.assert_called_with(prefix="test_key", versions=True) + @patch( + "policyengine.utils.data.version_aware_storage_client.Client", + autospec=True, + ) + def test_get_blob__metadata_version_returns_newest_when_multiple_match( + self, mock_client_class + ): + """Test that when multiple blobs have the same version, the newest (highest generation) is returned.""" + mock_instance = mock_client_class.return_value + bucket = mock_instance.bucket.return_value + + # Create mock blobs with the SAME metadata version but different generations + oldest_blob = MagicMock() + oldest_blob.metadata = {"version": "1.2.3"} + oldest_blob.generation = 100 + + middle_blob = MagicMock() + middle_blob.metadata = {"version": "1.2.3"} + middle_blob.generation = 200 + + newest_blob = MagicMock() + newest_blob.metadata = {"version": "1.2.3"} + newest_blob.generation = 300 + + # Return blobs in non-sorted order to ensure we're not relying on order + bucket.list_blobs.return_value = [ + middle_blob, + oldest_blob, + newest_blob, + ] + + client = VersionAwareStorageClient() + result = client.get_blob("test_bucket", "test_key", version="1.2.3") + + # Should return the blob with the highest generation number + assert result == newest_blob + assert result.generation == 300 + bucket.list_blobs.assert_called_with(prefix="test_key", versions=True) + @patch( "policyengine.utils.data.version_aware_storage_client.Client", autospec=True, @@ -217,6 +259,7 @@ def test_get_blob__generation_fallback_to_metadata( # Set up metadata-based lookup to succeed metadata_blob = MagicMock() metadata_blob.metadata = {"version": "999"} + metadata_blob.generation = 100 bucket.list_blobs.return_value = [metadata_blob] client = VersionAwareStorageClient()