From 2908d6dd3f68d93349b583de450bb33eaeda4dd0 Mon Sep 17 00:00:00 2001 From: Kanthi Subramanian Date: Thu, 12 Mar 2026 15:54:06 +0100 Subject: [PATCH 1/4] Fixed s3 decoding Signed-off-by: Kanthi Subramanian --- src/Storages/ObjectStorage/Utils.cpp | 8 ++-- .../integration/test_database_iceberg/test.py | 48 ++++++++++++++++++- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/src/Storages/ObjectStorage/Utils.cpp b/src/Storages/ObjectStorage/Utils.cpp index b5330ac180b2..8f490b504cc2 100644 --- a/src/Storages/ObjectStorage/Utils.cpp +++ b/src/Storages/ObjectStorage/Utils.cpp @@ -513,9 +513,11 @@ std::pair resolveObjectStorageForPath( normalized_path = "gs://" + target_decomposed.authority + "/" + target_decomposed.key; } S3::URI s3_uri(normalized_path); - - std::string key_to_use = s3_uri.key; - + + // Use key (parsed without URI decoding) so that percent-encoded + // characters in object keys (e.g. %2F in Iceberg partition paths) are preserved. + std::string key_to_use = target_decomposed.key; + bool use_base_storage = false; if (base_storage->getType() == ObjectStorageType::S3) { diff --git a/tests/integration/test_database_iceberg/test.py b/tests/integration/test_database_iceberg/test.py index fa1ff5b45359..57efe7d2fb2e 100644 --- a/tests/integration/test_database_iceberg/test.py +++ b/tests/integration/test_database_iceberg/test.py @@ -625,6 +625,52 @@ def test_table_with_slash(started_cluster): assert node.query(f"SELECT * FROM {CATALOG_NAME}.`{root_namespace}.{table_encoded_name}`") == "\\N\tAAPL\t193.24\t193.31\t('bot')\n" +def test_partition_value_with_slash(started_cluster): + """Partition value containing '/' produces object keys with %2F; reading must preserve encoding.""" + node = started_cluster.instances["node1"] + + test_ref = f"test_partition_slash_{uuid.uuid4()}" + table_name = f"{test_ref}_table" + root_namespace = f"{test_ref}_namespace" + + # Partition by symbol (string) so partition value "us/west" becomes path segment symbol=us%2Fwest + partition_spec = PartitionSpec( + PartitionField( + source_id=2, field_id=1000, transform=IdentityTransform(), name="symbol" + ) + ) + schema = DEFAULT_SCHEMA + + catalog = load_catalog_impl(started_cluster) + catalog.create_namespace(root_namespace) + + table = create_table( + catalog, + root_namespace, + table_name, + schema, + partition_spec=partition_spec, + sort_order=DEFAULT_SORT_ORDER, + ) + + # Write a row with partition value containing slash (path will have %2F in S3 key) + data = [ + { + "datetime": datetime.now(), + "symbol": "us/west", + "bid": 100.0, + "ask": 101.0, + "details": {"created_by": "test"}, + } + ] + df = pa.Table.from_pylist(data) + table.append(df) + + create_clickhouse_iceberg_database(started_cluster, node, CATALOG_NAME) + assert 1 == int(node.query(f"SELECT count() FROM {CATALOG_NAME}.`{root_namespace}.{table_name}`")) + assert "us/west" in node.query(f"SELECT symbol FROM {CATALOG_NAME}.`{root_namespace}.{table_name}`") + + def test_cluster_select(started_cluster): node1 = started_cluster.instances["node1"] node2 = started_cluster.instances["node2"] @@ -665,7 +711,7 @@ def test_cluster_select(started_cluster): assert len(cluster_secondary_queries) == 1 assert node2.query(f"SELECT * FROM {CATALOG_NAME}.`{root_namespace}.{table_name}`", settings={"parallel_replicas_for_cluster_engines":1, 'enable_parallel_replicas': 2, 'cluster_for_parallel_replicas': 'cluster_simple', 'parallel_replicas_for_cluster_engines' : 1}) == 'pablo\n' - + def test_not_specified_catalog_type(started_cluster): node = started_cluster.instances["node1"] settings = { From e8d4d202a388ce6a069df474a94e7fec4df99ded Mon Sep 17 00:00:00 2001 From: Kanthi Subramanian Date: Thu, 26 Mar 2026 15:09:56 +0100 Subject: [PATCH 2/4] Added fix for path style URI parsing Signed-off-by: Kanthi Subramanian --- src/Storages/ObjectStorage/Utils.cpp | 9 ++ .../tests/gtest_scheme_authority_key.cpp | 122 ++++++++++++++++++ 2 files changed, 131 insertions(+) create mode 100644 src/Storages/ObjectStorage/tests/gtest_scheme_authority_key.cpp diff --git a/src/Storages/ObjectStorage/Utils.cpp b/src/Storages/ObjectStorage/Utils.cpp index 273691d87968..619d1bba21b1 100644 --- a/src/Storages/ObjectStorage/Utils.cpp +++ b/src/Storages/ObjectStorage/Utils.cpp @@ -518,6 +518,15 @@ std::pair resolveObjectStorageForPath( // Use key (parsed without URI decoding) so that percent-encoded // characters in object keys (e.g. %2F in Iceberg partition paths) are preserved. std::string key_to_use = target_decomposed.key; + + // For path-style HTTP(S) URLs, SchemeAuthorityKey puts / in .key + // because the bucket lives in the URL path, not the hostname. + // Strip the bucket prefix so the key is relative to the bucket. + if (!s3_uri.is_virtual_hosted_style + && key_to_use.starts_with(s3_uri.bucket + "/")) + { + key_to_use = key_to_use.substr(s3_uri.bucket.size() + 1); + } bool use_base_storage = false; if (base_storage->getType() == ObjectStorageType::S3) diff --git a/src/Storages/ObjectStorage/tests/gtest_scheme_authority_key.cpp b/src/Storages/ObjectStorage/tests/gtest_scheme_authority_key.cpp new file mode 100644 index 000000000000..34e439c3c81b --- /dev/null +++ b/src/Storages/ObjectStorage/tests/gtest_scheme_authority_key.cpp @@ -0,0 +1,122 @@ +#include + +#include +#include +#include "config.h" + +#if USE_AWS_S3 && USE_AVRO + +#include +#include +#include +#include +#include + +using namespace DB; + +namespace +{ +struct ClientFake : DB::S3::Client +{ + explicit ClientFake() + : DB::S3::Client( + 1, + DB::S3::ServerSideEncryptionKMSConfig(), + std::make_shared("test_access_key", "test_secret"), + DB::S3::ClientFactory::instance().createClientConfiguration( + "test_region", + DB::RemoteHostFilter(), + 1, + DB::S3::PocoHTTPClientConfiguration::RetryStrategy{.max_retries = 0}, + true, + true, + true, + false, + {}, + /* request_throttler = */ {}, + "http"), + Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, + DB::S3::ClientSettings()) + { + } + + Aws::S3::Model::GetObjectOutcome GetObject([[maybe_unused]] const Aws::S3::Model::GetObjectRequest &) const override + { + UNREACHABLE(); + } +}; + +std::shared_ptr makeBaseStorage(S3::URI uri) +{ + S3Capabilities cap; + ObjectStorageKeyGeneratorPtr gen; + const String disk_name = "s3"; + return std::make_shared( + std::make_unique(), std::make_unique(), std::move(uri), cap, gen, disk_name); +} + +void assertResolveReturnsBaseKey(const std::string & table_location, const std::string & path, S3::URI base_uri, const std::string & expected_key) +{ + auto base = makeBaseStorage(std::move(base_uri)); + SecondaryStorages secondary_storages; + auto [storage, key] = resolveObjectStorageForPath(table_location, path, base, secondary_storages, nullptr); + ASSERT_EQ(storage.get(), base.get()); + ASSERT_EQ(key, expected_key); +} +} + +TEST(ResolveObjectStorageForPathKey, S3SchemeKeyNoBucketInPath) +{ + const char * path = "s3://my-bucket/dir/file.parquet"; + S3::URI target(path); + S3::URI base_uri; + base_uri.bucket = target.bucket; + base_uri.endpoint = target.endpoint; + assertResolveReturnsBaseKey("s3://my-bucket/warehouse", path, std::move(base_uri), "dir/file.parquet"); +} + +TEST(ResolveObjectStorageForPathKey, VirtualHostedHttpsKeyNoBucket) +{ + const char * path = "https://my-bucket.s3.amazonaws.com/dir/file.parquet"; + S3::URI target(path); + ASSERT_TRUE(target.is_virtual_hosted_style); + S3::URI base_uri; + base_uri.bucket = target.bucket; + base_uri.endpoint = target.endpoint; + assertResolveReturnsBaseKey("s3://my-bucket/warehouse", path, std::move(base_uri), "dir/file.parquet"); +} + +TEST(ResolveObjectStorageForPathKey, PathStyleHttpsBucketPrefixStripped) +{ + const char * path = "https://s3.amazonaws.com/my-bucket/dir/file.parquet"; + S3::URI target(path); + ASSERT_FALSE(target.is_virtual_hosted_style); + ASSERT_EQ(target.bucket, "my-bucket"); + S3::URI base_uri; + base_uri.bucket = target.bucket; + base_uri.endpoint = target.endpoint; + assertResolveReturnsBaseKey("s3://my-bucket/warehouse", path, std::move(base_uri), "dir/file.parquet"); +} + +TEST(ResolveObjectStorageForPathKey, PathStyleMinioPreservesPercentEncodedSlash) +{ + const char * path = "http://minio:9000/bucket/partition=us%2Fwest/data.parquet"; + S3::URI target(path); + ASSERT_FALSE(target.is_virtual_hosted_style); + S3::URI base_uri; + base_uri.bucket = target.bucket; + base_uri.endpoint = target.endpoint; + assertResolveReturnsBaseKey("http://minio:9000/bucket/warehouse", path, std::move(base_uri), "partition=us%2Fwest/data.parquet"); +} + +TEST(ResolveObjectStorageForPathKey, S3SchemePreservesPercentEncodedSlash) +{ + const char * path = "s3://bucket/partition=us%2Fwest/file.parquet"; + S3::URI target(path); + S3::URI base_uri; + base_uri.bucket = target.bucket; + base_uri.endpoint = target.endpoint; + assertResolveReturnsBaseKey("s3://bucket/warehouse", path, std::move(base_uri), "partition=us%2Fwest/file.parquet"); +} + +#endif From 357fbb384e35f1eaaf2ec0317122919ac340a0cb Mon Sep 17 00:00:00 2001 From: Kanthi Subramanian Date: Fri, 27 Mar 2026 03:55:12 +0100 Subject: [PATCH 3/4] Added gtest to test s3 uri with bucket name in key --- .../tests/gtest_scheme_authority_key.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/Storages/ObjectStorage/tests/gtest_scheme_authority_key.cpp b/src/Storages/ObjectStorage/tests/gtest_scheme_authority_key.cpp index 34e439c3c81b..bcb61fa4a0bc 100644 --- a/src/Storages/ObjectStorage/tests/gtest_scheme_authority_key.cpp +++ b/src/Storages/ObjectStorage/tests/gtest_scheme_authority_key.cpp @@ -119,4 +119,17 @@ TEST(ResolveObjectStorageForPathKey, S3SchemePreservesPercentEncodedSlash) assertResolveReturnsBaseKey("s3://bucket/warehouse", path, std::move(base_uri), "partition=us%2Fwest/file.parquet"); } +TEST(ResolveObjectStorageForPathKey, VirtualHostedHttpsKeyMayStartWithBucketNamePrefix) +{ + const char * path = "https://bucket.s3.amazonaws.com/bucket/nested/file.parquet"; + S3::URI target(path); + ASSERT_TRUE(target.is_virtual_hosted_style); + + S3::URI base_uri; + base_uri.bucket = target.bucket; + base_uri.endpoint = target.endpoint; + + assertResolveReturnsBaseKey("s3://bucket/warehouse", path, std::move(base_uri), "bucket/nested/file.parquet"); +} + #endif From 5caa8cff9063fce8dce07c7de721502ccb5cf08d Mon Sep 17 00:00:00 2001 From: Kanthi Subramanian Date: Fri, 27 Mar 2026 04:01:00 +0100 Subject: [PATCH 4/4] Added gtest to test s3 uri with bucket name in key --- .../tests/gtest_scheme_authority_key.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/Storages/ObjectStorage/tests/gtest_scheme_authority_key.cpp b/src/Storages/ObjectStorage/tests/gtest_scheme_authority_key.cpp index bcb61fa4a0bc..f0a7e7613b25 100644 --- a/src/Storages/ObjectStorage/tests/gtest_scheme_authority_key.cpp +++ b/src/Storages/ObjectStorage/tests/gtest_scheme_authority_key.cpp @@ -132,4 +132,18 @@ TEST(ResolveObjectStorageForPathKey, VirtualHostedHttpsKeyMayStartWithBucketName assertResolveReturnsBaseKey("s3://bucket/warehouse", path, std::move(base_uri), "bucket/nested/file.parquet"); } +TEST(ResolveObjectStorageForPathKey, S3SchemeKeyMayStartWithBucketNamePrefix) +{ + const char * path = "s3://bucket/bucket/nested/file.parquet"; + S3::URI target(path); + ASSERT_TRUE(target.is_virtual_hosted_style) + << "Requires s3:// -> virtual-hosted mapping (global context); use HTTPS test if this fails"; + + S3::URI base_uri; + base_uri.bucket = target.bucket; + base_uri.endpoint = target.endpoint; + + assertResolveReturnsBaseKey("s3://bucket/warehouse", path, std::move(base_uri), "bucket/nested/file.parquet"); +} + #endif