From ab327d20fb01e4907380f592c271d44ac727f66c Mon Sep 17 00:00:00 2001 From: "Daniel Q. Kim" Date: Thu, 21 May 2026 16:43:37 +0200 Subject: [PATCH 1/4] Fix Iceberg ARRAY columns with dot-separated names returning empty lists MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When querying an Iceberg table through the `iceberg(...)` table function or a DataLakeCatalog, a column whose name contains a `.` and whose type is `Array(T)` (e.g. `` `a.b` ARRAY ``) returned empty arrays instead of the stored values. The same data read by Spark returned the expected values. Fixes #90731. The Parquet V3 reader path (`SchemaConverter` + `ColumnMapper` + `FormatFilterInfo`) is already correct after the dotted-name field-id work in 0a218cd4e8b, 4b733bae561 and f24c1a46063. This change addresses two residual upstream defects that affect dotted-name `Array(T)` columns regardless of source: * `ColumnsDescription::getAllRegisteredNames` explicitly filtered out any column whose name contained `.`, under the assumption such names were always flattened Nested subcolumns. A column whose stored name literally contains a dot (allowed by MergeTree with backticks, and produced by Iceberg / Spark) is a first-class registered name and must appear in `IHints` misspelling suggestions. The function is only consumed by `IHints`-style suggestion paths (and by `StorageSystemZooKeeper` for column-name iteration, where no dotted names exist), so relaxing it has no effect on parsing, planning, storage, or wire protocol. * `NestedUtils::getSubcolumnsOfNested` treated every `Array(T)` column whose name contained `.` as a flattened element of a synthetic `Nested` structure named after the prefix. This caused the Arrow, ORC and pre-V3 Parquet readers to look for a struct field with the prefix name in the data file rather than the literal dotted column, returning an empty array. 
The fix uses a two-pass scan: a synthetic `Nested` entry is only emitted when at least two `Array(T)` columns share the same dotted prefix. A lone column such as `a.b: Array(T)` no longer appears in the synthetic-Nested map. Genuine flattened `Nested` with multiple fields is unaffected; the existing early-continue on `isNested()` also covers the one-field-Nested edge case. Tests: * `tests/integration/test_storage_iceberg_with_spark/test_column_names_with_dots.py::test_dotted_array_column` — end-to-end repro of #90731 against s3, azure and local storage. * `test_dotted_array_alongside_real_nested` in the same file — mixed-schema regression guard verifying a lone dotted `Array` column coexists with genuine flattened-Nested siblings. * `tests/queries/0_stateless/04259_dotted_array_not_nested.sql` — isolates Bug B (the `getSubcolumnsOfNested` misclassification of lone dotted `Array(T)` columns) without Iceberg. * `tests/queries/0_stateless/04260_dotted_column_in_hints.sh` — verifies Bug A (the dot filter in `getAllRegisteredNames`) by checking the misspelling hint output. Changelog category (leave one): - Bug Fix (user-visible misbehavior in an official stable release) Changelog entry: Fix reading Iceberg tables whose `ARRAY` column names contain a dot (e.g. `` `a.b` ARRAY ``), which previously returned empty arrays. Two upstream defects were responsible: `ColumnsDescription::getAllRegisteredNames` filtered out dotted names, and `NestedUtils::getSubcolumnsOfNested` misclassified lone dotted `Array(T)` columns as flattened `Nested` children. 
(cherry picked from commit f8467afa849f7ce5aec7a7d372b00fdabf13b4b1) --- src/DataTypes/NestedUtils.cpp | 24 ++++- src/Storages/ColumnsDescription.cpp | 5 +- .../test_column_names_with_dots.py | 98 +++++++++++++++++++ .../04259_dotted_array_not_nested.reference | 3 + .../04259_dotted_array_not_nested.sql | 18 ++++ .../04260_dotted_column_in_hints.reference | 1 + .../04260_dotted_column_in_hints.sh | 19 ++++ 7 files changed, 160 insertions(+), 8 deletions(-) create mode 100644 tests/queries/0_stateless/04259_dotted_array_not_nested.reference create mode 100644 tests/queries/0_stateless/04259_dotted_array_not_nested.sql create mode 100644 tests/queries/0_stateless/04260_dotted_column_in_hints.reference create mode 100755 tests/queries/0_stateless/04260_dotted_column_in_hints.sh diff --git a/src/DataTypes/NestedUtils.cpp b/src/DataTypes/NestedUtils.cpp index 56ebe66c2ecc..531d29b66250 100644 --- a/src/DataTypes/NestedUtils.cpp +++ b/src/DataTypes/NestedUtils.cpp @@ -212,21 +212,37 @@ using NameToDataType = std::map; NameToDataType getSubcolumnsOfNested(const NamesAndTypesList & names_and_types) { - std::unordered_map nested; + /// Pass 1: count how many Array(T) columns share each dotted prefix. + /// A lone column like `a.b Array(T)` must not be collapsed into a synthetic + /// Nested parent — only genuine flat-Nested groups (n.x, n.y, ...) qualify. + std::unordered_map prefix_count; for (const auto & name_type : names_and_types) { /// Skip subcolumns (e.g. `c0.c2.null` derived from `c0.c2 Array(Nullable(Tuple()))`). - /// They are not real flat-nested columns like `n.a Array(T)`, `n.b Array(T)`. if (name_type.isSubcolumn()) continue; const auto * type_arr = typeid_cast(name_type.type.get()); - - /// Ignore true Nested type, but try to unite flatten arrays to Nested type. 
if (!isNested(name_type.type) && type_arr) { auto split = splitName(name_type.name); if (!split.second.empty()) + ++prefix_count[split.first]; + } + } + + /// Pass 2: build Nested only for prefixes shared by at least two columns. + std::unordered_map nested; + for (const auto & name_type : names_and_types) + { + if (name_type.isSubcolumn()) + continue; + + const auto * type_arr = typeid_cast(name_type.type.get()); + if (!isNested(name_type.type) && type_arr) + { + auto split = splitName(name_type.name); + if (!split.second.empty() && prefix_count[split.first] >= 2) nested[split.first].emplace_back(split.second, type_arr->getNestedType()); } } diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index 45c32efa8d59..1c0c6c14f50b 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -976,10 +976,7 @@ std::vector ColumnsDescription::getAllRegisteredNames() const std::vector names; names.reserve(columns.size()); for (const auto & column : columns) - { - if (!column.name.contains('.')) - names.push_back(column.name); - } + names.emplace_back(column.name); return names; } diff --git a/tests/integration/test_storage_iceberg_with_spark/test_column_names_with_dots.py b/tests/integration/test_storage_iceberg_with_spark/test_column_names_with_dots.py index cb239e1e8372..0bf9ae27d539 100644 --- a/tests/integration/test_storage_iceberg_with_spark/test_column_names_with_dots.py +++ b/tests/integration/test_storage_iceberg_with_spark/test_column_names_with_dots.py @@ -216,3 +216,101 @@ def test_deeply_nested_struct_with_dotted_names(started_cluster_iceberg_with_spa ).strip() expected = "deep_value1\ndeep_value2\ndeep_value3" assert result == expected, f"Expected:\n{expected}\nGot:\n{result}" + + +@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"]) +def test_dotted_array_column(started_cluster_iceberg_with_spark, storage_type): + """ + Regression test for issue #90731. 
+ A top-level ARRAY column whose name literally contains a dot (e.g. `a.b`) + must be returned with its actual values, not as an empty array. + """ + instance = started_cluster_iceberg_with_spark.instances["node1"] + spark = started_cluster_iceberg_with_spark.spark_session + TABLE_NAME = "test_dotted_array_column_" + storage_type + "_" + get_uuid_str() + + from pyspark.sql.types import ArrayType + + data = [(["a", "b", "c"],)] + schema = StructType([ + StructField("a.b", ArrayType(StringType())), + ]) + df = spark.createDataFrame(data=data, schema=schema) + + write_iceberg_from_df(spark, df, TABLE_NAME, mode="overwrite", format_version="2") + + default_upload_directory( + started_cluster_iceberg_with_spark, + storage_type, + f"/iceberg_data/default/{TABLE_NAME}/", + f"/iceberg_data/default/{TABLE_NAME}/", + ) + + # Test via table function + table_function_expr = get_creation_expression( + storage_type, TABLE_NAME, started_cluster_iceberg_with_spark, table_function=True + ) + + result = instance.query( + f"SELECT `a.b` FROM {table_function_expr}" + ).strip() + assert result == "['a','b','c']", f"Expected ['a','b','c'], got: {result}" + + # Test via table engine + create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster_iceberg_with_spark) + + result = instance.query( + f"SELECT `a.b` FROM {TABLE_NAME}" + ).strip() + assert result == "['a','b','c']", f"Expected ['a','b','c'], got: {result}" + + +@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"]) +def test_dotted_array_alongside_real_nested(started_cluster_iceberg_with_spark, storage_type): + """ + Regression guard: a lone dotted Array column (`a.b`) must not interfere with + a genuine flat-Nested group (`c.x`, `c.y`) that shares a different prefix. + All three columns must round-trip correctly. 
+ """ + instance = started_cluster_iceberg_with_spark.instances["node1"] + spark = started_cluster_iceberg_with_spark.spark_session + TABLE_NAME = "test_dotted_array_alongside_real_nested_" + storage_type + "_" + get_uuid_str() + + from pyspark.sql.types import ArrayType, IntegerType as SparkIntegerType + + data = [(["a", "b", "c"], [1, 2], ["p", "q"])] + schema = StructType([ + StructField("a.b", ArrayType(StringType())), + StructField("c.x", ArrayType(SparkIntegerType())), + StructField("c.y", ArrayType(StringType())), + ]) + df = spark.createDataFrame(data=data, schema=schema) + + write_iceberg_from_df(spark, df, TABLE_NAME, mode="overwrite", format_version="2") + + default_upload_directory( + started_cluster_iceberg_with_spark, + storage_type, + f"/iceberg_data/default/{TABLE_NAME}/", + f"/iceberg_data/default/{TABLE_NAME}/", + ) + + # Test via table function + table_function_expr = get_creation_expression( + storage_type, TABLE_NAME, started_cluster_iceberg_with_spark, table_function=True + ) + + result = instance.query( + f"SELECT `a.b`, `c.x`, `c.y` FROM {table_function_expr}" + ).strip() + assert result == "['a','b','c']\t[1,2]\t['p','q']", \ + f"Unexpected result via table function: {result}" + + # Test via table engine + create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster_iceberg_with_spark) + + result = instance.query( + f"SELECT `a.b`, `c.x`, `c.y` FROM {TABLE_NAME}" + ).strip() + assert result == "['a','b','c']\t[1,2]\t['p','q']", \ + f"Unexpected result via table engine: {result}" diff --git a/tests/queries/0_stateless/04259_dotted_array_not_nested.reference b/tests/queries/0_stateless/04259_dotted_array_not_nested.reference new file mode 100644 index 000000000000..e6c5c3fcae88 --- /dev/null +++ b/tests/queries/0_stateless/04259_dotted_array_not_nested.reference @@ -0,0 +1,3 @@ +['a','b','c'] +['a','b','c'] +[1,2] ['p','q'] diff --git a/tests/queries/0_stateless/04259_dotted_array_not_nested.sql 
b/tests/queries/0_stateless/04259_dotted_array_not_nested.sql new file mode 100644 index 000000000000..ba9396f11826 --- /dev/null +++ b/tests/queries/0_stateless/04259_dotted_array_not_nested.sql @@ -0,0 +1,18 @@ +-- Regression test for #90731. +-- A lone Array(T) column with a dot in its name must not be collapsed into +-- a synthetic Nested structure and must be readable as a plain array. + +CREATE TABLE t1 (`a.b` Array(String)) ENGINE = Memory; +INSERT INTO t1 VALUES (['a','b','c']); +SELECT `a.b` FROM t1; + +-- In a mixed table, the lone dotted column must not interfere with the +-- genuine flat-Nested group (c.x / c.y share prefix 'c'). +CREATE TABLE t2 (`a.b` Array(String), `c.x` Array(Int32), `c.y` Array(String)) + ENGINE = Memory; +INSERT INTO t2 VALUES (['a','b','c'], [1,2], ['p','q']); +SELECT `a.b` FROM t2; +SELECT `c.x`, `c.y` FROM t2; + +DROP TABLE t1; +DROP TABLE t2; diff --git a/tests/queries/0_stateless/04260_dotted_column_in_hints.reference b/tests/queries/0_stateless/04260_dotted_column_in_hints.reference new file mode 100644 index 000000000000..9766475a4185 --- /dev/null +++ b/tests/queries/0_stateless/04260_dotted_column_in_hints.reference @@ -0,0 +1 @@ +ok diff --git a/tests/queries/0_stateless/04260_dotted_column_in_hints.sh b/tests/queries/0_stateless/04260_dotted_column_in_hints.sh new file mode 100755 index 000000000000..a7cfe12dee4b --- /dev/null +++ b/tests/queries/0_stateless/04260_dotted_column_in_hints.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +# Regression test for #90731. +# ColumnsDescription::getAllRegisteredNames must include columns whose names +# contain a dot, so they appear in IHints suggestions after a typo. + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q " + CREATE TABLE t_dotted_hint (\`a.b\` Array(String)) + ENGINE = MergeTree ORDER BY tuple(); +" + +# Misspell the column name; the error message must suggest the real name 'a.b'. +$CLICKHOUSE_CLIENT -q "ALTER TABLE t_dotted_hint MODIFY COLUMN a_b Array(String);" 2>&1 \ + | grep -qF "a.b" && echo "ok" || echo "FAIL" + +$CLICKHOUSE_CLIENT -q "DROP TABLE t_dotted_hint;" From b0b731ee88ab58155222b8ae519c06391f403db8 Mon Sep 17 00:00:00 2001 From: "Daniel Q. Kim" Date: Wed, 10 Jun 2026 17:10:43 +0200 Subject: [PATCH 2/4] Revert the getSubcolumnsOfNested >= 2 gate: it caused a SIGSEGV and is not needed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The earlier `Nested::getSubcolumnsOfNested` change added a two-pass `>= 2` prefix-count gate so a lone dotted `Array(T)` column (e.g. `` `a.b` ``) would not be collapsed into a synthetic `Nested`. That gate is the root cause of a server crash and breaks existing mutation behavior, and it is unnecessary for the Iceberg fix. This restores `getSubcolumnsOfNested` to the `antalya-26.3` base behavior (it becomes byte-identical to base again). The `ColumnsDescription::getAllRegisteredNames` hint fix and the regression tests are kept. Root cause of the crash ======================= The gate's prefix count is computed over whatever column subset the caller passes. For wide parts, `IMergeTreeReader` builds `converted_requested_columns = Nested::convertToSubcolumns(columns_)` from the *per-read requested set*. When a single member of a genuine flat-`Nested` group is read on its own, the subset count is 1, so the gate refuses to remap it to a `Nested` subcolumn: * `03742_test_flattened_crash` (`SELECT arr.nested FROM ... 
ORDER BY arr.nested`): the dropped-and-re-added `arr.nested Array(Tuple(a String, b Float64))` is materialized through the plain-array default path instead of the `Nested` path, producing a column whose declared type does not match its backing data. Sending the result then dereferences a null `ColumnString` -> `SIGSEGV` in `SerializationString::serializeBinaryBulk` (via `SerializationArray` -> `SerializationTuple` -> `SerializationNamed` -> `NativeWriter`). The fault zeroes the test-failure count, which is why the Fast test reports `Server died` instead of a failed test. * `02565_update_empty_nested`: `nested.arr2` read alone during the mutation loses the shared-offset `Nested` serialization, so its default value reads with the wrong size -> `validateNestedArraySizes` reports `SIZES_OF_ARRAYS_DONT_MATCH` (2 vs 0). The count cannot be made reliable from the per-read subset: at the `NamesAndTypesList` level a real flat-`Nested` member and a standalone dotted `Array(T)` are indistinguishable, and basing the decision on the full storage schema instead introduces a separate mutation-write corruption (`CANNOT_READ_ALL_DATA`, truncated `nested.size0` marks). Why the gate is not needed ========================== The Iceberg / object-storage read path does not call `getSubcolumnsOfNested`, `collect`, or `convertToSubcolumns` at all (only MergeTree, Log, `generateRandom`, `mergeTreeIndex`, and the Arrow/ORC struct-prefix branch do, and a lone dotted column does not enter that branch). Reading a lone dotted `a.b Array(String)` — and a mix of a lone dotted column with a genuine flat `Nested` (`c.x`, `c.y`) — from a real local Iceberg table (both the `Iceberg` engine and the `icebergLocal` table function) returns the correct values without this change; the dotted-name `Array` read is already handled by the Parquet field-id work. So the gate provided no benefit while introducing the crash. 
Validated locally: `03742_test_flattened_crash` no longer crashes, `02565_update_empty_nested` returns `450000 450000`, `04259_dotted_array_not_nested` and `04260_dotted_column_in_hints` pass, the `icebergLocal` dotted-array reads are correct, and single-field `Nested` reads are unchanged. --- src/DataTypes/NestedUtils.cpp | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/src/DataTypes/NestedUtils.cpp b/src/DataTypes/NestedUtils.cpp index f8472af6b2c9..8e7ed78e88d9 100644 --- a/src/DataTypes/NestedUtils.cpp +++ b/src/DataTypes/NestedUtils.cpp @@ -285,37 +285,21 @@ using NameToDataType = std::map; NameToDataType getSubcolumnsOfNested(const NamesAndTypesList & names_and_types) { - /// Pass 1: count how many Array(T) columns share each dotted prefix. - /// A lone column like `a.b Array(T)` must not be collapsed into a synthetic - /// Nested parent — only genuine flat-Nested groups (n.x, n.y, ...) qualify. - std::unordered_map prefix_count; + std::unordered_map nested; for (const auto & name_type : names_and_types) { /// Skip subcolumns (e.g. `c0.c2.null` derived from `c0.c2 Array(Nullable(Tuple()))`). + /// They are not real flat-nested columns like `n.a Array(T)`, `n.b Array(T)`. if (name_type.isSubcolumn()) continue; const auto * type_arr = typeid_cast(name_type.type.get()); - if (!isNested(name_type.type) && type_arr) - { - auto split = splitName(name_type.name); - if (!split.second.empty()) - ++prefix_count[split.first]; - } - } - /// Pass 2: build Nested only for prefixes shared by at least two columns. - std::unordered_map nested; - for (const auto & name_type : names_and_types) - { - if (name_type.isSubcolumn()) - continue; - - const auto * type_arr = typeid_cast(name_type.type.get()); + /// Ignore true Nested type, but try to unite flatten arrays to Nested type. 
if (!isNested(name_type.type) && type_arr) { auto split = splitName(name_type.name); - if (!split.second.empty() && prefix_count[split.first] >= 2) + if (!split.second.empty()) nested[split.first].emplace_back(split.second, type_arr->getNestedType()); } } From 7524aed2e02bb0291d9779cbe6a5a823ffa86262 Mon Sep 17 00:00:00 2001 From: "Daniel Q. Kim" Date: Thu, 11 Jun 2026 13:50:03 +0200 Subject: [PATCH 3/4] Strengthen 04260: assert the dotted name appears in the IHints suggestion list The test exercises the line PR #1894 removes (the dot-filter in `ColumnsDescription::getAllRegisteredNames`). It now asserts the dotted column name appears specifically inside the `Maybe you meant: [...]` suggestion list rather than anywhere in the message, so re-introducing the filter (which would empty the suggestion list) is caught directly. --- tests/queries/0_stateless/04260_dotted_column_in_hints.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/04260_dotted_column_in_hints.sh b/tests/queries/0_stateless/04260_dotted_column_in_hints.sh index a7cfe12dee4b..43c8e6c90c55 100755 --- a/tests/queries/0_stateless/04260_dotted_column_in_hints.sh +++ b/tests/queries/0_stateless/04260_dotted_column_in_hints.sh @@ -12,8 +12,11 @@ $CLICKHOUSE_CLIENT -q " ENGINE = MergeTree ORDER BY tuple(); " -# Misspell the column name; the error message must suggest the real name 'a.b'. +# Misspell the column name; the dotted name must appear in the IHints suggestion +# list ("Maybe you meant: [...]"), which is produced from +# ColumnsDescription::getAllRegisteredNames. If that method filtered out dotted +# names again, the suggestion list would be empty and this would fail. 
$CLICKHOUSE_CLIENT -q "ALTER TABLE t_dotted_hint MODIFY COLUMN a_b Array(String);" 2>&1 \ - | grep -qF "a.b" && echo "ok" || echo "FAIL" + | grep -qF "Maybe you meant: ['a.b']" && echo "ok" || echo "FAIL" $CLICKHOUSE_CLIENT -q "DROP TABLE t_dotted_hint;" From def81c28344616f6a3f2803b3309b83c7c481b43 Mon Sep 17 00:00:00 2001 From: "Daniel Q. Kim" Date: Thu, 11 Jun 2026 14:09:01 +0200 Subject: [PATCH 4/4] Add Iceberg-path regression test for dotted-name Array columns; drop Memory-engine 04259 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `04261_iceberg_dotted_array.sh` reads dotted-name `Array` columns through the actual Iceberg read path (`IcebergLocal` engine + `icebergLocal` table function with an explicitly-declared schema), which the previous tests did not cover: * a lone `` `a.b` Array(String) `` column must read back its stored values (the core regression for #90731 — it previously came back empty), and * a lone dotted column alongside dotted columns that share a prefix (`` `c.x` ``, `` `c.y` ``) — all must round-trip correctly. `04259_dotted_array_not_nested.sql` is removed: it used `ENGINE = Memory`, which does not exercise the Iceberg read path (flagged in review) and does not go through `Nested::convertToSubcolumns` / `collect`, so it neither reproduced nor guarded against the Iceberg regression. That regression is now covered by the Iceberg test above, while the hint-path change is covered by `04260_dotted_column_in_hints.sh`. 
--- .../04259_dotted_array_not_nested.reference | 3 -- .../04259_dotted_array_not_nested.sql | 18 -------- .../04261_iceberg_dotted_array.reference | 4 ++ .../0_stateless/04261_iceberg_dotted_array.sh | 42 +++++++++++++++++++ 4 files changed, 46 insertions(+), 21 deletions(-) delete mode 100644 tests/queries/0_stateless/04259_dotted_array_not_nested.reference delete mode 100644 tests/queries/0_stateless/04259_dotted_array_not_nested.sql create mode 100644 tests/queries/0_stateless/04261_iceberg_dotted_array.reference create mode 100755 tests/queries/0_stateless/04261_iceberg_dotted_array.sh diff --git a/tests/queries/0_stateless/04259_dotted_array_not_nested.reference b/tests/queries/0_stateless/04259_dotted_array_not_nested.reference deleted file mode 100644 index e6c5c3fcae88..000000000000 --- a/tests/queries/0_stateless/04259_dotted_array_not_nested.reference +++ /dev/null @@ -1,3 +0,0 @@ -['a','b','c'] -['a','b','c'] -[1,2] ['p','q'] diff --git a/tests/queries/0_stateless/04259_dotted_array_not_nested.sql b/tests/queries/0_stateless/04259_dotted_array_not_nested.sql deleted file mode 100644 index ba9396f11826..000000000000 --- a/tests/queries/0_stateless/04259_dotted_array_not_nested.sql +++ /dev/null @@ -1,18 +0,0 @@ --- Regression test for #90731. --- A lone Array(T) column with a dot in its name must not be collapsed into --- a synthetic Nested structure and must be readable as a plain array. - -CREATE TABLE t1 (`a.b` Array(String)) ENGINE = Memory; -INSERT INTO t1 VALUES (['a','b','c']); -SELECT `a.b` FROM t1; - --- In a mixed table, the lone dotted column must not interfere with the --- genuine flat-Nested group (c.x / c.y share prefix 'c'). 
-CREATE TABLE t2 (`a.b` Array(String), `c.x` Array(Int32), `c.y` Array(String)) - ENGINE = Memory; -INSERT INTO t2 VALUES (['a','b','c'], [1,2], ['p','q']); -SELECT `a.b` FROM t2; -SELECT `c.x`, `c.y` FROM t2; - -DROP TABLE t1; -DROP TABLE t2; diff --git a/tests/queries/0_stateless/04261_iceberg_dotted_array.reference b/tests/queries/0_stateless/04261_iceberg_dotted_array.reference new file mode 100644 index 000000000000..2f250ab24c9a --- /dev/null +++ b/tests/queries/0_stateless/04261_iceberg_dotted_array.reference @@ -0,0 +1,4 @@ +['x','y','z'] +['x','y','z'] +['a','b','c'] [1,2] ['p','q'] +['a','b','c'] [1,2] ['p','q'] diff --git a/tests/queries/0_stateless/04261_iceberg_dotted_array.sh b/tests/queries/0_stateless/04261_iceberg_dotted_array.sh new file mode 100755 index 000000000000..3ee39fcdb0ba --- /dev/null +++ b/tests/queries/0_stateless/04261_iceberg_dotted_array.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +# Tags: no-fasttest +# Regression test for https://github.com/ClickHouse/ClickHouse/issues/90731. +# An Iceberg ARRAY column whose name literally contains a dot (e.g. `a.b`) must +# read back its stored values, not an empty array. The mixed case also checks a +# lone dotted column next to several dotted columns that share a prefix +# (`c.x`, `c.y`): all of them must round-trip correctly. + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +LONE_PATH="${CLICKHOUSE_USER_FILES}/lakehouses/${CLICKHOUSE_DATABASE}_iceberg_dotted_lone" +MIXED_PATH="${CLICKHOUSE_USER_FILES}/lakehouses/${CLICKHOUSE_DATABASE}_iceberg_dotted_mixed" +rm -rf "${LONE_PATH}" "${MIXED_PATH}" + +# 1) A lone dotted Array column must read its stored values, not []. 
+${CLICKHOUSE_CLIENT} --query " + SET allow_experimental_insert_into_iceberg = 1; + DROP TABLE IF EXISTS t_iceberg_dotted_lone; + CREATE TABLE t_iceberg_dotted_lone (\`a.b\` Array(String)) ENGINE = IcebergLocal('${LONE_PATH}', 'Parquet'); + INSERT INTO t_iceberg_dotted_lone (\`a.b\`) SELECT ['x', 'y', 'z']; +" +# Read through the Iceberg engine table ... +${CLICKHOUSE_CLIENT} --query "SELECT \`a.b\` FROM t_iceberg_dotted_lone" +# ... and through the icebergLocal table function with an explicitly-declared schema. +${CLICKHOUSE_CLIENT} --query "SELECT \`a.b\` FROM icebergLocal('${LONE_PATH}', 'Parquet', '\`a.b\` Array(String)')" + +# 2) A lone dotted column alongside dotted columns that share a prefix (c.x, c.y). +${CLICKHOUSE_CLIENT} --query " + SET allow_experimental_insert_into_iceberg = 1; + DROP TABLE IF EXISTS t_iceberg_dotted_mixed; + CREATE TABLE t_iceberg_dotted_mixed (\`a.b\` Array(String), \`c.x\` Array(Int32), \`c.y\` Array(String)) ENGINE = IcebergLocal('${MIXED_PATH}', 'Parquet'); + INSERT INTO t_iceberg_dotted_mixed (\`a.b\`, \`c.x\`, \`c.y\`) SELECT ['a', 'b', 'c'], [1, 2], ['p', 'q']; +" +${CLICKHOUSE_CLIENT} --query "SELECT \`a.b\`, \`c.x\`, \`c.y\` FROM t_iceberg_dotted_mixed" +${CLICKHOUSE_CLIENT} --query "SELECT \`a.b\`, \`c.x\`, \`c.y\` FROM icebergLocal('${MIXED_PATH}', 'Parquet', '\`a.b\` Array(String), \`c.x\` Array(Int32), \`c.y\` Array(String)')" + +# Cleanup +${CLICKHOUSE_CLIENT} --query "DROP TABLE IF EXISTS t_iceberg_dotted_lone" +${CLICKHOUSE_CLIENT} --query "DROP TABLE IF EXISTS t_iceberg_dotted_mixed" +rm -rf "${LONE_PATH}" "${MIXED_PATH}"