diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index 45c32efa8d59..1c0c6c14f50b 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -976,10 +976,7 @@ std::vector ColumnsDescription::getAllRegisteredNames() const std::vector names; names.reserve(columns.size()); for (const auto & column : columns) - { - if (!column.name.contains('.')) - names.push_back(column.name); - } + names.emplace_back(column.name); return names; } diff --git a/tests/integration/test_storage_iceberg_with_spark/test_column_names_with_dots.py b/tests/integration/test_storage_iceberg_with_spark/test_column_names_with_dots.py index cb239e1e8372..0bf9ae27d539 100644 --- a/tests/integration/test_storage_iceberg_with_spark/test_column_names_with_dots.py +++ b/tests/integration/test_storage_iceberg_with_spark/test_column_names_with_dots.py @@ -216,3 +216,101 @@ def test_deeply_nested_struct_with_dotted_names(started_cluster_iceberg_with_spa ).strip() expected = "deep_value1\ndeep_value2\ndeep_value3" assert result == expected, f"Expected:\n{expected}\nGot:\n{result}" + + +@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"]) +def test_dotted_array_column(started_cluster_iceberg_with_spark, storage_type): + """ + Regression test for issue #90731. + A top-level ARRAY column whose name literally contains a dot (e.g. `a.b`) + must be returned with its actual values, not as an empty array. 
+ """ + instance = started_cluster_iceberg_with_spark.instances["node1"] + spark = started_cluster_iceberg_with_spark.spark_session + TABLE_NAME = "test_dotted_array_column_" + storage_type + "_" + get_uuid_str() + + from pyspark.sql.types import ArrayType + + data = [(["a", "b", "c"],)] + schema = StructType([ + StructField("a.b", ArrayType(StringType())), + ]) + df = spark.createDataFrame(data=data, schema=schema) + + write_iceberg_from_df(spark, df, TABLE_NAME, mode="overwrite", format_version="2") + + default_upload_directory( + started_cluster_iceberg_with_spark, + storage_type, + f"/iceberg_data/default/{TABLE_NAME}/", + f"/iceberg_data/default/{TABLE_NAME}/", + ) + + # Test via table function + table_function_expr = get_creation_expression( + storage_type, TABLE_NAME, started_cluster_iceberg_with_spark, table_function=True + ) + + result = instance.query( + f"SELECT `a.b` FROM {table_function_expr}" + ).strip() + assert result == "['a','b','c']", f"Expected ['a','b','c'], got: {result}" + + # Test via table engine + create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster_iceberg_with_spark) + + result = instance.query( + f"SELECT `a.b` FROM {TABLE_NAME}" + ).strip() + assert result == "['a','b','c']", f"Expected ['a','b','c'], got: {result}" + + +@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"]) +def test_dotted_array_alongside_real_nested(started_cluster_iceberg_with_spark, storage_type): + """ + Regression guard: a lone dotted Array column (`a.b`) must not interfere with + a genuine flat-Nested group (`c.x`, `c.y`) whose columns share a different prefix (`c`). + All three columns must round-trip correctly. 
+ """ + instance = started_cluster_iceberg_with_spark.instances["node1"] + spark = started_cluster_iceberg_with_spark.spark_session + TABLE_NAME = "test_dotted_array_alongside_real_nested_" + storage_type + "_" + get_uuid_str() + + from pyspark.sql.types import ArrayType, IntegerType as SparkIntegerType + + data = [(["a", "b", "c"], [1, 2], ["p", "q"])] + schema = StructType([ + StructField("a.b", ArrayType(StringType())), + StructField("c.x", ArrayType(SparkIntegerType())), + StructField("c.y", ArrayType(StringType())), + ]) + df = spark.createDataFrame(data=data, schema=schema) + + write_iceberg_from_df(spark, df, TABLE_NAME, mode="overwrite", format_version="2") + + default_upload_directory( + started_cluster_iceberg_with_spark, + storage_type, + f"/iceberg_data/default/{TABLE_NAME}/", + f"/iceberg_data/default/{TABLE_NAME}/", + ) + + # Test via table function + table_function_expr = get_creation_expression( + storage_type, TABLE_NAME, started_cluster_iceberg_with_spark, table_function=True + ) + + result = instance.query( + f"SELECT `a.b`, `c.x`, `c.y` FROM {table_function_expr}" + ).strip() + assert result == "['a','b','c']\t[1,2]\t['p','q']", \ + f"Unexpected result via table function: {result}" + + # Test via table engine + create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster_iceberg_with_spark) + + result = instance.query( + f"SELECT `a.b`, `c.x`, `c.y` FROM {TABLE_NAME}" + ).strip() + assert result == "['a','b','c']\t[1,2]\t['p','q']", \ + f"Unexpected result via table engine: {result}" diff --git a/tests/queries/0_stateless/04260_dotted_column_in_hints.reference b/tests/queries/0_stateless/04260_dotted_column_in_hints.reference new file mode 100644 index 000000000000..9766475a4185 --- /dev/null +++ b/tests/queries/0_stateless/04260_dotted_column_in_hints.reference @@ -0,0 +1 @@ +ok diff --git a/tests/queries/0_stateless/04260_dotted_column_in_hints.sh b/tests/queries/0_stateless/04260_dotted_column_in_hints.sh new file mode 100755 
index 000000000000..43c8e6c90c55 --- /dev/null +++ b/tests/queries/0_stateless/04260_dotted_column_in_hints.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +# Regression test for #90731. +# ColumnsDescription::getAllRegisteredNames must include columns whose names +# contain a dot, so they appear in IHints suggestions after a typo. + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q " + CREATE TABLE t_dotted_hint (\`a.b\` Array(String)) + ENGINE = MergeTree ORDER BY tuple(); +" + +# Misspell the column name; the dotted name must appear in the IHints suggestion +# list ("Maybe you meant: [...]"), which is produced from +# ColumnsDescription::getAllRegisteredNames. If that method filtered out dotted +# names again, grep would not find 'a.b' and this script would print FAIL. +$CLICKHOUSE_CLIENT -q "ALTER TABLE t_dotted_hint MODIFY COLUMN a_b Array(String);" 2>&1 \ + | grep -qF "Maybe you meant: ['a.b']" && echo "ok" || echo "FAIL" + +$CLICKHOUSE_CLIENT -q "DROP TABLE t_dotted_hint;" diff --git a/tests/queries/0_stateless/04261_iceberg_dotted_array.reference b/tests/queries/0_stateless/04261_iceberg_dotted_array.reference new file mode 100644 index 000000000000..2f250ab24c9a --- /dev/null +++ b/tests/queries/0_stateless/04261_iceberg_dotted_array.reference @@ -0,0 +1,4 @@ +['x','y','z'] +['x','y','z'] +['a','b','c'] [1,2] ['p','q'] +['a','b','c'] [1,2] ['p','q'] diff --git a/tests/queries/0_stateless/04261_iceberg_dotted_array.sh b/tests/queries/0_stateless/04261_iceberg_dotted_array.sh new file mode 100755 index 000000000000..3ee39fcdb0ba --- /dev/null +++ b/tests/queries/0_stateless/04261_iceberg_dotted_array.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +# Tags: no-fasttest +# Regression test for https://github.com/ClickHouse/ClickHouse/issues/90731. +# An Iceberg ARRAY column whose name literally contains a dot (e.g. `a.b`) must +# read back its stored values, not an empty array. 
The mixed case also checks a +# lone dotted column next to several dotted columns that share a prefix +# (`c.x`, `c.y`): all of them must round-trip correctly. + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +LONE_PATH="${CLICKHOUSE_USER_FILES}/lakehouses/${CLICKHOUSE_DATABASE}_iceberg_dotted_lone" +MIXED_PATH="${CLICKHOUSE_USER_FILES}/lakehouses/${CLICKHOUSE_DATABASE}_iceberg_dotted_mixed" +rm -rf "${LONE_PATH}" "${MIXED_PATH}" + +# 1) A lone dotted Array column must read its stored values, not []. +${CLICKHOUSE_CLIENT} --query " + SET allow_experimental_insert_into_iceberg = 1; + DROP TABLE IF EXISTS t_iceberg_dotted_lone; + CREATE TABLE t_iceberg_dotted_lone (\`a.b\` Array(String)) ENGINE = IcebergLocal('${LONE_PATH}', 'Parquet'); + INSERT INTO t_iceberg_dotted_lone (\`a.b\`) SELECT ['x', 'y', 'z']; +" +# Read through the Iceberg engine table ... +${CLICKHOUSE_CLIENT} --query "SELECT \`a.b\` FROM t_iceberg_dotted_lone" +# ... and through the icebergLocal table function with an explicitly-declared schema. +${CLICKHOUSE_CLIENT} --query "SELECT \`a.b\` FROM icebergLocal('${LONE_PATH}', 'Parquet', '\`a.b\` Array(String)')" + +# 2) A lone dotted column alongside dotted columns that share a prefix (c.x, c.y). 
+${CLICKHOUSE_CLIENT} --query " + SET allow_experimental_insert_into_iceberg = 1; + DROP TABLE IF EXISTS t_iceberg_dotted_mixed; + CREATE TABLE t_iceberg_dotted_mixed (\`a.b\` Array(String), \`c.x\` Array(Int32), \`c.y\` Array(String)) ENGINE = IcebergLocal('${MIXED_PATH}', 'Parquet'); + INSERT INTO t_iceberg_dotted_mixed (\`a.b\`, \`c.x\`, \`c.y\`) SELECT ['a', 'b', 'c'], [1, 2], ['p', 'q']; +" +${CLICKHOUSE_CLIENT} --query "SELECT \`a.b\`, \`c.x\`, \`c.y\` FROM t_iceberg_dotted_mixed" +${CLICKHOUSE_CLIENT} --query "SELECT \`a.b\`, \`c.x\`, \`c.y\` FROM icebergLocal('${MIXED_PATH}', 'Parquet', '\`a.b\` Array(String), \`c.x\` Array(Int32), \`c.y\` Array(String)')" + +# Cleanup +${CLICKHOUSE_CLIENT} --query "DROP TABLE IF EXISTS t_iceberg_dotted_lone" +${CLICKHOUSE_CLIENT} --query "DROP TABLE IF EXISTS t_iceberg_dotted_mixed" +rm -rf "${LONE_PATH}" "${MIXED_PATH}"