Skip to content

Commit de63cf0

Browse files
Avogarmkmkme
authored and committed
Merge pull request ClickHouse#94335 from mkmkme/dot-issue
Fix reading columns with dot-separated names from Iceberg
1 parent cb2aef9 commit de63cf0

File tree

3 files changed

+238
-7
lines changed

3 files changed

+238
-7
lines changed

src/Processors/Formats/Impl/Parquet/SchemaConverter.cpp

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ NamesAndTypesList SchemaConverter::inferSchema()
135135
return res;
136136
}
137137

138-
std::string_view SchemaConverter::useColumnMapperIfNeeded(const parq::SchemaElement & element) const
138+
std::string_view SchemaConverter::useColumnMapperIfNeeded(const parq::SchemaElement & element, const String & current_path) const
139139
{
140140
if (!column_mapper)
141141
return element.name;
@@ -150,8 +150,19 @@ std::string_view SchemaConverter::useColumnMapperIfNeeded(const parq::SchemaElem
150150
auto it = map.find(element.field_id);
151151
if (it == map.end())
152152
throw Exception(ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "Parquet file has column {} with field_id {} that is not in datalake metadata", element.name, element.field_id);
153-
auto split = Nested::splitName(std::string_view(it->second), /*reverse=*/ true);
154-
return split.second.empty() ? split.first : split.second;
153+
154+
/// At top level (empty path), return the full mapped name. For nested
155+
/// elements, strip the parent path prefix to get the child name.
156+
if (current_path.empty())
157+
return it->second;
158+
159+
/// Strip "current_path." prefix to get the child name (preserves dots in child names)
160+
std::string_view mapped = it->second;
161+
if (mapped.starts_with(current_path) && mapped.size() > current_path.size()
162+
&& mapped[current_path.size()] == '.')
163+
return mapped.substr(current_path.size() + 1);
164+
165+
return it->second;
155166
}
156167

157168
void SchemaConverter::processSubtree(TraversalNode & node)
@@ -169,7 +180,7 @@ void SchemaConverter::processSubtree(TraversalNode & node)
169180

170181
if (node.schema_context == SchemaContext::None)
171182
{
172-
node.appendNameComponent(node.element->name, useColumnMapperIfNeeded(*node.element));
183+
node.appendNameComponent(node.element->name, useColumnMapperIfNeeded(*node.element, node.name));
173184

174185
if (sample_block)
175186
{
@@ -617,7 +628,7 @@ void SchemaConverter::processSubtreeTuple(TraversalNode & node)
617628
std::vector<String> element_names_in_file;
618629
for (size_t i = 0; i < size_t(node.element->num_children); ++i)
619630
{
620-
const String & element_name = element_names_in_file.emplace_back(useColumnMapperIfNeeded(file_metadata.schema.at(schema_idx)));
631+
const String & element_name = element_names_in_file.emplace_back(useColumnMapperIfNeeded(file_metadata.schema.at(schema_idx), node.name));
621632
std::optional<size_t> idx_in_output_tuple = i - skipped_unsupported_columns;
622633
if (lookup_by_name)
623634
{

src/Processors/Formats/Impl/Parquet/SchemaConverter.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,8 +137,10 @@ struct SchemaConverter
137137
DataTypePtr & out_inferred_type, std::optional<GeoColumnMetadata> geo_metadata) const;
138138

139139
/// Returns element.name or a corresponding name from ColumnMapper.
140-
/// For tuple elements, that's just the element name like `x`, not the whole path like `t.x`.
141-
std::string_view useColumnMapperIfNeeded(const parq::SchemaElement & element) const;
140+
/// For nested tuple elements, returns just the element name like `x`, not the whole path like `t.x`.
141+
/// For top-level columns (when current_path is empty), returns the full mapped name to support
142+
/// column names with dots (e.g. `integer.col` in Iceberg).
143+
std::string_view useColumnMapperIfNeeded(const parq::SchemaElement & element, const String & current_path) const;
142144
};
143145

144146
}
Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
import pytest
2+
3+
from pyspark.sql.types import (
4+
IntegerType,
5+
StringType,
6+
StructField,
7+
StructType,
8+
)
9+
10+
from helpers.iceberg_utils import (
11+
default_upload_directory,
12+
write_iceberg_from_df,
13+
create_iceberg_table,
14+
get_creation_expression,
15+
get_uuid_str,
16+
)
17+
18+
19+
@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"])
20+
def test_column_names_with_dots(started_cluster_iceberg_with_spark, storage_type):
21+
"""
22+
Test that Iceberg tables with dot-separated column names are read correctly.
23+
This tests the fix for field ID-based column name mapping in Parquet V3 reader.
24+
"""
25+
instance = started_cluster_iceberg_with_spark.instances["node1"]
26+
spark = started_cluster_iceberg_with_spark.spark_session
27+
TABLE_NAME = "test_column_names_with_dots_" + storage_type + "_" + get_uuid_str()
28+
29+
# Create DataFrame with column names containing dots
30+
data = [
31+
(1, "value1", "multi_dot_value1"),
32+
(2, "value2", "multi_dot_value2"),
33+
(3, "value3", "multi_dot_value3"),
34+
]
35+
schema = StructType([
36+
StructField("id", IntegerType()),
37+
StructField("name.column", StringType()),
38+
StructField("double.column.dot", StringType()),
39+
])
40+
df = spark.createDataFrame(data=data, schema=schema)
41+
42+
write_iceberg_from_df(spark, df, TABLE_NAME, mode="overwrite", format_version="2")
43+
44+
default_upload_directory(
45+
started_cluster_iceberg_with_spark,
46+
storage_type,
47+
f"/iceberg_data/default/{TABLE_NAME}/",
48+
f"/iceberg_data/default/{TABLE_NAME}/",
49+
)
50+
51+
# Test via table function
52+
table_function_expr = get_creation_expression(
53+
storage_type, TABLE_NAME, started_cluster_iceberg_with_spark, table_function=True
54+
)
55+
56+
# Verify single-dot column name
57+
result = instance.query(
58+
f"SELECT `name.column` FROM {table_function_expr} ORDER BY id"
59+
).strip()
60+
assert result == "value1\nvalue2\nvalue3", f"Expected values, got: {result}"
61+
62+
# Verify multi-dot column name
63+
result = instance.query(
64+
f"SELECT `double.column.dot` FROM {table_function_expr} ORDER BY id"
65+
).strip()
66+
assert result == "multi_dot_value1\nmulti_dot_value2\nmulti_dot_value3", f"Expected values, got: {result}"
67+
68+
# Verify all columns together
69+
result = instance.query(
70+
f"SELECT id, `name.column`, `double.column.dot` FROM {table_function_expr} ORDER BY id"
71+
).strip()
72+
expected = "1\tvalue1\tmulti_dot_value1\n2\tvalue2\tmulti_dot_value2\n3\tvalue3\tmulti_dot_value3"
73+
assert result == expected, f"Expected:\n{expected}\nGot:\n{result}"
74+
75+
# Test via table engine
76+
create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster_iceberg_with_spark)
77+
78+
result = instance.query(
79+
f"SELECT `name.column`, `double.column.dot` FROM {TABLE_NAME} ORDER BY id"
80+
).strip()
81+
expected = "value1\tmulti_dot_value1\nvalue2\tmulti_dot_value2\nvalue3\tmulti_dot_value3"
82+
assert result == expected, f"Expected:\n{expected}\nGot:\n{result}"
83+
84+
85+
@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"])
86+
def test_nested_struct_with_dotted_field(started_cluster_iceberg_with_spark, storage_type):
87+
"""
88+
Test that nested struct fields with dot-separated names are read correctly.
89+
This tests the fix for prefix stripping in useColumnMapperIfNeeded.
90+
E.g., for my_struct.weird.field we should return "weird.field", not just "field".
91+
"""
92+
instance = started_cluster_iceberg_with_spark.instances["node1"]
93+
spark = started_cluster_iceberg_with_spark.spark_session
94+
TABLE_NAME = "test_nested_struct_with_dotted_field_" + storage_type + "_" + get_uuid_str()
95+
96+
# Create DataFrame with nested struct containing a dotted field
97+
data = [
98+
(1, (100, "nested_dot_value1")),
99+
(2, (200, "nested_dot_value2")),
100+
(3, (300, "nested_dot_value3")),
101+
]
102+
schema = StructType(
103+
[
104+
StructField("id", IntegerType()),
105+
StructField(
106+
"my_struct",
107+
StructType(
108+
[
109+
StructField("normal_field", IntegerType()),
110+
StructField("weird.field", StringType()),
111+
]
112+
)
113+
)
114+
]
115+
)
116+
df = spark.createDataFrame(data=data, schema=schema)
117+
118+
write_iceberg_from_df(spark, df, TABLE_NAME, mode="overwrite", format_version="2")
119+
120+
default_upload_directory(
121+
started_cluster_iceberg_with_spark,
122+
storage_type,
123+
f"/iceberg_data/default/{TABLE_NAME}/",
124+
f"/iceberg_data/default/{TABLE_NAME}/",
125+
)
126+
127+
# Test via table function
128+
table_function_expr = get_creation_expression(
129+
storage_type, TABLE_NAME, started_cluster_iceberg_with_spark, table_function=True
130+
)
131+
132+
# Verify nested struct with dotted field via table function
133+
result = instance.query(
134+
f"SELECT my_struct.normal_field, `my_struct.weird.field` FROM {table_function_expr} ORDER BY id"
135+
).strip()
136+
expected = "100\tnested_dot_value1\n200\tnested_dot_value2\n300\tnested_dot_value3"
137+
assert result == expected, f"Expected:\n{expected}\nGot:\n{result}"
138+
139+
# Test via table engine
140+
create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster_iceberg_with_spark)
141+
142+
result = instance.query(
143+
f"SELECT my_struct.normal_field, `my_struct.weird.field` FROM {TABLE_NAME} ORDER BY id"
144+
).strip()
145+
expected = "100\tnested_dot_value1\n200\tnested_dot_value2\n300\tnested_dot_value3"
146+
assert result == expected, f"Expected:\n{expected}\nGot:\n{result}"
147+
148+
149+
@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"])
150+
def test_deeply_nested_struct_with_dotted_names(started_cluster_iceberg_with_spark, storage_type):
151+
"""
152+
Test deeply nested structs where EVERY level has dots in the name.
153+
Structure: my.struct -> some_dot.separated_parent -> weird.field
154+
Full path: my.struct.some_dot.separated_parent.weird.field
155+
156+
This verifies that prefix stripping works correctly at all nesting depths.
157+
"""
158+
instance = started_cluster_iceberg_with_spark.instances["node1"]
159+
spark = started_cluster_iceberg_with_spark.spark_session
160+
TABLE_NAME = "test_deeply_nested_struct_with_dotted_names_" + storage_type + "_" + get_uuid_str()
161+
162+
# Create DataFrame with deeply nested struct containing dotted names
163+
data = [
164+
(1, (("deep_value1",),)),
165+
(2, (("deep_value2",),)),
166+
(3, (("deep_value3",),)),
167+
]
168+
schema = StructType(
169+
[
170+
StructField("id", IntegerType()),
171+
StructField(
172+
"my.struct",
173+
StructType(
174+
[
175+
StructField(
176+
"some_dot.separated_parent",
177+
StructType(
178+
[
179+
StructField("weird.field", StringType()),
180+
]
181+
),
182+
),
183+
]
184+
),
185+
),
186+
]
187+
)
188+
df = spark.createDataFrame(data=data, schema=schema)
189+
190+
write_iceberg_from_df(spark, df, TABLE_NAME, mode="overwrite", format_version="2")
191+
192+
default_upload_directory(
193+
started_cluster_iceberg_with_spark,
194+
storage_type,
195+
f"/iceberg_data/default/{TABLE_NAME}/",
196+
f"/iceberg_data/default/{TABLE_NAME}/",
197+
)
198+
199+
# Test via table function
200+
table_function_expr = get_creation_expression(
201+
storage_type, TABLE_NAME, started_cluster_iceberg_with_spark, table_function=True
202+
)
203+
204+
# Query the deeply nested dotted field
205+
result = instance.query(
206+
f"SELECT `my.struct.some_dot.separated_parent.weird.field` FROM {table_function_expr} ORDER BY id"
207+
).strip()
208+
expected = "deep_value1\ndeep_value2\ndeep_value3"
209+
assert result == expected, f"Expected:\n{expected}\nGot:\n{result}"
210+
211+
# Test via table engine
212+
create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster_iceberg_with_spark)
213+
214+
result = instance.query(
215+
f"SELECT `my.struct.some_dot.separated_parent.weird.field` FROM {TABLE_NAME} ORDER BY id"
216+
).strip()
217+
expected = "deep_value1\ndeep_value2\ndeep_value3"
218+
assert result == expected, f"Expected:\n{expected}\nGot:\n{result}"

0 commit comments

Comments
 (0)