-
Notifications
You must be signed in to change notification settings - Fork 589
[CH] Unsupported partition values with escape char #8836
Copy link
Copy link
Closed
Labels
Description
Backend
CH (ClickHouse)
Bug description
Test Code
test("test partitioned with escaped characters") {
val schema = StructType(
Seq(
StructField.apply("id", IntegerType, nullable = true),
StructField.apply("escape", StringType, nullable = true)
))
val data: Seq[Row] = Seq(
Row(1, "="),
Row(2, "/"),
Row(3, "#"),
Row(4, ":"),
Row(5, "\\"),
Row(6, "\u0001"),
Row(7, " ")
)
val df = spark.createDataFrame(spark.sparkContext.parallelize(data), schema)
df.createOrReplaceTempView("origin_table")
spark.sql("select * from origin_table").show()
spark.sql(s"""
|DROP TABLE IF EXISTS partition_escape;
|""".stripMargin)
spark.sql(s"""
|CREATE TABLE IF NOT EXISTS partition_escape
|(
| c1 int,
| c2 string
|)
|USING clickhouse
|PARTITIONED BY (c2)
|TBLPROPERTIES (storage_policy='__hdfs_main',
| orderByKey='c1',
| primaryKey='c1')
|LOCATION '$HDFS_URL/test/partition_escape'
|""".stripMargin)
spark.sql("insert into partition_escape select * from origin_table")
spark.sql("select * from partition_escape").show()
}
Error message:
2025-02-26 17:30:50.161 [1][ScalaTest-run-running-GlutenClickHouseMergeTreeWriteOnHDFSSuite] WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
2025-02-26 17:31:32.303 [68][Executor task launch worker for task 0.0 in stage 18.0 (TID 11)] ERROR org.apache.spark.task.TaskResources: Task 11 failed by error:
org.apache.gluten.exception.GlutenException: Unable to open HDFS file: hdfs://127.0.0.1:8020/3-3/test/partition_escape/c2=%3A/5318fb88-759d-4e59-99de-395ceca04ba6_0_001/metadata.gluten. Error: File does not exist: /3-3/test/partition_escape/c2=%3A/5318fb88-759d-4e59-99de-395ceca04ba6_0_001/metadata.gluten
at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:86)
at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:76)
at org.apache.hadoop.hdfs.server.namenode.FSDirStatAndListingOp.getBlockLocations(FSDirStatAndListingOp.java:156)
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getBlockLocations(FSNamesystem.java:1990)
at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.getBlockLocations(NameNodeRpcServer.java:768)
at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.getBlockLocations(ClientNamenodeProtocolServerSideTranslatorPB.java:442)
at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:528)
at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1086)
at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1029)
at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:957)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1762)
at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2957)
Spark version
None
Spark configurations
No response
System information
No response
Relevant logs
Reactions are currently unavailable