Skip to content

Commit 4e70fff

Browse files
committed
[SPARK-21786][SQL] The 'spark.sql.parquet.compression.codec' configuration doesn't take effect on tables with partition field(s)
Add test.
1 parent 677541b commit 4e70fff

File tree

1 file changed

+66
-0
lines changed

1 file changed

+66
-0
lines changed

sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -728,4 +728,70 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter
728728
assert(e.contains("mismatched input 'ROW'"))
729729
}
730730
}
731+
732+
test("[SPARK-21786] The 'spark.sql.parquet.compression.codec' " +
  "configuration doesn't take effect on tables with partition field(s)") {
  // Writes the same 10000 rows into a partitioned and an unpartitioned Parquet table under
  // different values of 'spark.sql.parquet.compression.codec', then compares the on-disk
  // size of the resulting data files to check that the codec was honoured in both cases.
  withTempDir { tmpDir =>
    withTempView("table_source") {
      (0 until 10000).toDF("a").createOrReplaceTempView("table_source")

      val tableWithPartition = "table_with_partition"
      val tableNoPartition = "table_no_partition"
      // URI form of the temp dir; only used inside the SQL LOCATION clauses.
      val rootUri = tmpDir.toURI.toString.stripSuffix("/")

      withTable(tableWithPartition, tableNoPartition) {
        sql(
          s"""
             |CREATE TABLE $tableNoPartition(a int)
             |STORED AS PARQUET
             |LOCATION '$rootUri/$tableNoPartition'
           """.stripMargin)
        sql(
          s"""
             |CREATE TABLE $tableWithPartition(a int)
             |PARTITIONED BY (p int)
             |STORED AS PARQUET
             |LOCATION '$rootUri/$tableWithPartition'
           """.stripMargin)

        /**
         * Overwrites `tableName` with the rows of `table_source` while
         * 'spark.sql.parquet.compression.codec' is set to `codec`. For a partitioned
         * table the rows are written into the static partition p=10000.
         */
        def insertOverwriteTable(
            tableName: String,
            codec: String,
            isPartitioned: Boolean): Unit = {
          val partitionClause = if (isPartitioned) "partition (p=10000)" else ""
          withSQLConf("spark.sql.parquet.compression.codec" -> codec) {
            sql(
              s"""
                 |INSERT OVERWRITE TABLE $tableName
                 |$partitionClause
                 |SELECT * from table_source
               """.stripMargin)
          }
        }

        /**
         * Recursively lists the regular files below `file`, skipping any entry whose
         * name starts with ".hive-staging". Returns Nil when `file` does not exist.
         */
        def getDirFiles(file: File): List[File] = {
          if (!file.exists()) {
            Nil
          } else if (file.isFile) {
            List(file)
          } else {
            // File.listFiles() returns null when the directory cannot be read.
            Option(file.listFiles()).map(_.toList).getOrElse(Nil)
              .filterNot(_.getName.startsWith(".hive-staging"))
              .flatMap(getDirFiles)
          }
        }

        /**
         * Overwrites `tableName` using `codec` and returns the total length in bytes of
         * the "part-" data files found under the table directory.
         */
        def getTableSize(
            tableName: String,
            codec: String,
            isPartitioned: Boolean = false): Long = {
          insertOverwriteTable(tableName, codec, isPartitioned)
          // Resolve the directory from the File itself: java.io.File(String) expects a
          // pathname, so passing the "file:/..." URI string never matches a real directory.
          val dir = new File(tmpDir, tableName)
          val files = getDirFiles(dir).filter(_.getName.startsWith("part-"))
          // Guard against a vacuous comparison of empty measurements.
          assert(files.nonEmpty, s"no data files found under $dir")
          files.map(_.length()).sum
        }

        // The metadata of a partitioned and an unpartitioned table differs slightly, so the
        // file sizes are not identical, but the difference is expected to stay below 1024
        // bytes when both tables are written with the same codec.
        val maxDiff = 1024

        // Measure each (table, codec) combination once; every call re-runs the insert.
        val partitionedRaw =
          getTableSize(tableWithPartition, "uncompressed", isPartitioned = true)
        val plainRaw = getTableSize(tableNoPartition, "uncompressed")
        val partitionedGzip = getTableSize(tableWithPartition, "gzip", isPartitioned = true)
        val plainGzip = getTableSize(tableNoPartition, "gzip")

        // Same codec => nearly the same size, in either direction.
        assert(math.abs(partitionedRaw - plainRaw) < maxDiff)
        assert(math.abs(partitionedGzip - plainGzip) < maxDiff)
        // Different codec on the partitioned table => clearly different size.
        assert(partitionedRaw - partitionedGzip > maxDiff)
      }
    }
  }
}
731797
}

0 commit comments

Comments
 (0)