
Commit 9fccc36

Zhenhua Wang authored and cloud-fan committed
[SPARK-21083][SQL] Store zero size and row count when analyzing empty table
## What changes were proposed in this pull request?

We should be able to store zero size and row count after analyzing an empty table. This PR also enhances the test cases for re-analyzing tables.

## How was this patch tested?

Added a new test case and enhanced some test cases.

Author: Zhenhua Wang <[email protected]>

Closes #18292 from wzhfy/analyzeNewColumn.
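For context, a minimal spark-shell style sketch of the behavior the new test exercises; the table name and the final DESCRIBE step are illustrative and not part of the patch:

// Assumes a running SparkSession named `spark` (e.g. in spark-shell).
spark.sql("CREATE TABLE empty_t (key STRING, value STRING) USING PARQUET")

// NOSCAN computes only the size; for an empty table it can now be stored as 0.
spark.sql("ANALYZE TABLE empty_t COMPUTE STATISTICS NOSCAN")

// A full analyze additionally records rowCount = 0.
spark.sql("ANALYZE TABLE empty_t COMPUTE STATISTICS")

// The persisted statistics show up in the table's catalog metadata.
spark.sql("DESCRIBE EXTENDED empty_t").show(truncate = false)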
1 parent 0b8dd2d commit 9fccc36

3 files changed: +59 -11 lines


sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala

Lines changed: 2 additions & 3 deletions
@@ -20,7 +20,6 @@ package org.apache.spark.sql.execution.command
 import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.catalog.{CatalogStatistics, CatalogTableType}
-import org.apache.spark.sql.execution.SQLExecution
 
 
 /**
@@ -40,10 +39,10 @@ case class AnalyzeTableCommand(
     }
     val newTotalSize = CommandUtils.calculateTotalSize(sessionState, tableMeta)
 
-    val oldTotalSize = tableMeta.stats.map(_.sizeInBytes.toLong).getOrElse(0L)
+    val oldTotalSize = tableMeta.stats.map(_.sizeInBytes.toLong).getOrElse(-1L)
     val oldRowCount = tableMeta.stats.flatMap(_.rowCount.map(_.toLong)).getOrElse(-1L)
     var newStats: Option[CatalogStatistics] = None
-    if (newTotalSize > 0 && newTotalSize != oldTotalSize) {
+    if (newTotalSize >= 0 && newTotalSize != oldTotalSize) {
       newStats = Some(CatalogStatistics(sizeInBytes = newTotalSize))
     }
     // We only set rowCount when noscan is false, because otherwise:
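The reason for the new -1L sentinel: with the old default of 0L, a freshly computed size of 0 for an empty table was indistinguishable from "no previous stats", and the newTotalSize > 0 guard dropped it. A minimal standalone sketch of the comparison (not the command itself), assuming an empty table with no prior statistics in the catalog:

// Not the actual command, just the decision it makes for an empty table.
val priorSize: Option[Long] = None           // no stats recorded in the catalog yet
val newTotalSize = 0L                        // computed size of an empty table

val oldTotalSize = priorSize.getOrElse(-1L)  // sentinel -1 instead of the old default 0

// Old guard: newTotalSize > 0  => never fires for an empty table.
// New guard: newTotalSize >= 0 && 0 != -1 => sizeInBytes = 0 is persisted.
val shouldStoreSize = newTotalSize >= 0 && newTotalSize != oldTotalSize
assert(shouldStoreSize)

If a size of 0 is already recorded, oldTotalSize is also 0 and no redundant update is written; the sentinel only distinguishes "no stats yet" from "stored size is 0".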

sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala

Lines changed: 13 additions & 0 deletions
@@ -82,6 +82,19 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared
     }
   }
 
+  test("analyze empty table") {
+    val table = "emptyTable"
+    withTable(table) {
+      sql(s"CREATE TABLE $table (key STRING, value STRING) USING PARQUET")
+      sql(s"ANALYZE TABLE $table COMPUTE STATISTICS noscan")
+      val fetchedStats1 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = None)
+      assert(fetchedStats1.get.sizeInBytes == 0)
+      sql(s"ANALYZE TABLE $table COMPUTE STATISTICS")
+      val fetchedStats2 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(0))
+      assert(fetchedStats2.get.sizeInBytes == 0)
+    }
+  }
+
   test("analyze column command - unsupported types and invalid columns") {
     val tableName = "column_stats_test1"
     withTable(tableName) {

sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala

Lines changed: 44 additions & 8 deletions
@@ -26,6 +26,7 @@ import scala.util.matching.Regex
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.catalog.{CatalogRelation, CatalogStatistics}
+import org.apache.spark.sql.catalyst.plans.logical.ColumnStat
 import org.apache.spark.sql.catalyst.util.StringUtils
 import org.apache.spark.sql.execution.command.DDLUtils
 import org.apache.spark.sql.execution.datasources.LogicalRelation
@@ -210,27 +211,62 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto
     }
   }
 
-  test("test elimination of the influences of the old stats") {
+  test("keep existing row count in stats with noscan if table is not changed") {
     val textTable = "textTable"
     withTable(textTable) {
-      sql(s"CREATE TABLE $textTable (key STRING, value STRING) STORED AS TEXTFILE")
+      sql(s"CREATE TABLE $textTable (key STRING, value STRING)")
       sql(s"INSERT INTO TABLE $textTable SELECT * FROM src")
       sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS")
       val fetchedStats1 =
         checkTableStats(textTable, hasSizeInBytes = true, expectedRowCounts = Some(500))
 
       sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS noscan")
-      // when the total size is not changed, the old row count is kept
+      // when the table is not changed, total size is the same, and the old row count is kept
       val fetchedStats2 =
         checkTableStats(textTable, hasSizeInBytes = true, expectedRowCounts = Some(500))
       assert(fetchedStats1 == fetchedStats2)
+    }
+  }
 
-      sql(s"INSERT INTO TABLE $textTable SELECT * FROM src")
-      sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS noscan")
-      // update total size and remove the old and invalid row count
+  test("keep existing column stats if table is not changed") {
+    val table = "update_col_stats_table"
+    withTable(table) {
+      sql(s"CREATE TABLE $table (c1 INT, c2 STRING, c3 DOUBLE)")
+      sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS c1")
+      val fetchedStats0 =
+        checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(0))
+      assert(fetchedStats0.get.colStats == Map("c1" -> ColumnStat(0, None, None, 0, 4, 4)))
+
+      // Insert new data and analyze: have the latest column stats.
+      sql(s"INSERT INTO TABLE $table SELECT 1, 'a', 10.0")
+      sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS c1")
+      val fetchedStats1 =
+        checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(1)).get
+      assert(fetchedStats1.colStats == Map(
+        "c1" -> ColumnStat(distinctCount = 1, min = Some(1), max = Some(1), nullCount = 0,
+          avgLen = 4, maxLen = 4)))
+
+      // Analyze another column: since the table is not changed, the previous column stats are kept.
+      sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS c2")
+      val fetchedStats2 =
+        checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(1)).get
+      assert(fetchedStats2.colStats == Map(
+        "c1" -> ColumnStat(distinctCount = 1, min = Some(1), max = Some(1), nullCount = 0,
+          avgLen = 4, maxLen = 4),
+        "c2" -> ColumnStat(distinctCount = 1, min = None, max = None, nullCount = 0,
+          avgLen = 1, maxLen = 1)))
+
+      // Insert new data and analyze: stale column stats are removed and newly collected column
+      // stats are added.
+      sql(s"INSERT INTO TABLE $table SELECT 2, 'b', 20.0")
+      sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS c1, c3")
       val fetchedStats3 =
-        checkTableStats(textTable, hasSizeInBytes = true, expectedRowCounts = None)
-      assert(fetchedStats3.get.sizeInBytes > fetchedStats2.get.sizeInBytes)
+        checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(2)).get
+      assert(fetchedStats3.colStats == Map(
+        "c1" -> ColumnStat(distinctCount = 2, min = Some(1), max = Some(2), nullCount = 0,
+          avgLen = 4, maxLen = 4),
+        "c3" -> ColumnStat(distinctCount = 2, min = Some(10.0), max = Some(20.0), nullCount = 0,
+          avgLen = 8, maxLen = 8)))
     }
   }
 
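Outside of the checkTableStats helper, the same expectation can be inspected through catalyst's catalog metadata. A sketch using internal, unstable APIs (illustrative only), assuming a SparkSession named spark and the state of update_col_stats_table right after the ANALYZE of c2 above, i.e. before the second INSERT:

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.plans.logical.ColumnStat

// Fetch the table metadata as stored in the session catalog.
val meta = spark.sessionState.catalog.getTableMetadata(TableIdentifier("update_col_stats_table"))

// CatalogStatistics carries sizeInBytes, rowCount and per-column stats.
val colStats: Map[String, ColumnStat] = meta.stats.map(_.colStats).getOrElse(Map.empty)

// Because the table was not modified between the two ANALYZE commands,
// the stats for c1 are kept alongside the newly collected stats for c2.
assert(colStats.keySet == Set("c1", "c2"))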
