@@ -26,6 +26,7 @@ import scala.util.matching.Regex
2626import org .apache .spark .sql ._
2727import org .apache .spark .sql .catalyst .TableIdentifier
2828import org .apache .spark .sql .catalyst .catalog .{CatalogRelation , CatalogStatistics }
29+ import org .apache .spark .sql .catalyst .plans .logical .ColumnStat
2930import org .apache .spark .sql .catalyst .util .StringUtils
3031import org .apache .spark .sql .execution .command .DDLUtils
3132import org .apache .spark .sql .execution .datasources .LogicalRelation
@@ -210,27 +211,62 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto
210211 }
211212 }
212213
213- test(" test elimination of the influences of the old stats " ) {
214+ test(" keep existing row count in stats with noscan if table is not changed " ) {
214215 val textTable = " textTable"
215216 withTable(textTable) {
216- sql(s " CREATE TABLE $textTable (key STRING, value STRING) STORED AS TEXTFILE " )
217+ sql(s " CREATE TABLE $textTable (key STRING, value STRING) " )
217218 sql(s " INSERT INTO TABLE $textTable SELECT * FROM src " )
218219 sql(s " ANALYZE TABLE $textTable COMPUTE STATISTICS " )
219220 val fetchedStats1 =
220221 checkTableStats(textTable, hasSizeInBytes = true , expectedRowCounts = Some (500 ))
221222
222223 sql(s " ANALYZE TABLE $textTable COMPUTE STATISTICS noscan " )
223- // when the total size is not changed, the old row count is kept
224+ // when the table is not changed, total size is the same, and the old row count is kept
224225 val fetchedStats2 =
225226 checkTableStats(textTable, hasSizeInBytes = true , expectedRowCounts = Some (500 ))
226227 assert(fetchedStats1 == fetchedStats2)
228+ }
229+ }
227230
228- sql(s " INSERT INTO TABLE $textTable SELECT * FROM src " )
229- sql(s " ANALYZE TABLE $textTable COMPUTE STATISTICS noscan " )
230- // update total size and remove the old and invalid row count
231+ test(" keep existing column stats if table is not changed" ) {
232+ val table = " update_col_stats_table"
233+ withTable(table) {
234+ sql(s " CREATE TABLE $table (c1 INT, c2 STRING, c3 DOUBLE) " )
235+ sql(s " ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS c1 " )
236+ val fetchedStats0 =
237+ checkTableStats(table, hasSizeInBytes = true , expectedRowCounts = Some (0 ))
238+ assert(fetchedStats0.get.colStats == Map (" c1" -> ColumnStat (0 , None , None , 0 , 4 , 4 )))
239+
240+ // Insert new data and analyze: have the latest column stats.
241+ sql(s " INSERT INTO TABLE $table SELECT 1, 'a', 10.0 " )
242+ sql(s " ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS c1 " )
243+ val fetchedStats1 =
244+ checkTableStats(table, hasSizeInBytes = true , expectedRowCounts = Some (1 )).get
245+ assert(fetchedStats1.colStats == Map (
246+ " c1" -> ColumnStat (distinctCount = 1 , min = Some (1 ), max = Some (1 ), nullCount = 0 ,
247+ avgLen = 4 , maxLen = 4 )))
248+
249+ // Analyze another column: since the table is not changed, the precious column stats are kept.
250+ sql(s " ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS c2 " )
251+ val fetchedStats2 =
252+ checkTableStats(table, hasSizeInBytes = true , expectedRowCounts = Some (1 )).get
253+ assert(fetchedStats2.colStats == Map (
254+ " c1" -> ColumnStat (distinctCount = 1 , min = Some (1 ), max = Some (1 ), nullCount = 0 ,
255+ avgLen = 4 , maxLen = 4 ),
256+ " c2" -> ColumnStat (distinctCount = 1 , min = None , max = None , nullCount = 0 ,
257+ avgLen = 1 , maxLen = 1 )))
258+
259+ // Insert new data and analyze: stale column stats are removed and newly collected column
260+ // stats are added.
261+ sql(s " INSERT INTO TABLE $table SELECT 2, 'b', 20.0 " )
262+ sql(s " ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS c1, c3 " )
231263 val fetchedStats3 =
232- checkTableStats(textTable, hasSizeInBytes = true , expectedRowCounts = None )
233- assert(fetchedStats3.get.sizeInBytes > fetchedStats2.get.sizeInBytes)
264+ checkTableStats(table, hasSizeInBytes = true , expectedRowCounts = Some (2 )).get
265+ assert(fetchedStats3.colStats == Map (
266+ " c1" -> ColumnStat (distinctCount = 2 , min = Some (1 ), max = Some (2 ), nullCount = 0 ,
267+ avgLen = 4 , maxLen = 4 ),
268+ " c3" -> ColumnStat (distinctCount = 2 , min = Some (10.0 ), max = Some (20.0 ), nullCount = 0 ,
269+ avgLen = 8 , maxLen = 8 )))
234270 }
235271 }
236272
0 commit comments