Skip to content

Commit 195d428

Browse files
Author: wangzhenhua (committed)
Commit message: "fix some conversion logic"
Commit 195d428 (1 parent: 1a5069d)

File tree

6 files changed

+36
-33
lines changed

6 files changed

+36
-33
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,9 @@ case class Statistics(
7676
*
7777
* 1. Supported data types are defined in `ColumnStat.supportsType`.
7878
* 2. The JVM data type stored in min/max is the internal data type for the corresponding
79-
* Catalyst data type. For example, the internal type of DateType is Int, and that the internal
80-
* type of TimestampType is Long.
81-
* 3. For integral types, they are all upcasted to Longs, i.e. Shorts are stored as Longs.
82-
* For FloatType, Float is upcasted to Double.
83-
* 4. There is no guarantee that the statistics collected are accurate. Approximation algorithms
79+
* Catalyst data type. For example, the internal type of DateType is Int, and that the internal
80+
* type of TimestampType is Long.
81+
* 3. There is no guarantee that the statistics collected are accurate. Approximation algorithms
8482
* (sketches) might have been used, and the data collected can also be stale.
8583
*
8684
* @param distinctCount number of distinct values
@@ -131,12 +129,10 @@ case class ColumnStat(
131129
*/
132130
private def toExternalString(v: Any, colName: String, dataType: DataType): String = {
133131
val externalValue = dataType match {
134-
case BooleanType => v.asInstanceOf[Boolean]
135-
case _: IntegralType => v.toString.toLong
136-
case DateType => DateTimeUtils.toJavaDate(v.toString.toInt)
137-
case TimestampType => DateTimeUtils.toJavaTimestamp(v.toString.toLong)
138-
case FloatType | DoubleType => v.toString.toDouble
139-
case _: DecimalType => Decimal.fromDecimal(v).toJavaBigDecimal
132+
case DateType => DateTimeUtils.toJavaDate(v.asInstanceOf[Int])
133+
case TimestampType => DateTimeUtils.toJavaTimestamp(v.asInstanceOf[Long])
134+
case BooleanType | _: IntegralType | FloatType | DoubleType => v
135+
case _: DecimalType => v.asInstanceOf[Decimal].toJavaBigDecimal
140136
// This version of Spark does not use min/max for binary/string types so we ignore it.
141137
case _ =>
142138
throw new AnalysisException("Column statistics deserialization is not supported for " +
@@ -202,10 +198,14 @@ object ColumnStat extends Logging {
202198
private def fromExternalString(s: String, name: String, dataType: DataType): Any = {
203199
dataType match {
204200
case BooleanType => s.toBoolean
205-
case _: IntegralType => s.toLong
206-
case DateType => DateTimeUtils.fromJavaDate(java.sql.Date.valueOf(s)).toLong
201+
case DateType => DateTimeUtils.fromJavaDate(java.sql.Date.valueOf(s))
207202
case TimestampType => DateTimeUtils.fromJavaTimestamp(java.sql.Timestamp.valueOf(s))
208-
case FloatType | DoubleType => s.toDouble
203+
case ByteType => s.toByte
204+
case ShortType => s.toShort
205+
case IntegerType => s.toInt
206+
case LongType => s.toLong
207+
case FloatType => s.toFloat
208+
case DoubleType => s.toDouble
209209
case _: DecimalType => Decimal(s)
210210
// This version of Spark does not use min/max for binary/string types so we ignore it.
211211
case BinaryType | StringType => null

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -90,10 +90,16 @@ object EstimationUtils {
9090

9191
def fromDecimal(dec: Decimal, dataType: DataType): Any = {
9292
dataType match {
93-
case _: IntegralType | DateType | TimestampType => dec.toLong
94-
case FloatType | DoubleType => dec.toDouble
95-
case _: DecimalType => dec
9693
case BooleanType => dec.toLong == 1
94+
case DateType => dec.toInt
95+
case TimestampType => dec.toLong
96+
case ByteType => dec.toByte
97+
case ShortType => dec.toShort
98+
case IntegerType => dec.toInt
99+
case LongType => dec.toLong
100+
case FloatType => dec.toFloat
101+
case DoubleType => dec.toDouble
102+
case _: DecimalType => dec
97103
}
98104
}
99105

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/Range.scala

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,7 @@ trait Range {
2929
/** For simplicity we use decimal to unify operations of numeric ranges. */
3030
case class NumericRange(min: Decimal, max: Decimal) extends Range {
3131
override def contains(l: Literal): Boolean = {
32-
val lit = l.dataType match {
33-
case BooleanType => if (l.value.asInstanceOf[Boolean]) Decimal(1) else Decimal(0)
34-
case _ => Decimal(l.value.toString)
35-
}
32+
val lit = EstimationUtils.toDecimal(l.value, l.dataType)
3633
min <= lit && max >= lit
3734
}
3835
}

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/JoinEstimationSuite.scala

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -262,17 +262,17 @@ class JoinEstimationSuite extends StatsEstimationTestBase {
262262
AttributeReference("cbool", BooleanType)() -> ColumnStat(distinctCount = 1,
263263
min = Some(false), max = Some(false), nullCount = 0, avgLen = 1, maxLen = 1),
264264
AttributeReference("cbyte", ByteType)() -> ColumnStat(distinctCount = 1,
265-
min = Some(1L), max = Some(1L), nullCount = 0, avgLen = 1, maxLen = 1),
265+
min = Some(1.toByte), max = Some(1.toByte), nullCount = 0, avgLen = 1, maxLen = 1),
266266
AttributeReference("cshort", ShortType)() -> ColumnStat(distinctCount = 1,
267-
min = Some(1L), max = Some(1L), nullCount = 0, avgLen = 2, maxLen = 2),
267+
min = Some(1.toShort), max = Some(1.toShort), nullCount = 0, avgLen = 2, maxLen = 2),
268268
AttributeReference("cint", IntegerType)() -> ColumnStat(distinctCount = 1,
269-
min = Some(1L), max = Some(1L), nullCount = 0, avgLen = 4, maxLen = 4),
269+
min = Some(1), max = Some(1), nullCount = 0, avgLen = 4, maxLen = 4),
270270
AttributeReference("clong", LongType)() -> ColumnStat(distinctCount = 1,
271271
min = Some(1L), max = Some(1L), nullCount = 0, avgLen = 8, maxLen = 8),
272272
AttributeReference("cdouble", DoubleType)() -> ColumnStat(distinctCount = 1,
273273
min = Some(1.0), max = Some(1.0), nullCount = 0, avgLen = 8, maxLen = 8),
274274
AttributeReference("cfloat", FloatType)() -> ColumnStat(distinctCount = 1,
275-
min = Some(1.0), max = Some(1.0), nullCount = 0, avgLen = 4, maxLen = 4),
275+
min = Some(1.0f), max = Some(1.0f), nullCount = 0, avgLen = 4, maxLen = 4),
276276
AttributeReference("cdec", DecimalType.SYSTEM_DEFAULT)() -> ColumnStat(distinctCount = 1,
277277
min = Some(dec), max = Some(dec), nullCount = 0, avgLen = 16, maxLen = 16),
278278
AttributeReference("cstring", StringType)() -> ColumnStat(distinctCount = 1,

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/ProjectEstimationSuite.scala

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -74,17 +74,17 @@ class ProjectEstimationSuite extends StatsEstimationTestBase {
7474
AttributeReference("cbool", BooleanType)() -> ColumnStat(distinctCount = 2,
7575
min = Some(false), max = Some(true), nullCount = 0, avgLen = 1, maxLen = 1),
7676
AttributeReference("cbyte", ByteType)() -> ColumnStat(distinctCount = 2,
77-
min = Some(1L), max = Some(2L), nullCount = 0, avgLen = 1, maxLen = 1),
77+
min = Some(1.toByte), max = Some(2.toByte), nullCount = 0, avgLen = 1, maxLen = 1),
7878
AttributeReference("cshort", ShortType)() -> ColumnStat(distinctCount = 2,
79-
min = Some(1L), max = Some(3L), nullCount = 0, avgLen = 2, maxLen = 2),
79+
min = Some(1.toShort), max = Some(3.toShort), nullCount = 0, avgLen = 2, maxLen = 2),
8080
AttributeReference("cint", IntegerType)() -> ColumnStat(distinctCount = 2,
81-
min = Some(1L), max = Some(4L), nullCount = 0, avgLen = 4, maxLen = 4),
81+
min = Some(1), max = Some(4), nullCount = 0, avgLen = 4, maxLen = 4),
8282
AttributeReference("clong", LongType)() -> ColumnStat(distinctCount = 2,
8383
min = Some(1L), max = Some(5L), nullCount = 0, avgLen = 8, maxLen = 8),
8484
AttributeReference("cdouble", DoubleType)() -> ColumnStat(distinctCount = 2,
8585
min = Some(1.0), max = Some(6.0), nullCount = 0, avgLen = 8, maxLen = 8),
8686
AttributeReference("cfloat", FloatType)() -> ColumnStat(distinctCount = 2,
87-
min = Some(1.0), max = Some(7.0), nullCount = 0, avgLen = 4, maxLen = 4),
87+
min = Some(1.0f), max = Some(7.0f), nullCount = 0, avgLen = 4, maxLen = 4),
8888
AttributeReference("cdecimal", DecimalType.SYSTEM_DEFAULT)() -> ColumnStat(distinctCount = 2,
8989
min = Some(dec1), max = Some(dec2), nullCount = 0, avgLen = 16, maxLen = 16),
9090
AttributeReference("cstring", StringType)() -> ColumnStat(distinctCount = 2,

sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -202,12 +202,12 @@ abstract class StatisticsCollectionTestBase extends QueryTest with SQLTestUtils
202202
/** A mapping from column to the stats collected. */
203203
protected val stats = mutable.LinkedHashMap(
204204
"cbool" -> ColumnStat(2, Some(false), Some(true), 1, 1, 1),
205-
"cbyte" -> ColumnStat(2, Some(1L), Some(2L), 1, 1, 1),
206-
"cshort" -> ColumnStat(2, Some(1L), Some(3L), 1, 2, 2),
207-
"cint" -> ColumnStat(2, Some(1L), Some(4L), 1, 4, 4),
205+
"cbyte" -> ColumnStat(2, Some(1.toByte), Some(2.toByte), 1, 1, 1),
206+
"cshort" -> ColumnStat(2, Some(1.toShort), Some(3.toShort), 1, 2, 2),
207+
"cint" -> ColumnStat(2, Some(1), Some(4), 1, 4, 4),
208208
"clong" -> ColumnStat(2, Some(1L), Some(5L), 1, 8, 8),
209209
"cdouble" -> ColumnStat(2, Some(1.0), Some(6.0), 1, 8, 8),
210-
"cfloat" -> ColumnStat(2, Some(1.0), Some(7.0), 1, 4, 4),
210+
"cfloat" -> ColumnStat(2, Some(1.0f), Some(7.0f), 1, 4, 4),
211211
"cdecimal" -> ColumnStat(2, Some(Decimal(dec1)), Some(Decimal(dec2)), 1, 16, 16),
212212
"cstring" -> ColumnStat(2, None, None, 1, 3, 3),
213213
"cbinary" -> ColumnStat(2, None, None, 1, 3, 3),

0 commit comments

Comments (0)