Skip to content

Commit c24b6b6

Browse files
viiryacloud-fan
authored and committed
[SPARK-11753][SQL][TEST-HADOOP2.2] Make allowNonNumericNumbers option work
## What changes were proposed in this pull request? Jackson suppprts `allowNonNumericNumbers` option to parse non-standard non-numeric numbers such as "NaN", "Infinity", "INF". Currently used Jackson version (2.5.3) doesn't support it all. This patch upgrades the library and make the two ignored tests in `JsonParsingOptionsSuite` passed. ## How was this patch tested? `JsonParsingOptionsSuite`. Author: Liang-Chi Hsieh <[email protected]> Author: Liang-Chi Hsieh <[email protected]> Closes #9759 from viirya/fix-json-nonnumric.
1 parent 6075f5b commit c24b6b6

File tree

10 files changed

+102
-53
lines changed

10 files changed

+102
-53
lines changed

dev/deps/spark-deps-hadoop-2.2

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -72,12 +72,13 @@ hk2-utils-2.4.0-b34.jar
7272
httpclient-4.5.2.jar
7373
httpcore-4.4.4.jar
7474
ivy-2.4.0.jar
75-
jackson-annotations-2.5.3.jar
76-
jackson-core-2.5.3.jar
75+
jackson-annotations-2.7.3.jar
76+
jackson-core-2.7.3.jar
7777
jackson-core-asl-1.9.13.jar
78-
jackson-databind-2.5.3.jar
78+
jackson-databind-2.7.3.jar
7979
jackson-mapper-asl-1.9.13.jar
80-
jackson-module-scala_2.11-2.5.3.jar
80+
jackson-module-paranamer-2.7.3.jar
81+
jackson-module-scala_2.11-2.7.3.jar
8182
janino-2.7.8.jar
8283
javassist-3.18.1-GA.jar
8384
javax.annotation-api-1.2.jar
@@ -127,7 +128,7 @@ objenesis-2.1.jar
127128
opencsv-2.3.jar
128129
oro-2.0.8.jar
129130
osgi-resource-locator-1.0.1.jar
130-
paranamer-2.6.jar
131+
paranamer-2.8.jar
131132
parquet-column-1.7.0.jar
132133
parquet-common-1.7.0.jar
133134
parquet-encoding-1.7.0.jar

dev/deps/spark-deps-hadoop-2.3

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -74,12 +74,13 @@ hk2-utils-2.4.0-b34.jar
7474
httpclient-4.5.2.jar
7575
httpcore-4.4.4.jar
7676
ivy-2.4.0.jar
77-
jackson-annotations-2.5.3.jar
78-
jackson-core-2.5.3.jar
77+
jackson-annotations-2.7.3.jar
78+
jackson-core-2.7.3.jar
7979
jackson-core-asl-1.9.13.jar
80-
jackson-databind-2.5.3.jar
80+
jackson-databind-2.7.3.jar
8181
jackson-mapper-asl-1.9.13.jar
82-
jackson-module-scala_2.11-2.5.3.jar
82+
jackson-module-paranamer-2.7.3.jar
83+
jackson-module-scala_2.11-2.7.3.jar
8384
janino-2.7.8.jar
8485
java-xmlbuilder-1.0.jar
8586
javassist-3.18.1-GA.jar
@@ -134,7 +135,7 @@ objenesis-2.1.jar
134135
opencsv-2.3.jar
135136
oro-2.0.8.jar
136137
osgi-resource-locator-1.0.1.jar
137-
paranamer-2.6.jar
138+
paranamer-2.8.jar
138139
parquet-column-1.7.0.jar
139140
parquet-common-1.7.0.jar
140141
parquet-encoding-1.7.0.jar

dev/deps/spark-deps-hadoop-2.4

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -74,12 +74,13 @@ hk2-utils-2.4.0-b34.jar
7474
httpclient-4.5.2.jar
7575
httpcore-4.4.4.jar
7676
ivy-2.4.0.jar
77-
jackson-annotations-2.5.3.jar
78-
jackson-core-2.5.3.jar
77+
jackson-annotations-2.7.3.jar
78+
jackson-core-2.7.3.jar
7979
jackson-core-asl-1.9.13.jar
80-
jackson-databind-2.5.3.jar
80+
jackson-databind-2.7.3.jar
8181
jackson-mapper-asl-1.9.13.jar
82-
jackson-module-scala_2.11-2.5.3.jar
82+
jackson-module-paranamer-2.7.3.jar
83+
jackson-module-scala_2.11-2.7.3.jar
8384
janino-2.7.8.jar
8485
java-xmlbuilder-1.0.jar
8586
javassist-3.18.1-GA.jar
@@ -134,7 +135,7 @@ objenesis-2.1.jar
134135
opencsv-2.3.jar
135136
oro-2.0.8.jar
136137
osgi-resource-locator-1.0.1.jar
137-
paranamer-2.6.jar
138+
paranamer-2.8.jar
138139
parquet-column-1.7.0.jar
139140
parquet-common-1.7.0.jar
140141
parquet-encoding-1.7.0.jar

dev/deps/spark-deps-hadoop-2.6

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -80,13 +80,14 @@ htrace-core-3.0.4.jar
8080
httpclient-4.5.2.jar
8181
httpcore-4.4.4.jar
8282
ivy-2.4.0.jar
83-
jackson-annotations-2.5.3.jar
84-
jackson-core-2.5.3.jar
83+
jackson-annotations-2.7.3.jar
84+
jackson-core-2.7.3.jar
8585
jackson-core-asl-1.9.13.jar
86-
jackson-databind-2.5.3.jar
86+
jackson-databind-2.7.3.jar
8787
jackson-jaxrs-1.9.13.jar
8888
jackson-mapper-asl-1.9.13.jar
89-
jackson-module-scala_2.11-2.5.3.jar
89+
jackson-module-paranamer-2.7.3.jar
90+
jackson-module-scala_2.11-2.7.3.jar
9091
jackson-xc-1.9.13.jar
9192
janino-2.7.8.jar
9293
java-xmlbuilder-1.0.jar
@@ -142,7 +143,7 @@ objenesis-2.1.jar
142143
opencsv-2.3.jar
143144
oro-2.0.8.jar
144145
osgi-resource-locator-1.0.1.jar
145-
paranamer-2.6.jar
146+
paranamer-2.8.jar
146147
parquet-column-1.7.0.jar
147148
parquet-common-1.7.0.jar
148149
parquet-encoding-1.7.0.jar

dev/deps/spark-deps-hadoop-2.7

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -80,13 +80,14 @@ htrace-core-3.1.0-incubating.jar
8080
httpclient-4.5.2.jar
8181
httpcore-4.4.4.jar
8282
ivy-2.4.0.jar
83-
jackson-annotations-2.5.3.jar
84-
jackson-core-2.5.3.jar
83+
jackson-annotations-2.7.3.jar
84+
jackson-core-2.7.3.jar
8585
jackson-core-asl-1.9.13.jar
86-
jackson-databind-2.5.3.jar
86+
jackson-databind-2.7.3.jar
8787
jackson-jaxrs-1.9.13.jar
8888
jackson-mapper-asl-1.9.13.jar
89-
jackson-module-scala_2.11-2.5.3.jar
89+
jackson-module-paranamer-2.7.3.jar
90+
jackson-module-scala_2.11-2.7.3.jar
9091
jackson-xc-1.9.13.jar
9192
janino-2.7.8.jar
9293
java-xmlbuilder-1.0.jar
@@ -143,7 +144,7 @@ objenesis-2.1.jar
143144
opencsv-2.3.jar
144145
oro-2.0.8.jar
145146
osgi-resource-locator-1.0.1.jar
146-
paranamer-2.6.jar
147+
paranamer-2.8.jar
147148
parquet-column-1.7.0.jar
148149
parquet-common-1.7.0.jar
149150
parquet-encoding-1.7.0.jar

pom.xml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@
160160
<jline.version>${scala.version}</jline.version>
161161
<jline.groupid>org.scala-lang</jline.groupid>
162162
<codehaus.jackson.version>1.9.13</codehaus.jackson.version>
163-
<fasterxml.jackson.version>2.5.3</fasterxml.jackson.version>
163+
<fasterxml.jackson.version>2.7.3</fasterxml.jackson.version>
164164
<snappy.version>1.1.2.4</snappy.version>
165165
<netlib.java.version>1.1.2</netlib.java.version>
166166
<calcite.version>1.2.0-incubating</calcite.version>
@@ -180,6 +180,7 @@
180180
<antlr4.version>4.5.2-1</antlr4.version>
181181
<jpam.version>1.1</jpam.version>
182182
<selenium.version>2.52.0</selenium.version>
183+
<paranamer.version>2.8</paranamer.version>
183184

184185
<test.java.home>${java.home}</test.java.home>
185186
<test.exclude.tags></test.exclude.tags>
@@ -1825,6 +1826,11 @@
18251826
<artifactId>antlr4-runtime</artifactId>
18261827
<version>${antlr4.version}</version>
18271828
</dependency>
1829+
<dependency>
1830+
<groupId>com.thoughtworks.paranamer</groupId>
1831+
<artifactId>paranamer</artifactId>
1832+
<version>${paranamer.version}</version>
1833+
</dependency>
18281834
</dependencies>
18291835
</dependencyManagement>
18301836

python/pyspark/sql/readwriter.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,9 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
193193
set, it uses the default value, ``true``.
194194
:param allowNumericLeadingZero: allows leading zeros in numbers (e.g. 00012). If None is
195195
set, it uses the default value, ``false``.
196+
:param allowNonNumericNumbers: allows using non-numeric numbers such as "NaN", "Infinity",
197+
"-Infinity", "INF", "-INF", which are convertd to floating
198+
point numbers, ``true``.
196199
:param allowBackslashEscapingAnyCharacter: allows accepting quoting of all character
197200
using backslash quoting mechanism. If None is
198201
set, it uses the default value, ``false``.

sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
293293
* </li>
294294
* <li>`allowNumericLeadingZeros` (default `false`): allows leading zeros in numbers
295295
* (e.g. 00012)</li>
296+
* <li>`allowNonNumericNumbers` (default `true`): allows using non-numeric numbers such as "NaN",
297+
* "Infinity", "-Infinity", "INF", "-INF", which are convertd to floating point numbers.</li>
296298
* <li>`allowBackslashEscapingAnyCharacter` (default `false`): allows accepting quoting of all
297299
* character using backslash quoting mechanism</li>
298300
* <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -129,13 +129,15 @@ object JacksonParser extends Logging {
129129
case (VALUE_STRING, FloatType) =>
130130
// Special case handling for NaN and Infinity.
131131
val value = parser.getText
132-
val lowerCaseValue = value.toLowerCase()
133-
if (lowerCaseValue.equals("nan") ||
134-
lowerCaseValue.equals("infinity") ||
135-
lowerCaseValue.equals("-infinity") ||
136-
lowerCaseValue.equals("inf") ||
137-
lowerCaseValue.equals("-inf")) {
132+
if (value.equals("NaN") ||
133+
value.equals("Infinity") ||
134+
value.equals("+Infinity") ||
135+
value.equals("-Infinity")) {
138136
value.toFloat
137+
} else if (value.equals("+INF") || value.equals("INF")) {
138+
Float.PositiveInfinity
139+
} else if (value.equals("-INF")) {
140+
Float.NegativeInfinity
139141
} else {
140142
throw new SparkSQLJsonProcessingException(s"Cannot parse $value as FloatType.")
141143
}
@@ -146,13 +148,15 @@ object JacksonParser extends Logging {
146148
case (VALUE_STRING, DoubleType) =>
147149
// Special case handling for NaN and Infinity.
148150
val value = parser.getText
149-
val lowerCaseValue = value.toLowerCase()
150-
if (lowerCaseValue.equals("nan") ||
151-
lowerCaseValue.equals("infinity") ||
152-
lowerCaseValue.equals("-infinity") ||
153-
lowerCaseValue.equals("inf") ||
154-
lowerCaseValue.equals("-inf")) {
151+
if (value.equals("NaN") ||
152+
value.equals("Infinity") ||
153+
value.equals("+Infinity") ||
154+
value.equals("-Infinity")) {
155155
value.toDouble
156+
} else if (value.equals("+INF") || value.equals("INF")) {
157+
Double.PositiveInfinity
158+
} else if (value.equals("-INF")) {
159+
Double.NegativeInfinity
156160
} else {
157161
throw new SparkSQLJsonProcessingException(s"Cannot parse $value as DoubleType.")
158162
}

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala

Lines changed: 44 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.datasources.json
1919

2020
import org.apache.spark.sql.QueryTest
2121
import org.apache.spark.sql.test.SharedSQLContext
22+
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
2223

2324
/**
2425
* Test cases for various [[JSONOptions]].
@@ -93,23 +94,51 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext {
9394
assert(df.first().getLong(0) == 18)
9495
}
9596

96-
// The following two tests are not really working - need to look into Jackson's
97-
// JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS.
98-
ignore("allowNonNumericNumbers off") {
99-
val str = """{"age": NaN}"""
100-
val rdd = spark.sparkContext.parallelize(Seq(str))
101-
val df = spark.read.json(rdd)
102-
103-
assert(df.schema.head.name == "_corrupt_record")
97+
test("allowNonNumericNumbers off") {
98+
// non-quoted non-numeric numbers don't work if allowNonNumericNumbers is off.
99+
var testCases: Seq[String] = Seq("""{"age": NaN}""", """{"age": Infinity}""",
100+
"""{"age": +Infinity}""", """{"age": -Infinity}""", """{"age": INF}""",
101+
"""{"age": +INF}""", """{"age": -INF}""")
102+
testCases.foreach { str =>
103+
val rdd = spark.sparkContext.parallelize(Seq(str))
104+
val df = spark.read.option("allowNonNumericNumbers", "false").json(rdd)
105+
106+
assert(df.schema.head.name == "_corrupt_record")
107+
}
108+
109+
// quoted non-numeric numbers should still work even allowNonNumericNumbers is off.
110+
testCases = Seq("""{"age": "NaN"}""", """{"age": "Infinity"}""", """{"age": "+Infinity"}""",
111+
"""{"age": "-Infinity"}""", """{"age": "INF"}""", """{"age": "+INF"}""",
112+
"""{"age": "-INF"}""")
113+
val tests: Seq[Double => Boolean] = Seq(_.isNaN, _.isPosInfinity, _.isPosInfinity,
114+
_.isNegInfinity, _.isPosInfinity, _.isPosInfinity, _.isNegInfinity)
115+
val schema = StructType(StructField("age", DoubleType, true) :: Nil)
116+
117+
testCases.zipWithIndex.foreach { case (str, idx) =>
118+
val rdd = spark.sparkContext.parallelize(Seq(str))
119+
val df = spark.read.option("allowNonNumericNumbers", "false").schema(schema).json(rdd)
120+
121+
assert(df.schema.head.name == "age")
122+
assert(tests(idx)(df.first().getDouble(0)))
123+
}
104124
}
105125

106-
ignore("allowNonNumericNumbers on") {
107-
val str = """{"age": NaN}"""
108-
val rdd = spark.sparkContext.parallelize(Seq(str))
109-
val df = spark.read.option("allowNonNumericNumbers", "true").json(rdd)
110-
111-
assert(df.schema.head.name == "age")
112-
assert(df.first().getDouble(0).isNaN)
126+
test("allowNonNumericNumbers on") {
127+
val testCases: Seq[String] = Seq("""{"age": NaN}""", """{"age": Infinity}""",
128+
"""{"age": +Infinity}""", """{"age": -Infinity}""", """{"age": +INF}""",
129+
"""{"age": -INF}""", """{"age": "NaN"}""", """{"age": "Infinity"}""",
130+
"""{"age": "-Infinity"}""")
131+
val tests: Seq[Double => Boolean] = Seq(_.isNaN, _.isPosInfinity, _.isPosInfinity,
132+
_.isNegInfinity, _.isPosInfinity, _.isNegInfinity, _.isNaN, _.isPosInfinity,
133+
_.isNegInfinity, _.isPosInfinity, _.isNegInfinity)
134+
val schema = StructType(StructField("age", DoubleType, true) :: Nil)
135+
testCases.zipWithIndex.foreach { case (str, idx) =>
136+
val rdd = spark.sparkContext.parallelize(Seq(str))
137+
val df = spark.read.option("allowNonNumericNumbers", "true").schema(schema).json(rdd)
138+
139+
assert(df.schema.head.name == "age")
140+
assert(tests(idx)(df.first().getDouble(0)))
141+
}
113142
}
114143

115144
test("allowBackslashEscapingAnyCharacter off") {

0 commit comments

Comments
 (0)