17 | 17 |
18 | 18 | package org.apache.spark |
19 | 19 |
20 | | -import java.io.{File, FileWriter} |
| 20 | +import java.io._ |
| 21 | +import java.util.zip.GZIPOutputStream |
21 | 22 |
22 | 23 | import org.apache.spark.deploy.SparkHadoopUtil |
23 | 24 | import org.apache.spark.input.PortableDataStream |
@@ -540,4 +541,63 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { |
540 | 541 | }.collect() |
541 | 542 | assert(inputPaths.toSet === Set(s"$outDir/part-00000", s"$outDir/part-00001")) |
542 | 543 | } |
| 544 | + |
| 545 | + test("spark.files.ignoreCorruptFiles should work both HadoopRDD and NewHadoopRDD") { |
| 546 | + val inputFile = File.createTempFile("input-", ".gz") |
| 547 | + try { |
| 548 | + // Create a corrupt gzip file |
| 549 | + val byteOutput = new ByteArrayOutputStream() |
| 550 | + val gzip = new GZIPOutputStream(byteOutput) |
| 551 | + try { |
| 552 | + gzip.write(Array[Byte](1, 2, 3, 4)) |
| 553 | + } finally { |
| 554 | + gzip.close() |
| 555 | + } |
| 556 | + val bytes = byteOutput.toByteArray |
| 557 | + val o = new FileOutputStream(inputFile) |
| 558 | + try { |
| 559 | + // It's corrupt since we only write half of the bytes into the file. |
| 560 | + o.write(bytes.take(bytes.length / 2)) |
| 561 | + } finally { |
| 562 | + o.close() |
| 563 | + } |
| 564 | + |
| 565 | + // A Spark job should ignore corrupt files by default |
| 566 | + sc = new SparkContext("local", "test") |
| 567 | + // Test HadoopRDD |
| 568 | + assert(sc.textFile(inputFile.toURI.toString).collect().isEmpty) |
| 569 | + // Test NewHadoopRDD |
| 570 | + assert { |
| 571 | + sc.newAPIHadoopFile( |
| 572 | + inputFile.toURI.toString, |
| 573 | + classOf[NewTextInputFormat], |
| 574 | + classOf[LongWritable], |
| 575 | + classOf[Text]).collect().isEmpty |
| 576 | + } |
| 577 | + sc.stop() |
| 578 | + |
| 579 | + // With spark.files.ignoreCorruptFiles disabled, reading a corrupt gzip file should throw EOFException |
| 580 | + val conf = new SparkConf().set("spark.files.ignoreCorruptFiles", "false") |
| 581 | + sc = new SparkContext("local", "test", conf) |
| 582 | + // Test HadoopRDD |
| 583 | + var e = intercept[SparkException] { |
| 584 | + sc.textFile(inputFile.toURI.toString).collect() |
| 585 | + } |
| 586 | + assert(e.getCause.isInstanceOf[EOFException]) |
| 587 | + assert(e.getCause.getMessage === "Unexpected end of input stream") |
| 588 | + // Test NewHadoopRDD |
| 589 | + e = intercept[SparkException] { |
| 590 | + sc.newAPIHadoopFile( |
| 591 | + inputFile.toURI.toString, |
| 592 | + classOf[NewTextInputFormat], |
| 593 | + classOf[LongWritable], |
| 594 | + classOf[Text]).collect() |
| 595 | + } |
| 596 | + assert(e.getCause.isInstanceOf[EOFException]) |
| 597 | + assert(e.getCause.getMessage === "Unexpected end of input stream") |
| 598 | + } finally { |
| 599 | + inputFile.delete() |
| 600 | + } |
| 601 | + } |
| 602 | + |
543 | 603 | } |
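
For context, and not part of the patch above: a minimal Scala sketch of how an application might toggle the spark.files.ignoreCorruptFiles setting that this test exercises. The configuration key and its observable behavior (corrupt files are skipped when the flag is enabled; reading fails with a SparkException caused by an EOFException when it is disabled) are taken from the test itself; the object name, app name, and input path below are hypothetical placeholders.

import org.apache.spark.{SparkConf, SparkContext}

object IgnoreCorruptFilesSketch {
  def main(args: Array[String]): Unit = {
    // Disable corrupt-file skipping so a truncated .gz input fails the job
    // instead of being silently skipped, mirroring the second half of the test.
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("ignore-corrupt-files-sketch") // hypothetical app name
      .set("spark.files.ignoreCorruptFiles", "false")
    val sc = new SparkContext(conf)
    try {
      // Hypothetical input path; with the flag set to "false", a corrupt gzip
      // file here surfaces as a SparkException wrapping an EOFException.
      println(sc.textFile("/path/to/input.gz").count())
    } finally {
      sc.stop()
    }
  }
}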