
Commit c7b56a4

[SPARK-32480] Support insert overwrite to move data to trash

1 parent a8b5688

File tree: 6 files changed, +35 −18 lines

core/src/main/scala/org/apache/spark/util/Utils.scala

Lines changed: 1 addition & 1 deletion

@@ -270,7 +270,7 @@ private[spark] object Utils extends Logging {
   }
 
   /**
-   * Move data to trash if 'spark.sql.truncate.trash.enabled' is true
+   * Move data to trash if 'spark.sql.trash.enabled' is true
    */
   def moveToTrashIfEnabled(
       fs: FileSystem,
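
The body of moveToTrashIfEnabled falls outside the hunk shown, but the call sites later in this commit pass four arguments and fall back to a permanent delete on failure. A minimal sketch under those assumptions (every parameter after fs is a guess, not the committed code):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path, Trash}

// Sketch only, reconstructed from the call sites in this diff.
// Tries the Hadoop trash first when enabled; otherwise, or when the
// move fails (e.g. fs.trash.interval <= 0), deletes the path permanently.
def moveToTrashIfEnabled(
    fs: FileSystem,
    path: Path,
    isTrashEnabled: Boolean,
    hadoopConf: Configuration): Unit = {
  if (!isTrashEnabled || !Trash.moveToAppropriateTrash(fs, path, hadoopConf)) {
    fs.delete(path, true)
  }
}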

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 8 additions & 8 deletions

@@ -2722,13 +2722,13 @@ object SQLConf {
     .booleanConf
     .createWithDefault(false)
 
-  val TRUNCATE_TRASH_ENABLED =
-    buildConf("spark.sql.truncate.trash.enabled")
-      .doc("This configuration decides when truncating table, whether data files will be moved " +
-        "to trash directory or deleted permanently. The trash retention time is controlled by " +
-        "fs.trash.interval, and in default, the server side configuration value takes " +
-        "precedence over the client-side one. Note that if fs.trash.interval is non-positive, " +
-        "this will be a no-op and log a warning message.")
+  val TRASH_ENABLED =
+    buildConf("spark.sql.trash.enabled")
+      .doc("This configuration decides whether, on truncate table or insert overwrite, data " +
+        "files are moved to the trash directory or deleted permanently. The trash retention " +
+        "time is controlled by fs.trash.interval, and by default the server-side " +
+        "configuration value takes precedence over the client-side one. Note that if " +
+        "fs.trash.interval is non-positive, this will be a no-op and log a warning message.")
       .version("3.1.0")
       .booleanConf
       .createWithDefault(false)

@@ -3345,7 +3345,7 @@ class SQLConf extends Serializable with Logging {
 
   def legacyPathOptionBehavior: Boolean = getConf(SQLConf.LEGACY_PATH_OPTION_BEHAVIOR)
 
-  def truncateTrashEnabled: Boolean = getConf(SQLConf.TRUNCATE_TRASH_ENABLED)
+  def trashEnabled: Boolean = getConf(SQLConf.TRASH_ENABLED)
 
   /** ********************** SQLConf functionality methods ************ */
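
For reference, the renamed flag is a plain session setting; an illustrative snippet (the spark session and the tab1 table are made up for the example):

// Illustrative usage of the renamed config key from the hunk above.
spark.conf.set("spark.sql.trash.enabled", "true")
spark.sql("TRUNCATE TABLE tab1")                  // data files move to trash
spark.sql("INSERT OVERWRITE TABLE tab1 SELECT 1") // overwritten files move too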

sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala

Lines changed: 1 addition & 1 deletion

@@ -490,7 +490,7 @@ case class TruncateTableCommand(
     }
     val hadoopConf = spark.sessionState.newHadoopConf()
     val ignorePermissionAcl = SQLConf.get.truncateTableIgnorePermissionAcl
-    val isTrashEnabled = SQLConf.get.truncateTrashEnabled
+    val isTrashEnabled = SQLConf.get.trashEnabled
     locations.foreach { location =>
       if (location.isDefined) {
         val path = new Path(location.get)
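
Below this hunk, isTrashEnabled is presumably threaded into the Utils.moveToTrashIfEnabled helper renamed earlier; a hedged sketch of how the loop likely continues (only the renamed line above is in the commit, the rest is reconstruction from the helper's call sites):

// Hedged reconstruction, not the committed code: each table or
// partition location is trashed or deleted depending on the flag.
locations.foreach { location =>
  if (location.isDefined) {
    val path = new Path(location.get)
    val fs = path.getFileSystem(hadoopConf)
    Utils.moveToTrashIfEnabled(fs, path, isTrashEnabled, hadoopConf)
  }
}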

sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala

Lines changed: 3 additions & 3 deletions

@@ -3105,7 +3105,7 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils {
   test("SPARK-32481 Move data to trash on truncate table if enabled") {
     val trashIntervalKey = "fs.trash.interval"
     withTable("tab1") {
-      withSQLConf(SQLConf.TRUNCATE_TRASH_ENABLED.key -> "true") {
+      withSQLConf(SQLConf.TRASH_ENABLED.key -> "true") {
         sql("CREATE TABLE tab1 (col INT) USING parquet")
         sql("INSERT INTO tab1 SELECT 1")
         // scalastyle:off hadoopconfiguration

@@ -3133,7 +3133,7 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils {
   test("SPARK-32481 delete data permanently on truncate table if trash interval is non-positive") {
     val trashIntervalKey = "fs.trash.interval"
     withTable("tab1") {
-      withSQLConf(SQLConf.TRUNCATE_TRASH_ENABLED.key -> "true") {
+      withSQLConf(SQLConf.TRASH_ENABLED.key -> "true") {
         sql("CREATE TABLE tab1 (col INT) USING parquet")
         sql("INSERT INTO tab1 SELECT 1")
         // scalastyle:off hadoopconfiguration

@@ -3159,7 +3159,7 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils {
 
   test("SPARK-32481 Do not move data to trash on truncate table if disabled") {
     withTable("tab1") {
-      withSQLConf(SQLConf.TRUNCATE_TRASH_ENABLED.key -> "false") {
+      withSQLConf(SQLConf.TRASH_ENABLED.key -> "false") {
         sql("CREATE TABLE tab1 (col INT) USING parquet")
         sql("INSERT INTO tab1 SELECT 1")
         val hadoopConf = spark.sessionState.newHadoopConf()
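
These tests assert against the Hadoop trash location; a plausible shape for that check, assuming Hadoop's default trash layout under the user's home directory (not the committed test body; hadoopConf comes from the context line above):

// Plausible assertion shape, assuming the standard <home>/.Trash root.
val fs = new Path("/").getFileSystem(hadoopConf)
val trashRoot = new Path(fs.getHomeDirectory, ".Trash")
assert(fs.exists(trashRoot)) // truncated files should land under here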

sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveDirCommand.scala

Lines changed: 5 additions & 1 deletion

@@ -31,6 +31,7 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.sql.hive.client.HiveClientImpl
 import org.apache.spark.sql.util.SchemaUtils
+import org.apache.spark.util.Utils
 
 /**
  * Command for writing the results of `query` to file system.

@@ -108,8 +109,11 @@ case class InsertIntoHiveDirCommand(
       outputLocation = tmpPath.toString)
 
     if (overwrite && fs.exists(writeToPath)) {
+      val isTrashEnabled = sparkSession.sessionState.conf.trashEnabled
       fs.listStatus(writeToPath).foreach { existFile =>
-        if (Option(existFile.getPath) != createdTempDir) fs.delete(existFile.getPath, true)
+        if (Option(existFile.getPath) != createdTempDir) {
+          Utils.moveToTrashIfEnabled(fs, existFile.getPath, isTrashEnabled, hadoopConf)
+        }
       }
     }
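
This code path serves directory-targeted overwrites; an illustrative query that reaches it (the output path is made up):

// Illustrative only: with the flag on, files already present under the
// target directory are moved to trash rather than deleted outright.
spark.conf.set("spark.sql.trash.enabled", "true")
spark.sql(
  """INSERT OVERWRITE DIRECTORY '/tmp/demo_out'
    |STORED AS parquet
    |SELECT 1 AS col""".stripMargin)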

sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala

Lines changed: 17 additions & 4 deletions

@@ -20,7 +20,7 @@ package org.apache.spark.sql.hive.execution
 import java.util.Locale
 
 import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.Path
+import org.apache.hadoop.fs.{Path, Trash}
 import org.apache.hadoop.hive.ql.ErrorMsg
 import org.apache.hadoop.hive.ql.plan.TableDesc

@@ -309,9 +309,22 @@ case class InsertIntoHiveTable(
         partitionPath.foreach { path =>
           val fs = path.getFileSystem(hadoopConf)
           if (fs.exists(path)) {
-            if (!fs.delete(path, true)) {
-              throw new RuntimeException(
-                s"Cannot remove partition directory '$path'")
+            val isTrashEnabled = sparkSession.sessionState.conf.trashEnabled
+            if (!isTrashEnabled) {
+              if (!fs.delete(path, true)) {
+                throw new RuntimeException(
+                  s"Cannot remove partition directory '$path'")
+              }
+            } else {
+              logDebug(s"will move data $path to trash")
+              val isSuccess = Trash.moveToAppropriateTrash(fs, path, hadoopConf)
+              if (!isSuccess) {
+                logWarning(s"Failed to move data $path to trash")
+                if (!fs.delete(path, true)) {
+                  throw new RuntimeException(
+                    s"Cannot remove partition directory '$path'")
+                }
+              }
+            }
           }
           // Don't let Hive do overwrite operation since it is slower.
           doHiveOverwrite = false
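
One operational note, per the config's doc string: the trash move only takes effect when fs.trash.interval is positive; otherwise the hunk above logs a warning and falls back to a permanent delete. A hedged setup sketch (part_tab is a made-up partitioned table):

// Illustrative session setup; fs.trash.interval is the retention in minutes.
spark.sparkContext.hadoopConfiguration.setInt("fs.trash.interval", 1440)
spark.conf.set("spark.sql.trash.enabled", "true")
spark.sql("INSERT OVERWRITE TABLE part_tab PARTITION (p = 1) SELECT 42")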
