Commit d386cdf (merged, 2 parents: 5439468 + d9ad789)

76 files changed: +1074, -434 lines

R/pkg/vignettes/sparkr-vignettes.Rmd

Lines changed: 35 additions & 15 deletions
@@ -503,6 +503,8 @@ SparkR supports the following machine learning models and algorithms.
 
 #### Tree - Classification and Regression
 
+* Decision Tree
+
 * Gradient-Boosted Trees (GBT)
 
 * Random Forest
@@ -776,16 +778,32 @@ newDF <- createDataFrame(data.frame(x = c(1.5, 3.2)))
 head(predict(isoregModel, newDF))
 ```
 
+#### Decision Tree
+
+`spark.decisionTree` fits a [decision tree](https://en.wikipedia.org/wiki/Decision_tree_learning) classification or regression model on a `SparkDataFrame`.
+Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models.
+
+We use the `Titanic` dataset to train a decision tree and make predictions:
+
+```{r}
+t <- as.data.frame(Titanic)
+df <- createDataFrame(t)
+dtModel <- spark.decisionTree(df, Survived ~ ., type = "classification", maxDepth = 2)
+summary(dtModel)
+predictions <- predict(dtModel, df)
+```
+
 #### Gradient-Boosted Trees
 
 `spark.gbt` fits a [gradient-boosted tree](https://en.wikipedia.org/wiki/Gradient_boosting) classification or regression model on a `SparkDataFrame`.
 Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models.
 
-We use the `longley` dataset to train a gradient-boosted tree and make predictions:
+We use the `Titanic` dataset to train a gradient-boosted tree and make predictions:
 
-```{r, warning=FALSE}
-df <- createDataFrame(longley)
-gbtModel <- spark.gbt(df, Employed ~ ., type = "regression", maxDepth = 2, maxIter = 2)
+```{r}
+t <- as.data.frame(Titanic)
+df <- createDataFrame(t)
+gbtModel <- spark.gbt(df, Survived ~ ., type = "classification", maxDepth = 2, maxIter = 2)
 summary(gbtModel)
 predictions <- predict(gbtModel, df)
 ```
@@ -795,11 +813,12 @@ predictions <- predict(gbtModel, df)
 `spark.randomForest` fits a [random forest](https://en.wikipedia.org/wiki/Random_forest) classification or regression model on a `SparkDataFrame`.
 Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models.
 
-In the following example, we use the `longley` dataset to train a random forest and make predictions:
+In the following example, we use the `Titanic` dataset to train a random forest and make predictions:
 
-```{r, warning=FALSE}
-df <- createDataFrame(longley)
-rfModel <- spark.randomForest(df, Employed ~ ., type = "regression", maxDepth = 2, numTrees = 2)
+```{r}
+t <- as.data.frame(Titanic)
+df <- createDataFrame(t)
+rfModel <- spark.randomForest(df, Survived ~ ., type = "classification", maxDepth = 2, numTrees = 2)
 summary(rfModel)
 predictions <- predict(rfModel, df)
 ```
@@ -965,17 +984,18 @@ Given a `SparkDataFrame`, the test compares continuous data in a given column `t
 specified by parameter `nullHypothesis`.
 Users can call `summary` to get a summary of the test results.
 
-In the following example, we test whether the `longley` dataset's `Armed_Forces` column
+In the following example, we test whether the `Titanic` dataset's `Freq` column
 follows a normal distribution. We set the parameters of the normal distribution using
 the mean and standard deviation of the sample.
 
-```{r, warning=FALSE}
-df <- createDataFrame(longley)
-afStats <- head(select(df, mean(df$Armed_Forces), sd(df$Armed_Forces)))
-afMean <- afStats[1]
-afStd <- afStats[2]
+```{r}
+t <- as.data.frame(Titanic)
+df <- createDataFrame(t)
+freqStats <- head(select(df, mean(df$Freq), sd(df$Freq)))
+freqMean <- freqStats[1]
+freqStd <- freqStats[2]
 
-test <- spark.kstest(df, "Armed_Forces", "norm", c(afMean, afStd))
+test <- spark.kstest(df, "Freq", "norm", c(freqMean, freqStd))
 testSummary <- summary(test)
 testSummary
 ```

common/network-common/src/main/java/org/apache/spark/network/server/OneForOneStreamManager.java

Lines changed: 21 additions & 0 deletions
@@ -23,6 +23,8 @@
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.atomic.AtomicLong;
 
+import scala.Tuple2;
+
 import com.google.common.base.Preconditions;
 import io.netty.channel.Channel;
 import org.slf4j.Logger;
@@ -94,6 +96,25 @@ public ManagedBuffer getChunk(long streamId, int chunkIndex) {
     return nextChunk;
   }
 
+  @Override
+  public ManagedBuffer openStream(String streamChunkId) {
+    Tuple2<Long, Integer> streamIdAndChunkId = parseStreamChunkId(streamChunkId);
+    return getChunk(streamIdAndChunkId._1, streamIdAndChunkId._2);
+  }
+
+  public static String genStreamChunkId(long streamId, int chunkId) {
+    return String.format("%d_%d", streamId, chunkId);
+  }
+
+  public static Tuple2<Long, Integer> parseStreamChunkId(String streamChunkId) {
+    String[] array = streamChunkId.split("_");
+    assert array.length == 2:
+      "Stream id and chunk index should be specified when open stream for fetching block.";
+    long streamId = Long.valueOf(array[0]);
+    int chunkIndex = Integer.valueOf(array[1]);
+    return new Tuple2<>(streamId, chunkIndex);
+  }
+
   @Override
   public void connectionTerminated(Channel channel) {
     // Close all streams which have been associated with the channel.
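For orientation, a minimal sketch of how the new helpers compose: `genStreamChunkId` encodes a (streamId, chunkIndex) pair as the `streamChunkId` string that `openStream` later decodes via `parseStreamChunkId` to serve the chunk. The concrete values below are illustrative, not taken from the commit.

```java
import scala.Tuple2;

import org.apache.spark.network.server.OneForOneStreamManager;

class StreamChunkIdRoundTrip {
  public static void main(String[] args) {
    // Encode stream 123, chunk 4 as the id a client would pass to openStream().
    String streamChunkId = OneForOneStreamManager.genStreamChunkId(123L, 4); // "123_4"
    // Decode it back into the (streamId, chunkIndex) pair used by getChunk().
    Tuple2<Long, Integer> parsed = OneForOneStreamManager.parseStreamChunkId(streamChunkId);
    assert parsed._1 == 123L && parsed._2 == 4;
  }
}
```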

common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleClient.java

Lines changed: 5 additions & 2 deletions
@@ -17,6 +17,7 @@
 
 package org.apache.spark.network.shuffle;
 
+import java.io.File;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.List;
@@ -86,14 +87,16 @@ public void fetchBlocks(
       int port,
       String execId,
       String[] blockIds,
-      BlockFetchingListener listener) {
+      BlockFetchingListener listener,
+      File[] shuffleFiles) {
     checkInit();
     logger.debug("External shuffle fetch from {}:{} (executor id {})", host, port, execId);
     try {
       RetryingBlockFetcher.BlockFetchStarter blockFetchStarter =
           (blockIds1, listener1) -> {
             TransportClient client = clientFactory.createClient(host, port);
-            new OneForOneBlockFetcher(client, appId, execId, blockIds1, listener1).start();
+            new OneForOneBlockFetcher(client, appId, execId, blockIds1, listener1, conf,
+              shuffleFiles).start();
           };
 
       int maxRetries = conf.maxIORetries();

common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java

Lines changed: 60 additions & 2 deletions
@@ -17,19 +17,28 @@
 
 package org.apache.spark.network.shuffle;
 
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.nio.channels.Channels;
+import java.nio.channels.WritableByteChannel;
 import java.util.Arrays;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.spark.network.buffer.FileSegmentManagedBuffer;
 import org.apache.spark.network.buffer.ManagedBuffer;
 import org.apache.spark.network.client.ChunkReceivedCallback;
 import org.apache.spark.network.client.RpcResponseCallback;
+import org.apache.spark.network.client.StreamCallback;
 import org.apache.spark.network.client.TransportClient;
+import org.apache.spark.network.server.OneForOneStreamManager;
 import org.apache.spark.network.shuffle.protocol.BlockTransferMessage;
 import org.apache.spark.network.shuffle.protocol.OpenBlocks;
 import org.apache.spark.network.shuffle.protocol.StreamHandle;
+import org.apache.spark.network.util.TransportConf;
 
 /**
  * Simple wrapper on top of a TransportClient which interprets each chunk as a whole block, and
@@ -48,6 +57,8 @@ public class OneForOneBlockFetcher {
   private final String[] blockIds;
   private final BlockFetchingListener listener;
   private final ChunkReceivedCallback chunkCallback;
+  private TransportConf transportConf = null;
+  private File[] shuffleFiles = null;
 
   private StreamHandle streamHandle = null;
 
@@ -56,12 +67,20 @@ public OneForOneBlockFetcher(
       String appId,
       String execId,
       String[] blockIds,
-      BlockFetchingListener listener) {
+      BlockFetchingListener listener,
+      TransportConf transportConf,
+      File[] shuffleFiles) {
     this.client = client;
     this.openMessage = new OpenBlocks(appId, execId, blockIds);
     this.blockIds = blockIds;
     this.listener = listener;
     this.chunkCallback = new ChunkCallback();
+    this.transportConf = transportConf;
+    if (shuffleFiles != null) {
+      this.shuffleFiles = shuffleFiles;
+      assert this.shuffleFiles.length == blockIds.length:
+        "Number of shuffle files should equal to blocks";
+    }
   }
 
   /** Callback invoked on receipt of each chunk. We equate a single chunk to a single block. */
@@ -100,7 +119,12 @@ public void onSuccess(ByteBuffer response) {
       // Immediately request all chunks -- we expect that the total size of the request is
       // reasonable due to higher level chunking in [[ShuffleBlockFetcherIterator]].
       for (int i = 0; i < streamHandle.numChunks; i++) {
-        client.fetchChunk(streamHandle.streamId, i, chunkCallback);
+        if (shuffleFiles != null) {
+          client.stream(OneForOneStreamManager.genStreamChunkId(streamHandle.streamId, i),
+            new DownloadCallback(shuffleFiles[i], i));
+        } else {
+          client.fetchChunk(streamHandle.streamId, i, chunkCallback);
+        }
       }
     } catch (Exception e) {
       logger.error("Failed while starting block fetches after success", e);
@@ -126,4 +150,38 @@ private void failRemainingBlocks(String[] failedBlockIds, Throwable e) {
       }
     }
   }
+
+  private class DownloadCallback implements StreamCallback {
+
+    private WritableByteChannel channel = null;
+    private File targetFile = null;
+    private int chunkIndex;
+
+    public DownloadCallback(File targetFile, int chunkIndex) throws IOException {
+      this.targetFile = targetFile;
+      this.channel = Channels.newChannel(new FileOutputStream(targetFile));
+      this.chunkIndex = chunkIndex;
+    }
+
+    @Override
+    public void onData(String streamId, ByteBuffer buf) throws IOException {
+      channel.write(buf);
+    }
+
+    @Override
+    public void onComplete(String streamId) throws IOException {
+      channel.close();
+      ManagedBuffer buffer = new FileSegmentManagedBuffer(transportConf, targetFile, 0,
+        targetFile.length());
+      listener.onBlockFetchSuccess(blockIds[chunkIndex], buffer);
+    }
+
+    @Override
+    public void onFailure(String streamId, Throwable cause) throws IOException {
+      channel.close();
+      // On receipt of a failure, fail every block from chunkIndex onwards.
+      String[] remainingBlockIds = Arrays.copyOfRange(blockIds, chunkIndex, blockIds.length);
+      failRemainingBlocks(remainingBlockIds, cause);
+    }
+  }
 }
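Taken together, the fetcher now has two modes: passing `null` for `shuffleFiles` preserves the original in-memory `fetchChunk` path, while passing one target `File` per block streams each chunk to disk through `DownloadCallback` and hands the listener a `FileSegmentManagedBuffer`. A minimal sketch of the two call shapes, assuming a caller already has a connected `TransportClient`, a listener, and a `TransportConf` (the helper class and temp-file names below are illustrative, not part of the commit):

```java
package org.apache.spark.network.shuffle;

import java.io.File;
import java.io.IOException;

import org.apache.spark.network.client.TransportClient;
import org.apache.spark.network.util.TransportConf;

class FetchModeSketch {
  static void fetch(TransportClient client, String appId, String execId,
      String[] blockIds, BlockFetchingListener listener, TransportConf conf,
      boolean streamToDisk) throws IOException {
    if (!streamToDisk) {
      // In-memory mode (previous behaviour): chunks arrive through ChunkReceivedCallback.
      new OneForOneBlockFetcher(client, appId, execId, blockIds, listener, conf, null).start();
      return;
    }
    // On-disk mode: one target file per block id; each chunk is streamed into its file
    // by DownloadCallback and surfaced to the listener as a FileSegmentManagedBuffer.
    File[] shuffleFiles = new File[blockIds.length];
    for (int i = 0; i < blockIds.length; i++) {
      shuffleFiles[i] = File.createTempFile("shuffle-block-" + i + "-", ".data");
    }
    new OneForOneBlockFetcher(client, appId, execId, blockIds, listener, conf, shuffleFiles)
      .start();
  }
}
```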

common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleClient.java

Lines changed: 3 additions & 1 deletion
@@ -18,6 +18,7 @@
 package org.apache.spark.network.shuffle;
 
 import java.io.Closeable;
+import java.io.File;
 
 /** Provides an interface for reading shuffle files, either from an Executor or external service. */
 public abstract class ShuffleClient implements Closeable {
@@ -40,5 +41,6 @@ public abstract void fetchBlocks(
       int port,
       String execId,
       String[] blockIds,
-      BlockFetchingListener listener);
+      BlockFetchingListener listener,
+      File[] shuffleFiles);
 }

common/network-shuffle/src/test/java/org/apache/spark/network/sasl/SaslIntegrationSuite.java

Lines changed: 1 addition & 1 deletion
@@ -204,7 +204,7 @@ public void onBlockFetchFailure(String blockId, Throwable t) {
 
     String[] blockIds = { "shuffle_2_3_4", "shuffle_6_7_8" };
     OneForOneBlockFetcher fetcher =
-      new OneForOneBlockFetcher(client1, "app-2", "0", blockIds, listener);
+      new OneForOneBlockFetcher(client1, "app-2", "0", blockIds, listener, conf, null);
     fetcher.start();
     blockFetchLatch.await();
     checkSecurityException(exception.get());

common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleIntegrationSuite.java

Lines changed: 1 addition & 1 deletion
@@ -158,7 +158,7 @@ public void onBlockFetchFailure(String blockId, Throwable exception) {
           }
         }
       }
-    });
+    }, null);
 
     if (!requestsRemaining.tryAcquire(blockIds.length, 5, TimeUnit.SECONDS)) {
       fail("Timeout getting response from the server");

common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/OneForOneBlockFetcherSuite.java

Lines changed: 6 additions & 1 deletion
@@ -46,8 +46,13 @@
 import org.apache.spark.network.shuffle.protocol.BlockTransferMessage;
 import org.apache.spark.network.shuffle.protocol.OpenBlocks;
 import org.apache.spark.network.shuffle.protocol.StreamHandle;
+import org.apache.spark.network.util.MapConfigProvider;
+import org.apache.spark.network.util.TransportConf;
 
 public class OneForOneBlockFetcherSuite {
+
+  private static final TransportConf conf = new TransportConf("shuffle", MapConfigProvider.EMPTY);
+
   @Test
   public void testFetchOne() {
     LinkedHashMap<String, ManagedBuffer> blocks = Maps.newLinkedHashMap();
@@ -126,7 +131,7 @@ private static BlockFetchingListener fetchBlocks(LinkedHashMap<String, ManagedBu
     BlockFetchingListener listener = mock(BlockFetchingListener.class);
     String[] blockIds = blocks.keySet().toArray(new String[blocks.size()]);
     OneForOneBlockFetcher fetcher =
-      new OneForOneBlockFetcher(client, "app-id", "exec-id", blockIds, listener);
+      new OneForOneBlockFetcher(client, "app-id", "exec-id", blockIds, listener, conf, null);
 
     // Respond to the "OpenBlocks" message with an appropriate ShuffleStreamHandle with streamId 123
     doAnswer(invocationOnMock -> {

core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java

Lines changed: 9 additions & 0 deletions
@@ -19,6 +19,7 @@
 
 import javax.annotation.concurrent.GuardedBy;
 import java.io.IOException;
+import java.nio.channels.ClosedByInterruptException;
 import java.util.Arrays;
 import java.util.ArrayList;
 import java.util.BitSet;
@@ -184,6 +185,10 @@ public long acquireExecutionMemory(long required, MemoryConsumer consumer) {
             break;
           }
         }
+      } catch (ClosedByInterruptException e) {
+        // This called by user to kill a task (e.g: speculative task).
+        logger.error("error while calling spill() on " + c, e);
+        throw new RuntimeException(e.getMessage());
       } catch (IOException e) {
        logger.error("error while calling spill() on " + c, e);
        throw new OutOfMemoryError("error while calling spill() on " + c + " : "
@@ -201,6 +206,10 @@ public long acquireExecutionMemory(long required, MemoryConsumer consumer) {
         Utils.bytesToString(released), consumer);
       got += memoryManager.acquireExecutionMemory(required - got, taskAttemptId, mode);
     }
+    } catch (ClosedByInterruptException e) {
+      // This called by user to kill a task (e.g: speculative task).
+      logger.error("error while calling spill() on " + consumer, e);
+      throw new RuntimeException(e.getMessage());
     } catch (IOException e) {
       logger.error("error while calling spill() on " + consumer, e);
       throw new OutOfMemoryError("error while calling spill() on " + consumer + " : "
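Worth noting: `ClosedByInterruptException` is itself an `IOException`, so the new catch block has to come before the existing `IOException` handler for a task kill (for example a cancelled speculative attempt) to surface as a plain `RuntimeException` instead of being escalated to `OutOfMemoryError`. A minimal sketch of that catch ordering, detached from the memory-manager internals (the `Spillable` interface below is illustrative, standing in for `MemoryConsumer.spill()`):

```java
import java.io.IOException;
import java.nio.channels.ClosedByInterruptException;

class SpillErrorHandlingSketch {
  interface Spillable {
    long spill() throws IOException; // stand-in for MemoryConsumer.spill()
  }

  static long spillOrFail(Spillable consumer) {
    try {
      return consumer.spill();
    } catch (ClosedByInterruptException e) {
      // The task was interrupted/killed; report a plain runtime failure, not OOM.
      throw new RuntimeException(e.getMessage());
    } catch (IOException e) {
      // A genuine spill failure keeps the original escalation to OutOfMemoryError.
      throw new OutOfMemoryError("error while calling spill(): " + e.getMessage());
    }
  }
}
```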

core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java

Lines changed: 6 additions & 9 deletions
@@ -422,17 +422,14 @@ private long[] mergeSpillsWithTransferTo(SpillInfo[] spills, File outputFile) th
     for (int partition = 0; partition < numPartitions; partition++) {
       for (int i = 0; i < spills.length; i++) {
         final long partitionLengthInSpill = spills[i].partitionLengths[partition];
-        long bytesToTransfer = partitionLengthInSpill;
         final FileChannel spillInputChannel = spillInputChannels[i];
         final long writeStartTime = System.nanoTime();
-        while (bytesToTransfer > 0) {
-          final long actualBytesTransferred = spillInputChannel.transferTo(
-            spillInputChannelPositions[i],
-            bytesToTransfer,
-            mergedFileOutputChannel);
-          spillInputChannelPositions[i] += actualBytesTransferred;
-          bytesToTransfer -= actualBytesTransferred;
-        }
+        Utils.copyFileStreamNIO(
+          spillInputChannel,
+          mergedFileOutputChannel,
+          spillInputChannelPositions[i],
+          partitionLengthInSpill);
+        spillInputChannelPositions[i] += partitionLengthInSpill;
         writeMetrics.incWriteTime(System.nanoTime() - writeStartTime);
         bytesWrittenToMergedFile += partitionLengthInSpill;
         partitionLengths[partition] += partitionLengthInSpill;
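The removed `transferTo` loop is presumably what the shared `Utils.copyFileStreamNIO` helper now encapsulates, so the same retry-until-complete copy can be reused elsewhere. A minimal sketch of such a helper, assuming roughly the signature the call site above implies (this is an assumption, not the actual Spark implementation):

```java
import java.io.IOException;
import java.nio.channels.FileChannel;
import java.nio.channels.WritableByteChannel;

// Hypothetical stand-in for the shared NIO copy helper the diff delegates to.
final class NioCopySketch {
  /** Copies exactly {@code length} bytes from {@code input}, starting at {@code startPosition}. */
  static void copyFileStreamNIO(
      FileChannel input,
      WritableByteChannel output,
      long startPosition,
      long length) throws IOException {
    long position = startPosition;
    long remaining = length;
    // transferTo may move fewer bytes than requested, so loop until the full
    // range has been copied -- the same pattern the inlined code used before.
    while (remaining > 0) {
      long transferred = input.transferTo(position, remaining, output);
      position += transferred;
      remaining -= transferred;
    }
  }
}
```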
