
Commit 619c0fa

Merge remote-tracking branch 'upstream/master' into pyspark-inputformats

Conflicts: project/SparkBuild.scala

2 parents: 1c8efbc + 792d908

242 files changed, +10345 −1934 lines changed


assembly/pom.xml

Lines changed: 115 additions & 1 deletion
@@ -30,6 +30,13 @@
   <name>Spark Project Assembly</name>
   <url>http://spark.incubator.apache.org/</url>

+  <properties>
+    <spark.jar>${project.build.directory}/scala-${scala.binary.version}/${project.artifactId}-${project.version}-hadoop${hadoop.version}.jar</spark.jar>
+    <deb.pkg.name>spark</deb.pkg.name>
+    <deb.install.path>/usr/share/spark</deb.install.path>
+    <deb.user>root</deb.user>
+  </properties>
+
   <repositories>
     <!-- A repository in the local filesystem for the Py4J JAR, which is not in Maven central -->
     <repository>
@@ -79,7 +86,7 @@
         <artifactId>maven-shade-plugin</artifactId>
         <configuration>
           <shadedArtifactAttached>false</shadedArtifactAttached>
-          <outputFile>${project.build.directory}/scala-${scala.binary.version}/${project.artifactId}-${project.version}-hadoop${hadoop.version}.jar</outputFile>
+          <outputFile>${spark.jar}</outputFile>
           <artifactSet>
             <includes>
               <include>*:*</include>
@@ -171,5 +178,112 @@
       </plugins>
     </build>
   </profile>
+  <profile>
+    <id>deb</id>
+    <build>
+      <plugins>
+        <plugin>
+          <groupId>org.codehaus.mojo</groupId>
+          <artifactId>buildnumber-maven-plugin</artifactId>
+          <version>1.1</version>
+          <executions>
+            <execution>
+              <phase>validate</phase>
+              <goals>
+                <goal>create</goal>
+              </goals>
+              <configuration>
+                <shortRevisionLength>8</shortRevisionLength>
+              </configuration>
+            </execution>
+          </executions>
+        </plugin>
+        <plugin>
+          <groupId>org.vafer</groupId>
+          <artifactId>jdeb</artifactId>
+          <version>0.11</version>
+          <executions>
+            <execution>
+              <phase>package</phase>
+              <goals>
+                <goal>jdeb</goal>
+              </goals>
+              <configuration>
+                <deb>${project.build.directory}/${deb.pkg.name}_${project.version}-${buildNumber}_all.deb</deb>
+                <attach>false</attach>
+                <compression>gzip</compression>
+                <dataSet>
+                  <data>
+                    <src>${spark.jar}</src>
+                    <type>file</type>
+                    <mapper>
+                      <type>perm</type>
+                      <user>${deb.user}</user>
+                      <group>${deb.user}</group>
+                      <prefix>${deb.install.path}/jars</prefix>
+                    </mapper>
+                  </data>
+                  <data>
+                    <src>${basedir}/src/deb/RELEASE</src>
+                    <type>file</type>
+                    <mapper>
+                      <type>perm</type>
+                      <user>${deb.user}</user>
+                      <group>${deb.user}</group>
+                      <prefix>${deb.install.path}</prefix>
+                    </mapper>
+                  </data>
+                  <data>
+                    <src>${basedir}/../conf</src>
+                    <type>directory</type>
+                    <mapper>
+                      <type>perm</type>
+                      <user>${deb.user}</user>
+                      <group>${deb.user}</group>
+                      <prefix>${deb.install.path}/conf</prefix>
+                      <filemode>744</filemode>
+                    </mapper>
+                  </data>
+                  <data>
+                    <src>${basedir}/../bin</src>
+                    <type>directory</type>
+                    <mapper>
+                      <type>perm</type>
+                      <user>${deb.user}</user>
+                      <group>${deb.user}</group>
+                      <prefix>${deb.install.path}/bin</prefix>
+                      <filemode>744</filemode>
+                    </mapper>
+                  </data>
+                  <data>
+                    <src>${basedir}/../sbin</src>
+                    <type>directory</type>
+                    <mapper>
+                      <type>perm</type>
+                      <user>${deb.user}</user>
+                      <group>${deb.user}</group>
+                      <prefix>${deb.install.path}/sbin</prefix>
+                      <filemode>744</filemode>
+                    </mapper>
+                  </data>
+                  <data>
+                    <src>${basedir}/../python</src>
+                    <type>directory</type>
+                    <mapper>
+                      <type>perm</type>
+                      <user>${deb.user}</user>
+                      <group>${deb.user}</group>
+                      <prefix>${deb.install.path}/python</prefix>
+                      <filemode>744</filemode>
+                    </mapper>
+                  </data>
+                </dataSet>
+              </configuration>
+            </execution>
+          </executions>
+        </plugin>
+      </plugins>
+    </build>
+  </profile>
   </profiles>
 </project>

assembly/src/deb/RELEASE

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+compute-classpath.sh uses the existence of this file to decide whether to put the assembly jar on the
+classpath or instead to use classfiles in the source tree.

bin/compute-classpath.sh

Lines changed: 2 additions & 0 deletions
@@ -39,6 +39,7 @@ if [ -f "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*-dep
   CLASSPATH="$CLASSPATH:$FWDIR/repl/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/mllib/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/bagel/target/scala-$SCALA_VERSION/classes"
+  CLASSPATH="$CLASSPATH:$FWDIR/graphx/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/streaming/target/scala-$SCALA_VERSION/classes"

   DEPS_ASSEMBLY_JAR=`ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*-deps.jar`
@@ -59,6 +60,7 @@ if [[ $SPARK_TESTING == 1 ]]; then
   CLASSPATH="$CLASSPATH:$FWDIR/repl/target/scala-$SCALA_VERSION/test-classes"
   CLASSPATH="$CLASSPATH:$FWDIR/mllib/target/scala-$SCALA_VERSION/test-classes"
   CLASSPATH="$CLASSPATH:$FWDIR/bagel/target/scala-$SCALA_VERSION/test-classes"
+  CLASSPATH="$CLASSPATH:$FWDIR/graphx/target/scala-$SCALA_VERSION/test-classes"
   CLASSPATH="$CLASSPATH:$FWDIR/streaming/target/scala-$SCALA_VERSION/test-classes"
 fi

bin/spark-class2.cmd

File mode changed: 100644 → 100755
Lines changed: 1 addition & 1 deletion
@@ -73,7 +73,7 @@ for %%d in ("%TOOLS_DIR%\target\scala-%SCALA_VERSION%\spark-tools*assembly*.jar"

 rem Compute classpath using external script
 set DONT_PRINT_CLASSPATH=1
-call "%FWDIR%sbin\compute-classpath.cmd"
+call "%FWDIR%bin\compute-classpath.cmd"
 set DONT_PRINT_CLASSPATH=0
 set CLASSPATH=%CLASSPATH%;%SPARK_TOOLS_JAR%

bin/spark-shell.cmd

File mode changed: 100644 → 100755
Lines changed: 2 additions & 2 deletions
@@ -18,6 +18,6 @@ rem limitations under the License.
 rem

 rem Find the path of sbin
-set SBIN=%~dp0..\sbin\
+set BIN=%~dp0..\bin\

-cmd /V /E /C %SBIN%spark-class2.cmd org.apache.spark.repl.Main %*
+cmd /V /E /C %BIN%spark-class2.cmd org.apache.spark.repl.Main %*

core/src/main/scala/org/apache/spark/Accumulators.scala

Lines changed: 32 additions & 10 deletions
@@ -17,17 +17,17 @@

 package org.apache.spark

-import java.io._
+import java.io.{ObjectInputStream, Serializable}

 import scala.collection.mutable.Map
 import scala.collection.generic.Growable
 import org.apache.spark.serializer.JavaSerializer

 /**
- * A datatype that can be accumulated, i.e. has an commutative and associative "add" operation,
+ * A data type that can be accumulated, ie has an commutative and associative "add" operation,
  * but where the result type, `R`, may be different from the element type being added, `T`.
  *
- * You must define how to add data, and how to merge two of these together. For some datatypes,
+ * You must define how to add data, and how to merge two of these together. For some data types,
  * such as a counter, these might be the same operation. In that case, you can use the simpler
  * [[org.apache.spark.Accumulator]]. They won't always be the same, though -- e.g., imagine you are
  * accumulating a set. You will add items to the set, and you will union two sets together.
@@ -45,7 +45,7 @@ class Accumulable[R, T] (
   val id = Accumulators.newId
   @transient private var value_ = initialValue // Current value on master
   val zero = param.zero(initialValue) // Zero value to be passed to workers
-  var deserialized = false
+  private var deserialized = false

   Accumulators.register(this, true)

@@ -127,7 +127,7 @@ class Accumulable[R, T] (

 /**
  * Helper object defining how to accumulate values of a particular type. An implicit
- * AccumulableParam needs to be available when you create Accumulables of a specific type.
+ * AccumulableParam needs to be available when you create [[Accumulable]]s of a specific type.
  *
  * @tparam R the full accumulated data (result type)
  * @tparam T partial data that can be added in
@@ -185,8 +185,30 @@ class GrowableAccumulableParam[R <% Growable[T] with TraversableOnce[T] with Ser
 }

 /**
- * A simpler value of [[org.apache.spark.Accumulable]] where the result type being accumulated is the same
- * as the types of elements being merged.
+ * A simpler value of [[Accumulable]] where the result type being accumulated is the same
+ * as the types of elements being merged, i.e. variables that are only "added" to through an
+ * associative operation and can therefore be efficiently supported in parallel. They can be used
+ * to implement counters (as in MapReduce) or sums. Spark natively supports accumulators of type
+ * `Int` and `Double`, and programmers can add support for new types.
+ *
+ * An accumulator is created from an initial value `v` by calling [[SparkContext#accumulator]].
+ * Tasks running on the cluster can then add to it using the [[Accumulable#+=]] operator.
+ * However, they cannot read its value. Only the driver program can read the accumulator's value,
+ * using its value method.
+ *
+ * The interpreter session below shows an accumulator being used to add up the elements of an array:
+ *
+ * {{{
+ * scala> val accum = sc.accumulator(0)
+ * accum: spark.Accumulator[Int] = 0
+ *
+ * scala> sc.parallelize(Array(1, 2, 3, 4)).foreach(x => accum += x)
+ * ...
+ * 10/09/29 18:41:08 INFO SparkContext: Tasks finished in 0.317106 s
+ *
+ * scala> accum.value
+ * res2: Int = 10
+ * }}}
  *
  * @param initialValue initial value of accumulator
  * @param param helper object defining how to add elements of type `T`
@@ -196,9 +218,9 @@ class Accumulator[T](@transient initialValue: T, param: AccumulatorParam[T])
   extends Accumulable[T,T](initialValue, param)

 /**
- * A simpler version of [[org.apache.spark.AccumulableParam]] where the only datatype you can add in is the same type
- * as the accumulated value. An implicit AccumulatorParam object needs to be available when you create
- * Accumulators of a specific type.
+ * A simpler version of [[org.apache.spark.AccumulableParam]] where the only data type you can add
+ * in is the same type as the accumulated value. An implicit AccumulatorParam object needs to be
+ * available when you create Accumulators of a specific type.
  *
  * @tparam T type of value to accumulate
  */
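
The expanded scaladoc above notes that Spark natively supports Int and Double accumulators and that programmers can add support for new types by supplying an AccumulatorParam. Below is a minimal sketch of such a param, assuming the 0.9-era API in which AccumulatorParam only requires zero() and addInPlace(); the local-mode SparkContext setup is illustrative and not part of this commit.

import org.apache.spark.{AccumulatorParam, SparkConf, SparkContext}

// Accumulate fixed-length Array[Double] values by element-wise sum.
object ArraySumParam extends AccumulatorParam[Array[Double]] {
  def zero(initial: Array[Double]): Array[Double] = new Array[Double](initial.length)
  def addInPlace(a: Array[Double], b: Array[Double]): Array[Double] = {
    var i = 0
    while (i < a.length) { a(i) += b(i); i += 1 }
    a
  }
}

object ArrayAccumulatorSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("accum-sketch"))
    val acc = sc.accumulator(Array(0.0, 0.0))(ArraySumParam)
    // Tasks only add to the accumulator; the value is read back on the driver.
    sc.parallelize(Seq(Array(1.0, 2.0), Array(3.0, 4.0))).foreach(v => acc += v)
    println(acc.value.mkString(","))  // 4.0,6.0
    sc.stop()
  }
}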

core/src/main/scala/org/apache/spark/Aggregator.scala

Lines changed: 21 additions & 6 deletions
@@ -17,6 +17,8 @@

 package org.apache.spark

+import scala.{Option, deprecated}
+
 import org.apache.spark.util.collection.{AppendOnlyMap, ExternalAppendOnlyMap}

 /**
@@ -31,10 +33,14 @@ case class Aggregator[K, V, C] (
     mergeValue: (C, V) => C,
     mergeCombiners: (C, C) => C) {

-  private val sparkConf = SparkEnv.get.conf
-  private val externalSorting = sparkConf.getBoolean("spark.shuffle.externalSorting", true)
+  private val externalSorting = SparkEnv.get.conf.getBoolean("spark.shuffle.spill", true)
+
+  @deprecated("use combineValuesByKey with TaskContext argument", "0.9.0")
+  def combineValuesByKey(iter: Iterator[_ <: Product2[K, V]]): Iterator[(K, C)] =
+    combineValuesByKey(iter, null)

-  def combineValuesByKey(iter: Iterator[_ <: Product2[K, V]]) : Iterator[(K, C)] = {
+  def combineValuesByKey(iter: Iterator[_ <: Product2[K, V]],
+      context: TaskContext): Iterator[(K, C)] = {
     if (!externalSorting) {
       val combiners = new AppendOnlyMap[K,C]
       var kv: Product2[K, V] = null
@@ -47,17 +53,23 @@ case class Aggregator[K, V, C] (
       }
       combiners.iterator
     } else {
-      val combiners =
-        new ExternalAppendOnlyMap[K, V, C](createCombiner, mergeValue, mergeCombiners)
+      val combiners = new ExternalAppendOnlyMap[K, V, C](createCombiner, mergeValue, mergeCombiners)
       while (iter.hasNext) {
         val (k, v) = iter.next()
         combiners.insert(k, v)
       }
+      // TODO: Make this non optional in a future release
+      Option(context).foreach(c => c.taskMetrics.memoryBytesSpilled = combiners.memoryBytesSpilled)
+      Option(context).foreach(c => c.taskMetrics.diskBytesSpilled = combiners.diskBytesSpilled)
       combiners.iterator
     }
   }

-  def combineCombinersByKey(iter: Iterator[(K, C)]) : Iterator[(K, C)] = {
+  @deprecated("use combineCombinersByKey with TaskContext argument", "0.9.0")
+  def combineCombinersByKey(iter: Iterator[(K, C)]) : Iterator[(K, C)] =
+    combineCombinersByKey(iter, null)
+
+  def combineCombinersByKey(iter: Iterator[(K, C)], context: TaskContext) : Iterator[(K, C)] = {
     if (!externalSorting) {
       val combiners = new AppendOnlyMap[K,C]
       var kc: Product2[K, C] = null
@@ -75,6 +87,9 @@ case class Aggregator[K, V, C] (
        val (k, c) = iter.next()
        combiners.insert(k, c)
      }
+     // TODO: Make this non optional in a future release
+     Option(context).foreach(c => c.taskMetrics.memoryBytesSpilled = combiners.memoryBytesSpilled)
+     Option(context).foreach(c => c.taskMetrics.diskBytesSpilled = combiners.diskBytesSpilled)
      combiners.iterator
    }
  }
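
The new overloads above record spill metrics only when a TaskContext is actually supplied, while the deprecated overloads delegate with a null context; the Option(context).foreach guard is what makes that safe. A self-contained sketch of the same null-guard pattern, with hypothetical Metrics and Ctx stand-ins rather than Spark's own types:

// Hypothetical stand-ins used only to illustrate the Option(nullable) guard.
class Metrics { var memoryBytesSpilled: Long = 0L; var diskBytesSpilled: Long = 0L }
class Ctx { val taskMetrics = new Metrics }

object NullGuardSketch {
  def recordSpill(context: Ctx, mem: Long, disk: Long): Unit = {
    // Option(null) is None, so callers of the deprecated overloads (which pass
    // null) simply skip the metrics update instead of throwing an NPE.
    Option(context).foreach { c =>
      c.taskMetrics.memoryBytesSpilled = mem
      c.taskMetrics.diskBytesSpilled = disk
    }
  }

  def main(args: Array[String]): Unit = {
    val ctx = new Ctx
    recordSpill(ctx, mem = 1024L, disk = 0L)    // updates metrics
    recordSpill(null, mem = 2048L, disk = 0L)   // no-op, no NullPointerException
    println(ctx.taskMetrics.memoryBytesSpilled) // 1024
  }
}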

core/src/main/scala/org/apache/spark/FutureAction.scala

Lines changed: 4 additions & 4 deletions
@@ -27,8 +27,8 @@ import org.apache.spark.rdd.RDD


 /**
- * A future for the result of an action. This is an extension of the Scala Future interface to
- * support cancellation.
+ * A future for the result of an action to support cancellation. This is an extension of the
+ * Scala Future interface to support cancellation.
  */
 trait FutureAction[T] extends Future[T] {
   // Note that we redefine methods of the Future trait here explicitly so we can specify a different
@@ -86,7 +86,7 @@ trait FutureAction[T] extends Future[T] {


 /**
- * The future holding the result of an action that triggers a single job. Examples include
+ * A [[FutureAction]] holding the result of an action that triggers a single job. Examples include
  * count, collect, reduce.
  */
 class SimpleFutureAction[T] private[spark](jobWaiter: JobWaiter[_], resultFunc: => T)
@@ -150,7 +150,7 @@ class SimpleFutureAction[T] private[spark](jobWaiter: JobWaiter[_], resultFunc:


 /**
- * A FutureAction for actions that could trigger multiple Spark jobs. Examples include take,
+ * A [[FutureAction]] for actions that could trigger multiple Spark jobs. Examples include take,
  * takeSample. Cancellation works by setting the cancelled flag to true and interrupting the
  * action thread if it is being blocked by a job.
  */
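
A FutureAction is an ordinary Scala Future with cancel() layered on top, and the asynchronous RDD actions return one. A hedged sketch of how it might be used, assuming the 0.9-era implicit conversion to AsyncRDDActions imported via SparkContext._; the job and timeout here are illustrative only.

import scala.concurrent.Await
import scala.concurrent.duration._

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._  // async action implicits (assumed for this API era)

object FutureActionSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("future-action-sketch"))

    // countAsync returns a FutureAction[Long]: a Scala Future plus cancel().
    val pending = sc.parallelize(1 to 1000000).countAsync()

    // Use it like any Future...
    val n = Await.result(pending, 1.minute)
    println(s"count = $n")

    // ...or cancel a job whose result is no longer needed.
    val abandoned = sc.parallelize(1 to 1000000).map(_ * 2).countAsync()
    abandoned.cancel()

    sc.stop()
  }
}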

core/src/main/scala/org/apache/spark/InterruptibleIterator.scala

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ package org.apache.spark

 /**
  * An iterator that wraps around an existing iterator to provide task killing functionality.
- * It works by checking the interrupted flag in TaskContext.
+ * It works by checking the interrupted flag in [[TaskContext]].
  */
 class InterruptibleIterator[+T](val context: TaskContext, val delegate: Iterator[T])
   extends Iterator[T] {
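
The wrapper described above follows a simple pattern: delegate to an underlying iterator but check a kill flag on every hasNext call. A generic sketch of that pattern, with a plain function and InterruptedException standing in for Spark's TaskContext flag and exception type:

// Generic sketch of the "check an interrupt flag while iterating" pattern.
// The `interrupted` function is a stand-in for Spark's TaskContext flag.
class CancellableIterator[T](interrupted: () => Boolean, delegate: Iterator[T])
  extends Iterator[T] {

  def hasNext: Boolean = {
    // Fail fast once cancellation is requested instead of draining the delegate.
    if (interrupted()) throw new InterruptedException("task killed")
    delegate.hasNext
  }

  def next(): T = delegate.next()
}

object CancellableIteratorSketch {
  def main(args: Array[String]): Unit = {
    var stop = false
    val it = new CancellableIterator(() => stop, Iterator.from(1))
    println(it.take(3).toList)  // List(1, 2, 3)
    stop = true
    // Any further hasNext call on `it` now throws InterruptedException.
  }
}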
