Skip to content

Commit 2048c00

Browse files
committed
small update
1 parent a7c86bb commit 2048c00

File tree

1 file changed

+6
-2
lines changed

1 file changed

+6
-2
lines changed

mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -355,7 +355,7 @@ object Word2VecModel extends MLReadable[Word2VecModel] {
355355
sc: SparkContext,
356356
numWords: Int,
357357
vectorSize: Int): Int = {
358-
val floatSize = 4.0 // Use Double to help avoid overflow
358+
val floatSize = 4L // Use Long to help avoid overflow
359359
val averageWordSize = 15
360360
// [SPARK-11994] - We want to partition the model in partitions smaller than
361361
// spark.kryoserializer.buffer.max
@@ -365,7 +365,11 @@ object Word2VecModel extends MLReadable[Word2VecModel] {
365365
// Assuming an average word size of 15 bytes, the formula is:
366366
// (floatSize * vectorSize + 15) * numWords
367367
val approximateSizeInBytes = (floatSize * vectorSize + averageWordSize) * numWords
368-
((approximateSizeInBytes / bufferSizeInBytes) + 1).toInt
368+
val numPartitions = (approximateSizeInBytes / bufferSizeInBytes) + 1
369+
require(numPartitions < 10e8, s"Word2VecModel calculated that it needs $numPartitions " +
370+
s"partitions to save this model, which is too large. Try increasing " +
371+
s"spark.kryoserializer.buffer.max so that Word2VecModel can use fewer partitions.")
372+
numPartitions.toInt
369373
}
370374
}
371375

0 commit comments

Comments
 (0)