@@ -21,8 +21,9 @@ import scala.util.Random
2121
2222import org .scalatest .FunSuite
2323
24- import org .apache .spark .mllib .linalg .Vectors
24+ import org .apache .spark .mllib .linalg .{ Vector , Vectors }
2525import org .apache .spark .mllib .util .{LocalClusterSparkContext , LocalSparkContext }
26+ import org .apache .spark .mllib .util .TestingUtils ._
2627
2728class KMeansSuite extends FunSuite with LocalSparkContext {
2829
@@ -41,26 +42,26 @@ class KMeansSuite extends FunSuite with LocalSparkContext {
4142 // centered at the mean of the points
4243
4344 var model = KMeans .train(data, k = 1 , maxIterations = 1 )
44- assert(model.clusterCenters.head === center)
45+ assert(model.clusterCenters.head ~== center absTol 1E-5 )
4546
4647 model = KMeans .train(data, k = 1 , maxIterations = 2 )
47- assert(model.clusterCenters.head === center)
48+ assert(model.clusterCenters.head ~== center absTol 1E-5 )
4849
4950 model = KMeans .train(data, k = 1 , maxIterations = 5 )
50- assert(model.clusterCenters.head === center)
51+ assert(model.clusterCenters.head ~== center absTol 1E-5 )
5152
5253 model = KMeans .train(data, k = 1 , maxIterations = 1 , runs = 5 )
53- assert(model.clusterCenters.head === center)
54+ assert(model.clusterCenters.head ~== center absTol 1E-5 )
5455
5556 model = KMeans .train(data, k = 1 , maxIterations = 1 , runs = 5 )
56- assert(model.clusterCenters.head === center)
57+ assert(model.clusterCenters.head ~== center absTol 1E-5 )
5758
5859 model = KMeans .train(data, k = 1 , maxIterations = 1 , runs = 1 , initializationMode = RANDOM )
59- assert(model.clusterCenters.head === center)
60+ assert(model.clusterCenters.head ~== center absTol 1E-5 )
6061
6162 model = KMeans .train(
6263 data, k = 1 , maxIterations = 1 , runs = 1 , initializationMode = K_MEANS_PARALLEL )
63- assert(model.clusterCenters.head === center)
64+ assert(model.clusterCenters.head ~== center absTol 1E-5 )
6465 }
6566
6667 test(" no distinct points" ) {
@@ -104,26 +105,26 @@ class KMeansSuite extends FunSuite with LocalSparkContext {
104105
105106 var model = KMeans .train(data, k = 1 , maxIterations = 1 )
106107 assert(model.clusterCenters.size === 1 )
107- assert(model.clusterCenters.head === center)
108+ assert(model.clusterCenters.head ~== center absTol 1E-5 )
108109
109110 model = KMeans .train(data, k = 1 , maxIterations = 2 )
110- assert(model.clusterCenters.head === center)
111+ assert(model.clusterCenters.head ~== center absTol 1E-5 )
111112
112113 model = KMeans .train(data, k = 1 , maxIterations = 5 )
113- assert(model.clusterCenters.head === center)
114+ assert(model.clusterCenters.head ~== center absTol 1E-5 )
114115
115116 model = KMeans .train(data, k = 1 , maxIterations = 1 , runs = 5 )
116- assert(model.clusterCenters.head === center)
117+ assert(model.clusterCenters.head ~== center absTol 1E-5 )
117118
118119 model = KMeans .train(data, k = 1 , maxIterations = 1 , runs = 5 )
119- assert(model.clusterCenters.head === center)
120+ assert(model.clusterCenters.head ~== center absTol 1E-5 )
120121
121122 model = KMeans .train(data, k = 1 , maxIterations = 1 , runs = 1 , initializationMode = RANDOM )
122- assert(model.clusterCenters.head === center)
123+ assert(model.clusterCenters.head ~== center absTol 1E-5 )
123124
124125 model = KMeans .train(data, k = 1 , maxIterations = 1 , runs = 1 ,
125126 initializationMode = K_MEANS_PARALLEL )
126- assert(model.clusterCenters.head === center)
127+ assert(model.clusterCenters.head ~== center absTol 1E-5 )
127128 }
128129
129130 test(" single cluster with sparse data" ) {
@@ -149,31 +150,39 @@ class KMeansSuite extends FunSuite with LocalSparkContext {
149150 val center = Vectors .sparse(n, Seq ((0 , 1.0 ), (1 , 3.0 ), (2 , 4.0 )))
150151
151152 var model = KMeans .train(data, k = 1 , maxIterations = 1 )
152- assert(model.clusterCenters.head === center)
153+ assert(model.clusterCenters.head ~== center absTol 1E-5 )
153154
154155 model = KMeans .train(data, k = 1 , maxIterations = 2 )
155- assert(model.clusterCenters.head === center)
156+ assert(model.clusterCenters.head ~== center absTol 1E-5 )
156157
157158 model = KMeans .train(data, k = 1 , maxIterations = 5 )
158- assert(model.clusterCenters.head === center)
159+ assert(model.clusterCenters.head ~== center absTol 1E-5 )
159160
160161 model = KMeans .train(data, k = 1 , maxIterations = 1 , runs = 5 )
161- assert(model.clusterCenters.head === center)
162+ assert(model.clusterCenters.head ~== center absTol 1E-5 )
162163
163164 model = KMeans .train(data, k = 1 , maxIterations = 1 , runs = 5 )
164- assert(model.clusterCenters.head === center)
165+ assert(model.clusterCenters.head ~== center absTol 1E-5 )
165166
166167 model = KMeans .train(data, k = 1 , maxIterations = 1 , runs = 1 , initializationMode = RANDOM )
167- assert(model.clusterCenters.head === center)
168+ assert(model.clusterCenters.head ~== center absTol 1E-5 )
168169
169170 model = KMeans .train(data, k = 1 , maxIterations = 1 , runs = 1 ,
170171 initializationMode = K_MEANS_PARALLEL )
171- assert(model.clusterCenters.head === center)
172+ assert(model.clusterCenters.head ~== center absTol 1E-5 )
172173
173174 data.unpersist()
174175 }
175176
176177 test(" k-means|| initialization" ) {
178+
/**
 * Wraps a vector so that sequences of vectors can be sorted by squared Euclidean norm,
 * largest norm first. Used to line up two collections of vectors before comparing them
 * element-wise with a tolerance.
 *
 * Unlike a comparison that only ever answers -1 or 1, this returns 0 for equal norms, so
 * `a.compare(b)` and `b.compare(a)` are always consistent, as `Ordered` requires.
 *
 * NOTE(review): vectors with identical norms compare as equal here, so their relative
 * order after sorting is whatever the (stable) sort preserves from the input.
 *
 * @param x the wrapped vector
 */
case class VectorWithCompare(x: Vector) extends Ordered[VectorWithCompare] {
  // Sum of squared components, computed once per wrapper instead of on every comparison.
  private val squaredNorm: Double = x.toArray.foldLeft(0.0)((acc, v) => acc + v * v)

  /** Orders by descending squared norm: negative when `this` has the larger norm. */
  override def compare(that: VectorWithCompare): Int =
    java.lang.Double.compare(that.squaredNorm, this.squaredNorm)
}
185+
177186 val points = Seq (
178187 Vectors .dense(1.0 , 2.0 , 6.0 ),
179188 Vectors .dense(1.0 , 3.0 , 0.0 ),
@@ -188,15 +197,19 @@ class KMeansSuite extends FunSuite with LocalSparkContext {
188197 // unselected point as long as it hasn't yet selected all of them
189198
190199 var model = KMeans .train(rdd, k = 5 , maxIterations = 1 )
191- assert(Set (model.clusterCenters: _* ) === Set (points : _* ))
200+
201+ assert(model.clusterCenters.sortBy(VectorWithCompare (_))
202+ .zip(points.sortBy(VectorWithCompare (_))).forall(x => x._1 ~== (x._2) absTol 1E-5 ))
192203
193204 // Iterations of Lloyd's should not change the answer either
194205 model = KMeans .train(rdd, k = 5 , maxIterations = 10 )
195- assert(Set (model.clusterCenters: _* ) === Set (points : _* ))
206+ assert(model.clusterCenters.sortBy(VectorWithCompare (_))
207+ .zip(points.sortBy(VectorWithCompare (_))).forall(x => x._1 ~== (x._2) absTol 1E-5 ))
196208
197209 // Neither should more runs
198210 model = KMeans .train(rdd, k = 5 , maxIterations = 10 , runs = 5 )
199- assert(Set (model.clusterCenters: _* ) === Set (points : _* ))
211+ assert(model.clusterCenters.sortBy(VectorWithCompare (_))
212+ .zip(points.sortBy(VectorWithCompare (_))).forall(x => x._1 ~== (x._2) absTol 1E-5 ))
200213 }
201214
202215 test(" two clusters" ) {
0 commit comments