Merge pull request #4176 from amueller/mean_shift_no_centers

amueller · amueller · commit 46293669e987 · 2015-02-07T13:26:07.000+01:00
[MRG + 1] Better error messages in MeanShift, slightly more robust to bad binning.
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -179,6 +179,9 @@ Enhancements
    - Allow the fitting and scoring of all clustering algorithms in
      :class:`pipeline.Pipeline`. By `Andreas Müller`_.
 
+   - More robust seeding and improved error messages in :class:`cluster.MeanShift`
+     by `Andreas Müller`_.
+
 Documentation improvements
 ..........................
 
diff --git a/sklearn/cluster/mean_shift_.py b/sklearn/cluster/mean_shift_.py
@@ -85,21 +85,22 @@ def mean_shift(X, bandwidth=None, seeds=None, bin_seeding=False,
         the number of samples. The sklearn.cluster.estimate_bandwidth function
         can be used to do this more efficiently.
 
-    seeds : array-like, shape=[n_seeds, n_features]
-        Point used as initial kernel locations.
+    seeds : array-like, shape=[n_seeds, n_features] or None
+        Points used as initial kernel locations. If None and bin_seeding=False,
+        each data point is used as a seed. If None and bin_seeding=True,
+        see bin_seeding.
 
-    bin_seeding : boolean
+    bin_seeding : boolean, default=False
         If true, initial kernel locations are not locations of all
         points, but rather the location of the discretized version of
         points, where points are binned onto a grid whose coarseness
         corresponds to the bandwidth. Setting this option to True will speed
         up the algorithm because fewer seeds will be initialized.
-        default value: False
         Ignored if seeds argument is not None.
 
-    min_bin_freq : int, optional
+    min_bin_freq : int, default=1
        To speed up the algorithm, accept only those bins with at least
-       min_bin_freq points as seeds. If not defined, set to 1.
+       min_bin_freq points as seeds.
 
     cluster_all : boolean, default True
         If true, then all points are clustered, even those orphans that are
@@ -133,6 +134,9 @@ def mean_shift(X, bandwidth=None, seeds=None, bin_seeding=False,
 
     if bandwidth is None:
         bandwidth = estimate_bandwidth(X)
+    elif bandwidth <= 0:
+        raise ValueError("bandwidth needs to be greater than zero or None, got %f" %
+                         bandwidth)
     if seeds is None:
         if bin_seeding:
             seeds = get_bin_seeds(X, bandwidth, min_bin_freq)
@@ -155,13 +159,19 @@ def mean_shift(X, bandwidth=None, seeds=None, bin_seeding=False,
                 break  # Depending on seeding strategy this condition may occur
             my_old_mean = my_mean  # save the old mean
             my_mean = np.mean(points_within, axis=0)
-            # If converged or at max_iter, addS the cluster
+            # If converged or at max_iter, adds the cluster
             if (extmath.norm(my_mean - my_old_mean) < stop_thresh or
                     completed_iterations == max_iter):
                 center_intensity_dict[tuple(my_mean)] = len(points_within)
                 break
             completed_iterations += 1
 
+    if not center_intensity_dict:
+        # nothing near seeds
+        raise ValueError("No point was within bandwidth=%f of any seed."
+                         " Try a different seeding strategy or increase the bandwidth."
+                         % bandwidth)
+
     # POST PROCESSING: remove near duplicate points
     # If the distance between two kernels is less than the bandwidth,
     # then we have to remove one because it is a duplicate. Remove the
@@ -225,12 +235,16 @@ def get_bin_seeds(X, bin_size, min_bin_freq=1):
     # Bin points
     bin_sizes = defaultdict(int)
     for point in X:
-        binned_point = np.cast[np.int32](point / bin_size)
+        binned_point = np.round(point / bin_size)
         bin_sizes[tuple(binned_point)] += 1
 
     # Select only those bins as seeds which have enough members
     bin_seeds = np.array([point for point, freq in six.iteritems(bin_sizes) if
                           freq >= min_bin_freq], dtype=np.float32)
+    if len(bin_seeds) == len(X):
+        warnings.warn("Binning data failed with provided bin_size=%f, using data"
+                      " points as seeds." % bin_size)
+        return X
     bin_seeds = bin_seeds * bin_size
     return bin_seeds
 
diff --git a/sklearn/cluster/tests/test_mean_shift.py b/sklearn/cluster/tests/test_mean_shift.py
@@ -4,11 +4,13 @@
 """
 
 import numpy as np
+import warnings
 
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_false
 from sklearn.utils.testing import assert_true
 from sklearn.utils.testing import assert_array_equal
+from sklearn.utils.testing import assert_raise_message
 
 from sklearn.cluster import MeanShift
 from sklearn.cluster import mean_shift
@@ -52,6 +54,13 @@ def test_meanshift_predict():
     assert_array_equal(labels, labels2)
 
 
+def test_meanshift_all_orphans():
+    # init away from the data, crash with a sensible error message
+    ms = MeanShift(bandwidth=0.1, seeds=[[-9, -9], [-10, -10]])
+    msg = "No point was within bandwidth=0.1"
+    assert_raise_message(ValueError, msg, ms.fit, X,)
+
+
 def test_unfitted():
     """Non-regression: before fit, there should be not fitted attributes."""
     ms = MeanShift()
@@ -65,7 +74,7 @@ def test_bin_seeds():
     algorithm
     """
     # Data is just 6 points in the plane
-    X = np.array([[1., 1.], [1.5, 1.5], [1.8, 1.2],
+    X = np.array([[1., 1.], [1.4, 1.4], [1.8, 1.2],
                   [2., 1.], [2.1, 1.1], [0., 0.]])
 
     # With a bin coarseness of 1.0 and min_bin_freq of 1, 3 bins should be
@@ -83,6 +92,13 @@ def test_bin_seeds():
     assert_true(len(ground_truth.symmetric_difference(test_result)) == 0)
 
     # With a bin size of 0.01 and min_bin_freq of 1, 6 bins should be found
-    test_bins = get_bin_seeds(X, 0.01, 1)
-    test_result = set([tuple(p) for p in test_bins])
-    assert_true(len(test_result) == 6)
+    # (one per point), so get_bin_seeds warns and returns the data unchanged.
+    with warnings.catch_warnings(record=True):
+        test_bins = get_bin_seeds(X, 0.01, 1)
+    assert_array_equal(test_bins, X)
+
+    # tight clusters around [0, 0] and [1, 1], only get two bins
+    X, _ = make_blobs(n_samples=100, n_features=2, centers=[[0, 0], [1, 1]],
+                      cluster_std=0.1, random_state=0)
+    test_bins = get_bin_seeds(X, 1)
+    assert_array_equal(test_bins, [[0, 0], [1, 1]])