Merged
34 changes: 26 additions & 8 deletions sklearn/dummy.py
@@ -14,6 +14,7 @@
from .utils.validation import check_consistent_length
from .utils import deprecated
from .utils.random import random_choice_csc
from .utils.stats import _weighted_percentile
from .utils.multiclass import class_distribution


@@ -366,7 +367,7 @@ def y_mean_(self):
return self.constant_
raise AttributeError

def fit(self, X, y):
def fit(self, X, y, sample_weight=None):
"""Fit the random regressor.

Parameters
@@ -378,6 +379,9 @@ def fit(self, X, y):
y : array-like, shape = [n_samples] or [n_samples, n_outputs]
Target values.

sample_weight : array-like of shape = [n_samples], optional
Sample weights.

Returns
-------
self : object
@@ -389,25 +393,40 @@
"'mean', 'median', 'quantile' or 'constant'"
% self.strategy)

y = check_array(y, accept_sparse='csr', ensure_2d=False)
y = check_array(y, ensure_2d=False)
Member Author:
I removed accept_sparse='csr' since it's not supported.

if len(y) == 0:
raise ValueError("y must not be empty.")
self.output_2d_ = (y.ndim == 2)

check_consistent_length(X, y)
self.output_2d_ = y.ndim == 2
if y.ndim == 1:
y = np.reshape(y, (-1, 1))
Member:
Noob question: out of curiosity, is there any difference between doing this and

`y = y[:, np.newaxis]`

I always use the latter.

Member Author:
`y = y[:, np.newaxis]` doesn't preserve contiguity. This is a known bug and will be solved in the current or next release of numpy.

Member:
oh yes, I remember now :)

self.n_outputs_ = y.shape[1]

check_consistent_length(X, y, sample_weight)

if self.strategy == "mean":
self.constant_ = np.mean(y, axis=0)
self.constant_ = np.average(y, axis=0, weights=sample_weight)

elif self.strategy == "median":
self.constant_ = np.median(y, axis=0)
if sample_weight is None:
self.constant_ = np.median(y, axis=0)
else:
self.constant_ = [_weighted_percentile(y[:, k], sample_weight,
percentile=50.)
for k in range(self.n_outputs_)]

Member:
self.constant_ = np.asarray(.... ) ?

Member Author:
I think this is handled by the reshape. I will check tomorrow.

Member:
yes, sorry I looked at only the diff. I take back my comment.

Member:
Just a nitpick: why not set it to np.empty initially and then fill it? That way the reshape can be avoided.

Member Author:
The reshape must still be done for the np.mean and np.median cases. I think the keepdims keyword is not yet available in the lowest numpy version we support.

Member:
Sorry about my pretentious comments then ;)

Member Author:
No worry :-)

elif self.strategy == "quantile":
if self.quantile is None or not np.isscalar(self.quantile):
raise ValueError("Quantile must be a scalar in the range "
"[0.0, 1.0], but got %s." % self.quantile)

self.constant_ = np.percentile(y, axis=0, q=self.quantile * 100.0)
percentile = self.quantile * 100.0
if sample_weight is None:
self.constant_ = np.percentile(y, axis=0, q=percentile)
else:
self.constant_ = [_weighted_percentile(y[:, k], sample_weight,
percentile=percentile)
for k in range(self.n_outputs_)]

elif self.strategy == "constant":
if self.constant is None:
@@ -426,7 +445,6 @@ def fit(self, X, y):
self.constant_ = self.constant

self.constant_ = np.reshape(self.constant_, (1, -1))
self.n_outputs_ = np.size(self.constant_) # y.shape[1] is not safe
return self

def predict(self, X):
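A quick pure-numpy sketch of what the updated `fit` now computes for each strategy. The `y` and `w` values are made up, and `weighted_percentile` below just mirrors the logic of the `_weighted_percentile` helper inline so the snippet is self-contained:

```python
import numpy as np

y = np.array([1.0, 2.0, 3.0])
w = np.array([1.0, 1.0, 2.0])  # made-up sample weights

# strategy="mean": weighted average of the targets
mean_const = np.average(y, weights=w)  # (1*1 + 1*2 + 2*3) / 4 = 2.25

# strategy="median" / "quantile": weighted percentile
# (same logic as the _weighted_percentile helper)
def weighted_percentile(a, sw, percentile=50.0):
    sorted_idx = np.argsort(a)
    weight_cdf = sw[sorted_idx].cumsum()
    idx = (weight_cdf >= (percentile / 100.0) * weight_cdf[-1]).argmax()
    return a[sorted_idx[idx]]

median_const = weighted_percentile(y, w, percentile=50.0)  # → 2.0
```

The weighted median picks the first target whose cumulative weight reaches half of the total weight, which is why `2.0` (cumulative weight 2 out of 4) is returned here rather than the unweighted median.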
13 changes: 1 addition & 12 deletions sklearn/ensemble/gradient_boosting.py
@@ -37,6 +37,7 @@
from ..base import RegressorMixin
from ..utils import check_random_state, check_array, check_X_y, column_or_1d
from ..utils.extmath import logsumexp
from ..utils.stats import _weighted_percentile
from ..externals import six
from ..feature_selection.from_model import _LearntSelectorMixin

@@ -50,18 +51,6 @@
from ._gradient_boosting import _random_sample_mask


def _weighted_percentile(array, sample_weight, percentile=50):
"""Compute the weighted ``percentile`` of ``array`` with ``sample_weight``. """
sorted_idx = np.argsort(array)

# Find index of median prediction for each sample
weight_cdf = sample_weight[sorted_idx].cumsum()
percentile_or_above = weight_cdf >= (percentile / 100.0) * weight_cdf[-1]
percentile_idx = percentile_or_above.argmax()

return array[sorted_idx[percentile_idx]]


class QuantileEstimator(BaseEstimator):
"""An estimator predicting the alpha-quantile of the training targets."""
def __init__(self, alpha=0.9):
19 changes: 19 additions & 0 deletions sklearn/tests/test_dummy.py
@@ -11,6 +11,7 @@
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_true
from sklearn.utils.testing import assert_warns_message
from sklearn.utils.stats import _weighted_percentile

from sklearn.dummy import DummyClassifier, DummyRegressor

@@ -572,6 +573,24 @@ def test_most_frequent_strategy_sparse_target():
np.zeros((n_samples, 1))]))


def test_dummy_regressor_sample_weight(n_samples=10):
random_state = np.random.RandomState(seed=1)

X = [[0]] * n_samples
Member:
Would it be better to generate X randomly? Just for a sanity check.

y = random_state.rand(n_samples)
sample_weight = random_state.rand(n_samples)

est = DummyRegressor(strategy="mean").fit(X, y, sample_weight)
assert_equal(est.constant_, np.average(y, weights=sample_weight))

est = DummyRegressor(strategy="median").fit(X, y, sample_weight)
assert_equal(est.constant_, _weighted_percentile(y, sample_weight, 50.))

est = DummyRegressor(strategy="quantile", quantile=.95).fit(X, y,
sample_weight)
assert_equal(est.constant_, _weighted_percentile(y, sample_weight, 95.))


if __name__ == '__main__':
import nose
nose.runmodule()
13 changes: 13 additions & 0 deletions sklearn/utils/stats.py
@@ -44,3 +44,16 @@ def _rankdata(a, method="average"):

except TypeError as e:
rankdata = _rankdata


def _weighted_percentile(array, sample_weight, percentile=50):
Member:
Any reason why this is private?

Member Author:
Because it is in utils.

Member:
Ok, I was confused since there are many functions in utils which are not private.

"""Compute the weighted ``percentile`` of ``array`` with ``sample_weight``. """
sorted_idx = np.argsort(array)

# Find the first index whose cumulative weight reaches the requested percentile
weight_cdf = sample_weight[sorted_idx].cumsum()
percentile_or_above = weight_cdf >= (percentile / 100.0) * weight_cdf[-1]
Contributor:
Sorry for being stupid, but I am not able to get this to work. My arguments are [3, 2, 4] and [1, 2, 3] for array and sample_weight respectively. sorted_idx is an array and thus throws a TypeError. I wonder what the expected arguments are here.

Member:
sample_weight should be a numpy array

Contributor:
Thanks!

Member Author:
Thanks @MechCoder !
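To illustrate the point in the thread above: `sample_weight[sorted_idx]` is fancy indexing, which works on a numpy array but raises a TypeError on a plain Python list. The inputs here are the ones from the thread ([3, 2, 4] and [1, 2, 3]), and `weighted_percentile` just reproduces the helper's logic inline:

```python
import numpy as np

def weighted_percentile(array, sample_weight, percentile=50.0):
    # same logic as _weighted_percentile in sklearn/utils/stats.py
    sorted_idx = np.argsort(array)
    weight_cdf = sample_weight[sorted_idx].cumsum()
    idx = (weight_cdf >= (percentile / 100.0) * weight_cdf[-1]).argmax()
    return array[sorted_idx[idx]]

a = np.asarray([3, 2, 4])
w = [1, 2, 3]

try:
    weighted_percentile(a, w)  # plain list: list[ndarray] indexing fails
except TypeError:
    print("TypeError: sample_weight must be a numpy array")

print(weighted_percentile(a, np.asarray(w)))  # → 3
```

With the array conversion, the sorted values are [2, 3, 4] with cumulative weights [2, 3, 6]; the 50th percentile target is 3.0, which is first reached at value 3.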

percentile_idx = percentile_or_above.argmax()
Member:
If I'm understanding right, these two lines can be replaced by

percentile_idx = np.searchsorted(weight_cdf, (percentile / 100.) * weight_cdf[-1])

or am I wrong?

Member Author:
Do you think this could be optimized in another PR? I have just taken what @pprett did previously and moved it here so it can be useful beyond gradient boosting.

Member:
Okay, unless @pprett thinks it is ok to change this over here.


return array[sorted_idx[percentile_idx]]
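The `np.searchsorted` suggestion from the thread above can be sanity-checked against the boolean-mask `argmax` approach. Since `weight_cdf` is a cumulative sum of positive weights and therefore non-decreasing, `searchsorted` with its default `side='left'` returns the first index whose value reaches the target, which is exactly what `(weight_cdf >= target).argmax()` computes. A quick check on made-up random weights:

```python
import numpy as np

rng = np.random.RandomState(0)
for _ in range(100):
    # cumulative sum of positive weights: a non-decreasing CDF
    weight_cdf = rng.rand(20).cumsum()
    for percentile in (10.0, 50.0, 95.0):
        target = (percentile / 100.0) * weight_cdf[-1]
        idx_argmax = (weight_cdf >= target).argmax()
        idx_search = np.searchsorted(weight_cdf, target)  # side='left'
        assert idx_argmax == idx_search
print("argmax and searchsorted agree")
```

`searchsorted` avoids materialising the boolean mask and uses binary search instead of a linear scan, so the suggested rewrite would be both equivalent and slightly cheaper.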