scikit-learn · jjerphan · Nov 29, 2021 · Nov 6, 2021 · Nov 6, 2021 · Nov 6, 2021
diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
@@ -9,6 +9,13 @@ Version 1.0.2
 
 **In Development**
 
+- |Fix| :class:`cluster.Birch`,
+  :class:`feature_selection.RFECV`, :class:`ensemble.RandomForestRegressor`,
+  :class:`ensemble.RandomForestClassifier`,
+  :class:`ensemble.GradientBoostingRegressor`, and
+  :class:`ensemble.GradientBoostingClassifier` do not raise warning when fitted
+  on a pandas DataFrame anymore. :pr:`21578` by `Thomas Fan`_.
+
 Changelog
 ---------
 

diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py
@@ -676,6 +676,10 @@ def predict(self, X):
         """
         check_is_fitted(self)
         X = self._validate_data(X, accept_sparse="csr", reset=False)
+        return self._predict(X)
+
+    def _predict(self, X):
+        """Predict data using the ``centroids_`` of subclusters."""
         kwargs = {"Y_norm_squared": self._subcluster_norms}
 
         with config_context(assume_finite=True):
@@ -745,4 +749,4 @@ def _global_clustering(self, X=None):
             self.subcluster_labels_ = clusterer.fit_predict(self.subcluster_centers_)
 
         if compute_labels:
-            self.labels_ = self.predict(X)
+            self.labels_ = self._predict(X)
diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py
@@ -519,8 +519,10 @@ def _compute_oob_predictions(self, X, y):
         oob_pred : ndarray of shape (n_samples, n_classes, n_outputs) or \
                 (n_samples, 1, n_outputs)
             The OOB predictions.
-      """
-        X = self._validate_data(X, dtype=DTYPE, accept_sparse="csr", reset=False)
+        """
+        # Prediction requires X to be in CSR format
+        if issparse(X):
+            X = X.tocsr()
 
         n_samples = y.shape[0]
         n_outputs = self.n_outputs_

diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py
@@ -643,7 +643,7 @@ def _fit_stages(
             loss_history = np.full(self.n_iter_no_change, np.inf)
             # We create a generator to get the predictions for X_val after
             # the addition of each successive stage
-            y_val_pred_iter = self._staged_raw_predict(X_val)
+            y_val_pred_iter = self._staged_raw_predict(X_val, check_input=False)
 
         # perform boosting iterations
         i = begin_at_stage
@@ -736,7 +736,7 @@ def _raw_predict(self, X):
         predict_stages(self.estimators_, X, self.learning_rate, raw_predictions)
         return raw_predictions
 
-    def _staged_raw_predict(self, X):
+    def _staged_raw_predict(self, X, check_input=True):
         """Compute raw predictions of ``X`` for each iteration.
 
         This method allows monitoring (i.e. determine error on testing set)
@@ -749,6 +749,9 @@ def _staged_raw_predict(self, X):
             ``dtype=np.float32`` and if a sparse matrix is provided
             to a sparse ``csr_matrix``.
 
+        check_input : bool, default=True
+            If False, the input arrays X will not be checked.
+
         Returns
         -------
         raw_predictions : generator of ndarray of shape (n_samples, k)
@@ -757,9 +760,10 @@ def _staged_raw_predict(self, X):
             Regression and binary classification are special cases with
             ``k == 1``, otherwise ``k==n_classes``.
         """
-        X = self._validate_data(
-            X, dtype=DTYPE, order="C", accept_sparse="csr", reset=False
-        )
+        if check_input:
+            X = self._validate_data(
+                X, dtype=DTYPE, order="C", accept_sparse="csr", reset=False
+            )
         raw_predictions = self._raw_predict_init(X)
         for i in range(self.estimators_.shape[0]):
             predict_stage(self.estimators_, i, X, self.learning_rate, raw_predictions)

diff --git a/sklearn/feature_selection/_base.py b/sklearn/feature_selection/_base.py
@@ -87,6 +87,10 @@ def transform(self, X):
             force_all_finite=not _safe_tags(self, key="allow_nan"),
             reset=False,
         )
+        return self._transform(X)
+
+    def _transform(self, X):
+        """Reduce X to the selected features."""
         mask = self.get_support()
         if not mask.any():
             warn(

diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py
@@ -736,7 +736,7 @@ def fit(self, X, y, groups=None):
         self.n_features_ = rfe.n_features_
         self.ranking_ = rfe.ranking_
         self.estimator_ = clone(self.estimator)
-        self.estimator_.fit(self.transform(X), y)
+        self.estimator_.fit(self._transform(X), y)
 
         # reverse to stay consistent with before
         scores_rev = scores[:, ::-1]

diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
@@ -332,6 +332,24 @@ def test_check_n_features_in_after_fitting(estimator):
     check_n_features_in_after_fitting(estimator.__class__.__name__, estimator)
 
 
+def _estimators_that_predict_in_fit():
+    for estimator in _tested_estimators():
+        est_params = set(estimator.get_params())
+        if "oob_score" in est_params:
+            yield estimator.set_params(oob_score=True, bootstrap=True)
+        elif "early_stopping" in est_params:
+            est = estimator.set_params(early_stopping=True, n_iter_no_change=1)
+            if est.__class__.__name__ in {"MLPClassifier", "MLPRegressor"}:
+                # TODO: FIX MLP to not check validation set during MLP
+                yield pytest.param(
+                    est, marks=pytest.mark.xfail(msg="MLP still validates in fit")
+                )
+            else:
+                yield est
+        elif "n_iter_no_change" in est_params:
+            yield estimator.set_params(n_iter_no_change=1)
+
+
 # NOTE: When running `check_dataframe_column_names_consistency` on a meta-estimator that
 # delegates validation to a base estimator, the check is testing that the base estimator
 # is checking for column name consistency.
@@ -340,6 +358,7 @@ def test_check_n_features_in_after_fitting(estimator):
         _tested_estimators(),
         [make_pipeline(LogisticRegression(C=1))],
         list(_generate_search_cv_instances()),
+        _estimators_that_predict_in_fit(),
     )
 )
 

diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
@@ -3791,7 +3791,16 @@ def check_dataframe_column_names_consistency(name, estimator_orig):
     else:
         y = rng.randint(low=0, high=2, size=n_samples)
     y = _enforce_estimator_tags_y(estimator, y)
-    estimator.fit(X, y)
+
+    # Check that calling `fit` does not raise any warnings about feature names.
+    with warnings.catch_warnings():
+        warnings.filterwarnings(
+            "error",
+            message="X does not have valid feature names",
+            category=UserWarning,
+            module="sklearn",
+        )
+        estimator.fit(X, y)
 
     if not hasattr(estimator, "feature_names_in_"):
         raise ValueError(
@@ -3853,6 +3862,12 @@ def check_dataframe_column_names_consistency(name, estimator_orig):
             f"Feature names seen at fit time, yet now missing:\n- {min(names[3:])}\n",
         ),
     ]
+    params = {
+        key: value
+        for key, value in estimator.get_params().items()
+        if "early_stopping" in key
+    }
+    early_stopping_enabled = any(value is True for value in params.values())
 
     for invalid_name, additional_message in invalid_names:
         X_bad = pd.DataFrame(X, columns=invalid_name)
@@ -3876,7 +3891,8 @@ def check_dataframe_column_names_consistency(name, estimator_orig):
                     method(X_bad)
 
         # partial_fit checks on second call
-        if not hasattr(estimator, "partial_fit"):
+        # Do not call partial fit if early_stopping is on
+        if not hasattr(estimator, "partial_fit") or early_stopping_enabled:
             continue
 
         estimator = clone(estimator_orig)