TST non-regression test for #6147, roc_auc on memmap data

ogrisel · ogrisel · commit 613f1ad5617e · 2016-01-29T10:57:42.000+01:00
diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py
@@ -1,4 +1,8 @@
 import pickle
+import tempfile
+import shutil
+import os
+import numbers
 
 import numpy as np
 
@@ -30,6 +34,7 @@
 from sklearn.cross_validation import train_test_split, cross_val_score
 from sklearn.grid_search import GridSearchCV
 from sklearn.multiclass import OneVsRestClassifier
+from sklearn.externals import joblib
 
 
 REGRESSION_SCORERS = ['r2', 'mean_absolute_error', 'mean_squared_error',
@@ -46,6 +51,46 @@
 MULTILABEL_ONLY_SCORERS = ['precision_samples', 'recall_samples', 'f1_samples']
 
 
+def _make_estimators(X_train, y_train, y_ml_train):
+    # Fit one sensible estimator per scorer family, keyed by scorer name
+    sensible_regr = DummyRegressor(strategy='median')
+    sensible_regr.fit(X_train, y_train)
+    sensible_clf = DecisionTreeClassifier(random_state=0)
+    sensible_clf.fit(X_train, y_train)
+    sensible_ml_clf = DecisionTreeClassifier(random_state=0)
+    sensible_ml_clf.fit(X_train, y_ml_train)  # multilabel targets
+    return dict(  # scorer name -> fitted estimator
+        [(name, sensible_regr) for name in REGRESSION_SCORERS] +
+        [(name, sensible_clf) for name in CLF_SCORERS] +
+        [(name, sensible_ml_clf) for name in MULTILABEL_ONLY_SCORERS]
+    )
+
+
+X_mm, y_mm, y_ml_mm = None, None, None
+ESTIMATORS = None
+TEMP_FOLDER = None
+
+
+def setup_module():
+    # Dump a small dataset to a temp folder and reload it memory mapped
+    global X_mm, y_mm, y_ml_mm, TEMP_FOLDER, ESTIMATORS
+    TEMP_FOLDER = tempfile.mkdtemp(prefix='sklearn_test_score_objects_')
+    X, y = make_classification(n_samples=30, n_features=5, random_state=0)
+    _, y_ml = make_multilabel_classification(n_samples=X.shape[0],
+                                             random_state=0)
+    filename = os.path.join(TEMP_FOLDER, 'test_data.pkl')
+    joblib.dump((X, y, y_ml), filename)
+    X_mm, y_mm, y_ml_mm = joblib.load(filename, mmap_mode='r')
+    ESTIMATORS = _make_estimators(X_mm, y_mm, y_ml_mm)  # fit on the memmaps
+
+
+def teardown_module():
+    global X_mm, y_mm, y_ml_mm, TEMP_FOLDER, ESTIMATORS
+    # Release the globals so that GC closes the mmap file descriptors
+    X_mm, y_mm, y_ml_mm, ESTIMATORS = None, None, None, None
+    shutil.rmtree(TEMP_FOLDER)  # remove the dumped data along with the folder
+
+
 class EstimatorWithoutFit(object):
     """Dummy estimator to test check_scoring"""
     pass
@@ -318,18 +363,7 @@ def test_scorer_sample_weight():
     sample_weight[:10] = 0
 
     # get sensible estimators for each metric
-    sensible_regr = DummyRegressor(strategy='median')
-    sensible_regr.fit(X_train, y_train)
-    sensible_clf = DecisionTreeClassifier(random_state=0)
-    sensible_clf.fit(X_train, y_train)
-    sensible_ml_clf = DecisionTreeClassifier(random_state=0)
-    sensible_ml_clf.fit(X_train, y_ml_train)
-    estimator = dict([(name, sensible_regr)
-                      for name in REGRESSION_SCORERS] +
-                     [(name, sensible_clf)
-                      for name in CLF_SCORERS] +
-                     [(name, sensible_ml_clf)
-                      for name in MULTILABEL_ONLY_SCORERS])
+    estimator = _make_estimators(X_train, y_train, y_ml_train)
 
     for name, scorer in SCORERS.items():
         if name in MULTILABEL_ONLY_SCORERS:
@@ -355,3 +389,21 @@ def test_scorer_sample_weight():
             assert_true("sample_weight" in str(e),
                         "scorer {0} raises unhelpful exception when called "
                         "with sample weights: {1}".format(name, str(e)))
+
+
+@ignore_warnings  # UndefinedMetricWarning for precision / recall scores
+def check_scorer_memmap(scorer_name):
+    scorer, estimator = SCORERS[scorer_name], ESTIMATORS[scorer_name]
+    if scorer_name in MULTILABEL_ONLY_SCORERS:
+        score = scorer(estimator, X_mm, y_ml_mm)
+    else:
+        score = scorer(estimator, X_mm, y_mm)
+    assert isinstance(score, numbers.Number), scorer_name  # not a memmap
+
+
+def test_scorer_memmap_input():
+    # Non-regression test for #6147: some score functions would return a
+    # singleton memmap instead of a scalar float value when computed on
+    # memmap data. One check is yielded per registered scorer name.
+    for name in SCORERS:
+        yield check_scorer_memmap, name