Skip to content
135 changes: 92 additions & 43 deletions sklearn/cross_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,8 +213,8 @@ def __repr__(self):
)

def __len__(self):
    """Return the number of folds: the binomial coefficient C(n, p).

    Computed as n! / ((n - p)! * p!), i.e. the number of distinct
    p-element test sets that can be drawn from n samples.
    """
    return int(factorial(self.n) / factorial(self.n - self.p) /
               factorial(self.p))


class _BaseKFold(with_metaclass(ABCMeta, _PartitionIterator)):
Expand Down Expand Up @@ -595,13 +595,19 @@ def __len__(self):
class BaseShuffleSplit(with_metaclass(ABCMeta)):
"""Base class for ShuffleSplit and StratifiedShuffleSplit"""

def __init__(self, n, n_iter=10, test_size=None, train_size=None,
             random_state=None):
    """Initialize the shuffle-split iterator.

    Parameters
    ----------
    n : int
        Total number of elements in the dataset.
    n_iter : int (default 10)
        Number of re-shuffling & splitting iterations.
    test_size : float, int, or None (default is None)
        Fraction (float) or absolute number (int) of test samples;
        resolved by _validate_shuffle_split when None.
    train_size : float, int, or None (default is None)
        Fraction (float) or absolute number (int) of train samples;
        resolved by _validate_shuffle_split when None.
    random_state : int or RandomState
        Pseudo-random number generator state used for random sampling.
    """
    self.n = n
    self.n_iter = n_iter
    # The raw arguments are stored as given; the final value of
    # test_size is computed in _validate_shuffle_split().
    self.test_size = test_size
    # Likewise, the final value of train_size is computed in
    # _validate_shuffle_split().
    self.train_size = train_size
    self.random_state = random_state

    self.n_train, self.n_test = _validate_shuffle_split(n, test_size,
                                                        train_size)

Expand Down Expand Up @@ -632,17 +638,22 @@ class ShuffleSplit(BaseShuffleSplit):
n_iter : int (default 10)
Number of re-shuffling & splitting iterations.

    test_size : float, int, or None (default is None)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the test split.
        If int, represents the absolute number of test samples.
        If None, the value is automatically set according to train_size.
        When test_size and train_size are both None, they take the
        default values of 0.1 and 0.9 respectively.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about they will take the default values of .1 and .9 respectively.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, I'm not good at English.


    train_size : float, int, or None (default is None)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split.
        If int, represents the absolute number of train samples.
        If None, the value is automatically set according to test_size.
        When test_size and train_size are both None, they take the
        default values of 0.1 and 0.9 respectively.

random_state : int or RandomState
Pseudo-random number generator state used for random sampling.
Expand Down Expand Up @@ -700,63 +711,95 @@ def __len__(self):

def _validate_shuffle_split(n, test_size, train_size):
if test_size is None and train_size is None:
raise ValueError(
'test_size and train_size can not both be None')

if test_size is not None:
if np.asarray(test_size).dtype.kind == 'f':
# user didn't set test_size
# we will use default value
test_size = 0.1
train_size = 1. - test_size

if test_size is not None and train_size is None:
test_size_kind = np.asarray(test_size).dtype.kind
if test_size_kind == 'f':
if test_size >= 1.:
raise ValueError(
'test_size=%f should be smaller '
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should be between 0.0 and 1.0 or an integer?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, that sounds reasonable.

'than 1.0 or be an integer' % test_size)
elif np.asarray(test_size).dtype.kind == 'i':
train_size = 1. - test_size
elif test_size_kind == 'i':
if test_size >= n:
raise ValueError(
'test_size=%d should be smaller '
'than the number of samples %d' % (test_size, n))
train_size = n - test_size
else:
raise ValueError("Invalid value for test_size: %r" % test_size)

if train_size is not None:
if np.asarray(train_size).dtype.kind == 'f':
elif test_size is None and train_size is not None:
train_size_kind = np.asarray(train_size).dtype.kind
if train_size_kind == 'f':
if train_size >= 1.:
raise ValueError("train_size=%f should be smaller "
"than 1.0 or be an integer" % train_size)
elif np.asarray(test_size).dtype.kind == 'f' and \
train_size + test_size > 1.:
test_size = 1. - train_size
elif train_size_kind == 'i':
if train_size >= n:
raise ValueError("train_size=%d should be smaller "
"than the number of samples %d" %
(train_size, n))
test_size = n - train_size
else:
raise ValueError("Invalid value for train_size: %r" % train_size)
else: # test_size is not None and train_set is not None:
test_size_kind = np.array(test_size).dtype.kind
train_size_kind = np.array(train_size).dtype.kind
if test_size_kind == 'f' and \
train_size_kind == 'f':
if test_size >= 1.:
raise ValueError('test_size=%f should be smaller '
'than 1.0 or be an integer' % test_size)
if train_size >= 1.:
raise ValueError('train_size=%f should be smaller '
'than 1.0 or be an integer' % train_size)
if train_size + test_size > 1.:
raise ValueError('The sum of test_size and train_size = %f, '
'should be smaller than 1.0. Reduce '
'test_size and/or train_size.' %
(train_size + test_size))
elif np.asarray(train_size).dtype.kind == 'i':
elif test_size_kind == 'i' and train_size_kind == 'i':
if test_size >= n:
raise ValueError('test_size=%d should be smaller '
'than the number of samples %d' %
(test_size, n))
if train_size >= n:
raise ValueError("train_size=%d should be smaller "
"than the number of samples %d" %
(train_size, n))
if train_size + test_size > n:
raise ValueError('The sum of test_size and train_size = %d, '
'should be smaller than n. Reduce '
'test_size and/or train_size.' %
(train_size + test_size))
else:
raise ValueError("Invalid value for train_size: %r" % train_size)
# test_size_kind != train_size_kind or
# their kinds are nether 'f' nor 'i'
if test_size_kind not in ['f', 'i']:
raise ValueError("Invalid value for test_size: %r" %
test_size)
if train_size_kind not in ['f', 'i']:
raise ValueError("Invalid value for train_size: %r" %
train_size)
raise ValueError(
"Type of test_size and train_size is not the same, "
"test_size: %r, train_size: %r" %
(type(test_size), type(train_size)))

if np.asarray(test_size).dtype.kind == 'f':
n_test = ceil(test_size * n)
elif np.asarray(test_size).dtype.kind == 'i':
n_test = float(test_size)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't get the need for this line. Does removing the else clauses break any tests?


if train_size is None:
n_train = n - n_test
else:
if np.asarray(train_size).dtype.kind == 'f':
n_train = floor(train_size * n)
else:
n_train = float(train_size)

if test_size is None:
n_test = n - n_train

if n_train + n_test > n:
raise ValueError('The sum of train_size and test_size = %d, '
'should be smaller than the number of '
'samples %d. Reduce test_size and/or '
'train_size.' % (n_train + n_test, n))
if np.asarray(train_size).dtype.kind == 'f':
n_train = floor(train_size * n)
elif np.asarray(train_size).dtype.kind == 'i':
n_train = float(train_size)

return int(n_train), int(n_test)

Expand All @@ -782,17 +825,22 @@ class StratifiedShuffleSplit(BaseShuffleSplit):
n_iter : int (default 10)
Number of re-shuffling & splitting iterations.

    test_size : float, int, or None (default is None)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the test split.
        If int, represents the absolute number of test samples.
        If None, the value is automatically set according to train_size.
        When test_size and train_size are both None, they take the
        default values of 0.1 and 0.9 respectively.

    train_size : float, int, or None (default is None)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split.
        If int, represents the absolute number of train samples.
        If None, the value is automatically set according to test_size.
        When test_size and train_size are both None, they take the
        default values of 0.1 and 0.9 respectively.

random_state : int or RandomState
Pseudo-random number generator state used for random sampling.
Expand All @@ -816,11 +864,12 @@ class StratifiedShuffleSplit(BaseShuffleSplit):
TRAIN: [0 2] TEST: [3 1]
"""

def __init__(self, y, n_iter=10, test_size=None, train_size=None,
             random_state=None):
    """Initialize the stratified shuffle-split iterator.

    Parameters
    ----------
    y : array-like
        Labels of the samples; splits are stratified on these.
    n_iter : int (default 10)
        Number of re-shuffling & splitting iterations.
    test_size : float, int, or None (default is None)
        Fraction (float) or absolute number (int) of test samples;
        resolved by the base class when None.
    train_size : float, int, or None (default is None)
        Fraction (float) or absolute number (int) of train samples;
        resolved by the base class when None.
    random_state : int or RandomState
        Pseudo-random number generator state used for random sampling.
    """
    # Forward everything by keyword so the base-class signature can
    # evolve without silently reordering arguments.
    super(StratifiedShuffleSplit, self).__init__(
        len(y), n_iter=n_iter, test_size=test_size, train_size=train_size,
        random_state=random_state)

    self.y = np.array(y)
    # classes: unique labels; y_indices: index of each sample's label
    # within `classes`.
    self.classes, self.y_indices = np.unique(y, return_inverse=True)
Expand Down
59 changes: 56 additions & 3 deletions sklearn/tests/test_cross_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,13 +362,48 @@ def test_shuffle_split():
ss3 = cval.ShuffleSplit(10, test_size=np.int32(2), random_state=0)
for typ in six.integer_types:
ss4 = cval.ShuffleSplit(10, test_size=typ(2), random_state=0)
for t1, t2, t3, t4 in zip(ss1, ss2, ss3, ss4):
ss5 = cval.ShuffleSplit(10, train_size=0.8, random_state=0)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test is painful to read. I would suggest putting the ShuffleSplits in lists and do a for loop over the indices. WDYT?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That sounds make sense.

for t1, t2, t3, t4, t5 in zip(ss1, ss2, ss3, ss4, ss5):
assert_array_equal(t1[0], t2[0])
assert_array_equal(t2[0], t3[0])
assert_array_equal(t3[0], t4[0])
assert_array_equal(t4[0], t5[0])
assert_array_equal(t1[1], t2[1])
assert_array_equal(t2[1], t3[1])
assert_array_equal(t3[1], t4[1])
assert_array_equal(t4[1], t5[1])

ss6 = cval.ShuffleSplit(10, random_state=0)
ss7 = cval.ShuffleSplit(10, test_size=None, train_size=None,
random_state=0)
ss8 = cval.ShuffleSplit(10, test_size=None, random_state=0)
ss9 = cval.ShuffleSplit(10, train_size=None, random_state=0)
ss10 = cval.ShuffleSplit(10, test_size=0.1, train_size=0.9,
random_state=0)
ss11 = cval.ShuffleSplit(10, test_size=0.1, random_state=0)
ss12 = cval.ShuffleSplit(10, train_size=0.9, random_state=0)
ss13 = cval.ShuffleSplit(10, test_size=0.1, train_size=None,
random_state=0)
ss14 = cval.ShuffleSplit(10, test_size=None, train_size=0.9,
random_state=0)
for t6, t7, t8, t9, t10, t11, t12, t13, t14 in \
zip(ss6, ss7, ss8, ss9, ss10, ss11, ss12, ss13, ss14):
assert_array_equal(t6[0], t7[0])
assert_array_equal(t7[0], t8[0])
assert_array_equal(t8[0], t9[0])
assert_array_equal(t9[0], t10[0])
assert_array_equal(t10[0], t11[0])
assert_array_equal(t11[0], t12[0])
assert_array_equal(t12[0], t13[0])
assert_array_equal(t13[0], t14[0])
assert_array_equal(t6[1], t7[1])
assert_array_equal(t7[1], t8[1])
assert_array_equal(t8[1], t9[1])
assert_array_equal(t9[1], t10[1])
assert_array_equal(t10[1], t11[1])
assert_array_equal(t11[1], t12[1])
assert_array_equal(t12[1], t13[1])
assert_array_equal(t13[1], t14[1])


def test_stratified_shuffle_split_init():
Expand Down Expand Up @@ -872,8 +907,26 @@ def test_shufflesplit_errors():
assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=10)
assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=8, train_size=3)
assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=1j)
assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=None,
train_size=None)
assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=".8")
assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=".8",
test_size=".2")
assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=".8",
test_size=.2)
assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=".8")
assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=".8",
train_size=".2")
assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=".8",
train_size=.2)
assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=.8j)
assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=.8j,
test_size=.2j)
assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=.8j,
test_size=.2)
assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=.8j)
assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=.8j,
train_size=.2j)
assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=.8j,
train_size=.2)


def test_shufflesplit_reproducible():
Expand Down