Skip to content
135 changes: 92 additions & 43 deletions sklearn/cross_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,8 +213,8 @@ def __repr__(self):
)

def __len__(self):
    """Return the number of folds: the binomial coefficient C(n, p).

    Computed as n! / ((n - p)! * p!), i.e. the number of distinct
    p-element test sets that can be drawn from n samples.
    """
    return int(factorial(self.n) / factorial(self.n - self.p) /
               factorial(self.p))


class _BaseKFold(with_metaclass(ABCMeta, _PartitionIterator)):
Expand Down Expand Up @@ -595,13 +595,19 @@ def __len__(self):
class BaseShuffleSplit(with_metaclass(ABCMeta)):
"""Base class for ShuffleSplit and StratifiedShuffleSplit"""

def __init__(self, n, n_iter=10, test_size=None, train_size=None,
             random_state=None):
    """Initialize the shuffle-split iterator.

    Parameters
    ----------
    n : int
        Total number of elements in the dataset.
    n_iter : int (default 10)
        Number of re-shuffling & splitting iterations.
    test_size : float, int, or None (default is None)
        Fraction (float) or absolute number (int) of test samples;
        resolved by _validate_shuffle_split when None.
    train_size : float, int, or None (default is None)
        Fraction (float) or absolute number (int) of train samples;
        resolved by _validate_shuffle_split when None.
    random_state : int or RandomState
        Pseudo-random number generator state used for random sampling.
    """
    self.n = n
    self.n_iter = n_iter
    # The raw arguments are stored as given; the final value of
    # test_size is computed in _validate_shuffle_split().
    self.test_size = test_size
    # Likewise, the final value of train_size is computed in
    # _validate_shuffle_split().
    self.train_size = train_size
    self.random_state = random_state

    self.n_train, self.n_test = _validate_shuffle_split(n, test_size,
                                                        train_size)

Expand Down Expand Up @@ -632,17 +638,22 @@ class ShuffleSplit(BaseShuffleSplit):
n_iter : int (default 10)
Number of re-shuffling & splitting iterations.

    test_size : float, int, or None (default is None)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the test split.
        If int, represents the absolute number of test samples.
        If None, the value is automatically set according to train_size.
        When test_size and train_size are both None, they take the
        default values of 0.1 and 0.9 respectively.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about they will take the default values of .1 and .9 respectively.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, I'm not good at English.


    train_size : float, int, or None (default is None)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split.
        If int, represents the absolute number of train samples.
        If None, the value is automatically set according to test_size.
        When test_size and train_size are both None, they take the
        default values of 0.1 and 0.9 respectively.

random_state : int or RandomState
Pseudo-random number generator state used for random sampling.
Expand Down Expand Up @@ -700,63 +711,95 @@ def __len__(self):

def _validate_shuffle_split(n, test_size, train_size):
if test_size is None and train_size is None:
raise ValueError(
'test_size and train_size can not both be None')

if test_size is not None:
if np.asarray(test_size).dtype.kind == 'f':
# user didn't set test_size
# we will use default value
test_size = 0.1
train_size = 1. - test_size

if test_size is not None and train_size is None:
test_size_kind = np.asarray(test_size).dtype.kind
if test_size_kind == 'f':
if test_size >= 1.:
raise ValueError(
'test_size=%f should be smaller '
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should be between 0.0 and 1.0 or an integer?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, that sounds reasonable.

'than 1.0 or be an integer' % test_size)
elif np.asarray(test_size).dtype.kind == 'i':
train_size = 1. - test_size
elif test_size_kind == 'i':
if test_size >= n:
raise ValueError(
'test_size=%d should be smaller '
'than the number of samples %d' % (test_size, n))
train_size = n - test_size
else:
raise ValueError("Invalid value for test_size: %r" % test_size)

if train_size is not None:
if np.asarray(train_size).dtype.kind == 'f':
elif test_size is None and train_size is not None:
train_size_kind = np.asarray(train_size).dtype.kind
if train_size_kind == 'f':
if train_size >= 1.:
raise ValueError("train_size=%f should be smaller "
"than 1.0 or be an integer" % train_size)
elif np.asarray(test_size).dtype.kind == 'f' and \
train_size + test_size > 1.:
test_size = 1. - train_size
elif train_size_kind == 'i':
if train_size >= n:
raise ValueError("train_size=%d should be smaller "
"than the number of samples %d" %
(train_size, n))
test_size = n - train_size
else:
raise ValueError("Invalid value for train_size: %r" % train_size)
else: # test_size is not None and train_set is not None:
test_size_kind = np.array(test_size).dtype.kind
train_size_kind = np.array(train_size).dtype.kind
if test_size_kind == 'f' and \
train_size_kind == 'f':
if test_size >= 1.:
raise ValueError('test_size=%f should be smaller '
'than 1.0 or be an integer' % test_size)
if train_size >= 1.:
raise ValueError('train_size=%f should be smaller '
'than 1.0 or be an integer' % train_size)
if train_size + test_size > 1.:
raise ValueError('The sum of test_size and train_size = %f, '
'should be smaller than 1.0. Reduce '
'test_size and/or train_size.' %
(train_size + test_size))
elif np.asarray(train_size).dtype.kind == 'i':
elif test_size_kind == 'i' and train_size_kind == 'i':
if test_size >= n:
raise ValueError('test_size=%d should be smaller '
'than the number of samples %d' %
(test_size, n))
if train_size >= n:
raise ValueError("train_size=%d should be smaller "
"than the number of samples %d" %
(train_size, n))
if train_size + test_size > n:
raise ValueError('The sum of test_size and train_size = %d, '
'should be smaller than n. Reduce '
'test_size and/or train_size.' %
(train_size + test_size))
else:
raise ValueError("Invalid value for train_size: %r" % train_size)
# test_size_kind != train_size_kind or
# their kinds are nether 'f' nor 'i'
if test_size_kind not in ['f', 'i']:
raise ValueError("Invalid value for test_size: %r" %
test_size)
if train_size_kind not in ['f', 'i']:
raise ValueError("Invalid value for train_size: %r" %
train_size)
raise ValueError(
"Type of test_size and train_size is not the same, "
"test_size: %r, train_size: %r" %
(type(test_size), type(train_size)))

if np.asarray(test_size).dtype.kind == 'f':
n_test = ceil(test_size * n)
elif np.asarray(test_size).dtype.kind == 'i':
n_test = float(test_size)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't get the need for this line. Does removing the else clauses break any tests?


if train_size is None:
n_train = n - n_test
else:
if np.asarray(train_size).dtype.kind == 'f':
n_train = floor(train_size * n)
else:
n_train = float(train_size)

if test_size is None:
n_test = n - n_train

if n_train + n_test > n:
raise ValueError('The sum of train_size and test_size = %d, '
'should be smaller than the number of '
'samples %d. Reduce test_size and/or '
'train_size.' % (n_train + n_test, n))
if np.asarray(train_size).dtype.kind == 'f':
n_train = floor(train_size * n)
elif np.asarray(train_size).dtype.kind == 'i':
n_train = float(train_size)

return int(n_train), int(n_test)

Expand All @@ -782,17 +825,22 @@ class StratifiedShuffleSplit(BaseShuffleSplit):
n_iter : int (default 10)
Number of re-shuffling & splitting iterations.

    test_size : float, int, or None (default is None)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the test split.
        If int, represents the absolute number of test samples.
        If None, the value is automatically set according to train_size.
        When test_size and train_size are both None, they take the
        default values of 0.1 and 0.9 respectively.

    train_size : float, int, or None (default is None)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split.
        If int, represents the absolute number of train samples.
        If None, the value is automatically set according to test_size.
        When test_size and train_size are both None, they take the
        default values of 0.1 and 0.9 respectively.

random_state : int or RandomState
Pseudo-random number generator state used for random sampling.
Expand All @@ -816,11 +864,12 @@ class StratifiedShuffleSplit(BaseShuffleSplit):
TRAIN: [0 2] TEST: [3 1]
"""

def __init__(self, y, n_iter=10, test_size=None, train_size=None,
             random_state=None):
    """Initialize the stratified shuffle-split iterator.

    Parameters
    ----------
    y : array-like
        Labels of the samples; splits are stratified on these.
    n_iter : int (default 10)
        Number of re-shuffling & splitting iterations.
    test_size : float, int, or None (default is None)
        Fraction (float) or absolute number (int) of test samples;
        resolved by the base class when None.
    train_size : float, int, or None (default is None)
        Fraction (float) or absolute number (int) of train samples;
        resolved by the base class when None.
    random_state : int or RandomState
        Pseudo-random number generator state used for random sampling.
    """
    # Forward everything by keyword so the base-class signature can
    # evolve without silently reordering arguments.
    super(StratifiedShuffleSplit, self).__init__(
        len(y), n_iter=n_iter, test_size=test_size, train_size=train_size,
        random_state=random_state)

    self.y = np.array(y)
    # classes: unique labels; y_indices: index of each sample's label
    # within `classes`.
    self.classes, self.y_indices = np.unique(y, return_inverse=True)
Expand Down
59 changes: 56 additions & 3 deletions sklearn/tests/test_cross_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,13 +362,48 @@ def test_shuffle_split():
ss3 = cval.ShuffleSplit(10, test_size=np.int32(2), random_state=0)
for typ in six.integer_types:
ss4 = cval.ShuffleSplit(10, test_size=typ(2), random_state=0)
for t1, t2, t3, t4 in zip(ss1, ss2, ss3, ss4):
ss5 = cval.ShuffleSplit(10, train_size=0.8, random_state=0)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test is painful to read. I would suggest putting the ShuffleSplits in lists and do a for loop over the indices. WDYT?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That sounds make sense.

for t1, t2, t3, t4, t5 in zip(ss1, ss2, ss3, ss4, ss5):
assert_array_equal(t1[0], t2[0])
assert_array_equal(t2[0], t3[0])
assert_array_equal(t3[0], t4[0])
assert_array_equal(t4[0], t5[0])
assert_array_equal(t1[1], t2[1])
assert_array_equal(t2[1], t3[1])
assert_array_equal(t3[1], t4[1])
assert_array_equal(t4[1], t5[1])

ss6 = cval.ShuffleSplit(10, random_state=0)
ss7 = cval.ShuffleSplit(10, test_size=None, train_size=None,
random_state=0)
ss8 = cval.ShuffleSplit(10, test_size=None, random_state=0)
ss9 = cval.ShuffleSplit(10, train_size=None, random_state=0)
ss10 = cval.ShuffleSplit(10, test_size=0.1, train_size=0.9,
random_state=0)
ss11 = cval.ShuffleSplit(10, test_size=0.1, random_state=0)
ss12 = cval.ShuffleSplit(10, train_size=0.9, random_state=0)
ss13 = cval.ShuffleSplit(10, test_size=0.1, train_size=None,
random_state=0)
ss14 = cval.ShuffleSplit(10, test_size=None, train_size=0.9,
random_state=0)
for t6, t7, t8, t9, t10, t11, t12, t13, t14 in \
zip(ss6, ss7, ss8, ss9, ss10, ss11, ss12, ss13, ss14):
assert_array_equal(t6[0], t7[0])
assert_array_equal(t7[0], t8[0])
assert_array_equal(t8[0], t9[0])
assert_array_equal(t9[0], t10[0])
assert_array_equal(t10[0], t11[0])
assert_array_equal(t11[0], t12[0])
assert_array_equal(t12[0], t13[0])
assert_array_equal(t13[0], t14[0])
assert_array_equal(t6[1], t7[1])
assert_array_equal(t7[1], t8[1])
assert_array_equal(t8[1], t9[1])
assert_array_equal(t9[1], t10[1])
assert_array_equal(t10[1], t11[1])
assert_array_equal(t11[1], t12[1])
assert_array_equal(t12[1], t13[1])
assert_array_equal(t13[1], t14[1])


def test_stratified_shuffle_split_init():
Expand Down Expand Up @@ -872,8 +907,26 @@ def test_shufflesplit_errors():
assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=10)
assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=8, train_size=3)
assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=1j)
assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=None,
train_size=None)
assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=".8")
assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=".8",
test_size=".2")
assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=".8",
test_size=.2)
assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=".8")
assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=".8",
train_size=".2")
assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=".8",
train_size=.2)
assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=.8j)
assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=.8j,
test_size=.2j)
assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=.8j,
test_size=.2)
assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=.8j)
assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=.8j,
train_size=.2j)
assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=.8j,
train_size=.2)


def test_shufflesplit_reproducible():
Expand Down