-
-
Notifications
You must be signed in to change notification settings - Fork 26.5k
[MRG+1-1] consistent setting of train_size and test_size in ShuffleSplit [Fix Bug #4618 ] #4621
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
7d6a6f6
3bf57c2
d48fdba
d8da235
9c46ff7
b29f759
dc94a90
4384a30
6ec7847
cc255a7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -213,8 +213,8 @@ def __repr__(self): | |
| ) | ||
|
|
||
| def __len__(self): | ||
| return int(factorial(self.n) / factorial(self.n - self.p) | ||
| / factorial(self.p)) | ||
| return int(factorial(self.n) / factorial(self.n - self.p) / | ||
| factorial(self.p)) | ||
|
|
||
|
|
||
| class _BaseKFold(with_metaclass(ABCMeta, _PartitionIterator)): | ||
|
|
@@ -595,13 +595,19 @@ def __len__(self): | |
| class BaseShuffleSplit(with_metaclass(ABCMeta)): | ||
| """Base class for ShuffleSplit and StratifiedShuffleSplit""" | ||
|
|
||
| def __init__(self, n, n_iter=10, test_size=0.1, train_size=None, | ||
| def __init__(self, n, n_iter=10, test_size=None, train_size=None, | ||
| random_state=None): | ||
| self.n = n | ||
| self.n_iter = n_iter | ||
| self.random_state = random_state | ||
|
|
||
| # we will compute the final value of | ||
| # test_size in _validate_shuffle_split() | ||
| self.test_size = test_size | ||
| # we will compute the final value of | ||
| # test_size in _validate_shuffle_split() | ||
| self.train_size = train_size | ||
| self.random_state = random_state | ||
|
|
||
| self.n_train, self.n_test = _validate_shuffle_split(n, test_size, | ||
| train_size) | ||
|
|
||
|
|
@@ -632,17 +638,22 @@ class ShuffleSplit(BaseShuffleSplit): | |
| n_iter : int (default 10) | ||
| Number of re-shuffling & splitting iterations. | ||
|
|
||
| test_size : float (default 0.1), int, or None | ||
| test_size : float, int, or None (default is None) | ||
| If float, should be between 0.0 and 1.0 and represent the | ||
| proportion of the dataset to include in the test split. If | ||
| int, represents the absolute number of test samples. If None, | ||
| the value is automatically set to the complement of the train size. | ||
| proportion of the dataset to include in the test split. | ||
| If int, represents the absolute number of test samples. | ||
| If None, the value is automatically set according to train_size, | ||
| when test_size and train_size are both None, their values will be | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add a similar comment to
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Of course, I'll fix them. |
||
| computed by default (.1/.9). | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK, I'm not good at English. |
||
|
|
||
| train_size : float, int, or None (default is None) | ||
| If float, should be between 0.0 and 1.0 and represent the | ||
| proportion of the dataset to include in the train split. If | ||
| int, represents the absolute number of train samples. If None, | ||
| the value is automatically set to the complement of the test size. | ||
| If None, the value is automatically set according to test_size, | ||
| when test_size and train_size are both None, their values will be | ||
| computed by default (.1/.9). | ||
|
|
||
| random_state : int or RandomState | ||
| Pseudo-random number generator state used for random sampling. | ||
|
|
@@ -700,63 +711,95 @@ def __len__(self): | |
|
|
||
| def _validate_shuffle_split(n, test_size, train_size): | ||
| if test_size is None and train_size is None: | ||
| raise ValueError( | ||
| 'test_size and train_size can not both be None') | ||
|
|
||
| if test_size is not None: | ||
| if np.asarray(test_size).dtype.kind == 'f': | ||
| # user didn't set test_size | ||
| # we will use default value | ||
| test_size = 0.1 | ||
| train_size = 1. - test_size | ||
|
|
||
| if test_size is not None and train_size is None: | ||
| test_size_kind = np.asarray(test_size).dtype.kind | ||
| if test_size_kind == 'f': | ||
| if test_size >= 1.: | ||
| raise ValueError( | ||
| 'test_size=%f should be smaller ' | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should be between 0.0 and 1.0 or an integer?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK, that sounds reasonable. |
||
| 'than 1.0 or be an integer' % test_size) | ||
| elif np.asarray(test_size).dtype.kind == 'i': | ||
| train_size = 1. - test_size | ||
| elif test_size_kind == 'i': | ||
| if test_size >= n: | ||
| raise ValueError( | ||
| 'test_size=%d should be smaller ' | ||
| 'than the number of samples %d' % (test_size, n)) | ||
| train_size = n - test_size | ||
| else: | ||
| raise ValueError("Invalid value for test_size: %r" % test_size) | ||
|
|
||
| if train_size is not None: | ||
| if np.asarray(train_size).dtype.kind == 'f': | ||
| elif test_size is None and train_size is not None: | ||
| train_size_kind = np.asarray(train_size).dtype.kind | ||
| if train_size_kind == 'f': | ||
| if train_size >= 1.: | ||
| raise ValueError("train_size=%f should be smaller " | ||
| "than 1.0 or be an integer" % train_size) | ||
| elif np.asarray(test_size).dtype.kind == 'f' and \ | ||
| train_size + test_size > 1.: | ||
| test_size = 1. - train_size | ||
| elif train_size_kind == 'i': | ||
| if train_size >= n: | ||
| raise ValueError("train_size=%d should be smaller " | ||
| "than the number of samples %d" % | ||
| (train_size, n)) | ||
| test_size = n - train_size | ||
| else: | ||
| raise ValueError("Invalid value for train_size: %r" % train_size) | ||
| else: # test_size is not None and train_size is not None: | ||
| test_size_kind = np.array(test_size).dtype.kind | ||
| train_size_kind = np.array(train_size).dtype.kind | ||
| if test_size_kind == 'f' and \ | ||
| train_size_kind == 'f': | ||
| if test_size >= 1.: | ||
| raise ValueError('test_size=%f should be smaller ' | ||
| 'than 1.0 or be an integer' % test_size) | ||
| if train_size >= 1.: | ||
| raise ValueError('train_size=%f should be smaller ' | ||
| 'than 1.0 or be an integer' % train_size) | ||
| if train_size + test_size > 1.: | ||
| raise ValueError('The sum of test_size and train_size = %f, ' | ||
| 'should be smaller than 1.0. Reduce ' | ||
| 'test_size and/or train_size.' % | ||
| (train_size + test_size)) | ||
| elif np.asarray(train_size).dtype.kind == 'i': | ||
| elif test_size_kind == 'i' and train_size_kind == 'i': | ||
| if test_size >= n: | ||
| raise ValueError('test_size=%d should be smaller ' | ||
| 'than the number of samples %d' % | ||
| (test_size, n)) | ||
| if train_size >= n: | ||
| raise ValueError("train_size=%d should be smaller " | ||
| "than the number of samples %d" % | ||
| (train_size, n)) | ||
| if train_size + test_size > n: | ||
| raise ValueError('The sum of test_size and train_size = %d, ' | ||
| 'should be smaller than n. Reduce ' | ||
| 'test_size and/or train_size.' % | ||
| (train_size + test_size)) | ||
| else: | ||
| raise ValueError("Invalid value for train_size: %r" % train_size) | ||
| # test_size_kind != train_size_kind or | ||
| # their kinds are neither 'f' nor 'i' | ||
| if test_size_kind not in ['f', 'i']: | ||
| raise ValueError("Invalid value for test_size: %r" % | ||
| test_size) | ||
| if train_size_kind not in ['f', 'i']: | ||
| raise ValueError("Invalid value for train_size: %r" % | ||
| train_size) | ||
| raise ValueError( | ||
| "Type of test_size and train_size is not the same, " | ||
| "test_size: %r, train_size: %r" % | ||
| (type(test_size), type(train_size))) | ||
|
|
||
| if np.asarray(test_size).dtype.kind == 'f': | ||
| n_test = ceil(test_size * n) | ||
| elif np.asarray(test_size).dtype.kind == 'i': | ||
| n_test = float(test_size) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't get the need for this line. Does removing the else clauses break any tests? |
||
|
|
||
| if train_size is None: | ||
| n_train = n - n_test | ||
| else: | ||
| if np.asarray(train_size).dtype.kind == 'f': | ||
| n_train = floor(train_size * n) | ||
| else: | ||
| n_train = float(train_size) | ||
|
|
||
| if test_size is None: | ||
| n_test = n - n_train | ||
|
|
||
| if n_train + n_test > n: | ||
| raise ValueError('The sum of train_size and test_size = %d, ' | ||
| 'should be smaller than the number of ' | ||
| 'samples %d. Reduce test_size and/or ' | ||
| 'train_size.' % (n_train + n_test, n)) | ||
| if np.asarray(train_size).dtype.kind == 'f': | ||
| n_train = floor(train_size * n) | ||
| elif np.asarray(train_size).dtype.kind == 'i': | ||
| n_train = float(train_size) | ||
|
|
||
| return int(n_train), int(n_test) | ||
|
|
||
|
|
@@ -782,17 +825,22 @@ class StratifiedShuffleSplit(BaseShuffleSplit): | |
| n_iter : int (default 10) | ||
| Number of re-shuffling & splitting iterations. | ||
|
|
||
| test_size : float (default 0.1), int, or None | ||
| test_size : float, int, or None (default is None) | ||
| If float, should be between 0.0 and 1.0 and represent the | ||
| proportion of the dataset to include in the test split. If | ||
| int, represents the absolute number of test samples. If None, | ||
| the value is automatically set to the complement of the train size. | ||
| proportion of the dataset to include in the test split. | ||
| If int, represents the absolute number of test samples. | ||
| If None, the value is automatically set according to train_size, | ||
| when test_size and train_size are both None, their values will be | ||
| computed by default (.1/.9). | ||
|
|
||
| train_size : float, int, or None (default is None) | ||
| If float, should be between 0.0 and 1.0 and represent the | ||
| proportion of the dataset to include in the train split. If | ||
| int, represents the absolute number of train samples. If None, | ||
| the value is automatically set to the complement of the test size. | ||
| If None, the value is automatically set according to test_size, | ||
| when test_size and train_size are both None, their values will be | ||
| computed by default (.1/.9). | ||
|
|
||
| random_state : int or RandomState | ||
| Pseudo-random number generator state used for random sampling. | ||
|
|
@@ -816,11 +864,12 @@ class StratifiedShuffleSplit(BaseShuffleSplit): | |
| TRAIN: [0 2] TEST: [3 1] | ||
| """ | ||
|
|
||
| def __init__(self, y, n_iter=10, test_size=0.1, train_size=None, | ||
| def __init__(self, y, n_iter=10, test_size=None, train_size=None, | ||
| random_state=None): | ||
|
|
||
| super(StratifiedShuffleSplit, self).__init__( | ||
| len(y), n_iter, test_size, train_size, random_state) | ||
| len(y), n_iter=n_iter, test_size=test_size, train_size=train_size, | ||
| random_state=random_state) | ||
|
|
||
| self.y = np.array(y) | ||
| self.classes, self.y_indices = np.unique(y, return_inverse=True) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -362,13 +362,48 @@ def test_shuffle_split(): | |
| ss3 = cval.ShuffleSplit(10, test_size=np.int32(2), random_state=0) | ||
| for typ in six.integer_types: | ||
| ss4 = cval.ShuffleSplit(10, test_size=typ(2), random_state=0) | ||
| for t1, t2, t3, t4 in zip(ss1, ss2, ss3, ss4): | ||
| ss5 = cval.ShuffleSplit(10, train_size=0.8, random_state=0) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This test is painful to read. I would suggest putting the ShuffleSplits in lists and do a for loop over the indices. WDYT?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That makes sense. |
||
| for t1, t2, t3, t4, t5 in zip(ss1, ss2, ss3, ss4, ss5): | ||
| assert_array_equal(t1[0], t2[0]) | ||
| assert_array_equal(t2[0], t3[0]) | ||
| assert_array_equal(t3[0], t4[0]) | ||
| assert_array_equal(t4[0], t5[0]) | ||
| assert_array_equal(t1[1], t2[1]) | ||
| assert_array_equal(t2[1], t3[1]) | ||
| assert_array_equal(t3[1], t4[1]) | ||
| assert_array_equal(t4[1], t5[1]) | ||
|
|
||
| ss6 = cval.ShuffleSplit(10, random_state=0) | ||
| ss7 = cval.ShuffleSplit(10, test_size=None, train_size=None, | ||
| random_state=0) | ||
| ss8 = cval.ShuffleSplit(10, test_size=None, random_state=0) | ||
| ss9 = cval.ShuffleSplit(10, train_size=None, random_state=0) | ||
| ss10 = cval.ShuffleSplit(10, test_size=0.1, train_size=0.9, | ||
| random_state=0) | ||
| ss11 = cval.ShuffleSplit(10, test_size=0.1, random_state=0) | ||
| ss12 = cval.ShuffleSplit(10, train_size=0.9, random_state=0) | ||
| ss13 = cval.ShuffleSplit(10, test_size=0.1, train_size=None, | ||
| random_state=0) | ||
| ss14 = cval.ShuffleSplit(10, test_size=None, train_size=0.9, | ||
| random_state=0) | ||
| for t6, t7, t8, t9, t10, t11, t12, t13, t14 in \ | ||
| zip(ss6, ss7, ss8, ss9, ss10, ss11, ss12, ss13, ss14): | ||
| assert_array_equal(t6[0], t7[0]) | ||
| assert_array_equal(t7[0], t8[0]) | ||
| assert_array_equal(t8[0], t9[0]) | ||
| assert_array_equal(t9[0], t10[0]) | ||
| assert_array_equal(t10[0], t11[0]) | ||
| assert_array_equal(t11[0], t12[0]) | ||
| assert_array_equal(t12[0], t13[0]) | ||
| assert_array_equal(t13[0], t14[0]) | ||
| assert_array_equal(t6[1], t7[1]) | ||
| assert_array_equal(t7[1], t8[1]) | ||
| assert_array_equal(t8[1], t9[1]) | ||
| assert_array_equal(t9[1], t10[1]) | ||
| assert_array_equal(t10[1], t11[1]) | ||
| assert_array_equal(t11[1], t12[1]) | ||
| assert_array_equal(t12[1], t13[1]) | ||
| assert_array_equal(t13[1], t14[1]) | ||
|
|
||
|
|
||
| def test_stratified_shuffle_split_init(): | ||
|
|
@@ -872,8 +907,26 @@ def test_shufflesplit_errors(): | |
| assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=10) | ||
| assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=8, train_size=3) | ||
| assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=1j) | ||
| assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=None, | ||
| train_size=None) | ||
| assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=".8") | ||
| assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=".8", | ||
| test_size=".2") | ||
| assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=".8", | ||
| test_size=.2) | ||
| assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=".8") | ||
| assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=".8", | ||
| train_size=".2") | ||
| assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=".8", | ||
| train_size=.2) | ||
| assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=.8j) | ||
| assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=.8j, | ||
| test_size=.2j) | ||
| assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=.8j, | ||
| test_size=.2) | ||
| assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=.8j) | ||
| assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=.8j, | ||
| train_size=.2j) | ||
| assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=.8j, | ||
| train_size=.2) | ||
|
|
||
|
|
||
| def test_shufflesplit_reproducible(): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ha! I can't believe we're actually calculating these factorials when scipy provides a binomial coefficient function.