From 2b11ebb3e3e8a4a4fa382dc92fb377eea186ddd4 Mon Sep 17 00:00:00 2001 From: Pradeep Reddy Raamana Date: Mon, 7 Dec 2015 02:58:28 -0500 Subject: [PATCH 1/5] Introduced a new optional flag stratify_across_classes to StratifiedShuffleSplit. Programmed the inner-working to achieve the expected CV splits. Enhanced the validation of the specification for train_size and test_size (sum=1). There might be a potential bug in the ShuffleSplit class, which primarily anchors on test_size, and doesn't handle the specification of train_size (and infer the test_size appropriately). It seems like not much testing has been performed in cases when the user only specifies train_size. --- sklearn/model_selection/_split.py | 78 +++++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 19 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 6b8e6ea55af33..f5119eed46fe2 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -772,11 +772,12 @@ class BaseShuffleSplit(with_metaclass(ABCMeta)): """Base class for ShuffleSplit and StratifiedShuffleSplit""" def __init__(self, n_iter=10, test_size=0.1, train_size=None, - random_state=None): - _validate_shuffle_split_init(test_size, train_size) + stratify_across_classes=False, random_state=None): + _validate_shuffle_split_init(test_size, train_size,stratify_across_classes) self.n_iter = n_iter self.test_size = test_size self.train_size = train_size + self.stratify_across_classes = stratify_across_classes self.random_state = random_state def split(self, X, y=None, labels=None): @@ -1005,6 +1006,11 @@ class StratifiedShuffleSplit(BaseShuffleSplit): int, represents the absolute number of train samples. If None, the value is automatically set to the complement of the test size. 
+ stratify_across_classes : bool (default is False) + If True, the sizes for the different classes in the training set + are made equal to the fixed percentage (or number) from the smallest class. + The underlying motivation is to avoid class-imbalance during training phase. + random_state : int or RandomState Pseudo-random number generator state used for random sampling. @@ -1028,14 +1034,14 @@ class StratifiedShuffleSplit(BaseShuffleSplit): """ def __init__(self, n_iter=10, test_size=0.1, train_size=None, - random_state=None): + random_state=None, stratify_across_classes=False): super(StratifiedShuffleSplit, self).__init__( - n_iter, test_size, train_size, random_state) + n_iter, test_size, train_size, stratify_across_classes, random_state) - def _iter_indices(self, X, y, labels=None): + def _iter_indices(self, X, y=None, labels=None): n_samples = _num_samples(X) - n_train, n_test = _validate_shuffle_split(n_samples, self.test_size, - self.train_size) + n_train, n_test, train_size, test_size = _validate_shuffle_split(n_samples, self.test_size, + self.train_size, self.stratify_across_classes) classes, y_indices = np.unique(y, return_inverse=True) n_classes = classes.shape[0] @@ -1057,8 +1063,14 @@ def _iter_indices(self, X, y, labels=None): rng = check_random_state(self.random_state) p_i = class_counts / float(n_samples) - n_i = np.round(n_train * p_i).astype(int) - t_i = np.minimum(class_counts - n_i, + if self.stratify_across_classes: + n_train_per_class = np.round(class_counts*train_size) + smallest_size = np.max([1,np.min(n_train_per_class)]) + n_i = np.tile(smallest_size,class_counts.shape).astype(int) + t_i = (class_counts - n_i).astype(int) + else: + n_i = np.round(n_train * p_i).astype(int) + t_i = np.minimum(class_counts - n_i, np.round(n_test * p_i).astype(int)) for _ in range(self.n_iter): @@ -1089,8 +1101,8 @@ def _iter_indices(self, X, y, labels=None): yield train, test -def _validate_shuffle_split_init(test_size, train_size): - """Validation helper 
to check the test_size and train_size at init +def _validate_shuffle_split_init(test_size, train_size,stratify_across_classes): + """Validation helper to check the test_size, train_size and stratify_across_classes at init NOTE This does not take into account the number of samples which is known only at split @@ -1098,6 +1110,11 @@ def _validate_shuffle_split_init(test_size, train_size): if test_size is None and train_size is None: raise ValueError('test_size and train_size can not both be None') + if test_size is not None and train_size is not None\ + and (np.asarray(train_size).dtype.kind == 'f' and np.asarray(test_size).dtype.kind == 'f') \ + and ( train_size + test_size != 1. ): + raise ValueError('test_size and train_size must sum to 1. Got %f' % np.sum(test_size+train_size) ) + if test_size is not None: if np.asarray(test_size).dtype.kind == 'f': if test_size >= 1.: @@ -1123,8 +1140,12 @@ def _validate_shuffle_split_init(test_size, train_size): # int values are checked during split based on the input raise ValueError("Invalid value for train_size: %r" % train_size) + if not isinstance(stratify_across_classes,bool): + raise TypeError('stratify_across_classes must be a boolean flag. Got %s' % + stratify_across_classes.__class__) -def _validate_shuffle_split(n_samples, test_size, train_size): + +def _validate_shuffle_split(n_samples, test_size, train_size, stratify_across_classes): """ Validation helper to check if the test/test sizes are meaningful wrt to the size of the data (n_samples) @@ -1139,20 +1160,34 @@ def _validate_shuffle_split(n_samples, test_size, train_size): raise ValueError("train_size=%d should be smaller than the number of" " samples %d" % (train_size, n_samples)) + # this check is necessary to ensure expected behaviour + if (np.asarray(test_size).dtype.kind == 'f' and np.asarray(train_size).dtype.kind == 'f' + and test_size+train_size>1.0): + raise ValueError("Sum of train and test size percentages must be 1. 
Got %f" % test_size+train_size) + if np.asarray(test_size).dtype.kind == 'f': n_test = ceil(test_size * n_samples) + if train_size is None: + train_size = 1.0 - test_size + n_train = n_samples - n_test elif np.asarray(test_size).dtype.kind == 'i': n_test = float(test_size) - - if train_size is None: - n_train = n_samples - n_test + if train_size is None: + train_size = 1.0 - test_size + n_train = n_samples - n_test elif np.asarray(train_size).dtype.kind == 'f': n_train = floor(train_size * n_samples) - else: + if test_size is None: + test_size = 1.0 - train_size + n_test = n_samples - n_train + elif np.asarray(train_size).dtype.kind == 'i': n_train = float(train_size) - - if test_size is None: n_test = n_samples - n_train + else: + raise TypeError('Unexpected specification of train_size and test_size.' + 'Only one of test_size or train_size must be specified.' + ' either as percentage p such that 0.0 < p < 1.0, ' + ' or as an integer n such that 0 < n < #samples.') if n_train + n_test > n_samples: raise ValueError('The sum of train_size and test_size = %d, ' @@ -1160,7 +1195,12 @@ def _validate_shuffle_split(n_samples, test_size, train_size): 'samples %d. Reduce test_size and/or ' 'train_size.' % (n_train + n_test, n_samples)) - return int(n_train), int(n_test) + if n_train < 1 or n_test < 1: + raise ValueError('There must be at least one data point for training' + 'and one data point for testing. Got {0:d} training' + 'and {1:d} testing points.'.format(n_train, n_test)) + + return int(n_train), int(n_test), float(train_size), float(test_size) class PredefinedSplit(BaseCrossValidator): From e4953bdc78fbbae336a93711f8cb52f9b45993c3 Mon Sep 17 00:00:00 2001 From: Pradeep Reddy Raamana Date: Mon, 7 Dec 2015 03:37:41 -0500 Subject: [PATCH 2/5] removed unnecessary requirement for stratify_across_classes to _validate_shuffle_split. 
--- sklearn/model_selection/_split.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index f5119eed46fe2..2395e01dbe8d1 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1041,7 +1041,7 @@ def __init__(self, n_iter=10, test_size=0.1, train_size=None, def _iter_indices(self, X, y=None, labels=None): n_samples = _num_samples(X) n_train, n_test, train_size, test_size = _validate_shuffle_split(n_samples, self.test_size, - self.train_size, self.stratify_across_classes) + self.train_size) classes, y_indices = np.unique(y, return_inverse=True) n_classes = classes.shape[0] @@ -1145,7 +1145,7 @@ def _validate_shuffle_split_init(test_size, train_size,stratify_across_classes): stratify_across_classes.__class__) -def _validate_shuffle_split(n_samples, test_size, train_size, stratify_across_classes): +def _validate_shuffle_split(n_samples, test_size, train_size): """ Validation helper to check if the test/test sizes are meaningful wrt to the size of the data (n_samples) From 35fdb8bbec73667e0db5630a4366cec77ce26370 Mon Sep 17 00:00:00 2001 From: Pradeep Reddy Raamana Date: Mon, 7 Dec 2015 04:00:09 -0500 Subject: [PATCH 3/5] correcting for the number of output args of _validate_shuffle_split. PyCharm seems to have not saved my previous changes! 
--- sklearn/model_selection/_split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 2395e01dbe8d1..db8606b800364 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -896,7 +896,7 @@ class ShuffleSplit(BaseShuffleSplit): def _iter_indices(self, X, y=None, labels=None): n_samples = _num_samples(X) - n_train, n_test = _validate_shuffle_split(n_samples, self.test_size, + n_train, n_test, train_size, test_size = _validate_shuffle_split(n_samples, self.test_size, self.train_size) rng = check_random_state(self.random_state) for i in range(self.n_iter): From 7b85cacfb59fbc8efb19c2e4abf03f4bc0ffcfd7 Mon Sep 17 00:00:00 2001 From: Pradeep Reddy Raamana Date: Mon, 7 Dec 2015 04:44:10 -0500 Subject: [PATCH 4/5] Fixing the UnboundLocalError with an a priori assignment of None to n_train and n_test. --- sklearn/model_selection/_split.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index db8606b800364..2cf99be5373ed 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1160,6 +1160,10 @@ def _validate_shuffle_split(n_samples, test_size, train_size): raise ValueError("train_size=%d should be smaller than the number of" " samples %d" % (train_size, n_samples)) + # to work around UnboundLocalError + n_train = None + n_test = None + # this check is necessary to ensure expected behaviour if (np.asarray(test_size).dtype.kind == 'f' and np.asarray(train_size).dtype.kind == 'f' and test_size+train_size>1.0): From db82b7905bbc83239546b180d003f30876a808c0 Mon Sep 17 00:00:00 2001 From: Pradeep Reddy Raamana Date: Mon, 7 Dec 2015 19:46:56 -0500 Subject: [PATCH 5/5] Fixes to address the wildly incorrect specification of train and test sizes, and to deal with nose tests properly. 
--- sklearn/model_selection/_split.py | 44 ++++++++++++++++----- sklearn/model_selection/tests/test_split.py | 5 ++- 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 2cf99be5373ed..609606e6ea11c 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1160,49 +1160,75 @@ def _validate_shuffle_split(n_samples, test_size, train_size): raise ValueError("train_size=%d should be smaller than the number of" " samples %d" % (train_size, n_samples)) - # to work around UnboundLocalError - n_train = None - n_test = None + if (np.asarray(test_size).dtype.kind == 'f' and np.asarray(train_size).dtype.kind == 'i' ) or \ + (np.asarray(test_size).dtype.kind == 'i' and np.asarray(train_size).dtype.kind == 'f' ): + raise ValueError("Data types of train and test sizes mixed up. Choose either float or int, only one.") # this check is necessary to ensure expected behaviour if (np.asarray(test_size).dtype.kind == 'f' and np.asarray(train_size).dtype.kind == 'f' - and test_size+train_size>1.0): + and test_size+train_size!=1.0): raise ValueError("Sum of train and test size percentages must be 1. Got %f" % test_size+train_size) + if (np.asarray(test_size).dtype.kind == 'i' and np.asarray(train_size).dtype.kind == 'i' + and test_size+train_size != n_samples): + raise ValueError("Sum of train and test size must be n_samples. 
Got %f" % np.sum([test_size,train_size])) + + # to work around the UnboundLocalError + n_test = None + n_train = None + if np.asarray(test_size).dtype.kind == 'f': n_test = ceil(test_size * n_samples) if train_size is None: train_size = 1.0 - test_size n_train = n_samples - n_test + else: + # TODO probably incorrect way to handle this situation - need to discuss with core-dev and community + n_train = n_samples - n_test elif np.asarray(test_size).dtype.kind == 'i': n_test = float(test_size) if train_size is None: - train_size = 1.0 - test_size + train_size = n_samples - n_test + n_train = n_samples - n_test + else: + # assert ((np.asarray(train_size).dtype.kind == 'i') and (train_size+test_size == n_samples)), \ + # 'when test_size is int, train_size also must be int and the sum must be n_samples' + # TODO assert np.asarray(train_size).dtype.kind == 'i', 'when test_size is int, train_size also must be int.' n_train = n_samples - n_test elif np.asarray(train_size).dtype.kind == 'f': n_train = floor(train_size * n_samples) if test_size is None: test_size = 1.0 - train_size n_test = n_samples - n_train + else: + # TODO assert ( (np.asarray(test_size).dtype.kind == 'f') and (train_size+test_size == 1.0)), \ + # 'when train_size is float, test_size also must be float and they must sum to 1.0' + # assert np.asarray(test_size).dtype.kind == 'f', 'when train_size is float, test_size also must be float.' + n_test = n_samples - n_train elif np.asarray(train_size).dtype.kind == 'i': n_train = float(train_size) n_test = n_samples - n_train + test_size = int(n_test) else: raise TypeError('Unexpected specification of train_size and test_size.' 'Only one of test_size or train_size must be specified.' 
' either as percentage p such that 0.0 < p < 1.0, ' ' or as an integer n such that 0 < n < #samples.') + if n_test is None or n_train is None: + raise ValueError('Combination of various inputs led to invalid calculations.') + if n_train + n_test > n_samples: raise ValueError('The sum of train_size and test_size = %d, ' 'should be smaller than the number of ' 'samples %d. Reduce test_size and/or ' 'train_size.' % (n_train + n_test, n_samples)) - if n_train < 1 or n_test < 1: - raise ValueError('There must be at least one data point for training' - 'and one data point for testing. Got {0:d} training' - 'and {1:d} testing points.'.format(n_train, n_test)) + # TODO find a way to handle the sticky situation in which float & int are mixed to specify train_size and test_size + # if n_train < 1 or n_test < 1: + # raise ValueError('There must be at least one data point for training' + # 'and one data point for testing. Got {0:d} training' + # 'and {1:d} testing points.'.format(n_train, n_test)) return int(n_train), int(n_test), float(train_size), float(test_size) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 4689ead6007d0..cdc33aed40a52 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -559,10 +559,13 @@ def assert_counts_are_ok(idx_counts, p): counter[id] += 1 assert_equal(n_splits, n_iter) - n_train, n_test = _validate_shuffle_split(n_samples, + n_train, n_test, _, _ = _validate_shuffle_split(n_samples, test_size=1./n_folds, train_size=1.-(1./n_folds)) + # TODO need to revise this test for the proposed modification of StratifiedShuffleSplit + # TODO this is also to do with ensuring train_size + test_size = 1.0 when they are float + # TODO and similary when they are int, train_size + test_size = n_samples assert_equal(len(train), n_train) assert_equal(len(test), n_test) assert_equal(len(set(train).intersection(test)), 0)