From 2b11ebb3e3e8a4a4fa382dc92fb377eea186ddd4 Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana <raamana@gmail.com>
Date: Mon, 7 Dec 2015 02:58:28 -0500
Subject: [PATCH 1/5] Introduced a new optional flag stratify_across_classes to
 StratifiedShuffleSplit. Programmed the inner workings to achieve the expected
 CV splits. Enhanced the validation of the specification for train_size and
 test_size (sum=1). There might be a potential bug in the ShuffleSplit class,
 which primarily anchors on test_size and doesn't handle the specification of
 train_size (or infer the test_size appropriately). It seems like not much
 testing has been performed in cases when the user only specifies train_size.

---
 sklearn/model_selection/_split.py | 78 +++++++++++++++++++++++--------
 1 file changed, 59 insertions(+), 19 deletions(-)

diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py
index 6b8e6ea55af33..f5119eed46fe2 100644
--- a/sklearn/model_selection/_split.py
+++ b/sklearn/model_selection/_split.py
@@ -772,11 +772,12 @@ class BaseShuffleSplit(with_metaclass(ABCMeta)):
     """Base class for ShuffleSplit and StratifiedShuffleSplit"""
 
     def __init__(self, n_iter=10, test_size=0.1, train_size=None,
-                 random_state=None):
-        _validate_shuffle_split_init(test_size, train_size)
+                stratify_across_classes=False, random_state=None):
+        _validate_shuffle_split_init(test_size, train_size,stratify_across_classes)
         self.n_iter = n_iter
         self.test_size = test_size
         self.train_size = train_size
+        self.stratify_across_classes = stratify_across_classes
         self.random_state = random_state
 
     def split(self, X, y=None, labels=None):
@@ -1005,6 +1006,11 @@ class StratifiedShuffleSplit(BaseShuffleSplit):
         int, represents the absolute number of train samples. If None,
         the value is automatically set to the complement of the test size.
 
+    stratify_across_classes : bool (default is False)
+        If True, the sizes for the different classes in the training set
+        are made equal to the fixed percentage (or number) from the smallest class.
+        The underlying motivation is to avoid class-imbalance during training phase.
+
     random_state : int or RandomState
         Pseudo-random number generator state used for random sampling.
 
@@ -1028,14 +1034,14 @@ class StratifiedShuffleSplit(BaseShuffleSplit):
     """
 
     def __init__(self, n_iter=10, test_size=0.1, train_size=None,
-                 random_state=None):
+                 random_state=None, stratify_across_classes=False):
         super(StratifiedShuffleSplit, self).__init__(
-            n_iter, test_size, train_size, random_state)
+            n_iter, test_size, train_size, stratify_across_classes, random_state)
 
-    def _iter_indices(self, X, y, labels=None):
+    def _iter_indices(self, X, y=None, labels=None):
         n_samples = _num_samples(X)
-        n_train, n_test = _validate_shuffle_split(n_samples, self.test_size,
-                                                  self.train_size)
+        n_train, n_test, train_size, test_size = _validate_shuffle_split(n_samples, self.test_size,
+                                                  self.train_size, self.stratify_across_classes)
         classes, y_indices = np.unique(y, return_inverse=True)
         n_classes = classes.shape[0]
 
@@ -1057,8 +1063,14 @@ def _iter_indices(self, X, y, labels=None):
 
         rng = check_random_state(self.random_state)
         p_i = class_counts / float(n_samples)
-        n_i = np.round(n_train * p_i).astype(int)
-        t_i = np.minimum(class_counts - n_i,
+        if self.stratify_across_classes:
+            n_train_per_class = np.round(class_counts*train_size)
+            smallest_size = np.max([1,np.min(n_train_per_class)])
+            n_i = np.tile(smallest_size,class_counts.shape).astype(int)
+            t_i = (class_counts - n_i).astype(int)
+        else:
+            n_i = np.round(n_train * p_i).astype(int)
+            t_i = np.minimum(class_counts - n_i,
                          np.round(n_test * p_i).astype(int))
 
         for _ in range(self.n_iter):
@@ -1089,8 +1101,8 @@ def _iter_indices(self, X, y, labels=None):
             yield train, test
 
 
-def _validate_shuffle_split_init(test_size, train_size):
-    """Validation helper to check the test_size and train_size at init
+def _validate_shuffle_split_init(test_size, train_size,stratify_across_classes):
+    """Validation helper to check the test_size, train_size and stratify_across_classes at init
 
     NOTE This does not take into account the number of samples which is known
     only at split
@@ -1098,6 +1110,11 @@ def _validate_shuffle_split_init(test_size, train_size):
     if test_size is None and train_size is None:
         raise ValueError('test_size and train_size can not both be None')
 
+    if test_size is not None and train_size is not None\
+            and (np.asarray(train_size).dtype.kind == 'f' and np.asarray(test_size).dtype.kind == 'f') \
+            and ( train_size + test_size != 1. ):
+        raise ValueError('test_size and train_size must sum to 1. Got %f' % np.sum(test_size+train_size) )
+
     if test_size is not None:
         if np.asarray(test_size).dtype.kind == 'f':
             if test_size >= 1.:
@@ -1123,8 +1140,12 @@ def _validate_shuffle_split_init(test_size, train_size):
             # int values are checked during split based on the input
             raise ValueError("Invalid value for train_size: %r" % train_size)
 
+    if not isinstance(stratify_across_classes,bool):
+        raise TypeError('stratify_across_classes must be a boolean flag. Got %s' %
+                        stratify_across_classes.__class__)
 
-def _validate_shuffle_split(n_samples, test_size, train_size):
+
+def _validate_shuffle_split(n_samples, test_size, train_size, stratify_across_classes):
     """
     Validation helper to check if the test/test sizes are meaningful wrt to the
     size of the data (n_samples)
@@ -1139,20 +1160,34 @@ def _validate_shuffle_split(n_samples, test_size, train_size):
         raise ValueError("train_size=%d should be smaller than the number of"
                          " samples %d" % (train_size, n_samples))
 
+    # this check is necessary to ensure expected behaviour
+    if (np.asarray(test_size).dtype.kind == 'f' and np.asarray(train_size).dtype.kind == 'f'
+        and test_size+train_size>1.0):
+        raise ValueError("Sum of train and test size percentages must be 1. Got %f" % test_size+train_size)
+
     if np.asarray(test_size).dtype.kind == 'f':
         n_test = ceil(test_size * n_samples)
+        if train_size is None:
+            train_size = 1.0 - test_size
+            n_train = n_samples - n_test
     elif np.asarray(test_size).dtype.kind == 'i':
         n_test = float(test_size)
-
-    if train_size is None:
-        n_train = n_samples - n_test
+        if train_size is None:
+            train_size = 1.0 - test_size
+            n_train = n_samples - n_test
     elif np.asarray(train_size).dtype.kind == 'f':
         n_train = floor(train_size * n_samples)
-    else:
+        if test_size is None:
+            test_size = 1.0 - train_size
+            n_test = n_samples - n_train
+    elif np.asarray(train_size).dtype.kind == 'i':
         n_train = float(train_size)
-
-    if test_size is None:
         n_test = n_samples - n_train
+    else:
+        raise TypeError('Unexpected specification of train_size and test_size.'
+                        'Only one of test_size or train_size must be specified.'
+                        ' either as percentage p such that 0.0 < p < 1.0, '
+                        '     or as an integer n such that 0 < n < #samples.')
 
     if n_train + n_test > n_samples:
         raise ValueError('The sum of train_size and test_size = %d, '
@@ -1160,7 +1195,12 @@ def _validate_shuffle_split(n_samples, test_size, train_size):
                          'samples %d. Reduce test_size and/or '
                          'train_size.' % (n_train + n_test, n_samples))
 
-    return int(n_train), int(n_test)
+    if n_train < 1 or n_test < 1:
+        raise ValueError('There must be at least one data point for training'
+                            'and one data point for testing. Got {0:d} training'
+                            'and {1:d} testing points.'.format(n_train, n_test))
+
+    return int(n_train), int(n_test), float(train_size), float(test_size)
 
 
 class PredefinedSplit(BaseCrossValidator):

From e4953bdc78fbbae336a93711f8cb52f9b45993c3 Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana <raamana@gmail.com>
Date: Mon, 7 Dec 2015 03:37:41 -0500
Subject: [PATCH 2/5] Removed the unnecessary stratify_across_classes parameter
 from _validate_shuffle_split.

---
 sklearn/model_selection/_split.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py
index f5119eed46fe2..2395e01dbe8d1 100644
--- a/sklearn/model_selection/_split.py
+++ b/sklearn/model_selection/_split.py
@@ -1041,7 +1041,7 @@ def __init__(self, n_iter=10, test_size=0.1, train_size=None,
     def _iter_indices(self, X, y=None, labels=None):
         n_samples = _num_samples(X)
         n_train, n_test, train_size, test_size = _validate_shuffle_split(n_samples, self.test_size,
-                                                  self.train_size, self.stratify_across_classes)
+                                                  self.train_size)
         classes, y_indices = np.unique(y, return_inverse=True)
         n_classes = classes.shape[0]
 
@@ -1145,7 +1145,7 @@ def _validate_shuffle_split_init(test_size, train_size,stratify_across_classes):
                         stratify_across_classes.__class__)
 
 
-def _validate_shuffle_split(n_samples, test_size, train_size, stratify_across_classes):
+def _validate_shuffle_split(n_samples, test_size, train_size):
     """
     Validation helper to check if the test/test sizes are meaningful wrt to the
     size of the data (n_samples)

From 35fdb8bbec73667e0db5630a4366cec77ce26370 Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana <raamana@gmail.com>
Date: Mon, 7 Dec 2015 04:00:09 -0500
Subject: [PATCH 3/5] Correcting for the number of output args of
 _validate_shuffle_split. PyCharm seems not to have saved my previous
 changes!

---
 sklearn/model_selection/_split.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py
index 2395e01dbe8d1..db8606b800364 100644
--- a/sklearn/model_selection/_split.py
+++ b/sklearn/model_selection/_split.py
@@ -896,7 +896,7 @@ class ShuffleSplit(BaseShuffleSplit):
 
     def _iter_indices(self, X, y=None, labels=None):
         n_samples = _num_samples(X)
-        n_train, n_test = _validate_shuffle_split(n_samples, self.test_size,
+        n_train, n_test, train_size, test_size = _validate_shuffle_split(n_samples, self.test_size,
                                                   self.train_size)
         rng = check_random_state(self.random_state)
         for i in range(self.n_iter):

From 7b85cacfb59fbc8efb19c2e4abf03f4bc0ffcfd7 Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana <raamana@gmail.com>
Date: Mon, 7 Dec 2015 04:44:10 -0500
Subject: [PATCH 4/5] Fixing the UnboundLocalError with an a priori assignment
 of None to n_train and n_test.

---
 sklearn/model_selection/_split.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py
index db8606b800364..2cf99be5373ed 100644
--- a/sklearn/model_selection/_split.py
+++ b/sklearn/model_selection/_split.py
@@ -1160,6 +1160,10 @@ def _validate_shuffle_split(n_samples, test_size, train_size):
         raise ValueError("train_size=%d should be smaller than the number of"
                          " samples %d" % (train_size, n_samples))
 
+    # to work around UnboundLocalError
+    n_train = None
+    n_test = None
+
     # this check is necessary to ensure expected behaviour
     if (np.asarray(test_size).dtype.kind == 'f' and np.asarray(train_size).dtype.kind == 'f'
         and test_size+train_size>1.0):

From db82b7905bbc83239546b180d003f30876a808c0 Mon Sep 17 00:00:00 2001
From: Pradeep Reddy Raamana <praamana@research.baycrest.org>
Date: Mon, 7 Dec 2015 19:46:56 -0500
Subject: [PATCH 5/5] Fixes to address the wildly incorrect specification of
 train and test sizes, and to deal with nose tests properly.

---
 sklearn/model_selection/_split.py           | 44 ++++++++++++++++-----
 sklearn/model_selection/tests/test_split.py |  5 ++-
 2 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py
index 2cf99be5373ed..609606e6ea11c 100644
--- a/sklearn/model_selection/_split.py
+++ b/sklearn/model_selection/_split.py
@@ -1160,49 +1160,75 @@ def _validate_shuffle_split(n_samples, test_size, train_size):
         raise ValueError("train_size=%d should be smaller than the number of"
                          " samples %d" % (train_size, n_samples))
 
-    # to work around UnboundLocalError
-    n_train = None
-    n_test = None
+    if (np.asarray(test_size).dtype.kind == 'f' and np.asarray(train_size).dtype.kind == 'i' ) or \
+            (np.asarray(test_size).dtype.kind == 'i' and np.asarray(train_size).dtype.kind == 'f' ):
+        raise ValueError("Data types of train and test sizes mixed up. Choose either float or int, only one.")
 
     # this check is necessary to ensure expected behaviour
     if (np.asarray(test_size).dtype.kind == 'f' and np.asarray(train_size).dtype.kind == 'f'
-        and test_size+train_size>1.0):
+        and test_size+train_size!=1.0):
         raise ValueError("Sum of train and test size percentages must be 1. Got %f" % test_size+train_size)
 
+    if (np.asarray(test_size).dtype.kind == 'i' and np.asarray(train_size).dtype.kind == 'i'
+        and test_size+train_size != n_samples):
+        raise ValueError("Sum of train and test size must be n_samples. Got %f" % np.sum([test_size,train_size]))
+
+    # to work around the UnboundLocalError
+    n_test = None
+    n_train = None
+
     if np.asarray(test_size).dtype.kind == 'f':
         n_test = ceil(test_size * n_samples)
         if train_size is None:
             train_size = 1.0 - test_size
             n_train = n_samples - n_test
+        else:
+            # TODO probably incorrect way to handle this situation - need to discuss with core-dev and community
+            n_train = n_samples - n_test
     elif np.asarray(test_size).dtype.kind == 'i':
         n_test = float(test_size)
         if train_size is None:
-            train_size = 1.0 - test_size
+            train_size = n_samples - n_test
+            n_train = n_samples - n_test
+        else:
+            # assert ((np.asarray(train_size).dtype.kind == 'i') and (train_size+test_size == n_samples)), \
+            #     'when test_size is int, train_size also must be int and the sum must be n_samples'
+            # TODO assert np.asarray(train_size).dtype.kind == 'i', 'when test_size is int, train_size also must be int.'
             n_train = n_samples - n_test
     elif np.asarray(train_size).dtype.kind == 'f':
         n_train = floor(train_size * n_samples)
         if test_size is None:
             test_size = 1.0 - train_size
             n_test = n_samples - n_train
+        else:
+            # TODO assert ( (np.asarray(test_size).dtype.kind == 'f') and (train_size+test_size == 1.0)), \
+            #     'when train_size is float, test_size also must be float and they must sum to 1.0'
+            # assert np.asarray(test_size).dtype.kind == 'f', 'when train_size is float, test_size also must be float.'
+            n_test = n_samples - n_train
     elif np.asarray(train_size).dtype.kind == 'i':
         n_train = float(train_size)
         n_test = n_samples - n_train
+        test_size = int(n_test)
     else:
         raise TypeError('Unexpected specification of train_size and test_size.'
                         'Only one of test_size or train_size must be specified.'
                         ' either as percentage p such that 0.0 < p < 1.0, '
                         '     or as an integer n such that 0 < n < #samples.')
 
+    if n_test is None or n_train is None:
+        raise ValueError('Combination of various inputs led to invalid calculations.')
+
     if n_train + n_test > n_samples:
         raise ValueError('The sum of train_size and test_size = %d, '
                          'should be smaller than the number of '
                          'samples %d. Reduce test_size and/or '
                          'train_size.' % (n_train + n_test, n_samples))
 
-    if n_train < 1 or n_test < 1:
-        raise ValueError('There must be at least one data point for training'
-                            'and one data point for testing. Got {0:d} training'
-                            'and {1:d} testing points.'.format(n_train, n_test))
+    # TODO find a way to handle the sticky situation in which float & int are mixed to specify train_size and test_size
+    # if n_train < 1 or n_test < 1:
+    #     raise ValueError('There must be at least one data point for training'
+    #                         'and one data point for testing. Got {0:d} training'
+    #                         'and {1:d} testing points.'.format(n_train, n_test))
 
     return int(n_train), int(n_test), float(train_size), float(test_size)
 
diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py
index 4689ead6007d0..cdc33aed40a52 100644
--- a/sklearn/model_selection/tests/test_split.py
+++ b/sklearn/model_selection/tests/test_split.py
@@ -559,10 +559,13 @@ def assert_counts_are_ok(idx_counts, p):
                     counter[id] += 1
         assert_equal(n_splits, n_iter)
 
-        n_train, n_test = _validate_shuffle_split(n_samples,
+        n_train, n_test, _, _ = _validate_shuffle_split(n_samples,
                                                   test_size=1./n_folds,
                                                   train_size=1.-(1./n_folds))
 
+        # TODO need to revise this test for the proposed modification of StratifiedShuffleSplit
+        # TODO this is also to do with ensuring train_size + test_size = 1.0 when they are float
+        # TODO and similarly when they are int, train_size + test_size = n_samples
         assert_equal(len(train), n_train)
         assert_equal(len(test), n_test)
         assert_equal(len(set(train).intersection(test)), 0)