From d93b74ccf342819a17e2bfdec05f2d9a460aa5cc Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Tue, 26 Jan 2021 15:33:01 +0300 Subject: [PATCH 01/72] KBD changes 'auto' option added (Sturges rule); KMeans algorithm changed from 'auto' to 'full'; Format-strings changed to f-strings. --- sklearn/preprocessing/_discretization.py | 73 ++++++++++++++---------- 1 file changed, 42 insertions(+), 31 deletions(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 22fa236f3314e..caecdf5917920 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -28,8 +28,9 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): Parameters ---------- - n_bins : int or array-like of shape (n_features,), default=5 + n_bins : int or array-like of shape (n_features,), default='auto' The number of bins to produce. Raises ValueError if ``n_bins < 2``. + For 'auto' option Sturges formula is used. encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot' Method used to encode the transformed result. @@ -126,7 +127,7 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): """ @_deprecate_positional_args - def __init__(self, n_bins=5, *, encode='onehot', strategy='quantile', + def __init__(self, n_bins='auto', *, encode='onehot', strategy='quantile', dtype=None): self.n_bins = n_bins self.encode = encode @@ -163,20 +164,22 @@ def fit(self, X, y=None): f"{supported_dtype + (None,)}. Got dtype={self.dtype} " f" instead." ) - valid_encode = ('onehot', 'onehot-dense', 'ordinal') if self.encode not in valid_encode: - raise ValueError("Valid options for 'encode' are {}. " - "Got encode={!r} instead." - .format(valid_encode, self.encode)) + raise ValueError( + f"Valid options for 'encode' are {valid_encode}. " + f"Got encode={self.encode!r} instead." + ) valid_strategy = ('uniform', 'quantile', 'kmeans') if self.strategy not in valid_strategy: - raise ValueError("Valid options for 'strategy' are {}. " - "Got strategy={!r} instead." - .format(valid_strategy, self.strategy)) + raise ValueError( + f"Valid options for 'strategy' are {valid_strategy}. " + f"Got strategy={self.strategy!r} instead." + ) n_features = X.shape[1] - n_bins = self._validate_n_bins(n_features) + n_samples = X.shape[0] + n_bins = self._validate_n_bins(n_features, n_samples) bin_edges = np.zeros(n_features, dtype=object) for jj in range(n_features): @@ -184,8 +187,8 @@ def fit(self, X, y=None): col_min, col_max = column.min(), column.max() if col_min == col_max: - warnings.warn("Feature %d is constant and will be " - "replaced with 0." % jj) + warnings.warn(f"Feature {jj} is constant and will be " + f"replaced with 0.") n_bins[jj] = 1 bin_edges[jj] = np.array([-np.inf, np.inf]) continue @@ -205,7 +208,7 @@ def fit(self, X, y=None): init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5 # 1D k-means procedure - km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1) + km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1, algorithm='full') centers = km.fit(column[:, None]).cluster_centers_[:, 0] # Must sort, centers may be unsorted even with sorted init centers.sort() @@ -217,9 +220,9 @@ def fit(self, X, y=None): mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8 bin_edges[jj] = bin_edges[jj][mask] if len(bin_edges[jj]) - 1 != n_bins[jj]: - warnings.warn('Bins whose width are too small (i.e., <= ' - '1e-8) in feature %d are removed. Consider ' - 'decreasing the number of bins.' % jj) + warnings.warn(f"Bins whose width are too small (i.e., <= " + f"1e-8) in feature {jj} are removed. Consider " + f"decreasing the number of bins.") n_bins[jj] = len(bin_edges[jj]) - 1 self.bin_edges_ = bin_edges @@ -236,20 +239,25 @@ def fit(self, X, y=None): return self - def _validate_n_bins(self, n_features): + def _validate_n_bins(self, n_features, n_samples): """Returns n_bins_, the number of bins per feature. """ - orig_bins = self.n_bins + if self.n_bins == 'auto': + # calculcate number of bins depending on number of samples in X + orig_bins = np.ceil(np.log2(n_samples) + 1.) + else: + orig_bins = self.n_bins if isinstance(orig_bins, numbers.Number): if not isinstance(orig_bins, numbers.Integral): - raise ValueError("{} received an invalid n_bins type. " - "Received {}, expected int." - .format(KBinsDiscretizer.__name__, - type(orig_bins).__name__)) + raise ValueError( + f"{KBinsDiscretizer.__name__} received an invalid n_bins type. " + f"Received {type(orig_bins).__name__}, expected int." + ) if orig_bins < 2: - raise ValueError("{} received an invalid number " - "of bins. Received {}, expected at least 2." - .format(KBinsDiscretizer.__name__, orig_bins)) + raise ValueError( + f"{KBinsDiscretizer.__name__} received an invalid number " + f"of bins. Received {orig_bins}, expected at least 2." + ) return np.full(n_features, orig_bins, dtype=int) n_bins = check_array(orig_bins, dtype=int, copy=True, @@ -264,10 +272,11 @@ def _validate_n_bins(self, n_features): violating_indices = np.where(bad_nbins_value)[0] if violating_indices.shape[0] > 0: indices = ", ".join(str(i) for i in violating_indices) - raise ValueError("{} received an invalid number " - "of bins at indices {}. Number of bins " - "must be at least 2, and must be an int." - .format(KBinsDiscretizer.__name__, indices)) + raise ValueError( + f"{KBinsDiscretizer.__name__} received an invalid number " + f"of bins at indices {indices}. Number of bins " + f"must be at least 2, and must be an int." + ) return n_bins def transform(self, X): @@ -342,8 +351,10 @@ def inverse_transform(self, Xt): Xinv = check_array(Xt, copy=True, dtype=(np.float64, np.float32)) n_features = self.n_bins_.shape[0] if Xinv.shape[1] != n_features: - raise ValueError("Incorrect number of features. Expecting {}, " - "received {}.".format(n_features, Xinv.shape[1])) + raise ValueError( + f"Incorrect number of features. Expecting {n_features}, " + f"received {Xinv.shape[1]}." + ) for jj in range(n_features): bin_edges = self.bin_edges_[jj] From 59a84d6408339bf7ffd7e7b80cc524f9018c84c9 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Tue, 26 Jan 2021 15:45:58 +0300 Subject: [PATCH 02/72] Small fix --- sklearn/preprocessing/_discretization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index caecdf5917920..7a561c7ccd3fa 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -244,7 +244,7 @@ def _validate_n_bins(self, n_features, n_samples): """ if self.n_bins == 'auto': # calculcate number of bins depending on number of samples in X - orig_bins = np.ceil(np.log2(n_samples) + 1.) + orig_bins = int(np.ceil(np.log2(n_samples) + 1.)) else: orig_bins = self.n_bins if isinstance(orig_bins, numbers.Number): From 566ac59799d4cb01dcf48763b03b69e22cb78459 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Tue, 26 Jan 2021 20:16:55 +0300 Subject: [PATCH 03/72] Added checks for n_bins=str --- sklearn/preprocessing/_discretization.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 7a561c7ccd3fa..a0d4e1a76a09f 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -242,11 +242,17 @@ def fit(self, X, y=None): def _validate_n_bins(self, n_features, n_samples): """Returns n_bins_, the number of bins per feature. """ - if self.n_bins == 'auto': - # calculcate number of bins depending on number of samples in X - orig_bins = int(np.ceil(np.log2(n_samples) + 1.)) - else: - orig_bins = self.n_bins + orig_bins = self.n_bins + if isinstance(orig_bins, str): + if self.n_bins == 'auto': + # calculcate number of bins + # depending on number of samples with Sturges rule + orig_bins = int(np.ceil(np.log2(n_samples) + 1.)) + else: + raise ValueError( + f"{KBinsDiscretizer.__name__} received an invalid n_bins value " + f"{orig_bins!r}, while only 'auto' is supported." + ) if isinstance(orig_bins, numbers.Number): if not isinstance(orig_bins, numbers.Integral): raise ValueError( From 675e1e6db86f94d1e44ef8a51e32b8dad580524f Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Thu, 28 Jan 2021 08:29:34 +0000 Subject: [PATCH 04/72] Lint changes --- sklearn/preprocessing/_discretization.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index a0d4e1a76a09f..d6fe4c2e9808c 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -208,7 +208,8 @@ def fit(self, X, y=None): init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5 # 1D k-means procedure - km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1, algorithm='full') + km = KMeans(n_clusters=n_bins[jj], init=init, + n_init=1, algorithm='full') centers = km.fit(column[:, None]).cluster_centers_[:, 0] # Must sort, centers may be unsorted even with sorted init centers.sort() @@ -220,9 +221,10 @@ def fit(self, X, y=None): mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8 bin_edges[jj] = bin_edges[jj][mask] if len(bin_edges[jj]) - 1 != n_bins[jj]: - warnings.warn(f"Bins whose width are too small (i.e., <= " - f"1e-8) in feature {jj} are removed. Consider " - f"decreasing the number of bins.") + warnings.warn(f"Bins whose width are too small " + f"(i.e., <= 1e-8) in feature {jj} " + f"are removed. Consider decreasing " + f"the number of bins.") n_bins[jj] = len(bin_edges[jj]) - 1 self.bin_edges_ = bin_edges @@ -250,13 +252,15 @@ def _validate_n_bins(self, n_features, n_samples): orig_bins = int(np.ceil(np.log2(n_samples) + 1.)) else: raise ValueError( - f"{KBinsDiscretizer.__name__} received an invalid n_bins value " + f"{KBinsDiscretizer.__name__} received " + f"an invalid n_bins value " f"{orig_bins!r}, while only 'auto' is supported." ) if isinstance(orig_bins, numbers.Number): if not isinstance(orig_bins, numbers.Integral): raise ValueError( - f"{KBinsDiscretizer.__name__} received an invalid n_bins type. " + f"{KBinsDiscretizer.__name__} received " + f"an invalid n_bins type. " f"Received {type(orig_bins).__name__}, expected int." ) if orig_bins < 2: From ab4f868aa0845443e71019078a297f3672282768 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Thu, 28 Jan 2021 09:01:56 +0000 Subject: [PATCH 05/72] Changed behaviour to catch n_bins<2 in 'auto' --- sklearn/preprocessing/_discretization.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index d6fe4c2e9808c..3adc96316d707 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -244,24 +244,18 @@ def fit(self, X, y=None): def _validate_n_bins(self, n_features, n_samples): """Returns n_bins_, the number of bins per feature. """ - orig_bins = self.n_bins - if isinstance(orig_bins, str): - if self.n_bins == 'auto': - # calculcate number of bins - # depending on number of samples with Sturges rule - orig_bins = int(np.ceil(np.log2(n_samples) + 1.)) - else: - raise ValueError( - f"{KBinsDiscretizer.__name__} received " - f"an invalid n_bins value " - f"{orig_bins!r}, while only 'auto' is supported." - ) + if self.n_bins == 'auto': + # calculcate number of bins + # depending on number of samples with Sturges rule + orig_bins = int(np.ceil(np.log2(n_samples) + 1.)) + else: + orig_bins = self.n_bins if isinstance(orig_bins, numbers.Number): if not isinstance(orig_bins, numbers.Integral): raise ValueError( f"{KBinsDiscretizer.__name__} received " - f"an invalid n_bins type. " - f"Received {type(orig_bins).__name__}, expected int." + f"an invalid n_bins type. Received " + f"{type(orig_bins).__name__}, expected int or 'auto'." ) if orig_bins < 2: raise ValueError( From 87579eddef96a3de24514669fd563f7a9acbdd2d Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Thu, 28 Jan 2021 10:01:59 +0000 Subject: [PATCH 06/72] Update _discretization.py --- sklearn/preprocessing/_discretization.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 3adc96316d707..75cfeec4c4ddf 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -244,13 +244,12 @@ def fit(self, X, y=None): def _validate_n_bins(self, n_features, n_samples): """Returns n_bins_, the number of bins per feature. """ - if self.n_bins == 'auto': - # calculcate number of bins - # depending on number of samples with Sturges rule - orig_bins = int(np.ceil(np.log2(n_samples) + 1.)) - else: - orig_bins = self.n_bins - if isinstance(orig_bins, numbers.Number): + orig_bins = self.n_bins + if isinstance(orig_bins, numbers.Number) | isinstance(orig_bins, str): + if self.n_bins == 'auto': + # calculcate number of bins + # depending on number of samples with Sturges rule + orig_bins = int(np.ceil(np.log2(n_samples) + 1.)) if not isinstance(orig_bins, numbers.Integral): raise ValueError( f"{KBinsDiscretizer.__name__} received " From 44ea08db3019861160e9546f96ae2a3f2b45e5e1 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Thu, 28 Jan 2021 10:05:44 +0000 Subject: [PATCH 07/72] Update _discretization.py --- sklearn/preprocessing/_discretization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 75cfeec4c4ddf..344f48647908b 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -246,7 +246,7 @@ def _validate_n_bins(self, n_features, n_samples): """ orig_bins = self.n_bins if isinstance(orig_bins, numbers.Number) | isinstance(orig_bins, str): - if self.n_bins == 'auto': + if orig_bins == 'auto': # calculcate number of bins # depending on number of samples with Sturges rule orig_bins = int(np.ceil(np.log2(n_samples) + 1.)) From 731de9dd0c16e7dc7c013c1db7d65dc61f98634b Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Thu, 28 Jan 2021 10:48:04 +0000 Subject: [PATCH 08/72] Update sklearn/preprocessing/_discretization.py Co-authored-by: Joel Nothman --- sklearn/preprocessing/_discretization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 344f48647908b..27e7f97fcebac 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -245,7 +245,7 @@ def _validate_n_bins(self, n_features, n_samples): """Returns n_bins_, the number of bins per feature. """ orig_bins = self.n_bins - if isinstance(orig_bins, numbers.Number) | isinstance(orig_bins, str): + if isinstance(orig_bins, numbers.Number) or isinstance(orig_bins, str): if orig_bins == 'auto': # calculcate number of bins # depending on number of samples with Sturges rule From 55b84f683541dbd3391972bd21b2b3e848789b29 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Thu, 28 Jan 2021 10:48:21 +0000 Subject: [PATCH 09/72] Update sklearn/preprocessing/_discretization.py Co-authored-by: Joel Nothman --- sklearn/preprocessing/_discretization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 27e7f97fcebac..03cb171fbbecd 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -30,7 +30,7 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): ---------- n_bins : int or array-like of shape (n_features,), default='auto' The number of bins to produce. Raises ValueError if ``n_bins < 2``. - For 'auto' option Sturges formula is used. + For 'auto' option Sturges formula is used: bins are log(n_samples) + 1. encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot' Method used to encode the transformed result. From fc9f93582c8f884af8793b465a4aedf511904e29 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Thu, 28 Jan 2021 11:35:36 +0000 Subject: [PATCH 10/72] Update test_discretization.py --- sklearn/preprocessing/tests/test_discretization.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py index 9d607c82d5831..fe1c9c114b19f 100644 --- a/sklearn/preprocessing/tests/test_discretization.py +++ b/sklearn/preprocessing/tests/test_discretization.py @@ -45,7 +45,14 @@ def test_invalid_n_bins(): est = KBinsDiscretizer(n_bins=1.1) err_msg = ("KBinsDiscretizer received an invalid " - "n_bins type. Received float, expected int.") + "n_bins type. Received float, expected int or 'auto'.") + with pytest.raises(ValueError, match=err_msg): + est.fit_transform(X) + + # Bad string value + est = KBinsDiscretizer(n_bins='rice') + err_msg = ("KBinsDiscretizer received an invalid " + "n_bins type. Received float, expected int or 'auto'.") with pytest.raises(ValueError, match=err_msg): est.fit_transform(X) From f5bff722cccfea293ed6016b8af0b50f27d95b0c Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Thu, 28 Jan 2021 11:41:57 +0000 Subject: [PATCH 11/72] Update test_discretization.py --- sklearn/preprocessing/tests/test_discretization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py index fe1c9c114b19f..95abb40e6503d 100644 --- a/sklearn/preprocessing/tests/test_discretization.py +++ b/sklearn/preprocessing/tests/test_discretization.py @@ -48,7 +48,7 @@ def test_invalid_n_bins(): "n_bins type. Received float, expected int or 'auto'.") with pytest.raises(ValueError, match=err_msg): est.fit_transform(X) - + # Bad string value est = KBinsDiscretizer(n_bins='rice') err_msg = ("KBinsDiscretizer received an invalid " From a8e24e543b7569b56e99fc081cad1c1db883adef Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Thu, 28 Jan 2021 14:21:53 +0000 Subject: [PATCH 12/72] Update _discretization.py --- sklearn/preprocessing/_discretization.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 03cb171fbbecd..eaee8e7b794ae 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -28,7 +28,8 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): Parameters ---------- - n_bins : int or array-like of shape (n_features,), default='auto' + n_bins : int, array-like of integers of shape (n_features,) or 'auto', \ + default='auto' The number of bins to produce. Raises ValueError if ``n_bins < 2``. For 'auto' option Sturges formula is used: bins are log(n_samples) + 1. From c3638272edaf089435b1a4854f5594cad8878659 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Thu, 28 Jan 2021 14:36:51 +0000 Subject: [PATCH 13/72] Update _discretization.py --- sklearn/preprocessing/_discretization.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index eaee8e7b794ae..d9cb7ba0fc955 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -29,9 +29,12 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): Parameters ---------- n_bins : int, array-like of integers of shape (n_features,) or 'auto', \ - default='auto' + default=5 The number of bins to produce. Raises ValueError if ``n_bins < 2``. For 'auto' option Sturges formula is used: bins are log(n_samples) + 1. + + .. versionadded:: 0.24 + Added 'auto' option encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot' Method used to encode the transformed result. @@ -128,7 +131,7 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): """ @_deprecate_positional_args - def __init__(self, n_bins='auto', *, encode='onehot', strategy='quantile', + def __init__(self, n_bins='warn', *, encode='onehot', strategy='quantile', dtype=None): self.n_bins = n_bins self.encode = encode @@ -152,6 +155,10 @@ def fit(self, X, y=None): ------- self """ + if self.n_bins == 'warn': + warnings.warn("The default value of n_bins will change from " + "5 to 'auto' in 0.25.", FutureWarning) + self.n_bins = 5 X = self._validate_data(X, dtype='numeric') supported_dtype = (np.float64, np.float32) From 6d767c5a38464a04609f004f559036b3ecfbec64 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Thu, 28 Jan 2021 15:21:55 +0000 Subject: [PATCH 14/72] Update _discretization.py --- sklearn/preprocessing/_discretization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index d9cb7ba0fc955..39046a9689c38 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -32,7 +32,7 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): default=5 The number of bins to produce. Raises ValueError if ``n_bins < 2``. For 'auto' option Sturges formula is used: bins are log(n_samples) + 1. - + .. versionadded:: 0.24 Added 'auto' option From dc6b095ce3f9fe28c0bcd3aba6b018cc1112b24a Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Thu, 28 Jan 2021 16:47:43 +0000 Subject: [PATCH 15/72] Update _discretization.py --- sklearn/preprocessing/_discretization.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 39046a9689c38..6a4cfda168d5d 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -158,7 +158,8 @@ def fit(self, X, y=None): if self.n_bins == 'warn': warnings.warn("The default value of n_bins will change from " "5 to 'auto' in 0.25.", FutureWarning) - self.n_bins = 5 + self.n_bins = 5 + X = self._validate_data(X, dtype='numeric') supported_dtype = (np.float64, np.float32) From 1dac5f902cac182fce96bb3cc6a6d29f3ba2ad4b Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Thu, 28 Jan 2021 16:50:39 +0000 Subject: [PATCH 16/72] Update _discretization.py --- sklearn/preprocessing/_discretization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 6a4cfda168d5d..d919aeedc160f 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -159,7 +159,7 @@ def fit(self, X, y=None): warnings.warn("The default value of n_bins will change from " "5 to 'auto' in 0.25.", FutureWarning) self.n_bins = 5 - + X = self._validate_data(X, dtype='numeric') supported_dtype = (np.float64, np.float32) From abff57615c49204c62e6ba8647473a028bbcd45e Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Fri, 29 Jan 2021 08:10:48 +0000 Subject: [PATCH 17/72] Update _discretization.py --- sklearn/preprocessing/_discretization.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index d919aeedc160f..75a15db35e49a 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -134,6 +134,10 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): def __init__(self, n_bins='warn', *, encode='onehot', strategy='quantile', dtype=None): self.n_bins = n_bins + if self.n_bins == 'warn': + warnings.warn("The default value of n_bins will change from " + "5 to 'auto' in 0.25.", FutureWarning) + self.n_bins = 5 self.encode = encode self.strategy = strategy self.dtype = dtype @@ -155,11 +159,6 @@ def fit(self, X, y=None): ------- self """ - if self.n_bins == 'warn': - warnings.warn("The default value of n_bins will change from " - "5 to 'auto' in 0.25.", FutureWarning) - self.n_bins = 5 - X = self._validate_data(X, dtype='numeric') supported_dtype = (np.float64, np.float32) From bcb118d774974aad1547a2bbfa21069ae7de83a1 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Fri, 29 Jan 2021 13:38:15 +0300 Subject: [PATCH 18/72] Update _discretization.py --- sklearn/preprocessing/_discretization.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 75a15db35e49a..8dbadf3d3c659 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -255,8 +255,7 @@ def _validate_n_bins(self, n_features, n_samples): orig_bins = self.n_bins if isinstance(orig_bins, numbers.Number) or isinstance(orig_bins, str): if orig_bins == 'auto': - # calculcate number of bins - # depending on number of samples with Sturges rule + # calculate number of bins with Sturges rule orig_bins = int(np.ceil(np.log2(n_samples) + 1.)) if not isinstance(orig_bins, numbers.Integral): raise ValueError( From cca972cccb0131da44a6f868a3e12b418eb8610f Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Fri, 29 Jan 2021 11:30:13 +0000 Subject: [PATCH 19/72] Update test_discretization.py --- sklearn/preprocessing/tests/test_discretization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py index 95abb40e6503d..549dad86dd98d 100644 --- a/sklearn/preprocessing/tests/test_discretization.py +++ b/sklearn/preprocessing/tests/test_discretization.py @@ -52,7 +52,7 @@ def test_invalid_n_bins(): # Bad string value est = KBinsDiscretizer(n_bins='rice') err_msg = ("KBinsDiscretizer received an invalid " - "n_bins type. Received float, expected int or 'auto'.") + "n_bins type. Received str, expected int or 'auto'.") with pytest.raises(ValueError, match=err_msg): est.fit_transform(X) From 7b552e6c4a650ddad5864ba086a565780469576f Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Fri, 29 Jan 2021 11:33:59 +0000 Subject: [PATCH 20/72] Update _discretization.py --- sklearn/preprocessing/_discretization.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 8dbadf3d3c659..0f98c3429d90f 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -134,10 +134,6 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): def __init__(self, n_bins='warn', *, encode='onehot', strategy='quantile', dtype=None): self.n_bins = n_bins - if self.n_bins == 'warn': - warnings.warn("The default value of n_bins will change from " - "5 to 'auto' in 0.25.", FutureWarning) - self.n_bins = 5 self.encode = encode self.strategy = strategy self.dtype = dtype @@ -159,6 +155,9 @@ def fit(self, X, y=None): ------- self """ + if self.n_bins == 'warn': + warnings.warn("The default value of n_bins will change from " + "5 to 'auto' in 0.25.", FutureWarning) X = self._validate_data(X, dtype='numeric') supported_dtype = (np.float64, np.float32) @@ -257,6 +256,9 @@ def _validate_n_bins(self, n_features, n_samples): if orig_bins == 'auto': # calculate number of bins with Sturges rule orig_bins = int(np.ceil(np.log2(n_samples) + 1.)) + if orig_bins == 'warn: + # deprecation cycle case, should be deleted afterwards + orig_bins = 5 if not isinstance(orig_bins, numbers.Integral): raise ValueError( f"{KBinsDiscretizer.__name__} received " From 617bf90c8213b45aad7d61b0654f38be1f1dd3b1 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Fri, 29 Jan 2021 11:39:48 +0000 Subject: [PATCH 21/72] Update _discretization.py --- sklearn/preprocessing/_discretization.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 0f98c3429d90f..d910b7712c334 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -28,8 +28,7 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): Parameters ---------- - n_bins : int, array-like of integers of shape (n_features,) or 'auto', \ - default=5 + n_bins : {int, array-like (n_features,), dtype=integral, 'auto'}, default=5 The number of bins to produce. Raises ValueError if ``n_bins < 2``. For 'auto' option Sturges formula is used: bins are log(n_samples) + 1. From 9563a1cdd728970ee092aa8c4283c46de2bf9576 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Fri, 29 Jan 2021 11:40:53 +0000 Subject: [PATCH 22/72] Update _discretization.py --- sklearn/preprocessing/_discretization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index d910b7712c334..22ed9e30b0a82 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -255,7 +255,7 @@ def _validate_n_bins(self, n_features, n_samples): if orig_bins == 'auto': # calculate number of bins with Sturges rule orig_bins = int(np.ceil(np.log2(n_samples) + 1.)) - if orig_bins == 'warn: + if orig_bins == 'warn': # deprecation cycle case, should be deleted afterwards orig_bins = 5 if not isinstance(orig_bins, numbers.Integral): From 28dbbc5fd72889ea0fe59cef96250587cb575347 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Fri, 29 Jan 2021 12:38:19 +0000 Subject: [PATCH 23/72] Update _discretization.py --- sklearn/preprocessing/_discretization.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 22ed9e30b0a82..5711dbe56b812 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -129,7 +129,6 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): """ - @_deprecate_positional_args def __init__(self, n_bins='warn', *, encode='onehot', strategy='quantile', dtype=None): self.n_bins = n_bins @@ -137,6 +136,7 @@ def __init__(self, n_bins='warn', *, encode='onehot', strategy='quantile', self.strategy = strategy self.dtype = dtype + @_deprecate_positional_args def fit(self, X, y=None): """ Fit the estimator. @@ -154,9 +154,10 @@ def fit(self, X, y=None): ------- self """ - if self.n_bins == 'warn': - warnings.warn("The default value of n_bins will change from " - "5 to 'auto' in 0.25.", FutureWarning) + if isinstance(self.n_bins, str): + if self.n_bins == 'warn': + warnings.warn("The default value of n_bins will change from " + "5 to 'auto' in 0.25.", FutureWarning) X = self._validate_data(X, dtype='numeric') supported_dtype = (np.float64, np.float32) From 3e5a86dd0fa9414684844d153618ba4074fb60f8 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Fri, 29 Jan 2021 12:43:50 +0000 Subject: [PATCH 24/72] Update _discretization.py --- sklearn/preprocessing/_discretization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 5711dbe56b812..b60ba7bbb1d9b 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -154,7 +154,7 @@ def fit(self, X, y=None): ------- self """ - if isinstance(self.n_bins, str): + if isinstance(self.n_bins, str): if self.n_bins == 'warn': warnings.warn("The default value of n_bins will change from " "5 to 'auto' in 0.25.", FutureWarning) From 0d7cc149ddf24c8f529eed6d8dd37385e96246b2 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Fri, 29 Jan 2021 13:40:23 +0000 Subject: [PATCH 25/72] Update sklearn/preprocessing/_discretization.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> --- sklearn/preprocessing/_discretization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index b60ba7bbb1d9b..6dc4f960f7f40 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -32,7 +32,7 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): The number of bins to produce. Raises ValueError if ``n_bins < 2``. For 'auto' option Sturges formula is used: bins are log(n_samples) + 1. - .. versionadded:: 0.24 + .. versionadded:: 1.0 Added 'auto' option encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot' From 0c9eb1815b96d65107b7cf1c8853ddb987648cd4 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Fri, 29 Jan 2021 13:43:03 +0000 Subject: [PATCH 26/72] Update sklearn/preprocessing/_discretization.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> --- sklearn/preprocessing/_discretization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 6dc4f960f7f40..5910d9e9e469e 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -28,7 +28,7 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): Parameters ---------- - n_bins : {int, array-like (n_features,), dtype=integral, 'auto'}, default=5 + n_bins : int, 'auto' or array-like of shape (n_features,), dtype=integral, default=5 The number of bins to produce. Raises ValueError if ``n_bins < 2``. For 'auto' option Sturges formula is used: bins are log(n_samples) + 1. From 119976b176cfdebbac5ad64b320feccd27c9541c Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Fri, 29 Jan 2021 13:56:38 +0000 Subject: [PATCH 27/72] Update test_discretization.py --- sklearn/preprocessing/tests/test_discretization.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py index 549dad86dd98d..13226190925ea 100644 --- a/sklearn/preprocessing/tests/test_discretization.py +++ b/sklearn/preprocessing/tests/test_discretization.py @@ -36,6 +36,15 @@ def test_valid_n_bins(): assert KBinsDiscretizer(n_bins=2).fit(X).n_bins_.dtype == np.dtype(int) +def test_n_bins_5_deprecated(): + # FIXME: remove in 1.2 + est = KBinsDiscretizer() + depr_msg = ("The default value of n_bins will change from " + "5 to 'auto' in 1.2") + with pytest.warns(FutureWarning, match=depr_msg): + est.fit(X) + + def test_invalid_n_bins(): est = KBinsDiscretizer(n_bins=1) err_msg = ("KBinsDiscretizer received an invalid " From fdcbd299993e29e3cafe8d49d32119454724c313 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Fri, 29 Jan 2021 13:56:41 +0000 Subject: [PATCH 28/72] Update _discretization.py --- sklearn/preprocessing/_discretization.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 5910d9e9e469e..83d4824b8dad3 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -28,7 +28,8 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): Parameters ---------- - n_bins : int, 'auto' or array-like of shape (n_features,), dtype=integral, default=5 + n_bins : int, 'auto' or array-like of shape (n_features,), dtype=integral,\ + default=5 The number of bins to produce. Raises ValueError if ``n_bins < 2``. For 'auto' option Sturges formula is used: bins are log(n_samples) + 1. @@ -154,10 +155,12 @@ def fit(self, X, y=None): ------- self """ + self._n_bins = self.n_bins if isinstance(self.n_bins, str): if self.n_bins == 'warn': warnings.warn("The default value of n_bins will change from " - "5 to 'auto' in 0.25.", FutureWarning) + "5 to 'auto' in 1.2", FutureWarning) + self._n_bins = 5 X = self._validate_data(X, dtype='numeric') supported_dtype = (np.float64, np.float32) @@ -251,14 +254,11 @@ def fit(self, X, y=None): def _validate_n_bins(self, n_features, n_samples): """Returns n_bins_, the number of bins per feature. """ - orig_bins = self.n_bins + orig_bins = self._n_bins if isinstance(orig_bins, numbers.Number) or isinstance(orig_bins, str): if orig_bins == 'auto': # calculate number of bins with Sturges rule orig_bins = int(np.ceil(np.log2(n_samples) + 1.)) - if orig_bins == 'warn': - # deprecation cycle case, should be deleted afterwards - orig_bins = 5 if not isinstance(orig_bins, numbers.Integral): raise ValueError( f"{KBinsDiscretizer.__name__} received " From fe5bc1d0ca538c93464c51536d5fa759e91794c3 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Fri, 29 Jan 2021 14:53:01 +0000 Subject: [PATCH 29/72] added test for auto --- sklearn/preprocessing/tests/test_discretization.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py index 13226190925ea..39c82742f6605 100644 --- a/sklearn/preprocessing/tests/test_discretization.py +++ b/sklearn/preprocessing/tests/test_discretization.py @@ -45,6 +45,14 @@ def test_n_bins_5_deprecated(): est.fit(X) +def test_auto_bins(): + est = KBinsDiscretizer(n_bins='auto') + # for sturges rule: ceil(log2(4) + 1) + expected_bins = [3, 3, 3, 3] + est.fit(X) + assert est.n_bins_ = expected_bins + + def test_invalid_n_bins(): est = KBinsDiscretizer(n_bins=1) err_msg = ("KBinsDiscretizer received an invalid " From aede54ad2b1bc690593256a64a089bb5143a4e1b Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Fri, 29 Jan 2021 14:55:09 +0000 Subject: [PATCH 30/72] Update test_discretization.py --- sklearn/preprocessing/tests/test_discretization.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py index 39c82742f6605..61ca8d6f445aa 100644 --- a/sklearn/preprocessing/tests/test_discretization.py +++ b/sklearn/preprocessing/tests/test_discretization.py @@ -48,9 +48,10 @@ def test_n_bins_5_deprecated(): def test_auto_bins(): est = KBinsDiscretizer(n_bins='auto') # for sturges rule: ceil(log2(4) + 1) + # where 4 is n_samples in X expected_bins = [3, 3, 3, 3] est.fit(X) - assert est.n_bins_ = expected_bins + assert est.n_bins_ == expected_bins def test_invalid_n_bins(): From a0b04da9a7b45ea0f0efc2307ee8a2eeaa148544 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Fri, 29 Jan 2021 15:28:36 +0000 Subject: [PATCH 31/72] Update test_discretization.py --- sklearn/preprocessing/tests/test_discretization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py index 61ca8d6f445aa..ff406d2c30410 100644 --- a/sklearn/preprocessing/tests/test_discretization.py +++ b/sklearn/preprocessing/tests/test_discretization.py @@ -51,7 +51,7 @@ def test_auto_bins(): # where 4 is n_samples in X expected_bins = [3, 3, 3, 3] est.fit(X) - assert est.n_bins_ == expected_bins + assert np.all(est.n_bins_ == expected_bins) def test_invalid_n_bins(): From ca735de561dd2a39e9d12c4fb994b3269cb9259f Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Sun, 31 Jan 2021 15:36:44 +0000 Subject: [PATCH 32/72] Update sklearn/preprocessing/_discretization.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> --- sklearn/preprocessing/_discretization.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 83d4824b8dad3..40e9af8c2a893 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -156,8 +156,7 @@ def fit(self, X, y=None): self """ self._n_bins = self.n_bins - if isinstance(self.n_bins, str): - if self.n_bins == 'warn': + if isinstance(self.n_bins, str) and self.n_bins == 'warn': warnings.warn("The default value of n_bins will change from " "5 to 'auto' in 1.2", FutureWarning) self._n_bins = 5 From 08f5467343d887b42e233626e407ebcf000502c3 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Sun, 31 Jan 2021 15:37:34 +0000 Subject: [PATCH 33/72] Update sklearn/preprocessing/tests/test_discretization.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> --- sklearn/preprocessing/tests/test_discretization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py index ff406d2c30410..ad67f5943e18f 100644 --- a/sklearn/preprocessing/tests/test_discretization.py +++ b/sklearn/preprocessing/tests/test_discretization.py @@ -47,7 +47,7 @@ def test_n_bins_5_deprecated(): def test_auto_bins(): est = KBinsDiscretizer(n_bins='auto') - # for sturges rule: ceil(log2(4) + 1) + # for sturges rule: ceil(log2(4) + 1) = 3 # where 4 is n_samples in X expected_bins = [3, 3, 3, 3] est.fit(X) From eba8dbb07a72f8baf7d7a1bad26536dcaadf5555 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Sun, 31 Jan 2021 15:56:10 +0000 Subject: [PATCH 34/72] Update test_discretization.py --- sklearn/preprocessing/tests/test_discretization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py index ad67f5943e18f..1ab40c544743f 100644 --- a/sklearn/preprocessing/tests/test_discretization.py +++ b/sklearn/preprocessing/tests/test_discretization.py @@ -47,7 +47,7 @@ def test_n_bins_5_deprecated(): def test_auto_bins(): est = KBinsDiscretizer(n_bins='auto') - # for sturges rule: ceil(log2(4) + 1) = 3 + # for Sturges rule: ceil(log2(4) + 1) = 3 # where 4 is n_samples in X expected_bins = [3, 3, 3, 3] est.fit(X) From 1eb2322a32d1c2024842767d7c5e4f30b4585300 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Sun, 31 Jan 2021 15:56:17 +0000 Subject: [PATCH 35/72] Update _discretization.py --- sklearn/preprocessing/_discretization.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 40e9af8c2a893..a8c4df3ada7a8 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -130,6 +130,7 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): """ + @_deprecate_positional_args def __init__(self, n_bins='warn', *, encode='onehot', strategy='quantile', dtype=None): self.n_bins = n_bins @@ -137,7 +138,7 @@ def __init__(self, n_bins='warn', *, encode='onehot', strategy='quantile', self.strategy = strategy self.dtype = dtype - @_deprecate_positional_args + def fit(self, X, y=None): """ Fit the estimator. @@ -186,8 +187,7 @@ def fit(self, X, y=None): f"Got strategy={self.strategy!r} instead." ) - n_features = X.shape[1] - n_samples = X.shape[0] + n_samples, n_features = X.shape n_bins = self._validate_n_bins(n_features, n_samples) bin_edges = np.zeros(n_features, dtype=object) @@ -254,10 +254,10 @@ def _validate_n_bins(self, n_features, n_samples): """Returns n_bins_, the number of bins per feature. """ orig_bins = self._n_bins - if isinstance(orig_bins, numbers.Number) or isinstance(orig_bins, str): - if orig_bins == 'auto': - # calculate number of bins with Sturges rule - orig_bins = int(np.ceil(np.log2(n_samples) + 1.)) + if isinstance(orig_bins, str) and (orig_bins == 'auto'): + # calculate number of bins with Sturges rule + orig_bins = int(np.ceil(np.log2(n_samples) + 1.)) + if isinstance(orig_bins, numbers.Number) if not isinstance(orig_bins, numbers.Integral): raise ValueError( f"{KBinsDiscretizer.__name__} received " From d623a3c3b99f18339fe23756a94d669af1b384a3 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Sun, 31 Jan 2021 15:56:49 +0000 Subject: [PATCH 36/72] Update test_docstring_parameters.py --- sklearn/tests/test_docstring_parameters.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 1756e0e4a65a6..2c4a18d2fa0e4 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -205,6 +205,9 @@ def test_fit_docstring_attributes(name, Estimator): if Estimator.__name__ == 'SelectKBest': est.k = 2 + + if Estimator.__name__ == 'KBinsDiscretizer': + est.n_bins = 'auto' if Estimator.__name__ == 'DummyClassifier': est.strategy = "stratified" From 78cdde1190099a671c4178f3b90b51a4cd60b701 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Sun, 31 Jan 2021 15:56:51 +0000 Subject: [PATCH 37/72] Update test_common.py --- sklearn/tests/test_common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index b900f94231419..5e4659e5240ca 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -294,5 +294,7 @@ def test_search_cv(estimator, check, request): @pytest.mark.parametrize("estimator", N_FEATURES_IN_AFTER_FIT_ESTIMATORS, ids=_get_check_estimator_ids) def test_check_n_features_in_after_fitting(estimator): + if estimator.__name__ == 'KBinsDiscretizer': + estimator.n_bins = 'auto' _set_checking_parameters(estimator) check_n_features_in_after_fitting(estimator.__class__.__name__, estimator) From 396d4a862e7778258c002a3fe3c8b58a945b69bf Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Sun, 31 Jan 2021 16:07:37 +0000 Subject: [PATCH 38/72] Update v1.0.rst --- doc/whats_new/v1.0.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index a00523ec2223b..24c7db0779575 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -135,6 +135,15 @@ Changelog polynomial ``degree`` of the splines, number of knots ``n_knots`` and knot positioning strategy ``knots``. :pr:`18368` by :user:`Christian Lorentzen `. +- |Enhancement| Added new argument `auto` for `n_bins` parameter in + :class:`preprocessing.KBinsDiscretizer` for calculating number of bins + via Sturges rule. + :pr:`9337` by :user:`Gleb Levitskiy `, :user:`Joel Nothman ` + and :user:`Jérémie du Boisberranger`. +- |Efficiency| Changed `algorithm` argument for :class:`cluster.KMeans` in + :class:`preprocessing.KBinsDiscretizer` from `auto` to `full`. + :pr:`19256` by :user:`Gleb Levitskiy `, :user:`Joel Nothman ` + and :user:`Jérémie du Boisberranger`. :mod:`sklearn.tree` ................... From f577a03146ff49c27618c0303fa7849a12fddeba Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Sun, 31 Jan 2021 16:08:19 +0000 Subject: [PATCH 39/72] Update v1.0.rst --- doc/whats_new/v1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 24c7db0779575..373a9c4b6a1ba 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -143,7 +143,7 @@ Changelog - |Efficiency| Changed `algorithm` argument for :class:`cluster.KMeans` in :class:`preprocessing.KBinsDiscretizer` from `auto` to `full`. :pr:`19256` by :user:`Gleb Levitskiy `, :user:`Joel Nothman ` - and :user:`Jérémie du Boisberranger`. + and :user:`Jérémie du Boisberranger `. :mod:`sklearn.tree` ................... From 16abf9e3189058229068c40bf42ba51b79f1058a Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Sun, 31 Jan 2021 16:09:45 +0000 Subject: [PATCH 40/72] Update _discretization.py --- sklearn/preprocessing/_discretization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index a8c4df3ada7a8..a299312e03ea8 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -257,7 +257,7 @@ def _validate_n_bins(self, n_features, n_samples): if isinstance(orig_bins, str) and (orig_bins == 'auto'): # calculate number of bins with Sturges rule orig_bins = int(np.ceil(np.log2(n_samples) + 1.)) - if isinstance(orig_bins, numbers.Number) + if isinstance(orig_bins, numbers.Number): if not isinstance(orig_bins, numbers.Integral): raise ValueError( f"{KBinsDiscretizer.__name__} received " From 94f9c87c92e3378629c650acf9cabd1a5f92be16 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Sun, 31 Jan 2021 16:11:34 +0000 Subject: [PATCH 41/72] Update _discretization.py --- sklearn/preprocessing/_discretization.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index a299312e03ea8..2cdbbc8119401 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -158,9 +158,9 @@ def fit(self, X, y=None): """ self._n_bins = self.n_bins if isinstance(self.n_bins, str) and self.n_bins == 'warn': - warnings.warn("The default value of n_bins will change from " - "5 to 'auto' in 1.2", FutureWarning) - self._n_bins = 5 + warnings.warn("The default value of n_bins will change from " + "5 to 'auto' in 1.2", FutureWarning) + self._n_bins = 5 X = self._validate_data(X, dtype='numeric') supported_dtype = (np.float64, np.float32) From f499549812533dabd25f702f026dc2e147530ff4 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Sun, 31 Jan 2021 16:13:29 +0000 Subject: [PATCH 42/72] Update test_docstring_parameters.py --- sklearn/tests/test_docstring_parameters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 2c4a18d2fa0e4..5cb585fe9e950 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -205,7 +205,7 @@ def test_fit_docstring_attributes(name, Estimator): if Estimator.__name__ == 'SelectKBest': est.k = 2 - + if Estimator.__name__ == 'KBinsDiscretizer': est.n_bins = 'auto' From e266fadc7b6b3544107ddf77cd4a013553ebd1cb Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Sun, 31 Jan 2021 16:18:23 +0000 Subject: [PATCH 43/72] Update v1.0.rst --- doc/whats_new/v1.0.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 373a9c4b6a1ba..c7b6092a6ed76 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -135,13 +135,13 @@ Changelog polynomial ``degree`` of the splines, number of knots ``n_knots`` and knot positioning strategy ``knots``. :pr:`18368` by :user:`Christian Lorentzen `. -- |Enhancement| Added new argument `auto` for `n_bins` parameter in +- |Enhancement| Added new argument ``auto`` for ``n_bins`` parameter in :class:`preprocessing.KBinsDiscretizer` for calculating number of bins via Sturges rule. :pr:`9337` by :user:`Gleb Levitskiy `, :user:`Joel Nothman ` and :user:`Jérémie du Boisberranger`. -- |Efficiency| Changed `algorithm` argument for :class:`cluster.KMeans` in - :class:`preprocessing.KBinsDiscretizer` from `auto` to `full`. +- |Efficiency| Changed ``algorithm`` argument for :class:`cluster.KMeans` in + :class:`preprocessing.KBinsDiscretizer` from ``auto`` to ``full``. :pr:`19256` by :user:`Gleb Levitskiy `, :user:`Joel Nothman ` and :user:`Jérémie du Boisberranger `. From cb72479e078c5fc08723a3f7aae87e4459e6e156 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Sun, 31 Jan 2021 16:18:49 +0000 Subject: [PATCH 44/72] Update v1.0.rst --- doc/whats_new/v1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index c7b6092a6ed76..bf0a2d629d8db 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -139,7 +139,7 @@ Changelog :class:`preprocessing.KBinsDiscretizer` for calculating number of bins via Sturges rule. :pr:`9337` by :user:`Gleb Levitskiy `, :user:`Joel Nothman ` - and :user:`Jérémie du Boisberranger`. + and :user:`Jérémie du Boisberranger `. - |Efficiency| Changed ``algorithm`` argument for :class:`cluster.KMeans` in :class:`preprocessing.KBinsDiscretizer` from ``auto`` to ``full``. :pr:`19256` by :user:`Gleb Levitskiy `, :user:`Joel Nothman ` From 66a46828cb71eccee95e5787fa1a608ca349b6e4 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Sun, 31 Jan 2021 16:21:01 +0000 Subject: [PATCH 45/72] Update v1.0.rst --- doc/whats_new/v1.0.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index bf0a2d629d8db..1f674f0bb8741 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -138,12 +138,10 @@ Changelog - |Enhancement| Added new argument ``auto`` for ``n_bins`` parameter in :class:`preprocessing.KBinsDiscretizer` for calculating number of bins via Sturges rule. - :pr:`9337` by :user:`Gleb Levitskiy `, :user:`Joel Nothman ` - and :user:`Jérémie du Boisberranger `. + :pr:`9337` by :user:`Joel Nothman `. - |Efficiency| Changed ``algorithm`` argument for :class:`cluster.KMeans` in :class:`preprocessing.KBinsDiscretizer` from ``auto`` to ``full``. - :pr:`19256` by :user:`Gleb Levitskiy `, :user:`Joel Nothman ` - and :user:`Jérémie du Boisberranger `. + :pr:`19256` by :user:`Gleb Levitskiy `. :mod:`sklearn.tree` ................... From 840c77a67a8e7c60b835b4ca6a457d9f9c55a07b Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Sun, 31 Jan 2021 17:08:20 +0000 Subject: [PATCH 46/72] Update _discretization.py --- sklearn/preprocessing/_discretization.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 2cdbbc8119401..7f9e055b62479 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -253,11 +253,11 @@ def fit(self, X, y=None): def _validate_n_bins(self, n_features, n_samples): """Returns n_bins_, the number of bins per feature. """ - orig_bins = self._n_bins - if isinstance(orig_bins, str) and (orig_bins == 'auto'): - # calculate number of bins with Sturges rule - orig_bins = int(np.ceil(np.log2(n_samples) + 1.)) - if isinstance(orig_bins, numbers.Number): + orig_bins = self._n_bins + if isinstance(orig_bins, numbers.Number) or isinstance(orig_bins, str): + if orig_bins == 'auto': + # calculate number of bins with Sturges rule + orig_bins = int(np.ceil(np.log2(n_samples) + 1.)) if not isinstance(orig_bins, numbers.Integral): raise ValueError( f"{KBinsDiscretizer.__name__} received " From 9cdb920486406034149e9a551c057dac5b96ba24 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Sun, 31 Jan 2021 17:09:28 +0000 Subject: [PATCH 47/72] Update test_common.py --- sklearn/tests/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 5e4659e5240ca..47705ff0bf5e5 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -294,7 +294,7 @@ def test_search_cv(estimator, check, request): @pytest.mark.parametrize("estimator", N_FEATURES_IN_AFTER_FIT_ESTIMATORS, ids=_get_check_estimator_ids) def test_check_n_features_in_after_fitting(estimator): - if estimator.__name__ == 'KBinsDiscretizer': + if estimator.__class__.__name__ == 'KBinsDiscretizer': estimator.n_bins = 'auto' _set_checking_parameters(estimator) check_n_features_in_after_fitting(estimator.__class__.__name__, estimator) From ff127f5b59fdf5ceb2fc08aa71c3e596c859e8d3 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Sun, 31 Jan 2021 17:12:43 +0000 Subject: [PATCH 48/72] Update _discretization.py --- sklearn/preprocessing/_discretization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 7f9e055b62479..7ee2d45b6968a 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -253,7 +253,7 @@ def fit(self, X, y=None): def _validate_n_bins(self, n_features, n_samples): """Returns n_bins_, the number of bins per feature. """ - orig_bins = self._n_bins + orig_bins = self._n_bins if isinstance(orig_bins, numbers.Number) or isinstance(orig_bins, str): if orig_bins == 'auto': # calculate number of bins with Sturges rule From 641d58dfe1b73002a33ae42cf7d8461abaf459ae Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Sun, 31 Jan 2021 18:14:45 +0000 Subject: [PATCH 49/72] Update test_docstring_parameters.py --- sklearn/tests/test_docstring_parameters.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 5cb585fe9e950..3d0cfc5151193 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -207,6 +207,7 @@ def test_fit_docstring_attributes(name, Estimator): est.k = 2 if Estimator.__name__ == 'KBinsDiscretizer': + # FIX ME: remove in 1.2 est.n_bins = 'auto' if Estimator.__name__ == 'DummyClassifier': From 3e82bee9a3f2c39ff25b014141a8ae34a5412722 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Sun, 31 Jan 2021 18:14:55 +0000 Subject: [PATCH 50/72] Update v1.0.rst --- doc/whats_new/v1.0.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 1f674f0bb8741..2cec2c6d8d9df 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -135,13 +135,15 @@ Changelog polynomial ``degree`` of the splines, number of knots ``n_knots`` and knot positioning strategy ``knots``. :pr:`18368` by :user:`Christian Lorentzen `. + - |Enhancement| Added new argument ``auto`` for ``n_bins`` parameter in :class:`preprocessing.KBinsDiscretizer` for calculating number of bins via Sturges rule. - :pr:`9337` by :user:`Joel Nothman `. + :pr:`19290` by :user:`Gleb Levitskiy `. + - |Efficiency| Changed ``algorithm`` argument for :class:`cluster.KMeans` in :class:`preprocessing.KBinsDiscretizer` from ``auto`` to ``full``. - :pr:`19256` by :user:`Gleb Levitskiy `. + :pr:`19290` by :user:`Gleb Levitskiy `. :mod:`sklearn.tree` ................... From 71520f391cdda19fbd4d4aab880e0e0fe3220eda Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Sun, 31 Jan 2021 18:16:01 +0000 Subject: [PATCH 51/72] Update test_common.py --- sklearn/tests/test_common.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 47705ff0bf5e5..b900f94231419 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -294,7 +294,5 @@ def test_search_cv(estimator, check, request): @pytest.mark.parametrize("estimator", N_FEATURES_IN_AFTER_FIT_ESTIMATORS, ids=_get_check_estimator_ids) def test_check_n_features_in_after_fitting(estimator): - if estimator.__class__.__name__ == 'KBinsDiscretizer': - estimator.n_bins = 'auto' _set_checking_parameters(estimator) check_n_features_in_after_fitting(estimator.__class__.__name__, estimator) From 76c68a7114712addf00c340f06811b31f227b30c Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Sun, 31 Jan 2021 18:20:44 +0000 Subject: [PATCH 52/72] Update estimator_checks.py --- sklearn/utils/estimator_checks.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 849d8a1f3921b..5377851f75892 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -613,6 +613,10 @@ def _set_checking_parameters(estimator): if name == "TheilSenRegressor": estimator.max_subpopulation = 100 + if name == 'KBinsDiscretizer': + # FIX ME: remove in 1.2 + estimator.n_bins = 'auto' + if isinstance(estimator, BaseRandomProjection): # Due to the jl lemma and often very few samples, the number # of components of the random matrix projection will be probably From 4e01fd5670dbe2236997c5639a7b5ef5211565d0 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Sun, 31 Jan 2021 18:26:09 +0000 Subject: [PATCH 53/72] Update estimator_checks.py --- sklearn/utils/estimator_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 5377851f75892..0895f9a97cc85 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -615,7 +615,7 @@ def _set_checking_parameters(estimator): if name == 'KBinsDiscretizer': # FIX ME: remove in 1.2 - estimator.n_bins = 'auto' + estimator.n_bins = 'auto' if isinstance(estimator, BaseRandomProjection): # Due to the jl lemma and often very few samples, the number From 71bb2e5dabac6eac0b72ea3ff910e9577953bcb4 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Sun, 31 Jan 2021 18:26:19 +0000 Subject: [PATCH 54/72] Update _discretization.py --- sklearn/preprocessing/_discretization.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 7ee2d45b6968a..1045f845ff603 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -254,15 +254,22 @@ def _validate_n_bins(self, n_features, n_samples): """Returns n_bins_, the number of bins per feature. """ orig_bins = self._n_bins - if isinstance(orig_bins, numbers.Number) or isinstance(orig_bins, str): + if isinstance(orig_bins, str): if orig_bins == 'auto': # calculate number of bins with Sturges rule orig_bins = int(np.ceil(np.log2(n_samples) + 1.)) + else: + raise ValueError( + f"{KBinsDiscretizer.__name__} received " + f"an invalid n_bins value. Received " + f"{orig_bins}, while only 'auto' is supported." + ) + if isinstance(orig_bins, numbers.Number): if not isinstance(orig_bins, numbers.Integral): raise ValueError( f"{KBinsDiscretizer.__name__} received " f"an invalid n_bins type. Received " - f"{type(orig_bins).__name__}, expected int or 'auto'." + f"{type(orig_bins).__name__}, expected int." ) if orig_bins < 2: raise ValueError( From 28ae05e985f7070a71458b3aeec0e9a647cb3768 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Sun, 31 Jan 2021 18:26:21 +0000 Subject: [PATCH 55/72] Update test_discretization.py --- sklearn/preprocessing/tests/test_discretization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py index 1ab40c544743f..cfb74e95afbb6 100644 --- a/sklearn/preprocessing/tests/test_discretization.py +++ b/sklearn/preprocessing/tests/test_discretization.py @@ -63,14 +63,14 @@ def test_invalid_n_bins(): est = KBinsDiscretizer(n_bins=1.1) err_msg = ("KBinsDiscretizer received an invalid " - "n_bins type. Received float, expected int or 'auto'.") + "n_bins type. Received float, expected int.") with pytest.raises(ValueError, match=err_msg): est.fit_transform(X) # Bad string value est = KBinsDiscretizer(n_bins='rice') err_msg = ("KBinsDiscretizer received an invalid " - "n_bins type. Received str, expected int or 'auto'.") + "n_bins value. Received rice, while only 'auto' is supported.") with pytest.raises(ValueError, match=err_msg): est.fit_transform(X) From 835a5146081eeb368ae0c9ff69a8ca2d95a9ad5e Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Sun, 31 Jan 2021 18:33:01 +0000 Subject: [PATCH 56/72] Update estimator_checks.py --- sklearn/utils/estimator_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 0895f9a97cc85..55c047faecbcc 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -615,7 +615,7 @@ def _set_checking_parameters(estimator): if name == 'KBinsDiscretizer': # FIX ME: remove in 1.2 - estimator.n_bins = 'auto' + estimator.n_bins = 'auto' if isinstance(estimator, BaseRandomProjection): # Due to the jl lemma and often very few samples, the number From 6ff5ee64b50f17944d7c4ff391c4664194e65b39 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Sun, 31 Jan 2021 19:31:25 +0000 Subject: [PATCH 57/72] Update _discretization.py --- sklearn/preprocessing/_discretization.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 1045f845ff603..26505291f7da6 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -161,6 +161,7 @@ def fit(self, X, y=None): warnings.warn("The default value of n_bins will change from " "5 to 'auto' in 1.2", FutureWarning) self._n_bins = 5 + X = self._validate_data(X, dtype='numeric') supported_dtype = (np.float64, np.float32) From 01ad4debe496bbba9291cd886ddc4cdafbac5304 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Sun, 31 Jan 2021 20:02:54 +0000 Subject: [PATCH 58/72] Update _discretization.py --- sklearn/preprocessing/_discretization.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 26505291f7da6..ace82dd2fedea 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -258,7 +258,8 @@ def _validate_n_bins(self, n_features, n_samples): if isinstance(orig_bins, str): if orig_bins == 'auto': # calculate number of bins with Sturges rule - orig_bins = int(np.ceil(np.log2(n_samples) + 1.)) + orig_bins = np.maximum(int(np.ceil(np.log2(n_samples) + 1.)), + 2) else: raise ValueError( f"{KBinsDiscretizer.__name__} received " From fce49a542eff007ede622ac537b42440f3085931 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Mon, 1 Feb 2021 11:33:32 +0000 Subject: [PATCH 59/72] Update _discretization.py --- sklearn/preprocessing/_discretization.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index ace82dd2fedea..2e1cd8512c91e 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -138,7 +138,6 @@ def __init__(self, n_bins='warn', *, encode='onehot', strategy='quantile', self.strategy = strategy self.dtype = dtype - def fit(self, X, y=None): """ Fit the estimator. @@ -258,8 +257,7 @@ def _validate_n_bins(self, n_features, n_samples): if isinstance(orig_bins, str): if orig_bins == 'auto': # calculate number of bins with Sturges rule - orig_bins = np.maximum(int(np.ceil(np.log2(n_samples) + 1.)), - 2) + orig_bins = max(int(np.ceil(np.log2(n_samples) + 1.)), 2) else: raise ValueError( f"{KBinsDiscretizer.__name__} received " From 230307578ef6a39d283ccbaf50c67cb2454de66c Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Tue, 2 Feb 2021 16:02:56 +0300 Subject: [PATCH 60/72] Revert "DOC Add URL to reference of Minka paper used in PCA (#19207)" This reverts commit ca7fc5dd4f31a3b2eee5352e9ce615d2aa104b4d. --- sklearn/decomposition/_pca.py | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index ac4a1d1d9816b..80ac7e856dfd0 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -32,8 +32,7 @@ def _assess_dimension(spectrum, rank, n_samples): """Compute the log-likelihood of a rank ``rank`` dataset. The dataset is assumed to be embedded in gaussian noise of shape(n, - dimf) having spectrum ``spectrum``. This implements the method of - T. P. Minka. + dimf) having spectrum ``spectrum``. Parameters ---------- @@ -51,11 +50,10 @@ def _assess_dimension(spectrum, rank, n_samples): ll : float The log-likelihood. - References - ---------- + Notes + ----- This implements the method of `Thomas P. Minka: - Automatic Choice of Dimensionality for PCA. NIPS 2000: 598-604 - `_ + Automatic Choice of Dimensionality for PCA. NIPS 2000: 598-604` """ n_features = spectrum.shape[0] @@ -273,30 +271,26 @@ class PCA(_BasePCA): References ---------- - For n_components == 'mle', this class uses the method from: - `Minka, T. P.. "Automatic choice of dimensionality for PCA". - In NIPS, pp. 598-604 `_ + For n_components == 'mle', this class uses the method of *Minka, T. P. + "Automatic choice of dimensionality for PCA". In NIPS, pp. 598-604* Implements the probabilistic PCA model from: - `Tipping, M. E., and Bishop, C. M. (1999). "Probabilistic principal + Tipping, M. E., and Bishop, C. M. (1999). "Probabilistic principal component analysis". Journal of the Royal Statistical Society: Series B (Statistical Methodology), 61(3), 611-622. - `_ via the score and score_samples methods. + See http://www.miketipping.com/papers/met-mppca.pdf For svd_solver == 'arpack', refer to `scipy.sparse.linalg.svds`. For svd_solver == 'randomized', see: - `Halko, N., Martinsson, P. G., and Tropp, J. A. (2011). + *Halko, N., Martinsson, P. G., and Tropp, J. A. (2011). "Finding structure with randomness: Probabilistic algorithms for constructing approximate matrix decompositions". - SIAM review, 53(2), 217-288. - `_ - and also - `Martinsson, P. G., Rokhlin, V., and Tygert, M. (2011). + SIAM review, 53(2), 217-288.* and also + *Martinsson, P. G., Rokhlin, V., and Tygert, M. (2011). "A randomized algorithm for the decomposition of matrices". - Applied and Computational Harmonic Analysis, 30(1), 47-68 - `_. + Applied and Computational Harmonic Analysis, 30(1), 47-68.* Examples -------- From 4f99c48a2378a140b0ceb20ccbd7739dd3717f13 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Tue, 2 Feb 2021 16:03:01 +0300 Subject: [PATCH 61/72] Revert "DOC update Keras description in related projects (#19265)" This reverts commit 315463f3d20494581b440cd9c26d48c580419edc. --- doc/related_projects.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/related_projects.rst b/doc/related_projects.rst index 38d8bc555638e..8496b2b9b1df0 100644 --- a/doc/related_projects.rst +++ b/doc/related_projects.rst @@ -148,8 +148,8 @@ and tasks. - `nolearn `_ A number of wrappers and abstractions around existing neural network libraries -- `Keras `_ High-level API for - TensorFlow with a scikit-learn inspired API. +- `keras `_ Deep Learning library capable of + running on top of either TensorFlow or Theano. - `lasagne `_ A lightweight library to build and train neural networks in Theano. From cdee3573072c680eaa661c27232336cde163a7f5 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Tue, 2 Feb 2021 16:03:08 +0300 Subject: [PATCH 62/72] Revert "CLN Removes duplicated or unneeded code in ColumnTransformer (#19261)" This reverts commit 8965abb264aaf70d11d9f56d2947bcc0b5ddaf75. --- sklearn/compose/_column_transformer.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 6693c9896c87a..553e8c3afa263 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -19,6 +19,7 @@ from ..utils import Bunch from ..utils import _safe_indexing from ..utils import _get_column_indices +from ..utils import _determine_key_type from ..utils.metaestimators import _BaseComposition from ..utils.validation import check_array, check_is_fitted from ..utils.validation import _deprecate_positional_args @@ -319,6 +320,12 @@ def _validate_remainder(self, X): "'passthrough', or estimator. '%s' was passed instead" % self.remainder) + # Make it possible to check for reordered named columns on transform + self._has_str_cols = any(_determine_key_type(cols) == 'str' + for cols in self._columns) + if hasattr(X, 'columns'): + self._df_columns = X.columns + self._n_features = X.shape[1] cols = [] for columns in self._columns: @@ -355,12 +362,12 @@ def get_feature_names(self): hasattr(column, '__len__') and not len(column)): continue if trans == 'passthrough': - if self._feature_names_in is not None: + if hasattr(self, '_df_columns'): if ((not isinstance(column, slice)) and all(isinstance(col, str) for col in column)): feature_names.extend(column) else: - feature_names.extend(self._feature_names_in[column]) + feature_names.extend(self._df_columns[column]) else: indices = np.arange(self._n_features) feature_names.extend(['x%d' % i for i in indices[column]]) @@ -434,7 +441,7 @@ def _fit_transform(self, X, y, func, fitted=False): message_clsname='ColumnTransformer', message=self._log_message(name, idx, len(transformers))) for idx, (name, trans, column, weight) in enumerate( - transformers, 1)) + self._iter(fitted=fitted, replace_strings=True), 1)) except ValueError as e: if "Expected 2D array, got 1D array instead" in str(e): raise ValueError(_ERR_MSG_1DCOLUMN) from e @@ -599,9 +606,9 @@ def _sk_visual_block_(self): transformers = self.transformers elif hasattr(self, "_remainder"): remainder_columns = self._remainder[2] - if self._feature_names_in is not None: + if hasattr(self, '_df_columns'): remainder_columns = ( - self._feature_names_in[remainder_columns].tolist() + self._df_columns[remainder_columns].tolist() ) transformers = chain(self.transformers, [('remainder', self.remainder, From 15ba4125b87a8c211eb6fc5ff7f84e86e8b39102 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Mon, 8 Feb 2021 11:28:08 +0000 Subject: [PATCH 63/72] Revert "Kbd changes" --- doc/whats_new/v1.0.rst | 9 -- sklearn/preprocessing/_discretization.py | 93 +++++++------------ .../tests/test_discretization.py | 25 ----- sklearn/tests/test_docstring_parameters.py | 4 - sklearn/utils/estimator_checks.py | 4 - 5 files changed, 32 insertions(+), 103 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 2cec2c6d8d9df..a00523ec2223b 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -136,15 +136,6 @@ Changelog positioning strategy ``knots``. :pr:`18368` by :user:`Christian Lorentzen `. -- |Enhancement| Added new argument ``auto`` for ``n_bins`` parameter in - :class:`preprocessing.KBinsDiscretizer` for calculating number of bins - via Sturges rule. - :pr:`19290` by :user:`Gleb Levitskiy `. - -- |Efficiency| Changed ``algorithm`` argument for :class:`cluster.KMeans` in - :class:`preprocessing.KBinsDiscretizer` from ``auto`` to ``full``. - :pr:`19290` by :user:`Gleb Levitskiy `. - :mod:`sklearn.tree` ................... diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 2e1cd8512c91e..22fa236f3314e 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -28,13 +28,8 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): Parameters ---------- - n_bins : int, 'auto' or array-like of shape (n_features,), dtype=integral,\ - default=5 + n_bins : int or array-like of shape (n_features,), default=5 The number of bins to produce. Raises ValueError if ``n_bins < 2``. - For 'auto' option Sturges formula is used: bins are log(n_samples) + 1. - - .. versionadded:: 1.0 - Added 'auto' option encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot' Method used to encode the transformed result. @@ -131,7 +126,7 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): """ @_deprecate_positional_args - def __init__(self, n_bins='warn', *, encode='onehot', strategy='quantile', + def __init__(self, n_bins=5, *, encode='onehot', strategy='quantile', dtype=None): self.n_bins = n_bins self.encode = encode @@ -155,12 +150,6 @@ def fit(self, X, y=None): ------- self """ - self._n_bins = self.n_bins - if isinstance(self.n_bins, str) and self.n_bins == 'warn': - warnings.warn("The default value of n_bins will change from " - "5 to 'auto' in 1.2", FutureWarning) - self._n_bins = 5 - X = self._validate_data(X, dtype='numeric') supported_dtype = (np.float64, np.float32) @@ -174,21 +163,20 @@ def fit(self, X, y=None): f"{supported_dtype + (None,)}. Got dtype={self.dtype} " f" instead." ) + valid_encode = ('onehot', 'onehot-dense', 'ordinal') if self.encode not in valid_encode: - raise ValueError( - f"Valid options for 'encode' are {valid_encode}. " - f"Got encode={self.encode!r} instead." - ) + raise ValueError("Valid options for 'encode' are {}. " + "Got encode={!r} instead." + .format(valid_encode, self.encode)) valid_strategy = ('uniform', 'quantile', 'kmeans') if self.strategy not in valid_strategy: - raise ValueError( - f"Valid options for 'strategy' are {valid_strategy}. " - f"Got strategy={self.strategy!r} instead." - ) + raise ValueError("Valid options for 'strategy' are {}. " + "Got strategy={!r} instead." + .format(valid_strategy, self.strategy)) - n_samples, n_features = X.shape - n_bins = self._validate_n_bins(n_features, n_samples) + n_features = X.shape[1] + n_bins = self._validate_n_bins(n_features) bin_edges = np.zeros(n_features, dtype=object) for jj in range(n_features): @@ -196,8 +184,8 @@ def fit(self, X, y=None): col_min, col_max = column.min(), column.max() if col_min == col_max: - warnings.warn(f"Feature {jj} is constant and will be " - f"replaced with 0.") + warnings.warn("Feature %d is constant and will be " + "replaced with 0." % jj) n_bins[jj] = 1 bin_edges[jj] = np.array([-np.inf, np.inf]) continue @@ -217,8 +205,7 @@ def fit(self, X, y=None): init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5 # 1D k-means procedure - km = KMeans(n_clusters=n_bins[jj], init=init, - n_init=1, algorithm='full') + km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1) centers = km.fit(column[:, None]).cluster_centers_[:, 0] # Must sort, centers may be unsorted even with sorted init centers.sort() @@ -230,10 +217,9 @@ def fit(self, X, y=None): mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8 bin_edges[jj] = bin_edges[jj][mask] if len(bin_edges[jj]) - 1 != n_bins[jj]: - warnings.warn(f"Bins whose width are too small " - f"(i.e., <= 1e-8) in feature {jj} " - f"are removed. Consider decreasing " - f"the number of bins.") + warnings.warn('Bins whose width are too small (i.e., <= ' + '1e-8) in feature %d are removed. Consider ' + 'decreasing the number of bins.' % jj) n_bins[jj] = len(bin_edges[jj]) - 1 self.bin_edges_ = bin_edges @@ -250,32 +236,20 @@ def fit(self, X, y=None): return self - def _validate_n_bins(self, n_features, n_samples): + def _validate_n_bins(self, n_features): """Returns n_bins_, the number of bins per feature. """ - orig_bins = self._n_bins - if isinstance(orig_bins, str): - if orig_bins == 'auto': - # calculate number of bins with Sturges rule - orig_bins = max(int(np.ceil(np.log2(n_samples) + 1.)), 2) - else: - raise ValueError( - f"{KBinsDiscretizer.__name__} received " - f"an invalid n_bins value. Received " - f"{orig_bins}, while only 'auto' is supported." - ) + orig_bins = self.n_bins if isinstance(orig_bins, numbers.Number): if not isinstance(orig_bins, numbers.Integral): - raise ValueError( - f"{KBinsDiscretizer.__name__} received " - f"an invalid n_bins type. Received " - f"{type(orig_bins).__name__}, expected int." - ) + raise ValueError("{} received an invalid n_bins type. " + "Received {}, expected int." + .format(KBinsDiscretizer.__name__, + type(orig_bins).__name__)) if orig_bins < 2: - raise ValueError( - f"{KBinsDiscretizer.__name__} received an invalid number " - f"of bins. Received {orig_bins}, expected at least 2." - ) + raise ValueError("{} received an invalid number " + "of bins. Received {}, expected at least 2." + .format(KBinsDiscretizer.__name__, orig_bins)) return np.full(n_features, orig_bins, dtype=int) n_bins = check_array(orig_bins, dtype=int, copy=True, @@ -290,11 +264,10 @@ def _validate_n_bins(self, n_features, n_samples): violating_indices = np.where(bad_nbins_value)[0] if violating_indices.shape[0] > 0: indices = ", ".join(str(i) for i in violating_indices) - raise ValueError( - f"{KBinsDiscretizer.__name__} received an invalid number " - f"of bins at indices {indices}. Number of bins " - f"must be at least 2, and must be an int." - ) + raise ValueError("{} received an invalid number " + "of bins at indices {}. Number of bins " + "must be at least 2, and must be an int." + .format(KBinsDiscretizer.__name__, indices)) return n_bins def transform(self, X): @@ -369,10 +342,8 @@ def inverse_transform(self, Xt): Xinv = check_array(Xt, copy=True, dtype=(np.float64, np.float32)) n_features = self.n_bins_.shape[0] if Xinv.shape[1] != n_features: - raise ValueError( - f"Incorrect number of features. Expecting {n_features}, " - f"received {Xinv.shape[1]}." - ) + raise ValueError("Incorrect number of features. Expecting {}, " + "received {}.".format(n_features, Xinv.shape[1])) for jj in range(n_features): bin_edges = self.bin_edges_[jj] diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py index cfb74e95afbb6..9d607c82d5831 100644 --- a/sklearn/preprocessing/tests/test_discretization.py +++ b/sklearn/preprocessing/tests/test_discretization.py @@ -36,24 +36,6 @@ def test_valid_n_bins(): assert KBinsDiscretizer(n_bins=2).fit(X).n_bins_.dtype == np.dtype(int) -def test_n_bins_5_deprecated(): - # FIXME: remove in 1.2 - est = KBinsDiscretizer() - depr_msg = ("The default value of n_bins will change from " - "5 to 'auto' in 1.2") - with pytest.warns(FutureWarning, match=depr_msg): - est.fit(X) - - -def test_auto_bins(): - est = KBinsDiscretizer(n_bins='auto') - # for Sturges rule: ceil(log2(4) + 1) = 3 - # where 4 is n_samples in X - expected_bins = [3, 3, 3, 3] - est.fit(X) - assert np.all(est.n_bins_ == expected_bins) - - def test_invalid_n_bins(): est = KBinsDiscretizer(n_bins=1) err_msg = ("KBinsDiscretizer received an invalid " @@ -67,13 +49,6 @@ def test_invalid_n_bins(): with pytest.raises(ValueError, match=err_msg): est.fit_transform(X) - # Bad string value - est = KBinsDiscretizer(n_bins='rice') - err_msg = ("KBinsDiscretizer received an invalid " - "n_bins value. Received rice, while only 'auto' is supported.") - with pytest.raises(ValueError, match=err_msg): - est.fit_transform(X) - def test_invalid_n_bins_array(): # Bad shape diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 3d0cfc5151193..1756e0e4a65a6 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -206,10 +206,6 @@ def test_fit_docstring_attributes(name, Estimator): if Estimator.__name__ == 'SelectKBest': est.k = 2 - if Estimator.__name__ == 'KBinsDiscretizer': - # FIX ME: remove in 1.2 - est.n_bins = 'auto' - if Estimator.__name__ == 'DummyClassifier': est.strategy = "stratified" diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 55c047faecbcc..849d8a1f3921b 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -613,10 +613,6 @@ def _set_checking_parameters(estimator): if name == "TheilSenRegressor": estimator.max_subpopulation = 100 - if name == 'KBinsDiscretizer': - # FIX ME: remove in 1.2 - estimator.n_bins = 'auto' - if isinstance(estimator, BaseRandomProjection): # Due to the jl lemma and often very few samples, the number # of components of the random matrix projection will be probably From f7f394a711318bd7deb6abfb451bfa07681e051c Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Mon, 8 Feb 2021 11:34:19 +0000 Subject: [PATCH 64/72] reverse --- sklearn/preprocessing/_discretization.py | 29 ++++-------------------- 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 2e1cd8512c91e..520020dce1b05 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -31,10 +31,6 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): n_bins : int, 'auto' or array-like of shape (n_features,), dtype=integral,\ default=5 The number of bins to produce. Raises ValueError if ``n_bins < 2``. - For 'auto' option Sturges formula is used: bins are log(n_samples) + 1. - - .. versionadded:: 1.0 - Added 'auto' option encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot' Method used to encode the transformed result. @@ -131,7 +127,7 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): """ @_deprecate_positional_args - def __init__(self, n_bins='warn', *, encode='onehot', strategy='quantile', + def __init__(self, n_bins=5, *, encode='onehot', strategy='quantile', dtype=None): self.n_bins = n_bins self.encode = encode @@ -155,11 +151,6 @@ def fit(self, X, y=None): ------- self """ - self._n_bins = self.n_bins - if isinstance(self.n_bins, str) and self.n_bins == 'warn': - warnings.warn("The default value of n_bins will change from " - "5 to 'auto' in 1.2", FutureWarning) - self._n_bins = 5 X = self._validate_data(X, dtype='numeric') @@ -187,8 +178,8 @@ def fit(self, X, y=None): f"Got strategy={self.strategy!r} instead." ) - n_samples, n_features = X.shape - n_bins = self._validate_n_bins(n_features, n_samples) + n_features = X.shape[1] + n_bins = self._validate_n_bins(n_features) bin_edges = np.zeros(n_features, dtype=object) for jj in range(n_features): @@ -250,20 +241,10 @@ def fit(self, X, y=None): return self - def _validate_n_bins(self, n_features, n_samples): + def _validate_n_bins(self, n_features): """Returns n_bins_, the number of bins per feature. """ - orig_bins = self._n_bins - if isinstance(orig_bins, str): - if orig_bins == 'auto': - # calculate number of bins with Sturges rule - orig_bins = max(int(np.ceil(np.log2(n_samples) + 1.)), 2) - else: - raise ValueError( - f"{KBinsDiscretizer.__name__} received " - f"an invalid n_bins value. Received " - f"{orig_bins}, while only 'auto' is supported." - ) + orig_bins = self.n_bins if isinstance(orig_bins, numbers.Number): if not isinstance(orig_bins, numbers.Integral): raise ValueError( From 173f18fe1b492fe9b7d94352c09ac853dc963662 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Mon, 8 Feb 2021 11:35:26 +0000 Subject: [PATCH 65/72] reverse --- .../tests/test_discretization.py | 25 ------------------- 1 file changed, 25 deletions(-) diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py index cfb74e95afbb6..9d607c82d5831 100644 --- a/sklearn/preprocessing/tests/test_discretization.py +++ b/sklearn/preprocessing/tests/test_discretization.py @@ -36,24 +36,6 @@ def test_valid_n_bins(): assert KBinsDiscretizer(n_bins=2).fit(X).n_bins_.dtype == np.dtype(int) -def test_n_bins_5_deprecated(): - # FIXME: remove in 1.2 - est = KBinsDiscretizer() - depr_msg = ("The default value of n_bins will change from " - "5 to 'auto' in 1.2") - with pytest.warns(FutureWarning, match=depr_msg): - est.fit(X) - - -def test_auto_bins(): - est = KBinsDiscretizer(n_bins='auto') - # for Sturges rule: ceil(log2(4) + 1) = 3 - # where 4 is n_samples in X - expected_bins = [3, 3, 3, 3] - est.fit(X) - assert np.all(est.n_bins_ == expected_bins) - - def test_invalid_n_bins(): est = KBinsDiscretizer(n_bins=1) err_msg = ("KBinsDiscretizer received an invalid " @@ -67,13 +49,6 @@ def test_invalid_n_bins(): with pytest.raises(ValueError, match=err_msg): est.fit_transform(X) - # Bad string value - est = KBinsDiscretizer(n_bins='rice') - err_msg = ("KBinsDiscretizer received an invalid " - "n_bins value. Received rice, while only 'auto' is supported.") - with pytest.raises(ValueError, match=err_msg): - est.fit_transform(X) - def test_invalid_n_bins_array(): # Bad shape From 192a37c4791d544b8b67394471f140858e99b97d Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Mon, 8 Feb 2021 11:36:03 +0000 Subject: [PATCH 66/72] reverse --- sklearn/tests/test_docstring_parameters.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 3d0cfc5151193..1756e0e4a65a6 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -206,10 +206,6 @@ def test_fit_docstring_attributes(name, Estimator): if Estimator.__name__ == 'SelectKBest': est.k = 2 - if Estimator.__name__ == 'KBinsDiscretizer': - # FIX ME: remove in 1.2 - est.n_bins = 'auto' - if Estimator.__name__ == 'DummyClassifier': est.strategy = "stratified" From f003f99ef0c04ace0c16336a0ddac2827730da35 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Mon, 8 Feb 2021 11:37:16 +0000 Subject: [PATCH 67/72] reverse --- sklearn/utils/estimator_checks.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 55c047faecbcc..849d8a1f3921b 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -613,10 +613,6 @@ def _set_checking_parameters(estimator): if name == "TheilSenRegressor": estimator.max_subpopulation = 100 - if name == 'KBinsDiscretizer': - # FIX ME: remove in 1.2 - estimator.n_bins = 'auto' - if isinstance(estimator, BaseRandomProjection): # Due to the jl lemma and often very few samples, the number # of components of the random matrix projection will be probably From 181e0c4544a9608644c9cc1af1eddfacee417cf0 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Mon, 8 Feb 2021 11:39:53 +0000 Subject: [PATCH 68/72] reverse --- doc/whats_new/v1.0.rst | 5 ----- 1 file changed, 5 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 2cec2c6d8d9df..1f91634735a9c 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -136,11 +136,6 @@ Changelog positioning strategy ``knots``. :pr:`18368` by :user:`Christian Lorentzen `. -- |Enhancement| Added new argument ``auto`` for ``n_bins`` parameter in - :class:`preprocessing.KBinsDiscretizer` for calculating number of bins - via Sturges rule. - :pr:`19290` by :user:`Gleb Levitskiy `. - - |Efficiency| Changed ``algorithm`` argument for :class:`cluster.KMeans` in :class:`preprocessing.KBinsDiscretizer` from ``auto`` to ``full``. :pr:`19290` by :user:`Gleb Levitskiy `. From 4a3380af0118054785c52735eaab702bbeba86b0 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Mon, 8 Feb 2021 12:01:44 +0000 Subject: [PATCH 69/72] Update _discretization.py --- sklearn/preprocessing/_discretization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 520020dce1b05..2810834d529ea 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -28,7 +28,7 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): Parameters ---------- - n_bins : int, 'auto' or array-like of shape (n_features,), dtype=integral,\ + n_bins : int or array-like of shape (n_features,), dtype=integral,\ default=5 The number of bins to produce. Raises ValueError if ``n_bins < 2``. From cf290758d7fd3e33a0cb72128625e874f9123d89 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Tue, 20 Apr 2021 11:10:37 +0000 Subject: [PATCH 70/72] reverse --- sklearn/preprocessing/_discretization.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 22fa236f3314e..526aa74d31007 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -205,7 +205,8 @@ def fit(self, X, y=None): init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5 # 1D k-means procedure - km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1) + km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1, + algorithm='full') centers = km.fit(column[:, None]).cluster_centers_[:, 0] # Must sort, centers may be unsorted even with sorted init centers.sort() From f4d30ab7ca6e39b7803e7ca57f24326ba2a18422 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Tue, 20 Apr 2021 11:13:22 +0000 Subject: [PATCH 71/72] reverse --- sklearn/preprocessing/_discretization.py | 65 ++++++++++-------------- 1 file changed, 28 insertions(+), 37 deletions(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 2810834d529ea..9ce95a97544a5 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -28,8 +28,7 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): Parameters ---------- - n_bins : int or array-like of shape (n_features,), dtype=integral,\ - default=5 + n_bins : int or array-like of shape (n_features,), default=5 The number of bins to produce. Raises ValueError if ``n_bins < 2``. encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot' @@ -151,7 +150,6 @@ def fit(self, X, y=None): ------- self """ - X = self._validate_data(X, dtype='numeric') supported_dtype = (np.float64, np.float32) @@ -165,18 +163,17 @@ def fit(self, X, y=None): f"{supported_dtype + (None,)}. Got dtype={self.dtype} " f" instead." ) + valid_encode = ('onehot', 'onehot-dense', 'ordinal') if self.encode not in valid_encode: - raise ValueError( - f"Valid options for 'encode' are {valid_encode}. " - f"Got encode={self.encode!r} instead." - ) + raise ValueError("Valid options for 'encode' are {}. " + "Got encode={!r} instead." + .format(valid_encode, self.encode)) valid_strategy = ('uniform', 'quantile', 'kmeans') if self.strategy not in valid_strategy: - raise ValueError( - f"Valid options for 'strategy' are {valid_strategy}. " - f"Got strategy={self.strategy!r} instead." - ) + raise ValueError("Valid options for 'strategy' are {}. " + "Got strategy={!r} instead." + .format(valid_strategy, self.strategy)) n_features = X.shape[1] n_bins = self._validate_n_bins(n_features) @@ -187,8 +184,8 @@ def fit(self, X, y=None): col_min, col_max = column.min(), column.max() if col_min == col_max: - warnings.warn(f"Feature {jj} is constant and will be " - f"replaced with 0.") + warnings.warn("Feature %d is constant and will be " + "replaced with 0." % jj) n_bins[jj] = 1 bin_edges[jj] = np.array([-np.inf, np.inf]) continue @@ -208,8 +205,8 @@ def fit(self, X, y=None): init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5 # 1D k-means procedure - km = KMeans(n_clusters=n_bins[jj], init=init, - n_init=1, algorithm='full') + km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1, + algorithm='full') centers = km.fit(column[:, None]).cluster_centers_[:, 0] # Must sort, centers may be unsorted even with sorted init centers.sort() @@ -221,10 +218,9 @@ def fit(self, X, y=None): mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8 bin_edges[jj] = bin_edges[jj][mask] if len(bin_edges[jj]) - 1 != n_bins[jj]: - warnings.warn(f"Bins whose width are too small " - f"(i.e., <= 1e-8) in feature {jj} " - f"are removed. Consider decreasing " - f"the number of bins.") + warnings.warn('Bins whose width are too small (i.e., <= ' + '1e-8) in feature %d are removed. Consider ' + 'decreasing the number of bins.' % jj) n_bins[jj] = len(bin_edges[jj]) - 1 self.bin_edges_ = bin_edges @@ -247,16 +243,14 @@ def _validate_n_bins(self, n_features): orig_bins = self.n_bins if isinstance(orig_bins, numbers.Number): if not isinstance(orig_bins, numbers.Integral): - raise ValueError( - f"{KBinsDiscretizer.__name__} received " - f"an invalid n_bins type. Received " - f"{type(orig_bins).__name__}, expected int." - ) + raise ValueError("{} received an invalid n_bins type. " + "Received {}, expected int." + .format(KBinsDiscretizer.__name__, + type(orig_bins).__name__)) if orig_bins < 2: - raise ValueError( - f"{KBinsDiscretizer.__name__} received an invalid number " - f"of bins. Received {orig_bins}, expected at least 2." - ) + raise ValueError("{} received an invalid number " + "of bins. Received {}, expected at least 2." + .format(KBinsDiscretizer.__name__, orig_bins)) return np.full(n_features, orig_bins, dtype=int) n_bins = check_array(orig_bins, dtype=int, copy=True, @@ -271,11 +265,10 @@ def _validate_n_bins(self, n_features): violating_indices = np.where(bad_nbins_value)[0] if violating_indices.shape[0] > 0: indices = ", ".join(str(i) for i in violating_indices) - raise ValueError( - f"{KBinsDiscretizer.__name__} received an invalid number " - f"of bins at indices {indices}. Number of bins " - f"must be at least 2, and must be an int." - ) + raise ValueError("{} received an invalid number " + "of bins at indices {}. Number of bins " + "must be at least 2, and must be an int." + .format(KBinsDiscretizer.__name__, indices)) return n_bins def transform(self, X): @@ -350,10 +343,8 @@ def inverse_transform(self, Xt): Xinv = check_array(Xt, copy=True, dtype=(np.float64, np.float32)) n_features = self.n_bins_.shape[0] if Xinv.shape[1] != n_features: - raise ValueError( - f"Incorrect number of features. Expecting {n_features}, " - f"received {Xinv.shape[1]}." - ) + raise ValueError("Incorrect number of features. Expecting {}, " + "received {}.".format(n_features, Xinv.shape[1])) for jj in range(n_features): bin_edges = self.bin_edges_[jj] From 6e3f50f39424bdd8cc2ef955594e4da1556bff46 Mon Sep 17 00:00:00 2001 From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com> Date: Tue, 20 Apr 2021 12:11:13 +0000 Subject: [PATCH 72/72] Update _discretization.py --- sklearn/preprocessing/_discretization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 526aa74d31007..4747c4adc4945 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -205,8 +205,8 @@ def fit(self, X, y=None): init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5 # 1D k-means procedure - km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1, - algorithm='full') + km = KMeans(n_clusters=n_bins[jj], init=init, + n_init=1, algorithm='full') centers = km.fit(column[:, None]).cluster_centers_[:, 0] # Must sort, centers may be unsorted even with sorted init centers.sort()