diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index dd1f798ccb3aa..04f9228750ece 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -540,6 +540,24 @@ columns for this feature will be all zeros
   array([[1., 0., 0., 0., 0., 0.]])
 
+Missing categorical features in the training data can be handled by
+specifying what happens to them using the ``handle_missing`` parameter. Its
+value can be one of:
+
+``all-missing`` : replace each row that contains a missing value with a row
+of NaNs.
+
+``all-zero`` : replace each row that contains a missing value with a row of
+zeros.
+
+``category`` : treat missing values as a separate category, represented by
+an additional one-hot column.
+
+Note that for :class:`OneHotEncoder` to handle missing values, you have to
+pass a placeholder for what should be treated as missing. This is the
+``missing_values`` parameter, which accepts either ``None`` (the default, in
+which case no missing-value handling is performed) or the string ``'NaN'``.
+
 See :ref:`dict_feature_extraction` for categorical features that are
 represented as a dict, not as scalars.
 
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index bd6e10fb62810..43cf8a1a97513 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -108,20 +108,16 @@ def _transform(self, X, handle_unknown='error'):
 
         return X_int, X_mask
 
 
-class OneHotEncoder(_BaseEncoder):
-    """Encode categorical integer features as a one-hot numeric array.
+class OneHotEncoder(BaseEstimator, TransformerMixin):
+    """Encode categorical integer features using a one-hot aka one-of-K scheme.
 
-    The input to this transformer should be an array-like of integers or
-    strings, denoting the values taken on by categorical (discrete) features.
-    The features are encoded using a one-hot (aka 'one-of-K' or 'dummy')
-    encoding scheme. This creates a binary column for each category and
-    returns a sparse matrix or dense array.
-
-    By default, the encoder derives the categories based on the unique values
-    in each feature. Alternatively, you can also specify the `categories`
-    manually.
-    The OneHotEncoder previously assumed that the input features take on
-    values in the range [0, max(values)). This behaviour is deprecated.
+    The input to this transformer should be a matrix of integers, denoting
+    the values taken on by categorical (discrete) features. The output will be
+    a sparse matrix where each column corresponds to one possible value of one
+    feature. It is assumed that input features take on values in the range
+    [0, n_values). For an encoder based on the unique values of the input
+    features of any type, see the
+    :class:`~sklearn.preprocessing.CategoricalEncoder`.
 
     This encoding is needed for feeding categorical data to many scikit-learn
     estimators, notably linear models and SVMs with the standard kernels.
 
@@ -133,31 +129,6 @@ class OneHotEncoder(_BaseEncoder):
 
     Parameters
     ----------
-    categories : 'auto' or a list of lists/arrays of values.
-        Categories (unique values) per feature:
-
-        - 'auto' : Determine categories automatically from the training data.
-        - list : ``categories[i]`` holds the categories expected in the ith
-          column. The passed categories should not mix strings and numeric
-          values within a single feature, and should be sorted in case of
-          numeric values.
-
-        The used categories can be found in the ``categories_`` attribute.
-
-    sparse : boolean, default=True
-        Will return sparse matrix if set True else will return an array.
-
-    dtype : number type, default=np.float
-        Desired dtype of output.
-
-    handle_unknown : 'error' (default) or 'ignore'
-        Whether to raise an error or ignore if an unknown categorical feature
-        is present during transform (default is to raise). When this parameter
-        is set to 'ignore' and an unknown category is encountered during
-        transform, the resulting one-hot encoded columns for this feature
-        will be all zeros. In the inverse transform, an unknown category
-        will be denoted as None.
-
     n_values : 'auto', int or array of ints
         Number of values per feature.
 
         - 'auto' : determine value range from training data.
         - int : number of categorical values per feature.
                 Each feature value should be in ``range(n_values)``
         - array : ``n_values[i]`` is the number of categorical values in
                   ``X[:, i]``. Each feature value should be
                   in ``range(n_values[i])``
 
-        .. deprecated:: 0.20
-            The `n_values` keyword was deprecated in version 0.20 and will
-            be removed in 0.22. Use `categories` instead.
-
     categorical_features : "all" or array of indices or mask
         Specify what features are treated as categorical.
 
         - 'all' (default): All features are treated as categorical.
         - array of indices: Array of categorical feature indices.
         - mask: Array of length n_features and with dtype=bool.
 
         Non-categorical features are always stacked to the right of
         the matrix.
 
-        .. deprecated:: 0.20
-            The `categorical_features` keyword was deprecated in version
-            0.20 and will be removed in 0.22.
-            You can use the ``ColumnTransformer`` instead.
+    dtype : number type, default=np.float
+        Desired dtype of output.
+
+    sparse : boolean, default=True
+        Will return sparse matrix if set True else will return an array.
+
+    handle_unknown : str, 'error' or 'ignore'
+        Whether to raise an error or ignore if an unknown categorical feature
+        is present during transform.
+
+    missing_values : 'NaN' or None
+        The placeholder that should be treated as a missing value. ``None``
+        (the default) disables missing-value handling.
+
+    handle_missing : str, 'all-missing', 'all-zero' or 'category'
+        What should be done with missing values. Should be one of:
+
+        all-missing: Replace each row containing a missing value with a row
+        of NaNs.
+
+        all-zero: Replace each row containing a missing value with a row of
+        zeros.
+
+        category: Represent missing values with a separate one-hot column.
 
     Attributes
     ----------
-    categories_ : list of arrays
-        The categories of each feature determined during fitting
-        (in order of the features in X and corresponding with the output
-        of ``transform``).
-
     active_features_ : array
         Indices for active features, meaning values that actually occur
         in the training set. Only available when n_values is ``'auto'``.
 
-        .. deprecated:: 0.20
-            The ``active_features_`` attribute was deprecated in version
-            0.20 and will be removed in 0.22.
-
     feature_indices_ : array of shape (n_features,)
         Indices to feature ranges.
         Feature ``i`` in the original data is mapped to features
         from ``feature_indices_[i]`` to ``feature_indices_[i+1]``
-        (and then potentially masked by ``active_features_`` afterwards)
-
-        .. deprecated:: 0.20
-            The ``feature_indices_`` attribute was deprecated in version
-            0.20 and will be removed in 0.22.
+        (and then potentially masked by `active_features_` afterwards)
 
     n_values_ : array of shape (n_features,)
         Maximum number of values per feature.
 
-        .. deprecated:: 0.20
-            The ``n_values_`` attribute was deprecated in version
-            0.20 and will be removed in 0.22.
-
     Examples
     --------
-    Given a dataset with two features, we let the encoder find the unique
-    values per feature and transform the data to a binary one-hot encoding.
+    Given a dataset with three features and four samples, we let the encoder
+    find the maximum value per feature and transform the data to a binary
+    one-hot encoding.
 
     >>> from sklearn.preprocessing import OneHotEncoder
-    >>> enc = OneHotEncoder(handle_unknown='ignore')
-    >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
-    >>> enc.fit(X)
-    ... 
# doctest: +ELLIPSIS
-    OneHotEncoder(categorical_features=None, categories=None,
-           dtype=<... 'numpy.float64'>, handle_unknown='ignore',
-           n_values=None, sparse=True)
-
-    >>> enc.categories_
-    [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
-    >>> enc.transform([['Female', 1], ['Male', 4]]).toarray()
-    array([[1., 0., 1., 0., 0.],
-           [0., 1., 0., 0., 0.]])
-    >>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])
-    array([['Male', 1],
-           [None, 2]], dtype=object)
-    >>> enc.get_feature_names()
-    array(['x0_Female', 'x0_Male', 'x1_1', 'x1_2', 'x1_3'], dtype=object)
+    >>> enc = OneHotEncoder()
+    >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \
+[1, 0, 2]])  # doctest: +ELLIPSIS
+    OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>,
+           handle_missing=None, handle_unknown='error', missing_values=None,
+           n_values='auto', sparse=True)
+    >>> enc.n_values_
+    array([2, 3, 4])
+    >>> enc.feature_indices_
+    array([0, 2, 5, 9])
+    >>> enc.transform([[0, 1, 1]]).toarray()
+    array([[1., 0., 0., 1., 0., 0., 1., 0., 0.]])
 
     See also
     --------
-    sklearn.preprocessing.OrdinalEncoder : performs an ordinal (integer)
-      encoding of the categorical features.
+    sklearn.preprocessing.CategoricalEncoder : performs a one-hot or ordinal
+      encoding of all features (also handles string-valued features). This
+      encoder derives the categories based on the unique values in each
+      feature.
     sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding
       of dictionary items (also handles string-valued features).
     sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot
       encoding of dictionary items or strings.
     sklearn.preprocessing.LabelBinarizer : binarizes labels in a one-vs-all
       fashion.
     sklearn.preprocessing.MultiLabelBinarizer : transforms between iterable of
       iterables and a multilabel format, e.g. a (samples x classes) binary
       matrix indicating the presence of a class label.
+    sklearn.preprocessing.LabelEncoder : encodes labels with values between 0
+      and n_classes-1.
""" - def __init__(self, n_values=None, categorical_features=None, - categories=None, sparse=True, dtype=np.float64, - handle_unknown='error'): - self.categories = categories - self.sparse = sparse - self.dtype = dtype - self.handle_unknown = handle_unknown + def __init__(self, n_values="auto", categorical_features="all", + dtype=np.float64, sparse=True, handle_unknown='error', missing_values=None, handle_missing=None): self.n_values = n_values self.categorical_features = categorical_features - - # Deprecated attributes - - @property - @deprecated("The ``active_features_`` attribute was deprecated in version " - "0.20 and will be removed 0.22.") - def active_features_(self): - check_is_fitted(self, 'categories_') - return self._active_features_ - - @property - @deprecated("The ``feature_indices_`` attribute was deprecated in version " - "0.20 and will be removed 0.22.") - def feature_indices_(self): - check_is_fitted(self, 'categories_') - return self._feature_indices_ - - @property - @deprecated("The ``n_values_`` attribute was deprecated in version " - "0.20 and will be removed 0.22.") - def n_values_(self): - check_is_fitted(self, 'categories_') - return self._n_values_ - - def _handle_deprecations(self, X): - - # internal version of the attributes to handle deprecations - self._categories = getattr(self, '_categories', None) - self._categorical_features = getattr(self, '_categorical_features', - None) - - # user manually set the categories or second fit -> never legacy mode - if self.categories is not None or self._categories is not None: - self._legacy_mode = False - if self.categories is not None: - self._categories = self.categories - - # categories not set -> infer if we need legacy mode or not - elif self.n_values is not None and self.n_values != 'auto': - msg = ( - "Passing 'n_values' is deprecated in version 0.20 and will be " - "removed in 0.22. You can use the 'categories' keyword " - "instead. 'n_values=n' corresponds to 'categories=[range(n)]'." - ) - warnings.warn(msg, DeprecationWarning) - self._legacy_mode = True - - else: # n_values = 'auto' - if self.handle_unknown == 'ignore': - # no change in behaviour, no need to raise deprecation warning - self._legacy_mode = False - self._categories = 'auto' - if self.n_values == 'auto': - # user manually specified this - msg = ( - "Passing 'n_values' is deprecated in version 0.20 and " - "will be removed in 0.22. n_values='auto' can be " - "replaced with categories='auto'." - ) - warnings.warn(msg, DeprecationWarning) - else: - - # check if we have integer or categorical input - try: - X = check_array(X, dtype=np.int) - except ValueError: - self._legacy_mode = False - self._categories = 'auto' - else: - msg = ( - "The handling of integer data will change in version " - "0.22. Currently, the categories are determined " - "based on the range [0, max(values)], while in the " - "future they will be determined based on the unique " - "values.\nIf you want the future behaviour and " - "silence this warning, you can specify " - "\"categories='auto'\".\n" - "In case you used a LabelEncoder before this " - "OneHotEncoder to convert the categories to integers, " - "then you can now use the OneHotEncoder directly." 
- ) - warnings.warn(msg, FutureWarning) - self._legacy_mode = True - self.n_values = 'auto' - - # if user specified categorical_features -> always use legacy mode - if self.categorical_features is not None: - if (isinstance(self.categorical_features, six.string_types) - and self.categorical_features == 'all'): - warnings.warn( - "The 'categorical_features' keyword is deprecated in " - "version 0.20 and will be removed in 0.22. The passed " - "value of 'all' is the default and can simply be removed.", - DeprecationWarning) - else: - if self.categories is not None: - raise ValueError( - "The 'categorical_features' keyword is deprecated, " - "and cannot be used together with specifying " - "'categories'.") - warnings.warn( - "The 'categorical_features' keyword is deprecated in " - "version 0.20 and will be removed in 0.22. You can " - "use the ColumnTransformer instead.", DeprecationWarning) - self._legacy_mode = True - self._categorical_features = self.categorical_features - else: - self._categorical_features = 'all' + self.dtype = dtype + self.sparse = sparse + self.handle_unknown = handle_unknown + self.handle_missing = handle_missing + self.missing_values = missing_values def fit(self, X, y=None): """Fit OneHotEncoder to X. @@ -382,31 +239,17 @@ def fit(self, X, y=None): Parameters ---------- X : array-like, shape [n_samples, n_feature] - The data to determine the categories of each feature. + Input array of type int. Returns ------- self """ - if self.handle_unknown not in ('error', 'ignore'): - msg = ("handle_unknown should be either 'error' or 'ignore', " - "got {0}.".format(self.handle_unknown)) - raise ValueError(msg) - - self._handle_deprecations(X) - - if self._legacy_mode: - _transform_selected(X, self._legacy_fit_transform, self.dtype, - self._categorical_features, - copy=True) - return self - else: - self._fit(X, handle_unknown=self.handle_unknown) - return self + self.fit_transform(X) + return self - def _legacy_fit_transform(self, X): + def _fit_transform(self, X): """Assumes X contains only categorical features.""" - dtype = getattr(X, 'dtype', None) X = check_array(X, dtype=np.int) if np.any(X < 0): raise ValueError("X needs to contain only non-negative integers.") @@ -431,12 +274,10 @@ def _legacy_fit_transform(self, X): raise ValueError("Shape mismatch: if n_values is an array," " it has to be of shape (n_features,).") - self._n_values_ = n_values - self.categories_ = [np.arange(n_val - 1, dtype=dtype) - for n_val in n_values] + self.n_values_ = n_values n_values = np.hstack([[0], n_values]) indices = np.cumsum(n_values) - self._feature_indices_ = indices + self.feature_indices_ = indices column_indices = (X + indices[:-1]).ravel() row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), @@ -451,11 +292,7 @@ def _legacy_fit_transform(self, X): mask = np.array(out.sum(axis=0)).ravel() != 0 active_features = np.where(mask)[0] out = out[:, active_features] - self._active_features_ = active_features - - self.categories_ = [ - np.unique(X[:, i]).astype(dtype) if dtype - else np.unique(X[:, i]) for i in range(n_features)] + self.active_features_ = active_features return out if self.sparse else out.toarray() @@ -470,28 +307,36 @@ def fit_transform(self, X, y=None): X : array-like, shape [n_samples, n_feature] Input array of type int. 
""" - if self.handle_unknown not in ('error', 'ignore'): - msg = ("handle_unknown should be either 'error' or 'ignore', " - "got {0}.".format(self.handle_unknown)) - raise ValueError(msg) - - self._handle_deprecations(X) - - if self._legacy_mode: - return _transform_selected( - X, self._legacy_fit_transform, self.dtype, - self._categorical_features, copy=True) - else: - return self.fit(X).transform(X) + if not self.missing_values: + return _transform_selected(X, self._fit_transform, + self.categorical_features, copy=True) + if self.missing_values and self.missing_values != "NaN": + raise ValueError("Wrong 'missing_missing' value specified. " + "'missing_values' should be one of either 'None' or 'NaN'") + if self.missing_values == "NaN": + if not self.handle_missing: + raise ValueError("'handle_missing' cannot be None when 'missing_values' is passed.") + if self.handle_missing not in ["all-missing", "all-zero", "category"]: + raise ValueError("Wrong 'handle_missing' value specified. " + "'handle_missing' should be one of either ['all-missing', 'all-zero', 'category']") + if self.handle_missing == "all-missing": + # Replace entire row with NaN + pass + if self.handle_missing == "all-zero": + # Replace with a row of zeros + pass + else: + # Replace with a seperate one-hot column + pass - def _legacy_transform(self, X): + def _transform(self, X): """Assumes X contains only categorical features.""" X = check_array(X, dtype=np.int) if np.any(X < 0): raise ValueError("X needs to contain only non-negative integers.") n_samples, n_features = X.shape - indices = self._feature_indices_ + indices = self.feature_indices_ if n_features != indices.shape[0] - 1: raise ValueError("X has different shape than during fitting." " Expected %d, got %d." @@ -502,7 +347,7 @@ def _legacy_transform(self, X): # This means, if self.handle_unknown is "ignore", the row_indices and # col_indices corresponding to the unknown categorical feature are # ignored. - mask = (X < self._n_values_).ravel() + mask = (X < self.n_values_).ravel() if np.any(~mask): if self.handle_unknown not in ['error', 'ignore']: raise ValueError("handle_unknown should be either error or " @@ -520,158 +365,25 @@ def _legacy_transform(self, X): dtype=self.dtype).tocsr() if (isinstance(self.n_values, six.string_types) and self.n_values == 'auto'): - out = out[:, self._active_features_] + out = out[:, self.active_features_] return out if self.sparse else out.toarray() - def _transform_new(self, X): - """New implementation assuming categorical input""" - X_temp = check_array(X, dtype=None) - if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_): - X = check_array(X, dtype=np.object) - else: - X = X_temp - - n_samples, n_features = X.shape - - X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown) - - mask = X_mask.ravel() - n_values = [cats.shape[0] for cats in self.categories_] - n_values = np.array([0] + n_values) - feature_indices = np.cumsum(n_values) - - indices = (X_int + feature_indices[:-1]).ravel()[mask] - indptr = X_mask.sum(axis=1).cumsum() - indptr = np.insert(indptr, 0, 0) - data = np.ones(n_samples * n_features)[mask] - - out = sparse.csr_matrix((data, indices, indptr), - shape=(n_samples, feature_indices[-1]), - dtype=self.dtype) - if not self.sparse: - return out.toarray() - else: - return out - def transform(self, X): """Transform X using one-hot encoding. Parameters ---------- X : array-like, shape [n_samples, n_features] - The data to encode. + Input array of type int. 
Returns ------- - X_out : sparse matrix if sparse=True else a 2-d array + X_out : sparse matrix if sparse=True else a 2-d array, dtype=int Transformed input. """ - if self._legacy_mode: - return _transform_selected(X, self._legacy_transform, self.dtype, - self._categorical_features, - copy=True) - else: - return self._transform_new(X) - - def inverse_transform(self, X): - """Convert the back data to the original representation. - - In case unknown categories are encountered (all zero's in the - one-hot encoding), ``None`` is used to represent this category. - - Parameters - ---------- - X : array-like or sparse matrix, shape [n_samples, n_encoded_features] - The transformed data. - - Returns - ------- - X_tr : array-like, shape [n_samples, n_features] - Inverse transformed array. - - """ - # if self._legacy_mode: - # raise ValueError("only supported for categorical features") - - check_is_fitted(self, 'categories_') - X = check_array(X, accept_sparse='csr') - - n_samples, _ = X.shape - n_features = len(self.categories_) - n_transformed_features = sum([len(cats) for cats in self.categories_]) - - # validate shape of passed X - msg = ("Shape of the passed X data is not correct. Expected {0} " - "columns, got {1}.") - if X.shape[1] != n_transformed_features: - raise ValueError(msg.format(n_transformed_features, X.shape[1])) - - # create resulting array of appropriate dtype - dt = np.find_common_type([cat.dtype for cat in self.categories_], []) - X_tr = np.empty((n_samples, n_features), dtype=dt) - - j = 0 - found_unknown = {} - - for i in range(n_features): - n_categories = len(self.categories_[i]) - sub = X[:, j:j + n_categories] - - # for sparse X argmax returns 2D matrix, ensure 1D array - labels = np.asarray(_argmax(sub, axis=1)).flatten() - X_tr[:, i] = self.categories_[i][labels] - - if self.handle_unknown == 'ignore': - # ignored unknown categories: we have a row of all zero's - unknown = np.asarray(sub.sum(axis=1) == 0).flatten() - if unknown.any(): - found_unknown[i] = unknown - - j += n_categories - - # if ignored are found: potentially need to upcast result to - # insert None values - if found_unknown: - if X_tr.dtype != object: - X_tr = X_tr.astype(object) - - for idx, mask in found_unknown.items(): - X_tr[mask, idx] = None - - return X_tr - - def get_feature_names(self, input_features=None): - """Return feature names for output features. - - Parameters - ---------- - input_features : list of string, length n_features, optional - String names for input features if available. By default, - "x0", "x1", ... "xn_features" is used. 
-
-        Returns
-        -------
-        output_feature_names : array of string, length n_output_features
-
-        """
-        check_is_fitted(self, 'categories_')
-        cats = self.categories_
-        if input_features is None:
-            input_features = ['x%d' % i for i in range(len(cats))]
-        elif(len(input_features) != len(self.categories_)):
-            raise ValueError(
-                "input_features should have length equal to number of "
-                "features ({}), got {}".format(len(self.categories_),
-                                               len(input_features)))
-
-        feature_names = []
-        for i in range(len(cats)):
-            names = [
-                input_features[i] + '_' + six.text_type(t) for t in cats[i]]
-            feature_names.extend(names)
-
-        return np.array(feature_names, dtype=object)
+        return _transform_selected(X, self._transform,
+                                   self.categorical_features, copy=True)
 
 
 class OrdinalEncoder(_BaseEncoder):
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 0a33f9140f902..656cdb879f94e 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -2916,4 +2916,4 @@ def __init__(*args, **kwargs):
         raise RuntimeError(
             "CategoricalEncoder briefly existed in 0.20dev. Its functionality "
             "has been rolled into the OneHotEncoder and OrdinalEncoder. "
-            "This stub will be removed in version 0.21.")
+            "This stub will be removed in version 0.21.")
\ No newline at end of file
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index f4d0b5af9799f..8f603686033ce 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -7,7 +7,6 @@
 
 import warnings
 import re
-import itertools
 
 import numpy as np
 import numpy.linalg as la
@@ -33,15 +32,18 @@
 from sklearn.utils.testing import assert_warns_message
 from sklearn.utils.testing import assert_no_warnings
 from sklearn.utils.testing import assert_allclose
-from sklearn.utils.testing import assert_allclose_dense_sparse
 from sklearn.utils.testing import skip_if_32bit
+from sklearn.utils.testing import SkipTest
 
 from sklearn.utils.sparsefuncs import mean_variance_axis
+from sklearn.preprocessing.data import _transform_selected
 from sklearn.preprocessing.data import _handle_zeros_in_scale
 from sklearn.preprocessing.data import Binarizer
 from sklearn.preprocessing.data import KernelCenterer
 from sklearn.preprocessing.data import Normalizer
 from sklearn.preprocessing.data import normalize
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.preprocessing.data import CategoricalEncoder
 from sklearn.preprocessing.data import StandardScaler
 from sklearn.preprocessing.data import scale
 from sklearn.preprocessing.data import MinMaxScaler
@@ -58,11 +60,9 @@
 from sklearn.preprocessing.data import power_transform
 from sklearn.exceptions import DataConversionWarning, NotFittedError
 
-from sklearn.base import clone
 from sklearn.pipeline import Pipeline
 from sklearn.model_selection import cross_val_predict
 from sklearn.svm import SVR
-from sklearn.utils import shuffle
 
 from sklearn import datasets
 
@@ -210,7 +210,7 @@ def test_standard_scaler_1d():
     assert_array_almost_equal(X_scaled_back, X)
 
     # Constant feature
-    X = np.ones((5, 1))
+    X = np.ones(5).reshape(5, 1)
     scaler = StandardScaler()
     X_scaled = scaler.fit(X).transform(X, copy=True)
     assert_almost_equal(scaler.mean_, 1.)
@@ -238,7 +238,7 @@ def test_standard_scaler_numerical_stability():
     # np.log(1e-5) is taken because of its floating point representation
     # was empirically found to cause numerical problems with np.mean & np.std.
- x = np.full(8, np.log(1e-5), dtype=np.float64) + x = np.zeros(8, dtype=np.float64) + np.log(1e-5, dtype=np.float64) if LooseVersion(np.__version__) >= LooseVersion('1.9'): # This does not raise a warning as the number of samples is too low # to trigger the problem in recent numpy @@ -250,17 +250,17 @@ def test_standard_scaler_numerical_stability(): assert_array_almost_equal(x_scaled, np.zeros(8)) # with 2 more samples, the std computation run into numerical issues: - x = np.full(10, np.log(1e-5), dtype=np.float64) + x = np.zeros(10, dtype=np.float64) + np.log(1e-5, dtype=np.float64) w = "standard deviation of the data is probably very close to 0" x_scaled = assert_warns_message(UserWarning, w, scale, x) assert_array_almost_equal(x_scaled, np.zeros(10)) - x = np.full(10, 1e-100, dtype=np.float64) + x = np.ones(10, dtype=np.float64) * 1e-100 x_small_scaled = assert_no_warnings(scale, x) assert_array_almost_equal(x_small_scaled, np.zeros(10)) # Large values can cause (often recoverable) numerical stability issues: - x_big = np.full(10, 1e100, dtype=np.float64) + x_big = np.ones(10, dtype=np.float64) * 1e100 w = "Dataset may contain too large values" x_big_scaled = assert_warns_message(UserWarning, w, scale, x_big) assert_array_almost_equal(x_big_scaled, np.zeros(10)) @@ -511,7 +511,7 @@ def test_standard_scaler_trasform_with_partial_fit(): assert_array_almost_equal(X_sofar, right_input) zero = np.zeros(X.shape[1]) - epsilon = np.finfo(float).eps + epsilon = np.nextafter(0, 1) assert_array_less(zero, scaler_incr.var_ + epsilon) # as less or equal assert_array_less(zero, scaler_incr.scale_ + epsilon) # (i+1) because the Scaler has been already fitted @@ -622,7 +622,7 @@ def test_min_max_scaler_1d(): assert_array_almost_equal(X_scaled_back, X) # Constant feature - X = np.ones((5, 1)) + X = np.ones(5).reshape(5, 1) scaler = MinMaxScaler() X_scaled = scaler.fit(X).transform(X) assert_greater_equal(X_scaled.min(), 0.) 
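A minimal sketch of how the `pass` placeholders in `fit_transform` above
could be filled in, assuming missing entries arrive as `np.nan` in a float
array. The helper name `_apply_handle_missing` and its signature are
hypothetical and not part of this patch:

    import numpy as np


    def _apply_handle_missing(X, handle_missing):
        # Return (X_filled, missing_rows): a NaN-free copy of X plus a
        # boolean mask of the rows that contained a missing value. The
        # caller one-hot encodes X_filled, then overwrites the rows flagged
        # by missing_rows with NaNs ('all-missing') or zeros ('all-zero').
        X = np.asarray(X, dtype=np.float64)
        missing_rows = np.isnan(X).any(axis=1)
        if handle_missing in ("all-missing", "all-zero"):
            # Impute with 0 so the integer-based encoder accepts the rows;
            # the encoded rows are overwritten afterwards.
            return np.where(np.isnan(X), 0, X), missing_rows
        elif handle_missing == "category":
            # Recode NaN as one extra integer category per feature, so the
            # encoder allocates an additional one-hot column for it.
            X_filled = X.copy()
            for j in range(X.shape[1]):
                col = X_filled[:, j]
                nan_mask = np.isnan(col)
                max_val = -1 if nan_mask.all() else np.nanmax(col)
                col[nan_mask] = max_val + 1
            return X_filled, missing_rows
        raise ValueError("Wrong 'handle_missing' value: %r" % handle_missing)

For example, `_apply_handle_missing([[0, np.nan], [1, 2]], "all-zero")`
returns the imputed array together with the mask `[True, False]`, and the
caller would set `out[mask] = 0` after encoding.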
@@ -701,85 +701,6 @@ def test_scaler_without_centering(): assert_array_almost_equal(X_csc_scaled_back.toarray(), X) -@pytest.mark.parametrize("with_mean", [True, False]) -@pytest.mark.parametrize("with_std", [True, False]) -@pytest.mark.parametrize("array_constructor", - [np.asarray, sparse.csc_matrix, sparse.csr_matrix]) -def test_scaler_n_samples_seen_with_nan(with_mean, with_std, - array_constructor): - X = np.array([[0, 1, 3], - [np.nan, 6, 10], - [5, 4, np.nan], - [8, 0, np.nan]], - dtype=np.float64) - X = array_constructor(X) - - if sparse.issparse(X) and with_mean: - pytest.skip("'with_mean=True' cannot be used with sparse matrix.") - - transformer = StandardScaler(with_mean=with_mean, with_std=with_std) - transformer.fit(X) - - assert_array_equal(transformer.n_samples_seen_, np.array([3, 4, 2])) - - -def _check_identity_scalers_attributes(scaler_1, scaler_2): - assert scaler_1.mean_ is scaler_2.mean_ is None - assert scaler_1.var_ is scaler_2.var_ is None - assert scaler_1.scale_ is scaler_2.scale_ is None - assert scaler_1.n_samples_seen_ == scaler_2.n_samples_seen_ - - -def test_scaler_return_identity(): - # test that the scaler return identity when with_mean and with_std are - # False - X_dense = np.array([[0, 1, 3], - [5, 6, 0], - [8, 0, 10]], - dtype=np.float64) - X_csr = sparse.csr_matrix(X_dense) - X_csc = X_csr.tocsc() - - transformer_dense = StandardScaler(with_mean=False, with_std=False) - X_trans_dense = transformer_dense.fit_transform(X_dense) - - transformer_csr = clone(transformer_dense) - X_trans_csr = transformer_csr.fit_transform(X_csr) - - transformer_csc = clone(transformer_dense) - X_trans_csc = transformer_csc.fit_transform(X_csc) - - assert_allclose_dense_sparse(X_trans_csr, X_csr) - assert_allclose_dense_sparse(X_trans_csc, X_csc) - assert_allclose(X_trans_dense, X_dense) - - for trans_1, trans_2 in itertools.combinations([transformer_dense, - transformer_csr, - transformer_csc], - 2): - _check_identity_scalers_attributes(trans_1, trans_2) - - transformer_dense.partial_fit(X_dense) - transformer_csr.partial_fit(X_csr) - transformer_csc.partial_fit(X_csc) - - for trans_1, trans_2 in itertools.combinations([transformer_dense, - transformer_csr, - transformer_csc], - 2): - _check_identity_scalers_attributes(trans_1, trans_2) - - transformer_dense.fit(X_dense) - transformer_csr.fit(X_csr) - transformer_csc.fit(X_csc) - - for trans_1, trans_2 in itertools.combinations([transformer_dense, - transformer_csr, - transformer_csc], - 2): - _check_identity_scalers_attributes(trans_1, trans_2) - - def test_scaler_int(): # test that scaler converts integer input to floating # for both sparse and dense matrices @@ -901,56 +822,15 @@ def test_scale_sparse_with_mean_raise_exception(): def test_scale_input_finiteness_validation(): # Check if non finite inputs raise ValueError - X = [[np.inf, 5, 6, 7, 8]] + X = [[np.nan, 5, 6, 7, 8]] assert_raises_regex(ValueError, - "Input contains infinity or a value too large", + "Input contains NaN, infinity or a value too large", scale, X) - -def test_robust_scaler_error_sparse(): - X_sparse = sparse.rand(1000, 10) - scaler = RobustScaler(with_centering=True) - err_msg = "Cannot center sparse matrices" - with pytest.raises(ValueError, match=err_msg): - scaler.fit(X_sparse) - - -@pytest.mark.parametrize("with_centering", [True, False]) -@pytest.mark.parametrize("with_scaling", [True, False]) -@pytest.mark.parametrize("X", [np.random.randn(10, 3), - sparse.rand(10, 3, density=0.5)]) -def test_robust_scaler_attributes(X, with_centering, 
with_scaling): - # check consistent type of attributes - if with_centering and sparse.issparse(X): - pytest.skip("RobustScaler cannot center sparse matrix") - - scaler = RobustScaler(with_centering=with_centering, - with_scaling=with_scaling) - scaler.fit(X) - - if with_centering: - assert isinstance(scaler.center_, np.ndarray) - else: - assert scaler.center_ is None - if with_scaling: - assert isinstance(scaler.scale_, np.ndarray) - else: - assert scaler.scale_ is None - - -def test_robust_scaler_col_zero_sparse(): - # check that the scaler is working when there is not data materialized in a - # column of a sparse matrix - X = np.random.randn(10, 5) - X[:, 0] = 0 - X = sparse.csr_matrix(X) - - scaler = RobustScaler(with_centering=False) - scaler.fit(X) - assert scaler.scale_[0] == pytest.approx(1) - - X_trans = scaler.transform(X) - assert_allclose(X[:, 0].toarray(), X_trans[:, 0].toarray()) + X = [[np.inf, 5, 6, 7, 8]] + assert_raises_regex(ValueError, + "Input contains NaN, infinity or a value too large", + scale, X) def test_robust_scaler_2d_arrays(): @@ -966,29 +846,6 @@ def test_robust_scaler_2d_arrays(): assert_array_almost_equal(X_scaled.std(axis=0)[0], 0) -@pytest.mark.parametrize("density", [0, 0.05, 0.1, 0.5, 1]) -@pytest.mark.parametrize("strictly_signed", - ['positive', 'negative', 'zeros', None]) -def test_robust_scaler_equivalence_dense_sparse(density, strictly_signed): - # Check the equivalence of the fitting with dense and sparse matrices - X_sparse = sparse.rand(1000, 5, density=density).tocsc() - if strictly_signed == 'positive': - X_sparse.data = np.abs(X_sparse.data) - elif strictly_signed == 'negative': - X_sparse.data = - np.abs(X_sparse.data) - elif strictly_signed == 'zeros': - X_sparse.data = np.zeros(X_sparse.data.shape, dtype=np.float64) - X_dense = X_sparse.toarray() - - scaler_sparse = RobustScaler(with_centering=False) - scaler_dense = RobustScaler(with_centering=False) - - scaler_sparse.fit(X_sparse) - scaler_dense.fit(X_dense) - - assert_allclose(scaler_sparse.scale_, scaler_dense.scale_) - - def test_robust_scaler_transform_one_row_csr(): # Check RobustScaler on transforming csr matrix with one row rng = np.random.RandomState(0) @@ -1578,7 +1435,7 @@ def test_maxabs_scaler_1d(): assert_array_almost_equal(X_scaled_back, X) # Constant feature - X = np.ones((5, 1)) + X = np.ones(5).reshape(5, 1) scaler = MaxAbsScaler() X_scaled = scaler.fit(X).transform(X) assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), 1.) @@ -1979,6 +1836,450 @@ def test_add_dummy_feature_csr(): assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) +def test_one_hot_encoder_sparse(): + # Test OneHotEncoder's fit and transform. 
+ X = [[3, 2, 1], [0, 1, 1]] + enc = OneHotEncoder() + # discover max values automatically + X_trans = enc.fit_transform(X).toarray() + assert_equal(X_trans.shape, (2, 5)) + assert_array_equal(enc.active_features_, + np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0]) + assert_array_equal(enc.feature_indices_, [0, 4, 7, 9]) + + # check outcome + assert_array_equal(X_trans, + [[0., 1., 0., 1., 1.], + [1., 0., 1., 0., 1.]]) + + # max value given as 3 + enc = OneHotEncoder(n_values=4) + X_trans = enc.fit_transform(X) + assert_equal(X_trans.shape, (2, 4 * 3)) + assert_array_equal(enc.feature_indices_, [0, 4, 8, 12]) + + # max value given per feature + enc = OneHotEncoder(n_values=[3, 2, 2]) + X = [[1, 0, 1], [0, 1, 1]] + X_trans = enc.fit_transform(X) + assert_equal(X_trans.shape, (2, 3 + 2 + 2)) + assert_array_equal(enc.n_values_, [3, 2, 2]) + # check that testing with larger feature works: + X = np.array([[2, 0, 1], [0, 1, 1]]) + enc.transform(X) + + # test that an error is raised when out of bounds: + X_too_large = [[0, 2, 1], [0, 1, 1]] + assert_raises(ValueError, enc.transform, X_too_large) + error_msg = r"unknown categorical feature present \[2\] during transform." + assert_raises_regex(ValueError, error_msg, enc.transform, X_too_large) + assert_raises(ValueError, OneHotEncoder(n_values=2).fit_transform, X) + + # test that error is raised when wrong number of features + assert_raises(ValueError, enc.transform, X[:, :-1]) + # test that error is raised when wrong number of features in fit + # with prespecified n_values + assert_raises(ValueError, enc.fit, X[:, :-1]) + # test exception on wrong init param + assert_raises(TypeError, OneHotEncoder(n_values=np.int).fit, X) + + enc = OneHotEncoder() + # test negative input to fit + assert_raises(ValueError, enc.fit, [[0], [-1]]) + + # test negative input to transform + enc.fit([[0], [1]]) + assert_raises(ValueError, enc.transform, [[0], [-1]]) + + +def test_one_hot_encoder_dense(): + # check for sparse=False + X = [[3, 2, 1], [0, 1, 1]] + enc = OneHotEncoder(sparse=False) + # discover max values automatically + X_trans = enc.fit_transform(X) + assert_equal(X_trans.shape, (2, 5)) + assert_array_equal(enc.active_features_, + np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0]) + assert_array_equal(enc.feature_indices_, [0, 4, 7, 9]) + + # check outcome + assert_array_equal(X_trans, + np.array([[0., 1., 0., 1., 1.], + [1., 0., 1., 0., 1.]])) + + +def _check_transform_selected(X, X_expected, sel): + for M in (X, sparse.csr_matrix(X)): + Xtr = _transform_selected(M, Binarizer().transform, sel) + assert_array_equal(toarray(Xtr), X_expected) + + +def test_transform_selected(): + X = [[3, 2, 1], [0, 1, 1]] + + X_expected = [[1, 2, 1], [0, 1, 1]] + _check_transform_selected(X, X_expected, [0]) + _check_transform_selected(X, X_expected, [True, False, False]) + + X_expected = [[1, 1, 1], [0, 1, 1]] + _check_transform_selected(X, X_expected, [0, 1, 2]) + _check_transform_selected(X, X_expected, [True, True, True]) + _check_transform_selected(X, X_expected, "all") + + _check_transform_selected(X, X, []) + _check_transform_selected(X, X, [False, False, False]) + + +def test_transform_selected_copy_arg(): + # transformer that alters X + def _mutating_transformer(X): + X[0, 0] = X[0, 0] + 1 + return X + + original_X = np.asarray([[1, 2], [3, 4]]) + expected_Xtr = [[2, 2], [3, 4]] + + X = original_X.copy() + Xtr = _transform_selected(X, _mutating_transformer, copy=True, + selected='all') + + assert_array_equal(toarray(X), toarray(original_X)) + assert_array_equal(toarray(Xtr), 
expected_Xtr)
+
+
+def _run_one_hot(X, X2, cat):
+    enc = OneHotEncoder(categorical_features=cat)
+    Xtr = enc.fit_transform(X)
+    X2tr = enc.transform(X2)
+    return Xtr, X2tr
+
+
+def _check_one_hot(X, X2, cat, n_features):
+    ind = np.where(cat)[0]
+    # With mask
+    A, B = _run_one_hot(X, X2, cat)
+    # With indices
+    C, D = _run_one_hot(X, X2, ind)
+    # Check shape
+    assert_equal(A.shape, (2, n_features))
+    assert_equal(B.shape, (1, n_features))
+    assert_equal(C.shape, (2, n_features))
+    assert_equal(D.shape, (1, n_features))
+    # Check that mask and indices give the same results
+    assert_array_equal(toarray(A), toarray(C))
+    assert_array_equal(toarray(B), toarray(D))
+
+
+def test_one_hot_encoder_categorical_features():
+    X = np.array([[3, 2, 1], [0, 1, 1]])
+    X2 = np.array([[1, 1, 1]])
+
+    cat = [True, False, False]
+    _check_one_hot(X, X2, cat, 4)
+
+    # Edge case: all non-categorical
+    cat = [False, False, False]
+    _check_one_hot(X, X2, cat, 3)
+
+    # Edge case: all categorical
+    cat = [True, True, True]
+    _check_one_hot(X, X2, cat, 5)
+
+
+def test_one_hot_encoder_unknown_transform():
+    X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
+    y = np.array([[4, 1, 1]])
+
+    # Test that one hot encoder raises error for unknown features
+    # present during transform.
+    oh = OneHotEncoder(handle_unknown='error')
+    oh.fit(X)
+    assert_raises(ValueError, oh.transform, y)
+
+    # Test the ignore option, ignores unknown features.
+    oh = OneHotEncoder(handle_unknown='ignore')
+    oh.fit(X)
+    assert_array_equal(
+        oh.transform(y).toarray(),
+        np.array([[0., 0., 0., 0., 1., 0., 0.]]))
+
+    # Raise error if handle_unknown is neither ignore nor error.
+    oh = OneHotEncoder(handle_unknown='42')
+    oh.fit(X)
+    assert_raises(ValueError, oh.transform, y)
+
+
+def test_one_hot_encoder_invalid_handle_missing():
+    X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
+
+    # Test that one hot encoder raises an error when an invalid
+    # 'handle_missing' value is specified together with missing-value
+    # handling.
+    oh = OneHotEncoder(handle_unknown='error', missing_values='NaN',
+                       handle_missing='abcde')
+    assert_raises(ValueError, oh.fit, X)
+
+
+def test_one_hot_encoder_missing_values_none_handle_missing_passed():
+    X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
+    y = np.array([[4, 1, 1]])
+
+    # When 'missing_values' is None, 'handle_missing' is ignored and the
+    # encoder otherwise behaves as usual, e.g. unknown features still
+    # raise an error during transform.
+    oh = OneHotEncoder(handle_unknown='error', missing_values=None,
+                       handle_missing='abcde')
+    oh.fit(X)
+    assert_raises(ValueError, oh.transform, y)
+
+
+def test_one_hot_encoder_handle_missing_all_zeros():
+    pass
+
+
+def test_one_hot_encoder_handle_missing_all_missing():
+    pass
+
+
+def test_one_hot_encoder_handle_missing_category():
+    pass
+
+
+def check_categorical_onehot(X):
+    enc = CategoricalEncoder(encoding='onehot')
+    Xtr1 = enc.fit_transform(X)
+
+    enc = CategoricalEncoder(encoding='onehot-dense')
+    Xtr2 = enc.fit_transform(X)
+
+    assert_allclose(Xtr1.toarray(), Xtr2)
+
+    assert sparse.isspmatrix_csr(Xtr1)
+    return Xtr1.toarray()
+
+
+def test_categorical_encoder_onehot():
+    X = [['abc', 1, 55], ['def', 2, 55]]
+
+    Xtr = check_categorical_onehot(np.array(X)[:, [0]])
+    assert_allclose(Xtr, [[1, 0], [0, 1]])
+
+    Xtr = check_categorical_onehot(np.array(X)[:, [0, 1]])
+    assert_allclose(Xtr, [[1, 0, 1, 0], [0, 1, 0, 1]])
+
+    Xtr = CategoricalEncoder().fit_transform(X)
+    assert_allclose(Xtr.toarray(), [[1, 0, 1, 0, 1], [0, 1, 0, 1, 1]])
+
+
+def test_categorical_encoder_onehot_inverse():
+    for encoding in ['onehot', 'onehot-dense']:
+        X = [['abc', 2, 55], ['def', 1, 55], ['abc', 3, 55]]
+        enc = CategoricalEncoder(encoding=encoding)
+        X_tr = enc.fit_transform(X)
+        exp = np.array(X, dtype=object)
+        assert_array_equal(enc.inverse_transform(X_tr), exp)
+
+        X = [[2, 55], [1, 55], [3, 55]]
+        enc = CategoricalEncoder(encoding=encoding)
+        X_tr = enc.fit_transform(X)
+        exp = np.array(X)
+        assert_array_equal(enc.inverse_transform(X_tr), exp)
+
+        # with unknown categories
+        X = [['abc', 2, 55], ['def', 1, 55], ['abc', 3, 55]]
+        enc = CategoricalEncoder(encoding=encoding, handle_unknown='ignore',
+                                 categories=[['abc', 'def'], [1, 2],
+                                             [54, 55, 56]])
+        X_tr = enc.fit_transform(X)
+        exp = np.array(X, dtype=object)
+        exp[2, 1] = None
+        assert_array_equal(enc.inverse_transform(X_tr), exp)
+
+        # with an otherwise numerical output, still object if unknown
+        X = [[2, 55], [1, 55], [3, 55]]
+        enc = CategoricalEncoder(encoding=encoding,
+                                 categories=[[1, 2], [54, 56]],
+                                 handle_unknown='ignore')
+        X_tr = enc.fit_transform(X)
+        exp = np.array(X, dtype=object)
+        exp[2, 0] = None
+        exp[:, 1] = None
+        assert_array_equal(enc.inverse_transform(X_tr), exp)
+
+        # incorrect shape raises
+        X_tr = np.array([[0, 1, 1], [1, 0, 1]])
+        msg = re.escape('Shape of the passed X data is not correct')
+        assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr)
+
+
+def test_categorical_encoder_handle_unknown():
+    X = np.array([[1, 2, 3], [4, 5, 6]])
+    X2 = np.array([[7, 5, 3]])
+
+    # Test that encoder raises error for unknown features during transform.
+ enc = CategoricalEncoder() + enc.fit(X) + msg = re.escape('unknown categories [7] in column 0') + assert_raises_regex(ValueError, msg, enc.transform, X2) + + # With 'ignore' you get all 0's in result + enc = CategoricalEncoder(handle_unknown='ignore') + enc.fit(X) + X2_passed = X2.copy() + Xtr = enc.transform(X2_passed) + assert_allclose(Xtr.toarray(), [[0, 0, 0, 1, 1, 0]]) + # ensure transformed data was not modified in place + assert_allclose(X2, X2_passed) + + # Invalid option + enc = CategoricalEncoder(handle_unknown='invalid') + assert_raises(ValueError, enc.fit, X) + + +def test_categorical_encoder_categories(): + X = [['abc', 1, 55], ['def', 2, 55]] + + # order of categories should not depend on order of samples + for Xi in [X, X[::-1]]: + enc = CategoricalEncoder() + enc.fit(Xi) + assert enc.categories == 'auto' + assert isinstance(enc.categories_, list) + cat_exp = [['abc', 'def'], [1, 2], [55]] + for res, exp in zip(enc.categories_, cat_exp): + assert res.tolist() == exp + + +def test_categorical_encoder_specified_categories(): + X = np.array([['a', 'b']], dtype=object).T + + enc = CategoricalEncoder(categories=[['a', 'b', 'c']]) + exp = np.array([[1., 0., 0.], + [0., 1., 0.]]) + assert_array_equal(enc.fit_transform(X).toarray(), exp) + assert enc.categories[0] == ['a', 'b', 'c'] + assert enc.categories_[0].tolist() == ['a', 'b', 'c'] + assert np.issubdtype(enc.categories_[0].dtype, np.str_) + + # unsorted passed categories raises for now + enc = CategoricalEncoder(categories=[['c', 'b', 'a']]) + msg = re.escape('Unsorted categories are not yet supported') + assert_raises_regex(ValueError, msg, enc.fit_transform, X) + + # multiple columns + X = np.array([['a', 'b'], [0, 2]], dtype=object).T + enc = CategoricalEncoder(categories=[['a', 'b', 'c'], [0, 1, 2]]) + exp = np.array([[1., 0., 0., 1., 0., 0.], + [0., 1., 0., 0., 0., 1.]]) + assert_array_equal(enc.fit_transform(X).toarray(), exp) + assert enc.categories_[0].tolist() == ['a', 'b', 'c'] + assert np.issubdtype(enc.categories_[0].dtype, np.str_) + assert enc.categories_[1].tolist() == [0, 1, 2] + assert np.issubdtype(enc.categories_[1].dtype, np.integer) + + # when specifying categories manually, unknown categories should already + # raise when fitting + X = np.array([['a', 'b', 'c']]).T + enc = CategoricalEncoder(categories=[['a', 'b']]) + assert_raises(ValueError, enc.fit, X) + enc = CategoricalEncoder(categories=[['a', 'b']], handle_unknown='ignore') + exp = np.array([[1., 0.], [0., 1.], [0., 0.]]) + assert_array_equal(enc.fit(X).transform(X).toarray(), exp) + + +def test_categorical_encoder_pandas(): + try: + import pandas as pd + except ImportError: + raise SkipTest("pandas is not installed") + + X_df = pd.DataFrame({'A': ['a', 'b'], 'B': [1, 2]}) + + Xtr = check_categorical_onehot(X_df) + assert_allclose(Xtr, [[1, 0, 1, 0], [0, 1, 0, 1]]) + + +def test_categorical_encoder_ordinal(): + X = [['abc', 2, 55], ['def', 1, 55]] + + enc = CategoricalEncoder(encoding='other') + assert_raises(ValueError, enc.fit, X) + + enc = CategoricalEncoder(encoding='ordinal', handle_unknown='ignore') + assert_raises(ValueError, enc.fit, X) + + enc = CategoricalEncoder(encoding='ordinal') + exp = np.array([[0, 1, 0], + [1, 0, 0]], dtype='int64') + assert_array_equal(enc.fit_transform(X), exp.astype('float64')) + enc = CategoricalEncoder(encoding='ordinal', dtype='int64') + assert_array_equal(enc.fit_transform(X), exp) + + +def test_categorical_encoder_ordinal_inverse(): + X = [['abc', 2, 55], ['def', 1, 55]] + enc = 
CategoricalEncoder(encoding='ordinal') + X_tr = enc.fit_transform(X) + exp = np.array(X, dtype=object) + assert_array_equal(enc.inverse_transform(X_tr), exp) + + # incorrect shape raises + X_tr = np.array([[0, 1, 1, 2], [1, 0, 1, 0]]) + msg = re.escape('Shape of the passed X data is not correct') + assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr) + + +def test_categorical_encoder_dtypes(): + # check that dtypes are preserved when determining categories + enc = CategoricalEncoder() + exp = np.array([[1., 0., 1., 0.], [0., 1., 0., 1.]], dtype='float64') + + for X in [np.array([[1, 2], [3, 4]], dtype='int64'), + np.array([[1, 2], [3, 4]], dtype='float64'), + np.array([['a', 'b'], ['c', 'd']]), # string dtype + np.array([[1, 'a'], [3, 'b']], dtype='object')]: + enc.fit(X) + assert all([enc.categories_[i].dtype == X.dtype for i in range(2)]) + assert_array_equal(enc.transform(X).toarray(), exp) + + X = [[1, 2], [3, 4]] + enc.fit(X) + assert all([np.issubdtype(enc.categories_[i].dtype, np.integer) + for i in range(2)]) + assert_array_equal(enc.transform(X).toarray(), exp) + + X = [[1, 'a'], [3, 'b']] + enc.fit(X) + assert all([enc.categories_[i].dtype == 'object' for i in range(2)]) + assert_array_equal(enc.transform(X).toarray(), exp) + + +def test_categorical_encoder_dtypes_pandas(): + # check dtype (similar to test_categorical_encoder_dtypes for dataframes) + try: + import pandas as pd + except ImportError: + raise SkipTest("pandas is not installed") + + enc = CategoricalEncoder() + exp = np.array([[1., 0., 1., 0.], [0., 1., 0., 1.]], dtype='float64') + + X = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}, dtype='int64') + enc.fit(X) + assert all([enc.categories_[i].dtype == 'int64' for i in range(2)]) + assert_array_equal(enc.transform(X).toarray(), exp) + + X = pd.DataFrame({'A': [1, 2], 'B': ['a', 'b']}) + enc.fit(X) + assert all([enc.categories_[i].dtype == 'object' for i in range(2)]) + assert_array_equal(enc.transform(X).toarray(), exp) + + +def test_categorical_encoder_warning(): + enc = CategoricalEncoder() + X = [['Male', 1], ['Female', 3]] + np.testing.assert_no_warnings(enc.fit_transform, X) + + def test_fit_cold_start(): X = iris.data X_2d = X[:, :2] @@ -2004,26 +2305,13 @@ def test_quantile_transform_valid_axis(): ". 
Got axis=2", quantile_transform, X.T, axis=2) -@pytest.mark.parametrize("method", ['box-cox', 'yeo-johnson']) -def test_power_transformer_notfitted(method): - pt = PowerTransformer(method=method) +def test_power_transformer_notfitted(): + pt = PowerTransformer(method='box-cox') X = np.abs(X_1col) assert_raises(NotFittedError, pt.transform, X) assert_raises(NotFittedError, pt.inverse_transform, X) -@pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson']) -@pytest.mark.parametrize('standardize', [True, False]) -@pytest.mark.parametrize('X', [X_1col, X_2d]) -def test_power_transformer_inverse(method, standardize, X): - # Make sure we get the original input when applying transform and then - # inverse transform - X = np.abs(X) if method == 'box-cox' else X - pt = PowerTransformer(method=method, standardize=standardize) - X_trans = pt.fit_transform(X) - assert_almost_equal(X, pt.inverse_transform(X_trans)) - - def test_power_transformer_1d(): X = np.abs(X_1col) @@ -2075,12 +2363,11 @@ def test_power_transformer_2d(): assert isinstance(pt.lambdas_, np.ndarray) -def test_power_transformer_boxcox_strictly_positive_exception(): - # Exceptions should be raised for negative arrays and zero arrays when - # method is boxcox - +def test_power_transformer_strictly_positive_exception(): pt = PowerTransformer(method='box-cox') pt.fit(np.abs(X_2d)) + + # Exceptions should be raised for negative arrays and zero arrays X_with_negatives = X_2d not_positive_message = 'strictly positive' @@ -2091,7 +2378,7 @@ def test_power_transformer_boxcox_strictly_positive_exception(): pt.fit, X_with_negatives) assert_raise_message(ValueError, not_positive_message, - power_transform, X_with_negatives, 'box-cox') + power_transform, X_with_negatives) assert_raise_message(ValueError, not_positive_message, pt.transform, np.zeros(X_2d.shape)) @@ -2100,19 +2387,11 @@ def test_power_transformer_boxcox_strictly_positive_exception(): pt.fit, np.zeros(X_2d.shape)) assert_raise_message(ValueError, not_positive_message, - power_transform, np.zeros(X_2d.shape), 'box-cox') + power_transform, np.zeros(X_2d.shape)) -@pytest.mark.parametrize('X', [X_2d, np.abs(X_2d), -np.abs(X_2d), - np.zeros(X_2d.shape)]) -def test_power_transformer_yeojohnson_any_input(X): - # Yeo-Johnson method should support any kind of input - power_transform(X, method='yeo-johnson') - - -@pytest.mark.parametrize("method", ['box-cox', 'yeo-johnson']) -def test_power_transformer_shape_exception(method): - pt = PowerTransformer(method=method) +def test_power_transformer_shape_exception(): + pt = PowerTransformer(method='box-cox') X = np.abs(X_2d) pt.fit(X) @@ -2145,136 +2424,3 @@ def test_power_transformer_lambda_zero(): pt.lambdas_ = np.array([0]) X_trans = pt.transform(X) assert_array_almost_equal(pt.inverse_transform(X_trans), X) - - -def test_power_transformer_lambda_one(): - # Make sure lambda = 1 corresponds to the identity for yeo-johnson - pt = PowerTransformer(method='yeo-johnson', standardize=False) - X = np.abs(X_2d)[:, 0:1] - - pt.lambdas_ = np.array([1]) - X_trans = pt.transform(X) - assert_array_almost_equal(X_trans, X) - - -@pytest.mark.parametrize("method, lmbda", [('box-cox', .1), - ('box-cox', .5), - ('yeo-johnson', .1), - ('yeo-johnson', .5), - ('yeo-johnson', 1.), - ]) -def test_optimization_power_transformer(method, lmbda): - # Test the optimization procedure: - # - set a predefined value for lambda - # - apply inverse_transform to a normal dist (we get X_inv) - # - apply fit_transform to X_inv (we get X_inv_trans) - # - check that X_inv_trans 
is roughly equal to X - - rng = np.random.RandomState(0) - n_samples = 20000 - X = rng.normal(loc=0, scale=1, size=(n_samples, 1)) - - pt = PowerTransformer(method=method, standardize=False) - pt.lambdas_ = [lmbda] - X_inv = pt.inverse_transform(X) - - pt = PowerTransformer(method=method, standardize=False) - X_inv_trans = pt.fit_transform(X_inv) - - assert_almost_equal(0, np.linalg.norm(X - X_inv_trans) / n_samples, - decimal=2) - assert_almost_equal(0, X_inv_trans.mean(), decimal=1) - assert_almost_equal(1, X_inv_trans.std(), decimal=1) - - -@pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson']) -def test_power_transformer_nans(method): - # Make sure lambda estimation is not influenced by NaN values - # and that transform() supports NaN silently - - X = np.abs(X_1col) - pt = PowerTransformer(method=method) - pt.fit(X) - lmbda_no_nans = pt.lambdas_[0] - - # concat nans at the end and check lambda stays the same - X = np.concatenate([X, np.full_like(X, np.nan)]) - X = shuffle(X, random_state=0) - - pt.fit(X) - lmbda_nans = pt.lambdas_[0] - - assert_almost_equal(lmbda_no_nans, lmbda_nans, decimal=5) - - X_trans = pt.transform(X) - assert_array_equal(np.isnan(X_trans), np.isnan(X)) - - -@pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson']) -@pytest.mark.parametrize('standardize', [True, False]) -def test_power_transformer_fit_transform(method, standardize): - # check that fit_transform() and fit().transform() return the same values - X = X_1col - if method == 'box-cox': - X = np.abs(X) - - pt = PowerTransformer(method, standardize) - assert_array_almost_equal(pt.fit(X).transform(X), pt.fit_transform(X)) - - -@pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson']) -@pytest.mark.parametrize('standardize', [True, False]) -def test_power_transformer_copy_True(method, standardize): - # Check that neither fit, transform, fit_transform nor inverse_transform - # modify X inplace when copy=True - X = X_1col - if method == 'box-cox': - X = np.abs(X) - - X_original = X.copy() - assert X is not X_original # sanity checks - assert_array_almost_equal(X, X_original) - - pt = PowerTransformer(method, standardize, copy=True) - - pt.fit(X) - assert_array_almost_equal(X, X_original) - X_trans = pt.transform(X) - assert X_trans is not X - - X_trans = pt.fit_transform(X) - assert_array_almost_equal(X, X_original) - assert X_trans is not X - - X_inv_trans = pt.inverse_transform(X_trans) - assert X_trans is not X_inv_trans - - -@pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson']) -@pytest.mark.parametrize('standardize', [True, False]) -def test_power_transformer_copy_False(method, standardize): - # check that when copy=False fit doesn't change X inplace but transform, - # fit_transform and inverse_transform do. - X = X_1col - if method == 'box-cox': - X = np.abs(X) - - X_original = X.copy() - assert X is not X_original # sanity checks - assert_array_almost_equal(X, X_original) - - pt = PowerTransformer(method, standardize, copy=False) - - pt.fit(X) - assert_array_almost_equal(X, X_original) # fit didn't change X - - X_trans = pt.transform(X) - assert X_trans is X - - if method == 'box-cox': - X = np.abs(X) - X_trans = pt.fit_transform(X) - assert X_trans is X - - X_inv_trans = pt.inverse_transform(X_trans) - assert X_trans is X_inv_trans
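A sketch of the intended end-to-end usage of the new parameters, assuming
the placeholder branches above are eventually implemented. The expected
output shown for 'category' is derived from the documentation in this patch,
not verified behaviour, which is why the encoding call is left commented
out:

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    # 'missing_values' and 'handle_missing' are the parameters added by
    # this patch; their handling is still a `pass` placeholder.
    enc = OneHotEncoder(sparse=False, missing_values='NaN',
                        handle_missing='category')
    X = np.array([[0.], [1.], [np.nan]])
    # out = enc.fit_transform(X)
    # With 'category', NaN is expected to become a third one-hot column:
    #   [[1., 0., 0.],
    #    [0., 1., 0.],
    #    [0., 0., 1.]]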