diff --git a/sklearn/preprocessing.py b/sklearn/preprocessing.py index a26ce1f2f0716..3ecd1c1ee3e35 100644 --- a/sklearn/preprocessing.py +++ b/sklearn/preprocessing.py @@ -4,6 +4,7 @@ # Andreas Mueller # License: BSD 3 clause +import functools import warnings import numbers @@ -829,6 +830,15 @@ class LabelEncoder(BaseEstimator, TransformerMixin): >>> list(le.inverse_transform([2, 2, 1])) ['tokyo', 'tokyo', 'paris'] + It can also be used to transform multi-label sequences of sequences: + + >>> le = preprocessing.LabelEncoder() + >>> targets = [["paris", "tokyo"], ["amsterdam", "paris"]] + >>> list(map(list, le.fit_transform(targets))) + [[1, 2], [0, 1]] + >>> list(map(list, le.inverse_transform([[1, 2], [0, 1]]))) + [['paris', 'tokyo'], ['amsterdam', 'paris']] + """ def _check_fitted(self): @@ -840,14 +850,14 @@ def fit(self, y): Parameters ---------- - y : array-like of shape [n_samples] + y : array-like of shape [n_samples] or sequence of sequences Target values. Returns ------- self : returns an instance of self. """ - self.classes_ = np.unique(y) + self.classes_ = unique_labels(y) return self def fit_transform(self, y): @@ -855,13 +865,16 @@ def fit_transform(self, y): Parameters ---------- - y : array-like of shape [n_samples] + y : array-like of shape [n_samples] or sequence of sequences Target values. Returns ------- - y : array-like of shape [n_samples] + y : array-like of shape [n_samples] or sequence of sequences """ + if is_multilabel(y): + self.fit(y) + return self.transform(y) self.classes_, y = unique(y, return_inverse=True) return y @@ -870,20 +883,27 @@ def transform(self, y): Parameters ---------- - y : array-like of shape [n_samples] + y : array-like of shape [n_samples] or sequence of sequences Target values. 
Returns ------- - y : array-like of shape [n_samples] + y : array-like of shape [n_samples] or sequence of sequences """ self._check_fitted() + if is_multilabel(y): + if is_label_indicator_matrix(y): + raise ValueError( + '{} does not support label indicator matrices'.format( + self.__class__.__name__)) + return list(map(self._transform, y)) - classes = np.unique(y) - if len(np.intersect1d(classes, self.classes_)) < len(classes): - diff = np.setdiff1d(classes, self.classes_) - raise ValueError("y contains new labels: %s" % str(diff)) + return self._transform(y) + def _transform(self, y): + diff = np.setdiff1d(y, self.classes_) + if len(diff): + raise ValueError("y contains new labels: %s" % str(diff)) return np.searchsorted(self.classes_, y) def inverse_transform(self, y): @@ -891,15 +911,18 @@ def inverse_transform(self, y): Parameters ---------- - y : numpy array of shape [n_samples] + y : numpy array of shape [n_samples] or sequence of sequences Target values. Returns ------- - y : numpy array of shape [n_samples] + y : numpy array of shape [n_samples] or sequence of sequences """ self._check_fitted() + if is_multilabel(y): + # np.vectorize does not work with np.ndarray.take! 
+ return list(map(self.classes_.take, y)) y = np.asarray(y) return self.classes_[y] diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index bc7e760ce84c5..a5714b573ce76 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -7,6 +7,7 @@ from sklearn.utils.testing import assert_true from sklearn.utils.testing import assert_false from sklearn.utils.testing import assert_raises +from sklearn.utils.testing import assert_sequences_equal from sklearn.utils.testing import assert_greater from sklearn.multiclass import OneVsRestClassifier @@ -64,7 +65,7 @@ def test_ovr_always_present(): ovr = OneVsRestClassifier(DecisionTreeClassifier()) ovr.fit(X, y) y_pred = ovr.predict(X) - assert_array_equal(np.array(y_pred), np.array(y)) + assert_sequences_equal(y_pred, y) def test_ovr_multilabel(): @@ -146,13 +147,13 @@ def test_ovr_multilabel_predict_proba(): decision_only = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train) assert_raises(AttributeError, decision_only.predict_proba, X_test) - Y_pred = clf.predict(X_test) + Y_pred = list(clf.predict(X_test)) Y_proba = clf.predict_proba(X_test) # predict assigns a label if the probability that the # sample has the label is greater than 0.5. 
pred = [tuple(l.nonzero()[0]) for l in (Y_proba > 0.5)] - assert_equal(pred, Y_pred) + assert_sequences_equal(pred, Y_pred) def test_ovr_single_label_predict_proba(): diff --git a/sklearn/tests/test_preprocessing.py b/sklearn/tests/test_preprocessing.py index 97e24b0a13ece..f776e30cc6ba2 100644 --- a/sklearn/tests/test_preprocessing.py +++ b/sklearn/tests/test_preprocessing.py @@ -10,6 +10,7 @@ from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_true from sklearn.utils.testing import assert_false +from sklearn.utils.testing import assert_sequences_equal from sklearn.utils.sparsefuncs import mean_variance_axis0 from sklearn.preprocessing import Binarizer @@ -510,7 +511,7 @@ def test_label_binarizer_multilabel(): [1, 1, 0]]) got = lb.fit_transform(inp) assert_array_equal(indicator_mat, got) - assert_equal(lb.inverse_transform(got), inp) + assert_sequences_equal(lb.inverse_transform(got), inp) # test input as label indicator matrix lb.fit(indicator_mat) @@ -527,8 +528,7 @@ def test_label_binarizer_multilabel(): [1, 1]]) got = lb.fit_transform(inp) assert_array_equal(expected, got) - assert_equal([set(x) for x in lb.inverse_transform(got)], - [set(x) for x in inp]) + assert_sequences_equal(lb.inverse_transform(got), inp) def test_label_binarizer_errors(): @@ -612,17 +612,47 @@ def test_label_encoder(): assert_raises(ValueError, le.transform, [0, 6]) +def test_label_encoder_multilabel(): + """Test LabelEncoder's transform and inverse_transform methods with + multilabel data""" + le = LabelEncoder() + le.fit([[1], [1, 4], [5, -1, 0]]) + assert_array_equal(le.classes_, [-1, 0, 1, 4, 5]) + assert_sequences_equal(le.transform([[0, 1, 4], [4, 5, -1], [-1]]), + [[1, 2, 3], [3, 4, 0], [0]]) + assert_sequences_equal(le.inverse_transform([[1, 2, 3], [3, 4, 0], [0]]), + [[0, 1, 4], [4, 5, -1], [-1]]) + assert_raises(ValueError, le.transform, [[0, 6]]) + # not handling label encoder matrices presently + assert_raises(ValueError, le.transform, 
np.array([[0, 1], [1, 0]])) + + def test_label_encoder_fit_transform(): """Test fit_transform""" le = LabelEncoder() ret = le.fit_transform([1, 1, 4, 5, -1, 0]) assert_array_equal(ret, [2, 2, 3, 4, 0, 1]) + assert_array_equal(le.classes_, [-1, 0, 1, 4, 5]) le = LabelEncoder() ret = le.fit_transform(["paris", "paris", "tokyo", "amsterdam"]) assert_array_equal(ret, [1, 1, 2, 0]) +def test_label_encoder_fit_transform_multilabel(): + """Test fit_transform for multilabel input""" + le = LabelEncoder() + ret = le.fit_transform([[1], [1, 4, 5], [-1, 0]]) + assert_sequences_equal(ret, [[2], [2, 3, 4], [0, 1]]) + assert_array_equal(le.classes_, [-1, 0, 1, 4, 5]) + + le = LabelEncoder() + ret = le.fit_transform([["paris"], ["paris", "tokyo", "amsterdam"]]) + assert_sequences_equal(ret, [[1], [1, 2, 0]]) + # not handling label encoder matrices presently + assert_raises(ValueError, le.transform, np.array([[0, 1], [1, 0]])) + + def test_label_encoder_string_labels(): """Test LabelEncoder's transform and inverse_transform methods with non-numeric labels""" @@ -636,6 +666,19 @@ def test_label_encoder_string_labels(): assert_raises(ValueError, le.transform, ["london"]) +def test_label_encoder_strings_multilabel(): + """Test LabelEncoder's transform and inverse_transform methods with + non-numeric multilabel data""" + le = LabelEncoder() + le.fit([["paris"], ["paris", "tokyo", "amsterdam"]]) + assert_array_equal(le.classes_, ["amsterdam", "paris", "tokyo"]) + assert_sequences_equal(le.transform([["tokyo"], ["tokyo", "paris"]]), + [[2], [2, 1]]) + assert_sequences_equal(le.inverse_transform([[2], [2, 1]]), + [["tokyo"], ["tokyo", "paris"]]) + assert_raises(ValueError, le.transform, ["london"]) + + def test_label_encoder_errors(): """Check that invalid arguments yield ValueError""" le = LabelEncoder() diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 4c79791a0883e..055be4d311969 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ 
def assert_sequences_equal(first, second, err_msg=''):
    """Assert that two sequences of label sequences hold the same labels.

    The outer sequences must have the same length, and the inner sequences
    at each position must contain the same set of values.  Each inner pair
    is compared through ``np.unique``, so element order and repeated values
    inside an inner sequence are ignored, as is the container type (list,
    tuple, array, ...).  Multilabel targets come in varying sequence types,
    and ``assert_array_equal`` applied to the whole target may misinterpret
    some formats as 2-dimensional.

    Parameters
    ----------
    first, second : iterable of sequences
        The targets to compare.  One-shot iterables such as generators are
        accepted: both arguments are copied into lists before comparison.

    err_msg : str, optional (default='')
        Extra text appended, on its own line, to the failure message.

    Raises
    ------
    AssertionError
        Raised by the underlying ``assert_equal`` / ``assert_array_equal``
        helpers when the outer lengths differ or when a pair of inner
        sequences does not contain the same unique values.
    """
    # TODO: first assert args are valid sequences of sequences

    # Materialise both arguments so that iterators can be measured with
    # len() and then walked without having been consumed.
    first, second = list(first), list(second)

    suffix = '\n' + err_msg if err_msg else ''
    assert_equal(len(first), len(second),
                 'Sequence of sequence lengths do not match.' + suffix)

    for i, (first_el, second_el) in enumerate(zip(first, second)):
        # Explicit field indices: auto-numbered '{}' fields are not
        # supported by str.format on Python 2.6.
        assert_array_equal(np.unique(first_el), np.unique(second_el),
                           'In sequence of sequence element '
                           '{0}{1}'.format(i, suffix))