ENH support multilabel targets in LabelEncoder by jnothman · Pull Request #1987 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

ENH support multilabel targets in LabelEncoder #1987

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 35 additions & 12 deletions sklearn/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# Andreas Mueller <amueller@ais.uni-bonn.de>
# License: BSD 3 clause

import functools
import warnings
import numbers

Expand Down Expand Up @@ -829,6 +830,15 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
>>> list(le.inverse_transform([2, 2, 1]))
['tokyo', 'tokyo', 'paris']

It can also be used to transform multi-label sequences of sequences:

>>> le = preprocessing.LabelEncoder()
>>> targets = [["paris", "tokyo"], ["amsterdam", "paris"]]
>>> list(map(list, le.fit_transform(targets)))
[[1, 2], [0, 1]]
>>> list(map(list, le.inverse_transform([[1, 2], [0, 1]])))
[['paris', 'tokyo'], ['amsterdam', 'paris']]

"""

def _check_fitted(self):
Expand All @@ -840,28 +850,31 @@ def fit(self, y):

Parameters
----------
y : array-like of shape [n_samples]
y : array-like of shape [n_samples] or sequence of sequences
Target values.

Returns
-------
self : returns an instance of self.
"""
self.classes_ = np.unique(y)
self.classes_ = unique_labels(y)
return self

def fit_transform(self, y):
"""Fit label encoder and return encoded labels

Parameters
----------
y : array-like of shape [n_samples]
y : array-like of shape [n_samples] or sequence of sequences
Target values.

Returns
-------
y : array-like of shape [n_samples]
y : array-like of shape [n_samples] or sequence of sequences
"""
if is_multilabel(y):
self.fit(y)
return self.transform(y)
self.classes_, y = unique(y, return_inverse=True)
return y

Expand All @@ -870,36 +883,46 @@ def transform(self, y):

Parameters
----------
y : array-like of shape [n_samples]
y : array-like of shape [n_samples] or sequence of sequences
Target values.

Returns
-------
y : array-like of shape [n_samples]
y : array-like of shape [n_samples] or sequence of sequences
"""
self._check_fitted()
if is_multilabel(y):
if is_label_indicator_matrix(y):
raise ValueError(
'{} does not support label indicator matrices'.format(
self.__class__.__name__))
return list(map(self._transform, y))

classes = np.unique(y)
if len(np.intersect1d(classes, self.classes_)) < len(classes):
diff = np.setdiff1d(classes, self.classes_)
raise ValueError("y contains new labels: %s" % str(diff))
return self._transform(y)

def _transform(self, y):
diff = np.setdiff1d(y, self.classes_)
if len(diff):
raise ValueError("y contains new labels: %s" % str(diff))
return np.searchsorted(self.classes_, y)

def inverse_transform(self, y):
"""Transform labels back to original encoding.

Parameters
----------
y : numpy array of shape [n_samples]
y : numpy array of shape [n_samples] or sequence of sequences
Target values.

Returns
-------
y : numpy array of shape [n_samples]
y : numpy array of shape [n_samples] or sequence of sequences
"""
self._check_fitted()

if is_multilabel(y):
# np.vectorize does not work with np.ndarray.take!
return list(map(self.classes_.take, y))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I prefer (syntactically) a list comprehension:

[self.classes_.take(this_y) for this_y in y]

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, I think that it would be better if we returned an array, with dtype=object, instead of a list. A lot of code expects to find arrays, for instance testing the shape of the object, or applying fancy indexing. Besides, having the return type of a method vary can be a good recipe for later bugs.

y = np.asarray(y)
return self.classes_[y]

Expand Down
7 changes: 4 additions & 3 deletions sklearn/tests/test_multiclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from sklearn.utils.testing import assert_true
from sklearn.utils.testing import assert_false
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_sequences_equal

from sklearn.utils.testing import assert_greater
from sklearn.multiclass import OneVsRestClassifier
Expand Down Expand Up @@ -64,7 +65,7 @@ def test_ovr_always_present():
ovr = OneVsRestClassifier(DecisionTreeClassifier())
ovr.fit(X, y)
y_pred = ovr.predict(X)
assert_array_equal(np.array(y_pred), np.array(y))
assert_sequences_equal(y_pred, y)


def test_ovr_multilabel():
Expand Down Expand Up @@ -146,13 +147,13 @@ def test_ovr_multilabel_predict_proba():
decision_only = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train)
assert_raises(AttributeError, decision_only.predict_proba, X_test)

Y_pred = clf.predict(X_test)
Y_pred = list(clf.predict(X_test))
Y_proba = clf.predict_proba(X_test)

# predict assigns a label if the probability that the
# sample has the label is greater than 0.5.
pred = [tuple(l.nonzero()[0]) for l in (Y_proba > 0.5)]
assert_equal(pred, Y_pred)
assert_sequences_equal(pred, Y_pred)


def test_ovr_single_label_predict_proba():
Expand Down
49 changes: 46 additions & 3 deletions sklearn/tests/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_true
from sklearn.utils.testing import assert_false
from sklearn.utils.testing import assert_sequences_equal

from sklearn.utils.sparsefuncs import mean_variance_axis0
from sklearn.preprocessing import Binarizer
Expand Down Expand Up @@ -510,7 +511,7 @@ def test_label_binarizer_multilabel():
[1, 1, 0]])
got = lb.fit_transform(inp)
assert_array_equal(indicator_mat, got)
assert_equal(lb.inverse_transform(got), inp)
assert_sequences_equal(lb.inverse_transform(got), inp)

# test input as label indicator matrix
lb.fit(indicator_mat)
Expand All @@ -527,8 +528,7 @@ def test_label_binarizer_multilabel():
[1, 1]])
got = lb.fit_transform(inp)
assert_array_equal(expected, got)
assert_equal([set(x) for x in lb.inverse_transform(got)],
[set(x) for x in inp])
assert_sequences_equal(lb.inverse_transform(got), inp)


def test_label_binarizer_errors():
Expand Down Expand Up @@ -612,17 +612,47 @@ def test_label_encoder():
assert_raises(ValueError, le.transform, [0, 6])


def test_label_encoder_multilabel():
"""Test LabelEncoder's transform and inverse_transform methods with
multilabel data"""
le = LabelEncoder()
le.fit([[1], [1, 4], [5, -1, 0]])
assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])
assert_sequences_equal(le.transform([[0, 1, 4], [4, 5, -1], [-1]]),
[[1, 2, 3], [3, 4, 0], [0]])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pep8

assert_sequences_equal(le.inverse_transform([[1, 2, 3], [3, 4, 0], [0]]),
[[0, 1, 4], [4, 5, -1], [-1]])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pep8

assert_raises(ValueError, le.transform, [[0, 6]])
# not handling label encoder matrices presently
assert_raises(ValueError, le.transform, np.array([[0, 1], [1, 0]]))


def test_label_encoder_fit_transform():
"""Test fit_transform"""
le = LabelEncoder()
ret = le.fit_transform([1, 1, 4, 5, -1, 0])
assert_array_equal(ret, [2, 2, 3, 4, 0, 1])
assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])

le = LabelEncoder()
ret = le.fit_transform(["paris", "paris", "tokyo", "amsterdam"])
assert_array_equal(ret, [1, 1, 2, 0])


def test_label_encoder_fit_transform_multilabel():
"""Test fit_transform for multilabel input"""
le = LabelEncoder()
ret = le.fit_transform([[1], [1, 4, 5], [-1, 0]])
assert_sequences_equal(ret, [[2], [2, 3, 4], [0, 1]])
assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])

le = LabelEncoder()
ret = le.fit_transform([["paris"], ["paris", "tokyo", "amsterdam"]])
assert_sequences_equal(ret, [[1], [1, 2, 0]])
# not handling label encoder matrices presently
assert_raises(ValueError, le.transform, np.array([[0, 1], [1, 0]]))


def test_label_encoder_string_labels():
"""Test LabelEncoder's transform and inverse_transform methods with
non-numeric labels"""
Expand All @@ -636,6 +666,19 @@ def test_label_encoder_string_labels():
assert_raises(ValueError, le.transform, ["london"])


def test_label_encoder_strings_multilabel():
"""Test LabelEncoder's transform and inverse_transform methods with
non-numeric multilabel data"""
le = LabelEncoder()
le.fit([["paris"], ["paris", "tokyo", "amsterdam"]])
assert_array_equal(le.classes_, ["amsterdam", "paris", "tokyo"])
assert_sequences_equal(le.transform([["tokyo"], ["tokyo", "paris"]]),
[[2], [2, 1]])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pep8

assert_sequences_equal(le.inverse_transform([[2], [2, 1]]),
[["tokyo"], ["tokyo", "paris"]])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pep8

assert_raises(ValueError, le.transform, ["london"])


def test_label_encoder_errors():
"""Check that invalid arguments yield ValueError"""
le = LabelEncoder()
Expand Down
6 changes: 4 additions & 2 deletions sklearn/utils/multiclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,5 +130,7 @@ def is_multilabel(y):
"""
# the explicit check for ndarray is for forward compatibility; future
# versions of Numpy might want to register ndarray as a Sequence
return (not isinstance(y[0], np.ndarray) and isinstance(y[0], Sequence) and
not isinstance(y[0], string_types) or is_label_indicator_matrix(y))
if getattr(y, 'ndim', 1) != 1:
return is_label_indicator_matrix(y)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you have a test case in mind for this modification?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The point is to allow the outer sequence to be an array. But it can't be an array of size 1. Did I not test it? Ah well.

return ((isinstance(y[0], Sequence) and not isinstance(y[0], string_types))
or isinstance(y[0], np.ndarray))
20 changes: 20 additions & 0 deletions sklearn/utils/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import inspect
import pkgutil

import numpy as np
import scipy as sp
from functools import wraps
try:
Expand Down Expand Up @@ -97,6 +98,25 @@ def assert_raise_message(exception, message, function, *args, **kwargs):
assert_in(message, error_message)


def assert_sequences_equal(first, second, err_msg=''):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the advantage of this new function compared to assert_equal?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we do without and use only arrays with dtype=object? That way we make sure in the tests that we are manipulating arrays, and not lists.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have discussed arrays of dtype object below. `assert_equal` won't work if the outer or inner sequence is an array.

"""Asserts equality of two sequences of sequences

This compares multilabel targets irrespective of the sequence types.
It is necessary because sequence types vary, `assert_array_equal` may
misinterpret some formats as 2-dimensional.
"""
# TODO: first assert args are valid sequences of sequences
if err_msg:
err_msg = '\n' + err_msg
assert_equal(len(first), len(second),
'Sequence of sequence lengths do not match.'
'{}'.format(err_msg))
for i, (first_el, second_el) in enumerate(zip(first, second)):
assert_array_equal(np.unique(first_el), np.unique(second_el),
'In sequence of sequence element {}'
'{}'.format(i, err_msg))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that you mean

        assert_array_equal(first_el, second_el,
                           'In sequence of sequence element {}'
                           '{}'.format(i, err_msg))

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're quite right. Fixed.



def fake_mldata(columns_dict, dataname, matfile, ordering=None):
"""Create a fake mldata data set.

Expand Down
0