jnothman
diff --git a/sklearn/preprocessing.py b/sklearn/preprocessing.py
Lines changed: 30 additions & 5 deletions
diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py
Lines changed: 49 additions & 3 deletions
@@ -4,6 +4,7 @@
 #          Andreas Mueller <amueller@ais.uni-bonn.de>
 # License: BSD 3 clause
 
+import functools
 import warnings
 import numbers
 
@@ -19,6 +20,7 @@
 from .utils.multiclass import unique_labels
 from .utils.multiclass import is_multilabel
 from .utils.multiclass import is_label_indicator_matrix
+from .utils.multiclass import multilabel_vectorize
 
 from .utils.sparsefuncs import inplace_csr_row_normalize_l1
 from .utils.sparsefuncs import inplace_csr_row_normalize_l2
@@ -829,6 +831,15 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     >>> list(le.inverse_transform([2, 2, 1]))
     ['tokyo', 'tokyo', 'paris']
 
+    It can also be used to transform multi-label sequences of sequences:
+
+    >>> le = preprocessing.LabelEncoder()
+    >>> targets = [["paris", "tokyo"], ["amsterdam", "paris"]]
+    >>> le.fit_transform(targets)
+    array([[1 2], [0 1]], dtype=object)
+    >>> list(map(list, le.inverse_transform([[1, 2], [0, 1]])))
+    [['paris', 'tokyo'], ['amsterdam', 'paris']]
+
     """
 
     def _check_fitted(self):
@@ -847,7 +858,7 @@ def fit(self, y):
         -------
         self : returns an instance of self.
         """
-        self.classes_ = np.unique(y)
+        self.classes_ = unique_labels(y)
         return self
 
     def fit_transform(self, y):
@@ -862,6 +873,9 @@ def fit_transform(self, y):
         -------
         y : array-like of shape [n_samples]
         """
+        if is_multilabel(y):
+            self.fit(y)
+            return self.transform(y)
         self.classes_, y = unique(y, return_inverse=True)
         return y
 
@@ -878,12 +892,19 @@ def transform(self, y):
         y : array-like of shape [n_samples]
         """
         self._check_fitted()
+        if is_multilabel(y):
+            if is_label_indicator_matrix(y):
+                raise ValueError(
+                    '{} does not support label indicator matrices'.format(
+                        self.__class__.__name__))
+            return multilabel_vectorize(self._transform)(y)
 
-        classes = np.unique(y)
-        if len(np.intersect1d(classes, self.classes_)) < len(classes):
-            diff = np.setdiff1d(classes, self.classes_)
-            raise ValueError("y contains new labels: %s" % str(diff))
+        return self._transform(y)
 
+    def _transform(self, y):
+        unseen = np.setdiff1d(y, self.classes_)  # labels absent from fit
+        if unseen.size:  # any unknown label makes the encoding undefined
+            raise ValueError("y contains new labels: %s" % str(unseen))
         return np.searchsorted(self.classes_, y)
 
     def inverse_transform(self, y):
@@ -900,6 +921,10 @@ def inverse_transform(self, y):
         """
         self._check_fitted()
 
+        if is_multilabel(y):
+            # np.vectorize does not work with np.ndarray.take!
+            take = functools.partial(np.take, self.classes_)
+            return multilabel_vectorize(take)(y)
         y = np.asarray(y)
         return self.classes_[y]
 
 
@@ -120,7 +120,9 @@ def is_multilabel(y):
     False
     >>> is_multilabel([[1], [0, 2], []])
     True
-    >>> is_multilabel(np.array([[1, 0], [0, 0]]))
+    >>> is_multilabel(np.array([np.array([1]), np.array([0, 2])]))
+    True
+    >>> is_multilabel(np.array([[1, 0], [0, 0]]))  # label indicator matrix
     True
     >>> is_multilabel(np.array([[1], [0], [0]]))
     False
@@ -130,5 +132,49 @@ def is_multilabel(y):
     """
     # the explicit check for ndarray is for forward compatibility; future
     # versions of Numpy might want to register ndarray as a Sequence
-    return (not isinstance(y[0], np.ndarray) and isinstance(y[0], Sequence) and
-            not isinstance(y[0], string_types) or is_label_indicator_matrix(y))
+    if getattr(y, 'ndim', 1) != 1:
+        return is_label_indicator_matrix(y)
+    return ((isinstance(y[0], Sequence) and not isinstance(y[0], string_types))
+            or isinstance(y[0], np.ndarray))
+
+
+def multilabel_as_array(y):
+    """Pack a sequence of label sequences into a 1-d object array.
+
+    Parameters
+    ----------
+    y : sequence or array of sequences
+        Multilabel targets; the inner sequences may differ in length.
+        Label indicator matrices are not supported.
+
+    Returns
+    -------
+    out : numpy array of shape [len(y)]
+        ``out[i]`` holds ``y[i]``. Anything exposing ``__array__`` is
+        passed through ``np.asarray``, so an ndarray comes back uncopied.
+    """
+    if not hasattr(y, '__array__'):
+        boxed = np.empty(len(y), dtype=object)
+        boxed[:] = y
+        return boxed
+    return np.asarray(y)
+
+
+def multilabel_vectorize(func, otypes='O'):
+    """Build a vectorized ``func`` that accepts sequences of sequences.
+
+    Parameters
+    ----------
+    func : callable applied to each inner sequence
+    otypes : output dtypes handed to ``np.vectorize``, default objects
+
+    Returns
+    -------
+    out : callable
+        Converts every positional argument with ``multilabel_as_array``
+        and then calls the ``np.vectorize``-wrapped ``func`` on them.
+    """
+    vectorized = np.vectorize(func, otypes=otypes)
+    def wrapper(*args):
+        return vectorized(*map(multilabel_as_array, args))
+    return wrapper