scikit-learn · mjbommar · Jun 4, 2014 · Jun 4, 2014 · Jun 4, 2014 · Jun 4, 2014
diff --git a/appveyor.yml b/appveyor.yml
@@ -13,18 +13,23 @@ environment:
       PYTHON_VERSION: "2.7.8"
       PYTHON_ARCH: "32"
 
-    - PYTHON: "C:\\Python27"
+    - PYTHON: "C:\\Python27_64"
       PYTHON_VERSION: "2.7.8"
       PYTHON_ARCH: "64"
 
     - PYTHON: "C:\\Python34_32"
       PYTHON_VERSION: "3.4.1"
       PYTHON_ARCH: "32"
 
-    - PYTHON: "C:\\Python34"
+    - PYTHON: "C:\\Python34_64"
       PYTHON_VERSION: "3.4.1"
       PYTHON_ARCH: "64"
 
+branches:
+  only:
+    - master
+    - 0.15.X
+
 install:
   # Install Python (from the official .msi of http://python.org) and pip when
   # not already installed.
@@ -53,7 +58,7 @@ test_script:
 
   # Skip joblib tests that require multiprocessing as they are prone to random
   # slow down
-  - "python -c \"import nose; nose.main()\" -v -s sklearn"
+  - "python -c \"import nose; nose.main()\" -s sklearn"
 
 artifacts:
   # Archive the generated wheel package in the ci.appveyor.com build report.

diff --git a/continuous_integration/appveyor/install.ps1 b/continuous_integration/appveyor/install.ps1
@@ -52,12 +52,17 @@ function InstallPython ($python_version, $architecture, $python_home) {
     } else {
         $platform_suffix = ".amd64"
     }
-    $filepath = DownloadPython $python_version $platform_suffix
-    Write-Host "Installing" $filepath "to" $python_home
+    $msipath = DownloadPython $python_version $platform_suffix
+    Write-Host "Installing" $msipath "to" $python_home
     $install_log = $python_home + ".log"
-    $args = "/qn  /log $install_log /i $filepath TARGETDIR=$python_home"
-    Write-Host "msiexec.exe" $args
-    Start-Process -FilePath "msiexec.exe" -ArgumentList $args -Wait -Passthru
+    $install_args = "/qn /log $install_log /i $msipath TARGETDIR=$python_home"
+    $uninstall_args = "/qn /x $msipath"
+    RunCommand "msiexec.exe" $install_args
+    if (-not(Test-Path $python_home)) {
+        Write-Host "Python seems to be installed else-where, reinstalling."
+        RunCommand "msiexec.exe" $uninstall_args
+        RunCommand "msiexec.exe" $install_args
+    }
     if (Test-Path $python_home) {
         Write-Host "Python $python_version ($architecture) installation complete"
     } else {
@@ -67,6 +72,11 @@ function InstallPython ($python_version, $architecture, $python_home) {
     }
 }
 
+function RunCommand ($command, $command_args) {  # echo a command, then run it
+    Write-Host $command $command_args  # show the exact command line first
+    Start-Process -FilePath $command -ArgumentList $command_args -Wait -Passthru
+}
+
 
 function InstallPip ($python_home) {
     $pip_path = $python_home + "\Scripts\pip.exe"
@@ -82,6 +92,7 @@ function InstallPip ($python_home) {
     }
 }
 
+
 function main () {
     InstallPython $env:PYTHON_VERSION $env:PYTHON_ARCH $env:PYTHON
     InstallPip $env:PYTHON

diff --git a/doc/modules/outlier_detection.rst b/doc/modules/outlier_detection.rst
@@ -53,8 +53,8 @@ coming from the same population than the initial
 observations. Otherwise, if they lay outside the frontier, we can say
 that they are abnormal with a given confidence in our assessment.
 
-The One-Class SVM has been introduced in [1] for that purpose and
-implemented in the :ref:`svm` module in the
+The One-Class SVM was introduced by Schölkopf et al. for that purpose
+and is implemented in the :ref:`svm` module in the
 :class:`svm.OneClassSVM` object. It requires the choice of a
 kernel and a scalar parameter to define a frontier.  The RBF kernel is
 usually chosen although there exists no exact formula or algorithm to
@@ -63,6 +63,12 @@ implementation. The :math:`\nu` parameter, also known as the margin of
 the One-Class SVM, corresponds to the probability of finding a new,
 but regular, observation outside the frontier.
 
+.. topic:: References:
+
+    * `Estimating the support of a high-dimensional distribution
+      <http://dl.acm.org/citation.cfm?id=1119749>`_ Schölkopf, 
+      Bernhard, et al. Neural computation 13.7 (2001): 1443-1471.
+
 .. topic:: Examples:
 
    * See :ref:`example_svm_plot_oneclass.py` for visualizing the
@@ -73,7 +79,7 @@ but regular, observation outside the frontier.
    :target: ../auto_examples/svm/plot_oneclasse.html
    :align: center
    :scale: 75%
-
+   
 
 Outlier Detection
 =================

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
@@ -397,7 +397,7 @@ follows::
     >>> from sklearn import preprocessing
     >>> le = preprocessing.LabelEncoder()
     >>> le.fit([1, 2, 2, 6])
-    LabelEncoder()
+    LabelEncoder(new_labels='raise')
     >>> le.classes_
     array([1, 2, 6])
     >>> le.transform([1, 1, 2, 6])
@@ -410,14 +410,28 @@ hashable and comparable) to numerical labels::
 
     >>> le = preprocessing.LabelEncoder()
     >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
-    LabelEncoder()
+    LabelEncoder(new_labels='raise')
     >>> list(le.classes_)
     ['amsterdam', 'paris', 'tokyo']
     >>> le.transform(["tokyo", "tokyo", "paris"])
     array([2, 2, 1])
     >>> list(le.inverse_transform([2, 2, 1]))
     ['tokyo', 'tokyo', 'paris']
 
+By default, ``LabelEncoder`` will raise a ``ValueError`` in the event that
+labels are passed to ``transform`` that were not seen in ``fit``.  This
+behavior can be changed with the ``new_labels`` parameter, which accepts
+``"raise"`` (the default), ``"update"``, or an integer that is used as the
+label of every unseen value.  For example, ``new_labels=-1`` will assign
+the unseen values a label of ``-1``::
+
+    >>> le = preprocessing.LabelEncoder(new_labels=-1)
+    >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
+    LabelEncoder(new_labels=-1)
+    >>> list(le.classes_)
+    ['amsterdam', 'paris', 'tokyo']
+    >>> le.transform(["tokyo", "tokyo", "paris", "rome"])
+    array([ 2,  2,  1, -1])
 
 Imputation of missing values
 ============================

diff --git a/doc/sphinxext/gen_rst.py b/doc/sphinxext/gen_rst.py
@@ -468,7 +468,11 @@ def generate_example_rst(app):
         examples.
     """
     root_dir = os.path.join(app.builder.srcdir, 'auto_examples')
-    example_dir = os.path.abspath(app.builder.srcdir + '/../' + 'examples')
+    example_dir = os.path.abspath(os.path.join(app.builder.srcdir, '..',
+                                               'examples'))
+    generated_dir = os.path.abspath(os.path.join(app.builder.srcdir,
+                                                 'modules', 'generated'))
+
     try:
         plot_gallery = eval(app.builder.config.plot_gallery)
     except TypeError:
@@ -477,10 +481,12 @@ def generate_example_rst(app):
         os.makedirs(example_dir)
     if not os.path.exists(root_dir):
         os.makedirs(root_dir)
+    if not os.path.exists(generated_dir):
+        os.makedirs(generated_dir)
 
     # we create an index.rst with all examples
     fhindex = open(os.path.join(root_dir, 'index.rst'), 'w')
-    #Note: The sidebar button has been removed from the examples page for now
+    # Note: The sidebar button has been removed from the examples page for now
     #      due to how it messes up the layout. Will be fixed at a later point
     fhindex.write("""\
 

diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py
@@ -48,8 +48,8 @@ def test_f_oneway_ints():
 
     # test that is gives the same result as with float
     f, p = f_oneway(X.astype(np.float), y)
-    assert_array_almost_equal(f, fint, decimal=5)
-    assert_array_almost_equal(p, pint, decimal=5)
+    assert_array_almost_equal(f, fint, decimal=4)
+    assert_array_almost_equal(p, pint, decimal=4)
 
 
 def test_f_classif():

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
@@ -1,6 +1,6 @@
 # Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
-#          Mathieu Blondel <mathieu@mblondel.org>
-#          Olivier Grisel <olivier.grisel@ensta.org>
+#          Mathieu Blondel <mathieu@mblondel.org>
+#          Olivier Grisel <olivier.grisel@ensta.org>
 #          Andreas Mueller <amueller@ais.uni-bonn.de>
 #          Joel Nothman <joel.nothman@gmail.com>
 #          Hamzeh Alsalhi <ha258@cornell.edu>
@@ -10,7 +10,9 @@
 import itertools
 import array
 import warnings
+import operator
 
+import operator
 import numpy as np
 import scipy.sparse as sp
 
@@ -53,19 +55,37 @@ def _check_numpy_unicode_bug(labels):
 class LabelEncoder(BaseEstimator, TransformerMixin):
     """Encode labels with value between 0 and n_classes-1.
 
+    Parameters
+    ----------
+
+    new_labels : string or int, optional (default: "raise")
+        Determines how to handle new labels, i.e., data
+        not seen in the training domain.
+
+        - If ``"raise"``, then raise ValueError.
+        - If ``"update"``, then re-map the new labels to
+          classes ``[N, ..., N+m-1]``, where ``m`` is the number of new labels.
+        - If an integer value is passed, then re-label with this value.
+          N.B. that default values are in [0, 1, ...], so caution should be
+          taken if a non-negative value is passed to not accidentally
+          intersect.
+
     Attributes
     ----------
     `classes_` : array of shape (n_class,)
         Holds the label for each class.
 
+    `new_label_mapping_` : dictionary
+        Stores the mapping for classes not seen during original ``fit``.
+
     Examples
     --------
     `LabelEncoder` can be used to normalize labels.
 
     >>> from sklearn import preprocessing
     >>> le = preprocessing.LabelEncoder()
     >>> le.fit([1, 2, 2, 6])
-    LabelEncoder()
+    LabelEncoder(new_labels='raise')
     >>> le.classes_
     array([1, 2, 6])
     >>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS
@@ -78,7 +98,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
 
     >>> le = preprocessing.LabelEncoder()
     >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
-    LabelEncoder()
+    LabelEncoder(new_labels='raise')
     >>> list(le.classes_)
     ['amsterdam', 'paris', 'tokyo']
     >>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS
@@ -88,10 +108,34 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
 
     """
 
+    def __init__(self, new_labels="raise"):
+        """Record the ``new_labels`` strategy and an empty new-label map."""
+        self.new_labels = new_labels
+        self.new_label_mapping_ = {}  # filled by ``transform`` on "update"
+
     def _check_fitted(self):
         if not hasattr(self, "classes_"):
             raise ValueError("LabelEncoder was not fitted yet.")
 
+    def get_classes(self):
+        """Get classes that have been observed by the encoder.  Note that this
+        method returns classes seen both at original ``fit`` time (i.e.,
+        ``self.classes_``) and classes seen after ``fit`` (i.e.,
+        ``self.new_label_mapping_.keys()``) for applicable values of
+        ``new_labels``.
+
+        Returns
+        -------
+        classes : array-like of shape [n_classes]
+        """
+        # If we've seen updates, include them in the order they were added.
+        if len(self.new_label_mapping_) > 0:
+            sorted_new, _ = zip(*sorted(self.new_label_mapping_.iteritems(),
+                                        key=operator.itemgetter(1)))
+            return np.append(self.classes_, sorted_new)
+        else:
+            return self.classes_
+
     def fit(self, y):
         """Fit label encoder
 
@@ -104,6 +148,14 @@ def fit(self, y):
         -------
         self : returns an instance of self.
         """
+        # Check new_labels parameter
+        if self.new_labels not in ["update", "raise"] and \
+                type(self.new_labels) not in [int]:
+            # Raise on invalid argument.
+            raise ValueError("Value of argument `new_labels`={0} "
+                             "is unknown and not integer."
+                             .format(self.new_labels))
+
         y = column_or_1d(y, warn=True)
         _check_numpy_unicode_bug(y)
         self.classes_ = np.unique(y)
@@ -121,6 +173,14 @@ def fit_transform(self, y):
         -------
         y : array-like of shape [n_samples]
         """
+        # Check new_labels parameter
+        if self.new_labels not in ["update", "raise"] and \
+                type(self.new_labels) not in [int]:
+            # Raise on invalid argument.
+            raise ValueError("Value of argument `new_labels`={0} "
+                             "is unknown and not integer."
+                             .format(self.new_labels))
+
         y = column_or_1d(y, warn=True)
         _check_numpy_unicode_bug(y)
         self.classes_, y = np.unique(y, return_inverse=True)
@@ -142,9 +202,47 @@ def transform(self, y):
 
         classes = np.unique(y)
         _check_numpy_unicode_bug(classes)
-        if len(np.intersect1d(classes, self.classes_)) < len(classes):
-            diff = np.setdiff1d(classes, self.classes_)
-            raise ValueError("y contains new labels: %s" % str(diff))
+        if len(np.intersect1d(classes, self.get_classes())) < len(classes):
+            # Get the new classes
+            diff_fit = np.setdiff1d(classes, self.classes_)
+            diff_new = np.setdiff1d(classes, self.get_classes())
+
+            # Create copy of array and return
+            y = np.array(y)
+
+            # If we are mapping new labels, get "new" ID and change in copy.
+            if self.new_labels == "update":
+                # Update the new label mapping
+                next_label = len(self.get_classes())
+                self.new_label_mapping_.update(dict(zip(diff_new,
+                                                        range(next_label,
+                                                              next_label +
+                                                              len(diff_new)))))
+
+                # Find entries with new labels
+                missing_mask = np.in1d(y, diff_fit)
+
+                # Populate return array properly by mask and return
+                out = np.searchsorted(self.classes_, y)
+                out[missing_mask] = [self.new_label_mapping_[value]
+                                     for value in y[missing_mask]]
+                return out
+            elif type(self.new_labels) in [int]:
+                # Find entries with new labels
+                missing_mask = np.in1d(y, diff_fit)
+
+                # Populate return array properly by mask and return
+                out = np.searchsorted(self.classes_, y)
+                out[missing_mask] = self.new_labels
+                return out
+            elif self.new_labels == "raise":
+                # Return ValueError, original behavior.
+                raise ValueError("y contains new labels: %s" % str(diff_fit))
+            else:
+                # Raise on invalid argument.
+                raise ValueError("Value of argument `new_labels`={0} "
+                                 "is unknown.".format(self.new_labels))
+
         return np.searchsorted(self.classes_, y)
 
     def inverse_transform(self, y):