diff --git a/appveyor.yml b/appveyor.yml index 54bd67e5f0c26..7d91bf06eecf1 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -13,7 +13,7 @@ environment: PYTHON_VERSION: "2.7.8" PYTHON_ARCH: "32" - - PYTHON: "C:\\Python27" + - PYTHON: "C:\\Python27_64" PYTHON_VERSION: "2.7.8" PYTHON_ARCH: "64" @@ -21,10 +21,15 @@ environment: PYTHON_VERSION: "3.4.1" PYTHON_ARCH: "32" - - PYTHON: "C:\\Python34" + - PYTHON: "C:\\Python34_64" PYTHON_VERSION: "3.4.1" PYTHON_ARCH: "64" +branches: + only: + - master + - 0.15.X + install: # Install Python (from the official .msi of http://python.org) and pip when # not already installed. @@ -53,7 +58,7 @@ test_script: # Skip joblib tests that require multiprocessing as they are prone to random # slow down - - "python -c \"import nose; nose.main()\" -v -s sklearn" + - "python -c \"import nose; nose.main()\" -s sklearn" artifacts: # Archive the generated wheel package in the ci.appveyor.com build report. diff --git a/continuous_integration/appveyor/install.ps1 b/continuous_integration/appveyor/install.ps1 index fc06c58078965..2a96d3372ecab 100644 --- a/continuous_integration/appveyor/install.ps1 +++ b/continuous_integration/appveyor/install.ps1 @@ -52,12 +52,17 @@ function InstallPython ($python_version, $architecture, $python_home) { } else { $platform_suffix = ".amd64" } - $filepath = DownloadPython $python_version $platform_suffix - Write-Host "Installing" $filepath "to" $python_home + $msipath = DownloadPython $python_version $platform_suffix + Write-Host "Installing" $msipath "to" $python_home $install_log = $python_home + ".log" - $args = "/qn /log $install_log /i $filepath TARGETDIR=$python_home" - Write-Host "msiexec.exe" $args - Start-Process -FilePath "msiexec.exe" -ArgumentList $args -Wait -Passthru + $install_args = "/qn /log $install_log /i $msipath TARGETDIR=$python_home" + $uninstall_args = "/qn /x $msipath" + RunCommand "msiexec.exe" $install_args + if (-not(Test-Path $python_home)) { + Write-Host "Python seems to be installed 
elsewhere, reinstalling." + RunCommand "msiexec.exe" $uninstall_args + RunCommand "msiexec.exe" $install_args + } if (Test-Path $python_home) { Write-Host "Python $python_version ($architecture) installation complete" } else { @@ -67,6 +72,11 @@ function InstallPython ($python_version, $architecture, $python_home) { } } +function RunCommand ($command, $command_args) { + Write-Host $command $command_args + Start-Process -FilePath $command -ArgumentList $command_args -Wait -Passthru +} + function InstallPip ($python_home) { $pip_path = $python_home + "\Scripts\pip.exe" @@ -82,6 +92,7 @@ function InstallPip ($python_home) { } } + function main () { InstallPython $env:PYTHON_VERSION $env:PYTHON_ARCH $env:PYTHON InstallPip $env:PYTHON diff --git a/doc/modules/outlier_detection.rst b/doc/modules/outlier_detection.rst index ee7c483c73a7e..a99758989e195 100644 --- a/doc/modules/outlier_detection.rst +++ b/doc/modules/outlier_detection.rst @@ -53,8 +53,8 @@ coming from the same population than the initial observations. Otherwise, if they lay outside the frontier, we can say that they are abnormal with a given confidence in our assessment. -The One-Class SVM has been introduced in [1] for that purpose and -implemented in the :ref:`svm` module in the +The One-Class SVM has been introduced by Schölkopf et al. for that purpose +and implemented in the :ref:`svm` module in the :class:`svm.OneClassSVM` object. It requires the choice of a kernel and a scalar parameter to define a frontier. The RBF kernel is usually chosen although there exists no exact formula or algorithm to @@ -63,6 +63,12 @@ implementation. The :math:`\nu` parameter, also known as the margin of the One-Class SVM, corresponds to the probability of finding a new, but regular, observation outside the frontier. +.. topic:: References: + + * `Estimating the support of a high-dimensional distribution + <http://dl.acm.org/citation.cfm?id=1119749>`_ Schölkopf, + Bernhard, et al. Neural computation 13.7 (2001): 1443-1471. + ..
topic:: Examples: * See :ref:`example_svm_plot_oneclass.py` for visualizing the @@ -73,7 +79,7 @@ but regular, observation outside the frontier. :target: ../auto_examples/svm/plot_oneclasse.html :align: center :scale: 75% - + Outlier Detection ================= diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 4d3b04ade3c7b..2915eb9c45dd1 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -397,7 +397,7 @@ follows:: >>> from sklearn import preprocessing >>> le = preprocessing.LabelEncoder() >>> le.fit([1, 2, 2, 6]) - LabelEncoder() + LabelEncoder(new_label_class=-1, new_labels='raise') >>> le.classes_ array([1, 2, 6]) >>> le.transform([1, 1, 2, 6]) @@ -410,7 +410,7 @@ hashable and comparable) to numerical labels:: >>> le = preprocessing.LabelEncoder() >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) - LabelEncoder() + LabelEncoder(new_label_class=-1, new_labels='raise') >>> list(le.classes_) ['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris"]) @@ -418,6 +418,20 @@ hashable and comparable) to numerical labels:: >>> list(le.inverse_transform([2, 2, 1])) ['tokyo', 'tokyo', 'paris'] +By default, ``LabelEncoder`` will throw a ``ValueError`` in the event that +labels are passed in ``transform`` that were not seen in ``fit``. This +behavior can be handled with the ``new_labels`` parameter, which supports +``"raise"``, ``"nan"``, ``"update"``, and ``"label"`` strategies for +handling new labels. For example, the ``"label"`` strategy will assign +the unseen values a label of ``-1``. 
+ + >>> le = preprocessing.LabelEncoder(new_labels="label") + >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) + LabelEncoder(new_label_class=-1, new_labels='label') + >>> list(le.classes_) + ['amsterdam', 'paris', 'tokyo'] + >>> le.transform(["tokyo", "tokyo", "paris", "rome"]) + array([ 2, 2, 1, -1]) Imputation of missing values ============================ diff --git a/doc/sphinxext/gen_rst.py b/doc/sphinxext/gen_rst.py index 213484a515481..9a8d2535a2b26 100644 --- a/doc/sphinxext/gen_rst.py +++ b/doc/sphinxext/gen_rst.py @@ -468,7 +468,11 @@ def generate_example_rst(app): examples. """ root_dir = os.path.join(app.builder.srcdir, 'auto_examples') - example_dir = os.path.abspath(app.builder.srcdir + '/../' + 'examples') + example_dir = os.path.abspath(os.path.join(app.builder.srcdir, '..', + 'examples')) + generated_dir = os.path.abspath(os.path.join(app.builder.srcdir, + 'modules', 'generated')) + try: plot_gallery = eval(app.builder.config.plot_gallery) except TypeError: @@ -477,10 +481,12 @@ def generate_example_rst(app): os.makedirs(example_dir) if not os.path.exists(root_dir): os.makedirs(root_dir) + if not os.path.exists(generated_dir): + os.makedirs(generated_dir) # we create an index.rst with all examples fhindex = open(os.path.join(root_dir, 'index.rst'), 'w') - #Note: The sidebar button has been removed from the examples page for now + # Note: The sidebar button has been removed from the examples page for now # due to how it messes up the layout. 
Will be fixed at a later point fhindex.write("""\ diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index 2734eb5ab1729..5ef88c41a7b61 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -48,8 +48,8 @@ def test_f_oneway_ints(): # test that is gives the same result as with float f, p = f_oneway(X.astype(np.float), y) - assert_array_almost_equal(f, fint, decimal=5) - assert_array_almost_equal(p, pint, decimal=5) + assert_array_almost_equal(f, fint, decimal=4) + assert_array_almost_equal(p, pint, decimal=4) def test_f_classif(): diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index f304bf6104cae..a319ee0e55ffd 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -1,6 +1,6 @@ # Authors: Alexandre Gramfort -# Mathieu Blondel -# Olivier Grisel +# Mathieu Blondel +# Olivier Grisel # Andreas Mueller # Joel Nothman # Hamzeh Alsalhi @@ -10,7 +10,9 @@ import itertools import array import warnings +import operator +import operator import numpy as np import scipy.sparse as sp @@ -53,11 +55,29 @@ def _check_numpy_unicode_bug(labels): class LabelEncoder(BaseEstimator, TransformerMixin): """Encode labels with value between 0 and n_classes-1. + Parameters + ---------- + + new_labels : string, optional (default: "raise") + Determines how to handle new labels, i.e., data + not seen in the training domain. + + - If ``"raise"``, then raise ValueError. + - If ``"update"``, then re-map the new labels to + classes ``[N, ..., N+m-1]``, where ``m`` is the number of new labels. + - If an integer value is passed, then use re-label with this value. + N.B. that default values are in [0, 1, ...], so caution should be + taken if a non-negative value is passed to not accidentally + intersect. + Attributes ---------- `classes_` : array of shape (n_class,) Holds the label for each class. 
+ `new_label_mapping_` : dictionary + Stores the mapping for classes not seen during original ``fit``. + Examples -------- `LabelEncoder` can be used to normalize labels. @@ -65,7 +85,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): >>> from sklearn import preprocessing >>> le = preprocessing.LabelEncoder() >>> le.fit([1, 2, 2, 6]) - LabelEncoder() + LabelEncoder(new_label_class=-1, new_labels='raise') >>> le.classes_ array([1, 2, 6]) >>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS @@ -78,7 +98,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): >>> le = preprocessing.LabelEncoder() >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) - LabelEncoder() + LabelEncoder(new_label_class=-1, new_labels='raise') >>> list(le.classes_) ['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS @@ -88,10 +108,34 @@ class LabelEncoder(BaseEstimator, TransformerMixin): """ + def __init__(self, new_labels="raise"): + """Store the strategy used for labels unseen at ``fit`` time.""" + self.new_labels = new_labels + self.new_label_mapping_ = {} + def _check_fitted(self): if not hasattr(self, "classes_"): raise ValueError("LabelEncoder was not fitted yet.") + def get_classes(self): + """Get classes that have been observed by the encoder. Note that this + method returns classes seen both at original ``fit`` time (i.e., + ``self.classes_``) and classes seen after ``fit`` (i.e., + ``self.new_label_mapping_.keys()``) for applicable values of + ``new_labels``. + + Returns + ------- + classes : array-like of shape [n_classes] + """ + # If we've seen updates, include them in the order they were added. + if len(self.new_label_mapping_) > 0: + sorted_new, _ = zip(*sorted(self.new_label_mapping_.items(), + key=operator.itemgetter(1))) + return np.append(self.classes_, sorted_new) + else: + return self.classes_ + def fit(self, y): """Fit label encoder @@ -104,6 +148,14 @@ def fit(self, y): ------- self : returns an instance of self.
""" + # Check new_labels parameter + if self.new_labels not in ["update", "raise"] and \ + type(self.new_labels) not in [int]: + # Raise on invalid argument. + raise ValueError("Value of argument `new_labels`={0} " + "is unknown and not integer." + .format(self.new_labels)) + y = column_or_1d(y, warn=True) _check_numpy_unicode_bug(y) self.classes_ = np.unique(y) @@ -121,6 +173,14 @@ def fit_transform(self, y): ------- y : array-like of shape [n_samples] """ + # Check new_labels parameter + if self.new_labels not in ["update", "raise"] and \ + type(self.new_labels) not in [int]: + # Raise on invalid argument. + raise ValueError("Value of argument `new_labels`={0} " + "is unknown and not integer." + .format(self.new_labels)) + y = column_or_1d(y, warn=True) _check_numpy_unicode_bug(y) self.classes_, y = np.unique(y, return_inverse=True) @@ -142,9 +202,47 @@ def transform(self, y): classes = np.unique(y) _check_numpy_unicode_bug(classes) - if len(np.intersect1d(classes, self.classes_)) < len(classes): - diff = np.setdiff1d(classes, self.classes_) - raise ValueError("y contains new labels: %s" % str(diff)) + if len(np.intersect1d(classes, self.get_classes())) < len(classes): + # Get the new classes + diff_fit = np.setdiff1d(classes, self.classes_) + diff_new = np.setdiff1d(classes, self.get_classes()) + + # Create copy of array and return + y = np.array(y) + + # If we are mapping new labels, get "new" ID and change in copy. 
+ if self.new_labels == "update": + # Update the new label mapping + next_label = len(self.get_classes()) + self.new_label_mapping_.update(dict(zip(diff_new, + range(next_label, + next_label + + len(diff_new))))) + + # Find entries with new labels + missing_mask = np.in1d(y, diff_fit) + + # Populate return array properly by mask and return + out = np.searchsorted(self.classes_, y) + out[missing_mask] = [self.new_label_mapping_[value] + for value in y[missing_mask]] + return out + elif type(self.new_labels) in [int]: + # Find entries with new labels + missing_mask = np.in1d(y, diff_fit) + + # Populate return array properly by mask and return + out = np.searchsorted(self.classes_, y) + out[missing_mask] = self.new_labels + return out + elif self.new_labels == "raise": + # Return ValueError, original behavior. + raise ValueError("y contains new labels: %s" % str(diff_fit)) + else: + # Raise on invalid argument. + raise ValueError("Value of argument `new_labels`={0} " + "is unknown.".format(self.new_labels)) + return np.searchsorted(self.classes_, y) def inverse_transform(self, y): diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index dfdb2d23a2134..70a47fcffd498 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -210,6 +210,52 @@ def test_label_encoder(): assert_raises(ValueError, le.transform, [0, 6]) +def test_label_encoder_get_classes(): + """Test LabelEncoder's get_classes method.""" + le = LabelEncoder(new_labels="update") + le.fit([1, 1, 4, 5, -1, 0]) + assert_array_equal(le.classes_, [-1, 0, 1, 4, 5]) + assert_array_equal(le.classes_, le.get_classes()) + le.transform([10]) + assert_array_equal(le.classes_, [-1, 0, 1, 4, 5]) + assert_array_equal(le.get_classes(), [-1, 0, 1, 4, 5, 10]) + + +def test_label_encoder_new_label_update(): + """Test LabelEncoder's transform on new labels""" + le = LabelEncoder(new_labels="update") + le.fit(["a", "b", "b", "c"]) + 
assert_array_equal(le.classes_, ["a", "b", "c"]) + assert_array_equal(le.transform(["a", "a", "c"]), + [0, 0, 2]) + assert_array_equal(le.inverse_transform([2, 1, 0]), + ["c", "b", "a"]) + assert_array_equal(le.transform(["b", "c", "_"]), + [1, 2, 3]) + assert_array_equal(le.get_classes(), ["a", "b", "c", "_"]) + assert_array_equal(le.transform(["_", "z", "a"]), + [3, 4, 0]) + + +def test_label_encoder_new_label_replace(): + """Test LabelEncoder's transform on new labels""" + le = LabelEncoder(new_labels=-99) + le.fit(["a", "b", "b", "c"]) + assert_array_equal(le.classes_, ["a", "b", "c"]) + assert_array_equal(le.transform(["a", "a", "c"]), + [0, 0, 2]) + assert_array_equal(le.inverse_transform([2, 1, 0]), + ["c", "b", "a"]) + assert_array_equal(le.transform(["b", "c", "d"]), + [1, 2, -99]) + + +def test_label_encoder_new_label_arg(): + """Test LabelEncoder's new_labels argument handling""" + le = LabelEncoder(new_labels="xyz") + assert_raises(ValueError, le.fit, ["a", "b", "b", "c"]) + + def test_label_encoder_fit_transform(): """Test fit_transform""" le = LabelEncoder() diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 874dae7c338a8..c114d45887794 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -136,15 +136,6 @@ def check_regressors_classifiers_sparse_data(name, Estimator): def check_transformer(name, Transformer): - if name in ('CCA', 'LocallyLinearEmbedding', 'KernelPCA') and _is_32bit(): - # Those transformers yield non-deterministic output when executed on - # a 32bit Python. The same transformers are stable on 64bit Python. - # FIXME: try to isolate a minimalistic reproduction case only depending - # on numpy & scipy and/or maybe generate a test dataset that does not - # cause such unstable behaviors. 
- msg = name + ' is non deterministic on 32bit Python' - raise SkipTest(msg) - X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1) X = StandardScaler().fit_transform(X) @@ -166,6 +157,14 @@ def check_transformer_data_not_an_array(name, Transformer): def _check_transformer(name, Transformer, X, y): + if name in ('CCA', 'LocallyLinearEmbedding', 'KernelPCA') and _is_32bit(): + # Those transformers yield non-deterministic output when executed on + # a 32bit Python. The same transformers are stable on 64bit Python. + # FIXME: try to isolate a minimalistic reproduction case only depending + # on numpy & scipy and/or maybe generate a test dataset that does not + # cause such unstable behaviors. + msg = name + ' is non deterministic on 32bit Python' + raise SkipTest(msg) n_samples, n_features = np.asarray(X).shape # catch deprecation warnings with warnings.catch_warnings(record=True):