From 7ef1deb3053dc394337a4c8914171b4105519eea Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 19 Dec 2016 17:10:51 -0500 Subject: [PATCH 01/20] some bug fixes. --- doc/modules/model_evaluation.rst | 2 +- doc/whats_new.rst | 36 ++++++++++++++- sklearn/covariance/outlier_detection.py | 35 +++++++++++++-- sklearn/decomposition/dict_learning.py | 9 ++-- sklearn/decomposition/truncated_svd.py | 8 +--- sklearn/dummy.py | 12 +++-- sklearn/ensemble/base.py | 6 ++- sklearn/ensemble/gradient_boosting.py | 10 ++--- sklearn/feature_extraction/tests/test_text.py | 12 ++--- sklearn/feature_extraction/text.py | 32 +++++++------- sklearn/feature_selection/from_model.py | 11 +++-- sklearn/feature_selection/rfe.py | 5 ++- .../tests/test_from_model.py | 8 +++- sklearn/multiclass.py | 20 ++++++--- sklearn/multioutput.py | 13 ++++-- sklearn/naive_bayes.py | 6 +-- sklearn/neighbors/approximate.py | 2 +- sklearn/tests/test_multiclass.py | 44 +++++++++++++++++-- sklearn/tests/test_multioutput.py | 3 ++ sklearn/utils/multiclass.py | 5 ++- 20 files changed, 205 insertions(+), 74 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index db7b59d6c1d3a..beaeabafee752 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -173,7 +173,7 @@ Here is an example of building custom scorers, and of using the >>> # and predictions defined below. >>> loss = make_scorer(my_custom_loss_func, greater_is_better=False) >>> score = make_scorer(my_custom_loss_func, greater_is_better=True) - >>> ground_truth = [[1, 1]] + >>> ground_truth = [[1], [1]] >>> predictions = [0, 1] >>> from sklearn.dummy import DummyClassifier >>> clf = DummyClassifier(strategy='most_frequent', random_state=0) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 7d2fa8a562887..0f0fa26918445 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -177,11 +177,12 @@ Bug fixes - Fixed a bug where :class:`sklearn.linear_model.LassoLars` does not give the same result as the LassoLars implementation available in R (lars library). :issue:`7849` by :user:`Jair Montoya Martinez ` + in R (lars library). :issue:`7849` by `Jair Montoya Martinez`_ + - Some ``fetch_`` functions in `sklearn.datasets` were ignoring the ``download_if_missing`` keyword. This was fixed in :issue:`7944` by :user:`Ralf Gommers `. - - Fix a bug regarding fitting :class:`sklearn.cluster.KMeans` with a sparse array X and initial centroids, where X's means were unnecessarily being subtracted from the centroids. :issue:`7872` by `Josh Karnofsky `_. @@ -207,6 +208,13 @@ Bug fixes :class:`sklearn.ensemble.GradientBoostingRegressor` ignored the ``min_impurity_split`` parameter. :issue:`8006` by :user:`Sebastian Pölsterl `. + - Fixes to the input validation in :class:`sklearn.covariance.EllipticEnvelope` by + `Andreas Müller`_. + + - Fix shape output shape of :class:`sklearn.decomposition.DictionaryLearning` transform + for one-dimensional data by `Andreas Müller`_. + + - Several fixes to input validation in :class:`multiclass.OutputCodeClassifier` by `Andreas Müller`_ - Fix a bug where :class:`sklearn.ensemble.gradient_boosting.QuantileLossFunction` computed @@ -267,6 +275,32 @@ API changes summary :func:`sklearn.model_selection.cross_val_predict`. :issue:`2879` by :user:`Stephen Hoover `. + + - Gradient boosting base models are not longer estimators. By `Andreas Müller`_. 
+ + - `feature_extraction.text.TfidfTransformer` now supports numpy arrays as inputs, and produces numpy + arrays for list inputs and numpy array inputs. By `Andreas `Müller_. + + - `feature_selection.SelectFromModel` now validates the ``threshold`` + parameter and sets the ``threshold_`` attribute during the call to + ``fit``, and no longer during the call to ``transform```, by `Andreas Müller`_. + + - `features_selection.SelectFromModel` now has a ``partial_fit`` method only if the underlying + estimator does. By `Andreas Müller`_. + + - All checks in ``utils.estimator_checks``, in particular :func:`utils.estimator_checks.check_estimator` now + accept estimator instances. All checks apart from ``check_estimator`` do not accept estimator classes any more. + By `Andreas Müller`_. + + - The ``include_others`` and ``dont_test`` parameters of :func:`utils.testing.all_estimators` are deprecated + and are assumed ``True``, by `Andreas Müller`_. + + + - :class:`dummy.DummyClassifier` and :class:`dummy.DummyRegressor` now do input validation on ``X`` and check + whether ``X`` and ``y`` are of the same length, by `Andreas Müller`_. + + - :class:`multiclass.OneVsRestClassifier` now has a ``partial_fit`` method only if the underlying estimator does. + By `Andreas Müller`_. .. _changes_0_18_1: Version 0.18.1 diff --git a/sklearn/covariance/outlier_detection.py b/sklearn/covariance/outlier_detection.py index 1cafe885fdd47..3349f71af42d2 100644 --- a/sklearn/covariance/outlier_detection.py +++ b/sklearn/covariance/outlier_detection.py @@ -15,8 +15,8 @@ import numpy as np import scipy as sp from . import MinCovDet -from ..base import ClassifierMixin -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, check_array +from ..metrics import accuracy_score class OutlierDetectionMixin(object): @@ -63,11 +63,11 @@ def decision_function(self, X, raw_values=False): """ check_is_fitted(self, 'threshold_') + X = check_array(X) mahal_dist = self.mahalanobis(X) if raw_values: decision = mahal_dist else: - check_is_fitted(self, 'threshold_') transformed_mahal_dist = mahal_dist ** 0.33 decision = self.threshold_ ** 0.33 - transformed_mahal_dist @@ -91,6 +91,7 @@ def predict(self, X): """ check_is_fitted(self, 'threshold_') + X = check_array(X) is_inlier = -np.ones(X.shape[0], dtype=int) if self.contamination is not None: values = self.decision_function(X, raw_values=True) @@ -101,7 +102,7 @@ def predict(self, X): return is_inlier -class EllipticEnvelope(ClassifierMixin, OutlierDetectionMixin, MinCovDet): +class EllipticEnvelope(OutlierDetectionMixin, MinCovDet): """An object for detecting outliers in a Gaussian distributed dataset. Read more in the :ref:`User Guide `. @@ -176,3 +177,29 @@ def fit(self, X, y=None): self.threshold_ = sp.stats.scoreatpercentile( self.dist_, 100. * (1. - self.contamination)) return self + + def score(self, X, y, sample_weight=None): + """Returns the mean accuracy on the given test data and labels. + + In multi-label classification, this is the subset accuracy + which is a harsh metric since you require for each sample that + each label set be correctly predicted. + + Parameters + ---------- + X : array-like, shape = (n_samples, n_features) + Test samples. + + y : array-like, shape = (n_samples) or (n_samples, n_outputs) + True labels for X. + + sample_weight : array-like, shape = [n_samples], optional + Sample weights. + + Returns + ------- + score : float + Mean accuracy of self.predict(X) wrt. y. 
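
        An illustrative doctest (editor's addition, not part of the diff;
        it assumes only the public API shown in this hunk):

        >>> import numpy as np
        >>> from sklearn.covariance import EllipticEnvelope
        >>> X = np.random.RandomState(0).randn(100, 2)
        >>> clf = EllipticEnvelope(contamination=0.1).fit(X)
        >>> y_pred = clf.predict(X)    # +1 for inliers, -1 for outliers
        >>> clf.score(X, y_pred)       # accuracy_score against predict
        1.0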
+ + """ + return accuracy_score(y, self.predict(X), sample_weight=sample_weight) diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py index baf79544dd172..a81a16be3f718 100644 --- a/sklearn/decomposition/dict_learning.py +++ b/sklearn/decomposition/dict_learning.py @@ -282,9 +282,9 @@ def sparse_encode(X, dictionary, gram=None, cov=None, algorithm='lasso_lars', check_input=False, verbose=verbose) # This ensure that dimensionality of code is always 2, - # consistant with the case n_jobs > 1 + # consistent with the case n_jobs > 1 if code.ndim == 1: - code = code[np.newaxis, :] + code = code[:, np.newaxis] return code # Enter parallel code block @@ -722,8 +722,8 @@ def dict_learning_online(X, n_components=2, alpha=1, n_iter=100, sys.stdout.flush() elif verbose: if verbose > 10 or ii % ceil(100. / verbose) == 0: - print ("Iteration % 3i (elapsed time: % 3is, % 4.1fmn)" - % (ii, dt, dt / 60)) + print("Iteration % 3i (elapsed time: % 3is, % 4.1fmn)" + % (ii, dt, dt / 60)) this_code = sparse_encode(this_X, dictionary.T, algorithm=method, alpha=alpha, n_jobs=n_jobs).T @@ -811,7 +811,6 @@ def transform(self, X, y=None): """ check_is_fitted(self, 'components_') - # XXX : kwargs is not documented X = check_array(X) n_samples, n_features = X.shape diff --git a/sklearn/decomposition/truncated_svd.py b/sklearn/decomposition/truncated_svd.py index 5d029d1205bd0..4be64c3ac64d5 100644 --- a/sklearn/decomposition/truncated_svd.py +++ b/sklearn/decomposition/truncated_svd.py @@ -15,7 +15,7 @@ from ..utils.arpack import svds from ..base import BaseEstimator, TransformerMixin -from ..utils import check_array, as_float_array, check_random_state +from ..utils import check_array, check_random_state from ..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip from ..utils.sparsefuncs import mean_variance_axis @@ -155,13 +155,9 @@ def fit_transform(self, X, y=None): X_new : array, shape (n_samples, n_components) Reduced version of X. This will always be a dense array. 
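
        A small sketch (editor's addition; it assumes the format conversion
        is handled by ``check_array(accept_sparse=['csr', 'csc'])`` as in
        the hunk below):

        >>> import numpy as np
        >>> from scipy import sparse
        >>> from sklearn.decomposition import TruncatedSVD
        >>> X = sparse.lil_matrix(np.random.RandomState(0).rand(6, 4))
        >>> X_reduced = TruncatedSVD(n_components=2).fit_transform(X)
        >>> X_reduced.shape    # dense output, even for sparse input
        (6, 2)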
""" - X = as_float_array(X, copy=False) + X = check_array(X, accept_sparse=['csr', 'csc']) random_state = check_random_state(self.random_state) - # If sparse and not csr or csc, convert to csr - if sp.issparse(X) and X.getformat() not in ["csr", "csc"]: - X = X.tocsr() - if self.algorithm == "arpack": U, Sigma, VT = svds(X, k=self.n_components, tol=self.tol) # svds doesn't abide by scipy.linalg.svd/randomized_svd diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 84d42e7177a0a..0f01d18cb2b9c 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -10,7 +10,7 @@ from .base import BaseEstimator, ClassifierMixin, RegressorMixin from .utils import check_random_state -from .utils.validation import check_array +from .utils.validation import check_array, check_X_y from .utils.validation import check_consistent_length from .utils.validation import check_is_fitted from .utils.random import random_choice_csc @@ -117,6 +117,9 @@ def fit(self, X, y, sample_weight=None): self.sparse_output_ = sp.issparse(y) + X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) + check_consistent_length(X, y) + if not self.sparse_output_: y = np.atleast_1d(y) @@ -181,7 +184,7 @@ def predict(self, X): classes_ = self.classes_ class_prior_ = self.class_prior_ constant = self.constant - if self.n_outputs_ == 1: + if self.n_outputs_ == 1 and not self.output_2d_: # Get same type even for self.n_outputs_ == 1 n_classes_ = [n_classes_] classes_ = [classes_] @@ -190,7 +193,7 @@ def predict(self, X): # Compute probability only once if self.strategy == "stratified": proba = self.predict_proba(X) - if self.n_outputs_ == 1: + if self.n_outputs_ == 1 and not self.output_2d_: proba = [proba] if self.sparse_output_: @@ -395,7 +398,8 @@ def fit(self, X, y, sample_weight=None): "'mean', 'median', 'quantile' or 'constant'" % self.strategy) - y = check_array(y, ensure_2d=False) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + multi_output=True) if len(y) == 0: raise ValueError("y must not be empty.") diff --git a/sklearn/ensemble/base.py b/sklearn/ensemble/base.py index 165124d62428a..5bf3d72dcdd38 100644 --- a/sklearn/ensemble/base.py +++ b/sklearn/ensemble/base.py @@ -12,6 +12,8 @@ from ..base import BaseEstimator from ..base import MetaEstimatorMixin from ..utils import _get_n_jobs, check_random_state +from ..externals import six +from abc import ABCMeta, abstractmethod MAX_RAND_SEED = np.iinfo(np.int32).max @@ -52,7 +54,8 @@ def _set_random_states(estimator, random_state=None): estimator.set_params(**to_set) -class BaseEnsemble(BaseEstimator, MetaEstimatorMixin): +class BaseEnsemble(six.with_metaclass(ABCMeta, BaseEstimator, + MetaEstimatorMixin)): """Base class for all ensemble classes. Warning: This class should not be used directly. Use derived classes @@ -79,6 +82,7 @@ class BaseEnsemble(BaseEstimator, MetaEstimatorMixin): The collection of fitted base estimators. 
""" + @abstractmethod def __init__(self, base_estimator, n_estimators=10, estimator_params=tuple()): # Set parameters diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index 26797ca25cb1f..ec6d5d74b9452 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -64,7 +64,7 @@ from ..exceptions import NotFittedError -class QuantileEstimator(BaseEstimator): +class QuantileEstimator(object): """An estimator predicting the alpha-quantile of the training targets.""" def __init__(self, alpha=0.9): if not 0 < alpha < 1.0: @@ -86,7 +86,7 @@ def predict(self, X): return y -class MeanEstimator(BaseEstimator): +class MeanEstimator(object): """An estimator predicting the mean of the training targets.""" def fit(self, X, y, sample_weight=None): if sample_weight is None: @@ -102,7 +102,7 @@ def predict(self, X): return y -class LogOddsEstimator(BaseEstimator): +class LogOddsEstimator(object): """An estimator predicting the log odds ratio.""" scale = 1.0 @@ -132,7 +132,7 @@ class ScaledLogOddsEstimator(LogOddsEstimator): scale = 0.5 -class PriorProbabilityEstimator(BaseEstimator): +class PriorProbabilityEstimator(object): """An estimator predicting the probability of each class in the training data. """ @@ -150,7 +150,7 @@ def predict(self, X): return y -class ZeroEstimator(BaseEstimator): +class ZeroEstimator(object): """An estimator that simply predicts zero. """ def fit(self, X, y, sample_weight=None): diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 88382f7d13c0b..ab8d9d39aadc2 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -35,6 +35,7 @@ from functools import partial import pickle from io import StringIO +from scipy import sparse JUNK_FOOD_DOCS = ( @@ -309,7 +310,7 @@ def test_tf_idf_smoothing(): [1, 1, 0], [1, 0, 0]] tr = TfidfTransformer(smooth_idf=True, norm='l2') - tfidf = tr.fit_transform(X).toarray() + tfidf = tr.fit_transform(X) assert_true((tfidf >= 0).all()) # check normalization @@ -320,7 +321,7 @@ def test_tf_idf_smoothing(): [1, 1, 0], [1, 0, 0]] tr = TfidfTransformer(smooth_idf=True, norm='l2') - tfidf = tr.fit_transform(X).toarray() + tfidf = tr.fit_transform(X) assert_true((tfidf >= 0).all()) @@ -329,7 +330,7 @@ def test_tfidf_no_smoothing(): [1, 1, 0], [1, 0, 0]] tr = TfidfTransformer(smooth_idf=False, norm='l2') - tfidf = tr.fit_transform(X).toarray() + tfidf = tr.fit_transform(X) assert_true((tfidf >= 0).all()) # check normalization @@ -340,6 +341,7 @@ def test_tfidf_no_smoothing(): X = [[1, 1, 0], [1, 1, 0], [1, 0, 0]] + X = sparse.csr_matrix(X) tr = TfidfTransformer(smooth_idf=False, norm='l2') clean_warning_registry() @@ -357,7 +359,7 @@ def test_tfidf_no_smoothing(): def test_sublinear_tf(): X = [[1], [2], [3]] tr = TfidfTransformer(sublinear_tf=True, use_idf=False, norm=None) - tfidf = tr.fit_transform(X).toarray() + tfidf = tr.fit_transform(X) assert_equal(tfidf[0], 1) assert_greater(tfidf[1], tfidf[0]) assert_greater(tfidf[2], tfidf[1]) @@ -420,7 +422,7 @@ def test_vectorizer(): # test tf alone t2 = TfidfTransformer(norm='l1', use_idf=False) tf = t2.fit(counts_train).transform(counts_train).toarray() - assert_equal(t2.idf_, None) + assert_false(hasattr(t2, "idf_")) # test idf transform with unlearned idf vector t3 = TfidfTransformer(use_idf=True) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index f5b548a5278cd..72676b69a36fc 
100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -29,9 +29,8 @@ from ..preprocessing import normalize from .hashing import FeatureHasher from .stop_words import ENGLISH_STOP_WORDS -from ..utils import deprecated from ..utils.fixes import frombuffer_empty, bincount -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, check_array __all__ = ['CountVectorizer', 'ENGLISH_STOP_WORDS', @@ -1023,7 +1022,8 @@ def fit(self, X, y=None): a matrix of term/token counts """ if not sp.issparse(X): - X = sp.csc_matrix(X) + X = sp.csc_matrix(X, dtype=np.float64) + X = check_array(X, accept_sparse=["csc", "csr"]) if self.use_idf: n_samples, n_features = X.shape df = _document_frequency(X) @@ -1035,7 +1035,7 @@ def fit(self, X, y=None): # log+1 instead of log makes sure terms with zero idf don't get # suppressed entirely. idf = np.log(float(n_samples) / df) + 1.0 - self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, + self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features, format='csr') return self @@ -1056,18 +1056,19 @@ def transform(self, X, copy=True): ------- vectors : sparse matrix, [n_samples, n_features] """ - if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float): - # preserve float family dtype - X = sp.csr_matrix(X, copy=copy) - else: - # convert counts or binary occurrences to floats - X = sp.csr_matrix(X, dtype=np.float64, copy=copy) + X = check_array(X, accept_sparse=["csr"], copy=copy, + dtype=[np.float64, np.float32]) n_samples, n_features = X.shape if self.sublinear_tf: - np.log(X.data, X.data) - X.data += 1 + if sp.issparse(X): + np.log(X.data, X.data) + X.data += 1 + else: + mask = X != 0 + X[mask] = np.log(X[mask]) + X[mask] += 1 if self.use_idf: check_is_fitted(self, '_idf_diag', 'idf vector is not fitted') @@ -1087,10 +1088,9 @@ def transform(self, X, copy=True): @property def idf_(self): - if hasattr(self, "_idf_diag"): - return np.ravel(self._idf_diag.sum(axis=0)) - else: - return None + # if _idf_diag is not set, this will raise an attribute error, + # which means hasatt(self, "idf_") is False + return np.ravel(self._idf_diag.sum(axis=0)) class TfidfVectorizer(CountVectorizer): diff --git a/sklearn/feature_selection/from_model.py b/sklearn/feature_selection/from_model.py index 7fe4456ccd390..e27c0bd267bf9 100644 --- a/sklearn/feature_selection/from_model.py +++ b/sklearn/feature_selection/from_model.py @@ -4,11 +4,12 @@ import numpy as np from .base import SelectorMixin -from ..base import BaseEstimator, clone +from ..base import BaseEstimator, clone, MetaEstimatorMixin from ..externals import six from ..exceptions import NotFittedError from ..utils.fixes import norm +from ..utils.metaestimators import if_delegate_has_method def _get_feature_importances(estimator, norm_order=1): @@ -76,7 +77,7 @@ def _calculate_threshold(estimator, importances, threshold): return threshold -class SelectFromModel(BaseEstimator, SelectorMixin): +class SelectFromModel(BaseEstimator, SelectorMixin, MetaEstimatorMixin): """Meta-transformer for selecting features based on importance weights. .. versionadded:: 0.17 @@ -121,7 +122,6 @@ class SelectFromModel(BaseEstimator, SelectorMixin): threshold_ : float The threshold value used for feature selection. 
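
    A minimal usage sketch (editor's addition; it assumes an estimator
    exposing ``coef_``, here ``LogisticRegression``):

    >>> import numpy as np
    >>> from sklearn.feature_selection import SelectFromModel
    >>> from sklearn.linear_model import LogisticRegression
    >>> X = np.random.RandomState(0).randn(20, 3)
    >>> y = np.array([0, 1] * 10)
    >>> selector = SelectFromModel(LogisticRegression(), threshold="mean")
    >>> selector = selector.fit(X, y)
    >>> hasattr(selector, "threshold_")  # available right after ``fit``
    True
    >>> X_reduced = selector.transform(X)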
""" - def __init__(self, estimator, threshold=None, prefit=False, norm_order=1): self.estimator = estimator self.threshold = threshold @@ -138,6 +138,7 @@ def _get_support_mask(self): raise ValueError( 'Either fit the model before transform or set "prefit=True"' ' while passing the fitted estimator to the constructor.') + # XXX duplicate computation if we called fit before scores = _get_feature_importances(estimator, self.norm_order) self.threshold_ = _calculate_threshold(estimator, scores, self.threshold) @@ -167,8 +168,12 @@ def fit(self, X, y=None, **fit_params): "Since 'prefit=True', call transform directly") self.estimator_ = clone(self.estimator) self.estimator_.fit(X, y, **fit_params) + scores = _get_feature_importances(self.estimator_, self.norm_order) + self.threshold_ = _calculate_threshold(self.estimator, scores, + self.threshold) return self + @if_delegate_has_method('estimator') def partial_fit(self, X, y=None, **fit_params): """Fit the SelectFromModel meta-transformer only once. diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py index d92e341676371..31ff0057d8d8e 100644 --- a/sklearn/feature_selection/rfe.py +++ b/sklearn/feature_selection/rfe.py @@ -30,6 +30,7 @@ def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer): X_train, y_train, lambda estimator, features: _score(estimator, X_test[:, features], y_test, scorer)).scores_ + class RFE(BaseEstimator, MetaEstimatorMixin, SelectorMixin): """Feature ranking with recursive feature elimination. @@ -293,8 +294,8 @@ class RFECV(RFE, MetaEstimatorMixin): - An iterable yielding train/test splits. For integer/None inputs, if ``y`` is binary or multiclass, - :class:`sklearn.model_selection.StratifiedKFold` is used. If the - estimator is a classifier or if ``y`` is neither binary nor multiclass, + :class:`sklearn.model_selection.StratifiedKFold` is used. If the + estimator is a classifier or if ``y`` is neither binary nor multiclass, :class:`sklearn.model_selection.KFold` is used. 
Refer :ref:`User Guide ` for the various diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py index 6efb6f405bb1c..9beeef78a17be 100644 --- a/sklearn/feature_selection/tests/test_from_model.py +++ b/sklearn/feature_selection/tests/test_from_model.py @@ -1,6 +1,7 @@ import numpy as np from sklearn.utils.testing import assert_true +from sklearn.utils.testing import assert_false from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_less from sklearn.utils.testing import assert_greater @@ -27,8 +28,7 @@ def test_invalid_input(): clf = SGDClassifier(alpha=0.1, n_iter=10, shuffle=True, random_state=None) for threshold in ["gobbledigook", ".5 * gobbledigook"]: model = SelectFromModel(clf, threshold=threshold) - model.fit(data, y) - assert_raises(ValueError, model.transform, data) + assert_raises(ValueError, model.fit, data, y) def test_input_estimator_unchanged(): @@ -120,6 +120,10 @@ def test_partial_fit(): transformer.fit(np.vstack((data, data)), np.concatenate((y, y))) assert_array_equal(X_transform, transformer.transform(data)) + # check that if est doesn't have partial_fit, neither does SelectFromModel + transformer = SelectFromModel(estimator=RandomForestClassifier()) + assert_false(hasattr(transformer, "partial_fit")) + def test_calling_fit_reinitializes(): est = LinearSVC(random_state=0) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 3de5ee319c718..63645a96cf37e 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -46,7 +46,7 @@ from .utils import check_random_state from .utils.validation import _num_samples from .utils.validation import check_is_fitted -from .utils.validation import check_X_y +from .utils.validation import check_X_y, check_array from .utils.multiclass import (_check_partial_fit_first_call, check_classification_targets, _ovr_decision_function) @@ -176,7 +176,6 @@ class OneVsRestClassifier(BaseEstimator, ClassifierMixin, MetaEstimatorMixin): multilabel_ : boolean Whether a OneVsRestClassifier is a multilabel classifier. 
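
    Whether ``partial_fit`` is exposed now follows the underlying estimator;
    an editor's illustration of the ``if_delegate_has_method`` behaviour
    introduced below:

    >>> from sklearn.multiclass import OneVsRestClassifier
    >>> from sklearn.naive_bayes import MultinomialNB
    >>> from sklearn.svm import SVC
    >>> hasattr(OneVsRestClassifier(MultinomialNB()), 'partial_fit')
    True
    >>> hasattr(OneVsRestClassifier(SVC()), 'partial_fit')
    False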
""" - def __init__(self, estimator, n_jobs=1): self.estimator = estimator self.n_jobs = n_jobs @@ -217,6 +216,7 @@ def fit(self, X, y): return self + @if_delegate_has_method('estimator') def partial_fit(self, X, y, classes=None): """Partially fit underlying estimators @@ -488,8 +488,12 @@ def fit(self, X, y): self """ X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) + check_classification_targets(y) self.classes_ = np.unique(y) + if len(self.classes_) == 1: + raise ValueError("OneVsOneClassifier can not be fit when only one" + " class is present.") n_classes = self.classes_.shape[0] estimators_indices = list(zip(*(Parallel(n_jobs=self.n_jobs)( delayed(_fit_ovo_binary) @@ -498,13 +502,14 @@ def fit(self, X, y): self.estimators_ = estimators_indices[0] try: - self.pairwise_indices_ = estimators_indices[1] \ - if self._pairwise else None + self.pairwise_indices_ = ( + estimators_indices[1] if self._pairwise else None) except AttributeError: self.pairwise_indices_ = None return self + @if_delegate_has_method(delegate='estimator') def partial_fit(self, X, y, classes=None): """Partially fit underlying estimators @@ -544,8 +549,8 @@ def partial_fit(self, X, y, classes=None): n_jobs=self.n_jobs)( delayed(_partial_fit_ovo_binary)( estimator, X, y, self.classes_[i], self.classes_[j]) - for estimator, (i, j) in izip( - self.estimators_, (combinations))) + for estimator, (i, j) in izip(self.estimators_, + (combinations))) self.pairwise_indices_ = None @@ -701,12 +706,14 @@ def fit(self, X, y): ------- self """ + X, y = check_X_y(X, y) if self.code_size <= 0: raise ValueError("code_size should be greater than 0, got {1}" "".format(self.code_size)) _check_estimator(self.estimator) random_state = check_random_state(self.random_state) + check_classification_targets(y) self.classes_ = np.unique(y) n_classes = self.classes_.shape[0] @@ -747,6 +754,7 @@ def predict(self, X): Predicted multi-class targets. 
""" check_is_fitted(self, 'estimators_') + X = check_array(X) Y = np.array([_predict_binary(e, X) for e in self.estimators_]).T pred = euclidean_distances(Y, self.code_book_).argmin(axis=1) return self.classes_[pred] diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 826ece6d50d98..f608936e952ab 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -16,13 +16,14 @@ import numpy as np -from abc import ABCMeta -from .base import BaseEstimator, clone +from abc import ABCMeta, abstractmethod +from .base import BaseEstimator, clone, MetaEstimatorMixin from .base import RegressorMixin, ClassifierMixin from .utils import check_array, check_X_y from .utils.fixes import parallel_helper from .utils.validation import check_is_fitted, has_fit_parameter from .utils.metaestimators import if_delegate_has_method +from .utils.multiclass import check_classification_targets from .externals.joblib import Parallel, delayed from .externals import six @@ -57,8 +58,9 @@ def _partial_fit_estimator(estimator, X, y, classes=None, sample_weight=None, return estimator -class MultiOutputEstimator(six.with_metaclass(ABCMeta, BaseEstimator)): - +class MultiOutputEstimator(six.with_metaclass(ABCMeta, BaseEstimator, + MetaEstimatorMixin)): + @abstractmethod def __init__(self, estimator, n_jobs=1): self.estimator = estimator self.n_jobs = n_jobs @@ -149,6 +151,9 @@ def fit(self, X, y, sample_weight=None): multi_output=True, accept_sparse=True) + if isinstance(self, ClassifierMixin): + check_classification_targets(y) + if y.ndim == 1: raise ValueError("y must have at least two dimensions for " "multi-output regression but has only one.") diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 843bf9ce126cc..d370eda994047 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -483,13 +483,13 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): y : array-like, shape = [n_samples] Target values. - classes : array-like, shape = [n_classes], optional (default=None) + classes : array-like, shape = [n_classes], (default=None) List of all the classes that can possibly appear in the y vector. Must be provided at the first call to partial_fit, can be omitted in subsequent calls. - sample_weight : array-like, shape = [n_samples], optional (default=None) + sample_weight : array-like, shape = [n_samples], (default=None) Weights applied to individual samples (1. for unweighted). Returns @@ -554,7 +554,7 @@ def fit(self, X, y, sample_weight=None): y : array-like, shape = [n_samples] Target values. - sample_weight : array-like, shape = [n_samples], optional (default=None) + sample_weight : array-like, shape = [n_samples], (default=None) Weights applied to individual samples (1. for unweighted). 
Returns diff --git a/sklearn/neighbors/approximate.py b/sklearn/neighbors/approximate.py index c6f602979ea1b..3a4e1c3868c95 100644 --- a/sklearn/neighbors/approximate.py +++ b/sklearn/neighbors/approximate.py @@ -93,7 +93,7 @@ class GaussianRandomProjectionHash(ProjectionToHashMixin, GaussianRandomProjection): """Use GaussianRandomProjection to produce a cosine LSH fingerprint""" def __init__(self, - n_components=8, + n_components=32, random_state=None): super(GaussianRandomProjectionHash, self).__init__( n_components=n_components, diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index b62e78e87c223..20ec4b132fc7f 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -13,7 +13,8 @@ from sklearn.multiclass import OneVsRestClassifier from sklearn.multiclass import OneVsOneClassifier from sklearn.multiclass import OutputCodeClassifier -from sklearn.utils.multiclass import check_classification_targets, type_of_target +from sklearn.utils.multiclass import (check_classification_targets, + type_of_target) from sklearn.utils import shuffle from sklearn.metrics import precision_score @@ -104,6 +105,10 @@ def test_ovr_partial_fit(): pred1 = ovr1.fit(X, y).predict(X) assert_equal(np.mean(pred == y), np.mean(pred1 == y)) + # test partial_fit only exists if estimator has it: + ovr = OneVsRestClassifier(SVC()) + assert_false(hasattr(ovr, "partial_fit")) + def test_ovr_partial_fit_exceptions(): ovr = OneVsRestClassifier(MultinomialNB()) @@ -428,7 +433,8 @@ def test_ovr_pipeline(): def test_ovr_coef_(): - for base_classifier in [SVC(kernel='linear', random_state=0), LinearSVC(random_state=0)]: + for base_classifier in [SVC(kernel='linear', random_state=0), + LinearSVC(random_state=0)]: # SVC has sparse coef with sparse input data ovr = OneVsRestClassifier(base_classifier) @@ -439,7 +445,8 @@ def test_ovr_coef_(): assert_equal(shape[0], n_classes) assert_equal(shape[1], iris.data.shape[1]) # don't densify sparse coefficients - assert_equal(sp.issparse(ovr.estimators_[0].coef_), sp.issparse(ovr.coef_)) + assert_equal(sp.issparse(ovr.estimators_[0].coef_), + sp.issparse(ovr.coef_)) def test_ovr_coef_exceptions(): @@ -508,6 +515,10 @@ def test_ovo_partial_fit_predict(): assert_equal(len(ovo1.estimators_), len(np.unique(iris.target))) assert_greater(np.mean(iris.target == pred1), 0.65) + # test partial_fit only exists if estimator has it: + ovr = OneVsOneClassifier(SVC()) + assert_false(hasattr(ovr, "partial_fit")) + def test_ovo_decision_function(): n_samples = iris.data.shape[0] @@ -606,6 +617,24 @@ def test_ovo_string_y(): assert_array_equal(y, ovo.predict(X)) +def test_ovo_one_class(): + # Test error for OvO with one class + X = np.eye(4) + y = np.array(['a'] * 4) + + ovo = OneVsOneClassifier(LinearSVC()) + assert_raise_message(ValueError, "when only one class", ovo.fit, X, y) + + +def test_ovo_float_y(): + # Test that the OvO errors on float targets + X = iris.data + y = iris.data[:, 0] + + ovo = OneVsOneClassifier(LinearSVC()) + assert_raise_message(ValueError, "Unknown label type", ovo.fit, X, y) + + def test_ecoc_exceptions(): ecoc = OutputCodeClassifier(LinearSVC(random_state=0)) assert_raises(ValueError, ecoc.predict, []) @@ -634,6 +663,15 @@ def test_ecoc_gridsearch(): assert_true(best_C in Cs) +def test_ecoc_float_y(): + # Test that the OCC errors on float targets + X = iris.data + y = iris.data[:, 0] + + ovo = OutputCodeClassifier(LinearSVC()) + assert_raise_message(ValueError, "Unknown label type", ovo.fit, X, y) + + def 
test_pairwise_indices(): clf_precomputed = svm.SVC(kernel='precomputed') X, y = iris.data, iris.target diff --git a/sklearn/tests/test_multioutput.py b/sklearn/tests/test_multioutput.py index 163363155ca3d..a4217bea63a7c 100644 --- a/sklearn/tests/test_multioutput.py +++ b/sklearn/tests/test_multioutput.py @@ -6,6 +6,7 @@ from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_false from sklearn.utils.testing import assert_raises_regex +from sklearn.utils.testing import assert_raise_message from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_not_equal @@ -336,3 +337,5 @@ def test_multi_output_exceptions(): y_new = np.column_stack((y1, y2)) moc.fit(X, y) assert_raises(ValueError, moc.score, X, y_new) + # ValueError when y is continuous + assert_raise_message(ValueError, "Unknown label type", moc.fit, X, X[:, 1]) diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 2a2cfe1c30fbf..2d3c80510db0d 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -23,6 +23,7 @@ from ..utils.fixes import bincount from ..utils.fixes import array_equal + def _unique_multiclass(y): if hasattr(y, '__array__'): return np.unique(np.asarray(y)) @@ -155,6 +156,7 @@ def is_multilabel(y): return len(labels) < 3 and (y.dtype.kind in 'biu' or # bool, int, uint _is_integral_float(labels)) + def check_classification_targets(y): """Ensure that target y is of a non-regression type. @@ -168,11 +170,10 @@ def check_classification_targets(y): """ y_type = type_of_target(y) if y_type not in ['binary', 'multiclass', 'multiclass-multioutput', - 'multilabel-indicator', 'multilabel-sequences']: + 'multilabel-indicator', 'multilabel-sequences']: raise ValueError("Unknown label type: %r" % y_type) - def type_of_target(y): """Determine the type of data indicated by target `y` From c99b9ecf988198c1a4a1928ade926a35a5000e1d Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 19 Dec 2016 17:16:39 -0500 Subject: [PATCH 02/20] minor fixes to whatsnew --- doc/whats_new.rst | 49 ++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 0f0fa26918445..9b0630711289a 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -183,9 +183,10 @@ Bug fixes ``download_if_missing`` keyword. This was fixed in :issue:`7944` by :user:`Ralf Gommers `. - - Fix a bug regarding fitting :class:`sklearn.cluster.KMeans` with a - sparse array X and initial centroids, where X's means were unnecessarily - being subtracted from the centroids. :issue:`7872` by `Josh Karnofsky `_. + - Fix a bug regarding fitting :class:`sklearn.cluster.KMeans` with a sparse + array X and initial centroids, where X's means were unnecessarily being + subtracted from the centroids. :issue:`7872` by `Josh Karnofsky + `_. - Fix estimators to accept a ``sample_weight`` parameter of type ``pandas.Series`` in their ``fit`` function. :issue:`7825` by @@ -208,13 +209,16 @@ Bug fixes :class:`sklearn.ensemble.GradientBoostingRegressor` ignored the ``min_impurity_split`` parameter. :issue:`8006` by :user:`Sebastian Pölsterl `. - - Fixes to the input validation in :class:`sklearn.covariance.EllipticEnvelope` by - `Andreas Müller`_. - - Fix shape output shape of :class:`sklearn.decomposition.DictionaryLearning` transform - for one-dimensional data by `Andreas Müller`_. 
+ - Fixes to the input validation in + :class:`sklearn.covariance.EllipticEnvelope` by `Andreas Müller`_. + + - Fix shape output shape of + :class:`sklearn.decomposition.DictionaryLearning` transform for + one-dimensional data by `Andreas Müller`_. - - Several fixes to input validation in :class:`multiclass.OutputCodeClassifier` by `Andreas Müller`_ + - Several fixes to input validation in + :class:`multiclass.OutputCodeClassifier` by `Andreas Müller`_ - Fix a bug where :class:`sklearn.ensemble.gradient_boosting.QuantileLossFunction` computed @@ -278,29 +282,26 @@ API changes summary - Gradient boosting base models are not longer estimators. By `Andreas Müller`_. - - `feature_extraction.text.TfidfTransformer` now supports numpy arrays as inputs, and produces numpy - arrays for list inputs and numpy array inputs. By `Andreas `Müller_. + - :class:`feature_extraction.text.TfidfTransformer` now supports numpy + arrays as inputs, and produces numpy arrays for list inputs and numpy + array inputs. By `Andreas `Müller_. - - `feature_selection.SelectFromModel` now validates the ``threshold`` + - :class:`feature_selection.SelectFromModel` now validates the ``threshold`` parameter and sets the ``threshold_`` attribute during the call to - ``fit``, and no longer during the call to ``transform```, by `Andreas Müller`_. - - - `features_selection.SelectFromModel` now has a ``partial_fit`` method only if the underlying - estimator does. By `Andreas Müller`_. + ``fit``, and no longer during the call to ``transform```, by `Andreas + Müller`_. - - All checks in ``utils.estimator_checks``, in particular :func:`utils.estimator_checks.check_estimator` now - accept estimator instances. All checks apart from ``check_estimator`` do not accept estimator classes any more. - By `Andreas Müller`_. + - :class:`features_selection.SelectFromModel` now has a ``partial_fit`` + method only if the underlying estimator does. By `Andreas Müller`_. - - The ``include_others`` and ``dont_test`` parameters of :func:`utils.testing.all_estimators` are deprecated - and are assumed ``True``, by `Andreas Müller`_. + - :class:`dummy.DummyClassifier` and :class:`dummy.DummyRegressor` now do + input validation on ``X`` and check whether ``X`` and ``y`` are of the + same length, by `Andreas Müller`_. + - :class:`multiclass.OneVsRestClassifier` now has a ``partial_fit`` method + only if the underlying estimator does. By `Andreas Müller`_. - - :class:`dummy.DummyClassifier` and :class:`dummy.DummyRegressor` now do input validation on ``X`` and check - whether ``X`` and ``y`` are of the same length, by `Andreas Müller`_. - - :class:`multiclass.OneVsRestClassifier` now has a ``partial_fit`` method only if the underlying estimator does. - By `Andreas Müller`_. .. _changes_0_18_1: Version 0.18.1 From 534b0c5a16ca30f2c9338eaa8144724b4231d1c7 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 19 Dec 2016 17:17:51 -0500 Subject: [PATCH 03/20] typo in whatsnew --- doc/whats_new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 9b0630711289a..c31569d8f7a60 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -284,7 +284,7 @@ API changes summary - :class:`feature_extraction.text.TfidfTransformer` now supports numpy arrays as inputs, and produces numpy arrays for list inputs and numpy - array inputs. By `Andreas `Müller_. + array inputs. By `Andreas Müller_`. 
- :class:`feature_selection.SelectFromModel` now validates the ``threshold`` parameter and sets the ``threshold_`` attribute during the call to From c7cd00dd54e385c72aab467a493949ea0437633b Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 19 Dec 2016 17:41:54 -0500 Subject: [PATCH 04/20] add test for n_components = 1 transform in dict learning --- sklearn/decomposition/tests/test_dict_learning.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_dict_learning.py b/sklearn/decomposition/tests/test_dict_learning.py index b7ed5c4703492..9df3528d33443 100644 --- a/sklearn/decomposition/tests/test_dict_learning.py +++ b/sklearn/decomposition/tests/test_dict_learning.py @@ -28,7 +28,12 @@ def test_dict_learning_shapes(): n_components = 5 dico = DictionaryLearning(n_components, random_state=0).fit(X) - assert_true(dico.components_.shape == (n_components, n_features)) + assert_equal(dico.components_.shape, (n_components, n_features)) + + n_components = 1 + dico = DictionaryLearning(n_components, random_state=0).fit(X) + assert_equal(dico.components_.shape, (n_components, n_features)) + assert_equal(dico.transform(X).shape, (X.shape[0], n_components)) def test_dict_learning_overcomplete(): From 91559ce20738cb26419ee9727fd3fc6dbdfc1e0b Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Tue, 20 Dec 2016 11:45:39 -0500 Subject: [PATCH 05/20] feature extraction doc fix --- doc/modules/feature_extraction.rst | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/doc/modules/feature_extraction.rst b/doc/modules/feature_extraction.rst index 4995177705c1d..009b58dcfdfa9 100644 --- a/doc/modules/feature_extraction.rst +++ b/doc/modules/feature_extraction.rst @@ -450,7 +450,7 @@ Let's take an example with the following counts. The first term is present 100% of the time hence not very interesting. The two other features only in less than 50% of the time hence probably more representative of the content of the documents:: - + >>> counts = [[3, 0, 1], ... [2, 0, 0], ... [3, 0, 0], @@ -460,10 +460,6 @@ content of the documents:: ... >>> tfidf = transformer.fit_transform(counts) >>> tfidf # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS - <6x3 sparse matrix of type '<... 'numpy.float64'>' - with 9 stored elements in Compressed Sparse ... format> - - >>> tfidf.toarray() # doctest: +ELLIPSIS array([[ 0.81940995, 0. , 0.57320793], [ 1. , 0. , 0. ], [ 1. , 0. , 0. ], From 6ee218db1a3571711f1835b1258c874665cb58e4 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Sat, 25 Feb 2017 13:39:40 -0500 Subject: [PATCH 06/20] fix broken test --- doc/modules/feature_extraction.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/feature_extraction.rst b/doc/modules/feature_extraction.rst index 009b58dcfdfa9..b7a7755ebcfb6 100644 --- a/doc/modules/feature_extraction.rst +++ b/doc/modules/feature_extraction.rst @@ -519,7 +519,7 @@ And the L2-normalized tf-idf changes to = [0.8515, 0, 0.5243]`:: >>> transformer = TfidfTransformer() - >>> transformer.fit_transform(counts).toarray() + >>> transformer.fit_transform(counts) array([[ 0.85151335, 0. , 0.52433293], [ 1. , 0. , 0. ], [ 1. , 0. , 0. 
], From 27775e9aa363dc21872b687c17ffae1ad301750c Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 15 May 2017 17:42:28 -0400 Subject: [PATCH 07/20] revert aggressive input validation changes --- doc/whats_new.rst | 4 ---- sklearn/dummy.py | 7 +++---- sklearn/feature_extraction/text.py | 30 +++++++++++++++--------------- sklearn/naive_bayes.py | 4 ++-- 4 files changed, 20 insertions(+), 25 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index c31569d8f7a60..e181640e25724 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -282,10 +282,6 @@ API changes summary - Gradient boosting base models are not longer estimators. By `Andreas Müller`_. - - :class:`feature_extraction.text.TfidfTransformer` now supports numpy - arrays as inputs, and produces numpy arrays for list inputs and numpy - array inputs. By `Andreas Müller_`. - - :class:`feature_selection.SelectFromModel` now validates the ``threshold`` parameter and sets the ``threshold_`` attribute during the call to ``fit``, and no longer during the call to ``transform```, by `Andreas diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 0f01d18cb2b9c..ddee7167272ab 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -10,7 +10,7 @@ from .base import BaseEstimator, ClassifierMixin, RegressorMixin from .utils import check_random_state -from .utils.validation import check_array, check_X_y +from .utils.validation import check_array from .utils.validation import check_consistent_length from .utils.validation import check_is_fitted from .utils.random import random_choice_csc @@ -117,7 +117,6 @@ def fit(self, X, y, sample_weight=None): self.sparse_output_ = sp.issparse(y) - X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) check_consistent_length(X, y) if not self.sparse_output_: @@ -398,8 +397,8 @@ def fit(self, X, y, sample_weight=None): "'mean', 'median', 'quantile' or 'constant'" % self.strategy) - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - multi_output=True) + y = check_array(y, ensure_2d=False) + if len(y) == 0: raise ValueError("y must not be empty.") diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 72676b69a36fc..d3d4206d44c9c 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -29,8 +29,9 @@ from ..preprocessing import normalize from .hashing import FeatureHasher from .stop_words import ENGLISH_STOP_WORDS +from ..utils import deprecated from ..utils.fixes import frombuffer_empty, bincount -from ..utils.validation import check_is_fitted, check_array +from ..utils.validation import check_is_fitted __all__ = ['CountVectorizer', 'ENGLISH_STOP_WORDS', @@ -158,7 +159,8 @@ def _char_wb_ngrams(self, text_document): """Whitespace sensitive char-n-gram tokenization. Tokenize text_document into a sequence of character n-grams - excluding any whitespace (operating only inside word boundaries)""" + operating only inside word boundaries. n-grams at the edges + of words are padded with space.""" # normalize white spaces text_document = self._white_spaces.sub(" ", text_document) @@ -353,7 +355,7 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin): analyzer : string, {'word', 'char', 'char_wb'} or callable Whether the feature should be made of word or character n-grams. Option 'char_wb' creates character n-grams only from text inside - word boundaries. + word boundaries; n-grams at the edges of words are padded with space. 
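
        For instance, shown via ``CountVectorizer``, which shares this
        analyzer (an editor's illustration, not part of the diff):

        >>> from sklearn.feature_extraction.text import CountVectorizer
        >>> v = CountVectorizer(analyzer='char_wb', ngram_range=(3, 3))
        >>> sorted(v.fit(['ab cd']).get_feature_names())
        [' ab', ' cd', 'ab ', 'cd ']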
If a callable is passed it is used to extract the sequence of features out of the raw, unprocessed input. @@ -552,7 +554,7 @@ class CountVectorizer(BaseEstimator, VectorizerMixin): analyzer : string, {'word', 'char', 'char_wb'} or callable Whether the feature should be made of word or character n-grams. Option 'char_wb' creates character n-grams only from text inside - word boundaries. + word boundaries; n-grams at the edges of words are padded with space. If a callable is passed it is used to extract the sequence of features out of the raw, unprocessed input. @@ -1022,8 +1024,7 @@ def fit(self, X, y=None): a matrix of term/token counts """ if not sp.issparse(X): - X = sp.csc_matrix(X, dtype=np.float64) - X = check_array(X, accept_sparse=["csc", "csr"]) + X = sp.csc_matrix(X) if self.use_idf: n_samples, n_features = X.shape df = _document_frequency(X) @@ -1056,19 +1057,18 @@ def transform(self, X, copy=True): ------- vectors : sparse matrix, [n_samples, n_features] """ - X = check_array(X, accept_sparse=["csr"], copy=copy, - dtype=[np.float64, np.float32]) + if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float): + # preserve float family dtype + X = sp.csr_matrix(X, copy=copy) + else: + # convert counts or binary occurrences to floats + X = sp.csr_matrix(X, dtype=np.float64, copy=copy) n_samples, n_features = X.shape if self.sublinear_tf: - if sp.issparse(X): - np.log(X.data, X.data) - X.data += 1 - else: - mask = X != 0 - X[mask] = np.log(X[mask]) - X[mask] += 1 + np.log(X.data, X.data) + X.data += 1 if self.use_idf: check_is_fitted(self, '_idf_diag', 'idf vector is not fitted') diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index d370eda994047..4f8a94115dc5a 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -483,13 +483,13 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): y : array-like, shape = [n_samples] Target values. - classes : array-like, shape = [n_classes], (default=None) + classes : array-like, shape = [n_classes] (default=None) List of all the classes that can possibly appear in the y vector. Must be provided at the first call to partial_fit, can be omitted in subsequent calls. - sample_weight : array-like, shape = [n_samples], (default=None) + sample_weight : array-like, shape = [n_samples] (default=None) Weights applied to individual samples (1. for unweighted). Returns From f4c9d60a930389a75a8a5076aba591d4046b67b9 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 15 May 2017 17:42:55 -0400 Subject: [PATCH 08/20] in SelectFromModel, don't store threshold_ in transform. If we called "fit", use estimates from last "fit". --- sklearn/feature_selection/from_model.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/sklearn/feature_selection/from_model.py b/sklearn/feature_selection/from_model.py index e27c0bd267bf9..262484c1ad1c0 100644 --- a/sklearn/feature_selection/from_model.py +++ b/sklearn/feature_selection/from_model.py @@ -129,20 +129,18 @@ def __init__(self, estimator, threshold=None, prefit=False, norm_order=1): self.norm_order = norm_order def _get_support_mask(self): + if hasattr(self, "_mask"): + return self._mask # SelectFromModel can directly call on transform. 
if self.prefit: estimator = self.estimator - elif hasattr(self, 'estimator_'): - estimator = self.estimator_ else: raise ValueError( - 'Either fit the model before transform or set "prefit=True"' - ' while passing the fitted estimator to the constructor.') - # XXX duplicate computation if we called fit before + 'Either fit SelectFromModel before transform or set "prefit=' + 'True" and pass a fitted estimator to the constructor.') scores = _get_feature_importances(estimator, self.norm_order) - self.threshold_ = _calculate_threshold(estimator, scores, - self.threshold) - return scores >= self.threshold_ + threshold_ = _calculate_threshold(estimator, scores, self.threshold) + return scores >= threshold_ def fit(self, X, y=None, **fit_params): """Fit the SelectFromModel meta-transformer. @@ -171,6 +169,7 @@ def fit(self, X, y=None, **fit_params): scores = _get_feature_importances(self.estimator_, self.norm_order) self.threshold_ = _calculate_threshold(self.estimator, scores, self.threshold) + self._mask = scores >= self.threshold_ return self @if_delegate_has_method('estimator') From 30bdd041a261823ed6ac08028129a601a31c13ac Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 15 May 2017 17:48:40 -0400 Subject: [PATCH 09/20] move score from EllipticEnvelope to OutlierDetectionMixin --- sklearn/covariance/outlier_detection.py | 52 ++++++++++++------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/sklearn/covariance/outlier_detection.py b/sklearn/covariance/outlier_detection.py index 3349f71af42d2..7cfa84d880422 100644 --- a/sklearn/covariance/outlier_detection.py +++ b/sklearn/covariance/outlier_detection.py @@ -101,6 +101,32 @@ def predict(self, X): return is_inlier + def score(self, X, y, sample_weight=None): + """Returns the mean accuracy on the given test data and labels. + + In multi-label classification, this is the subset accuracy + which is a harsh metric since you require for each sample that + each label set be correctly predicted. + + Parameters + ---------- + X : array-like, shape = (n_samples, n_features) + Test samples. + + y : array-like, shape = (n_samples) or (n_samples, n_outputs) + True labels for X. + + sample_weight : array-like, shape = [n_samples], optional + Sample weights. + + Returns + ------- + score : float + Mean accuracy of self.predict(X) wrt. y. + + """ + return accuracy_score(y, self.predict(X), sample_weight=sample_weight) + class EllipticEnvelope(OutlierDetectionMixin, MinCovDet): """An object for detecting outliers in a Gaussian distributed dataset. @@ -177,29 +203,3 @@ def fit(self, X, y=None): self.threshold_ = sp.stats.scoreatpercentile( self.dist_, 100. * (1. - self.contamination)) return self - - def score(self, X, y, sample_weight=None): - """Returns the mean accuracy on the given test data and labels. - - In multi-label classification, this is the subset accuracy - which is a harsh metric since you require for each sample that - each label set be correctly predicted. - - Parameters - ---------- - X : array-like, shape = (n_samples, n_features) - Test samples. - - y : array-like, shape = (n_samples) or (n_samples, n_outputs) - True labels for X. - - sample_weight : array-like, shape = [n_samples], optional - Sample weights. - - Returns - ------- - score : float - Mean accuracy of self.predict(X) wrt. y. 
- - """ - return accuracy_score(y, self.predict(X), sample_weight=sample_weight) From 5ed1174f666afcf6a3a30f2605529204ffce8833 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 15 May 2017 17:50:07 -0400 Subject: [PATCH 10/20] revert changes to Tfidf documentation --- doc/modules/feature_extraction.rst | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/modules/feature_extraction.rst b/doc/modules/feature_extraction.rst index f4e59fbc63bd9..32e53f0817e6e 100644 --- a/doc/modules/feature_extraction.rst +++ b/doc/modules/feature_extraction.rst @@ -450,7 +450,7 @@ Let's take an example with the following counts. The first term is present 100% of the time hence not very interesting. The two other features only in less than 50% of the time hence probably more representative of the content of the documents:: - + >>> counts = [[3, 0, 1], ... [2, 0, 0], ... [3, 0, 0], @@ -460,6 +460,10 @@ content of the documents:: ... >>> tfidf = transformer.fit_transform(counts) >>> tfidf # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + <6x3 sparse matrix of type '<... 'numpy.float64'>' + with 9 stored elements in Compressed Sparse ... format> + + >>> tfidf.toarray() # doctest: +ELLIPSIS array([[ 0.81940995, 0. , 0.57320793], [ 1. , 0. , 0. ], [ 1. , 0. , 0. ], @@ -519,7 +523,7 @@ And the L2-normalized tf-idf changes to = [0.8515, 0, 0.5243]`:: >>> transformer = TfidfTransformer() - >>> transformer.fit_transform(counts) + >>> transformer.fit_transform(counts).toarray() array([[ 0.85151335, 0. , 0.52433293], [ 1. , 0. , 0. ], [ 1. , 0. , 0. ], From adee7a342ff2c86ed4dabeb4320be870a4cc681d Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 15 May 2017 17:52:58 -0400 Subject: [PATCH 11/20] remove dummy input validation from whatsnew --- doc/whats_new.rst | 4 ---- 1 file changed, 4 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 63a8c960826bd..441ae13bf5238 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -351,10 +351,6 @@ API changes summary - :class:`features_selection.SelectFromModel` now has a ``partial_fit`` method only if the underlying estimator does. By `Andreas Müller`_. - - :class:`dummy.DummyClassifier` and :class:`dummy.DummyRegressor` now do - input validation on ``X`` and check whether ``X`` and ``y`` are of the - same length, by `Andreas Müller`_. - - :class:`multiclass.OneVsRestClassifier` now has a ``partial_fit`` method only if the underlying estimator does. By `Andreas Müller`_. From a83697f5f7a9e8392d2cde551db81b636433447f Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 15 May 2017 18:23:26 -0400 Subject: [PATCH 12/20] fix text feature tests --- .travis.yml | 2 +- doc/whats_new.rst | 13 ++++++------- sklearn/feature_extraction/tests/test_text.py | 10 ++++------ 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/.travis.yml b/.travis.yml index 5ba455625c313..6a513d4a0f3d9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -35,7 +35,7 @@ env: - DISTRIB="conda" PYTHON_VERSION="3.6" INSTALL_MKL="true" NUMPY_VERSION="1.11.2" SCIPY_VERSION="0.18.1" PANDAS_VERSION="0.19.1" CYTHON_VERSION="0.25.2" COVERAGE=true - # This environment use pytest to run the tests. It uses the newest + # This environment uses pytest to run the tests. It uses the newest # supported anaconda env. It also runs tests requiring Pandas. 
- USE_PYTEST="true" DISTRIB="conda" PYTHON_VERSION="3.6" INSTALL_MKL="true" NUMPY_VERSION="1.11.2" SCIPY_VERSION="0.18.1" PANDAS_VERSION="0.19.1" diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 441ae13bf5238..56665be7caed3 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -206,23 +206,22 @@ Bug fixes - Fixed a bug where :class:`sklearn.linear_model.LassoLars` does not give the same result as the LassoLars implementation available - in R (lars library). :issue:`7849` by :user:`Jair Montoya Martinez ` in R (lars library). :issue:`7849` by `Jair Montoya Martinez`_ - Some ``fetch_`` functions in `sklearn.datasets` were ignoring the ``download_if_missing`` keyword. This was fixed in :issue:`7944` by :user:`Ralf Gommers `. - - Fix a bug regarding fitting :class:`sklearn.cluster.KMeans` with a sparse - array X and initial centroids, where X's means were unnecessarily being - subtracted from the centroids. :issue:`7872` by `Josh Karnofsky - `_. - - Fixed a bug in :class:`sklearn.ensemble.GradientBoostingClassifier` and :class:`sklearn.ensemble.GradientBoostingRegressor` where a float being compared to ``0.0`` using ``==`` caused a divide by zero error. This was fixed in :issue:`7970` by :user:`He Chen `. + - Fix a bug regarding fitting :class:`sklearn.cluster.KMeans` with a sparse + array X and initial centroids, where X's means were unnecessarily being + subtracted from the centroids. :issue:`7872` by `Josh Karnofsky + `_. + - Fix estimators to accept a ``sample_weight`` parameter of type ``pandas.Series`` in their ``fit`` function. :issue:`7825` by `Kathleen Chen`_. @@ -5107,4 +5106,4 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson. .. _Anish Shah: https://github.com/AnishShah .. _Neeraj Gangwar: http://neerajgangwar.in -.. _Arthur Mensch: https://amensch.fr \ No newline at end of file +.. 
_Arthur Mensch: https://amensch.fr diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index ab8d9d39aadc2..341486abd3b1c 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -35,7 +35,6 @@ from functools import partial import pickle from io import StringIO -from scipy import sparse JUNK_FOOD_DOCS = ( @@ -310,7 +309,7 @@ def test_tf_idf_smoothing(): [1, 1, 0], [1, 0, 0]] tr = TfidfTransformer(smooth_idf=True, norm='l2') - tfidf = tr.fit_transform(X) + tfidf = tr.fit_transform(X).toarray() assert_true((tfidf >= 0).all()) # check normalization @@ -321,7 +320,7 @@ def test_tf_idf_smoothing(): [1, 1, 0], [1, 0, 0]] tr = TfidfTransformer(smooth_idf=True, norm='l2') - tfidf = tr.fit_transform(X) + tfidf = tr.fit_transform(X).toarray() assert_true((tfidf >= 0).all()) @@ -330,7 +329,7 @@ def test_tfidf_no_smoothing(): [1, 1, 0], [1, 0, 0]] tr = TfidfTransformer(smooth_idf=False, norm='l2') - tfidf = tr.fit_transform(X) + tfidf = tr.fit_transform(X).toarray() assert_true((tfidf >= 0).all()) # check normalization @@ -341,7 +340,6 @@ def test_tfidf_no_smoothing(): X = [[1, 1, 0], [1, 1, 0], [1, 0, 0]] - X = sparse.csr_matrix(X) tr = TfidfTransformer(smooth_idf=False, norm='l2') clean_warning_registry() @@ -359,7 +357,7 @@ def test_tfidf_no_smoothing(): def test_sublinear_tf(): X = [[1], [2], [3]] tr = TfidfTransformer(sublinear_tf=True, use_idf=False, norm=None) - tfidf = tr.fit_transform(X) + tfidf = tr.fit_transform(X).toarray() assert_equal(tfidf[0], 1) assert_greater(tfidf[1], tfidf[0]) assert_greater(tfidf[2], tfidf[1]) From 9ce47472c61ddf390f11d0ad19c2b2744c493002 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 15 May 2017 18:34:34 -0400 Subject: [PATCH 13/20] rewrite from_model threshold again... --- sklearn/feature_selection/from_model.py | 16 ++++++++++------ .../feature_selection/tests/test_from_model.py | 3 ++- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/sklearn/feature_selection/from_model.py b/sklearn/feature_selection/from_model.py index 262484c1ad1c0..b1993f4eb4088 100644 --- a/sklearn/feature_selection/from_model.py +++ b/sklearn/feature_selection/from_model.py @@ -134,13 +134,16 @@ def _get_support_mask(self): # SelectFromModel can directly call on transform. if self.prefit: estimator = self.estimator + elif hasattr(self, 'estimator_'): + estimator = self.estimator_ else: raise ValueError( 'Either fit SelectFromModel before transform or set "prefit=' 'True" and pass a fitted estimator to the constructor.') scores = _get_feature_importances(estimator, self.norm_order) - threshold_ = _calculate_threshold(estimator, scores, self.threshold) - return scores >= threshold_ + threshold = _calculate_threshold(self.estimator, scores, + self.threshold) + return scores >= threshold def fit(self, X, y=None, **fit_params): """Fit the SelectFromModel meta-transformer. 
@@ -166,12 +169,13 @@ def fit(self, X, y=None, **fit_params):
                 "Since 'prefit=True', call transform directly")
         self.estimator_ = clone(self.estimator)
         self.estimator_.fit(X, y, **fit_params)
-        scores = _get_feature_importances(self.estimator_, self.norm_order)
-        self.threshold_ = _calculate_threshold(self.estimator, scores,
-                                               self.threshold)
-        self._mask = scores >= self.threshold_
         return self

+    @property
+    def threshold_(self):
+        scores = _get_feature_importances(self.estimator_, self.norm_order)
+        return _calculate_threshold(self.estimator, scores, self.threshold)
+
     @if_delegate_has_method('estimator')
     def partial_fit(self, X, y=None, **fit_params):
         """Fit the SelectFromModel meta-transformer only once.
diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py
index 9beeef78a17be..fc64f13723f8d 100644
--- a/sklearn/feature_selection/tests/test_from_model.py
+++ b/sklearn/feature_selection/tests/test_from_model.py
@@ -28,7 +28,8 @@ def test_invalid_input():
     clf = SGDClassifier(alpha=0.1, n_iter=10, shuffle=True, random_state=None)
     for threshold in ["gobbledigook", ".5 * gobbledigook"]:
         model = SelectFromModel(clf, threshold=threshold)
-        assert_raises(ValueError, model.fit, data, y)
+        model.fit(data, y)
+        assert_raises(ValueError, model.transform, data)


 def test_input_estimator_unchanged():

From f727e899353e6c4c9529b5c0fc685268f821e14e Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Tue, 16 May 2017 17:42:17 -0400
Subject: [PATCH 14/20] remove stray condition

---
 sklearn/feature_selection/from_model.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/sklearn/feature_selection/from_model.py b/sklearn/feature_selection/from_model.py
index b1993f4eb4088..f221d6d749457 100644
--- a/sklearn/feature_selection/from_model.py
+++ b/sklearn/feature_selection/from_model.py
@@ -129,8 +129,6 @@ def __init__(self, estimator, threshold=None, prefit=False, norm_order=1):
         self.norm_order = norm_order

     def _get_support_mask(self):
-        if hasattr(self, "_mask"):
-            return self._mask
         # SelectFromModel can directly call on transform.
         if self.prefit:
             estimator = self.estimator

From 8bbb7424a61639bba62e4125a7de5c390d2f4b3e Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Wed, 17 May 2017 16:23:56 -0400
Subject: [PATCH 15/20] fix self.estimator -> estimator, slightly more
 interesting test

---
 sklearn/feature_selection/from_model.py            | 3 +--
 sklearn/feature_selection/tests/test_from_model.py | 4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/sklearn/feature_selection/from_model.py b/sklearn/feature_selection/from_model.py
index f221d6d749457..dada33e9a75cc 100644
--- a/sklearn/feature_selection/from_model.py
+++ b/sklearn/feature_selection/from_model.py
@@ -139,8 +139,7 @@ def _get_support_mask(self):
                 'Either fit SelectFromModel before transform or set "prefit='
                 'True" and pass a fitted estimator to the constructor.')
         scores = _get_feature_importances(estimator, self.norm_order)
-        threshold = _calculate_threshold(self.estimator, scores,
-                                         self.threshold)
+        threshold = _calculate_threshold(estimator, scores, self.threshold)
         return scores >= threshold

     def fit(self, X, y=None, **fit_params):
diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py
index fc64f13723f8d..6ef0d824b587c 100644
--- a/sklearn/feature_selection/tests/test_from_model.py
+++ b/sklearn/feature_selection/tests/test_from_model.py
@@ -176,10 +176,10 @@ def test_threshold_string():
 def test_threshold_without_refitting():
     """Test that the threshold can be set without refitting the model."""
     clf = SGDClassifier(alpha=0.1, n_iter=10, shuffle=True, random_state=0)
-    model = SelectFromModel(clf, threshold=0.1)
+    model = SelectFromModel(clf, threshold="0.1 * mean")
     model.fit(data, y)
     X_transform = model.transform(data)

     # Set a higher threshold to filter out more features.
-    model.threshold = 1.0
+    model.threshold = "1.0 * mean"
     assert_greater(X_transform.shape[1], model.transform(data).shape[1])

From 746ccdbd89cab7803449d2e6b7b655b3d1d8b6c4 Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Wed, 17 May 2017 16:26:27 -0400
Subject: [PATCH 16/20] typo in comment

---
 sklearn/feature_extraction/text.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index d3d4206d44c9c..539e88973bcc0 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -29,7 +29,6 @@
 from ..preprocessing import normalize
 from .hashing import FeatureHasher
 from .stop_words import ENGLISH_STOP_WORDS
-from ..utils import deprecated
 from ..utils.fixes import frombuffer_empty, bincount
 from ..utils.validation import check_is_fitted
@@ -1089,7 +1088,7 @@ def transform(self, X, copy=True):
     @property
     def idf_(self):
         # if _idf_diag is not set, this will raise an attribute error,
-        # which means hasatt(self, "idf_") is False
+        # which means hasattr(self, "idf_") is False
         return np.ravel(self._idf_diag.sum(axis=0))

From 9564e0f04cf31901d96e5edf34480f7522c0b740 Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Mon, 5 Jun 2017 13:37:56 +0200
Subject: [PATCH 17/20] Fix issues in SparseEncoder, add tests.
more explicit explanation of SparseEncoder change, add issue numbers to whatsnew

---
 doc/whats_new.rst                                 | 14 +++++++++-----
 sklearn/decomposition/dict_learning.py            | 12 ++++++++----
 sklearn/decomposition/tests/test_dict_learning.py | 13 +++++++++++++
 3 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 56665be7caed3..67d581bd1c6fe 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -245,14 +245,18 @@ Bug fixes
      :issue:`8006` by :user:`Sebastian Pölsterl `.

    - Fixes to the input validation in
-     :class:`sklearn.covariance.EllipticEnvelope` by `Andreas Müller`_.
+     :class:`sklearn.covariance.EllipticEnvelope`.
+     :issue:`8086` by `Andreas Müller`_.

-   - Fix shape output shape of
-     :class:`sklearn.decomposition.DictionaryLearning` transform for
-     one-dimensional data by `Andreas Müller`_.
+   - Fix output shape and bugs with n_jobs > 1 in
+     :class:`sklearn.decomposition.SparseEncoder` transform and :func:`sklearn.decomposition.sparse_encode`
+     for one-dimensional data and one component.
+     This also impacts the output shape of :class:`sklearn.decomposition.DictionaryLearning`.
+     :issue:`8086` by `Andreas Müller`_.

    - Several fixes to input validation in
-     :class:`multiclass.OutputCodeClassifier` by `Andreas Müller`_
+     :class:`multiclass.OutputCodeClassifier`
+     :issue:`8086` by `Andreas Müller`_.

    - Fix a bug where
      :class:`sklearn.ensemble.gradient_boosting.QuantileLossFunction` computed
diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py
index 9a302196526d8..b9bb0fcea864c 100644
--- a/sklearn/decomposition/dict_learning.py
+++ b/sklearn/decomposition/dict_learning.py
@@ -94,6 +94,11 @@ def _sparse_encode(X, dictionary, gram, cov=None, algorithm='lasso_lars',
     if X.ndim == 1:
         X = X[:, np.newaxis]
     n_samples, n_features = X.shape
+    n_components = dictionary.shape[0]
+    if dictionary.shape[1] != X.shape[1]:
+        raise ValueError("Dictionary and X have different numbers of features: "
+                         "dictionary.shape: {} X.shape: {}".format(
+                             dictionary.shape, X.shape))
     if cov is None and algorithm != 'lasso_cd':
         # overwriting cov is safe
         copy_cov = False
@@ -157,6 +162,8 @@ def _sparse_encode(X, dictionary, gram, cov=None, algorithm='lasso_lars',
         raise ValueError('Sparse coding method must be "lasso_lars" '
                          '"lasso_cd", "lasso", "threshold" or "omp", got %s.'
                         % algorithm)
+    if new_code.ndim != 2:
+        return new_code.reshape(n_samples, n_components)
     return new_code


@@ -281,10 +288,6 @@ def sparse_encode(X, dictionary, gram=None, cov=None, algorithm='lasso_lars',
                               max_iter=max_iter, check_input=False,
                               verbose=verbose)
-        # This ensure that dimensionality of code is always 2,
-        # consistent with the case n_jobs > 1
-        if code.ndim == 1:
-            code = code[:, np.newaxis]
         return code

     # Enter parallel code block
@@ -905,6 +908,7 @@ class SparseCoder(BaseEstimator, SparseCodingMixin):
     MiniBatchSparsePCA
     sparse_encode
     """
+    _required_parameters = ["dictionary"]

     def __init__(self, dictionary, transform_algorithm='omp',
                  transform_n_nonzero_coefs=None, transform_alpha=None,
diff --git a/sklearn/decomposition/tests/test_dict_learning.py b/sklearn/decomposition/tests/test_dict_learning.py
index 9df3528d33443..5bf9836aa6a9e 100644
--- a/sklearn/decomposition/tests/test_dict_learning.py
+++ b/sklearn/decomposition/tests/test_dict_learning.py
@@ -1,4 +1,5 @@
 import numpy as np
+import itertools

 from sklearn.exceptions import ConvergenceWarning

@@ -25,6 +26,18 @@
 X = rng_global.randn(n_samples, n_features)


+def test_sparse_encode_shapes_omp():
+    rng = np.random.RandomState(0)
+    algorithms = ['omp', 'lasso_lars', 'lasso_cd', 'lars', 'threshold']
+    for n_components, n_samples in itertools.product([1, 5], [1, 9]):
+        X_ = rng.randn(n_samples, n_features)
+        dictionary = rng.randn(n_components, n_features)
+        for algorithm, n_jobs in itertools.product(algorithms, [1, 3]):
+            code = sparse_encode(X_, dictionary, algorithm=algorithm,
+                                 n_jobs=n_jobs)
+            assert_equal(code.shape, (n_samples, n_components))
+
+
 def test_dict_learning_shapes():
     n_components = 5
     dico = DictionaryLearning(n_components, random_state=0).fit(X)

From bb7f085eeed0550bbd073fe44c2f6f62a4b553b6 Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Tue, 6 Jun 2017 11:09:13 +0200
Subject: [PATCH 18/20] minor fixes in whats_new.rst

---
 doc/whats_new.rst | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index b3b27f9beac03..c0e874231ca2a 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -213,7 +213,7 @@ Bug fixes
    - Fixed a bug where :class:`sklearn.linear_model.LassoLars` does not give
      the same result as the LassoLars implementation available
-     in R (lars library). :issue:`7849` by `Jair Montoya Martinez`_
+     in R (lars library). :issue:`7849` by :user:`Jair Montoya Martinez `.

    - Some ``fetch_`` functions in `sklearn.datasets` were ignoring the
      ``download_if_missing`` keyword. This was fixed in :issue:`7944` by
@@ -226,8 +226,7 @@ Bug fixes
    - Fix a bug regarding fitting :class:`sklearn.cluster.KMeans` with a sparse
      array X and initial centroids, where X's means were unnecessarily being
-     subtracted from the centroids. :issue:`7872` by `Josh Karnofsky
-     `_.
+     subtracted from the centroids. :issue:`7872` by :user:`Josh Karnofsky `.

    - Fix estimators to accept a ``sample_weight`` parameter of type
      ``pandas.Series`` in their ``fit`` function. :issue:`7825` by
@@ -256,7 +255,7 @@ Bug fixes
      :issue:`8086` by `Andreas Müller`_.

    - Fix output shape and bugs with n_jobs > 1 in
-     :class:`sklearn.decomposition.SparseEncoder` transform and :func:`sklearn.decomposition.sparse_encode`
+     :class:`sklearn.decomposition.SparseCoder` transform and :func:`sklearn.decomposition.sparse_encode`
      for one-dimensional data and one component.
      This also impacts the output shape of :class:`sklearn.decomposition.DictionaryLearning`.
      :issue:`8086` by `Andreas Müller`_.
@@ -360,7 +359,7 @@ API changes summary
      ``fit``, and no longer during the call to ``transform``, by
      `Andreas Müller`_.

-   - :class:`features_selection.SelectFromModel` now has a ``partial_fit``
+   - :class:`feature_selection.SelectFromModel` now has a ``partial_fit``
      method only if the underlying estimator does. By `Andreas Müller`_.

    - :class:`multiclass.OneVsRestClassifier` now has a ``partial_fit`` method

From c18d646b4bf8b27e3fbf47b7d308476ba9ce55e6 Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Tue, 6 Jun 2017 11:09:26 +0200
Subject: [PATCH 19/20] slightly more consistency with tuples for shapes

---
 sklearn/covariance/outlier_detection.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/covariance/outlier_detection.py b/sklearn/covariance/outlier_detection.py
index 7cfa84d880422..9fe219ba5d0b6 100644
--- a/sklearn/covariance/outlier_detection.py
+++ b/sklearn/covariance/outlier_detection.py
@@ -113,10 +113,10 @@ def score(self, X, y, sample_weight=None):
         X : array-like, shape = (n_samples, n_features)
             Test samples.

-        y : array-like, shape = (n_samples) or (n_samples, n_outputs)
+        y : array-like, shape = (n_samples,) or (n_samples, n_outputs)
             True labels for X.

-        sample_weight : array-like, shape = [n_samples], optional
+        sample_weight : array-like, shape = (n_samples,), optional
             Sample weights.

         Returns

From 8a3ea13a47eef41af1e306ed80668c5e3c11159e Mon Sep 17 00:00:00 2001
From: Andreas Mueller
Date: Tue, 6 Jun 2017 13:20:14 +0200
Subject: [PATCH 20/20] not longer typo

---
 doc/whats_new.rst                     | 2 +-
 sklearn/cluster/tests/test_k_means.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index c0e874231ca2a..bb52411e2fba4 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -352,7 +352,7 @@ API changes summary
      :issue:`2879` by :user:`Stephen Hoover `.

-   - Gradient boosting base models are not longer estimators. By `Andreas Müller`_.
+   - Gradient boosting base models are no longer estimators. By `Andreas Müller`_.

    - :class:`feature_selection.SelectFromModel` now validates the ``threshold``
diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py
index 31307e55801a5..38fcff94d7505 100644
--- a/sklearn/cluster/tests/test_k_means.py
+++ b/sklearn/cluster/tests/test_k_means.py
@@ -404,7 +404,7 @@ def test_minibatch_sensible_reassign_partial_fit():
 def test_minibatch_reassign():
     # Give a perfect initialization, but a large reassignment_ratio,
     # as a result all the centers should be reassigned and the model
-    # should not longer be good
+    # should no longer be good
     for this_X in (X, X_csr):
         mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100,
                                      random_state=42)
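
[Editor's note, not part of the patch series] Patches 13-15 above change
SelectFromModel so that ``threshold`` is resolved lazily: ``fit`` no longer
precomputes a mask, ``threshold_`` becomes a property, and an invalid
threshold now surfaces at ``transform`` time. A minimal usage sketch of that
behaviour follows; it assumes a scikit-learn build that includes these
patches, and the ``make_classification`` data setup is illustrative only.

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import SGDClassifier

    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    clf = SGDClassifier(alpha=0.1, n_iter=10, shuffle=True, random_state=0)

    model = SelectFromModel(clf, threshold="0.1 * mean")
    model.fit(X, y)
    X_low = model.transform(X)

    # threshold_ is now computed on demand, so the cutoff can be raised
    # without refitting the underlying estimator:
    model.threshold = "1.0 * mean"
    X_high = model.transform(X)
    assert X_high.shape[1] <= X_low.shape[1]

    # Invalid thresholds are reported at transform time, not at fit time:
    bad = SelectFromModel(clf, threshold="gobbledigook").fit(X, y)
    try:
        bad.transform(X)
    except ValueError as exc:
        print("raised as expected:", exc)

Likewise, patch 17 makes sparse coding always return a 2-d code array, even
for a single sample or a single dictionary atom, with or without
``n_jobs > 1``; a sketch under the same assumptions:

    from sklearn.decomposition import sparse_encode

    rng = np.random.RandomState(0)
    X_one = rng.randn(1, 8)        # a single sample
    dictionary = rng.randn(1, 8)   # a single dictionary atom
    code = sparse_encode(X_one, dictionary, algorithm='omp')
    print(code.shape)              # (1, 1) rather than a 1-d array

Making ``threshold_`` a lazily computed property is what lets the second
``transform`` call above honour the updated ``threshold`` without refitting.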