From 75e04c2a98a16817163deed6cf32a3a584547e7f Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 17 Apr 2020 11:16:51 +0200 Subject: [PATCH 01/13] FIX remove validation from __init__ and set_param --- sklearn/decomposition/_factor_analysis.py | 7 ++- sklearn/decomposition/_fastica.py | 6 +- sklearn/decomposition/_kernel_pca.py | 7 ++- sklearn/feature_extraction/_hash.py | 2 - sklearn/feature_extraction/text.py | 64 ++++++++++++++------ sklearn/linear_model/_stochastic_gradient.py | 2 - sklearn/model_selection/_search.py | 8 ++- sklearn/neighbors/_base.py | 1 - sklearn/neighbors/_classification.py | 10 +-- sklearn/neighbors/_kde.py | 16 ++--- sklearn/pipeline.py | 6 +- sklearn/preprocessing/_label.py | 21 ++++--- sklearn/svm/_base.py | 18 +++--- 13 files changed, 97 insertions(+), 71 deletions(-) diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py index a09b89bda6d6e..c59ce5eb3a0a5 100644 --- a/sklearn/decomposition/_factor_analysis.py +++ b/sklearn/decomposition/_factor_analysis.py @@ -147,9 +147,6 @@ def __init__(self, n_components=None, *, tol=1e-2, copy=True, self.copy = copy self.tol = tol self.max_iter = max_iter - if svd_method not in ['lapack', 'randomized']: - raise ValueError('SVD method %s is not supported. Please consider' - ' the documentation' % svd_method) self.svd_method = svd_method self.noise_variance_init = noise_variance_init @@ -170,6 +167,10 @@ def fit(self, X, y=None): ------- self """ + if self.svd_method not in ['lapack', 'randomized']: + raise ValueError('SVD method %s is not supported. Please consider' + ' the documentation' % self.svd_method) + X = self._validate_data(X, copy=self.copy, dtype=np.float64) n_samples, n_features = X.shape diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 7329fbbe4be1f..8b1a03231289a 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -396,9 +396,6 @@ def __init__(self, n_components=None, *, algorithm='parallel', whiten=True, fun='logcosh', fun_args=None, max_iter=200, tol=1e-4, w_init=None, random_state=None): super().__init__() - if max_iter < 1: - raise ValueError("max_iter should be greater than 1, got " - "(max_iter={})".format(max_iter)) self.n_components = n_components self.algorithm = algorithm self.whiten = whiten @@ -426,6 +423,9 @@ def _fit(self, X, compute_sources=False): ------- X_new : array-like, shape (n_samples, n_components) """ + if self.max_iter < 1: + raise ValueError("max_iter should be greater than 1, got " + "(max_iter={})".format(self.max_iter)) X = self._validate_data(X, copy=self.whiten, dtype=FLOAT_DTYPES, ensure_min_samples=2).T diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index 527f78d34bbb5..428c8c64e9f35 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -145,9 +145,6 @@ def __init__(self, n_components=None, *, kernel="linear", alpha=1.0, fit_inverse_transform=False, eigen_solver='auto', tol=0, max_iter=None, remove_zero_eig=False, random_state=None, copy_X=True, n_jobs=None): - if fit_inverse_transform and kernel == 'precomputed': - raise ValueError( - "Cannot fit_inverse_transform with a precomputed kernel.") self.n_components = n_components self.kernel = kernel self.kernel_params = kernel_params @@ -275,6 +272,10 @@ def fit(self, X, y=None): self : object Returns the instance itself. 
""" + if self.fit_inverse_transform and self.kernel == 'precomputed': + raise ValueError( + "Cannot fit_inverse_transform with a precomputed kernel.") + X = self._validate_data(X, accept_sparse='csr', copy=self.copy_X) self._centerer = KernelCenterer() K = self._get_kernel(X) diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py index b9c2abaa25a72..04238eeb29fac 100644 --- a/sklearn/feature_extraction/_hash.py +++ b/sklearn/feature_extraction/_hash.py @@ -92,8 +92,6 @@ class FeatureHasher(TransformerMixin, BaseEstimator): @_deprecate_positional_args def __init__(self, n_features=(2 ** 20), *, input_type="dict", dtype=np.float64, alternate_sign=True): - self._validate_params(n_features, input_type) - self.dtype = dtype self.input_type = input_type self.n_features = n_features diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 27c5eb437805b..45a59337d5bb5 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1020,15 +1020,7 @@ def __init__(self, *, input='content', encoding='utf-8', self.stop_words = stop_words self.max_df = max_df self.min_df = min_df - if max_df < 0 or min_df < 0: - raise ValueError("negative value for max_df or min_df") self.max_features = max_features - if max_features is not None: - if (not isinstance(max_features, numbers.Integral) or - max_features <= 0): - raise ValueError( - "max_features=%r, neither a positive integer nor None" - % max_features) self.ngram_range = ngram_range self.vocabulary = vocabulary self.binary = binary @@ -1184,6 +1176,15 @@ def fit_transform(self, raw_documents, y=None): # We intentionally don't call the transform method to make # fit_transform overridable without unwanted side effects in # TfidfVectorizer. 
+ if self.max_df < 0 or self.min_df < 0: + raise ValueError("negative value for max_df or min_df") + if self.max_features is not None: + if (not isinstance(self.max_features, numbers.Integral) or + self.max_features <= 0): + raise ValueError( + "max_features=%r, neither a positive integer nor None" + % self.max_features) + if isinstance(raw_documents, str): raise ValueError( "Iterable over raw text documents expected, " @@ -1735,51 +1736,72 @@ def __init__(self, *, input='content', encoding='utf-8', max_features=max_features, vocabulary=vocabulary, binary=binary, dtype=dtype) - self._tfidf = TfidfTransformer(norm=norm, use_idf=use_idf, - smooth_idf=smooth_idf, - sublinear_tf=sublinear_tf) - # Broadcast the TF-IDF parameters to the underlying transformer instance # for easy grid search and repr @property def norm(self): - return self._tfidf.norm + try: + check_is_fitted(self) + return self._tfidf.norm + except NotFittedError: + return None @norm.setter def norm(self, value): + check_is_fitted(self) self._tfidf.norm = value @property def use_idf(self): - return self._tfidf.use_idf + try: + check_is_fitted(self) + return self._tfidf.use_idf + except NotFittedError: + return None @use_idf.setter def use_idf(self, value): + check_is_fitted(self) self._tfidf.use_idf = value @property def smooth_idf(self): - return self._tfidf.smooth_idf + try: + check_is_fitted(self) + return self._tfidf.smooth_idf + except NotFittedError: + return None @smooth_idf.setter def smooth_idf(self, value): + check_is_fitted(self) self._tfidf.smooth_idf = value @property def sublinear_tf(self): - return self._tfidf.sublinear_tf + try: + check_is_fitted(self) + return self._tfidf.sublinear_tf + except NotFittedError: + return None @sublinear_tf.setter def sublinear_tf(self, value): + check_is_fitted(self) self._tfidf.sublinear_tf = value @property def idf_(self): - return self._tfidf.idf_ + try: + check_is_fitted(self) + return self._tfidf.idf_ + except NotFittedError: + return None @idf_.setter def idf_(self, value): + check_is_fitted(self) self._validate_vocabulary() if hasattr(self, 'vocabulary_'): if len(self.vocabulary_) != len(value): @@ -1813,7 +1835,9 @@ def fit(self, raw_documents, y=None): self._check_params() self._warn_for_unused_params() X = super().fit_transform(raw_documents) - self._tfidf.fit(X) + self._tfidf = TfidfTransformer(norm=self.norm, use_idf=self.use_idf, + smooth_idf=self.smooth_idf, + sublinear_tf=self.sublinear_tf).fit(X) return self def fit_transform(self, raw_documents, y=None): @@ -1836,7 +1860,9 @@ def fit_transform(self, raw_documents, y=None): """ self._check_params() X = super().fit_transform(raw_documents) - self._tfidf.fit(X) + self._tfidf = TfidfTransformer(norm=self.norm, use_idf=self.use_idf, + smooth_idf=self.smooth_idf, + sublinear_tf=self.sublinear_tf).fit(X) # X is already a transformed view of raw_documents so # we set copy to False return self._tfidf.transform(X, copy=False) diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index bf1e77e3e355b..3416a35cf58c7 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -97,7 +97,6 @@ def __init__(self, loss, penalty='l2', alpha=0.0001, C=1.0, self.tol = tol # current tests expect init to do parameter validation # but we are not allowed to set attributes - self._validate_params() def set_params(self, **kwargs): """Set and validate the parameters of estimator. 
@@ -113,7 +112,6 @@ def set_params(self, **kwargs): Estimator instance. """ super().set_params(**kwargs) - self._validate_params() return self @abstractmethod diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index d283dc2f0b483..49e645e13a010 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -643,6 +643,7 @@ def fit(self, X, y=None, groups=None, **fit_params): **fit_params : dict of str -> object Parameters passed to the ``fit`` method of the estimator """ + self._validate_params() estimator = self.estimator cv = check_cv(self.cv, y, classifier=is_classifier(estimator)) @@ -1170,12 +1171,14 @@ def __init__(self, estimator, param_grid, scoring=None, pre_dispatch=pre_dispatch, error_score=error_score, return_train_score=return_train_score) self.param_grid = param_grid - _check_param_grid(param_grid) def _run_search(self, evaluate_candidates): """Search all candidates in param_grid""" evaluate_candidates(ParameterGrid(self.param_grid)) + def _validate_params(self): + _check_param_grid(self.param_grid) + class RandomizedSearchCV(BaseSearchCV): """Randomized search on hyper parameters. @@ -1511,3 +1514,6 @@ def _run_search(self, evaluate_candidates): evaluate_candidates(ParameterSampler( self.param_distributions, self.n_iter, random_state=self.random_state)) + + def _validate_params(self): + _check_param_grid(self.param_distributions) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 945959ef10d9c..2519367cd6d87 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -302,7 +302,6 @@ def __init__(self, n_neighbors=None, radius=None, self.metric_params = metric_params self.p = p self.n_jobs = n_jobs - self._check_algorithm_metric() def _check_algorithm_metric(self): if self.algorithm not in ['auto', 'brute', diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index 0580b710afd44..3ae82d1bcf44e 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -152,7 +152,7 @@ def __init__(self, n_neighbors=5, leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs, **kwargs) - self.weights = _check_weights(weights) + self.weights = weights def predict(self, X): """Predict the class labels for the provided data. @@ -179,7 +179,7 @@ def predict(self, X): n_outputs = len(classes_) n_queries = _num_samples(X) - weights = _get_weights(neigh_dist, self.weights) + weights = _get_weights(neigh_dist, _check_weights(self.weights)) y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype) for k, classes_k in enumerate(classes_): @@ -224,7 +224,7 @@ def predict_proba(self, X): n_queries = _num_samples(X) - weights = _get_weights(neigh_dist, self.weights) + weights = _get_weights(neigh_dist, _check_weights(self.weights)) if weights is None: weights = np.ones_like(neigh_ind) @@ -384,7 +384,7 @@ def __init__(self, radius=1.0, weights='uniform', leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs, **kwargs) - self.weights = _check_weights(weights) + self.weights = weights self.outlier_label = outlier_label def fit(self, X, y): @@ -531,7 +531,7 @@ def predict_proba(self, X): 'or considering removing them from your dataset.' 
% outliers) - weights = _get_weights(neigh_dist, self.weights) + weights = _get_weights(neigh_dist, _check_weights(self.weights)) if weights is not None: weights = weights[inliers] diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index 91a97e2810baa..582f59e2ff418 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -102,16 +102,6 @@ def __init__(self, bandwidth=1.0, algorithm='auto', self.leaf_size = leaf_size self.metric_params = metric_params - # run the choose algorithm code so that exceptions will happen here - # we're using clone() in the GenerativeBayes classifier, - # so we can't do this kind of logic in __init__ - self._choose_algorithm(self.algorithm, self.metric) - - if bandwidth <= 0: - raise ValueError("bandwidth must be positive") - if kernel not in VALID_KERNELS: - raise ValueError("invalid kernel: '{0}'".format(kernel)) - def _choose_algorithm(self, algorithm, metric): # given the algorithm string + metric string, choose the optimal # algorithm to compute the result. @@ -152,6 +142,12 @@ def fit(self, X, y=None, sample_weight=None): Returns instance of object. """ algorithm = self._choose_algorithm(self.algorithm, self.metric) + + if self.bandwidth <= 0: + raise ValueError("bandwidth must be positive") + if self.kernel not in VALID_KERNELS: + raise ValueError("invalid kernel: '{0}'".format(self.kernel)) + X = self._validate_data(X, order='C', dtype=DTYPE) if sample_weight is not None: diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index c1bbdbd629ff8..4103bd162cfdb 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -108,7 +108,6 @@ def __init__(self, steps, memory=None, verbose=False): self.steps = steps self.memory = memory self.verbose = verbose - self._validate_steps() def get_params(self, deep=True): """Get parameters for this estimator. @@ -323,6 +322,7 @@ def fit(self, X, y=None, **fit_params): self : Pipeline This estimator """ + self._validate_steps() fit_params_steps = self._check_fit_params(**fit_params) Xt = self._fit(X, y, **fit_params_steps) with _print_elapsed_time('Pipeline', @@ -360,6 +360,7 @@ def fit_transform(self, X, y=None, **fit_params): Xt : array-like of shape (n_samples, n_transformed_features) Transformed samples """ + self._validate_steps() fit_params_steps = self._check_fit_params(**fit_params) Xt = self._fit(X, y, **fit_params_steps) @@ -803,7 +804,6 @@ def __init__(self, transformer_list, n_jobs=None, self.n_jobs = n_jobs self.transformer_weights = transformer_weights self.verbose = verbose - self._validate_transformers() def get_params(self, deep=True): """Get parameters for this estimator. @@ -900,6 +900,7 @@ def fit(self, X, y=None, **fit_params): self : FeatureUnion This estimator """ + self._validate_transformers() transformers = self._parallel_func(X, y, fit_params, _fit_one) if not transformers: # All transformers are None @@ -926,6 +927,7 @@ def fit_transform(self, X, y=None, **fit_params): hstack of results of transformers. sum_n_components is the sum of n_components (output dimension) over transformers. 
""" + self._validate_transformers() results = self._parallel_func(X, y, fit_params, _fit_transform_one) if not results: # All transformers are None diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index 43b6ac642284c..06396b4c72c99 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -397,16 +397,6 @@ class LabelBinarizer(TransformerMixin, BaseEstimator): """ def __init__(self, neg_label=0, pos_label=1, sparse_output=False): - if neg_label >= pos_label: - raise ValueError("neg_label={0} must be strictly less than " - "pos_label={1}.".format(neg_label, pos_label)) - - if sparse_output and (pos_label == 0 or neg_label != 0): - raise ValueError("Sparse binarization is only supported with non " - "zero pos_label and zero neg_label, got " - "pos_label={0} and neg_label={1}" - "".format(pos_label, neg_label)) - self.neg_label = neg_label self.pos_label = pos_label self.sparse_output = sparse_output @@ -424,6 +414,17 @@ def fit(self, y): ------- self : returns an instance of self. """ + if self.neg_label >= self.pos_label: + raise ValueError("neg_label={0} must be strictly less than " + "pos_label={1}.".format(self.neg_label, + self.pos_label)) + + if self.sparse_output and (self.pos_label == 0 or self.neg_label != 0): + raise ValueError("Sparse binarization is only supported with non " + "zero pos_label and zero neg_label, got " + "pos_label={0} and neg_label={1}" + "".format(self.pos_label, self.neg_label)) + self.y_type_ = type_of_target(y) if 'multioutput' in self.y_type_: raise ValueError("Multioutput target data is not supported with " diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index 6cecefb693ec8..6e4df94340d12 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -76,16 +76,6 @@ class BaseLibSVM(BaseEstimator, metaclass=ABCMeta): def __init__(self, kernel, degree, gamma, coef0, tol, C, nu, epsilon, shrinking, probability, cache_size, class_weight, verbose, max_iter, random_state): - - if self._impl not in LIBSVM_IMPL: - raise ValueError("impl should be one of %s, %s was given" % ( - LIBSVM_IMPL, self._impl)) - - if gamma == 0: - msg = ("The gamma value of 0.0 is invalid. Use 'auto' to set" - " gamma to a value of 1 / n_features.") - raise ValueError(msg) - self.kernel = kernel self.degree = degree self.gamma = gamma @@ -139,6 +129,14 @@ def fit(self, X, y, sample_weight=None): If X is a dense array, then the other methods will not support sparse matrices as input. """ + if self._impl not in LIBSVM_IMPL: + raise ValueError("impl should be one of %s, %s was given" % ( + LIBSVM_IMPL, self._impl)) + + if self.gamma == 0: + msg = ("The gamma value of 0.0 is invalid. 
Use 'auto' to set" + " gamma to a value of 1 / n_features.") + raise ValueError(msg) rnd = check_random_state(self.random_state) From 81ad16abf6431f02aac70fd9c94d8b25823a139f Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 17 Apr 2020 17:53:23 +0200 Subject: [PATCH 02/13] more fixes --- sklearn/feature_extraction/_hash.py | 1 + .../tests/test_feature_hasher.py | 10 ++-- sklearn/feature_extraction/text.py | 59 ++----------------- sklearn/tests/test_pipeline.py | 46 +++++++-------- 4 files changed, 34 insertions(+), 82 deletions(-) diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py index 04238eeb29fac..5411b47c72da1 100644 --- a/sklearn/feature_extraction/_hash.py +++ b/sklearn/feature_extraction/_hash.py @@ -148,6 +148,7 @@ def transform(self, raw_X): Feature matrix, for use with estimators or further transformers. """ + self._validate_params(self.n_features, self.input_type) raw_X = iter(raw_X) if self.input_type == "dict": raw_X = (_iteritems(d) for d in raw_X) diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py index c0cd50cef6e09..e72f7eb0c9ef8 100644 --- a/sklearn/feature_extraction/tests/test_feature_hasher.py +++ b/sklearn/feature_extraction/tests/test_feature_hasher.py @@ -112,14 +112,16 @@ def test_hash_empty_input(): def test_hasher_invalid_input(): + raw_X = [[], (), iter(range(0))] + with pytest.raises(ValueError): - FeatureHasher(input_type="gobbledygook") + FeatureHasher(input_type="gobbledygook").transform(raw_X) with pytest.raises(ValueError): - FeatureHasher(n_features=-1) + FeatureHasher(n_features=-1).transform(raw_X) with pytest.raises(ValueError): - FeatureHasher(n_features=0) + FeatureHasher(n_features=0).transform(raw_X) with pytest.raises(TypeError): - FeatureHasher(n_features='ham') + FeatureHasher(n_features='ham').transform(raw_X) h = FeatureHasher(n_features=np.uint16(2 ** 6)) with pytest.raises(ValueError): diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 45a59337d5bb5..30b37c946846b 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1735,61 +1735,10 @@ def __init__(self, *, input='content', encoding='utf-8', ngram_range=ngram_range, max_df=max_df, min_df=min_df, max_features=max_features, vocabulary=vocabulary, binary=binary, dtype=dtype) - - # Broadcast the TF-IDF parameters to the underlying transformer instance - # for easy grid search and repr - - @property - def norm(self): - try: - check_is_fitted(self) - return self._tfidf.norm - except NotFittedError: - return None - - @norm.setter - def norm(self, value): - check_is_fitted(self) - self._tfidf.norm = value - - @property - def use_idf(self): - try: - check_is_fitted(self) - return self._tfidf.use_idf - except NotFittedError: - return None - - @use_idf.setter - def use_idf(self, value): - check_is_fitted(self) - self._tfidf.use_idf = value - - @property - def smooth_idf(self): - try: - check_is_fitted(self) - return self._tfidf.smooth_idf - except NotFittedError: - return None - - @smooth_idf.setter - def smooth_idf(self, value): - check_is_fitted(self) - self._tfidf.smooth_idf = value - - @property - def sublinear_tf(self): - try: - check_is_fitted(self) - return self._tfidf.sublinear_tf - except NotFittedError: - return None - - @sublinear_tf.setter - def sublinear_tf(self, value): - check_is_fitted(self) - self._tfidf.sublinear_tf = value + self.norm = norm + self.use_idf = use_idf + self.smooth_idf = 
smooth_idf + self.sublinear_tf = sublinear_tf @property def idf_(self): diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index b9c2e26abac61..3c41275a93f95 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -160,14 +160,15 @@ def predict(self, X, got_attribute=False): def test_pipeline_init(): # Test the various init parameters of the pipeline. - assert_raises(TypeError, Pipeline) + X, y = [[1]], [1] + with pytest.raises(TypeError): + Pipeline().fit(X, y) # Check that we can't instantiate pipelines with objects without fit # method - assert_raises_regex(TypeError, - 'Last step of Pipeline should implement fit ' - 'or be the string \'passthrough\'' - '.*NoFit.*', - Pipeline, [('clf', NoFit())]) + with pytest.raises(TypeError, + match='Last step of Pipeline should implement fit ' + 'or be the string \'passthrough\'.*NoFit.*'): + Pipeline([('clf', NoFit())]).fit(X, y) # Smoke test with only an estimator clf = NoTrans() pipe = Pipeline([('svc', clf)]) @@ -193,10 +194,10 @@ def test_pipeline_init(): # Check that we can't instantiate with non-transformers on the way # Note that NoTrans implements fit, but not transform - assert_raises_regex(TypeError, - 'All intermediate steps should be transformers' - '.*\\bNoTrans\\b.*', - Pipeline, [('t', NoTrans()), ('svc', clf)]) + with pytest.raises(TypeError, + match='All intermediate steps should be transformers' + '.*\\bNoTrans\\b.*'): + Pipeline([('t', NoTrans()), ('svc', clf)]).fit(X, y) # Check that params are set pipe.set_params(svc__C=0.1) @@ -490,11 +491,10 @@ def test_feature_union(): assert X_transformed.shape == (X.shape[0], 8) # test error if some elements do not support transform - assert_raises_regex(TypeError, - 'All estimators should implement fit and ' - 'transform.*\\bNoTrans\\b', - FeatureUnion, - [("transform", Transf()), ("no_transform", NoTrans())]) + with pytest.raises(TypeError, match='All estimators should implement fit ' + 'and transform.*\\bNoTrans\\b'): + FeatureUnion([("transform", Transf()), + ("no_transform", NoTrans())]).fit(X, y) # test that init accepts tuples fs = FeatureUnion((("svd", svd), ("select", select))) @@ -951,15 +951,15 @@ def test_step_name_validation(): # we validate in construction (despite scikit-learn convention) bad_steps3 = [('a', Mult(2)), (param, Mult(3))] for bad_steps, message in [ - (bad_steps1, "Estimator names must not contain __: got ['a__q']"), - (bad_steps2, "Names provided are not unique: ['a', 'a']"), + (bad_steps1, "Estimator names must not contain __:"), + (bad_steps2, "Names provided are not unique:"), (bad_steps3, "Estimator names conflict with constructor " - "arguments: ['%s']" % param), + "arguments:"), ]: # three ways to make invalid: # - construction - assert_raise_message(ValueError, message, cls, - **{param: bad_steps}) + with pytest.raises(ValueError, match=message): + cls(**{param: bad_steps}).fit([[1]], [1]) # - setattr est = cls(**{param: [('a', Mult(1))]}) @@ -1233,10 +1233,10 @@ def transform(self, X, y=None): def test_feature_union_warns_with_none(): msg = (r"Using None as a transformer is deprecated in version 0\.22 and " r"will be removed in version 0\.24\. 
Please use 'drop' instead\.") - with pytest.warns(FutureWarning, match=msg): - union = FeatureUnion([('multi1', None), ('multi2', Mult())]) - X = [[1, 2, 3], [4, 5, 6]] + with pytest.warns(FutureWarning, match=msg): + union = FeatureUnion([('multi1', None), ('multi2', Mult())]).fit(X) + with pytest.warns(FutureWarning, match=msg): union.fit_transform(X) From cbf391c86f79f244128792c72da299db5195f48b Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 17 Apr 2020 20:14:12 +0200 Subject: [PATCH 03/13] fix neighbors --- sklearn/neighbors/_classification.py | 9 ++++- sklearn/neighbors/tests/test_kde.py | 48 ++++++++++++++--------- sklearn/neighbors/tests/test_neighbors.py | 33 ++++++++-------- sklearn/preprocessing/tests/test_label.py | 7 ++-- 4 files changed, 58 insertions(+), 39 deletions(-) diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index 3ae82d1bcf44e..8ddc33a7d31a5 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -154,6 +154,13 @@ def __init__(self, n_neighbors=5, n_jobs=n_jobs, **kwargs) self.weights = weights + def fit(self, X, y): + # this `fit` is here only to do the validation which happens in this + # class and not the parent. Otherwise it's the same `fit`. + # `weights` is only used in `predict*` + _check_weights(self.weights) + return super().fit(X, y) + def predict(self, X): """Predict the class labels for the provided data. @@ -402,7 +409,7 @@ def fit(self, X, y): Target values. """ - + _check_weights(self.weights) SupervisedIntegerMixin.fit(self, X, y) classes_ = self.classes_ diff --git a/sklearn/neighbors/tests/test_kde.py b/sklearn/neighbors/tests/test_kde.py index e17e8e575f728..aaaa07f5d451f 100644 --- a/sklearn/neighbors/tests/test_kde.py +++ b/sklearn/neighbors/tests/test_kde.py @@ -111,11 +111,10 @@ def test_kde_algorithm_metric_choice(algorithm, metric): Y = rng.randn(10, 2) if algorithm == 'kd_tree' and metric not in KDTree.valid_metrics: - assert_raises(ValueError, KernelDensity, - algorithm=algorithm, metric=metric) + with pytest.raises(ValueError): + KernelDensity(algorithm=algorithm, metric=metric).fit(X) else: - kde = KernelDensity(algorithm=algorithm, metric=metric) - kde.fit(X) + kde = KernelDensity(algorithm=algorithm, metric=metric).fit(X) y_dens = kde.score_samples(Y) assert y_dens.shape == Y.shape[:1] @@ -129,21 +128,32 @@ def test_kde_score(n_samples=100, n_features=3): def test_kde_badargs(): - assert_raises(ValueError, KernelDensity, - algorithm='blah') - assert_raises(ValueError, KernelDensity, - bandwidth=0) - assert_raises(ValueError, KernelDensity, - kernel='blah') - assert_raises(ValueError, KernelDensity, - metric='blah') - assert_raises(ValueError, KernelDensity, - algorithm='kd_tree', metric='blah') - kde = KernelDensity() - assert_raises(ValueError, kde.fit, np.random.random((200, 10)), - sample_weight=np.random.random((200, 10))) - assert_raises(ValueError, kde.fit, np.random.random((200, 10)), - sample_weight=-np.random.random(200)) + rng = np.random.RandomState(0) + X = rng.randn(10, 2) + with pytest.raises(ValueError): + KernelDensity(algorithm='blah').fit(X) + + with pytest.raises(ValueError): + KernelDensity(bandwidth=0).fit(X) + + with pytest.raises(ValueError): + KernelDensity(kernel='blah').fit(X) + + with pytest.raises(ValueError): + KernelDensity(metric='blah').fit(X) + + with pytest.raises(ValueError): + KernelDensity(algorithm='kd_tree', metric='blah').fit(X) + + with pytest.raises(ValueError): + KernelDensity(algorithm='blah').fit( + 
rng.random((200, 10)), + sample_weight=rng.random((200, 10))) + + with pytest.raises(ValueError): + KernelDensity(algorithm='blah').fit( + rng.random((200, 10)), + sample_weight=-rng.random(200)) def test_kde_pipeline_gridsearch(): diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 88e32669777a1..9e0134c2a6727 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1124,26 +1124,27 @@ def test_radius_neighbors_graph_sparse(seed=36): def test_neighbors_badargs(): # Test bad argument values: these should all raise ValueErrors - assert_raises(ValueError, - neighbors.NearestNeighbors, - algorithm='blah') X = rng.random_sample((10, 2)) Xsparse = csr_matrix(X) X3 = rng.random_sample((10, 3)) y = np.ones(10) + with pytest.raises(ValueError): + neighbors.NearestNeighbors(algorithm='blah').fit(X, y) + for cls in (neighbors.KNeighborsClassifier, neighbors.RadiusNeighborsClassifier, neighbors.KNeighborsRegressor, neighbors.RadiusNeighborsRegressor): - assert_raises(ValueError, - cls, - weights='blah') - assert_raises(ValueError, - cls, p=-1) - assert_raises(ValueError, - cls, algorithm='blah') + with pytest.raises(ValueError): + cls(weights='blah').fit(X, y) + + with pytest.raises(ValueError): + cls(p=-1).fit(X, y) + + with pytest.raises(ValueError): + cls(algorithm='blah').fit(X, y) nbrs = cls(algorithm='ball_tree', metric='haversine') assert_raises(ValueError, @@ -1212,10 +1213,10 @@ def test_neighbors_metrics(n_samples=20, n_features=3, # KD tree doesn't support all metrics if (algorithm == 'kd_tree' and metric not in neighbors.KDTree.valid_metrics): - assert_raises(ValueError, - neighbors.NearestNeighbors, - algorithm=algorithm, - metric=metric, metric_params=metric_params) + with pytest.raises(ValueError): + neighbors.NearestNeighbors( + algorithm=algorithm, + metric=metric, metric_params=metric_params).fit(X) continue neigh = neighbors.NearestNeighbors(n_neighbors=n_neighbors, algorithm=algorithm, @@ -1310,8 +1311,8 @@ def test_valid_brute_metric_for_auto_algorithm(): def test_metric_params_interface(): - assert_warns(SyntaxWarning, neighbors.KNeighborsClassifier, - metric_params={'p': 3}) + with pytest.warns(SyntaxWarning): + neighbors.KNeighborsClassifier(metric_params={'p': 3}).fit([[1]], [1]) def test_predict_sparse_ball_kd_tree(): diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 887fa90c98d61..55d54bf75adc2 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -142,12 +142,13 @@ def test_label_binarizer_errors(): lb.inverse_transform([]) with pytest.raises(ValueError): - LabelBinarizer(neg_label=2, pos_label=1) + LabelBinarizer(neg_label=2, pos_label=1).fit([1, 2]) with pytest.raises(ValueError): - LabelBinarizer(neg_label=2, pos_label=2) + LabelBinarizer(neg_label=2, pos_label=2).fit([1, 2]) with pytest.raises(ValueError): - LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True) + LabelBinarizer(neg_label=1, pos_label=2, + sparse_output=True).fit([1, 2]) # Fail on y_type with pytest.raises(ValueError): From 0132609d08635620b59d835d1e07df68278d145b Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 17 Apr 2020 20:35:36 +0200 Subject: [PATCH 04/13] fix linear_models --- sklearn/linear_model/tests/test_sgd.py | 96 +++++++------------------- 1 file changed, 24 insertions(+), 72 deletions(-) diff --git a/sklearn/linear_model/tests/test_sgd.py 
b/sklearn/linear_model/tests/test_sgd.py index 22744a427b901..7a0560a6e2864 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -167,24 +167,15 @@ def asgd(klass, X, y, eta, alpha, weight_init=None, intercept_init=0.0): @pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor]) -def test_sgd_bad_alpha(klass): +@pytest.mark.parametrize('params', [ + {'alpha': -.1}, + {'penalty': 'foobar', 'l1_ratio': 0.85}, + {'loss': "foobar"} +]) +def test_sgd_bad_alpha(klass, params): # Check whether expected ValueError on bad alpha - assert_raises(ValueError, klass, alpha=-.1) - - -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, - SGDRegressor, SparseSGDRegressor]) -def test_sgd_bad_penalty(klass): - # Check whether expected ValueError on bad penalty - assert_raises(ValueError, klass, penalty='foobar', - l1_ratio=0.85) - - -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, - SGDRegressor, SparseSGDRegressor]) -def test_sgd_bad_loss(klass): - # Check whether expected ValueError on bad loss - assert_raises(ValueError, klass, loss="foobar") + with pytest.raises(ValueError): + klass(**params).fit(X, Y) def _test_warm_start(klass, X, Y, lr): @@ -334,8 +325,8 @@ def test_late_onset_averaging_reached(klass): def test_sgd_bad_alpha_for_optimal_learning_rate(klass): # Check whether expected ValueError on bad alpha, i.e. 0 # since alpha is used to compute the optimal learning rate - assert_raises(ValueError, klass, - alpha=0, learning_rate="optimal") + with pytest.raises(ValueError): + klass(alpha=0, learning_rate="optimal").fit(X, Y) @pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, @@ -436,59 +427,20 @@ def test_sgd_clf(klass): @pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) -def test_sgd_bad_l1_ratio(klass): - # Check whether expected ValueError on bad l1_ratio - assert_raises(ValueError, klass, l1_ratio=1.1) - - -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) -def test_sgd_bad_learning_rate_schedule(klass): - # Check whether expected ValueError on bad learning_rate - assert_raises(ValueError, klass, learning_rate="") - - -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) -def test_sgd_bad_eta0(klass): - # Check whether expected ValueError on bad eta0 - assert_raises(ValueError, klass, eta0=0, - learning_rate="constant") - - -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) -def test_sgd_max_iter_param(klass): - # Test parameter validity check - assert_raises(ValueError, klass, max_iter=-10000) - - -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) -def test_sgd_shuffle_param(klass): - # Test parameter validity check - assert_raises(ValueError, klass, shuffle="false") - - -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) -def test_sgd_early_stopping_param(klass): - # Test parameter validity check - assert_raises(ValueError, klass, early_stopping="false") - - -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) -def test_sgd_validation_fraction(klass): - # Test parameter validity check - assert_raises(ValueError, klass, validation_fraction=-.1) - - -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) -def test_sgd_n_iter_no_change(klass): - # Test parameter validity check - assert_raises(ValueError, klass, n_iter_no_change=0) - - -@pytest.mark.parametrize('klass', 
[SGDClassifier, SparseSGDClassifier]) -def test_argument_coef(klass): - # Checks coef_init not allowed as model argument (only fit) - # Provided coef_ does not match dataset - assert_raises(TypeError, klass, coef_init=np.zeros((3,))) +@pytest.mark.parametrize('params, error', [ + ({'l1_ratio': 1.1}, ValueError), + ({'learning_rate': ""}, ValueError), + ({'eta0': 0, 'learning_rate': "constant"}, ValueError), + ({'max_iter': -10000}, ValueError), + ({'shuffle': "false"}, ValueError), + ({'early_stopping': "false"}, ValueError), + ({'validation_fraction': -.1}, ValueError), + ({'n_iter_no_change': 0}, ValueError), + ({'coef_init': np.zeros((3,))}, TypeError)]) +def test_sgd_bad_l1_ratio(klass, params, error): + # Check expected ValueError on bad parameter set + with pytest.raises(error): + klass(**params).fit(X, Y) @pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) From 96e4a113b028a7b09d37fa9960512cc033fb89c5 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 20 Apr 2020 20:43:30 +0200 Subject: [PATCH 05/13] fix model_selection --- sklearn/model_selection/_search.py | 3 ++- sklearn/model_selection/tests/test_search.py | 18 ++++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 49e645e13a010..a9a1e82999925 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -1516,4 +1516,5 @@ def _run_search(self, evaluate_candidates): random_state=self.random_state)) def _validate_params(self): - _check_param_grid(self.param_distributions) + # TODO: check param distributions validity + pass diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 1673040f96bc6..c3657a59149f0 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -436,14 +436,14 @@ def test_grid_search_bad_param_grid(): " be a list or numpy array, but got ()." " Single values need to be wrapped in a list" " with one element.", - GridSearchCV, clf, param_dict) + GridSearchCV(clf, param_grid=param_dict).fit, X, y) param_dict = {"C": []} clf = SVC() assert_raise_message( ValueError, "Parameter values for parameter (C) need to be a non-empty sequence.", - GridSearchCV, clf, param_dict) + GridSearchCV(clf, param_grid=param_dict).fit, X, y) param_dict = {"C": "1,2,3"} clf = SVC(gamma='auto') @@ -453,11 +453,12 @@ def test_grid_search_bad_param_grid(): " be a list or numpy array, but got ()." 
" Single values need to be wrapped in a list" " with one element.", - GridSearchCV, clf, param_dict) + GridSearchCV(clf, param_grid=param_dict).fit, X, y) param_dict = {"C": np.ones((3, 2))} clf = SVC() - assert_raises(ValueError, GridSearchCV, clf, param_dict) + assert_raises(ValueError, GridSearchCV(clf, param_grid=param_dict).fit, + X, y) def test_grid_search_sparse(): @@ -1717,6 +1718,9 @@ def _run_search(self, evaluate): check_results(results, fit_grid([{'max_depth': [1, 2]}, {'min_samples_split': [5, 10]}])) + def _validate_params(self): + pass + # Using regressor to make sure each score differs clf = DecisionTreeRegressor(random_state=0) X, y = make_classification(n_samples=100, n_informative=4, @@ -1747,6 +1751,9 @@ def __init__(self, estimator, **kwargs): def fit(self, X, y=None, groups=None, **fit_params): return self + def _validate_params(self): + pass + # this should not raise any exceptions NoRunSearchSearchCV(SVC()).fit(X, y) @@ -1754,6 +1761,9 @@ class BadSearchCV(BaseSearchCV): def __init__(self, estimator, **kwargs): super().__init__(estimator, **kwargs) + def _validate_params(self): + pass + with pytest.raises(NotImplementedError, match="_run_search not implemented."): # this should raise a NotImplementedError From 0fb81dbca8cfd3e39978430f446ed4df8febc8c1 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 28 May 2020 11:51:05 +0200 Subject: [PATCH 06/13] remove deprecated test --- sklearn/tests/test_pipeline.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index c9b6d5e415dff..7d56da486d723 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1225,16 +1225,3 @@ def transform(self, X, y=None): t.fit(X, y, a=0) t.fit_transform(X, y, a=0) - - -# TODO: Remove in 0.24 when None is removed -def test_feature_union_warns_with_none(): - msg = (r"Using None as a transformer is deprecated in version 0\.22 and " - r"will be removed in version 0\.24\. 
Please use 'drop' instead\.") - X = [[1, 2, 3], [4, 5, 6]] - - with pytest.warns(FutureWarning, match=msg): - union = FeatureUnion([('multi1', None), ('multi2', Mult())]).fit(X) - - with pytest.warns(FutureWarning, match=msg): - union.fit_transform(X) From 9fda41f008c97298d5cf1477e274d1e9c05b2116 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 28 May 2020 13:37:43 +0200 Subject: [PATCH 07/13] fix test_factor_analysys --- sklearn/decomposition/tests/test_factor_analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_factor_analysis.py b/sklearn/decomposition/tests/test_factor_analysis.py index 128c1d04fb405..62a053c682ceb 100644 --- a/sklearn/decomposition/tests/test_factor_analysis.py +++ b/sklearn/decomposition/tests/test_factor_analysis.py @@ -33,7 +33,7 @@ def test_factor_analysis(): X = np.dot(h, W) + noise with pytest.raises(ValueError): - FactorAnalysis(svd_method='foo') + FactorAnalysis(svd_method='foo').fit(X) fa_fail = FactorAnalysis() fa_fail.svd_method = 'foo' with pytest.raises(ValueError): From 2764de52398050422fe09e4bcafc41b8c0654d7f Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 28 May 2020 13:39:41 +0200 Subject: [PATCH 08/13] fix test_fastica --- sklearn/decomposition/tests/test_fastica.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py index 9f37ac25c2f76..60ddf00423bf2 100644 --- a/sklearn/decomposition/tests/test_fastica.py +++ b/sklearn/decomposition/tests/test_fastica.py @@ -271,7 +271,7 @@ def test_fastica_errors(): X = rng.random_sample((n_samples, n_features)) w_init = rng.randn(n_features + 1, n_features + 1) with pytest.raises(ValueError, match='max_iter should be greater than 1'): - FastICA(max_iter=0) + FastICA(max_iter=0).fit(X) with pytest.raises(ValueError, match=r'alpha must be in \[1,2\]'): fastica(X, fun_args={'alpha': 0}) with pytest.raises(ValueError, match='w_init has invalid shape.+' From d6a5c83b34950c496fd9ba09fb5427e58fbcde50 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 28 May 2020 13:41:54 +0200 Subject: [PATCH 09/13] fix test_kernel_pca --- sklearn/decomposition/tests/test_kernel_pca.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index a7a9547bfa33a..ec6088219b6bf 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -55,8 +55,10 @@ def histogram(x, y, **kwargs): def test_kernel_pca_invalid_parameters(): + state = np.random.RandomState(0) + X = state.rand(10, 10) with pytest.raises(ValueError): - KernelPCA(10, fit_inverse_transform=True, kernel='precomputed') + KernelPCA(10, fit_inverse_transform=True, kernel='precomputed').fit(X) def test_kernel_pca_consistent_transform(): From 358a45dc001a59dac53c092a373becb6907c3198 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 28 May 2020 13:54:53 +0200 Subject: [PATCH 10/13] fix test_text --- sklearn/feature_extraction/tests/test_text.py | 4 ++++ sklearn/feature_extraction/text.py | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index c65f25c2e7329..620c2addd3af6 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -512,12 +512,16 @@ def 
test_tfidf_vectorizer_setters(): tv = TfidfVectorizer(norm='l2', use_idf=False, smooth_idf=False, sublinear_tf=False) tv.norm = 'l1' + tv.fit(ALL_FOOD_DOCS) assert tv._tfidf.norm == 'l1' tv.use_idf = True + tv.fit(ALL_FOOD_DOCS) assert tv._tfidf.use_idf tv.smooth_idf = True + tv.fit(ALL_FOOD_DOCS) assert tv._tfidf.smooth_idf tv.sublinear_tf = True + tv.fit(ALL_FOOD_DOCS) assert tv._tfidf.sublinear_tf diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 63fb425d5c684..5953189eeab97 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1746,13 +1746,17 @@ def idf_(self): @idf_.setter def idf_(self, value): - check_is_fitted(self) self._validate_vocabulary() if hasattr(self, 'vocabulary_'): if len(self.vocabulary_) != len(value): raise ValueError("idf length = %d must be equal " "to vocabulary size = %d" % (len(value), len(self.vocabulary))) + if not hasattr(self, '_tfidf'): + self._tfidf = TfidfTransformer(norm=self.norm, + use_idf=self.use_idf, + smooth_idf=self.smooth_idf, + sublinear_tf=self.sublinear_tf) self._tfidf.idf_ = value def _check_params(self): From e2b5ebc005e23a13d9a1ca478800944a32bf96ad Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 29 May 2020 15:04:39 +0200 Subject: [PATCH 11/13] raise AttributeError before fit on idf_ property --- sklearn/feature_extraction/text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 5953189eeab97..6f852abfe3239 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1742,7 +1742,7 @@ def idf_(self): check_is_fitted(self) return self._tfidf.idf_ except NotFittedError: - return None + raise AttributeError @idf_.setter def idf_(self, value): From c1a5db95c8fe18624f9da31ee34bdee075fe2772 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 29 May 2020 15:17:37 +0200 Subject: [PATCH 12/13] use random_sample in test_kde --- sklearn/neighbors/tests/test_kde.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/neighbors/tests/test_kde.py b/sklearn/neighbors/tests/test_kde.py index 82235efc18446..803ba692de5ee 100644 --- a/sklearn/neighbors/tests/test_kde.py +++ b/sklearn/neighbors/tests/test_kde.py @@ -148,12 +148,12 @@ def test_kde_badargs(): with pytest.raises(ValueError): KernelDensity(algorithm='blah').fit( rng.random((200, 10)), - sample_weight=rng.random((200, 10))) + sample_weight=rng.random_sample((200, 10))) with pytest.raises(ValueError): KernelDensity(algorithm='blah').fit( rng.random((200, 10)), - sample_weight=-rng.random(200)) + sample_weight=-rng.random_sample(200)) def test_kde_pipeline_gridsearch(): From 2d674c92884d9335638230e8aca4379671713ffa Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 29 May 2020 16:34:22 +0200 Subject: [PATCH 13/13] fix the other random call in test_kde --- sklearn/neighbors/tests/test_kde.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/neighbors/tests/test_kde.py b/sklearn/neighbors/tests/test_kde.py index 803ba692de5ee..99dc30b120dd7 100644 --- a/sklearn/neighbors/tests/test_kde.py +++ b/sklearn/neighbors/tests/test_kde.py @@ -147,12 +147,12 @@ def test_kde_badargs(): with pytest.raises(ValueError): KernelDensity(algorithm='blah').fit( - rng.random((200, 10)), + rng.randn(200, 10), sample_weight=rng.random_sample((200, 10))) with pytest.raises(ValueError): KernelDensity(algorithm='blah').fit( - rng.random((200, 10)), + 
rng.randn(200, 10), sample_weight=-rng.random_sample(200))
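
As a summary of the convention this series enforces, the sketch below shows an estimator that only stores its parameters in __init__ and defers all validation to fit. It is an illustrative sketch, not code from these patches: the ThresholdEstimator class and its threshold parameter are invented for the example.

from sklearn.base import BaseEstimator


class ThresholdEstimator(BaseEstimator):
    """Illustrative only: parameters are stored verbatim in __init__
    and validated in fit, matching the pattern applied in this series."""

    def __init__(self, threshold=1.0):
        # No validation here: clone(), set_params and grid search must be
        # able to set any value without triggering an error.
        self.threshold = threshold

    def fit(self, X, y=None):
        # Validation is deferred to fit time.
        if self.threshold <= 0:
            raise ValueError(
                "threshold must be positive, got %r" % self.threshold)
        self.n_features_in_ = len(X[0])
        return self


# Usage: ThresholdEstimator(threshold=-1) constructs without error,
# but calling .fit(X) on it raises the ValueError, which is why the
# tests above were changed to call fit() before asserting the raise.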