TST Fix unreachable code in tests by VarIr · Pull Request #16110 · scikit-learn/scikit-learn


Merged · 20 commits · Feb 16, 2020
7 changes: 2 additions & 5 deletions sklearn/cluster/tests/test_bicluster.py
@@ -201,16 +201,13 @@ def test_project_and_cluster():
                             [0, 1],
                             [0, 0]])
     for mat in (data, csr_matrix(data)):
-        labels = model._project_and_cluster(data, vectors,
+        labels = model._project_and_cluster(mat, vectors,
                                             n_clusters=2)
         assert_almost_equal(v_measure_score(labels, [0, 0, 1, 1]), 1.0)


 def test_perfect_checkerboard():
-    # XXX test always skipped
-    raise SkipTest("This test is failing on the buildbot, but cannot"
-                   " reproduce. Temporarily disabling it until it can be"
-                   " reproduced and fixed.")
+    # XXX Previously failed on build bot (not reproducible)
     model = SpectralBiclustering(3, svd_method="arpack", random_state=0)

     S, rows, cols = make_checkerboard((30, 30), 3, noise=0,
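The change above fixes a classic unreachable-test bug: the loop iterates over a dense array and its csr_matrix copy, but the body kept passing the original data, so the sparse path was never exercised. A minimal, self-contained sketch of the fixed pattern (the check helper is hypothetical):

    import numpy as np
    from scipy.sparse import csr_matrix

    def check(mat):
        # Column sums work for both dense arrays and sparse matrices.
        return np.asarray(mat.sum(axis=0)).ravel()

    data = np.array([[1, 1], [0, 1], [0, 0]])
    for mat in (data, csr_matrix(data)):
        # Passing the loop variable `mat` (not `data`) is the whole fix;
        # otherwise the sparse case silently re-tests the dense input.
        np.testing.assert_array_equal(check(mat), [1, 2])
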
3 changes: 2 additions & 1 deletion sklearn/compose/tests/test_column_transformer.py
@@ -523,7 +523,8 @@ def predict(self, X):

     X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
     ct = ColumnTransformer([('trans', NoTrans(), [0])])
-    assert_raise_message(TypeError, "All estimators should implement fit",
+    assert_raise_message(TypeError,
+                         "All estimators should implement fit and transform",
                          ct.fit, X_array)


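Asserting the complete message rather than a prefix keeps the test honest if the wording ever changes in a way a prefix match would hide. A minimal sketch of the same check written with pytest.raises (the validate helper and NoTransEstimator are hypothetical):

    import pytest

    class NoTransEstimator:
        # Deliberately provides neither fit nor transform.
        pass

    def validate(estimator):
        if not (hasattr(estimator, 'fit') and hasattr(estimator, 'transform')):
            raise TypeError("All estimators should implement fit and transform")

    def test_validate_rejects_incomplete_estimator():
        with pytest.raises(TypeError,
                           match="All estimators should implement fit and transform"):
            validate(NoTransEstimator())
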
16 changes: 6 additions & 10 deletions sklearn/datasets/tests/test_openml.py
@@ -102,7 +102,7 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version,
     assert data_by_id.target.shape == (expected_observations,
                                        len(target_column))
     assert data_by_id.target_names == target_column
-    assert data_by_id.data.dtype == np.float64
+    assert data_by_id.data.dtype == expected_data_dtype
     assert data_by_id.target.dtype == expected_target_dtype
     assert len(data_by_id.feature_names) == expected_features
     for feature in data_by_id.feature_names:
@@ -118,11 +118,7 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version,
     if compare_default_target:
         # check whether the data by id and data by id target are equal
         data_by_id_default = fetch_openml(data_id=data_id, cache=False)
-        if data_by_id.data.dtype == np.float64:
-            np.testing.assert_allclose(data_by_id.data,
-                                       data_by_id_default.data)
-        else:
-            assert np.array_equal(data_by_id.data, data_by_id_default.data)
+        np.testing.assert_allclose(data_by_id.data, data_by_id_default.data)
     if data_by_id.target.dtype == np.float64:
         np.testing.assert_allclose(data_by_id.target,
                                    data_by_id_default.target)
@@ -740,7 +736,7 @@ def test_fetch_openml_iris_multitarget(monkeypatch, gzip_response):
     _fetch_dataset_from_openml(data_id, data_name, data_version, target_column,
                                expected_observations, expected_features,
                                expected_missing,
-                               object, np.float64, expect_sparse=False,
+                               np.float64, np.float64, expect_sparse=False,
                                compare_default_target=False)


@@ -759,7 +755,7 @@ def test_fetch_openml_anneal(monkeypatch, gzip_response):
     _fetch_dataset_from_openml(data_id, data_name, data_version, target_column,
                                expected_observations, expected_features,
                                expected_missing,
-                               object, object, expect_sparse=False,
+                               np.float64, object, expect_sparse=False,
                                compare_default_target=True)


@@ -784,7 +780,7 @@ def test_fetch_openml_anneal_multitarget(monkeypatch, gzip_response):
     _fetch_dataset_from_openml(data_id, data_name, data_version, target_column,
                                expected_observations, expected_features,
                                expected_missing,
-                               object, object, expect_sparse=False,
+                               np.float64, object, expect_sparse=False,
                                compare_default_target=False)


@@ -802,7 +798,7 @@ def test_fetch_openml_cpu(monkeypatch, gzip_response):
     _fetch_dataset_from_openml(data_id, data_name, data_version, target_column,
                                expected_observations, expected_features,
                                expected_missing,
-                               object, np.float64, expect_sparse=False,
+                               np.float64, np.float64, expect_sparse=False,
                                compare_default_target=True)


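All of the dtype edits in this file follow from the first hunk: once _fetch_dataset_from_openml asserts the caller-supplied expected_data_dtype instead of a hard-coded np.float64, the np.array_equal fallback for non-float data is dead and each call site must state the dtype it really expects. A condensed sketch of the pattern (check_fetched_data is illustrative, not the real helper):

    import numpy as np

    def check_fetched_data(data, expected_data_dtype):
        # The expected dtype is an explicit argument, so there is no
        # dead "else" branch left over from a hard-coded dtype check.
        data = np.asarray(data)
        assert data.dtype == expected_data_dtype

    check_fetched_data([[1.0, 2.0]], np.float64)                  # numeric data
    check_fetched_data(np.array([["a"]], dtype=object), object)   # categorical data
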
4 changes: 2 additions & 2 deletions sklearn/decomposition/tests/test_fastica.py
@@ -158,7 +158,6 @@ def test_fastica_convergence_fail():
     s2 = np.ceil(np.sin(np.pi * t))
     s = np.c_[s1, s2].T
     center_and_norm(s)
-    s1, s2 = s

     # Mixing matrix
     mixing = rng.randn(6, 2)
@@ -170,7 +169,8 @@ def test_fastica_convergence_fail():
     assert_warns(ConvergenceWarning, ica.fit, m.T)


-def test_non_square_fastica(add_noise=False):
+@pytest.mark.parametrize('add_noise', [True, False])
+def test_non_square_fastica(add_noise):
     # Test the FastICA algorithm on very simple data.
     rng = np.random.RandomState(0)

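Two distinct cleanups here: the re-unpacking s1, s2 = s assigned names that were never read again, and the add_noise=False keyword default meant pytest only ever ran the noiseless case. Parametrizing turns both branches into collected test cases. A minimal sketch of that refactor:

    import pytest

    @pytest.mark.parametrize('add_noise', [True, False])
    def test_mean_stays_small(add_noise):
        # Both cases now appear in the test report; with the old keyword
        # default, only add_noise=False would ever have executed.
        signal = [-1.0, 0.0, 1.0]
        if add_noise:
            signal = [s + 1e-8 for s in signal]
        assert abs(sum(signal) / len(signal)) < 1e-6
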
7 changes: 0 additions & 7 deletions sklearn/decomposition/tests/test_nmf.py
@@ -1,6 +1,5 @@
 import numpy as np
 import scipy.sparse as sp
-import numbers

 from scipy import linalg
 from sklearn.decomposition import NMF, non_negative_factorization
@@ -10,7 +9,6 @@
 import pytest

 from sklearn.utils._testing import assert_raise_message
-from sklearn.utils._testing import assert_warns_message
 from sklearn.utils._testing import assert_array_equal
 from sklearn.utils._testing import assert_array_almost_equal
 from sklearn.utils._testing import assert_almost_equal
@@ -246,11 +244,6 @@ def _beta_divergence_dense(X, W, H, beta):

     Used as a reference for testing nmf._beta_divergence.
     """
-    if isinstance(X, numbers.Number):
-        W = np.array([[W]])
-        H = np.array([[H]])
-        X = np.array([[X]])
-
     WH = np.dot(W, H)

     if beta == 2:
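The removed isinstance(X, numbers.Number) branch promoted scalars to 1x1 matrices, but every caller of this reference helper passes arrays, so the branch could never run. For the beta = 2 case the surviving logic reduces to roughly this sketch (a simplified stand-in; the real helper also covers other beta values):

    import numpy as np

    def frobenius_divergence_dense(X, W, H):
        # beta = 2 branch of the reference beta-divergence: half the
        # squared Frobenius norm of the reconstruction error.
        WH = np.dot(W, H)
        return ((X - WH) ** 2).sum() / 2.

    X = np.array([[1.0, 0.5], [0.0, 2.0]])
    W = np.array([[1.0], [1.0]])
    H = np.array([[0.5, 1.0]])
    assert abs(frobenius_divergence_dense(X, W, H) - 0.875) < 1e-12
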
6 changes: 3 additions & 3 deletions sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py
@@ -13,9 +13,6 @@
     get_equivalent_estimator)


-pytest.importorskip("lightgbm")
-
-
 @pytest.mark.parametrize('seed', range(5))
 @pytest.mark.parametrize('min_samples_leaf', (1, 20))
 @pytest.mark.parametrize('n_samples, max_leaf_nodes', [
@@ -46,6 +43,7 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples,
     # discrepancy between the initial values leads to biggish differences in
     # the predictions. These differences are much smaller with more
     # iterations.
+    pytest.importorskip("lightgbm")

     rng = np.random.RandomState(seed=seed)
     n_samples = n_samples
@@ -98,6 +96,7 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples,
 def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                          max_leaf_nodes):
     # Same as test_same_predictions_regression but for classification
+    pytest.importorskip("lightgbm")

     rng = np.random.RandomState(seed=seed)
     n_samples = n_samples
@@ -158,6 +157,7 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
 def test_same_predictions_multiclass_classification(
         seed, min_samples_leaf, n_samples, max_leaf_nodes):
     # Same as test_same_predictions_regression but for classification
+    pytest.importorskip("lightgbm")

     rng = np.random.RandomState(seed=seed)
     n_samples = n_samples
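Moving pytest.importorskip("lightgbm") from module scope into each test changes when the skip happens: at module scope the whole file is skipped during collection, whereas inside the body, collection always succeeds and only the tests that genuinely need lightgbm are reported as skipped. A minimal sketch of the per-test pattern:

    import pytest

    def test_uses_optional_dependency():
        # importorskip returns the imported module when it is available
        # and skips just this test otherwise.
        lightgbm = pytest.importorskip("lightgbm")
        assert hasattr(lightgbm, "LGBMRegressor")

    def test_runs_without_the_dependency():
        # Still collected and run even when lightgbm is absent.
        assert 1 + 1 == 2
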
47 changes: 17 additions & 30 deletions sklearn/feature_extraction/tests/test_text.py
@@ -1,7 +1,6 @@
 # -*- coding: utf-8 -*-
 from collections.abc import Mapping
 import re
-import warnings

 import pytest
 from scipy import sparse
@@ -31,10 +30,11 @@
 from numpy.testing import assert_array_equal
 from sklearn.utils import IS_PYPY
 from sklearn.utils._testing import (assert_almost_equal,
-                                    assert_warns_message, assert_raise_message,
-                                    SkipTest, assert_no_warnings,
-                                    fails_if_pypy, assert_allclose_dense_sparse,
-                                    skip_if_32bit)
+                                    assert_warns_message, assert_raise_message,
+                                    assert_no_warnings,
+                                    fails_if_pypy,
+                                    assert_allclose_dense_sparse,
+                                    skip_if_32bit)
 from collections import defaultdict
 from functools import partial
 import pickle
@@ -296,18 +296,17 @@ def test_countvectorizer_custom_vocabulary_pipeline():

 def test_countvectorizer_custom_vocabulary_repeated_indices():
     vocab = {"pizza": 0, "beer": 0}
-    try:
-        CountVectorizer(vocabulary=vocab)
-    except ValueError as e:
-        assert "vocabulary contains repeated indices" in str(e).lower()
+    msg = "Vocabulary contains repeated indices"
+    with pytest.raises(ValueError, match=msg):
+        vect = CountVectorizer(vocabulary=vocab)
+        vect.fit(["pasta_siziliana"])


 def test_countvectorizer_custom_vocabulary_gap_index():
     vocab = {"pizza": 1, "beer": 2}
-    try:
-        CountVectorizer(vocabulary=vocab)
-    except ValueError as e:
-        assert "doesn't contain index" in str(e).lower()
+    with pytest.raises(ValueError, match="doesn't contain index"):
+        vect = CountVectorizer(vocabulary=vocab)
+        vect.fit(['pasta_verdura'])


def test_countvectorizer_stop_words():
Expand All @@ -326,20 +325,14 @@ def test_countvectorizer_stop_words():


def test_countvectorizer_empty_vocabulary():
try:
with pytest.raises(ValueError, match="empty vocabulary"):
vect = CountVectorizer(vocabulary=[])
vect.fit(["foo"])
assert False, "we shouldn't get here"
except ValueError as e:
assert "empty vocabulary" in str(e).lower()

try:
with pytest.raises(ValueError, match="empty vocabulary"):
v = CountVectorizer(max_df=1.0, stop_words="english")
# fit on stopwords only
v.fit(["to be or not to be", "and me too", "and so do you"])
assert False, "we shouldn't get here"
except ValueError as e:
assert "empty vocabulary" in str(e).lower()


def test_fit_countvectorizer_twice():
@@ -387,15 +380,9 @@ def test_tfidf_no_smoothing():
          [1, 0, 0]]
     tr = TfidfTransformer(smooth_idf=False, norm='l2')

-    with warnings.catch_warnings(record=True) as w:
-        1. / np.array([0.])
-        numpy_provides_div0_warning = len(w) == 1
-
     in_warning_message = 'divide by zero'
-    tfidf = assert_warns_message(RuntimeWarning, in_warning_message,
-                                 tr.fit_transform, X).toarray()
-    if not numpy_provides_div0_warning:
-        raise SkipTest("Numpy does not provide div 0 warnings.")
+    assert_warns_message(RuntimeWarning, in_warning_message,
+                         tr.fit_transform, X).toarray()


 def test_sublinear_tf():
@@ -1155,7 +1142,7 @@ def test_vectorizers_invalid_ngram_range(vec):
     message = ("Invalid value for ngram_range=%s "
                "lower boundary larger than the upper boundary."
                % str(invalid_range))
-    if isinstance(vec, HashingVectorizer):
+    if isinstance(vec, HashingVectorizer) and IS_PYPY:
         pytest.xfail(reason='HashingVectorizer is not supported on PyPy')

     assert_raise_message(
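The recurring fix in this file targets the same trap: a try/except whose only assertion lives in the except block passes silently when no exception is raised, and CountVectorizer does not validate its vocabulary until fit is called, so several of those asserts were unreachable. pytest.raises fails the test if the expected error never appears, and match checks the message against a regular expression. A minimal sketch:

    import pytest

    def parse_positive(value):
        if value <= 0:
            raise ValueError("value must be strictly positive")
        return value

    def test_parse_positive_rejects_zero():
        # If parse_positive stopped raising, pytest.raises would fail the
        # test, whereas the old try/except pattern passed silently.
        with pytest.raises(ValueError, match="strictly positive"):
            parse_positive(0)
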
12 changes: 4 additions & 8 deletions sklearn/feature_selection/tests/test_rfe.py
@@ -30,8 +30,8 @@ class MockClassifier:
     def __init__(self, foo_param=0):
         self.foo_param = foo_param

-    def fit(self, X, Y):
-        assert len(X) == len(Y)
+    def fit(self, X, y):
+        assert len(X) == len(y)
         self.coef_ = np.ones(X.shape[1], dtype=np.float64)
         return self

@@ -42,12 +42,8 @@ def predict(self, T):
     decision_function = predict
     transform = predict

-    def score(self, X=None, Y=None):
-        if self.foo_param > 1:
-            score = 1.
-        else:
-            score = 0.
-        return score
+    def score(self, X=None, y=None):
+        return 0.

     def get_params(self, deep=True):
         return {'foo_param': self.foo_param}
10 changes: 5 additions & 5 deletions sklearn/feature_selection/tests/test_variance_threshold.py
@@ -11,6 +11,7 @@
         [0, 2, 2, 3, 5],
         [1, 1, 2, 4, 0]]

+data2 = [[-0.13725701]] * 10

 def test_zero_variance():
     # Test VarianceThreshold with default setting, zero variance.
@@ -32,17 +33,16 @@ def test_variance_threshold():
     assert (len(data), 1) == X.shape


+@pytest.mark.skipif(np.var(data2) == 0,
+                    reason=('This test is not valid for this platform, '
+                            'as it relies on numerical instabilities.'))
 def test_zero_variance_floating_point_error():
     # Test that VarianceThreshold(0.0).fit eliminates features that have
     # the same value in every sample, even when floating point errors
     # cause np.var not to be 0 for the feature.
     # See #13691

-    data = [[-0.13725701]] * 10
-    if np.var(data) == 0:
-        pytest.skip('This test is not valid for this platform, as it relies '
-                    'on numerical instabilities.')
-    for X in [data, csr_matrix(data), csc_matrix(data), bsr_matrix(data)]:
+    for X in [data2, csr_matrix(data2), csc_matrix(data2), bsr_matrix(data2)]:
         msg = "No feature in X meets the variance threshold 0.00000"
         with pytest.raises(ValueError, match=msg):
             VarianceThreshold().fit(X)
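Hoisting the data-dependent skip into @pytest.mark.skipif evaluates the condition once at collection time and surfaces the reason in the report, instead of the test starting and bailing out halfway through. A minimal sketch of the same move, assuming (as the original does) that summing ten identical floats can leave a rounding residue:

    import numpy as np
    import pytest

    data2 = [[-0.13725701]] * 10

    @pytest.mark.skipif(np.var(data2) == 0,
                        reason='platform computes an exactly zero variance')
    def test_variance_is_nonzero_but_tiny():
        # Runs only where floating point rounding makes the variance of
        # ten identical values nonzero, which is what the test relies on.
        assert 0 < np.var(data2) < 1e-16
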
1 change: 1 addition & 0 deletions sklearn/gaussian_process/tests/test_kernels.py
@@ -250,6 +250,7 @@ def test_kernel_clone_after_set_params(kernel):
                                isotropic_kernels):
         length_scale = params['length_scale']
         if np.iterable(length_scale):
+            # XXX unreached code as of v0.22
             params['length_scale'] = length_scale[0]
             params['length_scale_bounds'] = bounds
         else:
3 changes: 1 addition & 2 deletions sklearn/impute/tests/test_impute.py
@@ -177,6 +177,7 @@ def test_imputation_mean_median():
         X[:, j] = np.hstack((v, z, p))

         if 0 == test_missing_values:
+            # XXX unreached code as of v0.22
             X_true[:, j] = np.hstack((v,
                                       np.repeat(
                                           true_statistics[j],
@@ -706,7 +707,6 @@ def test_iterative_imputer_truncated_normal_posterior():
     # note that starting from the wrong random seed will make this test fail
     # because random sampling doesn't occur at all when the imputation
     # is outside of the (min_value, max_value) range
-    pytest.importorskip("scipy", minversion="0.17.0")
     rng = np.random.RandomState(42)

     X = rng.normal(size=(5, 5))
@@ -763,7 +763,6 @@ def test_iterative_imputer_missing_at_transform(strategy):


 def test_iterative_imputer_transform_stochasticity():
-    pytest.importorskip("scipy", minversion="0.17.0")
     rng1 = np.random.RandomState(0)
     rng2 = np.random.RandomState(1)
     n = 100
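The deleted importorskip guards could never fire: scipy is a hard dependency of scikit-learn, and the installation floor already guarantees at least the requested version, so the skip was dead code. The minversion form remains useful for genuinely optional dependencies, sketched here with a hypothetical package name:

    import pytest

    def test_feature_backed_by_optional_package():
        # Skips when "somepkg" is missing or older than 1.2. For a hard
        # dependency whose floor already exceeds the bound, a guard like
        # this can never trigger and should be removed.
        somepkg = pytest.importorskip("somepkg", minversion="1.2")
        assert somepkg is not None
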
9 changes: 0 additions & 9 deletions sklearn/impute/tests/test_knn.py
@@ -5,18 +5,9 @@
 from sklearn.metrics.pairwise import nan_euclidean_distances
 from sklearn.metrics.pairwise import pairwise_distances
 from sklearn.neighbors import KNeighborsRegressor
-from sklearn.utils._mask import _get_mask
 from sklearn.utils._testing import assert_allclose


-def _missing_mean(X, missing_value):
-    masked_X = np.ma.array(X, mask=_get_mask(X, missing_value))
-    masked_X_mean = masked_X.mean(axis=0)
-    output = masked_X_mean.data
-    output[masked_X_mean.mask] = np.nan
-    return output
-
-
 @pytest.mark.parametrize("weights", ["uniform", "distance"])
 @pytest.mark.parametrize("n_neighbors", range(1, 6))
 def test_knn_imputer_shape(weights, n_neighbors):
2 changes: 1 addition & 1 deletion sklearn/linear_model/tests/test_logistic.py
@@ -1723,7 +1723,7 @@ def fit(X, y, **kw):
     if sys.platform == 'darwin' and solver == 'lbfgs':
         pytest.xfail('Issue #11924: LogisticRegressionCV(solver="lbfgs", '
                      'multi_class="multinomial") is nondterministic on '
-                     'MacOS.')  # pragma: no cover
+                     'MacOS.')
     assert_allclose(est_auto_multi.coef_, est_multi_multi.coef_)
     assert_allclose(est_auto_multi.predict_proba(X2),
                     est_multi_multi.predict_proba(X2))
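A "# pragma: no cover" comment tells coverage tooling to ignore a line, but this pytest.xfail call is genuinely reachable (macOS with the lbfgs solver), so the pragma misdescribed the code path and is dropped. A minimal sketch of a run-time xfail for a platform-specific flake:

    import sys

    import pytest

    def test_platform_sensitive_result():
        # Reachable on macOS, so it must not be excluded from coverage.
        if sys.platform == 'darwin':
            pytest.xfail('known nondeterminism on macOS for this check')
        assert round(0.1 + 0.2, 10) == round(0.3, 10)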