From 070ca3564db88c4c18c832ec1415a4ff856a5473 Mon Sep 17 00:00:00 2001 From: Holger Peters Date: Wed, 8 Feb 2017 22:48:41 +0100 Subject: [PATCH 01/25] Migrate to pytest --- .gitignore | 2 + sklearn/datasets/tests/test_base.py | 88 +++++++----- sklearn/datasets/tests/test_mldata.py | 35 +++-- .../feature_extraction/tests/test_image.py | 2 +- sklearn/feature_extraction/tests/test_text.py | 2 +- .../gaussian_process/tests/test_kernels.py | 80 +++++------ sklearn/linear_model/tests/test_ransac.py | 3 +- sklearn/mixture/tests/test_gmm.py | 17 +-- sklearn/neighbors/tests/test_ball_tree.py | 42 +++--- sklearn/neighbors/tests/test_kd_tree.py | 4 +- sklearn/neighbors/tests/test_kde.py | 2 +- sklearn/svm/tests/test_bounds.py | 2 +- sklearn/tests/test_naive_bayes.py | 7 +- sklearn/tests/test_random_projection.py | 30 ++-- sklearn/tree/tests/test_tree.py | 129 +++++++++--------- sklearn/utils/testing.py | 37 +++-- sklearn/utils/tests/test_stats.py | 13 +- 17 files changed, 260 insertions(+), 235 deletions(-) diff --git a/.gitignore b/.gitignore index eb29ee31057d7..0f85606ab8862 100644 --- a/.gitignore +++ b/.gitignore @@ -62,3 +62,5 @@ benchmarks/bench_covertype_data/ !*/src/*.cpp *.sln *.pyproj +.cache +.eggs diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py index 92fe96fa10656..7ec08e2e100d4 100644 --- a/sklearn/datasets/tests/test_base.py +++ b/sklearn/datasets/tests/test_base.py @@ -6,6 +6,8 @@ from pickle import loads from pickle import dumps +import pytest + from sklearn.datasets import get_data_home from sklearn.datasets import clear_data_home from sklearn.datasets import load_files @@ -26,13 +28,24 @@ from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_array_equal -from sklearn.utils.testing import with_setup -DATA_HOME = tempfile.mkdtemp(prefix="scikit_learn_data_home_test_") -LOAD_FILES_ROOT = tempfile.mkdtemp(prefix="scikit_learn_load_files_test_") -TEST_CATEGORY_DIR1 = "" -TEST_CATEGORY_DIR2 = "" +@pytest.fixture(scope="module") +def data_home(): + try: + path = tempfile.mkdtemp(prefix="scikit_learn_data_home_test_") + yield path + finally: + _remove_dir(path) + + +@pytest.fixture(scope="module") +def load_files_root(): + try: + path = tempfile.mkdtemp(prefix="scikit_learn_load_files_test_") + yield path + finally: + _remove_dir(path) def _remove_dir(path): @@ -42,30 +55,34 @@ def _remove_dir(path): def teardown_module(): """Test fixture (clean up) run once after all tests of this module""" - for path in [DATA_HOME, LOAD_FILES_ROOT]: - _remove_dir(path) - -def setup_load_files(): - global TEST_CATEGORY_DIR1 - global TEST_CATEGORY_DIR2 - TEST_CATEGORY_DIR1 = tempfile.mkdtemp(dir=LOAD_FILES_ROOT) - TEST_CATEGORY_DIR2 = tempfile.mkdtemp(dir=LOAD_FILES_ROOT) - sample_file = tempfile.NamedTemporaryFile(dir=TEST_CATEGORY_DIR1, - delete=False) - sample_file.write(b("Hello World!\n")) - sample_file.close() - -def teardown_load_files(): - _remove_dir(TEST_CATEGORY_DIR1) - _remove_dir(TEST_CATEGORY_DIR2) +@pytest.fixture +def test_category_dir_1(load_files_root): + try: + TEST_CATEGORY_DIR1 = tempfile.mkdtemp(dir=load_files_root) + sample_file = tempfile.NamedTemporaryFile(dir=TEST_CATEGORY_DIR1, + delete=False) + sample_file.write(b("Hello World!\n")) + sample_file.close() + yield TEST_CATEGORY_DIR1 + finally: + _remove_dir(TEST_CATEGORY_DIR1) + + +@pytest.fixture +def test_category_dir_2(load_files_root): + try: + TEST_CATEGORY_DIR2 = tempfile.mkdtemp(dir=load_files_root) + 
yield TEST_CATEGORY_DIR2 + finally: + _remove_dir(TEST_CATEGORY_DIR2) -def test_data_home(): +def test_data_home(data_home): # get_data_home will point to a pre-existing folder - data_home = get_data_home(data_home=DATA_HOME) - assert_equal(data_home, DATA_HOME) + data_home = get_data_home(data_home=data_home) + assert_equal(data_home, data_home) assert_true(os.path.exists(data_home)) # clear_data_home will delete both the content and the folder it-self @@ -73,30 +90,28 @@ def test_data_home(): assert_false(os.path.exists(data_home)) # if the folder is missing it will be created again - data_home = get_data_home(data_home=DATA_HOME) + data_home = get_data_home(data_home=data_home) assert_true(os.path.exists(data_home)) -def test_default_empty_load_files(): - res = load_files(LOAD_FILES_ROOT) +def test_default_empty_load_files(load_files_root): + res = load_files(load_files_root) assert_equal(len(res.filenames), 0) assert_equal(len(res.target_names), 0) assert_equal(res.DESCR, None) -@with_setup(setup_load_files, teardown_load_files) -def test_default_load_files(): - res = load_files(LOAD_FILES_ROOT) +def test_default_load_files(test_category_dir_1, test_category_dir_2, load_files_root): + res = load_files(load_files_root) assert_equal(len(res.filenames), 1) assert_equal(len(res.target_names), 2) assert_equal(res.DESCR, None) assert_equal(res.data, [b("Hello World!\n")]) -@with_setup(setup_load_files, teardown_load_files) -def test_load_files_w_categories_desc_and_encoding(): - category = os.path.abspath(TEST_CATEGORY_DIR1).split('/').pop() - res = load_files(LOAD_FILES_ROOT, description="test", +def test_load_files_w_categories_desc_and_encoding(test_category_dir_1, test_category_dir_2, load_files_root): + category = os.path.abspath(test_category_dir_1).split('/').pop() + res = load_files(load_files_root, description="test", categories=category, encoding="utf-8") assert_equal(len(res.filenames), 1) assert_equal(len(res.target_names), 1) @@ -104,9 +119,8 @@ def test_load_files_w_categories_desc_and_encoding(): assert_equal(res.data, [u("Hello World!\n")]) -@with_setup(setup_load_files, teardown_load_files) -def test_load_files_wo_load_content(): - res = load_files(LOAD_FILES_ROOT, load_content=False) +def test_load_files_wo_load_content(test_category_dir_1, test_category_dir_2, load_files_root): + res = load_files(load_files_root, load_content=False) assert_equal(len(res.filenames), 1) assert_equal(len(res.target_names), 2) assert_equal(res.DESCR, None) diff --git a/sklearn/datasets/tests/test_mldata.py b/sklearn/datasets/tests/test_mldata.py index 1ce22079bdd11..c3ebd15af5276 100644 --- a/sklearn/datasets/tests/test_mldata.py +++ b/sklearn/datasets/tests/test_mldata.py @@ -13,24 +13,24 @@ from sklearn.utils.testing import mock_mldata_urlopen from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_raises -from sklearn.utils.testing import with_setup from sklearn.utils.testing import assert_array_equal - -tmpdir = None +import pytest -def setup_tmpdata(): - # create temporary dir - global tmpdir - tmpdir = tempfile.mkdtemp() - os.makedirs(os.path.join(tmpdir, 'mldata')) +tmpdir = None -def teardown_tmpdata(): - # remove temporary dir - if tmpdir is not None: - shutil.rmtree(tmpdir) +@pytest.fixture +def tmpdata(): + try: + global tmpdir + tmpdir = tempfile.mkdtemp() + os.makedirs(os.path.join(tmpdir, 'mldata')) + yield tmpdir + finally: + if tmpdir is not None: + shutil.rmtree(tmpdir) def test_mldata_filename(): @@ -43,8 +43,7 @@ def test_mldata_filename(): 
assert_equal(mldata_filename(name), desired) -@with_setup(setup_tmpdata, teardown_tmpdata) -def test_download(): +def test_download(tmpdata): """Test that fetch_mldata is able to download and cache a data set.""" _urlopen_ref = datasets.mldata.urlopen @@ -55,7 +54,7 @@ def test_download(): }, }) try: - mock = fetch_mldata('mock', data_home=tmpdir) + mock = fetch_mldata('mock', data_home=tmpdata) for n in ["COL_NAMES", "DESCR", "target", "data"]: assert_in(n, mock) @@ -68,8 +67,7 @@ def test_download(): datasets.mldata.urlopen = _urlopen_ref -@with_setup(setup_tmpdata, teardown_tmpdata) -def test_fetch_one_column(): +def test_fetch_one_column(tmpdata): _urlopen_ref = datasets.mldata.urlopen try: dataname = 'onecol' @@ -92,8 +90,7 @@ def test_fetch_one_column(): datasets.mldata.urlopen = _urlopen_ref -@with_setup(setup_tmpdata, teardown_tmpdata) -def test_fetch_multiple_column(): +def test_fetch_multiple_column(tmpdata): _urlopen_ref = datasets.mldata.urlopen try: # create fake data set in cache diff --git a/sklearn/feature_extraction/tests/test_image.py b/sklearn/feature_extraction/tests/test_image.py index 6c57788efe904..bcde10126fb5c 100644 --- a/sklearn/feature_extraction/tests/test_image.py +++ b/sklearn/feature_extraction/tests/test_image.py @@ -6,7 +6,7 @@ import scipy as sp from scipy import ndimage -from numpy.testing import assert_raises +from sklearn.utils.testing import assert_raises from sklearn.feature_extraction.image import ( img_to_graph, grid_to_graph, extract_patches_2d, diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 88382f7d13c0b..1c431beabfa3e 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -23,7 +23,7 @@ import numpy as np from numpy.testing import assert_array_almost_equal from numpy.testing import assert_array_equal -from numpy.testing import assert_raises +from sklearn.utils.testing import assert_raises from sklearn.utils.random import choice from sklearn.utils.testing import (assert_equal, assert_false, assert_true, assert_not_equal, assert_almost_equal, diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py index 003c5727da097..e94e115e8b8af 100644 --- a/sklearn/gaussian_process/tests/test_kernels.py +++ b/sklearn/gaussian_process/tests/test_kernels.py @@ -5,6 +5,7 @@ from sklearn.externals.funcsigs import signature +import pytest import numpy as np from sklearn.gaussian_process.kernels import _approx_fprime @@ -192,48 +193,47 @@ def check_hyperparameters_equal(kernel1, kernel2): assert_equal(attr_value1, attr_value2) -def test_kernel_clone(): +@pytest.mark.parametrize("kernel", kernels) +def test_kernel_clone(kernel): # Test that sklearn's clone works correctly on kernels. bounds = (1e-5, 1e5) - for kernel in kernels: - kernel_cloned = clone(kernel) - - # XXX: Should this be fixed? - # This differs from the sklearn's estimators equality check. - assert_equal(kernel, kernel_cloned) - assert_not_equal(id(kernel), id(kernel_cloned)) - - # Check that all constructor parameters are equal. - assert_equal(kernel.get_params(), kernel_cloned.get_params()) - - # Check that all hyperparameters are equal. - yield check_hyperparameters_equal, kernel, kernel_cloned - - # This test is to verify that using set_params does not - # break clone on kernels. 
- # This used to break because in kernels such as the RBF, non-trivial - # logic that modified the length scale used to be in the constructor - # See https://github.com/scikit-learn/scikit-learn/issues/6961 - # for more details. - params = kernel.get_params() - # RationalQuadratic kernel is isotropic. - isotropic_kernels = (ExpSineSquared, RationalQuadratic) - if 'length_scale' in params and not isinstance(kernel, - isotropic_kernels): - length_scale = params['length_scale'] - if np.iterable(length_scale): - params['length_scale'] = length_scale[0] - params['length_scale_bounds'] = bounds - else: - params['length_scale'] = [length_scale] * 2 - params['length_scale_bounds'] = bounds * 2 - kernel_cloned.set_params(**params) - kernel_cloned_clone = clone(kernel_cloned) - assert_equal(kernel_cloned_clone.get_params(), - kernel_cloned.get_params()) - assert_not_equal(id(kernel_cloned_clone), id(kernel_cloned)) - yield (check_hyperparameters_equal, kernel_cloned, - kernel_cloned_clone) + kernel_cloned = clone(kernel) + + # XXX: Should this be fixed? + # This differs from the sklearn's estimators equality check. + assert_equal(kernel, kernel_cloned) + assert_not_equal(id(kernel), id(kernel_cloned)) + + # Check that all constructor parameters are equal. + assert_equal(kernel.get_params(), kernel_cloned.get_params()) + + # Check that all hyperparameters are equal. + check_hyperparameters_equal(kernel, kernel_cloned) + + # This test is to verify that using set_params does not + # break clone on kernels. + # This used to break because in kernels such as the RBF, non-trivial + # logic that modified the length scale used to be in the constructor + # See https://github.com/scikit-learn/scikit-learn/issues/6961 + # for more details. + params = kernel.get_params() + # RationalQuadratic kernel is isotropic. 
+ isotropic_kernels = (ExpSineSquared, RationalQuadratic) + if 'length_scale' in params and not isinstance(kernel, + isotropic_kernels): + length_scale = params['length_scale'] + if np.iterable(length_scale): + params['length_scale'] = length_scale[0] + params['length_scale_bounds'] = bounds + else: + params['length_scale'] = [length_scale] * 2 + params['length_scale_bounds'] = bounds * 2 + kernel_cloned.set_params(**params) + kernel_cloned_clone = clone(kernel_cloned) + assert_equal(kernel_cloned_clone.get_params(), + kernel_cloned.get_params()) + assert_not_equal(id(kernel_cloned_clone), id(kernel_cloned)) + check_hyperparameters_equal(kernel_cloned, kernel_cloned_clone) def test_matern_kernel(): diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py index b19ee0aa25895..c742f4d2beeb3 100644 --- a/sklearn/linear_model/tests/test_ransac.py +++ b/sklearn/linear_model/tests/test_ransac.py @@ -3,7 +3,8 @@ import numpy as np from scipy import sparse -from numpy.testing import assert_equal, assert_raises +from numpy.testing import assert_equal +from sklearn.utils.testing import assert_raises from numpy.testing import assert_array_almost_equal from numpy.testing import assert_array_equal diff --git a/sklearn/mixture/tests/test_gmm.py b/sklearn/mixture/tests/test_gmm.py index 2a2dce1fc18d1..90e973a34884f 100644 --- a/sklearn/mixture/tests/test_gmm.py +++ b/sklearn/mixture/tests/test_gmm.py @@ -8,9 +8,10 @@ import copy import sys +import pytest + import numpy as np -from numpy.testing import (assert_array_equal, assert_array_almost_equal, - assert_raises) +from numpy.testing import assert_array_equal, assert_array_almost_equal from scipy import stats from sklearn import mixture from sklearn.datasets.samples_generator import make_spd_matrix @@ -157,12 +158,12 @@ def test_GMM_attributes(): covars = (0.1 + 2 * rng.rand(n_components, n_features)) ** 2 g.covars_ = covars assert_array_almost_equal(g.covars_, covars) - assert_raises(ValueError, g._set_covars, []) - assert_raises(ValueError, g._set_covars, - np.zeros((n_components - 2, n_features))) - - assert_raises(ValueError, mixture.GMM, n_components=20, - covariance_type='badcovariance_type') + with pytest.raises(ValueError): + g._set_covars([]) + with pytest.raises(ValueError): + g._set_covars( np.zeros((n_components - 2, n_features))) + with pytest.raises(ValueError): + mixture.GMM( n_components=20, covariance_type='badcovariance_type') class GMMTester(): diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index 0f5968cd63897..d7b521ce3fb4c 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -1,5 +1,6 @@ import pickle import numpy as np +import pytest from numpy.testing import assert_array_almost_equal from sklearn.neighbors.ball_tree import (BallTree, NeighborsHeap, simultaneous_sort, kernel_norm, @@ -60,9 +61,7 @@ def check_neighbors(dualtree, breadth_first, k, metric, kwargs): for k in (1, 3, 5): for dualtree in (True, False): for breadth_first in (True, False): - yield (check_neighbors, - dualtree, breadth_first, - k, metric, kwargs) + check_neighbors(dualtree, breadth_first, k, metric, kwargs) def test_ball_tree_query_boolean_metrics(): @@ -78,7 +77,7 @@ def check_neighbors(metric): assert_array_almost_equal(dist1, dist2) for metric in BOOLEAN_METRICS: - yield check_neighbors, metric + check_neighbors(metric) def test_ball_tree_query_discrete_metrics(): @@ -94,7 +93,7 @@ def check_neighbors(metric): 
assert_array_almost_equal(dist1, dist2) for metric in DISCRETE_METRICS: - yield check_neighbors, metric + check_neighbors(metric) def test_ball_tree_query_radius(n_samples=100, n_features=10): @@ -156,29 +155,26 @@ def compute_kernel_slow(Y, X, kernel, h): raise ValueError('kernel not recognized') -def test_ball_tree_kde(n_samples=100, n_features=3): +@pytest.mark.parametrize("kernel", ['gaussian', 'tophat', 'epanechnikov', + 'exponential', 'linear', 'cosine']) +@pytest.mark.parametrize("h", [0.01, 0.1, 1]) +@pytest.mark.parametrize("rtol", [0, 1E-5]) +@pytest.mark.parametrize("atol", [1E-6, 1E-2]) +@pytest.mark.parametrize("breadth_first", [True, False]) +def test_ball_tree_kde(kernel, h, rtol, atol, breadth_first, n_samples=100, n_features=3): np.random.seed(0) X = np.random.random((n_samples, n_features)) Y = np.random.random((n_samples, n_features)) bt = BallTree(X, leaf_size=10) - for kernel in ['gaussian', 'tophat', 'epanechnikov', - 'exponential', 'linear', 'cosine']: - for h in [0.01, 0.1, 1]: - dens_true = compute_kernel_slow(Y, X, kernel, h) + dens_true = compute_kernel_slow(Y, X, kernel, h) - def check_results(kernel, h, atol, rtol, breadth_first): - dens = bt.kernel_density(Y, h, atol=atol, rtol=rtol, - kernel=kernel, - breadth_first=breadth_first) - assert_allclose(dens, dens_true, - atol=atol, rtol=max(rtol, 1e-7)) + dens = bt.kernel_density(Y, h, atol=atol, rtol=rtol, + kernel=kernel, + breadth_first=breadth_first) + assert_allclose(dens, dens_true, + atol=atol, rtol=max(rtol, 1e-7)) - for rtol in [0, 1E-5]: - for atol in [1E-6, 1E-2]: - for breadth_first in (True, False): - yield (check_results, kernel, h, atol, rtol, - breadth_first) def test_gaussian_kde(n_samples=1000): @@ -217,7 +213,7 @@ def check_two_point(r, dualtree): assert_array_almost_equal(counts, counts_true) for dualtree in (True, False): - yield check_two_point, r, dualtree + check_two_point(r, dualtree) def test_ball_tree_pickle(): @@ -248,7 +244,7 @@ def check_pickle_protocol(protocol): assert_array_almost_equal(dist1_pyfunc, dist2_pyfunc) for protocol in (0, 1, 2): - yield check_pickle_protocol, protocol + check_pickle_protocol(protocol) def test_neighbors_heap(n_pts=5, n_nbrs=10): diff --git a/sklearn/neighbors/tests/test_kd_tree.py b/sklearn/neighbors/tests/test_kd_tree.py index 50ece8f97a271..392a4a27c087c 100644 --- a/sklearn/neighbors/tests/test_kd_tree.py +++ b/sklearn/neighbors/tests/test_kd_tree.py @@ -128,8 +128,8 @@ def check_results(kernel, h, atol, rtol, breadth_first): for rtol in [0, 1E-5]: for atol in [1E-6, 1E-2]: for breadth_first in (True, False): - yield (check_results, kernel, h, atol, rtol, - breadth_first) + check_results(kernel, h, atol, rtol, + breadth_first) def test_gaussian_kde(n_samples=1000): diff --git a/sklearn/neighbors/tests/test_kde.py b/sklearn/neighbors/tests/test_kde.py index 3078a3c05df39..7e8cd9a1475b4 100644 --- a/sklearn/neighbors/tests/test_kde.py +++ b/sklearn/neighbors/tests/test_kde.py @@ -52,7 +52,7 @@ def check_results(kernel, bandwidth, atol, rtol): for rtol in [0, 1E-5]: for atol in [1E-6, 1E-2]: for breadth_first in (True, False): - yield (check_results, kernel, bandwidth, atol, rtol) + check_results(kernel, bandwidth, atol, rtol) def test_kernel_density_sampling(n_samples=100, n_features=3): diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index 7a280d22a6a81..af78e80c8ea01 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -35,7 +35,7 @@ def test_l1_min_c(): check.description = ('Test 
l1_min_c loss=%r %s %s %s' % (loss, X_label, Y_label, intercept_label)) - yield check + check() # loss='l2' should raise ValueError assert_raise_message(ValueError, "loss type not in", diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py index be278ed884fda..c68cd5f37cfe2 100644 --- a/sklearn/tests/test_naive_bayes.py +++ b/sklearn/tests/test_naive_bayes.py @@ -2,6 +2,7 @@ from io import BytesIO import numpy as np import scipy.sparse +import pytest from sklearn.datasets import load_digits, load_iris @@ -223,9 +224,9 @@ def check_partial_fit(cls): assert_array_equal(clf1.feature_count_, clf3.feature_count_) -def test_discretenb_partial_fit(): - for cls in [MultinomialNB, BernoulliNB]: - yield check_partial_fit, cls +@pytest.mark.parametrize("cls", [MultinomialNB, BernoulliNB]) +def test_discretenb_partial_fit(cls): + check_partial_fit(cls) def test_gnb_partial_fit(): diff --git a/sklearn/tests/test_random_projection.py b/sklearn/tests/test_random_projection.py index dcbe97c7d6d7f..08209a7046b1b 100644 --- a/sklearn/tests/test_random_projection.py +++ b/sklearn/tests/test_random_projection.py @@ -2,6 +2,7 @@ import numpy as np import scipy.sparse as sp +import pytest from sklearn.metrics import euclidean_distances @@ -113,21 +114,22 @@ def check_input_with_sparse_random_matrix(random_matrix): random_matrix, n_components, n_features, density=density) -def test_basic_property_of_random_matrix(): +@pytest.mark.parametrize("random_matrix", all_random_matrix) +def test_basic_property_of_random_matrix(random_matrix): # Check basic properties of random matrix generation - for random_matrix in all_random_matrix: - yield check_input_size_random_matrix, random_matrix - yield check_size_generated, random_matrix - yield check_zero_mean_and_unit_norm, random_matrix - - for random_matrix in all_sparse_random_matrix: - yield check_input_with_sparse_random_matrix, random_matrix - - random_matrix_dense = \ - lambda n_components, n_features, random_state: random_matrix( - n_components, n_features, random_state=random_state, - density=1.0) - yield check_zero_mean_and_unit_norm, random_matrix_dense + check_input_size_random_matrix(random_matrix) + check_size_generated(random_matrix) + check_zero_mean_and_unit_norm(random_matrix) + + +@pytest.mark.parametrize("random_matrix", all_sparse_random_matrix) +def test_basic_property_of_random_matrix(random_matrix): + check_input_with_sparse_random_matrix(random_matrix) + + def random_matrix_dense(n_components, n_features, random_state): + return random_matrix(n_components, n_features, random_state=random_state, density=1.0) + + check_zero_mean_and_unit_norm(random_matrix_dense) def test_gaussian_random_matrix(): diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index ff662e9af414a..57238dad60a82 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -7,6 +7,7 @@ from itertools import product import struct +import pytest import numpy as np from scipy.sparse import csc_matrix from scipy.sparse import csr_matrix @@ -695,14 +696,14 @@ def check_min_weight_fraction_leaf(name, datasets, sparse=False): name, est.min_weight_fraction_leaf)) -def test_min_weight_fraction_leaf(): - # Check on dense input - for name in ALL_TREES: - yield check_min_weight_fraction_leaf, name, "iris" +@pytest.mark.parametrize("name", ALL_TREES) +def test_min_weight_fraction_leaf_on_dense_input(name): + check_min_weight_fraction_leaf(name, "iris") - # Check on sparse input - for name in SPARSE_TREES: - yield 
check_min_weight_fraction_leaf, name, "multilabel", True + +@pytest.mark.parametrize("name", SPARSE_TREES) +def test_min_weight_fraction_leaf_on_sparse_input(name): + check_min_weight_fraction_leaf(name, "multilabel", True) def check_min_weight_fraction_leaf_with_min_samples_leaf(name, datasets, @@ -769,16 +770,14 @@ def check_min_weight_fraction_leaf_with_min_samples_leaf(name, datasets, est.min_samples_leaf)) -def test_min_weight_fraction_leaf_with_min_samples_leaf(): - # Check on dense input - for name in ALL_TREES: - yield (check_min_weight_fraction_leaf_with_min_samples_leaf, - name, "iris") +@pytest.mark.parametrize("name", ALL_TREES) +def test_min_weight_fraction_leaf_with_min_samples_leaf_on_dense_input(name): + check_min_weight_fraction_leaf_with_min_samples_leaf(name, "iris") + - # Check on sparse input - for name in SPARSE_TREES: - yield (check_min_weight_fraction_leaf_with_min_samples_leaf, - name, "multilabel", True) +@pytest.mark.parametrize("name", SPARSE_TREES) +def test_min_weight_fraction_leaf_with_min_samples_leaf_on_sparse_input(name): + check_min_weight_fraction_leaf_with_min_samples_leaf(name, "multilabel", True) def test_min_impurity_split(): @@ -1107,9 +1106,9 @@ def check_class_weights(name): assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_) -def test_class_weights(): - for name in CLF_TREES: - yield check_class_weights, name +@pytest.mark.parametrize("name", CLF_TREES) +def test_class_weights(name): + check_class_weights(name) def check_class_weight_errors(name): @@ -1131,9 +1130,9 @@ def check_class_weight_errors(name): assert_raises(ValueError, clf.fit, X, _y) -def test_class_weight_errors(): - for name in CLF_TREES: - yield check_class_weight_errors, name +@pytest.mark.parametrize("name", CLF_TREES) +def test_class_weight_errors(name): + check_class_weight_errors(name) def test_max_leaf_nodes(): @@ -1278,21 +1277,23 @@ def check_sparse_input(tree, dataset, max_depth=None): assert_array_almost_equal(s.predict_log_proba(X_sparse_test), y_log_proba) - -def test_sparse_input(): - for tree_type, dataset in product(SPARSE_TREES, ("clf_small", "toy", +tree_type_dataset_combination = product(SPARSE_TREES, ("clf_small", "toy", "digits", "multilabel", "sparse-pos", "sparse-neg", - "sparse-mix", "zeros")): - max_depth = 3 if dataset == "digits" else None - yield (check_sparse_input, tree_type, dataset, max_depth) + "sparse-mix", "zeros")) +@pytest.mark.parametrize("tree_type, dataset", tree_type_dataset_combination) +def test_sparse_input(tree_type, dataset): + max_depth = 3 if dataset == "digits" else None + check_sparse_input(tree_type, dataset, max_depth) + +@pytest.mark.parametrize("tree_type, dataset", product(SPARSE_TREES, ["boston", "reg_small"])) +def test_sparse_input(tree_type, dataset): # Due to numerical instability of MSE and too strict test, we limit the # maximal depth - for tree_type, dataset in product(SPARSE_TREES, ["boston", "reg_small"]): - if tree_type in REG_TREES: - yield (check_sparse_input, tree_type, dataset, 2) + if tree_type in REG_TREES: + check_sparse_input(tree_type, dataset, 2) def check_sparse_parameters(tree, dataset): @@ -1339,11 +1340,12 @@ def check_sparse_parameters(tree, dataset): assert_array_almost_equal(s.predict(X), d.predict(X)) -def test_sparse_parameters(): - for tree_type, dataset in product(SPARSE_TREES, ["sparse-pos", - "sparse-neg", - "sparse-mix", "zeros"]): - yield (check_sparse_parameters, tree_type, dataset) +tree_type_dataset_combinations = product(SPARSE_TREES, ["sparse-pos", "sparse-neg", 
"sparse-mix", "zeros"]) + + +@pytest.mark.parametrize("tree_type, dataset", tree_type_dataset_combinations) +def test_sparse_parameters(tree_type, dataset): + check_sparse_parameters(tree_type, dataset) def check_sparse_criterion(tree, dataset): @@ -1366,11 +1368,9 @@ def check_sparse_criterion(tree, dataset): assert_array_almost_equal(s.predict(X), d.predict(X)) -def test_sparse_criterion(): - for tree_type, dataset in product(SPARSE_TREES, ["sparse-pos", - "sparse-neg", - "sparse-mix", "zeros"]): - yield (check_sparse_criterion, tree_type, dataset) +@pytest.mark.parametrize("tree_type, dataset", tree_type_dataset_combinations) +def test_sparse_criterion(tree_type, dataset): + check_sparse_criterion(tree_type, dataset) def check_explicit_sparse_zeros(tree, max_depth=3, @@ -1442,9 +1442,9 @@ def check_explicit_sparse_zeros(tree, max_depth=3, d.predict_proba(X2)) -def test_explicit_sparse_zeros(): - for tree_type in SPARSE_TREES: - yield (check_explicit_sparse_zeros, tree_type) +@pytest.mark.parametrize("tree_type", SPARSE_TREES) +def test_explicit_sparse_zeros(tree_type): + check_explicit_sparse_zeros(tree_type) @ignore_warnings @@ -1462,10 +1462,11 @@ def check_raise_error_on_1d_input(name): assert_raises(ValueError, est.predict, [X]) -@ignore_warnings -def test_1d_input(): - for name in ALL_TREES: - yield check_raise_error_on_1d_input, name +# XXX +# @ignore_warnings +@pytest.mark.parametrize("name", ALL_TREES) +def test_1d_input(name): + check_raise_error_on_1d_input(name) def _check_min_weight_leaf_split_level(TreeEstimator, X, y, sample_weight): @@ -1492,9 +1493,9 @@ def check_min_weight_leaf_split_level(name): sample_weight) -def test_min_weight_leaf_split_level(): - for name in ALL_TREES: - yield check_min_weight_leaf_split_level, name +@pytest.mark.parametrize("name", ALL_TREES) +def test_min_weight_leaf_split_level(name): + check_min_weight_leaf_split_level(name) def check_public_apply(name): @@ -1515,12 +1516,14 @@ def check_public_apply_sparse(name): est.tree_.apply(X_small32)) -def test_public_apply(): - for name in ALL_TREES: - yield (check_public_apply, name) +@pytest.mark.parametrize("name", ALL_TREES) +def test_public_apply_all_trees(name): + check_public_apply(name) + - for name in SPARSE_TREES: - yield (check_public_apply_sparse, name) +@pytest.mark.parametrize("name", SPARSE_TREES) +def test_public_apply_all_trees(name): + check_public_apply_sparse(name) def check_presort_sparse(est, X, y): @@ -1539,7 +1542,7 @@ def test_presort_sparse(): y = y[:, 0] for est, sparse_matrix in product(ests, sparse_matrices): - yield check_presort_sparse, est, sparse_matrix(X), y + check_presort_sparse(est, sparse_matrix(X), y) def test_decision_path_hardcoded(): @@ -1578,9 +1581,9 @@ def check_decision_path(name): assert_less_equal(est.tree_.max_depth, max_depth) -def test_decision_path(): - for name in ALL_TREES: - yield (check_decision_path, name) +@pytest.mark.parametrize("name", ALL_TREES) +def test_decision_path(name): + check_decision_path(name) def check_no_sparse_y_support(name): @@ -1589,10 +1592,10 @@ def check_no_sparse_y_support(name): assert_raises(TypeError, TreeEstimator(random_state=0).fit, X, y) -def test_no_sparse_y_support(): +@pytest.mark.parametrize("name", ALL_TREES) +def test_no_sparse_y_support(name): # Currently we don't support sparse y - for name in ALL_TREES: - yield (check_no_sparse_y_support, name) + check_no_sparse_y_support(name) def test_mae(): diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 9a62b3c6a96fc..35dab0ca4bafa 100644 --- 
a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -46,8 +46,13 @@ from sklearn.base import BaseEstimator from sklearn.externals import joblib -from nose.tools import raises -from nose import with_setup +import pytest + + +def raises(exception): + return pytest.mark.xfail(raises=exception) + + from numpy.testing import assert_almost_equal from numpy.testing import assert_array_equal @@ -61,7 +66,7 @@ from sklearn.cluster import DBSCAN __all__ = ["assert_equal", "assert_not_equal", "assert_raises", - "assert_raises_regexp", "raises", "with_setup", "assert_true", + "assert_raises_regexp", "raises", "assert_true", "assert_false", "assert_almost_equal", "assert_array_equal", "assert_array_almost_equal", "assert_array_less", "assert_less", "assert_less_equal", @@ -74,7 +79,13 @@ assert_not_equal = _dummy.assertNotEqual assert_true = _dummy.assertTrue assert_false = _dummy.assertFalse -assert_raises = _dummy.assertRaises + + +def assert_raises(exception, fct, *args, **kwargs): + with pytest.raises(exception): + return fct(*args, **kwargs) + + SkipTest = unittest.case.SkipTest assert_dict_equal = _dummy.assertDictEqual assert_in = _dummy.assertIn @@ -673,15 +684,15 @@ def clean_warning_registry(): getattr(mod, reg).clear() -def check_skip_network(): - if int(os.environ.get('SKLEARN_SKIP_NETWORK_TESTS', 0)): - raise SkipTest("Text tutorial requires large dataset download") +# def check_skip_network(): + # if int(os.environ.get('SKLEARN_SKIP_NETWORK_TESTS', 0)): + # raise SkipTest("Text tutorial requires large dataset download") -def check_skip_travis(): - """Skip test if being run on Travis.""" - if os.environ.get('TRAVIS') == "true": - raise SkipTest("This test needs to be skipped on Travis") +# def check_skip_travis(): + # """Skip test if being run on Travis.""" + # if os.environ.get('TRAVIS') == "true": + # raise SkipTest("This test needs to be skipped on Travis") def _delete_folder(folder_path, warn=False): @@ -716,8 +727,8 @@ def __exit__(self, exc_type, exc_val, exc_tb): _delete_folder(self.temp_folder) -with_network = with_setup(check_skip_network) -with_travis = with_setup(check_skip_travis) +with_network = pytest.mark.skipif(int(os.environ.get('SKLEARN_SKIP_NETWORK_TESTS', 0)), reason="skip_network tests") +with_travis = pytest.mark.skipif(os.environ.get('TRAVIS') == 'true', reason='skip on travis') class _named_check(object): diff --git a/sklearn/utils/tests/test_stats.py b/sklearn/utils/tests/test_stats.py index a941ce38a85ea..163bacd62feb7 100644 --- a/sklearn/utils/tests/test_stats.py +++ b/sklearn/utils/tests/test_stats.py @@ -1,3 +1,4 @@ +import pytest from sklearn.utils.testing import assert_array_equal from sklearn.utils.stats import rankdata @@ -13,11 +14,7 @@ ) -def test_cases(): - - def check_case(values, method, expected): - r = rankdata(values, method=method) - assert_array_equal(r, expected) - - for values, method, expected in _cases: - yield check_case, values, method, expected +@pytest.mark.parametrize("values, method, expected", _cases) +def test_cases_rankdata(values, method, expected): + r = rankdata(values, method=method) + assert_array_equal(r, expected) From 3d952628e87e1304f2ffe0445297fd62c5e95e56 Mon Sep 17 00:00:00 2001 From: Holger Peters Date: Wed, 8 Feb 2017 23:00:42 +0100 Subject: [PATCH 02/25] Replace nosetests with pytest on travis --- build_tools/travis/install.sh | 16 ++++++++-------- build_tools/travis/test_script.sh | 4 ++-- setup.cfg | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/build_tools/travis/install.sh 
b/build_tools/travis/install.sh index 85e0d174a1812..63cdac4aca475 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -51,21 +51,21 @@ if [[ "$DISTRIB" == "conda" ]]; then # Configure the conda environment and put it in the path using the # provided versions if [[ "$INSTALL_MKL" == "true" ]]; then - conda create -n testenv --yes python=$PYTHON_VERSION pip nose \ + conda create -n testenv --yes python=$PYTHON_VERSION pip pytest \ numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION \ mkl cython=$CYTHON_VERSION \ ${PANDAS_VERSION+pandas=$PANDAS_VERSION} - + else - conda create -n testenv --yes python=$PYTHON_VERSION pip nose \ + conda create -n testenv --yes python=$PYTHON_VERSION pip pytest \ numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION \ nomkl cython=$CYTHON_VERSION \ ${PANDAS_VERSION+pandas=$PANDAS_VERSION} fi source activate testenv - # Install nose-timer via pip - pip install nose-timer + # Install pytest-cov via pip + pip install pytest-cov elif [[ "$DISTRIB" == "ubuntu" ]]; then # At the time of writing numpy 1.9.1 is included in the travis @@ -76,7 +76,7 @@ elif [[ "$DISTRIB" == "ubuntu" ]]; then # and scipy virtualenv --system-site-packages testvenv source testvenv/bin/activate - pip install nose nose-timer cython + pip install pytest pytest-cov cython elif [[ "$DISTRIB" == "scipy-dev-wheels" ]]; then # Set up our own virtualenv environment to avoid travis' numpy. @@ -91,11 +91,11 @@ elif [[ "$DISTRIB" == "scipy-dev-wheels" ]]; then pip install --pre --upgrade --no-index --timeout=60 \ --trusted-host travis-dev-wheels.scipy.org \ -f https://travis-dev-wheels.scipy.org/ numpy scipy - pip install nose nose-timer cython + pip install pytest cython fi if [[ "$COVERAGE" == "true" ]]; then - pip install coverage codecov + pip install pytest-cov coverage codecov fi if [[ "$SKIP_TESTS" == "true" ]]; then diff --git a/build_tools/travis/test_script.sh b/build_tools/travis/test_script.sh index 6ab342b932cf1..b3c33e82401b6 100755 --- a/build_tools/travis/test_script.sh +++ b/build_tools/travis/test_script.sh @@ -34,9 +34,9 @@ run_tests() { export SKLEARN_SKIP_NETWORK_TESTS=1 if [[ "$COVERAGE" == "true" ]]; then - nosetests -s --with-coverage --with-timer --timer-top-n 20 sklearn + py.test -s --cov=sklearn --durations=20 $OLDPWD/sklearn else - nosetests -s --with-timer --timer-top-n 20 sklearn + py.test -s --durations=20 $OLDPWD/sklearn fi # Test doc diff --git a/setup.cfg b/setup.cfg index 0df34d84602e3..ae61b7a9522bf 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,7 +1,7 @@ [aliases] # python2.7 has upgraded unittest and it is no longer compatible with some # of our tests, so we run all through nose -test = nosetests +test = pytest [nosetests] # nosetests skips test files with the executable bit by default From 607f1b9a663f9dcebbc11f5141c6fb43c6df400f Mon Sep 17 00:00:00 2001 From: Holger Peters Date: Wed, 8 Feb 2017 23:58:51 +0100 Subject: [PATCH 03/25] Makefile to pytest --- Makefile | 16 +++++++--------- build_tools/appveyor/requirements.txt | 4 ++-- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index aa6203f3cdbe7..567e7ecd18363 100644 --- a/Makefile +++ b/Makefile @@ -4,14 +4,14 @@ PYTHON ?= python CYTHON ?= cython -NOSETESTS ?= nosetests +PYTEST ?= py.test CTAGS ?= ctags # skip doctests on 32bit python BITS := $(shell python -c 'import struct; print(8 * struct.calcsize("P"))') ifeq ($(BITS),32) - NOSETESTS:=$(NOSETESTS) -c setup32.cfg + PYTEST:=$(PYTEST) -c setup32.cfg endif @@ -29,21 +29,19 @@ inplace: $(PYTHON) setup.py build_ext -i 
test-code: in - $(NOSETESTS) -s -v sklearn + $(PYTEST) -s -v sklearn test-sphinxext: - $(NOSETESTS) -s -v doc/sphinxext/ + $(PYTEST) -s -v doc/sphinxext/ test-doc: ifeq ($(BITS),64) - $(NOSETESTS) -s -v doc/*.rst doc/modules/ doc/datasets/ \ - doc/developers doc/tutorial/basic doc/tutorial/statistical_inference \ - doc/tutorial/text_analytics + $(PYTEST) -s -v --doctest-glob='*.rst' endif test-coverage: rm -rf coverage .coverage - $(NOSETESTS) -s -v --with-coverage sklearn + $(PYTEST) -s -v --cov=sklearn -test: test-code test-sphinxext test-doc +test: test-code test-doc trailing-spaces: find sklearn -name "*.py" -exec perl -pi -e 's/[ \t]*$$//' {} \; diff --git a/build_tools/appveyor/requirements.txt b/build_tools/appveyor/requirements.txt index 0b9c63f72789f..fe185986e9e19 100644 --- a/build_tools/appveyor/requirements.txt +++ b/build_tools/appveyor/requirements.txt @@ -10,7 +10,7 @@ numpy==1.9.3 scipy==0.16.0 cython -nose -nose-timer +pytest +pytest-cov wheel wheelhouse_uploader From 86d49081cad7755522ae1859a8785f5ecd4daa70 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 4 May 2018 17:58:31 +0300 Subject: [PATCH 04/25] Fix conflicts and add a few more parametrizations --- .gitignore | 5 -- Makefile | 27 +------- build_tools/travis/install.sh | 2 +- .../gaussian_process/tests/test_kernels.py | 19 +++--- sklearn/mixture/tests/test_gmm.py | 2 +- sklearn/neighbors/tests/test_ball_tree.py | 68 ++++++++----------- sklearn/neighbors/tests/test_kd_tree.py | 2 +- sklearn/svm/tests/test_bounds.py | 25 +++---- sklearn/tests/test_random_projection.py | 7 +- sklearn/tree/tests/test_tree.py | 57 ++++++++-------- sklearn/utils/testing.py | 45 +++--------- 11 files changed, 94 insertions(+), 165 deletions(-) diff --git a/.gitignore b/.gitignore index 6bf844061d27e..9fa8c09bdf0b0 100644 --- a/.gitignore +++ b/.gitignore @@ -62,13 +62,8 @@ benchmarks/bench_covertype_data/ !*/src/*.cpp *.sln *.pyproj -<<<<<<< HEAD -.cache -.eggs -======= # Used by py.test .cache .pytest_cache/ _configtest.o.d ->>>>>>> master diff --git a/Makefile b/Makefile index 627d9cac299e4..ac242b12e59c2 100644 --- a/Makefile +++ b/Makefile @@ -4,24 +4,12 @@ PYTHON ?= python CYTHON ?= cython -<<<<<<< HEAD -PYTEST ?= py.test -======= PYTEST ?= pytest ->>>>>>> master CTAGS ?= ctags # skip doctests on 32bit python BITS := $(shell python -c 'import struct; print(8 * struct.calcsize("P"))') -<<<<<<< HEAD -ifeq ($(BITS),32) - PYTEST:=$(PYTEST) -c setup32.cfg -endif - - -======= ->>>>>>> master all: clean inplace test clean-ctags: @@ -36,32 +24,19 @@ inplace: $(PYTHON) setup.py build_ext -i test-code: in -<<<<<<< HEAD - $(PYTEST) -s -v sklearn -test-sphinxext: - $(PYTEST) -s -v doc/sphinxext/ -test-doc: -ifeq ($(BITS),64) - $(PYTEST) -s -v --doctest-glob='*.rst' -======= $(PYTEST) --showlocals -v sklearn test-sphinxext: $(PYTEST) --showlocals -v doc/sphinxext/ test-doc: ifeq ($(BITS),64) $(PYTEST) $(shell find doc -name '*.rst' | sort) ->>>>>>> master endif test-coverage: rm -rf coverage .coverage -<<<<<<< HEAD - $(PYTEST) -s -v --cov=sklearn -======= $(PYTEST) sklearn --showlocals -v --cov=sklearn --cov-report=html:coverage ->>>>>>> master -test: test-code test-doc +test: test-code test-sphinxext test-doc trailing-spaces: find sklearn -name "*.py" -exec perl -pi -e 's/[ \t]*$$//' {} \; diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index 3898232b1031d..443bfb9d812a9 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -93,7 +93,7 @@ elif [[ "$DISTRIB" == "scipy-dev-wheels" ]]; 
then fi if [[ "$COVERAGE" == "true" ]]; then - pip install pytest-cov coverage codecov + pip install coverage codecov fi if [[ "$TEST_DOCSTRINGS" == "true" ]]; then diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py index 354cf82a76791..09065eb4871a0 100644 --- a/sklearn/gaussian_process/tests/test_kernels.py +++ b/sklearn/gaussian_process/tests/test_kernels.py @@ -195,19 +195,18 @@ def check_hyperparameters_equal(kernel1, kernel2): @pytest.mark.parametrize("kernel", kernels) def test_kernel_clone(kernel): # Test that sklearn's clone works correctly on kernels. - for kernel in kernels: - kernel_cloned = clone(kernel) + kernel_cloned = clone(kernel) - # XXX: Should this be fixed? - # This differs from the sklearn's estimators equality check. - assert_equal(kernel, kernel_cloned) - assert_not_equal(id(kernel), id(kernel_cloned)) + # XXX: Should this be fixed? + # This differs from the sklearn's estimators equality check. + assert_equal(kernel, kernel_cloned) + assert_not_equal(id(kernel), id(kernel_cloned)) - # Check that all constructor parameters are equal. - assert_equal(kernel.get_params(), kernel_cloned.get_params()) + # Check that all constructor parameters are equal. + assert_equal(kernel.get_params(), kernel_cloned.get_params()) - # Check that all hyperparameters are equal. - yield check_hyperparameters_equal, kernel, kernel_cloned + # Check that all hyperparameters are equal. + check_hyperparameters_equal(kernel, kernel_cloned) def test_kernel_clone_after_set_params(): diff --git a/sklearn/mixture/tests/test_gmm.py b/sklearn/mixture/tests/test_gmm.py index f5bef0152b641..8eaf5534544c8 100644 --- a/sklearn/mixture/tests/test_gmm.py +++ b/sklearn/mixture/tests/test_gmm.py @@ -18,7 +18,7 @@ from sklearn.datasets.samples_generator import make_spd_matrix from sklearn.utils.testing import (assert_true, assert_greater, assert_raise_message, assert_warns_message, - ignore_warnings, assert_raises) + ignore_warnings) from sklearn.metrics.cluster import adjusted_rand_score from sklearn.externals.six.moves import cStringIO as StringIO diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index bbea7a1ca8b66..de0d166fb8891 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -1,4 +1,6 @@ import pickle +import itertools + import numpy as np import pytest from numpy.testing import assert_array_almost_equal @@ -43,58 +45,44 @@ def brute_force_neighbors(X, Y, k, metric, **kwargs): return dist, ind -def test_ball_tree_query(): +@pytest.mark.parametrize('metric', METRICS) +@pytest.mark.parametrize('k', (1, 3, 5)) +@pytest.mark.parametrize('dualtree', (True, False)) +@pytest.mark.parametrize('breadth_first', (True, False)) +def test_ball_tree_query(metric, k, dualtree, breadth_first): rng = check_random_state(0) X = rng.random_sample((40, DIMENSION)) Y = rng.random_sample((10, DIMENSION)) - def check_neighbors(dualtree, breadth_first, k, metric, kwargs): - bt = BallTree(X, leaf_size=1, metric=metric, **kwargs) - dist1, ind1 = bt.query(Y, k, dualtree=dualtree, - breadth_first=breadth_first) - dist2, ind2 = brute_force_neighbors(X, Y, k, metric, **kwargs) + kwargs = METRICS[metric] - # don't check indices here: if there are any duplicate distances, - # the indices may not match. Distances should not have this problem. 
- assert_array_almost_equal(dist1, dist2) + bt = BallTree(X, leaf_size=1, metric=metric, **kwargs) + dist1, ind1 = bt.query(Y, k, dualtree=dualtree, + breadth_first=breadth_first) + dist2, ind2 = brute_force_neighbors(X, Y, k, metric, **kwargs) - for (metric, kwargs) in METRICS.items(): - for k in (1, 3, 5): - for dualtree in (True, False): - for breadth_first in (True, False): - check_neighbors(dualtree, breadth_first, k, metric, kwargs) + # don't check indices here: if there are any duplicate distances, + # the indices may not match. Distances should not have this problem. + assert_array_almost_equal(dist1, dist2) -def test_ball_tree_query_boolean_metrics(): +@pytest.mark.parametrize('metric', + itertools.chain(BOOLEAN_METRICS, DISCRETE_METRICS)) +def test_ball_tree_query_metrics(metric): rng = check_random_state(0) - X = rng.random_sample((40, 10)).round(0) - Y = rng.random_sample((10, 10)).round(0) - k = 5 - - def check_neighbors(metric): - bt = BallTree(X, leaf_size=1, metric=metric) - dist1, ind1 = bt.query(Y, k) - dist2, ind2 = brute_force_neighbors(X, Y, k, metric) - assert_array_almost_equal(dist1, dist2) + if metric in BOOLEAN_METRICS: + X = rng.random_sample((40, 10)).round(0) + Y = rng.random_sample((10, 10)).round(0) + elif metric in DISCRETE_METRICS: + X = (4 * rng.random_sample((40, 10))).round(0) + Y = (4 * rng.random_sample((10, 10))).round(0) - for metric in BOOLEAN_METRICS: - check_neighbors(metric) - - -def test_ball_tree_query_discrete_metrics(): - rng = check_random_state(0) - X = (4 * rng.random_sample((40, 10))).round(0) - Y = (4 * rng.random_sample((10, 10))).round(0) k = 5 - def check_neighbors(metric): - bt = BallTree(X, leaf_size=1, metric=metric) - dist1, ind1 = bt.query(Y, k) - dist2, ind2 = brute_force_neighbors(X, Y, k, metric) - assert_array_almost_equal(dist1, dist2) - - for metric in DISCRETE_METRICS: - check_neighbors(metric) + bt = BallTree(X, leaf_size=1, metric=metric) + dist1, ind1 = bt.query(Y, k) + dist2, ind2 = brute_force_neighbors(X, Y, k, metric) + assert_array_almost_equal(dist1, dist2) def test_ball_tree_query_radius(n_samples=100, n_features=10): diff --git a/sklearn/neighbors/tests/test_kd_tree.py b/sklearn/neighbors/tests/test_kd_tree.py index deb6f47d58009..e1b7cb1965987 100644 --- a/sklearn/neighbors/tests/test_kd_tree.py +++ b/sklearn/neighbors/tests/test_kd_tree.py @@ -5,7 +5,7 @@ nodeheap_sort, DTYPE, ITYPE) from sklearn.neighbors.dist_metrics import DistanceMetric from sklearn.utils import check_random_state -from sklearn.utils.testing import assert_allclose +from sklearn.utils.testing import SkipTest, assert_allclose rng = np.random.RandomState(42) V = rng.random_sample((3, 3)) diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index 4400b50bf3121..d02c53b05d8b7 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -1,6 +1,8 @@ import numpy as np from scipy import sparse as sp +import pytest + from sklearn.svm.bounds import l1_min_c from sklearn.svm import LinearSVC from sklearn.linear_model.logistic import LogisticRegression @@ -16,25 +18,24 @@ Y2 = [2, 1, 0, 0] -def test_l1_min_c(): - losses = ['squared_hinge', 'log'] +@pytest.mark.parametrize('loss', ['squared_hinge', 'log']) +@pytest.mark.parametrize('X_label', ['sparse', 'dense']) +@pytest.mark.parametrize('Y_label', ['two-classes', 'multi-class']) +@pytest.mark.parametrize('intercept_label', ['no-intercept', 'fit-intercept']) +def test_l1_min_c(loss, X_label, Y_label, intercept_label): Xs = {'sparse': sparse_X, 
'dense': dense_X} Ys = {'two-classes': Y1, 'multi-class': Y2} intercepts = {'no-intercept': {'fit_intercept': False}, 'fit-intercept': {'fit_intercept': True, 'intercept_scaling': 10}} - for loss in losses: - for X_label, X in Xs.items(): - for Y_label, Y in Ys.items(): - for intercept_label, intercept_params in intercepts.items(): - check = lambda: check_l1_min_c(X, Y, loss, - **intercept_params) - check.description = ('Test l1_min_c loss=%r %s %s %s' % - (loss, X_label, Y_label, - intercept_label)) - check() + X = Xs[X_label] + Y = Ys[Y_label] + intercept_params = intercepts[intercept_label] + check_l1_min_c(X, Y, loss, **intercept_params) + +def test_l1_min_c_l2_loss(): # loss='l2' should raise ValueError assert_raise_message(ValueError, "loss type not in", l1_min_c, dense_X, Y1, "l2") diff --git a/sklearn/tests/test_random_projection.py b/sklearn/tests/test_random_projection.py index 08209a7046b1b..975922a341163 100644 --- a/sklearn/tests/test_random_projection.py +++ b/sklearn/tests/test_random_projection.py @@ -1,5 +1,7 @@ from __future__ import division +import functools + import numpy as np import scipy.sparse as sp import pytest @@ -123,11 +125,10 @@ def test_basic_property_of_random_matrix(random_matrix): @pytest.mark.parametrize("random_matrix", all_sparse_random_matrix) -def test_basic_property_of_random_matrix(random_matrix): +def test_basic_property_of_sparse_random_matrix(random_matrix): check_input_with_sparse_random_matrix(random_matrix) - def random_matrix_dense(n_components, n_features, random_state): - return random_matrix(n_components, n_features, random_state=random_state, density=1.0) + random_matrix_dense = functools.partial(random_matrix, density=1.0) check_zero_mean_and_unit_norm(random_matrix_dense) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 8678f72b6b43c..f57064f4d76cb 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -783,7 +783,8 @@ def test_min_weight_fraction_leaf_with_min_samples_leaf_on_dense_input(name): @pytest.mark.parametrize("name", SPARSE_TREES) def test_min_weight_fraction_leaf_with_min_samples_leaf_on_sparse_input(name): - check_min_weight_fraction_leaf_with_min_samples_leaf(name, "multilabel", True) + check_min_weight_fraction_leaf_with_min_samples_leaf( + name, "multilabel", True) def test_min_impurity_split(): @@ -1362,23 +1363,26 @@ def check_sparse_input(tree, dataset, max_depth=None): assert_array_almost_equal(s.predict_log_proba(X_sparse_test), y_log_proba) -tree_type_dataset_combination = product(SPARSE_TREES, ("clf_small", "toy", - "digits", "multilabel", - "sparse-pos", - "sparse-neg", - "sparse-mix", "zeros")) -@pytest.mark.parametrize("tree_type, dataset", tree_type_dataset_combination) + +@pytest.mark.parametrize("tree_type", SPARSE_TREES) +@pytest.mark.parametrize( + "dataset", + ("clf_small", "toy", "digits", "multilabel", + "sparse-pos", "sparse-neg", "sparse-mix", + "zeros") +) def test_sparse_input(tree_type, dataset): max_depth = 3 if dataset == "digits" else None check_sparse_input(tree_type, dataset, max_depth) -@pytest.mark.parametrize("tree_type, dataset", product(SPARSE_TREES, ["boston", "reg_small"])) -def test_sparse_input(tree_type, dataset): +@pytest.mark.parametrize("tree_type", + set(SPARSE_TREES).intersection(REG_TREES)) +@pytest.mark.parametrize("dataset", ["boston", "reg_small"]) +def test_sparse_input_reg_trees(tree_type, dataset): # Due to numerical instability of MSE and too strict test, we limit the # maximal depth - if tree_type in 
REG_TREES: - check_sparse_input(tree_type, dataset, 2) + check_sparse_input(tree_type, dataset, 2) def check_sparse_parameters(tree, dataset): @@ -1425,14 +1429,6 @@ def check_sparse_parameters(tree, dataset): assert_array_almost_equal(s.predict(X), d.predict(X)) -tree_type_dataset_combinations = product(SPARSE_TREES, ["sparse-pos", "sparse-neg", "sparse-mix", "zeros"]) - - -@pytest.mark.parametrize("tree_type, dataset", tree_type_dataset_combinations) -def test_sparse_parameters(tree_type, dataset): - check_sparse_parameters(tree_type, dataset) - - def check_sparse_criterion(tree, dataset): TreeEstimator = ALL_TREES[tree] X = DATASETS[dataset]["X"] @@ -1453,9 +1449,13 @@ def check_sparse_criterion(tree, dataset): assert_array_almost_equal(s.predict(X), d.predict(X)) -@pytest.mark.parametrize("tree_type, dataset", tree_type_dataset_combinations) -def test_sparse_criterion(tree_type, dataset): - check_sparse_criterion(tree_type, dataset) +@pytest.mark.parametrize("tree_type", SPARSE_TREES) +@pytest.mark.parametrize("dataset", + ["sparse-pos", "sparse-neg", "sparse-mix", "zeros"]) +@pytest.mark.parametrize("check", + [check_sparse_parameters, check_sparse_criterion]) +def test_sparse(tree_type, dataset, check): + check(tree_type, dataset) def check_explicit_sparse_zeros(tree, max_depth=3, @@ -1606,7 +1606,7 @@ def test_public_apply_all_trees(name): @pytest.mark.parametrize("name", SPARSE_TREES) -def test_public_apply_all_trees(name): +def test_public_apply_sparse_trees(name): check_public_apply_sparse(name) @@ -1629,16 +1629,15 @@ def test_presort_sparse(): check_presort_sparse(est, sparse_matrix(X), y) -def test_invalid_presort(): - classes = (DecisionTreeRegressor, DecisionTreeClassifier) +@pytest.mark.parametrize('cls', + (DecisionTreeRegressor, DecisionTreeClassifier)) +def test_invalid_presort(cls): allowed_presort = ('auto', True, False) invalid_presort = 'invalid' msg = ("'presort' should be in {}. 
" "Got {!r} instead.".format(allowed_presort, invalid_presort)) - for cls in classes: - est = cls(presort=invalid_presort) - assert_raise_message(ValueError, msg, - est.fit, X, y) + est = cls(presort=invalid_presort) + assert_raise_message(ValueError, msg, est.fit, X, y) def test_decision_path_hardcoded(): diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index c0789e62568ad..98a5e3e98b00d 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -52,10 +52,6 @@ import pytest -def raises(exception): - return pytest.mark.xfail(raises=exception) - - additional_names_in_all = [] try: from nose.tools import raises as _nose_raises @@ -105,13 +101,7 @@ def raises(exception): assert_not_equal = _dummy.assertNotEqual assert_true = _dummy.assertTrue assert_false = _dummy.assertFalse - - -def assert_raises(exception, fct, *args, **kwargs): - with pytest.raises(exception): - return fct(*args, **kwargs) - - +assert_raises = _dummy.assertRaises SkipTest = unittest.case.SkipTest assert_dict_equal = _dummy.assertDictEqual assert_in = _dummy.assertIn @@ -701,16 +691,10 @@ def run_test(*args, **kwargs): return run_test -def skip_if_32bit(func): - """Test decorator that skips tests on 32bit platforms.""" - @wraps(func) - def run_test(*args, **kwargs): - bits = 8 * struct.calcsize("P") - if bits == 32: - raise SkipTest('Test skipped on 32bit platforms.') - else: - return func(*args, **kwargs) - return run_test +skip_if_32bit = pytest.mark.skipif(8 * struct.calcsize("P") == 32, + reason='skipped on 32bit platforms') +skip_travis = pytest.mark.skipif(os.environ.get('TRAVIS') == 'true', + reason='skip on travis') def if_safe_multiprocessing_with_blas(func): @@ -752,15 +736,9 @@ def clean_warning_registry(): getattr(mod, reg).clear() -# def check_skip_network(): - # if int(os.environ.get('SKLEARN_SKIP_NETWORK_TESTS', 0)): - # raise SkipTest("Text tutorial requires large dataset download") - - -# def check_skip_travis(): - # """Skip test if being run on Travis.""" - # if os.environ.get('TRAVIS') == "true": - # raise SkipTest("This test needs to be skipped on Travis") +def check_skip_network(): + if int(os.environ.get('SKLEARN_SKIP_NETWORK_TESTS', 0)): + raise SkipTest("Text tutorial requires large dataset download") def _delete_folder(folder_path, warn=False): @@ -792,13 +770,6 @@ def __exit__(self, exc_type, exc_val, exc_tb): _delete_folder(self.temp_folder) -with_network = pytest.mark.skipif( - int(os.environ.get('SKLEARN_SKIP_NETWORK_TESTS', 0)), - reason="skip_network tests") -with_travis = pytest.mark.skipif(os.environ.get('TRAVIS') == 'true', - reason='skip on travis') - - def create_memmap_backed_data(data, mmap_mode='r', return_folder=False): temp_folder = tempfile.mkdtemp(prefix='sklearn_testing_') atexit.register(functools.partial(_delete_folder, temp_folder, warn=True)) From 85fe86a46d7dabb4e5b09d152fff90b9d0344417 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 4 May 2018 18:40:44 +0300 Subject: [PATCH 05/25] More pytest parametrizations --- sklearn/datasets/tests/test_base.py | 28 +++----- sklearn/datasets/tests/test_mldata.py | 29 +++----- .../datasets/tests/test_svmlight_format.py | 72 +++++++++---------- .../decomposition/tests/test_online_lda.py | 23 +++--- sklearn/decomposition/tests/test_pca.py | 10 +-- sklearn/ensemble/tests/test_forest.py | 58 ++++++++------- sklearn/utils/tests/test_extmath.py | 62 ++++++++-------- 7 files changed, 136 insertions(+), 146 deletions(-) diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py 
index 35d2ac4925588..a58e515a8f760 100644 --- a/sklearn/datasets/tests/test_base.py +++ b/sklearn/datasets/tests/test_base.py @@ -33,31 +33,19 @@ from sklearn.utils.testing import assert_raises -@pytest.fixture(scope="module") -def data_home(): - try: - path = tempfile.mkdtemp(prefix="scikit_learn_data_home_test_") - yield path - finally: - _remove_dir(path) - - -@pytest.fixture(scope="module") -def load_files_root(): - try: - path = tempfile.mkdtemp(prefix="scikit_learn_load_files_test_") - yield path - finally: - _remove_dir(path) - - def _remove_dir(path): if os.path.isdir(path): shutil.rmtree(path) -def teardown_module(): - """Test fixture (clean up) run once after all tests of this module""" +@pytest.fixture(scope="module") +def data_home(tmpdir_factory): + return tmpdir_factory.mktemp("scikit_learn_data_home_test") + + +@pytest.fixture(scope="module") +def load_files_root(tmpdir_factory): + return tmpdir_factory.mktemp("scikit_learn_load_files_test") @pytest.fixture diff --git a/sklearn/datasets/tests/test_mldata.py b/sklearn/datasets/tests/test_mldata.py index 4c720b96da0b2..2863290216522 100644 --- a/sklearn/datasets/tests/test_mldata.py +++ b/sklearn/datasets/tests/test_mldata.py @@ -1,8 +1,6 @@ """Test functionality of mldata fetching utilities.""" import os -import shutil -import tempfile import scipy as sp from sklearn import datasets @@ -18,19 +16,10 @@ import pytest -tmpdir = None - - @pytest.fixture -def tmpdata(): - try: - global tmpdir - tmpdir = tempfile.mkdtemp() - os.makedirs(os.path.join(tmpdir, 'mldata')) - yield tmpdir - finally: - if tmpdir is not None: - shutil.rmtree(tmpdir) +def tmpdata(tmpdir): + os.makedirs(tmpdir.join('mldata')) + yield tmpdir def test_mldata_filename(): @@ -74,7 +63,7 @@ def test_fetch_one_column(tmpdata): x = sp.arange(6).reshape(2, 3) datasets.mldata.urlopen = mock_mldata_urlopen({dataname: {'x': x}}) - dset = fetch_mldata(dataname, data_home=tmpdir) + dset = fetch_mldata(dataname, data_home=tmpdata) for n in ["COL_NAMES", "DESCR", "data"]: assert_in(n, dset) assert_not_in("target", dset) @@ -83,7 +72,7 @@ def test_fetch_one_column(tmpdata): assert_array_equal(dset.data, x) # transposing the data array - dset = fetch_mldata(dataname, transpose_data=False, data_home=tmpdir) + dset = fetch_mldata(dataname, transpose_data=False, data_home=tmpdata) assert_equal(dset.data.shape, (3, 2)) finally: datasets.mldata.urlopen = _urlopen_ref @@ -110,7 +99,7 @@ def test_fetch_multiple_column(tmpdata): ), }) - dset = fetch_mldata(dataname, data_home=tmpdir) + dset = fetch_mldata(dataname, data_home=tmpdata) for n in ["COL_NAMES", "DESCR", "target", "data", "z"]: assert_in(n, dset) assert_not_in("x", dset) @@ -126,7 +115,7 @@ def test_fetch_multiple_column(tmpdata): dataname: ({'y': y, 'x': x, 'z': z}, ['y', 'x', 'z']), }) - dset = fetch_mldata(dataname, data_home=tmpdir) + dset = fetch_mldata(dataname, data_home=tmpdata) for n in ["COL_NAMES", "DESCR", "target", "data", "z"]: assert_in(n, dset) assert_not_in("x", dset) @@ -144,7 +133,7 @@ def test_fetch_multiple_column(tmpdata): }) dset = fetch_mldata(dataname, target_name=2, data_name=0, - data_home=tmpdir) + data_home=tmpdata) for n in ["COL_NAMES", "DESCR", "target", "data", "x"]: assert_in(n, dset) assert_not_in("y", dset) @@ -155,7 +144,7 @@ def test_fetch_multiple_column(tmpdata): # by name dset = fetch_mldata(dataname, target_name='y', data_name='z', - data_home=tmpdir) + data_home=tmpdata) for n in ["COL_NAMES", "DESCR", "target", "data", "x"]: assert_in(n, dset) assert_not_in("y", dset) diff 
--git a/sklearn/datasets/tests/test_svmlight_format.py b/sklearn/datasets/tests/test_svmlight_format.py index 35808fc5b3c89..3eab1d7c37eba 100644 --- a/sklearn/datasets/tests/test_svmlight_format.py +++ b/sklearn/datasets/tests/test_svmlight_format.py @@ -8,6 +8,8 @@ import shutil from tempfile import NamedTemporaryFile +import pytest + from sklearn.externals.six import b from sklearn.utils.testing import assert_equal @@ -414,46 +416,42 @@ def test_load_zeros(): assert_array_almost_equal(X.toarray(), true_X.toarray()) -def test_load_with_offsets(): - def check_load_with_offsets(sparsity, n_samples, n_features): - rng = np.random.RandomState(0) - X = rng.uniform(low=0.0, high=1.0, size=(n_samples, n_features)) - if sparsity: - X[X < sparsity] = 0.0 - X = sp.csr_matrix(X) - y = rng.randint(low=0, high=2, size=n_samples) +@pytest.mark.parametrize('sparsity', [0, 0.1, .5, 0.99, 1]) +@pytest.mark.parametrize('n_samples', [13, 101]) +@pytest.mark.parametrize('n_features', [2, 7, 41]) +def test_load_with_offsets(sparsity, n_samples, n_features): + rng = np.random.RandomState(0) + X = rng.uniform(low=0.0, high=1.0, size=(n_samples, n_features)) + if sparsity: + X[X < sparsity] = 0.0 + X = sp.csr_matrix(X) + y = rng.randint(low=0, high=2, size=n_samples) - f = BytesIO() - dump_svmlight_file(X, y, f) - f.seek(0) + f = BytesIO() + dump_svmlight_file(X, y, f) + f.seek(0) - size = len(f.getvalue()) - - # put some marks that are likely to happen anywhere in a row - mark_0 = 0 - mark_1 = size // 3 - length_0 = mark_1 - mark_0 - mark_2 = 4 * size // 5 - length_1 = mark_2 - mark_1 - - # load the original sparse matrix into 3 independent CSR matrices - X_0, y_0 = load_svmlight_file(f, n_features=n_features, - offset=mark_0, length=length_0) - X_1, y_1 = load_svmlight_file(f, n_features=n_features, - offset=mark_1, length=length_1) - X_2, y_2 = load_svmlight_file(f, n_features=n_features, - offset=mark_2) - - y_concat = np.concatenate([y_0, y_1, y_2]) - X_concat = sp.vstack([X_0, X_1, X_2]) - assert_array_almost_equal(y, y_concat) - assert_array_almost_equal(X.toarray(), X_concat.toarray()) + size = len(f.getvalue()) - # Generate a uniformly random sparse matrix - for sparsity in [0, 0.1, .5, 0.99, 1]: - for n_samples in [13, 101]: - for n_features in [2, 7, 41]: - yield check_load_with_offsets, sparsity, n_samples, n_features + # put some marks that are likely to happen anywhere in a row + mark_0 = 0 + mark_1 = size // 3 + length_0 = mark_1 - mark_0 + mark_2 = 4 * size // 5 + length_1 = mark_2 - mark_1 + + # load the original sparse matrix into 3 independent CSR matrices + X_0, y_0 = load_svmlight_file(f, n_features=n_features, + offset=mark_0, length=length_0) + X_1, y_1 = load_svmlight_file(f, n_features=n_features, + offset=mark_1, length=length_1) + X_2, y_2 = load_svmlight_file(f, n_features=n_features, + offset=mark_2) + + y_concat = np.concatenate([y_0, y_1, y_2]) + X_concat = sp.vstack([X_0, X_1, X_2]) + assert_array_almost_equal(y, y_concat) + assert_array_almost_equal(X.toarray(), X_concat.toarray()) def test_load_offset_exhaustive_splits(): diff --git a/sklearn/decomposition/tests/test_online_lda.py b/sklearn/decomposition/tests/test_online_lda.py index b69788ed566f1..2dd7766ed19aa 100644 --- a/sklearn/decomposition/tests/test_online_lda.py +++ b/sklearn/decomposition/tests/test_online_lda.py @@ -5,6 +5,8 @@ from scipy.sparse import csr_matrix from scipy.special import psi +import pytest + from sklearn.decomposition import LatentDirichletAllocation from sklearn.decomposition._online_lda import 
(_dirichlet_expectation_1d, _dirichlet_expectation_2d) @@ -402,16 +404,17 @@ def check_verbosity(verbose, evaluate_every, expected_lines, assert_equal(expected_perplexities, n_perplexity) -def test_verbosity(): - for verbose, evaluate_every, expected_lines, expected_perplexities in [ - (False, 1, 0, 0), - (False, 0, 0, 0), - (True, 0, 3, 0), - (True, 1, 3, 3), - (True, 2, 3, 1), - ]: - yield (check_verbosity, verbose, evaluate_every, expected_lines, - expected_perplexities) +@pytest.mark.parametrize( + 'verbose,evaluate_every,expected_lines,expected_perplexities', + [(False, 1, 0, 0), + (False, 0, 0, 0), + (True, 0, 3, 0), + (True, 1, 3, 3), + (True, 2, 3, 1)]) +def test_verbosity(verbose, evaluate_every, expected_lines, + expected_perplexities): + check_verbosity(verbose, evaluate_every, expected_lines, + expected_perplexities) def test_lda_n_topics_deprecation(): diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 4efb0254fa55f..c2fe0d1fb7da1 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -2,6 +2,8 @@ import scipy as sp from itertools import product +import pytest + from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_true @@ -721,10 +723,10 @@ def test_pca_bad_solver(): assert_raises(ValueError, pca.fit, X) -def test_pca_dtype_preservation(): - for svd_solver in solver_list: - yield check_pca_float_dtype_preservation, svd_solver - yield check_pca_int_dtype_upcast_to_double, svd_solver +@pytest.mark.parametrize('svd_solver', solver_list) +def test_pca_dtype_preservation(svd_solver): + check_pca_float_dtype_preservation(svd_solver) + check_pca_int_dtype_upcast_to_double(svd_solver) def check_pca_float_dtype_preservation(svd_solver): diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 420d10bc10bee..3b39298a392b5 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -10,6 +10,7 @@ import pickle from collections import defaultdict +import itertools from itertools import combinations from itertools import product @@ -18,6 +19,8 @@ from scipy.sparse import csc_matrix from scipy.sparse import coo_matrix +import pytest + from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_array_equal @@ -114,9 +117,9 @@ def check_classification_toy(name): assert_equal(leaf_indices.shape, (len(X), clf.n_estimators)) -def test_classification_toy(): - for name in FOREST_CLASSIFIERS: - yield check_classification_toy, name +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +def test_classification_toy(name): + check_classification_toy(name) def check_iris_criterion(name, criterion): @@ -138,9 +141,10 @@ def check_iris_criterion(name, criterion): % (criterion, score)) -def test_iris(): - for name, criterion in product(FOREST_CLASSIFIERS, ("gini", "entropy")): - yield check_iris_criterion, name, criterion +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +@pytest.mark.parametrize('criterion', ("gini", "entropy")) +def test_iris(name, criterion): + check_iris_criterion(name, criterion) def check_boston_criterion(name, criterion): @@ -162,9 +166,10 @@ def check_boston_criterion(name, criterion): "and score = %f" % (criterion, score)) -def test_boston(): - for name, criterion in product(FOREST_REGRESSORS, ("mse", "mae", 
"friedman_mse")): - yield check_boston_criterion, name, criterion +@pytest.mark.parametrize('name', FOREST_REGRESSORS) +@pytest.mark.parametrize('criterion', ("mse", "mae", "friedman_mse")) +def test_boston(name, criterion): + check_boston_criterion(name, criterion) def check_regressor_attributes(name): @@ -178,9 +183,9 @@ def check_regressor_attributes(name): assert_false(hasattr(r, "n_classes_")) -def test_regressor_attributes(): - for name in FOREST_REGRESSORS: - yield check_regressor_attributes, name +@pytest.mark.parametrize('name', FOREST_REGRESSORS) +def test_regressor_attributes(name): + check_regressor_attributes(name) def check_probability(name): @@ -196,9 +201,9 @@ def check_probability(name): np.exp(clf.predict_log_proba(iris.data))) -def test_probability(): - for name in FOREST_CLASSIFIERS: - yield check_probability, name +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +def test_probability(name): + check_probability(name) def check_importances(name, criterion, dtype, tolerance): @@ -241,17 +246,18 @@ def check_importances(name, criterion, dtype, tolerance): assert_less(np.abs(importances - importances_bis).mean(), tolerance) -def test_importances(): - for dtype in (np.float64, np.float32): - tolerance = 0.01 - for name, criterion in product(FOREST_CLASSIFIERS, - ["gini", "entropy"]): - yield check_importances, name, criterion, dtype, tolerance - - for name, criterion in product(FOREST_REGRESSORS, - ["mse", "friedman_mse", "mae"]): - tolerance = 0.05 if criterion == "mae" else 0.01 - yield check_importances, name, criterion, dtype, tolerance +@pytest.mark.parametrize('dtype', (np.float64, np.float32)) +@pytest.mark.parametrize( + 'name, criterion', + itertools.chain(product(FOREST_CLASSIFIERS, + ["gini", "entropy"]), + product(FOREST_REGRESSORS, + ["mse", "friedman_mse", "mae"]))) +def test_importances(dtype, name, criterion): + tolerance = 0.01 + if name in FOREST_REGRESSORS and criterion == "mae": + tolerance = 0.05 + check_importances(name, criterion, dtype, tolerance) def test_importances_asymptotic(): diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index f53b814c70084..d89e2a1aa1223 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -9,6 +9,8 @@ from scipy import linalg from scipy import stats +import pytest + from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_array_equal @@ -170,9 +172,10 @@ def check_randomized_svd_low_rank(dtype): assert_almost_equal(s[:rank], sa[:rank], decimal=decimal) -def test_randomized_svd_low_rank_all_dtypes(): - for dtype in (np.int32, np.int64, np.float32, np.float64): - yield check_randomized_svd_low_rank, dtype +@pytest.mark.parametrize('dtype', + (np.int32, np.int64, np.float32, np.float64)) +def test_randomized_svd_low_rank_all_dtypes(dtype): + check_randomized_svd_low_rank(dtype) @ignore_warnings # extmath.norm is deprecated to be removed in 0.21 @@ -191,34 +194,35 @@ def test_norm_squared_norm(): squared_norm, X.astype(int)) -def test_row_norms(): +@pytest.mark.parametrize('dtype', + (np.float32, np.float64)) +def test_row_norms(dtype): X = np.random.RandomState(42).randn(100, 100) - for dtype in (np.float32, np.float64): - if dtype is np.float32: - precision = 4 - else: - precision = 5 - - X = X.astype(dtype) - sq_norm = (X ** 2).sum(axis=1) - - assert_array_almost_equal(sq_norm, row_norms(X, squared=True), + if dtype is np.float32: + precision = 4 + else: + 
precision = 5 + + X = X.astype(dtype) + sq_norm = (X ** 2).sum(axis=1) + + assert_array_almost_equal(sq_norm, row_norms(X, squared=True), + precision) + assert_array_almost_equal(np.sqrt(sq_norm), row_norms(X), precision) + + for csr_index_dtype in [np.int32, np.int64]: + Xcsr = sparse.csr_matrix(X, dtype=dtype) + # csr_matrix will use int32 indices by default, + # up-casting those to int64 when necessary + if csr_index_dtype is np.int64: + Xcsr.indptr = Xcsr.indptr.astype(csr_index_dtype) + Xcsr.indices = Xcsr.indices.astype(csr_index_dtype) + assert Xcsr.indices.dtype == csr_index_dtype + assert Xcsr.indptr.dtype == csr_index_dtype + assert_array_almost_equal(sq_norm, row_norms(Xcsr, squared=True), + precision) + assert_array_almost_equal(np.sqrt(sq_norm), row_norms(Xcsr), precision) - assert_array_almost_equal(np.sqrt(sq_norm), row_norms(X), precision) - - for csr_index_dtype in [np.int32, np.int64]: - Xcsr = sparse.csr_matrix(X, dtype=dtype) - # csr_matrix will use int32 indices by default, - # up-casting those to int64 when necessary - if csr_index_dtype is np.int64: - Xcsr.indptr = Xcsr.indptr.astype(csr_index_dtype) - Xcsr.indices = Xcsr.indices.astype(csr_index_dtype) - assert Xcsr.indices.dtype == csr_index_dtype - assert Xcsr.indptr.dtype == csr_index_dtype - assert_array_almost_equal(sq_norm, row_norms(Xcsr, squared=True), - precision) - assert_array_almost_equal(np.sqrt(sq_norm), row_norms(Xcsr), - precision) def test_randomized_svd_low_rank_with_noise(): From b51cc6a98f42a1647ff5b57d047ce56774cc1aad Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 5 May 2018 10:07:05 +0300 Subject: [PATCH 06/25] Parametrize test_forest.py --- sklearn/ensemble/tests/test_forest.py | 192 +++++++++++++------------- 1 file changed, 97 insertions(+), 95 deletions(-) diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 3b39298a392b5..efcf1ffc5f04c 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -97,6 +97,10 @@ FOREST_ESTIMATORS.update(FOREST_REGRESSORS) FOREST_ESTIMATORS.update(FOREST_TRANSFORMERS) +FOREST_CLASSIFIERS_REGRESSORS = dict() +FOREST_CLASSIFIERS_REGRESSORS.update(FOREST_CLASSIFIERS) +FOREST_CLASSIFIERS_REGRESSORS.update(FOREST_REGRESSORS) + def check_classification_toy(name): """Check classification on a toy dataset.""" @@ -358,9 +362,9 @@ def check_unfitted_feature_importances(name): "feature_importances_") -def test_unfitted_feature_importances(): - for name in FOREST_ESTIMATORS: - yield check_unfitted_feature_importances, name +@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +def test_unfitted_feature_importances(name): + check_unfitted_feature_importances(name) def check_oob_score(name, X, y, n_estimators=20): @@ -387,21 +391,23 @@ def check_oob_score(name, X, y, n_estimators=20): assert_warns(UserWarning, est.fit, X, y) -def test_oob_score(): - for name in FOREST_CLASSIFIERS: - yield check_oob_score, name, iris.data, iris.target +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +def test_oob_score_classifiers(name): + check_oob_score(name, iris.data, iris.target) + + # csc matrix + check_oob_score(name, csc_matrix(iris.data), iris.target) - # csc matrix - yield check_oob_score, name, csc_matrix(iris.data), iris.target + # non-contiguous targets in classification + check_oob_score(name, iris.data, iris.target * 2 + 1) - # non-contiguous targets in classification - yield check_oob_score, name, iris.data, iris.target * 2 + 1 - for name in FOREST_REGRESSORS: - yield 
check_oob_score, name, boston.data, boston.target, 50 +@pytest.mark.parametrize('name', FOREST_REGRESSORS) +def test_oob_score_regressors(name): + check_oob_score(name, boston.data, boston.target, 50) - # csc matrix - yield check_oob_score, name, csc_matrix(boston.data), boston.target, 50 + # csc matrix + check_oob_score(name, csc_matrix(boston.data), boston.target, 50) def check_oob_score_raise_error(name): @@ -427,9 +433,9 @@ def check_oob_score_raise_error(name): bootstrap=False).fit, X, y) -def test_oob_score_raise_error(): - for name in FOREST_ESTIMATORS: - yield check_oob_score_raise_error, name +@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +def test_oob_score_raise_error(name): + check_oob_score_raise_error(name) def check_gridsearch(name): @@ -438,10 +444,10 @@ def check_gridsearch(name): clf.fit(iris.data, iris.target) -def test_gridsearch(): +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +def test_gridsearch(name): # Check that base trees can be grid-searched. - for name in FOREST_CLASSIFIERS: - yield check_gridsearch, name + check_gridsearch(name) def check_parallel(name, X, y): @@ -459,12 +465,14 @@ def check_parallel(name, X, y): assert_array_almost_equal(y1, y2, 3) -def test_parallel(): - for name in FOREST_CLASSIFIERS: - yield check_parallel, name, iris.data, iris.target +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS) +def test_parallel(name): + if name in FOREST_CLASSIFIERS: + ds = iris + elif name in FOREST_REGRESSORS: + ds = boston - for name in FOREST_REGRESSORS: - yield check_parallel, name, boston.data, boston.target + check_parallel(name, ds.data, ds.target) def check_pickle(name, X, y): @@ -482,12 +490,14 @@ def check_pickle(name, X, y): assert_equal(score, score2) -def test_pickle(): - for name in FOREST_CLASSIFIERS: - yield check_pickle, name, iris.data[::2], iris.target[::2] +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS) +def test_pickle(name): + if name in FOREST_CLASSIFIERS: + ds = iris + elif name in FOREST_REGRESSORS: + ds = boston - for name in FOREST_REGRESSORS: - yield check_pickle, name, boston.data[::2], boston.target[::2] + check_pickle(name, ds.data[::2], ds.target[::2]) def check_multioutput(name): @@ -517,12 +527,9 @@ def check_multioutput(name): assert_equal(log_proba[1].shape, (4, 4)) -def test_multioutput(): - for name in FOREST_CLASSIFIERS: - yield check_multioutput, name - - for name in FOREST_REGRESSORS: - yield check_multioutput, name +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS) +def test_multioutput(name): + check_multioutput(name) def check_classes_shape(name): @@ -543,9 +550,9 @@ def check_classes_shape(name): assert_array_equal(clf.classes_, [[-1, 1], [-2, 2]]) -def test_classes_shape(): - for name in FOREST_CLASSIFIERS: - yield check_classes_shape, name +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +def test_classes_shape(name): + check_classes_shape(name) def test_random_trees_dense_type(): @@ -698,9 +705,9 @@ def check_max_leaf_nodes_max_depth(name): assert_equal(est.estimators_[0].tree_.max_depth, 1) -def test_max_leaf_nodes_max_depth(): - for name in FOREST_ESTIMATORS: - yield check_max_leaf_nodes_max_depth, name +@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +def test_max_leaf_nodes_max_depth(name): + check_max_leaf_nodes_max_depth(name) def check_min_samples_split(name): @@ -732,9 +739,9 @@ def check_min_samples_split(name): "Failed with {0}".format(name)) -def test_min_samples_split(): - for name in FOREST_ESTIMATORS: - yield check_min_samples_split, 
name +@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +def test_min_samples_split(name): + check_min_samples_split(name) def check_min_samples_leaf(name): @@ -769,9 +776,9 @@ def check_min_samples_leaf(name): "Failed with {0}".format(name)) -def test_min_samples_leaf(): - for name in FOREST_ESTIMATORS: - yield check_min_samples_leaf, name +@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +def test_min_samples_leaf(name): + check_min_samples_leaf(name) def check_min_weight_fraction_leaf(name): @@ -805,9 +812,9 @@ def check_min_weight_fraction_leaf(name): name, est.min_weight_fraction_leaf)) -def test_min_weight_fraction_leaf(): - for name in FOREST_ESTIMATORS: - yield check_min_weight_fraction_leaf, name +@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +def test_min_weight_fraction_leaf(name): + check_min_weight_fraction_leaf(name) def check_sparse_input(name, X, X_sparse, y): @@ -836,13 +843,14 @@ def check_sparse_input(name, X, X_sparse, y): dense.fit_transform(X).toarray()) -def test_sparse_input(): +@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +@pytest.mark.parametrize('sparse_matrix', + (csr_matrix, csc_matrix, coo_matrix)) +def test_sparse_input(name, sparse_matrix): X, y = datasets.make_multilabel_classification(random_state=0, n_samples=50) - for name, sparse_matrix in product(FOREST_ESTIMATORS, - (csr_matrix, csc_matrix, coo_matrix)): - yield check_sparse_input, name, X, sparse_matrix(X), y + check_sparse_input(name, X, sparse_matrix(X), y) def check_memory_layout(name, dtype): @@ -892,12 +900,10 @@ def check_memory_layout(name, dtype): assert_array_almost_equal(est.fit(X, y).predict(X), y) -def test_memory_layout(): - for name, dtype in product(FOREST_CLASSIFIERS, [np.float64, np.float32]): - yield check_memory_layout, name, dtype - - for name, dtype in product(FOREST_REGRESSORS, [np.float64, np.float32]): - yield check_memory_layout, name, dtype +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS) +@pytest.mark.parametrize('dtype', (np.float64, np.float32)) +def test_memory_layout(name, dtype): + check_memory_layout(name, dtype) @ignore_warnings @@ -913,14 +919,14 @@ def check_1d_input(name, X, X_2d, y): assert_raises(ValueError, est.predict, X) +@pytest.mark.parametrize('name', FOREST_ESTIMATORS) @ignore_warnings -def test_1d_input(): +def test_1d_input(name): X = iris.data[:, 0] X_2d = iris.data[:, 0].reshape((-1, 1)) y = iris.target - for name in FOREST_ESTIMATORS: - yield check_1d_input, name, X, X_2d, y + check_1d_input(name, X, X_2d, y) def check_class_weights(name): @@ -972,9 +978,9 @@ def check_class_weights(name): clf.fit(iris.data, iris.target, sample_weight=sample_weight) -def test_class_weights(): - for name in FOREST_CLASSIFIERS: - yield check_class_weights, name +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +def test_class_weights(name): + check_class_weights(name) def check_class_weight_balanced_and_bootstrap_multi_output(name): @@ -991,9 +997,9 @@ def check_class_weight_balanced_and_bootstrap_multi_output(name): clf.fit(X, _y) -def test_class_weight_balanced_and_bootstrap_multi_output(): - for name in FOREST_CLASSIFIERS: - yield check_class_weight_balanced_and_bootstrap_multi_output, name +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +def test_class_weight_balanced_and_bootstrap_multi_output(name): + check_class_weight_balanced_and_bootstrap_multi_output(name) def check_class_weight_errors(name): @@ -1021,9 +1027,9 @@ def check_class_weight_errors(name): assert_raises(ValueError, clf.fit, X, _y) -def 
test_class_weight_errors(): - for name in FOREST_CLASSIFIERS: - yield check_class_weight_errors, name +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +def test_class_weight_errors(name): + check_class_weight_errors(name) def check_warm_start(name, random_state=42): @@ -1053,9 +1059,9 @@ def check_warm_start(name, random_state=42): err_msg="Failed with {0}".format(name)) -def test_warm_start(): - for name in FOREST_ESTIMATORS: - yield check_warm_start, name +@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +def test_warm_start(name): + check_warm_start(name) def check_warm_start_clear(name): @@ -1075,9 +1081,9 @@ def check_warm_start_clear(name): assert_array_almost_equal(clf_2.apply(X), clf.apply(X)) -def test_warm_start_clear(): - for name in FOREST_ESTIMATORS: - yield check_warm_start_clear, name +@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +def test_warm_start_clear(name): + check_warm_start_clear(name) def check_warm_start_smaller_n_estimators(name): @@ -1090,9 +1096,9 @@ def check_warm_start_smaller_n_estimators(name): assert_raises(ValueError, clf.fit, X, y) -def test_warm_start_smaller_n_estimators(): - for name in FOREST_ESTIMATORS: - yield check_warm_start_smaller_n_estimators, name +@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +def test_warm_start_smaller_n_estimators(name): + check_warm_start_smaller_n_estimators(name) def check_warm_start_equal_n_estimators(name): @@ -1116,9 +1122,9 @@ def check_warm_start_equal_n_estimators(name): assert_array_equal(clf.apply(X), clf_2.apply(X)) -def test_warm_start_equal_n_estimators(): - for name in FOREST_ESTIMATORS: - yield check_warm_start_equal_n_estimators, name +@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +def test_warm_start_equal_n_estimators(name): + check_warm_start_equal_n_estimators(name) def check_warm_start_oob(name): @@ -1153,11 +1159,9 @@ def check_warm_start_oob(name): assert_equal(clf.oob_score_, clf_3.oob_score_) -def test_warm_start_oob(): - for name in FOREST_CLASSIFIERS: - yield check_warm_start_oob, name - for name in FOREST_REGRESSORS: - yield check_warm_start_oob, name +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS) +def test_warm_start_oob(name): + check_warm_start_oob(name) def test_dtype_convert(n_classes=15): @@ -1193,11 +1197,9 @@ def check_decision_path(name): assert_array_almost_equal(leave_indicator, np.ones(shape=n_samples)) -def test_decision_path(): - for name in FOREST_CLASSIFIERS: - yield check_decision_path, name - for name in FOREST_REGRESSORS: - yield check_decision_path, name +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS) +def test_decision_path(name): + check_decision_path(name) def test_min_impurity_split(): From e720097c9846e3fd9ecaf095e9b9e9a402312517 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 5 May 2018 10:14:43 +0300 Subject: [PATCH 07/25] Migrate test_gradient_boosting.py --- .../ensemble/tests/test_gradient_boosting.py | 54 ++++++++++--------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index 6711007e2701c..b04b857870195 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -10,6 +10,8 @@ from scipy.sparse import csc_matrix from scipy.sparse import coo_matrix +import pytest + from sklearn import datasets from sklearn.base import clone from sklearn.datasets import make_classification @@ -75,10 +77,10 @@ def 
check_classification_toy(presort, loss): assert_equal(leaves.shape, (6, 10, 1)) -def test_classification_toy(): - for presort, loss in product(('auto', True, False), - ('deviance', 'exponential')): - yield check_classification_toy, presort, loss +@pytest.mark.parametrize('presort', ('auto', True, False)) +@pytest.mark.parametrize('loss', ('deviance', 'exponential')) +def test_classification_toy(presort, loss): + check_classification_toy(presort, loss) def test_classifier_parameter_checks(): @@ -213,9 +215,10 @@ def check_classification_synthetic(presort, loss): assert_less(error_rate, 0.08) -def test_classification_synthetic(): - for presort, loss in product(('auto', True, False), ('deviance', 'exponential')): - yield check_classification_synthetic, presort, loss +@pytest.mark.parametrize('presort', ('auto', True, False)) +@pytest.mark.parametrize('loss', ('deviance', 'exponential')) +def test_classification_synthetic(presort, loss): + check_classification_synthetic(presort, loss) def check_boston(presort, loss, subsample): @@ -248,11 +251,11 @@ def check_boston(presort, loss, subsample): last_y_pred = y_pred -def test_boston(): - for presort, loss, subsample in product(('auto', True, False), - ('ls', 'lad', 'huber'), - (1.0, 0.5)): - yield check_boston, presort, loss, subsample +@pytest.mark.parametrize('presort', ('auto', True, False)) +@pytest.mark.parametrize('loss', ('ls', 'lad', 'huber')) +@pytest.mark.parametrize('subsample', (1.0, 0.5)) +def test_boston(presort, loss, subsample): + check_boston(presort, loss, subsample) def check_iris(presort, subsample, sample_weight): @@ -270,12 +273,13 @@ def check_iris(presort, subsample, sample_weight): assert_equal(leaves.shape, (150, 100, 3)) -def test_iris(): - ones = np.ones(len(iris.target)) - for presort, subsample, sample_weight in product(('auto', True, False), - (1.0, 0.5), - (None, ones)): - yield check_iris, presort, subsample, sample_weight +@pytest.mark.parametrize('presort', ('auto', True, False)) +@pytest.mark.parametrize('subsample', (1.0, 0.5)) +@pytest.mark.parametrize('sample_weight', (None, 1)) +def test_iris(presort, subsample, sample_weight): + if sample_weight == 1: + sample_weight = np.ones(len(iris.target)) + check_iris(presort, subsample, sample_weight) def test_regression_synthetic(): @@ -1211,18 +1215,20 @@ def check_sparse_input(EstimatorClass, X, X_sparse, y): @skip_if_32bit -def test_sparse_input(): - ests = (GradientBoostingClassifier, GradientBoostingRegressor) - sparse_matrices = (csr_matrix, csc_matrix, coo_matrix) - +@pytest.mark.parametrize( + 'EstimatorClass', + (GradientBoostingClassifier, GradientBoostingRegressor)) +@pytest.mark.parametrize( + 'sparse_matrix', + (csr_matrix, csc_matrix, coo_matrix)) +def test_sparse_input(EstimatorClass, sparse_matrix): y, X = datasets.make_multilabel_classification(random_state=0, n_samples=50, n_features=1, n_classes=20) y = y[:, 0] - for EstimatorClass, sparse_matrix in product(ests, sparse_matrices): - yield check_sparse_input, EstimatorClass, X, sparse_matrix(X), y + check_sparse_input(EstimatorClass, X, sparse_matrix(X), y) def test_gradient_boosting_early_stopping(): From f7d092a0e9bbccd6e9692ac6535be4ace6a0f03c Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 5 May 2018 11:03:49 +0300 Subject: [PATCH 08/25] More parametrization --- .../gaussian_process/tests/test_kernels.py | 43 ++-- sklearn/linear_model/tests/test_ridge.py | 87 ++++--- sklearn/manifold/tests/test_t_sne.py | 8 +- sklearn/metrics/cluster/tests/test_common.py | 9 +- 
sklearn/metrics/tests/test_pairwise.py | 36 +-- sklearn/metrics/tests/test_ranking.py | 34 ++- sklearn/metrics/tests/test_score_objects.py | 8 +- sklearn/mixture/tests/test_gmm.py | 7 +- .../model_selection/tests/test_validation.py | 4 +- sklearn/neighbors/tests/test_dist_metrics.py | 214 +++++++++--------- sklearn/neighbors/tests/test_neighbors.py | 88 ++++--- sklearn/neighbors/tests/test_quad_tree.py | 62 +++-- 12 files changed, 308 insertions(+), 292 deletions(-) diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py index 09065eb4871a0..3ede9032fd6fc 100644 --- a/sklearn/gaussian_process/tests/test_kernels.py +++ b/sklearn/gaussian_process/tests/test_kernels.py @@ -209,7 +209,8 @@ def test_kernel_clone(kernel): check_hyperparameters_equal(kernel, kernel_cloned) -def test_kernel_clone_after_set_params(): +@pytest.mark.parametrize('kernel', kernels) +def test_kernel_clone_after_set_params(kernel): # This test is to verify that using set_params does not # break clone on kernels. # This used to break because in kernels such as the RBF, non-trivial @@ -217,27 +218,25 @@ def test_kernel_clone_after_set_params(): # See https://github.com/scikit-learn/scikit-learn/issues/6961 # for more details. bounds = (1e-5, 1e5) - for kernel in kernels: - kernel_cloned = clone(kernel) - params = kernel.get_params() - # RationalQuadratic kernel is isotropic. - isotropic_kernels = (ExpSineSquared, RationalQuadratic) - if 'length_scale' in params and not isinstance(kernel, - isotropic_kernels): - length_scale = params['length_scale'] - if np.iterable(length_scale): - params['length_scale'] = length_scale[0] - params['length_scale_bounds'] = bounds - else: - params['length_scale'] = [length_scale] * 2 - params['length_scale_bounds'] = bounds * 2 - kernel_cloned.set_params(**params) - kernel_cloned_clone = clone(kernel_cloned) - assert_equal(kernel_cloned_clone.get_params(), - kernel_cloned.get_params()) - assert_not_equal(id(kernel_cloned_clone), id(kernel_cloned)) - yield (check_hyperparameters_equal, kernel_cloned, - kernel_cloned_clone) + kernel_cloned = clone(kernel) + params = kernel.get_params() + # RationalQuadratic kernel is isotropic. 
+ isotropic_kernels = (ExpSineSquared, RationalQuadratic) + if 'length_scale' in params and not isinstance(kernel, + isotropic_kernels): + length_scale = params['length_scale'] + if np.iterable(length_scale): + params['length_scale'] = length_scale[0] + params['length_scale_bounds'] = bounds + else: + params['length_scale'] = [length_scale] * 2 + params['length_scale_bounds'] = bounds * 2 + kernel_cloned.set_params(**params) + kernel_cloned_clone = clone(kernel_cloned) + assert_equal(kernel_cloned_clone.get_params(), + kernel_cloned.get_params()) + assert_not_equal(id(kernel_cloned_clone), id(kernel_cloned)) + check_hyperparameters_equal(kernel_cloned, kernel_cloned_clone) def test_matern_kernel(): diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index a2f2a135b3ae4..6b35029a01a9b 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -1,7 +1,8 @@ import numpy as np import scipy.sparse as sp from scipy import linalg -from itertools import product + +import pytest from sklearn.utils.testing import assert_true from sklearn.utils.testing import assert_almost_equal @@ -151,47 +152,45 @@ def test_ridge_regression_convergence_fail(): tol=0., max_iter=None, verbose=1) -def test_ridge_sample_weights(): +@pytest.mark.parametrize('n_samples, n_features', ((6, 5), (5, 10))) +@pytest.mark.parametrize('alpha', (1.0, 1e-2)) +@pytest.mark.parametrize('intercept', (True, False)) +@pytest.mark.parametrize('solver', ('svd', 'cholesky', 'lsqr', 'sparse_cg')) +def test_ridge_sample_weights(n_samples, n_features, alpha, intercept, solver): # TODO: loop over sparse data as well rng = np.random.RandomState(0) - param_grid = product((1.0, 1e-2), (True, False), - ('svd', 'cholesky', 'lsqr', 'sparse_cg')) - - for n_samples, n_features in ((6, 5), (5, 10)): - - y = rng.randn(n_samples) - X = rng.randn(n_samples, n_features) - sample_weight = 1.0 + rng.rand(n_samples) - for (alpha, intercept, solver) in param_grid: - - # Ridge with explicit sample_weight - est = Ridge(alpha=alpha, fit_intercept=intercept, solver=solver) - est.fit(X, y, sample_weight=sample_weight) - coefs = est.coef_ - inter = est.intercept_ - - # Closed form of the weighted regularized least square - # theta = (X^T W X + alpha I)^(-1) * X^T W y - W = np.diag(sample_weight) - if intercept is False: - X_aug = X - I = np.eye(n_features) - else: - dummy_column = np.ones(shape=(n_samples, 1)) - X_aug = np.concatenate((dummy_column, X), axis=1) - I = np.eye(n_features + 1) - I[0, 0] = 0 + y = rng.randn(n_samples) + X = rng.randn(n_samples, n_features) + sample_weight = 1.0 + rng.rand(n_samples) + + # Ridge with explicit sample_weight + est = Ridge(alpha=alpha, fit_intercept=intercept, solver=solver) + est.fit(X, y, sample_weight=sample_weight) + coefs = est.coef_ + inter = est.intercept_ + + # Closed form of the weighted regularized least square + # theta = (X^T W X + alpha I)^(-1) * X^T W y + W = np.diag(sample_weight) + if intercept is False: + X_aug = X + Id = np.eye(n_features) + else: + dummy_column = np.ones(shape=(n_samples, 1)) + X_aug = np.concatenate((dummy_column, X), axis=1) + Id = np.eye(n_features + 1) + Id[0, 0] = 0 - cf_coefs = linalg.solve(X_aug.T.dot(W).dot(X_aug) + alpha * I, - X_aug.T.dot(W).dot(y)) + cf_coefs = linalg.solve(X_aug.T.dot(W).dot(X_aug) + alpha * Id, + X_aug.T.dot(W).dot(y)) - if intercept is False: - assert_array_almost_equal(coefs, cf_coefs) - else: - assert_array_almost_equal(coefs, cf_coefs[1:]) - 
assert_almost_equal(inter, cf_coefs[0]) + if intercept is False: + assert_array_almost_equal(coefs, cf_coefs) + else: + assert_array_almost_equal(coefs, cf_coefs[1:]) + assert_almost_equal(inter, cf_coefs[0]) def test_ridge_shapes(): @@ -483,15 +482,13 @@ def check_dense_sparse(test_func): assert_array_almost_equal(ret_dense, ret_sparse, decimal=3) -def test_dense_sparse(): - for test_func in (_test_ridge_loo, - _test_ridge_cv, - _test_ridge_cv_normalize, - _test_ridge_diabetes, - _test_multi_ridge_diabetes, - _test_ridge_classifiers, - _test_tolerance): - yield check_dense_sparse, test_func +@pytest.mark.parametrize( + 'test_func', + (_test_ridge_loo, _test_ridge_cv, _test_ridge_cv_normalize, + _test_ridge_diabetes, _test_multi_ridge_diabetes, + _test_ridge_classifiers, _test_tolerance)) +def test_dense_sparse(test_func): + check_dense_sparse(test_func) def test_ridge_cv_sparse_svd(): diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index 6b1d87bb18bf5..4118dd997f00c 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ b/sklearn/manifold/tests/test_t_sne.py @@ -3,6 +3,8 @@ import numpy as np import scipy.sparse as sp +import pytest + from sklearn.neighbors import BallTree from sklearn.neighbors import NearestNeighbors from sklearn.utils.testing import assert_less_equal @@ -807,9 +809,9 @@ def assert_uniform_grid(Y, try_name=None): assert_less(largest_to_mean, 2, msg=try_name) -def test_uniform_grid(): - for method in ['barnes_hut', 'exact']: - yield check_uniform_grid, method +@pytest.mark.parametrize('method', ['barnes_hut', 'exact']) +def test_uniform_grid(method): + check_uniform_grid(method) def test_bh_match_exact(): diff --git a/sklearn/metrics/cluster/tests/test_common.py b/sklearn/metrics/cluster/tests/test_common.py index 3536fdcaa69fa..59c76c6bca38a 100644 --- a/sklearn/metrics/cluster/tests/test_common.py +++ b/sklearn/metrics/cluster/tests/test_common.py @@ -99,10 +99,7 @@ def test_non_symmetry(metric_name, y1, y2): assert metric(y1, y2) != pytest.approx(metric(y2, y1)) -@pytest.mark.parametrize( - "metric_name", - [name for name in NORMALIZED_METRICS] -) +@pytest.mark.parametrize("metric_name", NORMALIZED_METRICS) def test_normalized_output(metric_name): upper_bound_1 = [0, 0, 0, 1, 1, 1] upper_bound_2 = [0, 0, 0, 1, 1, 1] @@ -124,7 +121,7 @@ def test_normalized_output(metric_name): # that is when 0 and 1 exchanged. 
@pytest.mark.parametrize( "metric_name", - [name for name in dict(SUPERVISED_METRICS, **UNSUPERVISED_METRICS)] + dict(SUPERVISED_METRICS, **UNSUPERVISED_METRICS) ) def test_permute_labels(metric_name): y_label = np.array([0, 0, 0, 1, 1, 0, 1]) @@ -145,7 +142,7 @@ def test_permute_labels(metric_name): # For all clustering metrics Input parameters can be both @pytest.mark.parametrize( "metric_name", - [name for name in dict(SUPERVISED_METRICS, **UNSUPERVISED_METRICS)] + dict(SUPERVISED_METRICS, **UNSUPERVISED_METRICS) ) # in the form of arrays lists, positive, negetive or string def test_format_invariance(metric_name): diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 799b3e4fe9bf7..f14e08cc8d237 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -4,6 +4,8 @@ from scipy.sparse import dok_matrix, csr_matrix, issparse from scipy.spatial.distance import cosine, cityblock, minkowski, wminkowski +import pytest + from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_almost_equal @@ -196,16 +198,24 @@ def check_pairwise_parallel(func, metric, kwds): assert_array_almost_equal(S, S2) -def test_pairwise_parallel(): - wminkowski_kwds = {'w': np.arange(1, 5).astype('double'), 'p': 1} - metrics = [(pairwise_distances, 'euclidean', {}), - (pairwise_distances, wminkowski, wminkowski_kwds), - (pairwise_distances, 'wminkowski', wminkowski_kwds), - (pairwise_kernels, 'polynomial', {'degree': 1}), - (pairwise_kernels, callable_rbf_kernel, {'gamma': .1}), - ] - for func, metric, kwds in metrics: - yield check_pairwise_parallel, func, metric, kwds +_wminkowski_kwds = {'w': np.arange(1, 5).astype('double'), 'p': 1} + + +def callable_rbf_kernel(x, y, **kwds): + # Callable version of pairwise.rbf_kernel. + K = rbf_kernel(np.atleast_2d(x), np.atleast_2d(y), **kwds) + return K + + +@pytest.mark.parametrize( + 'func, metric, kwds', + [(pairwise_distances, 'euclidean', {}), + (pairwise_distances, wminkowski, _wminkowski_kwds), + (pairwise_distances, 'wminkowski', _wminkowski_kwds), + (pairwise_kernels, 'polynomial', {'degree': 1}), + (pairwise_kernels, callable_rbf_kernel, {'gamma': .1})]) +def test_pairwise_parallel(func, metric, kwds): + check_pairwise_parallel(func, metric, kwds) def test_pairwise_callable_nonstrict_metric(): @@ -215,12 +225,6 @@ def test_pairwise_callable_nonstrict_metric(): assert_equal(pairwise_distances([[1.]], metric=lambda x, y: 5)[0, 0], 5) -def callable_rbf_kernel(x, y, **kwds): - # Callable version of pairwise.rbf_kernel. - K = rbf_kernel(np.atleast_2d(x), np.atleast_2d(y), **kwds) - return K - - def test_pairwise_kernels(): # Test the pairwise_kernels helper function. 
rng = np.random.RandomState(0) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 07c35c609358d..f93c11820943b 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -2,7 +2,6 @@ import pytest import numpy as np -from itertools import product import warnings from scipy.sparse import csr_matrix @@ -923,18 +922,29 @@ def check_alternative_lrap_implementation(lrap_score, n_classes=5, assert_almost_equal(score_lrap, score_my_lrap) -def test_label_ranking_avp(): - for fn in [label_ranking_average_precision_score, _my_lrap]: - yield check_lrap_toy, fn - yield check_lrap_without_tie_and_increasing_score, fn - yield check_lrap_only_ties, fn - yield check_zero_or_all_relevant_labels, fn - yield check_lrap_error_raised, label_ranking_average_precision_score +@pytest.mark.parametrize( + 'check', + (check_lrap_toy, + check_lrap_without_tie_and_increasing_score, + check_lrap_only_ties, + check_zero_or_all_relevant_labels)) +@pytest.mark.parametrize( + 'func', + (label_ranking_average_precision_score, _my_lrap)) +def test_label_ranking_avp(check, func): + check(func) - for n_samples, n_classes, random_state in product((1, 2, 8, 20), - (2, 5, 10), - range(1)): - yield (check_alternative_lrap_implementation, + +def test_lrap_error_raised(): + check_lrap_error_raised(label_ranking_average_precision_score) + + +@pytest.mark.parametrize('n_samples', (1, 2, 8, 20)) +@pytest.mark.parametrize('n_classes', (2, 5, 10)) +@pytest.mark.parametrize('random_state', range(1)) +def test_alternative_lrap_implementation(n_samples, n_classes, random_state): + + check_alternative_lrap_implementation( label_ranking_average_precision_score, n_classes, n_samples, random_state) diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 6af6418635d59..8bb3c3c137dcc 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -6,6 +6,8 @@ import numpy as np +import pytest + from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_equal @@ -491,12 +493,12 @@ def check_scorer_memmap(scorer_name): assert isinstance(score, numbers.Number), scorer_name -def test_scorer_memmap_input(): +@pytest.mark.parametrize('name', SCORERS) +def test_scorer_memmap_input(name): # Non-regression test for #6147: some score functions would # return singleton memmap when computed on memmap data instead of scalar # float values. 
- for name in SCORERS.keys(): - yield check_scorer_memmap, name + check_scorer_memmap(name) def test_deprecated_names(): diff --git a/sklearn/mixture/tests/test_gmm.py b/sklearn/mixture/tests/test_gmm.py index 8eaf5534544c8..9e2288413f4ea 100644 --- a/sklearn/mixture/tests/test_gmm.py +++ b/sklearn/mixture/tests/test_gmm.py @@ -498,10 +498,11 @@ def check_positive_definite_covars(covariance_type): assert_greater(np.linalg.det(c), 0) -def test_positive_definite_covars(): +@pytest.mark.parametrize('covariance_type', + ["full", "tied", "diag", "spherical"]) +def test_positive_definite_covars(covariance_type): # Check positive definiteness for all covariance types - for covariance_type in ["full", "tied", "diag", "spherical"]: - yield check_positive_definite_covars, covariance_type + check_positive_definite_covars(covariance_type) # This function tests the deprecated old GMM class diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 2929916619769..a537b9f53518a 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -387,8 +387,8 @@ def test_cross_validate(): scores = (train_mse_scores, test_mse_scores, train_r2_scores, test_r2_scores, fitted_estimators) - yield check_cross_validate_single_metric, est, X, y, scores - yield check_cross_validate_multi_metric, est, X, y, scores + check_cross_validate_single_metric(est, X, y, scores) + check_cross_validate_multi_metric(est, X, y, scores) def test_cross_validate_return_train_score_warn(): diff --git a/sklearn/neighbors/tests/test_dist_metrics.py b/sklearn/neighbors/tests/test_dist_metrics.py index 23b7656cb313b..f4d6dc3e74c5e 100644 --- a/sklearn/neighbors/tests/test_dist_metrics.py +++ b/sklearn/neighbors/tests/test_dist_metrics.py @@ -4,6 +4,8 @@ import numpy as np from numpy.testing import assert_array_almost_equal +import pytest + from scipy.spatial.distance import cdist from sklearn.neighbors.dist_metrics import DistanceMetric from sklearn.neighbors import BallTree @@ -15,107 +17,117 @@ def dist_func(x1, x2, p): return np.sum((x1 - x2) ** p) ** (1. 
/ p) -class TestMetrics(object): - n1 = 20 - n2 = 25 - d = 4 - zero_frac = 0.5 - rseed = 0 - dtype = np.float64 - rng = check_random_state(rseed) - X1 = rng.random_sample((n1, d)).astype(dtype) - X2 = rng.random_sample((n2, d)).astype(dtype) - - # make boolean arrays: ones and zeros - X1_bool = X1.round(0) - X2_bool = X2.round(0) - - V = rng.random_sample((d, d)) - VI = np.dot(V, V.T) - - metrics = {'euclidean': {}, - 'cityblock': {}, - 'minkowski': dict(p=(1, 1.5, 2, 3)), - 'chebyshev': {}, - 'seuclidean': dict(V=(rng.random_sample(d),)), - 'wminkowski': dict(p=(1, 1.5, 3), - w=(rng.random_sample(d),)), - 'mahalanobis': dict(VI=(VI,)), - 'hamming': {}, - 'canberra': {}, - 'braycurtis': {}} - - bool_metrics = ['matching', 'jaccard', 'dice', - 'kulsinski', 'rogerstanimoto', 'russellrao', - 'sokalmichener', 'sokalsneath'] - - def test_cdist(self): - for metric, argdict in self.metrics.items(): - keys = argdict.keys() - for vals in itertools.product(*argdict.values()): - kwargs = dict(zip(keys, vals)) - D_true = cdist(self.X1, self.X2, metric, **kwargs) - yield self.check_cdist, metric, kwargs, D_true - - for metric in self.bool_metrics: - D_true = cdist(self.X1_bool, self.X2_bool, metric) - yield self.check_cdist_bool, metric, D_true - - def check_cdist(self, metric, kwargs, D_true): - dm = DistanceMetric.get_metric(metric, **kwargs) - D12 = dm.pairwise(self.X1, self.X2) - assert_array_almost_equal(D12, D_true) - - def check_cdist_bool(self, metric, D_true): - dm = DistanceMetric.get_metric(metric) - D12 = dm.pairwise(self.X1_bool, self.X2_bool) - assert_array_almost_equal(D12, D_true) - - def test_pdist(self): - for metric, argdict in self.metrics.items(): - keys = argdict.keys() - for vals in itertools.product(*argdict.values()): - kwargs = dict(zip(keys, vals)) - D_true = cdist(self.X1, self.X1, metric, **kwargs) - yield self.check_pdist, metric, kwargs, D_true - - for metric in self.bool_metrics: - D_true = cdist(self.X1_bool, self.X1_bool, metric) - yield self.check_pdist_bool, metric, D_true - - def check_pdist(self, metric, kwargs, D_true): - dm = DistanceMetric.get_metric(metric, **kwargs) - D12 = dm.pairwise(self.X1) - assert_array_almost_equal(D12, D_true) - - def check_pdist_bool(self, metric, D_true): - dm = DistanceMetric.get_metric(metric) - D12 = dm.pairwise(self.X1_bool) - assert_array_almost_equal(D12, D_true) - - def test_pickle(self): - for metric, argdict in self.metrics.items(): - keys = argdict.keys() - for vals in itertools.product(*argdict.values()): - kwargs = dict(zip(keys, vals)) - yield self.check_pickle, metric, kwargs - - for metric in self.bool_metrics: - yield self.check_pickle_bool, metric - - def check_pickle_bool(self, metric): - dm = DistanceMetric.get_metric(metric) - D1 = dm.pairwise(self.X1_bool) - dm2 = pickle.loads(pickle.dumps(dm)) - D2 = dm2.pairwise(self.X1_bool) - assert_array_almost_equal(D1, D2) - - def check_pickle(self, metric, kwargs): - dm = DistanceMetric.get_metric(metric, **kwargs) - D1 = dm.pairwise(self.X1) - dm2 = pickle.loads(pickle.dumps(dm)) - D2 = dm2.pairwise(self.X1) - assert_array_almost_equal(D1, D2) +rng = check_random_state(0) +d = 4 +n1 = 20 +n2 = 25 +X1 = rng.random_sample((n1, d)).astype('float64') +X2 = rng.random_sample((n2, d)).astype('float64') + +# make boolean arrays: ones and zeros +X1_bool = X1.round(0) +X2_bool = X2.round(0) + +V = rng.random_sample((d, d)) +VI = np.dot(V, V.T) + +BOOL_METRICS = ['matching', 'jaccard', 'dice', + 'kulsinski', 'rogerstanimoto', 'russellrao', + 'sokalmichener', 'sokalsneath'] + 
+METRICS_DEFAULT_PARAMS = {'euclidean': {}, + 'cityblock': {}, + 'minkowski': dict(p=(1, 1.5, 2, 3)), + 'chebyshev': {}, + 'seuclidean': dict(V=(rng.random_sample(d),)), + 'wminkowski': dict(p=(1, 1.5, 3), + w=(rng.random_sample(d),)), + 'mahalanobis': dict(VI=(VI,)), + 'hamming': {}, + 'canberra': {}, + 'braycurtis': {}} + + +@pytest.mark.parametrize('metric', METRICS_DEFAULT_PARAMS) +def test_cdist(metric): + argdict = METRICS_DEFAULT_PARAMS[metric] + keys = argdict.keys() + for vals in itertools.product(*argdict.values()): + kwargs = dict(zip(keys, vals)) + D_true = cdist(X1, X2, metric, **kwargs) + check_cdist(metric, kwargs, D_true) + + +@pytest.mark.parametrize('metric', BOOL_METRICS) +def test_cdist_bool_metric(metric): + D_true = cdist(X1_bool, X2_bool, metric) + check_cdist_bool(metric, D_true) + + +def check_cdist(metric, kwargs, D_true): + dm = DistanceMetric.get_metric(metric, **kwargs) + D12 = dm.pairwise(X1, X2) + assert_array_almost_equal(D12, D_true) + + +def check_cdist_bool(metric, D_true): + dm = DistanceMetric.get_metric(metric) + D12 = dm.pairwise(X1_bool, X2_bool) + assert_array_almost_equal(D12, D_true) + + +@pytest.mark.parametrize('metric', METRICS_DEFAULT_PARAMS) +def test_pdist(metric): + argdict = METRICS_DEFAULT_PARAMS[metric] + keys = argdict.keys() + for vals in itertools.product(*argdict.values()): + kwargs = dict(zip(keys, vals)) + D_true = cdist(X1, X1, metric, **kwargs) + check_pdist(metric, kwargs, D_true) + + +@pytest.mark.parametrize('metric', BOOL_METRICS) +def test_pdist_bool_metrics(metric): + D_true = cdist(X1_bool, X1_bool, metric) + check_pdist_bool(metric, D_true) + + +def check_pdist(metric, kwargs, D_true): + dm = DistanceMetric.get_metric(metric, **kwargs) + D12 = dm.pairwise(X1) + assert_array_almost_equal(D12, D_true) + + +def check_pdist_bool(metric, D_true): + dm = DistanceMetric.get_metric(metric) + D12 = dm.pairwise(X1_bool) + assert_array_almost_equal(D12, D_true) + + +@pytest.mark.parametrize('metric', METRICS_DEFAULT_PARAMS) +def test_pickle(metric): + argdict = METRICS_DEFAULT_PARAMS[metric] + keys = argdict.keys() + for vals in itertools.product(*argdict.values()): + kwargs = dict(zip(keys, vals)) + check_pickle(metric, kwargs) + + +@pytest.mark.parametrize('metric', BOOL_METRICS) +def test_pickle_bool_metrics(metric): + dm = DistanceMetric.get_metric(metric) + D1 = dm.pairwise(X1_bool) + dm2 = pickle.loads(pickle.dumps(dm)) + D2 = dm2.pairwise(X1_bool) + assert_array_almost_equal(D1, D2) + + +def check_pickle(metric, kwargs): + dm = DistanceMetric.get_metric(metric, **kwargs) + D1 = dm.pairwise(X1) + dm2 = pickle.loads(pickle.dumps(dm)) + D2 = dm2.pairwise(X1) + assert_array_almost_equal(D1, D2) def test_haversine_metric(): diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 0ecb3b67805fb..acff2f4e8846b 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -4,6 +4,8 @@ from scipy.sparse import (bsr_matrix, coo_matrix, csc_matrix, csr_matrix, dok_matrix, lil_matrix, issparse) +import pytest + from sklearn import metrics from sklearn import neighbors, datasets from sklearn.exceptions import DataConversionWarning @@ -1244,63 +1246,57 @@ def test_include_self_neighbors_graph(): assert_array_equal(rng_not_self, [[0., 1.], [1., 0.]]) -def test_same_knn_parallel(): +@pytest.mark.parametrize('algorithm', ALGORITHMS) +def test_same_knn_parallel(algorithm): X, y = datasets.make_classification(n_samples=30, n_features=5, n_redundant=0, 
random_state=0) X_train, X_test, y_train, y_test = train_test_split(X, y) - def check_same_knn_parallel(algorithm): - clf = neighbors.KNeighborsClassifier(n_neighbors=3, - algorithm=algorithm) - clf.fit(X_train, y_train) - y = clf.predict(X_test) - dist, ind = clf.kneighbors(X_test) - graph = clf.kneighbors_graph(X_test, mode='distance').toarray() - - clf.set_params(n_jobs=3) - clf.fit(X_train, y_train) - y_parallel = clf.predict(X_test) - dist_parallel, ind_parallel = clf.kneighbors(X_test) - graph_parallel = \ - clf.kneighbors_graph(X_test, mode='distance').toarray() - - assert_array_equal(y, y_parallel) - assert_array_almost_equal(dist, dist_parallel) - assert_array_equal(ind, ind_parallel) - assert_array_almost_equal(graph, graph_parallel) + clf = neighbors.KNeighborsClassifier(n_neighbors=3, + algorithm=algorithm) + clf.fit(X_train, y_train) + y = clf.predict(X_test) + dist, ind = clf.kneighbors(X_test) + graph = clf.kneighbors_graph(X_test, mode='distance').toarray() - for algorithm in ALGORITHMS: - yield check_same_knn_parallel, algorithm + clf.set_params(n_jobs=3) + clf.fit(X_train, y_train) + y_parallel = clf.predict(X_test) + dist_parallel, ind_parallel = clf.kneighbors(X_test) + graph_parallel = \ + clf.kneighbors_graph(X_test, mode='distance').toarray() + + assert_array_equal(y, y_parallel) + assert_array_almost_equal(dist, dist_parallel) + assert_array_equal(ind, ind_parallel) + assert_array_almost_equal(graph, graph_parallel) -def test_same_radius_neighbors_parallel(): +@pytest.mark.parametrize('algorithm', ALGORITHMS) +def test_same_radius_neighbors_parallel(algorithm): X, y = datasets.make_classification(n_samples=30, n_features=5, n_redundant=0, random_state=0) X_train, X_test, y_train, y_test = train_test_split(X, y) - def check_same_radius_neighbors_parallel(algorithm): - clf = neighbors.RadiusNeighborsClassifier(radius=10, - algorithm=algorithm) - clf.fit(X_train, y_train) - y = clf.predict(X_test) - dist, ind = clf.radius_neighbors(X_test) - graph = clf.radius_neighbors_graph(X_test, mode='distance').toarray() - - clf.set_params(n_jobs=3) - clf.fit(X_train, y_train) - y_parallel = clf.predict(X_test) - dist_parallel, ind_parallel = clf.radius_neighbors(X_test) - graph_parallel = \ - clf.radius_neighbors_graph(X_test, mode='distance').toarray() - - assert_array_equal(y, y_parallel) - for i in range(len(dist)): - assert_array_almost_equal(dist[i], dist_parallel[i]) - assert_array_equal(ind[i], ind_parallel[i]) - assert_array_almost_equal(graph, graph_parallel) - - for algorithm in ALGORITHMS: - yield check_same_radius_neighbors_parallel, algorithm + clf = neighbors.RadiusNeighborsClassifier(radius=10, + algorithm=algorithm) + clf.fit(X_train, y_train) + y = clf.predict(X_test) + dist, ind = clf.radius_neighbors(X_test) + graph = clf.radius_neighbors_graph(X_test, mode='distance').toarray() + + clf.set_params(n_jobs=3) + clf.fit(X_train, y_train) + y_parallel = clf.predict(X_test) + dist_parallel, ind_parallel = clf.radius_neighbors(X_test) + graph_parallel = \ + clf.radius_neighbors_graph(X_test, mode='distance').toarray() + + assert_array_equal(y, y_parallel) + for i in range(len(dist)): + assert_array_almost_equal(dist[i], dist_parallel[i]) + assert_array_equal(ind[i], ind_parallel[i]) + assert_array_almost_equal(graph, graph_parallel) def test_dtype_convert(): diff --git a/sklearn/neighbors/tests/test_quad_tree.py b/sklearn/neighbors/tests/test_quad_tree.py index 6cfa4bcc562e2..156bfc232a55d 100644 --- a/sklearn/neighbors/tests/test_quad_tree.py +++ 
b/sklearn/neighbors/tests/test_quad_tree.py @@ -1,5 +1,8 @@ import pickle import numpy as np + +import pytest + from sklearn.neighbors.quad_tree import _QuadTree from sklearn.utils import check_random_state @@ -58,50 +61,43 @@ def test_quadtree_similar_point(): tree._check_coherence() -def test_quad_tree_pickle(): +@pytest.mark.parametrize('n_dimensions', (2, 3)) +@pytest.mark.parametrize('protocol', (0, 1, 2)) +def test_quad_tree_pickle(n_dimensions, protocol): rng = check_random_state(0) - for n_dimensions in (2, 3): - X = rng.random_sample((10, n_dimensions)) - - tree = _QuadTree(n_dimensions=n_dimensions, verbose=0) - tree.build_tree(X) + X = rng.random_sample((10, n_dimensions)) - def check_pickle_protocol(protocol): - s = pickle.dumps(tree, protocol=protocol) - bt2 = pickle.loads(s) + tree = _QuadTree(n_dimensions=n_dimensions, verbose=0) + tree.build_tree(X) - for x in X: - cell_x_tree = tree.get_cell(x) - cell_x_bt2 = bt2.get_cell(x) - assert cell_x_tree == cell_x_bt2 + s = pickle.dumps(tree, protocol=protocol) + bt2 = pickle.loads(s) - for protocol in (0, 1, 2): - yield check_pickle_protocol, protocol + for x in X: + cell_x_tree = tree.get_cell(x) + cell_x_bt2 = bt2.get_cell(x) + assert cell_x_tree == cell_x_bt2 -def test_qt_insert_duplicate(): +@pytest.mark.parametrize('n_dimensions', (2, 3)) +def test_qt_insert_duplicate(n_dimensions): rng = check_random_state(0) - def check_insert_duplicate(n_dimensions=2): - - X = rng.random_sample((10, n_dimensions)) - Xd = np.r_[X, X[:5]] - tree = _QuadTree(n_dimensions=n_dimensions, verbose=0) - tree.build_tree(Xd) - - cumulative_size = tree.cumulative_size - leafs = tree.leafs + X = rng.random_sample((10, n_dimensions)) + Xd = np.r_[X, X[:5]] + tree = _QuadTree(n_dimensions=n_dimensions, verbose=0) + tree.build_tree(Xd) - # Assert that the first 5 are indeed duplicated and that the next - # ones are single point leaf - for i, x in enumerate(X): - cell_id = tree.get_cell(x) - assert leafs[cell_id] - assert cumulative_size[cell_id] == 1 + (i < 5) + cumulative_size = tree.cumulative_size + leafs = tree.leafs - for n_dimensions in (2, 3): - yield check_insert_duplicate, n_dimensions + # Assert that the first 5 are indeed duplicated and that the next + # ones are single point leaf + for i, x in enumerate(X): + cell_id = tree.get_cell(x) + assert leafs[cell_id] + assert cumulative_size[cell_id] == 1 + (i < 5) def test_summarize(): From 4140c9999d19b409df47fc780953b15de3c4eb26 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 5 May 2018 22:43:18 +0300 Subject: [PATCH 09/25] Replace last yields in tests --- sklearn/neighbors/tests/test_kd_tree.py | 58 +++--- sklearn/neighbors/tests/test_kde.py | 52 ++--- sklearn/preprocessing/tests/test_label.py | 10 +- sklearn/tests/test_naive_bayes.py | 222 +++++++++++----------- 4 files changed, 179 insertions(+), 163 deletions(-) diff --git a/sklearn/neighbors/tests/test_kd_tree.py b/sklearn/neighbors/tests/test_kd_tree.py index e1b7cb1965987..46cddc711e769 100644 --- a/sklearn/neighbors/tests/test_kd_tree.py +++ b/sklearn/neighbors/tests/test_kd_tree.py @@ -1,5 +1,8 @@ import numpy as np from numpy.testing import assert_array_almost_equal + +import pytest + from sklearn.neighbors.kd_tree import (KDTree, NeighborsHeap, simultaneous_sort, kernel_norm, nodeheap_sort, DTYPE, ITYPE) @@ -37,18 +40,17 @@ def check_neighbors(dualtree, breadth_first, k, metric, X, Y, kwargs): assert_array_almost_equal(dist1, dist2) -def test_kd_tree_query(): +@pytest.mark.parametrize('metric', METRICS) 
+@pytest.mark.parametrize('k', (1, 3, 5)) +@pytest.mark.parametrize('dualtree', (True, False)) +@pytest.mark.parametrize('breadth_first', (True, False)) +def test_kd_tree_query(metric, k, dualtree, breadth_first): rng = check_random_state(0) X = rng.random_sample((40, DIMENSION)) Y = rng.random_sample((10, DIMENSION)) - for (metric, kwargs) in METRICS.items(): - for k in (1, 3, 5): - for dualtree in (True, False): - for breadth_first in (True, False): - yield (check_neighbors, - dualtree, breadth_first, - k, metric, X, Y, kwargs) + kwargs = METRICS[metric] + check_neighbors(dualtree, breadth_first, k, metric, X, Y, kwargs) def test_kd_tree_query_radius(n_samples=100, n_features=10): @@ -118,22 +120,24 @@ def check_results(kernel, h, atol, rtol, breadth_first, Y, kdt, dens_true): rtol=max(rtol, 1e-7)) -def test_kd_tree_kde(n_samples=100, n_features=3): +@pytest.mark.parametrize('kernel', + ['gaussian', 'tophat', 'epanechnikov', + 'exponential', 'linear', 'cosine']) +@pytest.mark.parametrize('h', [0.01, 0.1, 1]) +def test_kd_tree_kde(kernel, h): + n_samples, n_features = (100, 3) rng = check_random_state(0) X = rng.random_sample((n_samples, n_features)) Y = rng.random_sample((n_samples, n_features)) kdt = KDTree(X, leaf_size=10) - for kernel in ['gaussian', 'tophat', 'epanechnikov', - 'exponential', 'linear', 'cosine']: - for h in [0.01, 0.1, 1]: - dens_true = compute_kernel_slow(Y, X, kernel, h) + dens_true = compute_kernel_slow(Y, X, kernel, h) - for rtol in [0, 1E-5]: - for atol in [1E-6, 1E-2]: - for breadth_first in (True, False): - yield (check_results, kernel, h, atol, rtol, - breadth_first, Y, kdt, dens_true) + for rtol in [0, 1E-5]: + for atol in [1E-6, 1E-2]: + for breadth_first in (True, False): + check_results(kernel, h, atol, rtol, + breadth_first, Y, kdt, dens_true) def test_gaussian_kde(n_samples=1000): @@ -153,7 +157,9 @@ def test_gaussian_kde(n_samples=1000): assert_array_almost_equal(dens_kdt, dens_gkde, decimal=3) -def test_kd_tree_two_point(n_samples=100, n_features=3): +@pytest.mark.parametrize('dualtree', (True, False)) +def test_kd_tree_two_point(dualtree): + n_samples, n_features = (100, 3) rng = check_random_state(0) X = rng.random_sample((n_samples, n_features)) Y = rng.random_sample((n_samples, n_features)) @@ -163,15 +169,12 @@ def test_kd_tree_two_point(n_samples=100, n_features=3): D = DistanceMetric.get_metric("euclidean").pairwise(Y, X) counts_true = [(D <= ri).sum() for ri in r] - def check_two_point(r, dualtree): - counts = kdt.two_point_correlation(Y, r=r, dualtree=dualtree) - assert_array_almost_equal(counts, counts_true) - - for dualtree in (True, False): - yield check_two_point, r, dualtree + counts = kdt.two_point_correlation(Y, r=r, dualtree=dualtree) + assert_array_almost_equal(counts, counts_true) -def test_kd_tree_pickle(): +@pytest.mark.parametrize('protocol', (0, 1, 2)) +def test_kd_tree_pickle(protocol): import pickle rng = check_random_state(0) X = rng.random_sample((10, 3)) @@ -185,8 +188,7 @@ def check_pickle_protocol(protocol): assert_array_almost_equal(ind1, ind2) assert_array_almost_equal(dist1, dist2) - for protocol in (0, 1, 2): - yield check_pickle_protocol, protocol + check_pickle_protocol(protocol) def test_neighbors_heap(n_pts=5, n_nbrs=10): diff --git a/sklearn/neighbors/tests/test_kde.py b/sklearn/neighbors/tests/test_kde.py index 60f294a3df0a9..caffb662608e0 100644 --- a/sklearn/neighbors/tests/test_kde.py +++ b/sklearn/neighbors/tests/test_kde.py @@ -1,4 +1,7 @@ import numpy as np + +import pytest + from sklearn.utils.testing 
import (assert_allclose, assert_raises, assert_equal) from sklearn.neighbors import KernelDensity, KDTree, NearestNeighbors @@ -40,21 +43,25 @@ def check_results(kernel, bandwidth, atol, rtol, X, Y, dens_true): atol=atol, rtol=max(1E-7, rtol)) -def test_kernel_density(n_samples=100, n_features=3): +@pytest.mark.parametrize( + 'kernel', + ['gaussian', 'tophat', 'epanechnikov', + 'exponential', 'linear', 'cosine']) +@pytest.mark.parametrize('bandwidth', [0.01, 0.1, 1]) +def test_kernel_density(kernel, bandwidth): + n_samples, n_features = (100, 3) + rng = np.random.RandomState(0) X = rng.randn(n_samples, n_features) Y = rng.randn(n_samples, n_features) - for kernel in ['gaussian', 'tophat', 'epanechnikov', - 'exponential', 'linear', 'cosine']: - for bandwidth in [0.01, 0.1, 1]: - dens_true = compute_kernel_slow(Y, X, kernel, bandwidth) + dens_true = compute_kernel_slow(Y, X, kernel, bandwidth) - for rtol in [0, 1E-5]: - for atol in [1E-6, 1E-2]: - for breadth_first in (True, False): - yield (check_results, kernel, bandwidth, atol, rtol, - X, Y, dens_true) + for rtol in [0, 1E-5]: + for atol in [1E-6, 1E-2]: + for breadth_first in (True, False): + check_results(kernel, bandwidth, atol, rtol, + X, Y, dens_true) def test_kernel_density_sampling(n_samples=100, n_features=3): @@ -91,23 +98,24 @@ def test_kernel_density_sampling(n_samples=100, n_features=3): assert_equal(kde.sample().shape, (1, 1)) -def test_kde_algorithm_metric_choice(): +@pytest.mark.parametrize('algorithm', ['auto', 'ball_tree', 'kd_tree']) +@pytest.mark.parametrize('metric', + ['euclidean', 'minkowski', 'manhattan', + 'chebyshev', 'haversine']) +def test_kde_algorithm_metric_choice(algorithm, metric): # Smoke test for various metrics and algorithms rng = np.random.RandomState(0) X = rng.randn(10, 2) # 2 features required for haversine dist. 
Y = rng.randn(10, 2) - for algorithm in ['auto', 'ball_tree', 'kd_tree']: - for metric in ['euclidean', 'minkowski', 'manhattan', - 'chebyshev', 'haversine']: - if algorithm == 'kd_tree' and metric not in KDTree.valid_metrics: - assert_raises(ValueError, KernelDensity, - algorithm=algorithm, metric=metric) - else: - kde = KernelDensity(algorithm=algorithm, metric=metric) - kde.fit(X) - y_dens = kde.score_samples(Y) - assert_equal(y_dens.shape, Y.shape[:1]) + if algorithm == 'kd_tree' and metric not in KDTree.valid_metrics: + assert_raises(ValueError, KernelDensity, + algorithm=algorithm, metric=metric) + else: + kde = KernelDensity(algorithm=algorithm, metric=metric) + kde.fit(X) + y_dens = kde.score_samples(Y) + assert_equal(y_dens.shape, Y.shape[:1]) def test_kde_score(n_samples=100, n_features=3): diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 2bc3ae17ea2d4..a263846e6d52e 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -474,7 +474,7 @@ def test_label_binarize_binary(): neg_label = -1 expected = np.array([[2, -1], [-1, 2], [2, -1]])[:, 1].reshape((-1, 1)) - yield check_binarized_results, y, classes, pos_label, neg_label, expected + check_binarized_results(y, classes, pos_label, neg_label, expected) # Binary case where sparse_output = True will not result in a ValueError y = [0, 1, 0] @@ -483,7 +483,7 @@ def test_label_binarize_binary(): neg_label = 0 expected = np.array([[3, 0], [0, 3], [3, 0]])[:, 1].reshape((-1, 1)) - yield check_binarized_results, y, classes, pos_label, neg_label, expected + check_binarized_results(y, classes, pos_label, neg_label, expected) def test_label_binarize_multiclass(): @@ -493,7 +493,7 @@ def test_label_binarize_multiclass(): neg_label = 0 expected = 2 * np.eye(3) - yield check_binarized_results, y, classes, pos_label, neg_label, expected + check_binarized_results(y, classes, pos_label, neg_label, expected) assert_raises(ValueError, label_binarize, y, classes, neg_label=-1, pos_label=pos_label, sparse_output=True) @@ -510,8 +510,8 @@ def test_label_binarize_multilabel(): dok_matrix, lil_matrix]] for y in [y_ind] + y_sparse: - yield (check_binarized_results, y, classes, pos_label, neg_label, - expected) + check_binarized_results(y, classes, pos_label, neg_label, + expected) assert_raises(ValueError, label_binarize, y, classes, neg_label=-1, pos_label=pos_label, sparse_output=True) diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py index 7941f085d7e24..6b090ce4684f9 100644 --- a/sklearn/tests/test_naive_bayes.py +++ b/sklearn/tests/test_naive_bayes.py @@ -178,51 +178,56 @@ def test_discrete_prior(): clf.class_log_prior_, 8) -def test_mnnb(): +@pytest.mark.parametrize('kind', ('dense', 'sparse')) +def test_mnnb(kind): # Test Multinomial Naive Bayes classification. # This checks that MultinomialNB implements fit and predict and returns # correct values for a simple toy dataset. - for X in [X2, scipy.sparse.csr_matrix(X2)]: - # Check the ability to predict the learning set. - clf = MultinomialNB() - assert_raises(ValueError, clf.fit, -X, y2) - y_pred = clf.fit(X, y2).predict(X) + if kind == 'dense': + X = X2 + elif kind == 'sparse': + X = scipy.sparse.csr_matrix(X2) - assert_array_equal(y_pred, y2) + # Check the ability to predict the learning set. 
+ clf = MultinomialNB() + assert_raises(ValueError, clf.fit, -X, y2) + y_pred = clf.fit(X, y2).predict(X) + + assert_array_equal(y_pred, y2) - # Verify that np.log(clf.predict_proba(X)) gives the same results as - # clf.predict_log_proba(X) - y_pred_proba = clf.predict_proba(X) - y_pred_log_proba = clf.predict_log_proba(X) - assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8) + # Verify that np.log(clf.predict_proba(X)) gives the same results as + # clf.predict_log_proba(X) + y_pred_proba = clf.predict_proba(X) + y_pred_log_proba = clf.predict_log_proba(X) + assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8) - # Check that incremental fitting yields the same results - clf2 = MultinomialNB() - clf2.partial_fit(X[:2], y2[:2], classes=np.unique(y2)) - clf2.partial_fit(X[2:5], y2[2:5]) - clf2.partial_fit(X[5:], y2[5:]) + # Check that incremental fitting yields the same results + clf2 = MultinomialNB() + clf2.partial_fit(X[:2], y2[:2], classes=np.unique(y2)) + clf2.partial_fit(X[2:5], y2[2:5]) + clf2.partial_fit(X[5:], y2[5:]) - y_pred2 = clf2.predict(X) - assert_array_equal(y_pred2, y2) + y_pred2 = clf2.predict(X) + assert_array_equal(y_pred2, y2) - y_pred_proba2 = clf2.predict_proba(X) - y_pred_log_proba2 = clf2.predict_log_proba(X) - assert_array_almost_equal(np.log(y_pred_proba2), y_pred_log_proba2, 8) - assert_array_almost_equal(y_pred_proba2, y_pred_proba) - assert_array_almost_equal(y_pred_log_proba2, y_pred_log_proba) + y_pred_proba2 = clf2.predict_proba(X) + y_pred_log_proba2 = clf2.predict_log_proba(X) + assert_array_almost_equal(np.log(y_pred_proba2), y_pred_log_proba2, 8) + assert_array_almost_equal(y_pred_proba2, y_pred_proba) + assert_array_almost_equal(y_pred_log_proba2, y_pred_log_proba) - # Partial fit on the whole data at once should be the same as fit too - clf3 = MultinomialNB() - clf3.partial_fit(X, y2, classes=np.unique(y2)) + # Partial fit on the whole data at once should be the same as fit too + clf3 = MultinomialNB() + clf3.partial_fit(X, y2, classes=np.unique(y2)) - y_pred3 = clf3.predict(X) - assert_array_equal(y_pred3, y2) - y_pred_proba3 = clf3.predict_proba(X) - y_pred_log_proba3 = clf3.predict_log_proba(X) - assert_array_almost_equal(np.log(y_pred_proba3), y_pred_log_proba3, 8) - assert_array_almost_equal(y_pred_proba3, y_pred_proba) - assert_array_almost_equal(y_pred_log_proba3, y_pred_log_proba) + y_pred3 = clf3.predict(X) + assert_array_equal(y_pred3, y2) + y_pred_proba3 = clf3.predict_proba(X) + y_pred_log_proba3 = clf3.predict_log_proba(X) + assert_array_almost_equal(np.log(y_pred_proba3), y_pred_log_proba3, 8) + assert_array_almost_equal(y_pred_proba3, y_pred_proba) + assert_array_almost_equal(y_pred_log_proba3, y_pred_log_proba) def check_partial_fit(cls): @@ -260,62 +265,63 @@ def test_gnb_partial_fit(): assert_array_almost_equal(clf.class_prior_, clf_pf2.class_prior_) -def test_discretenb_pickle(): +@pytest.mark.parametrize('cls', [BernoulliNB, MultinomialNB, GaussianNB]) +def test_discretenb_pickle(cls): # Test picklability of discrete naive Bayes classifiers - for cls in [BernoulliNB, MultinomialNB, GaussianNB]: - clf = cls().fit(X2, y2) - y_pred = clf.predict(X2) + clf = cls().fit(X2, y2) + y_pred = clf.predict(X2) - store = BytesIO() - pickle.dump(clf, store) - clf = pickle.load(BytesIO(store.getvalue())) + store = BytesIO() + pickle.dump(clf, store) + clf = pickle.load(BytesIO(store.getvalue())) - assert_array_equal(y_pred, clf.predict(X2)) + assert_array_equal(y_pred, clf.predict(X2)) - if cls is not GaussianNB: - 
# TODO re-enable me when partial_fit is implemented for GaussianNB + if cls is not GaussianNB: + # TODO re-enable me when partial_fit is implemented for GaussianNB - # Test pickling of estimator trained with partial_fit - clf2 = cls().partial_fit(X2[:3], y2[:3], classes=np.unique(y2)) - clf2.partial_fit(X2[3:], y2[3:]) - store = BytesIO() - pickle.dump(clf2, store) - clf2 = pickle.load(BytesIO(store.getvalue())) - assert_array_equal(y_pred, clf2.predict(X2)) + # Test pickling of estimator trained with partial_fit + clf2 = cls().partial_fit(X2[:3], y2[:3], classes=np.unique(y2)) + clf2.partial_fit(X2[3:], y2[3:]) + store = BytesIO() + pickle.dump(clf2, store) + clf2 = pickle.load(BytesIO(store.getvalue())) + assert_array_equal(y_pred, clf2.predict(X2)) -def test_input_check_fit(): +@pytest.mark.parametrize('cls', [BernoulliNB, MultinomialNB, GaussianNB]) +def test_input_check_fit(cls): # Test input checks for the fit method - for cls in [BernoulliNB, MultinomialNB, GaussianNB]: - # check shape consistency for number of samples at fit time - assert_raises(ValueError, cls().fit, X2, y2[:-1]) - # check shape consistency for number of input features at predict time - clf = cls().fit(X2, y2) - assert_raises(ValueError, clf.predict, X2[:, :-1]) + # check shape consistency for number of samples at fit time + assert_raises(ValueError, cls().fit, X2, y2[:-1]) + # check shape consistency for number of input features at predict time + clf = cls().fit(X2, y2) + assert_raises(ValueError, clf.predict, X2[:, :-1]) -def test_input_check_partial_fit(): - for cls in [BernoulliNB, MultinomialNB]: - # check shape consistency - assert_raises(ValueError, cls().partial_fit, X2, y2[:-1], - classes=np.unique(y2)) - # classes is required for first call to partial fit - assert_raises(ValueError, cls().partial_fit, X2, y2) +@pytest.mark.parametrize('cls', [BernoulliNB, MultinomialNB]) +def test_input_check_partial_fit(cls): + # check shape consistency + assert_raises(ValueError, cls().partial_fit, X2, y2[:-1], + classes=np.unique(y2)) + + # classes is required for first call to partial fit + assert_raises(ValueError, cls().partial_fit, X2, y2) - # check consistency of consecutive classes values - clf = cls() - clf.partial_fit(X2, y2, classes=np.unique(y2)) - assert_raises(ValueError, clf.partial_fit, X2, y2, - classes=np.arange(42)) + # check consistency of consecutive classes values + clf = cls() + clf.partial_fit(X2, y2, classes=np.unique(y2)) + assert_raises(ValueError, clf.partial_fit, X2, y2, + classes=np.arange(42)) - # check consistency of input shape for partial_fit - assert_raises(ValueError, clf.partial_fit, X2[:, :-1], y2) + # check consistency of input shape for partial_fit + assert_raises(ValueError, clf.partial_fit, X2[:, :-1], y2) - # check consistency of input shape for predict - assert_raises(ValueError, clf.predict, X2[:, :-1]) + # check consistency of input shape for predict + assert_raises(ValueError, clf.predict, X2[:, :-1]) def test_discretenb_predict_proba(): @@ -349,34 +355,35 @@ def test_discretenb_predict_proba(): assert_almost_equal(np.sum(np.exp(clf.intercept_)), 1) -def test_discretenb_uniform_prior(): +@pytest.mark.parametrize('cls', [BernoulliNB, MultinomialNB]) +def test_discretenb_uniform_prior(cls): # Test whether discrete NB classes fit a uniform prior # when fit_prior=False and class_prior=None - for cls in [BernoulliNB, MultinomialNB]: - clf = cls() - clf.set_params(fit_prior=False) - clf.fit([[0], [0], [1]], [0, 0, 1]) - prior = np.exp(clf.class_log_prior_) - 
assert_array_almost_equal(prior, np.array([.5, .5])) + clf = cls() + clf.set_params(fit_prior=False) + clf.fit([[0], [0], [1]], [0, 0, 1]) + prior = np.exp(clf.class_log_prior_) + assert_array_almost_equal(prior, np.array([.5, .5])) -def test_discretenb_provide_prior(): +@pytest.mark.parametrize('cls', [BernoulliNB, MultinomialNB]) +def test_discretenb_provide_prior(cls): # Test whether discrete NB classes use provided prior - for cls in [BernoulliNB, MultinomialNB]: - clf = cls(class_prior=[0.5, 0.5]) - clf.fit([[0], [0], [1]], [0, 0, 1]) - prior = np.exp(clf.class_log_prior_) - assert_array_almost_equal(prior, np.array([.5, .5])) + clf = cls(class_prior=[0.5, 0.5]) + clf.fit([[0], [0], [1]], [0, 0, 1]) + prior = np.exp(clf.class_log_prior_) + assert_array_almost_equal(prior, np.array([.5, .5])) - # Inconsistent number of classes with prior - assert_raises(ValueError, clf.fit, [[0], [1], [2]], [0, 1, 2]) - assert_raises(ValueError, clf.partial_fit, [[0], [1]], [0, 1], - classes=[0, 1, 1]) + # Inconsistent number of classes with prior + assert_raises(ValueError, clf.fit, [[0], [1], [2]], [0, 1, 2]) + assert_raises(ValueError, clf.partial_fit, [[0], [1]], [0, 1], + classes=[0, 1, 1]) -def test_discretenb_provide_prior_with_partial_fit(): +@pytest.mark.parametrize('cls', [BernoulliNB, MultinomialNB]) +def test_discretenb_provide_prior_with_partial_fit(cls): # Test whether discrete NB classes use provided prior # when using partial_fit @@ -384,22 +391,21 @@ def test_discretenb_provide_prior_with_partial_fit(): iris_data1, iris_data2, iris_target1, iris_target2 = train_test_split( iris.data, iris.target, test_size=0.4, random_state=415) - for cls in [BernoulliNB, MultinomialNB]: - for prior in [None, [0.3, 0.3, 0.4]]: - clf_full = cls(class_prior=prior) - clf_full.fit(iris.data, iris.target) - clf_partial = cls(class_prior=prior) - clf_partial.partial_fit(iris_data1, iris_target1, - classes=[0, 1, 2]) - clf_partial.partial_fit(iris_data2, iris_target2) - assert_array_almost_equal(clf_full.class_log_prior_, - clf_partial.class_log_prior_) - - -def test_sample_weight_multiclass(): - for cls in [BernoulliNB, MultinomialNB]: - # check shape consistency for number of samples at fit time - yield check_sample_weight_multiclass, cls + for prior in [None, [0.3, 0.3, 0.4]]: + clf_full = cls(class_prior=prior) + clf_full.fit(iris.data, iris.target) + clf_partial = cls(class_prior=prior) + clf_partial.partial_fit(iris_data1, iris_target1, + classes=[0, 1, 2]) + clf_partial.partial_fit(iris_data2, iris_target2) + assert_array_almost_equal(clf_full.class_log_prior_, + clf_partial.class_log_prior_) + + +@pytest.mark.parametrize('cls', [BernoulliNB, MultinomialNB]) +def test_sample_weight_multiclass(cls): + # check shape consistency for number of samples at fit time + check_sample_weight_multiclass(cls) def check_sample_weight_multiclass(cls): From c4663f6d30517701df0fed1c58650e1872b4bb58 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 6 May 2018 22:14:35 +0300 Subject: [PATCH 10/25] More test parametrization --- sklearn/cluster/tests/test_dbscan.py | 60 +++++----- sklearn/cluster/tests/test_k_means.py | 113 +++++++++--------- sklearn/cluster/tests/test_spectral.py | 70 +++++------ sklearn/decomposition/tests/test_nmf.py | 42 +++---- .../decomposition/tests/test_online_lda.py | 108 ++++++++--------- sklearn/decomposition/tests/test_pca.py | 110 ++++++++--------- .../decomposition/tests/test_truncated_svd.py | 44 +++---- sklearn/linear_model/tests/test_ridge.py | 71 ++++++----- 8 files changed, 317 
insertions(+), 301 deletions(-) diff --git a/sklearn/cluster/tests/test_dbscan.py b/sklearn/cluster/tests/test_dbscan.py index 1dee674e49af7..f2d6c5836db8a 100644 --- a/sklearn/cluster/tests/test_dbscan.py +++ b/sklearn/cluster/tests/test_dbscan.py @@ -9,6 +9,8 @@ from scipy.spatial import distance from scipy import sparse +import pytest + from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_raises @@ -306,38 +308,38 @@ def test_weighted_dbscan(): assert_array_equal(label1, est.labels_) -def test_dbscan_core_samples_toy(): +@pytest.mark.parametrize('algorithm', ['brute', 'kd_tree', 'ball_tree']) +def test_dbscan_core_samples_toy(algorithm): X = [[0], [2], [3], [4], [6], [8], [10]] n_samples = len(X) - for algorithm in ['brute', 'kd_tree', 'ball_tree']: - # Degenerate case: every sample is a core sample, either with its own - # cluster or including other close core samples. - core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, - min_samples=1) - assert_array_equal(core_samples, np.arange(n_samples)) - assert_array_equal(labels, [0, 1, 1, 1, 2, 3, 4]) - - # With eps=1 and min_samples=2 only the 3 samples from the denser area - # are core samples. All other points are isolated and considered noise. - core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, - min_samples=2) - assert_array_equal(core_samples, [1, 2, 3]) - assert_array_equal(labels, [-1, 0, 0, 0, -1, -1, -1]) - - # Only the sample in the middle of the dense area is core. Its two - # neighbors are edge samples. Remaining samples are noise. - core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, - min_samples=3) - assert_array_equal(core_samples, [2]) - assert_array_equal(labels, [-1, 0, 0, 0, -1, -1, -1]) - - # It's no longer possible to extract core samples with eps=1: - # everything is noise. - core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, - min_samples=4) - assert_array_equal(core_samples, []) - assert_array_equal(labels, -np.ones(n_samples)) + # Degenerate case: every sample is a core sample, either with its own + # cluster or including other close core samples. + core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, + min_samples=1) + assert_array_equal(core_samples, np.arange(n_samples)) + assert_array_equal(labels, [0, 1, 1, 1, 2, 3, 4]) + + # With eps=1 and min_samples=2 only the 3 samples from the denser area + # are core samples. All other points are isolated and considered noise. + core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, + min_samples=2) + assert_array_equal(core_samples, [1, 2, 3]) + assert_array_equal(labels, [-1, 0, 0, 0, -1, -1, -1]) + + # Only the sample in the middle of the dense area is core. Its two + # neighbors are edge samples. Remaining samples are noise. + core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, + min_samples=3) + assert_array_equal(core_samples, [2]) + assert_array_equal(labels, [-1, 0, 0, 0, -1, -1, -1]) + + # It's no longer possible to extract core samples with eps=1: + # everything is noise. 
+ core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, + min_samples=4) + assert_array_equal(core_samples, []) + assert_array_equal(labels, -np.ones(n_samples)) def test_dbscan_precomputed_metric_with_degenerate_input_arrays(): diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index de8772d761e22..d5ae78f6a60fd 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -4,6 +4,8 @@ import numpy as np from scipy import sparse as sp +import pytest + from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal @@ -268,30 +270,31 @@ def test_k_means_n_init(): assert_raises_regex(ValueError, "n_init", KMeans(n_init=-1).fit, X) -def test_k_means_explicit_init_shape(): +@pytest.mark.parametrize('Class', [KMeans, MiniBatchKMeans]) +def test_k_means_explicit_init_shape(Class): # test for sensible errors when giving explicit init # with wrong number of features or clusters rnd = np.random.RandomState(0) X = rnd.normal(size=(40, 3)) - for Class in [KMeans, MiniBatchKMeans]: - # mismatch of number of features - km = Class(n_init=1, init=X[:, :2], n_clusters=len(X)) - msg = "does not match the number of features of the data" - assert_raises_regex(ValueError, msg, km.fit, X) - # for callable init - km = Class(n_init=1, - init=lambda X_, k, random_state: X_[:, :2], - n_clusters=len(X)) - assert_raises_regex(ValueError, msg, km.fit, X) - # mismatch of number of clusters - msg = "does not match the number of clusters" - km = Class(n_init=1, init=X[:2, :], n_clusters=3) - assert_raises_regex(ValueError, msg, km.fit, X) - # for callable init - km = Class(n_init=1, - init=lambda X_, k, random_state: X_[:2, :], - n_clusters=3) - assert_raises_regex(ValueError, msg, km.fit, X) + + # mismatch of number of features + km = Class(n_init=1, init=X[:, :2], n_clusters=len(X)) + msg = "does not match the number of features of the data" + assert_raises_regex(ValueError, msg, km.fit, X) + # for callable init + km = Class(n_init=1, + init=lambda X_, k, random_state: X_[:, :2], + n_clusters=len(X)) + assert_raises_regex(ValueError, msg, km.fit, X) + # mismatch of number of clusters + msg = "does not match the number of clusters" + km = Class(n_init=1, init=X[:2, :], n_clusters=3) + assert_raises_regex(ValueError, msg, km.fit, X) + # for callable init + km = Class(n_init=1, + init=lambda X_, k, random_state: X_[:2, :], + n_clusters=3) + assert_raises_regex(ValueError, msg, km.fit, X) def test_k_means_fortran_aligned_data(): @@ -776,46 +779,46 @@ def test_max_iter_error(): km.fit, X) -def test_float_precision(): - km = KMeans(n_init=1, random_state=30) - mb_km = MiniBatchKMeans(n_init=1, random_state=30) +@pytest.mark.parametrize('Estimator', [KMeans, MiniBatchKMeans]) +@pytest.mark.parametrize('is_sparse', [False, True]) +def test_float_precision(Estimator, is_sparse): + + estimator = Estimator(n_init=1, random_state=30) inertia = {} X_new = {} centers = {} - for estimator in [km, mb_km]: - for is_sparse in [False, True]: - for dtype in [np.float64, np.float32]: - if is_sparse: - X_test = sp.csr_matrix(X_csr, dtype=dtype) - else: - X_test = X.astype(dtype) - estimator.fit(X_test) - # dtype of cluster centers has to be the dtype of the input - # data - assert_equal(estimator.cluster_centers_.dtype, dtype) - inertia[dtype] = estimator.inertia_ - X_new[dtype] = estimator.transform(X_test) - centers[dtype] = estimator.cluster_centers_ - # 
ensure the extracted row is a 2d array - assert_equal(estimator.predict(X_test[:1]), - estimator.labels_[0]) - if hasattr(estimator, 'partial_fit'): - estimator.partial_fit(X_test[0:3]) - # dtype of cluster centers has to stay the same after - # partial_fit - assert_equal(estimator.cluster_centers_.dtype, dtype) - - # compare arrays with low precision since the difference between - # 32 and 64 bit sometimes makes a difference up to the 4th decimal - # place - assert_array_almost_equal(inertia[np.float32], inertia[np.float64], - decimal=4) - assert_array_almost_equal(X_new[np.float32], X_new[np.float64], - decimal=4) - assert_array_almost_equal(centers[np.float32], centers[np.float64], - decimal=4) + for dtype in [np.float64, np.float32]: + if is_sparse: + X_test = sp.csr_matrix(X_csr, dtype=dtype) + else: + X_test = X.astype(dtype) + estimator.fit(X_test) + # dtype of cluster centers has to be the dtype of the input + # data + assert_equal(estimator.cluster_centers_.dtype, dtype) + inertia[dtype] = estimator.inertia_ + X_new[dtype] = estimator.transform(X_test) + centers[dtype] = estimator.cluster_centers_ + # ensure the extracted row is a 2d array + assert_equal(estimator.predict(X_test[:1]), + estimator.labels_[0]) + if hasattr(estimator, 'partial_fit'): + estimator.partial_fit(X_test[0:3]) + # dtype of cluster centers has to stay the same after + # partial_fit + assert_equal(estimator.cluster_centers_.dtype, dtype) + + # compare arrays with low precision since the difference between + # 32 and 64 bit sometimes makes a difference up to the 4th decimal + # place + assert_array_almost_equal(inertia[np.float32], inertia[np.float64], + decimal=4) + assert_array_almost_equal(X_new[np.float32], X_new[np.float64], + decimal=4) + assert_array_almost_equal(centers[np.float32], centers[np.float64], + decimal=4) def test_k_means_init_centers(): diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 62d9adcc2e34f..0c220e7615e67 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -4,6 +4,8 @@ import numpy as np from scipy import sparse +import pytest + from sklearn.externals.six.moves import cPickle from sklearn.utils import check_random_state @@ -27,7 +29,9 @@ amg_loaded = False -def test_spectral_clustering(): +@pytest.mark.parametrize('eigen_solver', ('arpack', 'lobpcg')) +@pytest.mark.parametrize('assign_labels', ('kmeans', 'discretize')) +def test_spectral_clustering(eigen_solver, assign_labels): S = np.array([[1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0], @@ -36,24 +40,22 @@ def test_spectral_clustering(): [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0], [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0]]) - for eigen_solver in ('arpack', 'lobpcg'): - for assign_labels in ('kmeans', 'discretize'): - for mat in (S, sparse.csr_matrix(S)): - model = SpectralClustering(random_state=0, n_clusters=2, - affinity='precomputed', - eigen_solver=eigen_solver, - assign_labels=assign_labels - ).fit(mat) - labels = model.labels_ - if labels[0] == 0: - labels = 1 - labels + for mat in (S, sparse.csr_matrix(S)): + model = SpectralClustering(random_state=0, n_clusters=2, + affinity='precomputed', + eigen_solver=eigen_solver, + assign_labels=assign_labels + ).fit(mat) + labels = model.labels_ + if labels[0] == 0: + labels = 1 - labels - assert adjusted_rand_score(labels, [1, 1, 1, 0, 0, 0, 0]) == 1 + assert adjusted_rand_score(labels, [1, 1, 1, 0, 0, 0, 0]) == 1 - model_copy = 
cPickle.loads(cPickle.dumps(model)) - assert model_copy.n_clusters == model.n_clusters - assert model_copy.eigen_solver == model.eigen_solver - assert_array_equal(model_copy.labels_, model.labels_) + model_copy = cPickle.loads(cPickle.dumps(model)) + assert model_copy.n_clusters == model.n_clusters + assert model_copy.eigen_solver == model.eigen_solver + assert_array_equal(model_copy.labels_, model.labels_) def test_spectral_unknown_mode(): @@ -149,25 +151,25 @@ def histogram(x, y, **kwargs): assert_raises(ValueError, sp.fit, X) -def test_discretize(seed=8): +@pytest.mark.parametrize('n_samples', [50, 100, 150, 500]) +def test_discretize(n_samples): # Test the discretize using a noise assignment matrix - random_state = np.random.RandomState(seed) - for n_samples in [50, 100, 150, 500]: - for n_class in range(2, 10): - # random class labels - y_true = random_state.randint(0, n_class + 1, n_samples) - y_true = np.array(y_true, np.float) - # noise class assignment matrix - y_indicator = sparse.coo_matrix((np.ones(n_samples), - (np.arange(n_samples), - y_true)), - shape=(n_samples, + random_state = np.random.RandomState(seed=8) + for n_class in range(2, 10): + # random class labels + y_true = random_state.randint(0, n_class + 1, n_samples) + y_true = np.array(y_true, np.float) + # noise class assignment matrix + y_indicator = sparse.coo_matrix((np.ones(n_samples), + (np.arange(n_samples), + y_true)), + shape=(n_samples, + n_class + 1)) + y_true_noisy = (y_indicator.toarray() + + 0.1 * random_state.randn(n_samples, n_class + 1)) - y_true_noisy = (y_indicator.toarray() - + 0.1 * random_state.randn(n_samples, - n_class + 1)) - y_pred = discretize(y_true_noisy, random_state) - assert adjusted_rand_score(y_true, y_pred) > 0.8 + y_pred = discretize(y_true_noisy, random_state) + assert adjusted_rand_score(y_true, y_pred) > 0.8 def test_spectral_clustering_with_arpack_amg_solvers(): diff --git a/sklearn/decomposition/tests/test_nmf.py b/sklearn/decomposition/tests/test_nmf.py index 6692aa23d0035..8ae157176647e 100644 --- a/sklearn/decomposition/tests/test_nmf.py +++ b/sklearn/decomposition/tests/test_nmf.py @@ -7,6 +7,8 @@ from sklearn.decomposition import nmf # For testing internals from scipy.sparse import csc_matrix +import pytest + from sklearn.utils.testing import assert_true from sklearn.utils.testing import assert_false from sklearn.utils.testing import assert_raise_message, assert_no_warnings @@ -95,26 +97,26 @@ def test_nmf_fit_nn_output(): (transf < 0).any()) -def test_nmf_fit_close(): +@pytest.mark.parametrize('solver', ('cd', 'mu')) +def test_nmf_fit_close(solver): rng = np.random.mtrand.RandomState(42) # Test that the fit is not too far away - for solver in ('cd', 'mu'): - pnmf = NMF(5, solver=solver, init='nndsvdar', random_state=0, - max_iter=600) - X = np.abs(rng.randn(6, 5)) - assert_less(pnmf.fit(X).reconstruction_err_, 0.1) + pnmf = NMF(5, solver=solver, init='nndsvdar', random_state=0, + max_iter=600) + X = np.abs(rng.randn(6, 5)) + assert_less(pnmf.fit(X).reconstruction_err_, 0.1) -def test_nmf_transform(): +@pytest.mark.parametrize('solver', ('cd', 'mu')) +def test_nmf_transform(solver): # Test that NMF.transform returns close values rng = np.random.mtrand.RandomState(42) A = np.abs(rng.randn(6, 5)) - for solver in ['cd', 'mu']: - m = NMF(solver=solver, n_components=3, init='random', - random_state=0, tol=1e-5) - ft = m.fit_transform(A) - t = m.transform(A) - assert_array_almost_equal(ft, t, decimal=2) + m = NMF(solver=solver, n_components=3, init='random', + random_state=0, 
tol=1e-5) + ft = m.fit_transform(A) + t = m.transform(A) + assert_array_almost_equal(ft, t, decimal=2) def test_nmf_transform_custom_init(): @@ -132,16 +134,16 @@ def test_nmf_transform_custom_init(): m.transform(A) -def test_nmf_inverse_transform(): +@pytest.mark.parametrize('solver', ('cd', 'mu')) +def test_nmf_inverse_transform(solver): # Test that NMF.inverse_transform returns close values random_state = np.random.RandomState(0) A = np.abs(random_state.randn(6, 4)) - for solver in ('cd', 'mu'): - m = NMF(solver=solver, n_components=4, init='random', random_state=0, - max_iter=1000) - ft = m.fit_transform(A) - A_new = m.inverse_transform(ft) - assert_array_almost_equal(A, A_new, decimal=2) + m = NMF(solver=solver, n_components=4, init='random', random_state=0, + max_iter=1000) + ft = m.fit_transform(A) + A_new = m.inverse_transform(ft) + assert_array_almost_equal(A, A_new, decimal=2) def test_n_components_greater_n_features(): diff --git a/sklearn/decomposition/tests/test_online_lda.py b/sklearn/decomposition/tests/test_online_lda.py index 2dd7766ed19aa..b8b636d5a6fd2 100644 --- a/sklearn/decomposition/tests/test_online_lda.py +++ b/sklearn/decomposition/tests/test_online_lda.py @@ -130,17 +130,17 @@ def test_lda_transform(): np.ones(X_trans.shape[0])) -def test_lda_fit_transform(): +@pytest.mark.parametrize('method', ('online', 'batch')) +def test_lda_fit_transform(method): # Test LDA fit_transform & transform # fit_transform and transform result should be the same - for method in ('online', 'batch'): - rng = np.random.RandomState(0) - X = rng.randint(10, size=(50, 20)) - lda = LatentDirichletAllocation(n_components=5, learning_method=method, - random_state=rng) - X_fit = lda.fit_transform(X) - X_trans = lda.transform(X) - assert_array_almost_equal(X_fit, X_trans, 4) + rng = np.random.RandomState(0) + X = rng.randint(10, size=(50, 20)) + lda = LatentDirichletAllocation(n_components=5, learning_method=method, + random_state=rng) + X_fit = lda.fit_transform(X) + X_trans = lda.transform(X) + assert_array_almost_equal(X_fit, X_trans, 4) def test_lda_partial_fit_dim_mismatch(): @@ -207,20 +207,20 @@ def test_lda_transform_mismatch(): @if_safe_multiprocessing_with_blas -def test_lda_multi_jobs(): +@pytest.mark.parametrize('method', ('online', 'batch')) +def test_lda_multi_jobs(method): n_components, X = _build_sparse_mtx() # Test LDA batch training with multi CPU - for method in ('online', 'batch'): - rng = np.random.RandomState(0) - lda = LatentDirichletAllocation(n_components=n_components, n_jobs=2, - learning_method=method, - evaluate_every=1, random_state=rng) - lda.fit(X) + rng = np.random.RandomState(0) + lda = LatentDirichletAllocation(n_components=n_components, n_jobs=2, + learning_method=method, + evaluate_every=1, random_state=rng) + lda.fit(X) - correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)] - for c in lda.components_: - top_idx = set(c.argsort()[-3:][::-1]) - assert_true(tuple(sorted(top_idx)) in correct_idx_grps) + correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)] + for c in lda.components_: + top_idx = set(c.argsort()[-3:][::-1]) + assert_true(tuple(sorted(top_idx)) in correct_idx_grps) @if_safe_multiprocessing_with_blas @@ -261,46 +261,46 @@ def test_lda_preplexity_mismatch(): invalid_n_components) -def test_lda_perplexity(): +@pytest.mark.parametrize('method', ('online', 'batch')) +def test_lda_perplexity(method): # Test LDA perplexity for batch training # perplexity should be lower after each iteration n_components, X = _build_sparse_mtx() - for method in ('online', 
'batch'): - lda_1 = LatentDirichletAllocation(n_components=n_components, - max_iter=1, learning_method=method, - total_samples=100, random_state=0) - lda_2 = LatentDirichletAllocation(n_components=n_components, - max_iter=10, learning_method=method, - total_samples=100, random_state=0) - lda_1.fit(X) - perp_1 = lda_1.perplexity(X, sub_sampling=False) - - lda_2.fit(X) - perp_2 = lda_2.perplexity(X, sub_sampling=False) - assert_greater_equal(perp_1, perp_2) - - perp_1_subsampling = lda_1.perplexity(X, sub_sampling=True) - perp_2_subsampling = lda_2.perplexity(X, sub_sampling=True) - assert_greater_equal(perp_1_subsampling, perp_2_subsampling) - - -def test_lda_score(): + lda_1 = LatentDirichletAllocation(n_components=n_components, + max_iter=1, learning_method=method, + total_samples=100, random_state=0) + lda_2 = LatentDirichletAllocation(n_components=n_components, + max_iter=10, learning_method=method, + total_samples=100, random_state=0) + lda_1.fit(X) + perp_1 = lda_1.perplexity(X, sub_sampling=False) + + lda_2.fit(X) + perp_2 = lda_2.perplexity(X, sub_sampling=False) + assert_greater_equal(perp_1, perp_2) + + perp_1_subsampling = lda_1.perplexity(X, sub_sampling=True) + perp_2_subsampling = lda_2.perplexity(X, sub_sampling=True) + assert_greater_equal(perp_1_subsampling, perp_2_subsampling) + + +@pytest.mark.parametrize('method', ('online', 'batch')) +def test_lda_score(method): # Test LDA score for batch training # score should be higher after each iteration n_components, X = _build_sparse_mtx() - for method in ('online', 'batch'): - lda_1 = LatentDirichletAllocation(n_components=n_components, - max_iter=1, learning_method=method, - total_samples=100, random_state=0) - lda_2 = LatentDirichletAllocation(n_components=n_components, - max_iter=10, learning_method=method, - total_samples=100, random_state=0) - lda_1.fit_transform(X) - score_1 = lda_1.score(X) - - lda_2.fit_transform(X) - score_2 = lda_2.score(X) - assert_greater_equal(score_2, score_1) + lda_1 = LatentDirichletAllocation(n_components=n_components, + max_iter=1, learning_method=method, + total_samples=100, random_state=0) + lda_2 = LatentDirichletAllocation(n_components=n_components, + max_iter=10, learning_method=method, + total_samples=100, random_state=0) + lda_1.fit_transform(X) + score_1 = lda_1.score(X) + + lda_2.fit_transform(X) + score_2 = lda_2.score(X) + assert_greater_equal(score_2, score_1) def test_perplexity_input_format(): diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index c2fe0d1fb7da1..ddf6b4fc49c54 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -352,67 +352,67 @@ def test_pca_inverse(): assert_almost_equal(X, Y_inverse, decimal=3) -def test_pca_validation(): +@pytest.mark.parametrize('solver', solver_list) +def test_pca_validation(solver): # Ensures that solver-specific extreme inputs for the n_components # parameter raise errors X = np.array([[0, 1, 0], [1, 0, 0]]) smallest_d = 2 # The smallest dimension lower_limit = {'randomized': 1, 'arpack': 1, 'full': 0, 'auto': 0} - for solver in solver_list: - # We conduct the same test on X.T so that it is invariant to axis. - for data in [X, X.T]: - for n_components in [-1, 3]: - - if solver == 'auto': - solver_reported = 'full' - else: - solver_reported = solver - - assert_raises_regex(ValueError, - "n_components={}L? must be between " - r"{}L? and min\(n_samples, n_features\)=" - "{}L? 
with svd_solver=\'{}\'" - .format(n_components, - lower_limit[solver], - smallest_d, - solver_reported), - PCA(n_components, - svd_solver=solver).fit, data) - if solver == 'arpack': - - n_components = smallest_d - - assert_raises_regex(ValueError, - "n_components={}L? must be " - "strictly less than " - r"min\(n_samples, n_features\)={}L?" - " with svd_solver=\'arpack\'" - .format(n_components, smallest_d), - PCA(n_components, svd_solver=solver) - .fit, data) - - n_components = 1.0 - type_ncom = type(n_components) - assert_raise_message(ValueError, - "n_components={} must be of type int " - "when greater than or equal to 1, was of type={}" - .format(n_components, type_ncom), - PCA(n_components, svd_solver=solver).fit, data) - - -def test_n_components_none(): + # We conduct the same test on X.T so that it is invariant to axis. + for data in [X, X.T]: + for n_components in [-1, 3]: + + if solver == 'auto': + solver_reported = 'full' + else: + solver_reported = solver + + assert_raises_regex(ValueError, + "n_components={}L? must be between " + r"{}L? and min\(n_samples, n_features\)=" + "{}L? with svd_solver=\'{}\'" + .format(n_components, + lower_limit[solver], + smallest_d, + solver_reported), + PCA(n_components, + svd_solver=solver).fit, data) + if solver == 'arpack': + + n_components = smallest_d + + assert_raises_regex(ValueError, + "n_components={}L? must be " + "strictly less than " + r"min\(n_samples, n_features\)={}L?" + " with svd_solver=\'arpack\'" + .format(n_components, smallest_d), + PCA(n_components, svd_solver=solver) + .fit, data) + + n_components = 1.0 + type_ncom = type(n_components) + assert_raise_message(ValueError, + "n_components={} must be of type int " + "when greater than or equal to 1, was of type={}" + .format(n_components, type_ncom), + PCA(n_components, svd_solver=solver).fit, data) + + +@pytest.mark.parametrize('solver', solver_list) +def test_n_components_none(solver): # Ensures that n_components == None is handled correctly X = iris.data # We conduct the same test on X.T so that it is invariant to axis. 
for data in [X, X.T]: - for solver in solver_list: - pca = PCA(svd_solver=solver) - pca.fit(data) - if solver == 'arpack': - assert_equal(pca.n_components_, min(data.shape) - 1) - else: - assert_equal(pca.n_components_, min(data.shape)) + pca = PCA(svd_solver=solver) + pca.fit(data) + if solver == 'arpack': + assert_equal(pca.n_components_, min(data.shape) - 1) + else: + assert_equal(pca.n_components_, min(data.shape)) def test_randomized_pca_check_projection(): @@ -706,15 +706,15 @@ def fit_deprecated(X): assert_array_almost_equal(Y, Y_pca) -def test_pca_sparse_input(): +@pytest.mark.parametrize('svd_solver', solver_list) +def test_pca_sparse_input(svd_solver): X = np.random.RandomState(0).rand(5, 4) X = sp.sparse.csr_matrix(X) assert(sp.sparse.issparse(X)) - for svd_solver in solver_list: - pca = PCA(n_components=3, svd_solver=svd_solver) + pca = PCA(n_components=3, svd_solver=svd_solver) - assert_raises(TypeError, pca.fit, X) + assert_raises(TypeError, pca.fit, X) def test_pca_bad_solver(): diff --git a/sklearn/decomposition/tests/test_truncated_svd.py b/sklearn/decomposition/tests/test_truncated_svd.py index 6d853642e1ce0..f08648bb4c469 100644 --- a/sklearn/decomposition/tests/test_truncated_svd.py +++ b/sklearn/decomposition/tests/test_truncated_svd.py @@ -3,6 +3,8 @@ import numpy as np import scipy.sparse as sp +import pytest + from sklearn.decomposition import TruncatedSVD from sklearn.utils import check_random_state from sklearn.utils.testing import (assert_array_almost_equal, assert_equal, @@ -43,31 +45,31 @@ def test_attributes(): assert_equal(tsvd.components_.shape, (n_components, n_features)) -def test_too_many_components(): - for algorithm in ["arpack", "randomized"]: - for n_components in (n_features, n_features + 1): - tsvd = TruncatedSVD(n_components=n_components, algorithm=algorithm) - assert_raises(ValueError, tsvd.fit, X) +@pytest.mark.parametrize('algorithm', ("arpack", "randomized")) +def test_too_many_components(algorithm): + for n_components in (n_features, n_features + 1): + tsvd = TruncatedSVD(n_components=n_components, algorithm=algorithm) + assert_raises(ValueError, tsvd.fit, X) -def test_sparse_formats(): - for fmt in ("array", "csr", "csc", "coo", "lil"): - Xfmt = Xdense if fmt == "dense" else getattr(X, "to" + fmt)() - tsvd = TruncatedSVD(n_components=11) - Xtrans = tsvd.fit_transform(Xfmt) - assert_equal(Xtrans.shape, (n_samples, 11)) - Xtrans = tsvd.transform(Xfmt) - assert_equal(Xtrans.shape, (n_samples, 11)) +@pytest.mark.parametrize('fmt', ("array", "csr", "csc", "coo", "lil")) +def test_sparse_formats(fmt): + Xfmt = Xdense if fmt == "dense" else getattr(X, "to" + fmt)() + tsvd = TruncatedSVD(n_components=11) + Xtrans = tsvd.fit_transform(Xfmt) + assert_equal(Xtrans.shape, (n_samples, 11)) + Xtrans = tsvd.transform(Xfmt) + assert_equal(Xtrans.shape, (n_samples, 11)) -def test_inverse_transform(): - for algo in ("arpack", "randomized"): - # We need a lot of components for the reconstruction to be "almost - # equal" in all positions. XXX Test means or sums instead? - tsvd = TruncatedSVD(n_components=52, random_state=42, algorithm=algo) - Xt = tsvd.fit_transform(X) - Xinv = tsvd.inverse_transform(Xt) - assert_array_almost_equal(Xinv, Xdense, decimal=1) +@pytest.mark.parametrize('algo', ("arpack", "randomized")) +def test_inverse_transform(algo): + # We need a lot of components for the reconstruction to be "almost + # equal" in all positions. XXX Test means or sums instead? 
+ tsvd = TruncatedSVD(n_components=52, random_state=42, algorithm=algo) + Xt = tsvd.fit_transform(X) + Xinv = tsvd.inverse_transform(Xt) + assert_array_almost_equal(Xinv, Xdense, decimal=1) def test_integers(): diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index 6b35029a01a9b..8eb5a06c933b6 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -1,6 +1,7 @@ import numpy as np import scipy.sparse as sp from scipy import linalg +from itertools import product import pytest @@ -152,45 +153,49 @@ def test_ridge_regression_convergence_fail(): tol=0., max_iter=None, verbose=1) -@pytest.mark.parametrize('n_samples, n_features', ((6, 5), (5, 10))) -@pytest.mark.parametrize('alpha', (1.0, 1e-2)) -@pytest.mark.parametrize('intercept', (True, False)) -@pytest.mark.parametrize('solver', ('svd', 'cholesky', 'lsqr', 'sparse_cg')) -def test_ridge_sample_weights(n_samples, n_features, alpha, intercept, solver): +def test_ridge_sample_weights(): # TODO: loop over sparse data as well + # Note: parametrizing this test with pytest results in failed + # assertions, meaning that is is not extremely robust rng = np.random.RandomState(0) + param_grid = product((1.0, 1e-2), (True, False), + ('svd', 'cholesky', 'lsqr', 'sparse_cg')) - y = rng.randn(n_samples) - X = rng.randn(n_samples, n_features) - sample_weight = 1.0 + rng.rand(n_samples) - - # Ridge with explicit sample_weight - est = Ridge(alpha=alpha, fit_intercept=intercept, solver=solver) - est.fit(X, y, sample_weight=sample_weight) - coefs = est.coef_ - inter = est.intercept_ - - # Closed form of the weighted regularized least square - # theta = (X^T W X + alpha I)^(-1) * X^T W y - W = np.diag(sample_weight) - if intercept is False: - X_aug = X - Id = np.eye(n_features) - else: - dummy_column = np.ones(shape=(n_samples, 1)) - X_aug = np.concatenate((dummy_column, X), axis=1) - Id = np.eye(n_features + 1) - Id[0, 0] = 0 + for n_samples, n_features in ((6, 5), (5, 10)): - cf_coefs = linalg.solve(X_aug.T.dot(W).dot(X_aug) + alpha * Id, - X_aug.T.dot(W).dot(y)) + y = rng.randn(n_samples) + X = rng.randn(n_samples, n_features) + sample_weight = 1.0 + rng.rand(n_samples) - if intercept is False: - assert_array_almost_equal(coefs, cf_coefs) - else: - assert_array_almost_equal(coefs, cf_coefs[1:]) - assert_almost_equal(inter, cf_coefs[0]) + for (alpha, intercept, solver) in param_grid: + + # Ridge with explicit sample_weight + est = Ridge(alpha=alpha, fit_intercept=intercept, solver=solver) + est.fit(X, y, sample_weight=sample_weight) + coefs = est.coef_ + inter = est.intercept_ + + # Closed form of the weighted regularized least square + # theta = (X^T W X + alpha I)^(-1) * X^T W y + W = np.diag(sample_weight) + if intercept is False: + X_aug = X + I = np.eye(n_features) + else: + dummy_column = np.ones(shape=(n_samples, 1)) + X_aug = np.concatenate((dummy_column, X), axis=1) + I = np.eye(n_features + 1) + I[0, 0] = 0 + + cf_coefs = linalg.solve(X_aug.T.dot(W).dot(X_aug) + alpha * I, + X_aug.T.dot(W).dot(y)) + + if intercept is False: + assert_array_almost_equal(coefs, cf_coefs) + else: + assert_array_almost_equal(coefs, cf_coefs[1:]) + assert_almost_equal(inter, cf_coefs[0]) def test_ridge_shapes(): From 639488cb376d24d3aac1807d7270cc3a9bb2aaef Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 7 May 2018 15:06:49 +0300 Subject: [PATCH 11/25] More parametrizations --- sklearn/ensemble/tests/test_forest.py | 4 +- .../ensemble/tests/test_gradient_boosting.py | 
403 +++++++++--------- .../tests/test_dict_vectorizer.py | 52 +-- sklearn/feature_extraction/tests/test_text.py | 136 +++--- .../tests/test_feature_select.py | 5 - .../tests/test_mutual_info.py | 5 - .../tests/test_gaussian_process.py | 17 +- sklearn/gaussian_process/tests/test_gpc.py | 124 +++--- sklearn/gaussian_process/tests/test_gpr.py | 264 ++++++------ .../gaussian_process/tests/test_kernels.py | 282 ++++++------ .../tests/test_coordinate_descent.py | 14 +- .../linear_model/tests/test_least_angle.py | 22 +- sklearn/linear_model/tests/test_logistic.py | 212 ++++----- .../tests/test_passive_aggressive.py | 49 ++- sklearn/linear_model/tests/test_ridge.py | 103 ++--- sklearn/linear_model/tests/test_sgd.py | 16 +- sklearn/manifold/tests/test_t_sne.py | 40 +- 17 files changed, 870 insertions(+), 878 deletions(-) diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index efcf1ffc5f04c..28d45b3b80f1a 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -97,9 +97,7 @@ FOREST_ESTIMATORS.update(FOREST_REGRESSORS) FOREST_ESTIMATORS.update(FOREST_TRANSFORMERS) -FOREST_CLASSIFIERS_REGRESSORS = dict() -FOREST_CLASSIFIERS_REGRESSORS.update(FOREST_CLASSIFIERS) -FOREST_CLASSIFIERS_REGRESSORS.update(FOREST_REGRESSORS) +FOREST_CLASSIFIERS_REGRESSORS = dict(FOREST_CLASSIFIERS, **FOREST_REGRESSORS) def check_classification_toy(name): diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index b04b857870195..c18008d0b31c1 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -4,8 +4,6 @@ import warnings import numpy as np -from itertools import product - from scipy.sparse import csr_matrix from scipy.sparse import csc_matrix from scipy.sparse import coo_matrix @@ -37,6 +35,9 @@ from sklearn.exceptions import DataConversionWarning from sklearn.exceptions import NotFittedError +GRADIENT_BOOSTING_ESTIMATORS = [GradientBoostingClassifier, + GradientBoostingRegressor] + # toy sample X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] y = [-1, -1, -1, 1, 1, 1] @@ -535,23 +536,23 @@ def test_staged_predict_proba(): assert_array_almost_equal(clf.predict_proba(X_test), staged_proba) -def test_staged_functions_defensive(): +@pytest.mark.parametrize('Estimator', GRADIENT_BOOSTING_ESTIMATORS) +def test_staged_functions_defensive(Estimator): # test that staged_functions make defensive copies rng = np.random.RandomState(0) X = rng.uniform(size=(10, 3)) y = (4 * X[:, 0]).astype(np.int) + 1 # don't predict zeros - for estimator in [GradientBoostingRegressor(), - GradientBoostingClassifier()]: - estimator.fit(X, y) - for func in ['predict', 'decision_function', 'predict_proba']: - staged_func = getattr(estimator, "staged_" + func, None) - if staged_func is None: - # regressor has no staged_predict_proba - continue - with warnings.catch_warnings(record=True): - staged_result = list(staged_func(X)) - staged_result[1][:] = 0 - assert_true(np.all(staged_result[0] != 0)) + estimator = Estimator() + estimator.fit(X, y) + for func in ['predict', 'decision_function', 'predict_proba']: + staged_func = getattr(estimator, "staged_" + func, None) + if staged_func is None: + # regressor has no staged_predict_proba + continue + with warnings.catch_warnings(record=True): + staged_result = list(staged_func(X)) + staged_result[1][:] = 0 + assert_true(np.all(staged_result[0] != 0)) def test_serialization(): @@ -757,188 +758,187 
@@ def test_more_verbose_output(): assert_equal(100, n_lines) -def test_warm_start(): +@pytest.mark.parametrize('Cls', GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start(Cls): # Test if warm start equals fit. X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]: - est = Cls(n_estimators=200, max_depth=1) - est.fit(X, y) - - est_ws = Cls(n_estimators=100, max_depth=1, warm_start=True) - est_ws.fit(X, y) - est_ws.set_params(n_estimators=200) - est_ws.fit(X, y) - - if Cls is GradientBoostingRegressor: - assert_array_almost_equal(est_ws.predict(X), est.predict(X)) - else: - # Random state is preserved and hence predict_proba must also be - # same - assert_array_equal(est_ws.predict(X), est.predict(X)) - assert_array_almost_equal(est_ws.predict_proba(X), - est.predict_proba(X)) - - -def test_warm_start_n_estimators(): + est = Cls(n_estimators=200, max_depth=1) + est.fit(X, y) + + est_ws = Cls(n_estimators=100, max_depth=1, warm_start=True) + est_ws.fit(X, y) + est_ws.set_params(n_estimators=200) + est_ws.fit(X, y) + + if Cls is GradientBoostingRegressor: + assert_array_almost_equal(est_ws.predict(X), est.predict(X)) + else: + # Random state is preserved and hence predict_proba must also be + # same + assert_array_equal(est_ws.predict(X), est.predict(X)) + assert_array_almost_equal(est_ws.predict_proba(X), + est.predict_proba(X)) + + +@pytest.mark.parametrize('Cls', GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_n_estimators(Cls): # Test if warm start equals fit - set n_estimators. X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]: - est = Cls(n_estimators=300, max_depth=1) - est.fit(X, y) + est = Cls(n_estimators=300, max_depth=1) + est.fit(X, y) - est_ws = Cls(n_estimators=100, max_depth=1, warm_start=True) - est_ws.fit(X, y) - est_ws.set_params(n_estimators=300) - est_ws.fit(X, y) + est_ws = Cls(n_estimators=100, max_depth=1, warm_start=True) + est_ws.fit(X, y) + est_ws.set_params(n_estimators=300) + est_ws.fit(X, y) assert_array_almost_equal(est_ws.predict(X), est.predict(X)) -def test_warm_start_max_depth(): +@pytest.mark.parametrize('Cls', GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_max_depth(Cls): # Test if possible to fit trees of different depth in ensemble. X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]: - est = Cls(n_estimators=100, max_depth=1, warm_start=True) - est.fit(X, y) - est.set_params(n_estimators=110, max_depth=2) - est.fit(X, y) + est = Cls(n_estimators=100, max_depth=1, warm_start=True) + est.fit(X, y) + est.set_params(n_estimators=110, max_depth=2) + est.fit(X, y) - # last 10 trees have different depth - assert_equal(est.estimators_[0, 0].max_depth, 1) - for i in range(1, 11): - assert_equal(est.estimators_[-i, 0].max_depth, 2) + # last 10 trees have different depth + assert_equal(est.estimators_[0, 0].max_depth, 1) + for i in range(1, 11): + assert_equal(est.estimators_[-i, 0].max_depth, 2) -def test_warm_start_clear(): +@pytest.mark.parametrize('Cls', GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_clear(Cls): # Test if fit clears state. 
X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]: - est = Cls(n_estimators=100, max_depth=1) - est.fit(X, y) + est = Cls(n_estimators=100, max_depth=1) + est.fit(X, y) - est_2 = Cls(n_estimators=100, max_depth=1, warm_start=True) - est_2.fit(X, y) # inits state - est_2.set_params(warm_start=False) - est_2.fit(X, y) # clears old state and equals est + est_2 = Cls(n_estimators=100, max_depth=1, warm_start=True) + est_2.fit(X, y) # inits state + est_2.set_params(warm_start=False) + est_2.fit(X, y) # clears old state and equals est - assert_array_almost_equal(est_2.predict(X), est.predict(X)) + assert_array_almost_equal(est_2.predict(X), est.predict(X)) -def test_warm_start_zero_n_estimators(): +@pytest.mark.parametrize('Cls', GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_zero_n_estimators(Cls): # Test if warm start with zero n_estimators raises error X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]: - est = Cls(n_estimators=100, max_depth=1, warm_start=True) - est.fit(X, y) - est.set_params(n_estimators=0) - assert_raises(ValueError, est.fit, X, y) + est = Cls(n_estimators=100, max_depth=1, warm_start=True) + est.fit(X, y) + est.set_params(n_estimators=0) + assert_raises(ValueError, est.fit, X, y) -def test_warm_start_smaller_n_estimators(): +@pytest.mark.parametrize('Cls', GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_smaller_n_estimators(Cls): # Test if warm start with smaller n_estimators raises error X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]: - est = Cls(n_estimators=100, max_depth=1, warm_start=True) - est.fit(X, y) - est.set_params(n_estimators=99) - assert_raises(ValueError, est.fit, X, y) + est = Cls(n_estimators=100, max_depth=1, warm_start=True) + est.fit(X, y) + est.set_params(n_estimators=99) + assert_raises(ValueError, est.fit, X, y) -def test_warm_start_equal_n_estimators(): +@pytest.mark.parametrize('Cls', GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_equal_n_estimators(Cls): # Test if warm start with equal n_estimators does nothing X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]: - est = Cls(n_estimators=100, max_depth=1) - est.fit(X, y) + est = Cls(n_estimators=100, max_depth=1) + est.fit(X, y) - est2 = clone(est) - est2.set_params(n_estimators=est.n_estimators, warm_start=True) - est2.fit(X, y) + est2 = clone(est) + est2.set_params(n_estimators=est.n_estimators, warm_start=True) + est2.fit(X, y) - assert_array_almost_equal(est2.predict(X), est.predict(X)) + assert_array_almost_equal(est2.predict(X), est.predict(X)) -def test_warm_start_oob_switch(): +@pytest.mark.parametrize('Cls', GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_oob_switch(Cls): # Test if oob can be turned on during warm start. 
X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]: - est = Cls(n_estimators=100, max_depth=1, warm_start=True) - est.fit(X, y) - est.set_params(n_estimators=110, subsample=0.5) - est.fit(X, y) + est = Cls(n_estimators=100, max_depth=1, warm_start=True) + est.fit(X, y) + est.set_params(n_estimators=110, subsample=0.5) + est.fit(X, y) - assert_array_equal(est.oob_improvement_[:100], np.zeros(100)) - # the last 10 are not zeros - assert_array_equal(est.oob_improvement_[-10:] == 0.0, - np.zeros(10, dtype=np.bool)) + assert_array_equal(est.oob_improvement_[:100], np.zeros(100)) + # the last 10 are not zeros + assert_array_equal(est.oob_improvement_[-10:] == 0.0, + np.zeros(10, dtype=np.bool)) -def test_warm_start_oob(): +@pytest.mark.parametrize('Cls', GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_oob(Cls): # Test if warm start OOB equals fit. X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]: - est = Cls(n_estimators=200, max_depth=1, subsample=0.5, - random_state=1) - est.fit(X, y) + est = Cls(n_estimators=200, max_depth=1, subsample=0.5, + random_state=1) + est.fit(X, y) - est_ws = Cls(n_estimators=100, max_depth=1, subsample=0.5, - random_state=1, warm_start=True) - est_ws.fit(X, y) - est_ws.set_params(n_estimators=200) - est_ws.fit(X, y) + est_ws = Cls(n_estimators=100, max_depth=1, subsample=0.5, + random_state=1, warm_start=True) + est_ws.fit(X, y) + est_ws.set_params(n_estimators=200) + est_ws.fit(X, y) - assert_array_almost_equal(est_ws.oob_improvement_[:100], - est.oob_improvement_[:100]) + assert_array_almost_equal(est_ws.oob_improvement_[:100], + est.oob_improvement_[:100]) -def test_warm_start_sparse(): +@pytest.mark.parametrize('Cls', GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_sparse(Cls): # Test that all sparse matrix types are supported X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) sparse_matrix_type = [csr_matrix, csc_matrix, coo_matrix] - for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]: - est_dense = Cls(n_estimators=100, max_depth=1, subsample=0.5, - random_state=1, warm_start=True) - est_dense.fit(X, y) - est_dense.predict(X) - est_dense.set_params(n_estimators=200) - est_dense.fit(X, y) - y_pred_dense = est_dense.predict(X) - - for sparse_constructor in sparse_matrix_type: - X_sparse = sparse_constructor(X) - - est_sparse = Cls(n_estimators=100, max_depth=1, subsample=0.5, - random_state=1, warm_start=True) - est_sparse.fit(X_sparse, y) - est_sparse.predict(X) - est_sparse.set_params(n_estimators=200) - est_sparse.fit(X_sparse, y) - y_pred_sparse = est_sparse.predict(X) - - assert_array_almost_equal(est_dense.oob_improvement_[:100], - est_sparse.oob_improvement_[:100]) - assert_array_almost_equal(y_pred_dense, y_pred_sparse) - - -def test_warm_start_fortran(): + est_dense = Cls(n_estimators=100, max_depth=1, subsample=0.5, + random_state=1, warm_start=True) + est_dense.fit(X, y) + est_dense.predict(X) + est_dense.set_params(n_estimators=200) + est_dense.fit(X, y) + y_pred_dense = est_dense.predict(X) + + for sparse_constructor in sparse_matrix_type: + X_sparse = sparse_constructor(X) + + est_sparse = Cls(n_estimators=100, max_depth=1, subsample=0.5, + random_state=1, warm_start=True) + est_sparse.fit(X_sparse, y) + est_sparse.predict(X) + est_sparse.set_params(n_estimators=200) + est_sparse.fit(X_sparse, y) + y_pred_sparse = est_sparse.predict(X) + + 
assert_array_almost_equal(est_dense.oob_improvement_[:100], + est_sparse.oob_improvement_[:100]) + assert_array_almost_equal(y_pred_dense, y_pred_sparse) + + +@pytest.mark.parametrize('Cls', GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_fortran(Cls): # Test that feeding a X in Fortran-ordered is giving the same results as # in C-ordered X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]: - est_c = Cls(n_estimators=1, random_state=1, warm_start=True) - est_fortran = Cls(n_estimators=1, random_state=1, warm_start=True) + est_c = Cls(n_estimators=1, random_state=1, warm_start=True) + est_fortran = Cls(n_estimators=1, random_state=1, warm_start=True) - est_c.fit(X, y) - est_c.set_params(n_estimators=11) - est_c.fit(X, y) + est_c.fit(X, y) + est_c.set_params(n_estimators=11) + est_c.fit(X, y) - X_fortran = np.asfortranarray(X) - est_fortran.fit(X_fortran, y) - est_fortran.set_params(n_estimators=11) - est_fortran.fit(X_fortran, y) + X_fortran = np.asfortranarray(X) + est_fortran.fit(X_fortran, y) + est_fortran.set_params(n_estimators=11) + est_fortran.fit(X_fortran, y) - assert_array_almost_equal(est_c.predict(X), - est_fortran.predict(X)) + assert_array_almost_equal(est_c.predict(X), est_fortran.predict(X)) def early_stopping_monitor(i, est, locals): @@ -949,40 +949,40 @@ def early_stopping_monitor(i, est, locals): return False -def test_monitor_early_stopping(): +@pytest.mark.parametrize('Cls', GRADIENT_BOOSTING_ESTIMATORS) +def test_monitor_early_stopping(Cls): # Test if monitor return value works. X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]: - est = Cls(n_estimators=20, max_depth=1, random_state=1, subsample=0.5) - est.fit(X, y, monitor=early_stopping_monitor) - assert_equal(est.n_estimators, 20) # this is not altered - assert_equal(est.estimators_.shape[0], 10) - assert_equal(est.train_score_.shape[0], 10) - assert_equal(est.oob_improvement_.shape[0], 10) - - # try refit - est.set_params(n_estimators=30) - est.fit(X, y) - assert_equal(est.n_estimators, 30) - assert_equal(est.estimators_.shape[0], 30) - assert_equal(est.train_score_.shape[0], 30) - - est = Cls(n_estimators=20, max_depth=1, random_state=1, subsample=0.5, - warm_start=True) - est.fit(X, y, monitor=early_stopping_monitor) - assert_equal(est.n_estimators, 20) - assert_equal(est.estimators_.shape[0], 10) - assert_equal(est.train_score_.shape[0], 10) - assert_equal(est.oob_improvement_.shape[0], 10) - - # try refit - est.set_params(n_estimators=30, warm_start=False) - est.fit(X, y) - assert_equal(est.n_estimators, 30) - assert_equal(est.train_score_.shape[0], 30) - assert_equal(est.estimators_.shape[0], 30) - assert_equal(est.oob_improvement_.shape[0], 30) + est = Cls(n_estimators=20, max_depth=1, random_state=1, subsample=0.5) + est.fit(X, y, monitor=early_stopping_monitor) + assert_equal(est.n_estimators, 20) # this is not altered + assert_equal(est.estimators_.shape[0], 10) + assert_equal(est.train_score_.shape[0], 10) + assert_equal(est.oob_improvement_.shape[0], 10) + + # try refit + est.set_params(n_estimators=30) + est.fit(X, y) + assert_equal(est.n_estimators, 30) + assert_equal(est.estimators_.shape[0], 30) + assert_equal(est.train_score_.shape[0], 30) + + est = Cls(n_estimators=20, max_depth=1, random_state=1, subsample=0.5, + warm_start=True) + est.fit(X, y, monitor=early_stopping_monitor) + assert_equal(est.n_estimators, 20) + 
assert_equal(est.estimators_.shape[0], 10) + assert_equal(est.train_score_.shape[0], 10) + assert_equal(est.oob_improvement_.shape[0], 10) + + # try refit + est.set_params(n_estimators=30, warm_start=False) + est.fit(X, y) + assert_equal(est.n_estimators, 30) + assert_equal(est.train_score_.shape[0], 30) + assert_equal(est.estimators_.shape[0], 30) + assert_equal(est.oob_improvement_.shape[0], 30) def test_complete_classification(): @@ -1066,48 +1066,45 @@ def test_zero_estimator_clf(): assert_raises(ValueError, est.fit, X, y) -def test_max_leaf_nodes_max_depth(): +@pytest.mark.parametrize('GBEstimator', GRADIENT_BOOSTING_ESTIMATORS) +def test_max_leaf_nodes_max_depth(GBEstimator): # Test precedence of max_leaf_nodes over max_depth. X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - all_estimators = [GradientBoostingRegressor, - GradientBoostingClassifier] k = 4 - for GBEstimator in all_estimators: - est = GBEstimator(max_depth=1, max_leaf_nodes=k).fit(X, y) - tree = est.estimators_[0, 0].tree_ - assert_greater(tree.max_depth, 1) - est = GBEstimator(max_depth=1).fit(X, y) - tree = est.estimators_[0, 0].tree_ - assert_equal(tree.max_depth, 1) + est = GBEstimator(max_depth=1, max_leaf_nodes=k).fit(X, y) + tree = est.estimators_[0, 0].tree_ + assert_greater(tree.max_depth, 1) + est = GBEstimator(max_depth=1).fit(X, y) + tree = est.estimators_[0, 0].tree_ + assert_equal(tree.max_depth, 1) -def test_min_impurity_split(): + +@pytest.mark.parametrize('GBEstimator', GRADIENT_BOOSTING_ESTIMATORS) +def test_min_impurity_split(GBEstimator): # Test if min_impurity_split of base estimators is set # Regression test for #8006 X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - all_estimators = [GradientBoostingRegressor, GradientBoostingClassifier] - for GBEstimator in all_estimators: - est = GBEstimator(min_impurity_split=0.1) - est = assert_warns_message(DeprecationWarning, "min_impurity_decrease", - est.fit, X, y) - for tree in est.estimators_.flat: - assert_equal(tree.min_impurity_split, 0.1) + est = GBEstimator(min_impurity_split=0.1) + est = assert_warns_message(DeprecationWarning, "min_impurity_decrease", + est.fit, X, y) + for tree in est.estimators_.flat: + assert_equal(tree.min_impurity_split, 0.1) -def test_min_impurity_decrease(): +@pytest.mark.parametrize('GBEstimator', GRADIENT_BOOSTING_ESTIMATORS) +def test_min_impurity_decrease(GBEstimator): X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - all_estimators = [GradientBoostingRegressor, GradientBoostingClassifier] - for GBEstimator in all_estimators: - est = GBEstimator(min_impurity_decrease=0.1) - est.fit(X, y) - for tree in est.estimators_.flat: - # Simply check if the parameter is passed on correctly. Tree tests - # will suffice for the actual working of this param - assert_equal(tree.min_impurity_decrease, 0.1) + est = GBEstimator(min_impurity_decrease=0.1) + est.fit(X, y) + for tree in est.estimators_.flat: + # Simply check if the parameter is passed on correctly. 
Tree tests + # will suffice for the actual working of this param + assert_equal(tree.min_impurity_decrease, 0.1) def test_warm_start_wo_nestimators_change(): @@ -1218,9 +1215,7 @@ def check_sparse_input(EstimatorClass, X, X_sparse, y): @pytest.mark.parametrize( 'EstimatorClass', (GradientBoostingClassifier, GradientBoostingRegressor)) -@pytest.mark.parametrize( - 'sparse_matrix', - (csr_matrix, csc_matrix, coo_matrix)) +@pytest.mark.parametrize('sparse_matrix', (csr_matrix, csc_matrix, coo_matrix)) def test_sparse_input(EstimatorClass, sparse_matrix): y, X = datasets.make_multilabel_classification(random_state=0, n_samples=50, diff --git a/sklearn/feature_extraction/tests/test_dict_vectorizer.py b/sklearn/feature_extraction/tests/test_dict_vectorizer.py index 72ebbb1653c41..66d678421e90b 100644 --- a/sklearn/feature_extraction/tests/test_dict_vectorizer.py +++ b/sklearn/feature_extraction/tests/test_dict_vectorizer.py @@ -5,8 +5,10 @@ from random import Random import numpy as np import scipy.sparse as sp - from numpy.testing import assert_array_equal + +import pytest + from sklearn.utils.testing import (assert_equal, assert_in, assert_false, assert_true) @@ -14,34 +16,34 @@ from sklearn.feature_selection import SelectKBest, chi2 -def test_dictvectorizer(): +@pytest.mark.parametrize('sparse', (True, False)) +@pytest.mark.parametrize('dtype', (int, np.float32, np.int16)) +@pytest.mark.parametrize('sort', (True, False)) +@pytest.mark.parametrize('iterable', (True, False)) +def test_dictvectorizer(sparse, dtype, sort, iterable): D = [{"foo": 1, "bar": 3}, {"bar": 4, "baz": 2}, {"bar": 1, "quux": 1, "quuux": 2}] - for sparse in (True, False): - for dtype in (int, np.float32, np.int16): - for sort in (True, False): - for iterable in (True, False): - v = DictVectorizer(sparse=sparse, dtype=dtype, sort=sort) - X = v.fit_transform(iter(D) if iterable else D) - - assert_equal(sp.issparse(X), sparse) - assert_equal(X.shape, (3, 5)) - assert_equal(X.sum(), 14) - assert_equal(v.inverse_transform(X), D) - - if sparse: - # CSR matrices can't be compared for equality - assert_array_equal(X.A, v.transform(iter(D) if iterable - else D).A) - else: - assert_array_equal(X, v.transform(iter(D) if iterable - else D)) - - if sort: - assert_equal(v.feature_names_, - sorted(v.feature_names_)) + v = DictVectorizer(sparse=sparse, dtype=dtype, sort=sort) + X = v.fit_transform(iter(D) if iterable else D) + + assert_equal(sp.issparse(X), sparse) + assert_equal(X.shape, (3, 5)) + assert_equal(X.sum(), 14) + assert_equal(v.inverse_transform(X), D) + + if sparse: + # CSR matrices can't be compared for equality + assert_array_equal(X.A, v.transform(iter(D) if iterable + else D).A) + else: + assert_array_equal(X, v.transform(iter(D) if iterable + else D)) + + if sort: + assert_equal(v.feature_names_, + sorted(v.feature_names_)) def test_feature_selection(): diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index ff4dd6bd86a79..b0209f6bbe3ac 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -115,42 +115,42 @@ def test_to_ascii(): assert_equal(strip_accents_ascii(a), expected) -def test_word_analyzer_unigrams(): - for Vectorizer in (CountVectorizer, HashingVectorizer): - wa = Vectorizer(strip_accents='ascii').build_analyzer() - text = ("J'ai mang\xe9 du kangourou ce midi, " - "c'\xe9tait pas tr\xeas bon.") - expected = ['ai', 'mange', 'du', 'kangourou', 'ce', 'midi', - 'etait', 'pas', 'tres', 'bon'] - 
assert_equal(wa(text), expected) - - text = "This is a test, really.\n\n I met Harry yesterday." - expected = ['this', 'is', 'test', 'really', 'met', 'harry', - 'yesterday'] - assert_equal(wa(text), expected) - - wa = Vectorizer(input='file').build_analyzer() - text = StringIO("This is a test with a file-like object!") - expected = ['this', 'is', 'test', 'with', 'file', 'like', - 'object'] - assert_equal(wa(text), expected) - - # with custom preprocessor - wa = Vectorizer(preprocessor=uppercase).build_analyzer() - text = ("J'ai mang\xe9 du kangourou ce midi, " - " c'\xe9tait pas tr\xeas bon.") - expected = ['AI', 'MANGE', 'DU', 'KANGOUROU', 'CE', 'MIDI', - 'ETAIT', 'PAS', 'TRES', 'BON'] - assert_equal(wa(text), expected) - - # with custom tokenizer - wa = Vectorizer(tokenizer=split_tokenize, - strip_accents='ascii').build_analyzer() - text = ("J'ai mang\xe9 du kangourou ce midi, " - "c'\xe9tait pas tr\xeas bon.") - expected = ["j'ai", 'mange', 'du', 'kangourou', 'ce', 'midi,', - "c'etait", 'pas', 'tres', 'bon.'] - assert_equal(wa(text), expected) +@pytest.mark.parametrize('Vectorizer', (CountVectorizer, HashingVectorizer)) +def test_word_analyzer_unigrams(Vectorizer): + wa = Vectorizer(strip_accents='ascii').build_analyzer() + text = ("J'ai mang\xe9 du kangourou ce midi, " + "c'\xe9tait pas tr\xeas bon.") + expected = ['ai', 'mange', 'du', 'kangourou', 'ce', 'midi', + 'etait', 'pas', 'tres', 'bon'] + assert_equal(wa(text), expected) + + text = "This is a test, really.\n\n I met Harry yesterday." + expected = ['this', 'is', 'test', 'really', 'met', 'harry', + 'yesterday'] + assert_equal(wa(text), expected) + + wa = Vectorizer(input='file').build_analyzer() + text = StringIO("This is a test with a file-like object!") + expected = ['this', 'is', 'test', 'with', 'file', 'like', + 'object'] + assert_equal(wa(text), expected) + + # with custom preprocessor + wa = Vectorizer(preprocessor=uppercase).build_analyzer() + text = ("J'ai mang\xe9 du kangourou ce midi, " + " c'\xe9tait pas tr\xeas bon.") + expected = ['AI', 'MANGE', 'DU', 'KANGOUROU', 'CE', 'MIDI', + 'ETAIT', 'PAS', 'TRES', 'BON'] + assert_equal(wa(text), expected) + + # with custom tokenizer + wa = Vectorizer(tokenizer=split_tokenize, + strip_accents='ascii').build_analyzer() + text = ("J'ai mang\xe9 du kangourou ce midi, " + "c'\xe9tait pas tr\xeas bon.") + expected = ["j'ai", 'mange', 'du', 'kangourou', 'ce', 'midi,', + "c'etait", 'pas', 'tres', 'bon.'] + assert_equal(wa(text), expected) def test_word_analyzer_unigrams_and_bigrams(): @@ -574,22 +574,17 @@ def test_feature_names(): assert_equal(idx, cv.vocabulary_.get(name)) -def test_vectorizer_max_features(): - vec_factories = ( - CountVectorizer, - TfidfVectorizer, - ) - +@pytest.mark.parametrize('Vectorizer', (CountVectorizer, TfidfVectorizer)) +def test_vectorizer_max_features(Vectorizer): expected_vocabulary = set(['burger', 'beer', 'salad', 'pizza']) expected_stop_words = set([u'celeri', u'tomato', u'copyright', u'coke', u'sparkling', u'water', u'the']) - for vec_factory in vec_factories: - # test bounded number of extracted features - vectorizer = vec_factory(max_df=0.6, max_features=4) - vectorizer.fit(ALL_FOOD_DOCS) - assert_equal(set(vectorizer.vocabulary_), expected_vocabulary) - assert_equal(vectorizer.stop_words_, expected_stop_words) + # test bounded number of extracted features + vectorizer = Vectorizer(max_df=0.6, max_features=4) + vectorizer.fit(ALL_FOOD_DOCS) + assert_equal(set(vectorizer.vocabulary_), expected_vocabulary) + assert_equal(vectorizer.stop_words_, 
expected_stop_words) def test_count_vectorizer_max_features(): @@ -713,23 +708,24 @@ def test_hashed_binary_occurrences(): assert_equal(X.dtype, np.float64) -def test_vectorizer_inverse_transform(): +@pytest.mark.parametrize('Vectorizer', (CountVectorizer, TfidfVectorizer)) +def test_vectorizer_inverse_transform(Vectorizer): # raw documents data = ALL_FOOD_DOCS - for vectorizer in (TfidfVectorizer(), CountVectorizer()): - transformed_data = vectorizer.fit_transform(data) - inversed_data = vectorizer.inverse_transform(transformed_data) - analyze = vectorizer.build_analyzer() - for doc, inversed_terms in zip(data, inversed_data): - terms = np.sort(np.unique(analyze(doc))) - inversed_terms = np.sort(np.unique(inversed_terms)) - assert_array_equal(terms, inversed_terms) - - # Test that inverse_transform also works with numpy arrays - transformed_data = transformed_data.toarray() - inversed_data2 = vectorizer.inverse_transform(transformed_data) - for terms, terms2 in zip(inversed_data, inversed_data2): - assert_array_equal(np.sort(terms), np.sort(terms2)) + vectorizer = Vectorizer() + transformed_data = vectorizer.fit_transform(data) + inversed_data = vectorizer.inverse_transform(transformed_data) + analyze = vectorizer.build_analyzer() + for doc, inversed_terms in zip(data, inversed_data): + terms = np.sort(np.unique(analyze(doc))) + inversed_terms = np.sort(np.unique(inversed_terms)) + assert_array_equal(terms, inversed_terms) + + # Test that inverse_transform also works with numpy arrays + transformed_data = transformed_data.toarray() + inversed_data2 = vectorizer.inverse_transform(transformed_data) + for terms, terms2 in zip(inversed_data, inversed_data2): + assert_array_equal(np.sort(terms), np.sort(terms2)) def test_count_vectorizer_pipeline_grid_selection(): @@ -1030,16 +1026,16 @@ def test_vectorizer_vocab_clone(): assert_equal(vect_vocab_clone.vocabulary_, vect_vocab.vocabulary_) -def test_vectorizer_string_object_as_input(): +@pytest.mark.parametrize('Vectorizer', + (CountVectorizer, TfidfVectorizer, HashingVectorizer)) +def test_vectorizer_string_object_as_input(Vectorizer): message = ("Iterable over raw text documents expected, " "string object received.") - for vec in [CountVectorizer(), TfidfVectorizer(), HashingVectorizer()]: - assert_raise_message( + vec = Vectorizer() + assert_raise_message( ValueError, message, vec.fit_transform, "hello world!") - assert_raise_message( - ValueError, message, vec.fit, "hello world!") - assert_raise_message( - ValueError, message, vec.transform, "hello world!") + assert_raise_message(ValueError, message, vec.fit, "hello world!") + assert_raise_message(ValueError, message, vec.transform, "hello world!") @pytest.mark.parametrize("vec", [ diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index d3f1eca333cd1..14e621473090a 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -7,7 +7,6 @@ import numpy as np from scipy import stats, sparse -from numpy.testing import run_module_suite from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_raises @@ -670,7 +669,3 @@ def test_mutual_info_regression(): gtruth = np.zeros(10) gtruth[:2] = 1 assert_array_equal(support, gtruth) - - -if __name__ == '__main__': - run_module_suite() diff --git a/sklearn/feature_selection/tests/test_mutual_info.py 
b/sklearn/feature_selection/tests/test_mutual_info.py index 615abf1c5b0a0..f05e0b52b3707 100644 --- a/sklearn/feature_selection/tests/test_mutual_info.py +++ b/sklearn/feature_selection/tests/test_mutual_info.py @@ -1,7 +1,6 @@ from __future__ import division import numpy as np -from numpy.testing import run_module_suite from scipy.sparse import csr_matrix from sklearn.utils import check_random_state @@ -200,7 +199,3 @@ def test_mutual_info_options(): assert_array_equal(mi_3, mi_4) assert_false(np.allclose(mi_1, mi_3)) - - -if __name__ == '__main__': - run_module_suite() diff --git a/sklearn/gaussian_process/tests/test_gaussian_process.py b/sklearn/gaussian_process/tests/test_gaussian_process.py index 37d872fc99fb5..6d6fa3ab81193 100644 --- a/sklearn/gaussian_process/tests/test_gaussian_process.py +++ b/sklearn/gaussian_process/tests/test_gaussian_process.py @@ -7,6 +7,8 @@ import numpy as np +import pytest + from sklearn.gaussian_process import GaussianProcess from sklearn.gaussian_process import regression_models as regression from sklearn.gaussian_process import correlation_models as correlation @@ -100,16 +102,17 @@ def test_wrong_number_of_outputs(): assert_raises(ValueError, gp.fit, [[1, 2, 3], [4, 5, 6]], [1, 2, 3]) -def test_more_builtin_correlation_models(random_start=1): +@pytest.mark.parametrize( + 'corr', + ['absolute_exponential', 'squared_exponential', 'cubic', 'linear']) +def test_more_builtin_correlation_models(corr): # Repeat test_1d and test_2d for several built-in correlation # models specified as strings. - all_corr = ['absolute_exponential', 'squared_exponential', 'cubic', - 'linear'] + random_start = 1 - for corr in all_corr: - test_1d(regr='constant', corr=corr, random_start=random_start) - test_2d(regr='constant', corr=corr, random_start=random_start) - test_2d_2d(regr='constant', corr=corr, random_start=random_start) + test_1d(regr='constant', corr=corr, random_start=random_start) + test_2d(regr='constant', corr=corr, random_start=random_start) + test_2d_2d(regr='constant', corr=corr, random_start=random_start) def test_ordinary_kriging(): diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py index ba98cddb77f79..25aaa05fac3a8 100644 --- a/sklearn/gaussian_process/tests/test_gpc.py +++ b/sklearn/gaussian_process/tests/test_gpc.py @@ -7,6 +7,8 @@ from scipy.optimize import approx_fprime +import pytest + from sklearn.gaussian_process import GaussianProcessClassifier from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C @@ -31,62 +33,60 @@ def f(x): RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)), C(1.0, (1e-2, 1e2)) * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3))] +non_fixed_kernels = [kernel for kernel in kernels + if kernel != fixed_kernel] -def test_predict_consistent(): +@pytest.mark.parametrize('kernel', kernels) +def test_predict_consistent(kernel): # Check binary predict decision has also predicted probability above 0.5. - for kernel in kernels: - gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) - assert_array_equal(gpc.predict(X), - gpc.predict_proba(X)[:, 1] >= 0.5) + gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) + assert_array_equal(gpc.predict(X), + gpc.predict_proba(X)[:, 1] >= 0.5) -def test_lml_improving(): +@pytest.mark.parametrize('kernel', non_fixed_kernels) +def test_lml_improving(kernel): # Test that hyperparameter-tuning improves log-marginal likelihood. 
- for kernel in kernels: - if kernel == fixed_kernel: - continue - gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) - assert_greater(gpc.log_marginal_likelihood(gpc.kernel_.theta), - gpc.log_marginal_likelihood(kernel.theta)) + gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) + assert_greater(gpc.log_marginal_likelihood(gpc.kernel_.theta), + gpc.log_marginal_likelihood(kernel.theta)) -def test_lml_precomputed(): +@pytest.mark.parametrize('kernel', kernels) +def test_lml_precomputed(kernel): # Test that lml of optimized kernel is stored correctly. - for kernel in kernels: - gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) - assert_almost_equal(gpc.log_marginal_likelihood(gpc.kernel_.theta), - gpc.log_marginal_likelihood(), 7) + gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) + assert_almost_equal(gpc.log_marginal_likelihood(gpc.kernel_.theta), + gpc.log_marginal_likelihood(), 7) -def test_converged_to_local_maximum(): +@pytest.mark.parametrize('kernel', non_fixed_kernels) +def test_converged_to_local_maximum(kernel): # Test that we are in local maximum after hyperparameter-optimization. - for kernel in kernels: - if kernel == fixed_kernel: - continue - gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) + gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) - lml, lml_gradient = \ - gpc.log_marginal_likelihood(gpc.kernel_.theta, True) + lml, lml_gradient = \ + gpc.log_marginal_likelihood(gpc.kernel_.theta, True) - assert_true(np.all((np.abs(lml_gradient) < 1e-4) | - (gpc.kernel_.theta == gpc.kernel_.bounds[:, 0]) | - (gpc.kernel_.theta == gpc.kernel_.bounds[:, 1]))) + assert_true(np.all((np.abs(lml_gradient) < 1e-4) | + (gpc.kernel_.theta == gpc.kernel_.bounds[:, 0]) | + (gpc.kernel_.theta == gpc.kernel_.bounds[:, 1]))) -def test_lml_gradient(): +@pytest.mark.parametrize('kernel', kernels) +def test_lml_gradient(kernel): # Compare analytic and numeric gradient of log marginal likelihood. - for kernel in kernels: - gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) + gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) - lml, lml_gradient = gpc.log_marginal_likelihood(kernel.theta, True) - lml_gradient_approx = \ - approx_fprime(kernel.theta, - lambda theta: gpc.log_marginal_likelihood(theta, - False), - 1e-10) + lml, lml_gradient = gpc.log_marginal_likelihood(kernel.theta, True) + lml_gradient_approx = \ + approx_fprime(kernel.theta, + lambda theta: gpc.log_marginal_likelihood(theta, + False), + 1e-10) - assert_almost_equal(lml_gradient, lml_gradient_approx, 3) + assert_almost_equal(lml_gradient, lml_gradient_approx, 3) def test_random_starts(): @@ -110,7 +110,8 @@ def test_random_starts(): last_lml = lml -def test_custom_optimizer(): +@pytest.mark.parametrize('kernel', non_fixed_kernels) +def test_custom_optimizer(kernel): # Test that GPC can use externally defined optimizers. 
# Define a dummy optimizer that simply tests 50 random hyperparameters def optimizer(obj_func, initial_theta, bounds): @@ -125,38 +126,35 @@ def optimizer(obj_func, initial_theta, bounds): theta_opt, func_min = theta, f return theta_opt, func_min - for kernel in kernels: - if kernel == fixed_kernel: - continue - gpc = GaussianProcessClassifier(kernel=kernel, optimizer=optimizer) - gpc.fit(X, y_mc) - # Checks that optimizer improved marginal likelihood - assert_greater(gpc.log_marginal_likelihood(gpc.kernel_.theta), - gpc.log_marginal_likelihood(kernel.theta)) + gpc = GaussianProcessClassifier(kernel=kernel, optimizer=optimizer) + gpc.fit(X, y_mc) + # Checks that optimizer improved marginal likelihood + assert_greater(gpc.log_marginal_likelihood(gpc.kernel_.theta), + gpc.log_marginal_likelihood(kernel.theta)) -def test_multi_class(): +@pytest.mark.parametrize('kernel', kernels) +def test_multi_class(kernel): # Test GPC for multi-class classification problems. - for kernel in kernels: - gpc = GaussianProcessClassifier(kernel=kernel) - gpc.fit(X, y_mc) + gpc = GaussianProcessClassifier(kernel=kernel) + gpc.fit(X, y_mc) - y_prob = gpc.predict_proba(X2) - assert_almost_equal(y_prob.sum(1), 1) + y_prob = gpc.predict_proba(X2) + assert_almost_equal(y_prob.sum(1), 1) - y_pred = gpc.predict(X2) - assert_array_equal(np.argmax(y_prob, 1), y_pred) + y_pred = gpc.predict(X2) + assert_array_equal(np.argmax(y_prob, 1), y_pred) -def test_multi_class_n_jobs(): +@pytest.mark.parametrize('kernel', kernels) +def test_multi_class_n_jobs(kernel): # Test that multi-class GPC produces identical results with n_jobs>1. - for kernel in kernels: - gpc = GaussianProcessClassifier(kernel=kernel) - gpc.fit(X, y_mc) + gpc = GaussianProcessClassifier(kernel=kernel) + gpc.fit(X, y_mc) - gpc_2 = GaussianProcessClassifier(kernel=kernel, n_jobs=2) - gpc_2.fit(X, y_mc) + gpc_2 = GaussianProcessClassifier(kernel=kernel, n_jobs=2) + gpc_2.fit(X, y_mc) - y_prob = gpc.predict_proba(X2) - y_prob_2 = gpc_2.predict_proba(X2) - assert_almost_equal(y_prob, y_prob_2) + y_prob = gpc.predict_proba(X2) + y_prob_2 = gpc_2.predict_proba(X2) + assert_almost_equal(y_prob, y_prob_2) diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py index 602b2b88ae9c9..18f82b00fb7f1 100644 --- a/sklearn/gaussian_process/tests/test_gpr.py +++ b/sklearn/gaussian_process/tests/test_gpr.py @@ -7,6 +7,8 @@ from scipy.optimize import approx_fprime +import pytest + from sklearn.gaussian_process import GaussianProcessRegressor from sklearn.gaussian_process.kernels \ import RBF, ConstantKernel as C, WhiteKernel @@ -37,110 +39,106 @@ def f(x): C(0.1, (1e-2, 1e2)) * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)) + C(1e-5, (1e-5, 1e2))] +non_fixed_kernels = [kernel for kernel in kernels + if kernel != fixed_kernel] -def test_gpr_interpolation(): +@pytest.mark.parametrize('kernel', kernels) +def test_gpr_interpolation(kernel): # Test the interpolating property for different kernels. - for kernel in kernels: - gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) - y_pred, y_cov = gpr.predict(X, return_cov=True) + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + y_pred, y_cov = gpr.predict(X, return_cov=True) - assert_almost_equal(y_pred, y) - assert_almost_equal(np.diag(y_cov), 0.) + assert_almost_equal(y_pred, y) + assert_almost_equal(np.diag(y_cov), 0.) 
-def test_lml_improving(): +@pytest.mark.parametrize('kernel', non_fixed_kernels) +def test_lml_improving(kernel): # Test that hyperparameter-tuning improves log-marginal likelihood. - for kernel in kernels: - if kernel == fixed_kernel: - continue - gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) - assert_greater(gpr.log_marginal_likelihood(gpr.kernel_.theta), - gpr.log_marginal_likelihood(kernel.theta)) + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + assert_greater(gpr.log_marginal_likelihood(gpr.kernel_.theta), + gpr.log_marginal_likelihood(kernel.theta)) -def test_lml_precomputed(): +@pytest.mark.parametrize('kernel', kernels) +def test_lml_precomputed(kernel): # Test that lml of optimized kernel is stored correctly. - for kernel in kernels: - gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) - assert_equal(gpr.log_marginal_likelihood(gpr.kernel_.theta), - gpr.log_marginal_likelihood()) + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + assert_equal(gpr.log_marginal_likelihood(gpr.kernel_.theta), + gpr.log_marginal_likelihood()) -def test_converged_to_local_maximum(): +@pytest.mark.parametrize('kernel', non_fixed_kernels) +def test_converged_to_local_maximum(kernel): # Test that we are in local maximum after hyperparameter-optimization. - for kernel in kernels: - if kernel == fixed_kernel: - continue - gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) - lml, lml_gradient = \ - gpr.log_marginal_likelihood(gpr.kernel_.theta, True) + lml, lml_gradient = \ + gpr.log_marginal_likelihood(gpr.kernel_.theta, True) - assert_true(np.all((np.abs(lml_gradient) < 1e-4) | - (gpr.kernel_.theta == gpr.kernel_.bounds[:, 0]) | - (gpr.kernel_.theta == gpr.kernel_.bounds[:, 1]))) + assert_true(np.all((np.abs(lml_gradient) < 1e-4) | + (gpr.kernel_.theta == gpr.kernel_.bounds[:, 0]) | + (gpr.kernel_.theta == gpr.kernel_.bounds[:, 1]))) -def test_solution_inside_bounds(): +@pytest.mark.parametrize('kernel', non_fixed_kernels) +def test_solution_inside_bounds(kernel): # Test that hyperparameter-optimization remains in bounds# - for kernel in kernels: - if kernel == fixed_kernel: - continue - gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) - bounds = gpr.kernel_.bounds - max_ = np.finfo(gpr.kernel_.theta.dtype).max - tiny = 1e-10 - bounds[~np.isfinite(bounds[:, 1]), 1] = max_ + bounds = gpr.kernel_.bounds + max_ = np.finfo(gpr.kernel_.theta.dtype).max + tiny = 1e-10 + bounds[~np.isfinite(bounds[:, 1]), 1] = max_ - assert_array_less(bounds[:, 0], gpr.kernel_.theta + tiny) - assert_array_less(gpr.kernel_.theta, bounds[:, 1] + tiny) + assert_array_less(bounds[:, 0], gpr.kernel_.theta + tiny) + assert_array_less(gpr.kernel_.theta, bounds[:, 1] + tiny) -def test_lml_gradient(): +@pytest.mark.parametrize('kernel', kernels) +def test_lml_gradient(kernel): # Compare analytic and numeric gradient of log marginal likelihood. 
- for kernel in kernels: - gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) - lml, lml_gradient = gpr.log_marginal_likelihood(kernel.theta, True) - lml_gradient_approx = \ - approx_fprime(kernel.theta, - lambda theta: gpr.log_marginal_likelihood(theta, - False), - 1e-10) + lml, lml_gradient = gpr.log_marginal_likelihood(kernel.theta, True) + lml_gradient_approx = \ + approx_fprime(kernel.theta, + lambda theta: gpr.log_marginal_likelihood(theta, + False), + 1e-10) - assert_almost_equal(lml_gradient, lml_gradient_approx, 3) + assert_almost_equal(lml_gradient, lml_gradient_approx, 3) -def test_prior(): +@pytest.mark.parametrize('kernel', kernels) +def test_prior(kernel): # Test that GP prior has mean 0 and identical variances. - for kernel in kernels: - gpr = GaussianProcessRegressor(kernel=kernel) + gpr = GaussianProcessRegressor(kernel=kernel) - y_mean, y_cov = gpr.predict(X, return_cov=True) + y_mean, y_cov = gpr.predict(X, return_cov=True) - assert_almost_equal(y_mean, 0, 5) - if len(gpr.kernel.theta) > 1: - # XXX: quite hacky, works only for current kernels - assert_almost_equal(np.diag(y_cov), np.exp(kernel.theta[0]), 5) - else: - assert_almost_equal(np.diag(y_cov), 1, 5) + assert_almost_equal(y_mean, 0, 5) + if len(gpr.kernel.theta) > 1: + # XXX: quite hacky, works only for current kernels + assert_almost_equal(np.diag(y_cov), np.exp(kernel.theta[0]), 5) + else: + assert_almost_equal(np.diag(y_cov), 1, 5) -def test_sample_statistics(): +@pytest.mark.parametrize('kernel', kernels) +def test_sample_statistics(kernel): # Test that statistics of samples drawn from GP are correct. - for kernel in kernels: - gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) - y_mean, y_cov = gpr.predict(X2, return_cov=True) + y_mean, y_cov = gpr.predict(X2, return_cov=True) - samples = gpr.sample_y(X2, 300000) + samples = gpr.sample_y(X2, 300000) - # More digits accuracy would require many more samples - assert_almost_equal(y_mean, np.mean(samples, 1), 1) - assert_almost_equal(np.diag(y_cov) / np.diag(y_cov).max(), - np.var(samples, 1) / np.diag(y_cov).max(), 1) + # More digits accuracy would require many more samples + assert_almost_equal(y_mean, np.mean(samples, 1), 1) + assert_almost_equal(np.diag(y_cov) / np.diag(y_cov).max(), + np.var(samples, 1) / np.diag(y_cov).max(), 1) def test_no_optimizer(): @@ -150,13 +148,13 @@ def test_no_optimizer(): assert_equal(np.exp(gpr.kernel_.theta), 1.0) -def test_predict_cov_vs_std(): +@pytest.mark.parametrize('kernel', kernels) +def test_predict_cov_vs_std(kernel): # Test that predicted std.-dev. is consistent with cov's diagonal. 
- for kernel in kernels: - gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) - y_mean, y_cov = gpr.predict(X2, return_cov=True) - y_mean, y_std = gpr.predict(X2, return_std=True) - assert_almost_equal(np.sqrt(np.diag(y_cov)), y_std) + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + y_mean, y_cov = gpr.predict(X2, return_cov=True) + y_mean, y_std = gpr.predict(X2, return_std=True) + assert_almost_equal(np.sqrt(np.diag(y_cov)), y_std) def test_anisotropic_kernel(): @@ -197,32 +195,33 @@ def test_random_starts(): last_lml = lml -def test_y_normalization(): +@pytest.mark.parametrize('kernel', kernels) +def test_y_normalization(kernel): # Test normalization of the target values in GP # Fitting non-normalizing GP on normalized y and fitting normalizing GP # on unnormalized y should yield identical results y_mean = y.mean(0) y_norm = y - y_mean - for kernel in kernels: - # Fit non-normalizing GP on normalized y - gpr = GaussianProcessRegressor(kernel=kernel) - gpr.fit(X, y_norm) - # Fit normalizing GP on unnormalized y - gpr_norm = GaussianProcessRegressor(kernel=kernel, normalize_y=True) - gpr_norm.fit(X, y) - # Compare predicted mean, std-devs and covariances - y_pred, y_pred_std = gpr.predict(X2, return_std=True) - y_pred = y_mean + y_pred - y_pred_norm, y_pred_std_norm = gpr_norm.predict(X2, return_std=True) + # Fit non-normalizing GP on normalized y + gpr = GaussianProcessRegressor(kernel=kernel) + gpr.fit(X, y_norm) + # Fit normalizing GP on unnormalized y + gpr_norm = GaussianProcessRegressor(kernel=kernel, normalize_y=True) + gpr_norm.fit(X, y) + + # Compare predicted mean, std-devs and covariances + y_pred, y_pred_std = gpr.predict(X2, return_std=True) + y_pred = y_mean + y_pred + y_pred_norm, y_pred_std_norm = gpr_norm.predict(X2, return_std=True) - assert_almost_equal(y_pred, y_pred_norm) - assert_almost_equal(y_pred_std, y_pred_std_norm) + assert_almost_equal(y_pred, y_pred_norm) + assert_almost_equal(y_pred_std, y_pred_std_norm) - _, y_cov = gpr.predict(X2, return_cov=True) - _, y_cov_norm = gpr_norm.predict(X2, return_cov=True) - assert_almost_equal(y_cov, y_cov_norm) + _, y_cov = gpr.predict(X2, return_cov=True) + _, y_cov_norm = gpr_norm.predict(X2, return_cov=True) + assert_almost_equal(y_cov, y_cov_norm) def test_y_multioutput(): @@ -268,7 +267,8 @@ def test_y_multioutput(): assert_almost_equal(gpr.kernel_.theta, gpr_2d.kernel_.theta, 4) -def test_custom_optimizer(): +@pytest.mark.parametrize('kernel', non_fixed_kernels) +def test_custom_optimizer(kernel): # Test that GPR can use externally defined optimizers. 
# Define a dummy optimizer that simply tests 50 random hyperparameters def optimizer(obj_func, initial_theta, bounds): @@ -283,14 +283,11 @@ def optimizer(obj_func, initial_theta, bounds): theta_opt, func_min = theta, f return theta_opt, func_min - for kernel in kernels: - if kernel == fixed_kernel: - continue - gpr = GaussianProcessRegressor(kernel=kernel, optimizer=optimizer) - gpr.fit(X, y) - # Checks that optimizer improved marginal likelihood - assert_greater(gpr.log_marginal_likelihood(gpr.kernel_.theta), - gpr.log_marginal_likelihood(gpr.kernel.theta)) + gpr = GaussianProcessRegressor(kernel=kernel, optimizer=optimizer) + gpr.fit(X, y) + # Checks that optimizer improved marginal likelihood + assert_greater(gpr.log_marginal_likelihood(gpr.kernel_.theta), + gpr.log_marginal_likelihood(gpr.kernel.theta)) def test_gpr_correct_error_message(): @@ -306,30 +303,28 @@ def test_gpr_correct_error_message(): % kernel, gpr.fit, X, y) -def test_duplicate_input(): +@pytest.mark.parametrize('kernel', kernels) +def test_duplicate_input(kernel): # Test GPR can handle two different output-values for the same input. - for kernel in kernels: - gpr_equal_inputs = \ - GaussianProcessRegressor(kernel=kernel, alpha=1e-2) - gpr_similar_inputs = \ - GaussianProcessRegressor(kernel=kernel, alpha=1e-2) + gpr_equal_inputs = GaussianProcessRegressor(kernel=kernel, alpha=1e-2) + gpr_similar_inputs = GaussianProcessRegressor(kernel=kernel, alpha=1e-2) - X_ = np.vstack((X, X[0])) - y_ = np.hstack((y, y[0] + 1)) - gpr_equal_inputs.fit(X_, y_) + X_ = np.vstack((X, X[0])) + y_ = np.hstack((y, y[0] + 1)) + gpr_equal_inputs.fit(X_, y_) - X_ = np.vstack((X, X[0] + 1e-15)) - y_ = np.hstack((y, y[0] + 1)) - gpr_similar_inputs.fit(X_, y_) + X_ = np.vstack((X, X[0] + 1e-15)) + y_ = np.hstack((y, y[0] + 1)) + gpr_similar_inputs.fit(X_, y_) - X_test = np.linspace(0, 10, 100)[:, None] - y_pred_equal, y_std_equal = \ - gpr_equal_inputs.predict(X_test, return_std=True) - y_pred_similar, y_std_similar = \ - gpr_similar_inputs.predict(X_test, return_std=True) + X_test = np.linspace(0, 10, 100)[:, None] + y_pred_equal, y_std_equal = \ + gpr_equal_inputs.predict(X_test, return_std=True) + y_pred_similar, y_std_similar = \ + gpr_similar_inputs.predict(X_test, return_std=True) - assert_almost_equal(y_pred_equal, y_pred_similar) - assert_almost_equal(y_std_equal, y_std_similar) + assert_almost_equal(y_pred_equal, y_pred_similar) + assert_almost_equal(y_std_equal, y_std_similar) def test_no_fit_default_predict(): @@ -348,19 +343,20 @@ def test_no_fit_default_predict(): assert_array_almost_equal(y_cov1, y_cov2) -def test_K_inv_reset(): +@pytest.mark.parametrize('kernel', kernels) +def test_K_inv_reset(kernel): y2 = f(X2).ravel() - for kernel in kernels: - # Test that self._K_inv is reset after a new fit - gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) - assert_true(hasattr(gpr, '_K_inv')) - assert_true(gpr._K_inv is None) - gpr.predict(X, return_std=True) - assert_true(gpr._K_inv is not None) - gpr.fit(X2, y2) - assert_true(gpr._K_inv is None) - gpr.predict(X2, return_std=True) - gpr2 = GaussianProcessRegressor(kernel=kernel).fit(X2, y2) - gpr2.predict(X2, return_std=True) - # the value of K_inv should be independent of the first fit - assert_array_equal(gpr._K_inv, gpr2._K_inv) + + # Test that self._K_inv is reset after a new fit + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + assert_true(hasattr(gpr, '_K_inv')) + assert_true(gpr._K_inv is None) + gpr.predict(X, return_std=True) + assert_true(gpr._K_inv is not None) 
+ gpr.fit(X2, y2) + assert_true(gpr._K_inv is None) + gpr.predict(X2, return_std=True) + gpr2 = GaussianProcessRegressor(kernel=kernel).fit(X2, y2) + gpr2.predict(X2, return_std=True) + # the value of K_inv should be independent of the first fit + assert_array_equal(gpr._K_inv, gpr2._K_inv) diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py index 3ede9032fd6fc..eeb367bc93fc2 100644 --- a/sklearn/gaussian_process/tests/test_kernels.py +++ b/sklearn/gaussian_process/tests/test_kernels.py @@ -48,98 +48,100 @@ kernels.append(PairwiseKernel(gamma=1.0, metric=metric)) -def test_kernel_gradient(): +@pytest.mark.parametrize('kernel', kernels) +def test_kernel_gradient(kernel): # Compare analytic and numeric gradient of kernels. - for kernel in kernels: - K, K_gradient = kernel(X, eval_gradient=True) + K, K_gradient = kernel(X, eval_gradient=True) - assert_equal(K_gradient.shape[0], X.shape[0]) - assert_equal(K_gradient.shape[1], X.shape[0]) - assert_equal(K_gradient.shape[2], kernel.theta.shape[0]) + assert_equal(K_gradient.shape[0], X.shape[0]) + assert_equal(K_gradient.shape[1], X.shape[0]) + assert_equal(K_gradient.shape[2], kernel.theta.shape[0]) - def eval_kernel_for_theta(theta): - kernel_clone = kernel.clone_with_theta(theta) - K = kernel_clone(X, eval_gradient=False) - return K + def eval_kernel_for_theta(theta): + kernel_clone = kernel.clone_with_theta(theta) + K = kernel_clone(X, eval_gradient=False) + return K - K_gradient_approx = \ - _approx_fprime(kernel.theta, eval_kernel_for_theta, 1e-10) + K_gradient_approx = \ + _approx_fprime(kernel.theta, eval_kernel_for_theta, 1e-10) - assert_almost_equal(K_gradient, K_gradient_approx, 4) + assert_almost_equal(K_gradient, K_gradient_approx, 4) -def test_kernel_theta(): +@pytest.mark.parametrize( + 'kernel', + [kernel for kernel in kernels + if not (isinstance(kernel, KernelOperator) # skip non-basic kernels + or isinstance(kernel, Exponentiation))]) +def test_kernel_theta(kernel): # Check that parameter vector theta of kernel is set correctly. - for kernel in kernels: - if isinstance(kernel, KernelOperator) \ - or isinstance(kernel, Exponentiation): # skip non-basic kernels - continue - theta = kernel.theta - _, K_gradient = kernel(X, eval_gradient=True) - - # Determine kernel parameters that contribute to theta - init_sign = signature(kernel.__class__.__init__).parameters.values() - args = [p.name for p in init_sign if p.name != 'self'] - theta_vars = map(lambda s: s[0:-len("_bounds")], - filter(lambda s: s.endswith("_bounds"), args)) - assert_equal( - set(hyperparameter.name - for hyperparameter in kernel.hyperparameters), - set(theta_vars)) - - # Check that values returned in theta are consistent with - # hyperparameter values (being their logarithms) - for i, hyperparameter in enumerate(kernel.hyperparameters): - assert_equal(theta[i], - np.log(getattr(kernel, hyperparameter.name))) - - # Fixed kernel parameters must be excluded from theta and gradient. 
- for i, hyperparameter in enumerate(kernel.hyperparameters): - # create copy with certain hyperparameter fixed - params = kernel.get_params() - params[hyperparameter.name + "_bounds"] = "fixed" - kernel_class = kernel.__class__ - new_kernel = kernel_class(**params) - # Check that theta and K_gradient are identical with the fixed - # dimension left out - _, K_gradient_new = new_kernel(X, eval_gradient=True) - assert_equal(theta.shape[0], new_kernel.theta.shape[0] + 1) - assert_equal(K_gradient.shape[2], K_gradient_new.shape[2] + 1) - if i > 0: - assert_equal(theta[:i], new_kernel.theta[:i]) - assert_array_equal(K_gradient[..., :i], - K_gradient_new[..., :i]) - if i + 1 < len(kernel.hyperparameters): - assert_equal(theta[i + 1:], new_kernel.theta[i:]) - assert_array_equal(K_gradient[..., i + 1:], - K_gradient_new[..., i:]) - - # Check that values of theta are modified correctly - for i, hyperparameter in enumerate(kernel.hyperparameters): - theta[i] = np.log(42) - kernel.theta = theta - assert_almost_equal(getattr(kernel, hyperparameter.name), 42) - - setattr(kernel, hyperparameter.name, 43) - assert_almost_equal(kernel.theta[i], np.log(43)) - - -def test_auto_vs_cross(): + theta = kernel.theta + _, K_gradient = kernel(X, eval_gradient=True) + + # Determine kernel parameters that contribute to theta + init_sign = signature(kernel.__class__.__init__).parameters.values() + args = [p.name for p in init_sign if p.name != 'self'] + theta_vars = map(lambda s: s[0:-len("_bounds")], + filter(lambda s: s.endswith("_bounds"), args)) + assert_equal( + set(hyperparameter.name + for hyperparameter in kernel.hyperparameters), + set(theta_vars)) + + # Check that values returned in theta are consistent with + # hyperparameter values (being their logarithms) + for i, hyperparameter in enumerate(kernel.hyperparameters): + assert_equal(theta[i], + np.log(getattr(kernel, hyperparameter.name))) + + # Fixed kernel parameters must be excluded from theta and gradient. + for i, hyperparameter in enumerate(kernel.hyperparameters): + # create copy with certain hyperparameter fixed + params = kernel.get_params() + params[hyperparameter.name + "_bounds"] = "fixed" + kernel_class = kernel.__class__ + new_kernel = kernel_class(**params) + # Check that theta and K_gradient are identical with the fixed + # dimension left out + _, K_gradient_new = new_kernel(X, eval_gradient=True) + assert_equal(theta.shape[0], new_kernel.theta.shape[0] + 1) + assert_equal(K_gradient.shape[2], K_gradient_new.shape[2] + 1) + if i > 0: + assert_equal(theta[:i], new_kernel.theta[:i]) + assert_array_equal(K_gradient[..., :i], + K_gradient_new[..., :i]) + if i + 1 < len(kernel.hyperparameters): + assert_equal(theta[i + 1:], new_kernel.theta[i:]) + assert_array_equal(K_gradient[..., i + 1:], + K_gradient_new[..., i:]) + + # Check that values of theta are modified correctly + for i, hyperparameter in enumerate(kernel.hyperparameters): + theta[i] = np.log(42) + kernel.theta = theta + assert_almost_equal(getattr(kernel, hyperparameter.name), 42) + + setattr(kernel, hyperparameter.name, 43) + assert_almost_equal(kernel.theta[i], np.log(43)) + + +@pytest.mark.parametrize('kernel', + [kernel for kernel in kernels + # Identity is not satisfied on diagonal + if kernel != kernel_white]) +def test_auto_vs_cross(kernel): # Auto-correlation and cross-correlation should be consistent. 
- for kernel in kernels: - if kernel == kernel_white: - continue # Identity is not satisfied on diagonal - K_auto = kernel(X) - K_cross = kernel(X, X) - assert_almost_equal(K_auto, K_cross, 5) + K_auto = kernel(X) + K_cross = kernel(X, X) + assert_almost_equal(K_auto, K_cross, 5) -def test_kernel_diag(): +@pytest.mark.parametrize('kernel', kernels) +def test_kernel_diag(kernel): # Test that diag method of kernel returns consistent results. - for kernel in kernels: - K_call_diag = np.diag(kernel(X)) - K_diag = kernel.diag(X) - assert_almost_equal(K_call_diag, K_diag, 5) + K_call_diag = np.diag(kernel(X)) + K_diag = kernel.diag(X) + assert_almost_equal(K_call_diag, K_diag, 5) def test_kernel_operator_commutative(): @@ -174,13 +176,13 @@ def test_kernel_anisotropic(): assert_array_equal(kernel.k2.length_scale, [1.0, 4.0]) -def test_kernel_stationary(): +@pytest.mark.parametrize('kernel', + [kernel for kernel in kernels + if kernel.is_stationary()]) +def test_kernel_stationary(kernel): # Test stationarity of kernels. - for kernel in kernels: - if not kernel.is_stationary(): - continue - K = kernel(X, X + 1) - assert_almost_equal(K[0, 0], np.diag(K)) + K = kernel(X, X + 1) + assert_almost_equal(K[0, 0], np.diag(K)) def check_hyperparameters_equal(kernel1, kernel2): @@ -258,63 +260,65 @@ def test_matern_kernel(): assert_array_almost_equal(K1, K2) -def test_kernel_versus_pairwise(): +@pytest.mark.parametrize("kernel", kernels) +def test_kernel_versus_pairwise(kernel): # Check that GP kernels can also be used as pairwise kernels. - for kernel in kernels: - # Test auto-kernel - if kernel != kernel_white: - # For WhiteKernel: k(X) != k(X,X). This is assumed by - # pairwise_kernels - K1 = kernel(X) - K2 = pairwise_kernels(X, metric=kernel) - assert_array_almost_equal(K1, K2) - - # Test cross-kernel - K1 = kernel(X, Y) - K2 = pairwise_kernels(X, Y, metric=kernel) + + # Test auto-kernel + if kernel != kernel_white: + # For WhiteKernel: k(X) != k(X,X). This is assumed by + # pairwise_kernels + K1 = kernel(X) + K2 = pairwise_kernels(X, metric=kernel) assert_array_almost_equal(K1, K2) + # Test cross-kernel + K1 = kernel(X, Y) + K2 = pairwise_kernels(X, Y, metric=kernel) + assert_array_almost_equal(K1, K2) + -def test_set_get_params(): +@pytest.mark.parametrize("kernel", kernels) +def test_set_get_params(kernel): # Check that set_params()/get_params() is consistent with kernel.theta. 
- for kernel in kernels: - # Test get_params() - index = 0 - params = kernel.get_params() - for hyperparameter in kernel.hyperparameters: - if isinstance("string", type(hyperparameter.bounds)): - if hyperparameter.bounds == "fixed": - continue - size = hyperparameter.n_elements - if size > 1: # anisotropic kernels - assert_almost_equal(np.exp(kernel.theta[index:index + size]), - params[hyperparameter.name]) - index += size - else: - assert_almost_equal(np.exp(kernel.theta[index]), - params[hyperparameter.name]) - index += 1 - # Test set_params() - index = 0 - value = 10 # arbitrary value - for hyperparameter in kernel.hyperparameters: - if isinstance("string", type(hyperparameter.bounds)): - if hyperparameter.bounds == "fixed": - continue - size = hyperparameter.n_elements - if size > 1: # anisotropic kernels - kernel.set_params(**{hyperparameter.name: [value] * size}) - assert_almost_equal(np.exp(kernel.theta[index:index + size]), - [value] * size) - index += size - else: - kernel.set_params(**{hyperparameter.name: value}) - assert_almost_equal(np.exp(kernel.theta[index]), value) - index += 1 - - -def test_repr_kernels(): + + # Test get_params() + index = 0 + params = kernel.get_params() + for hyperparameter in kernel.hyperparameters: + if isinstance("string", type(hyperparameter.bounds)): + if hyperparameter.bounds == "fixed": + continue + size = hyperparameter.n_elements + if size > 1: # anisotropic kernels + assert_almost_equal(np.exp(kernel.theta[index:index + size]), + params[hyperparameter.name]) + index += size + else: + assert_almost_equal(np.exp(kernel.theta[index]), + params[hyperparameter.name]) + index += 1 + # Test set_params() + index = 0 + value = 10 # arbitrary value + for hyperparameter in kernel.hyperparameters: + if isinstance("string", type(hyperparameter.bounds)): + if hyperparameter.bounds == "fixed": + continue + size = hyperparameter.n_elements + if size > 1: # anisotropic kernels + kernel.set_params(**{hyperparameter.name: [value] * size}) + assert_almost_equal(np.exp(kernel.theta[index:index + size]), + [value] * size) + index += size + else: + kernel.set_params(**{hyperparameter.name: value}) + assert_almost_equal(np.exp(kernel.theta[index]), value) + index += 1 + + +@pytest.mark.parametrize("kernel", kernels) +def test_repr_kernels(kernel): # Smoke-test for repr in kernels. 
- for kernel in kernels: - repr(kernel) + repr(kernel) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index a3b35f40a88d7..fb65d800e78ba 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -706,17 +706,17 @@ def test_overrided_gram_matrix(): clf.fit, X, y) -def test_lasso_non_float_y(): +@pytest.mark.parametrize('model', [ElasticNet, Lasso]) +def test_lasso_non_float_y(model): X = [[0, 0], [1, 1], [-1, -1]] y = [0, 1, 2] y_float = [0.0, 1.0, 2.0] - for model in [ElasticNet, Lasso]: - clf = model(fit_intercept=False) - clf.fit(X, y) - clf_float = model(fit_intercept=False) - clf_float.fit(X, y_float) - assert_array_equal(clf.coef_, clf_float.coef_) + clf = model(fit_intercept=False) + clf.fit(X, y) + clf_float = model(fit_intercept=False) + clf_float.fit(X, y_float) + assert_array_equal(clf.coef_, clf_float.coef_) def test_enet_float_precision(): diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index e41df9cce1178..630559fe4fef2 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -3,6 +3,8 @@ import numpy as np from scipy import linalg +import pytest + from sklearn.model_selection import train_test_split from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_array_almost_equal @@ -172,18 +174,20 @@ def test_no_path_all_precomputed(): assert_true(alpha_ == alphas_[-1]) -def test_lars_precompute(): +@pytest.mark.parametrize( + 'classifier', + [linear_model.Lars, linear_model.LarsCV, linear_model.LassoLarsIC]) +def test_lars_precompute(classifier): # Check for different values of precompute X, y = diabetes.data, diabetes.target G = np.dot(X.T, X) - for classifier in [linear_model.Lars, linear_model.LarsCV, - linear_model.LassoLarsIC]: - clf = classifier(precompute=G) - output_1 = ignore_warnings(clf.fit)(X, y).coef_ - for precompute in [True, False, 'auto', None]: - clf = classifier(precompute=precompute) - output_2 = clf.fit(X, y).coef_ - assert_array_almost_equal(output_1, output_2, decimal=8) + + clf = classifier(precompute=G) + output_1 = ignore_warnings(clf.fit)(X, y).coef_ + for precompute in [True, False, 'auto', None]: + clf = classifier(precompute=precompute) + output_2 = clf.fit(X, y).coef_ + assert_array_almost_equal(output_1, output_2, decimal=8) def test_singular_matrix(): diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index a179c89e199a3..e363fed2abb9d 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -1,6 +1,9 @@ import numpy as np import scipy.sparse as sp from scipy import linalg, optimize, sparse + +import pytest + from sklearn.datasets import load_iris, make_classification from sklearn.metrics import log_loss from sklearn.model_selection import StratifiedKFold @@ -139,63 +142,63 @@ def test_predict_iris(): assert_greater(np.mean(pred == target), .95) -def test_multinomial_validation(): - for solver in ['lbfgs', 'newton-cg', 'sag', 'saga']: - lr = LogisticRegression(C=-1, solver=solver, multi_class='multinomial') - assert_raises(ValueError, lr.fit, [[0, 1], [1, 0]], [0, 1]) +@pytest.mark.parametrize('solver', ['lbfgs', 'newton-cg', 'sag', 'saga']) +def test_multinomial_validation(solver): + lr = LogisticRegression(C=-1, solver=solver, 
multi_class='multinomial') + assert_raises(ValueError, lr.fit, [[0, 1], [1, 0]], [0, 1]) -def test_check_solver_option(): +@pytest.mark.parametrize('LR', [LogisticRegression, LogisticRegressionCV]) +def test_check_solver_option(LR): X, y = iris.data, iris.target - for LR in [LogisticRegression, LogisticRegressionCV]: - msg = ('Logistic Regression supports only liblinear, newton-cg, ' - 'lbfgs, sag and saga solvers, got wrong_name') - lr = LR(solver="wrong_name") + msg = ('Logistic Regression supports only liblinear, newton-cg, ' + 'lbfgs, sag and saga solvers, got wrong_name') + lr = LR(solver="wrong_name") + assert_raise_message(ValueError, msg, lr.fit, X, y) + + msg = "multi_class should be either multinomial or ovr, got wrong_name" + lr = LR(solver='newton-cg', multi_class="wrong_name") + assert_raise_message(ValueError, msg, lr.fit, X, y) + + # only 'liblinear' solver + msg = "Solver liblinear does not support a multinomial backend." + lr = LR(solver='liblinear', multi_class='multinomial') + assert_raise_message(ValueError, msg, lr.fit, X, y) + + # all solvers except 'liblinear' + for solver in ['newton-cg', 'lbfgs', 'sag']: + msg = ("Solver %s supports only l2 penalties, got l1 penalty." % + solver) + lr = LR(solver=solver, penalty='l1') assert_raise_message(ValueError, msg, lr.fit, X, y) - - msg = "multi_class should be either multinomial or ovr, got wrong_name" - lr = LR(solver='newton-cg', multi_class="wrong_name") + for solver in ['newton-cg', 'lbfgs', 'sag', 'saga']: + msg = ("Solver %s supports only dual=False, got dual=True" % + solver) + lr = LR(solver=solver, dual=True) assert_raise_message(ValueError, msg, lr.fit, X, y) - # only 'liblinear' solver - msg = "Solver liblinear does not support a multinomial backend." - lr = LR(solver='liblinear', multi_class='multinomial') - assert_raise_message(ValueError, msg, lr.fit, X, y) - # all solvers except 'liblinear' - for solver in ['newton-cg', 'lbfgs', 'sag']: - msg = ("Solver %s supports only l2 penalties, got l1 penalty." % - solver) - lr = LR(solver=solver, penalty='l1') - assert_raise_message(ValueError, msg, lr.fit, X, y) - for solver in ['newton-cg', 'lbfgs', 'sag', 'saga']: - msg = ("Solver %s supports only dual=False, got dual=True" % - solver) - lr = LR(solver=solver, dual=True) - assert_raise_message(ValueError, msg, lr.fit, X, y) - - -def test_multinomial_binary(): +@pytest.mark.parametrize('solver', ['lbfgs', 'newton-cg', 'sag', 'saga']) +def test_multinomial_binary(solver): # Test multinomial LR on a binary problem. 
target = (iris.target > 0).astype(np.intp) target = np.array(["setosa", "not-setosa"])[target] - for solver in ['lbfgs', 'newton-cg', 'sag', 'saga']: - clf = LogisticRegression(solver=solver, multi_class='multinomial', - random_state=42, max_iter=2000) - clf.fit(iris.data, target) + clf = LogisticRegression(solver=solver, multi_class='multinomial', + random_state=42, max_iter=2000) + clf.fit(iris.data, target) - assert_equal(clf.coef_.shape, (1, iris.data.shape[1])) - assert_equal(clf.intercept_.shape, (1,)) - assert_array_equal(clf.predict(iris.data), target) + assert_equal(clf.coef_.shape, (1, iris.data.shape[1])) + assert_equal(clf.intercept_.shape, (1,)) + assert_array_equal(clf.predict(iris.data), target) - mlr = LogisticRegression(solver=solver, multi_class='multinomial', - random_state=42, fit_intercept=False) - mlr.fit(iris.data, target) - pred = clf.classes_[np.argmax(clf.predict_log_proba(iris.data), - axis=1)] - assert_greater(np.mean(pred == target), .9) + mlr = LogisticRegression(solver=solver, multi_class='multinomial', + random_state=42, fit_intercept=False) + mlr.fit(iris.data, target) + pred = clf.classes_[np.argmax(clf.predict_log_proba(iris.data), + axis=1)] + assert_greater(np.mean(pred == target), .9) def test_multinomial_binary_probabilities(): @@ -1043,7 +1046,9 @@ def test_max_iter(): assert_equal(lr.n_iter_[0], max_iter) -def test_n_iter(): +@pytest.mark.parametrize('solver', + ['newton-cg', 'liblinear', 'sag', 'saga', 'lbfgs']) +def test_n_iter(solver): # Test that self.n_iter_ has the correct format. X, y = iris.data, iris.target y_bin = y.copy() @@ -1052,76 +1057,73 @@ def test_n_iter(): n_Cs = 4 n_cv_fold = 2 - for solver in ['newton-cg', 'liblinear', 'sag', 'saga', 'lbfgs']: - # OvR case - n_classes = 1 if solver == 'liblinear' else np.unique(y).shape[0] - clf = LogisticRegression(tol=1e-2, multi_class='ovr', - solver=solver, C=1., - random_state=42, max_iter=100) - clf.fit(X, y) - assert_equal(clf.n_iter_.shape, (n_classes,)) + # OvR case + n_classes = 1 if solver == 'liblinear' else np.unique(y).shape[0] + clf = LogisticRegression(tol=1e-2, multi_class='ovr', + solver=solver, C=1., + random_state=42, max_iter=100) + clf.fit(X, y) + assert_equal(clf.n_iter_.shape, (n_classes,)) - n_classes = np.unique(y).shape[0] - clf = LogisticRegressionCV(tol=1e-2, multi_class='ovr', - solver=solver, Cs=n_Cs, cv=n_cv_fold, - random_state=42, max_iter=100) - clf.fit(X, y) - assert_equal(clf.n_iter_.shape, (n_classes, n_cv_fold, n_Cs)) - clf.fit(X, y_bin) - assert_equal(clf.n_iter_.shape, (1, n_cv_fold, n_Cs)) - - # multinomial case - n_classes = 1 - if solver in ('liblinear', 'sag', 'saga'): - break - - clf = LogisticRegression(tol=1e-2, multi_class='multinomial', - solver=solver, C=1., - random_state=42, max_iter=100) - clf.fit(X, y) - assert_equal(clf.n_iter_.shape, (n_classes,)) + n_classes = np.unique(y).shape[0] + clf = LogisticRegressionCV(tol=1e-2, multi_class='ovr', + solver=solver, Cs=n_Cs, cv=n_cv_fold, + random_state=42, max_iter=100) + clf.fit(X, y) + assert_equal(clf.n_iter_.shape, (n_classes, n_cv_fold, n_Cs)) + clf.fit(X, y_bin) + assert_equal(clf.n_iter_.shape, (1, n_cv_fold, n_Cs)) + + # multinomial case + n_classes = 1 + if solver in ('liblinear', 'sag', 'saga'): + return + + clf = LogisticRegression(tol=1e-2, multi_class='multinomial', + solver=solver, C=1., + random_state=42, max_iter=100) + clf.fit(X, y) + assert_equal(clf.n_iter_.shape, (n_classes,)) - clf = LogisticRegressionCV(tol=1e-2, multi_class='multinomial', - solver=solver, Cs=n_Cs, 
cv=n_cv_fold, - random_state=42, max_iter=100) - clf.fit(X, y) - assert_equal(clf.n_iter_.shape, (n_classes, n_cv_fold, n_Cs)) - clf.fit(X, y_bin) - assert_equal(clf.n_iter_.shape, (1, n_cv_fold, n_Cs)) + clf = LogisticRegressionCV(tol=1e-2, multi_class='multinomial', + solver=solver, Cs=n_Cs, cv=n_cv_fold, + random_state=42, max_iter=100) + clf.fit(X, y) + assert_equal(clf.n_iter_.shape, (n_classes, n_cv_fold, n_Cs)) + clf.fit(X, y_bin) + assert_equal(clf.n_iter_.shape, (1, n_cv_fold, n_Cs)) -def test_warm_start(): +@pytest.mark.parametrize('solver', ('newton-cg', 'sag', 'saga', 'lbfgs')) +@pytest.mark.parametrize('warm_start', (True, False)) +@pytest.mark.parametrize('fit_intercept', (True, False)) +@pytest.mark.parametrize('multi_class', ['ovr', 'multinomial']) +def test_warm_start(solver, warm_start, fit_intercept, multi_class): # A 1-iteration second fit on same data should give almost same result # with warm starting, and quite different result without warm starting. # Warm starting does not work with liblinear solver. X, y = iris.data, iris.target - solvers = ['newton-cg', 'sag', 'saga', 'lbfgs'] - - for warm_start in [True, False]: - for fit_intercept in [True, False]: - for solver in solvers: - for multi_class in ['ovr', 'multinomial']: - clf = LogisticRegression(tol=1e-4, multi_class=multi_class, - warm_start=warm_start, - solver=solver, - random_state=42, max_iter=100, - fit_intercept=fit_intercept) - with ignore_warnings(category=ConvergenceWarning): - clf.fit(X, y) - coef_1 = clf.coef_ - - clf.max_iter = 1 - clf.fit(X, y) - cum_diff = np.sum(np.abs(coef_1 - clf.coef_)) - msg = ("Warm starting issue with %s solver in %s mode " - "with fit_intercept=%s and warm_start=%s" - % (solver, multi_class, str(fit_intercept), - str(warm_start))) - if warm_start: - assert_greater(2.0, cum_diff, msg) - else: - assert_greater(cum_diff, 2.0, msg) + clf = LogisticRegression(tol=1e-4, multi_class=multi_class, + warm_start=warm_start, + solver=solver, + random_state=42, max_iter=100, + fit_intercept=fit_intercept) + with ignore_warnings(category=ConvergenceWarning): + clf.fit(X, y) + coef_1 = clf.coef_ + + clf.max_iter = 1 + clf.fit(X, y) + cum_diff = np.sum(np.abs(coef_1 - clf.coef_)) + msg = ("Warm starting issue with %s solver in %s mode " + "with fit_intercept=%s and warm_start=%s" + % (solver, multi_class, str(fit_intercept), + str(warm_start))) + if warm_start: + assert_greater(2.0, cum_diff, msg) + else: + assert_greater(cum_diff, 2.0, msg) def test_saga_vs_liblinear(): diff --git a/sklearn/linear_model/tests/test_passive_aggressive.py b/sklearn/linear_model/tests/test_passive_aggressive.py index 5620c29e18374..ee519b7390c5b 100644 --- a/sklearn/linear_model/tests/test_passive_aggressive.py +++ b/sklearn/linear_model/tests/test_passive_aggressive.py @@ -2,6 +2,8 @@ import numpy as np import scipy.sparse as sp +import pytest + from sklearn.utils.testing import assert_less from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_array_almost_equal, assert_array_equal @@ -111,23 +113,22 @@ def test_classifier_refit(): assert_array_equal(clf.classes_, iris.target_names) -def test_classifier_correctness(): +@pytest.mark.parametrize('loss', ("hinge", "squared_hinge")) +def test_classifier_correctness(loss): y_bin = y.copy() y_bin[y != 1] = -1 - for loss in ("hinge", "squared_hinge"): - - clf1 = MyPassiveAggressive( - C=1.0, loss=loss, fit_intercept=True, n_iter=2) - clf1.fit(X, y_bin) + clf1 = MyPassiveAggressive( + C=1.0, loss=loss, fit_intercept=True, 
n_iter=2) + clf1.fit(X, y_bin) - for data in (X, X_csr): - clf2 = PassiveAggressiveClassifier( - C=1.0, loss=loss, fit_intercept=True, max_iter=2, - shuffle=False, tol=None) - clf2.fit(data, y_bin) + for data in (X, X_csr): + clf2 = PassiveAggressiveClassifier( + C=1.0, loss=loss, fit_intercept=True, max_iter=2, + shuffle=False, tol=None) + clf2.fit(data, y_bin) - assert_array_almost_equal(clf1.w, clf2.coef_.ravel(), decimal=2) + assert_array_almost_equal(clf1.w, clf2.coef_.ravel(), decimal=2) def test_classifier_undefined_methods(): @@ -248,22 +249,24 @@ def test_regressor_partial_fit(): assert_true(hasattr(reg, 'standard_coef_')) -def test_regressor_correctness(): +@pytest.mark.parametrize( + 'loss', + ("epsilon_insensitive", "squared_epsilon_insensitive")) +def test_regressor_correctness(loss): y_bin = y.copy() y_bin[y != 1] = -1 - for loss in ("epsilon_insensitive", "squared_epsilon_insensitive"): - reg1 = MyPassiveAggressive( - C=1.0, loss=loss, fit_intercept=True, n_iter=2) - reg1.fit(X, y_bin) + reg1 = MyPassiveAggressive( + C=1.0, loss=loss, fit_intercept=True, n_iter=2) + reg1.fit(X, y_bin) - for data in (X, X_csr): - reg2 = PassiveAggressiveRegressor( - C=1.0, tol=None, loss=loss, fit_intercept=True, max_iter=2, - shuffle=False) - reg2.fit(data, y_bin) + for data in (X, X_csr): + reg2 = PassiveAggressiveRegressor( + C=1.0, tol=None, loss=loss, fit_intercept=True, max_iter=2, + shuffle=False) + reg2.fit(data, y_bin) - assert_array_almost_equal(reg1.w, reg2.coef_.ravel(), decimal=2) + assert_array_almost_equal(reg1.w, reg2.coef_.ravel(), decimal=2) def test_regressor_undefined_methods(): diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index 8eb5a06c933b6..2f574b88ba7b5 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -59,41 +59,42 @@ SPARSE_FILTER = lambda X: sp.csr_matrix(X) -def test_ridge(): +@pytest.mark.parametrize('solver', + ("svd", "sparse_cg", "cholesky", "lsqr", "sag")) +def test_ridge(solver): # Ridge regression convergence test using score # TODO: for this test to be robust, we should use a dataset instead # of np.random. rng = np.random.RandomState(0) alpha = 1.0 - for solver in ("svd", "sparse_cg", "cholesky", "lsqr", "sag"): - # With more samples than features - n_samples, n_features = 6, 5 - y = rng.randn(n_samples) - X = rng.randn(n_samples, n_features) + # With more samples than features + n_samples, n_features = 6, 5 + y = rng.randn(n_samples) + X = rng.randn(n_samples, n_features) - ridge = Ridge(alpha=alpha, solver=solver) - ridge.fit(X, y) - assert_equal(ridge.coef_.shape, (X.shape[1], )) - assert_greater(ridge.score(X, y), 0.47) + ridge = Ridge(alpha=alpha, solver=solver) + ridge.fit(X, y) + assert_equal(ridge.coef_.shape, (X.shape[1], )) + assert_greater(ridge.score(X, y), 0.47) - if solver in ("cholesky", "sag"): - # Currently the only solvers to support sample_weight. - ridge.fit(X, y, sample_weight=np.ones(n_samples)) - assert_greater(ridge.score(X, y), 0.47) + if solver in ("cholesky", "sag"): + # Currently the only solvers to support sample_weight. 
+ ridge.fit(X, y, sample_weight=np.ones(n_samples)) + assert_greater(ridge.score(X, y), 0.47) - # With more features than samples - n_samples, n_features = 5, 10 - y = rng.randn(n_samples) - X = rng.randn(n_samples, n_features) - ridge = Ridge(alpha=alpha, solver=solver) - ridge.fit(X, y) - assert_greater(ridge.score(X, y), .9) + # With more features than samples + n_samples, n_features = 5, 10 + y = rng.randn(n_samples) + X = rng.randn(n_samples, n_features) + ridge = Ridge(alpha=alpha, solver=solver) + ridge.fit(X, y) + assert_greater(ridge.score(X, y), .9) - if solver in ("cholesky", "sag"): - # Currently the only solvers to support sample_weight. - ridge.fit(X, y, sample_weight=np.ones(n_samples)) - assert_greater(ridge.score(X, y), 0.9) + if solver in ("cholesky", "sag"): + # Currently the only solvers to support sample_weight. + ridge.fit(X, y, sample_weight=np.ones(n_samples)) + assert_greater(ridge.score(X, y), 0.9) def test_primal_dual_relationship(): @@ -545,33 +546,33 @@ def test_class_weights(): assert_array_almost_equal(reg.intercept_, rega.intercept_) -def test_class_weight_vs_sample_weight(): +@pytest.mark.parametrize('reg', (RidgeClassifier, RidgeClassifierCV)) +def test_class_weight_vs_sample_weight(reg): """Check class_weights resemble sample_weights behavior.""" - for reg in (RidgeClassifier, RidgeClassifierCV): - - # Iris is balanced, so no effect expected for using 'balanced' weights - reg1 = reg() - reg1.fit(iris.data, iris.target) - reg2 = reg(class_weight='balanced') - reg2.fit(iris.data, iris.target) - assert_almost_equal(reg1.coef_, reg2.coef_) - - # Inflate importance of class 1, check against user-defined weights - sample_weight = np.ones(iris.target.shape) - sample_weight[iris.target == 1] *= 100 - class_weight = {0: 1., 1: 100., 2: 1.} - reg1 = reg() - reg1.fit(iris.data, iris.target, sample_weight) - reg2 = reg(class_weight=class_weight) - reg2.fit(iris.data, iris.target) - assert_almost_equal(reg1.coef_, reg2.coef_) - - # Check that sample_weight and class_weight are multiplicative - reg1 = reg() - reg1.fit(iris.data, iris.target, sample_weight ** 2) - reg2 = reg(class_weight=class_weight) - reg2.fit(iris.data, iris.target, sample_weight) - assert_almost_equal(reg1.coef_, reg2.coef_) + + # Iris is balanced, so no effect expected for using 'balanced' weights + reg1 = reg() + reg1.fit(iris.data, iris.target) + reg2 = reg(class_weight='balanced') + reg2.fit(iris.data, iris.target) + assert_almost_equal(reg1.coef_, reg2.coef_) + + # Inflate importance of class 1, check against user-defined weights + sample_weight = np.ones(iris.target.shape) + sample_weight[iris.target == 1] *= 100 + class_weight = {0: 1., 1: 100., 2: 1.} + reg1 = reg() + reg1.fit(iris.data, iris.target, sample_weight) + reg2 = reg(class_weight=class_weight) + reg2.fit(iris.data, iris.target) + assert_almost_equal(reg1.coef_, reg2.coef_) + + # Check that sample_weight and class_weight are multiplicative + reg1 = reg() + reg1.fit(iris.data, iris.target, sample_weight ** 2) + reg2 = reg(class_weight=class_weight) + reg2.fit(iris.data, iris.target, sample_weight) + assert_almost_equal(reg1.coef_, reg2.coef_) def test_class_weights_cv(): diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index 9f372f706ca71..18bc073139650 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -1174,16 +1174,16 @@ def test_numerical_stability_large_gradient(): assert_true(np.isfinite(model.coef_).all()) -def 
test_large_regularization(): +@pytest.mark.parametrize('penalty', ['l2', 'l1', 'elasticnet']) +def test_large_regularization(penalty): # Non regression tests for numerical stability issues caused by large # regularization parameters - for penalty in ['l2', 'l1', 'elasticnet']: - model = SGDClassifier(alpha=1e5, learning_rate='constant', eta0=0.1, - penalty=penalty, shuffle=False, - tol=None, max_iter=6) - with np.errstate(all='raise'): - model.fit(iris.data, iris.target) - assert_array_almost_equal(model.coef_, np.zeros_like(model.coef_)) + model = SGDClassifier(alpha=1e5, learning_rate='constant', eta0=0.1, + penalty=penalty, shuffle=False, + tol=None, max_iter=6) + with np.errstate(all='raise'): + model.fit(iris.data, iris.target) + assert_array_almost_equal(model.coef_, np.zeros_like(model.coef_)) def test_tol_parameter(): diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index 4118dd997f00c..cc692ae0d0cd0 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ b/sklearn/manifold/tests/test_t_sne.py @@ -598,35 +598,35 @@ def test_no_sparse_on_barnes_hut(): tsne.fit_transform, X_csr) -def test_64bit(): +@pytest.mark.parametrize('method', ['barnes_hut', 'exact']) +@pytest.mark.parametrize('dt', [np.float32, np.float64]) +def test_64bit(method, dt): # Ensure 64bit arrays are handled correctly. random_state = check_random_state(0) - methods = ['barnes_hut', 'exact'] - for method in methods: - for dt in [np.float32, np.float64]: - X = random_state.randn(50, 2).astype(dt) - tsne = TSNE(n_components=2, perplexity=2, learning_rate=100.0, - random_state=0, method=method, verbose=0) - X_embedded = tsne.fit_transform(X) - effective_type = X_embedded.dtype - # tsne cython code is only single precision, so the output will - # always be single precision, irrespectively of the input dtype - assert effective_type == np.float32 + X = random_state.randn(50, 2).astype(dt) + tsne = TSNE(n_components=2, perplexity=2, learning_rate=100.0, + random_state=0, method=method, verbose=0) + X_embedded = tsne.fit_transform(X) + effective_type = X_embedded.dtype + # tsne cython code is only single precision, so the output will + # always be single precision, irrespectively of the input dtype + assert effective_type == np.float32 -def test_kl_divergence_not_nan(): + +@pytest.mark.parametrize('method', ['barnes_hut', 'exact']) +def test_kl_divergence_not_nan(method): # Ensure kl_divergence_ is computed at last iteration # even though n_iter % n_iter_check != 0, i.e. 
1003 % 50 != 0 random_state = check_random_state(0) - methods = ['barnes_hut', 'exact'] - for method in methods: - X = random_state.randn(50, 2) - tsne = TSNE(n_components=2, perplexity=2, learning_rate=100.0, - random_state=0, method=method, verbose=0, n_iter=1003) - tsne.fit_transform(X) - assert not np.isnan(tsne.kl_divergence_) + X = random_state.randn(50, 2) + tsne = TSNE(n_components=2, perplexity=2, learning_rate=100.0, + random_state=0, method=method, verbose=0, n_iter=1003) + tsne.fit_transform(X) + + assert not np.isnan(tsne.kl_divergence_) def test_barnes_hut_angle(): From 75e429217e2569f8273ee61763262ccaa67b432a Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 7 May 2018 17:13:02 +0300 Subject: [PATCH 12/25] Pytest parametrize sklearn.ensemble --- sklearn/ensemble/tests/test_forest.py | 248 +++++----- .../ensemble/tests/test_gradient_boosting.py | 451 +++++++++--------- 2 files changed, 353 insertions(+), 346 deletions(-) diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 420d10bc10bee..28d45b3b80f1a 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -10,6 +10,7 @@ import pickle from collections import defaultdict +import itertools from itertools import combinations from itertools import product @@ -18,6 +19,8 @@ from scipy.sparse import csc_matrix from scipy.sparse import coo_matrix +import pytest + from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_array_equal @@ -94,6 +97,8 @@ FOREST_ESTIMATORS.update(FOREST_REGRESSORS) FOREST_ESTIMATORS.update(FOREST_TRANSFORMERS) +FOREST_CLASSIFIERS_REGRESSORS = dict(FOREST_CLASSIFIERS, **FOREST_REGRESSORS) + def check_classification_toy(name): """Check classification on a toy dataset.""" @@ -114,9 +119,9 @@ def check_classification_toy(name): assert_equal(leaf_indices.shape, (len(X), clf.n_estimators)) -def test_classification_toy(): - for name in FOREST_CLASSIFIERS: - yield check_classification_toy, name +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +def test_classification_toy(name): + check_classification_toy(name) def check_iris_criterion(name, criterion): @@ -138,9 +143,10 @@ def check_iris_criterion(name, criterion): % (criterion, score)) -def test_iris(): - for name, criterion in product(FOREST_CLASSIFIERS, ("gini", "entropy")): - yield check_iris_criterion, name, criterion +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +@pytest.mark.parametrize('criterion', ("gini", "entropy")) +def test_iris(name, criterion): + check_iris_criterion(name, criterion) def check_boston_criterion(name, criterion): @@ -162,9 +168,10 @@ def check_boston_criterion(name, criterion): "and score = %f" % (criterion, score)) -def test_boston(): - for name, criterion in product(FOREST_REGRESSORS, ("mse", "mae", "friedman_mse")): - yield check_boston_criterion, name, criterion +@pytest.mark.parametrize('name', FOREST_REGRESSORS) +@pytest.mark.parametrize('criterion', ("mse", "mae", "friedman_mse")) +def test_boston(name, criterion): + check_boston_criterion(name, criterion) def check_regressor_attributes(name): @@ -178,9 +185,9 @@ def check_regressor_attributes(name): assert_false(hasattr(r, "n_classes_")) -def test_regressor_attributes(): - for name in FOREST_REGRESSORS: - yield check_regressor_attributes, name +@pytest.mark.parametrize('name', FOREST_REGRESSORS) +def test_regressor_attributes(name): + check_regressor_attributes(name) def 
check_probability(name): @@ -196,9 +203,9 @@ def check_probability(name): np.exp(clf.predict_log_proba(iris.data))) -def test_probability(): - for name in FOREST_CLASSIFIERS: - yield check_probability, name +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +def test_probability(name): + check_probability(name) def check_importances(name, criterion, dtype, tolerance): @@ -241,17 +248,18 @@ def check_importances(name, criterion, dtype, tolerance): assert_less(np.abs(importances - importances_bis).mean(), tolerance) -def test_importances(): - for dtype in (np.float64, np.float32): - tolerance = 0.01 - for name, criterion in product(FOREST_CLASSIFIERS, - ["gini", "entropy"]): - yield check_importances, name, criterion, dtype, tolerance - - for name, criterion in product(FOREST_REGRESSORS, - ["mse", "friedman_mse", "mae"]): - tolerance = 0.05 if criterion == "mae" else 0.01 - yield check_importances, name, criterion, dtype, tolerance +@pytest.mark.parametrize('dtype', (np.float64, np.float32)) +@pytest.mark.parametrize( + 'name, criterion', + itertools.chain(product(FOREST_CLASSIFIERS, + ["gini", "entropy"]), + product(FOREST_REGRESSORS, + ["mse", "friedman_mse", "mae"]))) +def test_importances(dtype, name, criterion): + tolerance = 0.01 + if name in FOREST_REGRESSORS and criterion == "mae": + tolerance = 0.05 + check_importances(name, criterion, dtype, tolerance) def test_importances_asymptotic(): @@ -352,9 +360,9 @@ def check_unfitted_feature_importances(name): "feature_importances_") -def test_unfitted_feature_importances(): - for name in FOREST_ESTIMATORS: - yield check_unfitted_feature_importances, name +@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +def test_unfitted_feature_importances(name): + check_unfitted_feature_importances(name) def check_oob_score(name, X, y, n_estimators=20): @@ -381,21 +389,23 @@ def check_oob_score(name, X, y, n_estimators=20): assert_warns(UserWarning, est.fit, X, y) -def test_oob_score(): - for name in FOREST_CLASSIFIERS: - yield check_oob_score, name, iris.data, iris.target +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +def test_oob_score_classifiers(name): + check_oob_score(name, iris.data, iris.target) + + # csc matrix + check_oob_score(name, csc_matrix(iris.data), iris.target) - # csc matrix - yield check_oob_score, name, csc_matrix(iris.data), iris.target + # non-contiguous targets in classification + check_oob_score(name, iris.data, iris.target * 2 + 1) - # non-contiguous targets in classification - yield check_oob_score, name, iris.data, iris.target * 2 + 1 - for name in FOREST_REGRESSORS: - yield check_oob_score, name, boston.data, boston.target, 50 +@pytest.mark.parametrize('name', FOREST_REGRESSORS) +def test_oob_score_regressors(name): + check_oob_score(name, boston.data, boston.target, 50) - # csc matrix - yield check_oob_score, name, csc_matrix(boston.data), boston.target, 50 + # csc matrix + check_oob_score(name, csc_matrix(boston.data), boston.target, 50) def check_oob_score_raise_error(name): @@ -421,9 +431,9 @@ def check_oob_score_raise_error(name): bootstrap=False).fit, X, y) -def test_oob_score_raise_error(): - for name in FOREST_ESTIMATORS: - yield check_oob_score_raise_error, name +@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +def test_oob_score_raise_error(name): + check_oob_score_raise_error(name) def check_gridsearch(name): @@ -432,10 +442,10 @@ def check_gridsearch(name): clf.fit(iris.data, iris.target) -def test_gridsearch(): +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +def test_gridsearch(name): # 
Check that base trees can be grid-searched. - for name in FOREST_CLASSIFIERS: - yield check_gridsearch, name + check_gridsearch(name) def check_parallel(name, X, y): @@ -453,12 +463,14 @@ def check_parallel(name, X, y): assert_array_almost_equal(y1, y2, 3) -def test_parallel(): - for name in FOREST_CLASSIFIERS: - yield check_parallel, name, iris.data, iris.target +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS) +def test_parallel(name): + if name in FOREST_CLASSIFIERS: + ds = iris + elif name in FOREST_REGRESSORS: + ds = boston - for name in FOREST_REGRESSORS: - yield check_parallel, name, boston.data, boston.target + check_parallel(name, ds.data, ds.target) def check_pickle(name, X, y): @@ -476,12 +488,14 @@ def check_pickle(name, X, y): assert_equal(score, score2) -def test_pickle(): - for name in FOREST_CLASSIFIERS: - yield check_pickle, name, iris.data[::2], iris.target[::2] +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS) +def test_pickle(name): + if name in FOREST_CLASSIFIERS: + ds = iris + elif name in FOREST_REGRESSORS: + ds = boston - for name in FOREST_REGRESSORS: - yield check_pickle, name, boston.data[::2], boston.target[::2] + check_pickle(name, ds.data[::2], ds.target[::2]) def check_multioutput(name): @@ -511,12 +525,9 @@ def check_multioutput(name): assert_equal(log_proba[1].shape, (4, 4)) -def test_multioutput(): - for name in FOREST_CLASSIFIERS: - yield check_multioutput, name - - for name in FOREST_REGRESSORS: - yield check_multioutput, name +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS) +def test_multioutput(name): + check_multioutput(name) def check_classes_shape(name): @@ -537,9 +548,9 @@ def check_classes_shape(name): assert_array_equal(clf.classes_, [[-1, 1], [-2, 2]]) -def test_classes_shape(): - for name in FOREST_CLASSIFIERS: - yield check_classes_shape, name +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +def test_classes_shape(name): + check_classes_shape(name) def test_random_trees_dense_type(): @@ -692,9 +703,9 @@ def check_max_leaf_nodes_max_depth(name): assert_equal(est.estimators_[0].tree_.max_depth, 1) -def test_max_leaf_nodes_max_depth(): - for name in FOREST_ESTIMATORS: - yield check_max_leaf_nodes_max_depth, name +@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +def test_max_leaf_nodes_max_depth(name): + check_max_leaf_nodes_max_depth(name) def check_min_samples_split(name): @@ -726,9 +737,9 @@ def check_min_samples_split(name): "Failed with {0}".format(name)) -def test_min_samples_split(): - for name in FOREST_ESTIMATORS: - yield check_min_samples_split, name +@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +def test_min_samples_split(name): + check_min_samples_split(name) def check_min_samples_leaf(name): @@ -763,9 +774,9 @@ def check_min_samples_leaf(name): "Failed with {0}".format(name)) -def test_min_samples_leaf(): - for name in FOREST_ESTIMATORS: - yield check_min_samples_leaf, name +@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +def test_min_samples_leaf(name): + check_min_samples_leaf(name) def check_min_weight_fraction_leaf(name): @@ -799,9 +810,9 @@ def check_min_weight_fraction_leaf(name): name, est.min_weight_fraction_leaf)) -def test_min_weight_fraction_leaf(): - for name in FOREST_ESTIMATORS: - yield check_min_weight_fraction_leaf, name +@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +def test_min_weight_fraction_leaf(name): + check_min_weight_fraction_leaf(name) def check_sparse_input(name, X, X_sparse, y): @@ -830,13 +841,14 @@ def check_sparse_input(name, X, 
X_sparse, y): dense.fit_transform(X).toarray()) -def test_sparse_input(): +@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +@pytest.mark.parametrize('sparse_matrix', + (csr_matrix, csc_matrix, coo_matrix)) +def test_sparse_input(name, sparse_matrix): X, y = datasets.make_multilabel_classification(random_state=0, n_samples=50) - for name, sparse_matrix in product(FOREST_ESTIMATORS, - (csr_matrix, csc_matrix, coo_matrix)): - yield check_sparse_input, name, X, sparse_matrix(X), y + check_sparse_input(name, X, sparse_matrix(X), y) def check_memory_layout(name, dtype): @@ -886,12 +898,10 @@ def check_memory_layout(name, dtype): assert_array_almost_equal(est.fit(X, y).predict(X), y) -def test_memory_layout(): - for name, dtype in product(FOREST_CLASSIFIERS, [np.float64, np.float32]): - yield check_memory_layout, name, dtype - - for name, dtype in product(FOREST_REGRESSORS, [np.float64, np.float32]): - yield check_memory_layout, name, dtype +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS) +@pytest.mark.parametrize('dtype', (np.float64, np.float32)) +def test_memory_layout(name, dtype): + check_memory_layout(name, dtype) @ignore_warnings @@ -907,14 +917,14 @@ def check_1d_input(name, X, X_2d, y): assert_raises(ValueError, est.predict, X) +@pytest.mark.parametrize('name', FOREST_ESTIMATORS) @ignore_warnings -def test_1d_input(): +def test_1d_input(name): X = iris.data[:, 0] X_2d = iris.data[:, 0].reshape((-1, 1)) y = iris.target - for name in FOREST_ESTIMATORS: - yield check_1d_input, name, X, X_2d, y + check_1d_input(name, X, X_2d, y) def check_class_weights(name): @@ -966,9 +976,9 @@ def check_class_weights(name): clf.fit(iris.data, iris.target, sample_weight=sample_weight) -def test_class_weights(): - for name in FOREST_CLASSIFIERS: - yield check_class_weights, name +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +def test_class_weights(name): + check_class_weights(name) def check_class_weight_balanced_and_bootstrap_multi_output(name): @@ -985,9 +995,9 @@ def check_class_weight_balanced_and_bootstrap_multi_output(name): clf.fit(X, _y) -def test_class_weight_balanced_and_bootstrap_multi_output(): - for name in FOREST_CLASSIFIERS: - yield check_class_weight_balanced_and_bootstrap_multi_output, name +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +def test_class_weight_balanced_and_bootstrap_multi_output(name): + check_class_weight_balanced_and_bootstrap_multi_output(name) def check_class_weight_errors(name): @@ -1015,9 +1025,9 @@ def check_class_weight_errors(name): assert_raises(ValueError, clf.fit, X, _y) -def test_class_weight_errors(): - for name in FOREST_CLASSIFIERS: - yield check_class_weight_errors, name +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +def test_class_weight_errors(name): + check_class_weight_errors(name) def check_warm_start(name, random_state=42): @@ -1047,9 +1057,9 @@ def check_warm_start(name, random_state=42): err_msg="Failed with {0}".format(name)) -def test_warm_start(): - for name in FOREST_ESTIMATORS: - yield check_warm_start, name +@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +def test_warm_start(name): + check_warm_start(name) def check_warm_start_clear(name): @@ -1069,9 +1079,9 @@ def check_warm_start_clear(name): assert_array_almost_equal(clf_2.apply(X), clf.apply(X)) -def test_warm_start_clear(): - for name in FOREST_ESTIMATORS: - yield check_warm_start_clear, name +@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +def test_warm_start_clear(name): + check_warm_start_clear(name) def 
check_warm_start_smaller_n_estimators(name): @@ -1084,9 +1094,9 @@ def check_warm_start_smaller_n_estimators(name): assert_raises(ValueError, clf.fit, X, y) -def test_warm_start_smaller_n_estimators(): - for name in FOREST_ESTIMATORS: - yield check_warm_start_smaller_n_estimators, name +@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +def test_warm_start_smaller_n_estimators(name): + check_warm_start_smaller_n_estimators(name) def check_warm_start_equal_n_estimators(name): @@ -1110,9 +1120,9 @@ def check_warm_start_equal_n_estimators(name): assert_array_equal(clf.apply(X), clf_2.apply(X)) -def test_warm_start_equal_n_estimators(): - for name in FOREST_ESTIMATORS: - yield check_warm_start_equal_n_estimators, name +@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +def test_warm_start_equal_n_estimators(name): + check_warm_start_equal_n_estimators(name) def check_warm_start_oob(name): @@ -1147,11 +1157,9 @@ def check_warm_start_oob(name): assert_equal(clf.oob_score_, clf_3.oob_score_) -def test_warm_start_oob(): - for name in FOREST_CLASSIFIERS: - yield check_warm_start_oob, name - for name in FOREST_REGRESSORS: - yield check_warm_start_oob, name +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS) +def test_warm_start_oob(name): + check_warm_start_oob(name) def test_dtype_convert(n_classes=15): @@ -1187,11 +1195,9 @@ def check_decision_path(name): assert_array_almost_equal(leave_indicator, np.ones(shape=n_samples)) -def test_decision_path(): - for name in FOREST_CLASSIFIERS: - yield check_decision_path, name - for name in FOREST_REGRESSORS: - yield check_decision_path, name +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS) +def test_decision_path(name): + check_decision_path(name) def test_min_impurity_split(): diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index 6711007e2701c..c18008d0b31c1 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -4,12 +4,12 @@ import warnings import numpy as np -from itertools import product - from scipy.sparse import csr_matrix from scipy.sparse import csc_matrix from scipy.sparse import coo_matrix +import pytest + from sklearn import datasets from sklearn.base import clone from sklearn.datasets import make_classification @@ -35,6 +35,9 @@ from sklearn.exceptions import DataConversionWarning from sklearn.exceptions import NotFittedError +GRADIENT_BOOSTING_ESTIMATORS = [GradientBoostingClassifier, + GradientBoostingRegressor] + # toy sample X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] y = [-1, -1, -1, 1, 1, 1] @@ -75,10 +78,10 @@ def check_classification_toy(presort, loss): assert_equal(leaves.shape, (6, 10, 1)) -def test_classification_toy(): - for presort, loss in product(('auto', True, False), - ('deviance', 'exponential')): - yield check_classification_toy, presort, loss +@pytest.mark.parametrize('presort', ('auto', True, False)) +@pytest.mark.parametrize('loss', ('deviance', 'exponential')) +def test_classification_toy(presort, loss): + check_classification_toy(presort, loss) def test_classifier_parameter_checks(): @@ -213,9 +216,10 @@ def check_classification_synthetic(presort, loss): assert_less(error_rate, 0.08) -def test_classification_synthetic(): - for presort, loss in product(('auto', True, False), ('deviance', 'exponential')): - yield check_classification_synthetic, presort, loss +@pytest.mark.parametrize('presort', ('auto', True, False)) 
+@pytest.mark.parametrize('loss', ('deviance', 'exponential')) +def test_classification_synthetic(presort, loss): + check_classification_synthetic(presort, loss) def check_boston(presort, loss, subsample): @@ -248,11 +252,11 @@ def check_boston(presort, loss, subsample): last_y_pred = y_pred -def test_boston(): - for presort, loss, subsample in product(('auto', True, False), - ('ls', 'lad', 'huber'), - (1.0, 0.5)): - yield check_boston, presort, loss, subsample +@pytest.mark.parametrize('presort', ('auto', True, False)) +@pytest.mark.parametrize('loss', ('ls', 'lad', 'huber')) +@pytest.mark.parametrize('subsample', (1.0, 0.5)) +def test_boston(presort, loss, subsample): + check_boston(presort, loss, subsample) def check_iris(presort, subsample, sample_weight): @@ -270,12 +274,13 @@ def check_iris(presort, subsample, sample_weight): assert_equal(leaves.shape, (150, 100, 3)) -def test_iris(): - ones = np.ones(len(iris.target)) - for presort, subsample, sample_weight in product(('auto', True, False), - (1.0, 0.5), - (None, ones)): - yield check_iris, presort, subsample, sample_weight +@pytest.mark.parametrize('presort', ('auto', True, False)) +@pytest.mark.parametrize('subsample', (1.0, 0.5)) +@pytest.mark.parametrize('sample_weight', (None, 1)) +def test_iris(presort, subsample, sample_weight): + if sample_weight == 1: + sample_weight = np.ones(len(iris.target)) + check_iris(presort, subsample, sample_weight) def test_regression_synthetic(): @@ -531,23 +536,23 @@ def test_staged_predict_proba(): assert_array_almost_equal(clf.predict_proba(X_test), staged_proba) -def test_staged_functions_defensive(): +@pytest.mark.parametrize('Estimator', GRADIENT_BOOSTING_ESTIMATORS) +def test_staged_functions_defensive(Estimator): # test that staged_functions make defensive copies rng = np.random.RandomState(0) X = rng.uniform(size=(10, 3)) y = (4 * X[:, 0]).astype(np.int) + 1 # don't predict zeros - for estimator in [GradientBoostingRegressor(), - GradientBoostingClassifier()]: - estimator.fit(X, y) - for func in ['predict', 'decision_function', 'predict_proba']: - staged_func = getattr(estimator, "staged_" + func, None) - if staged_func is None: - # regressor has no staged_predict_proba - continue - with warnings.catch_warnings(record=True): - staged_result = list(staged_func(X)) - staged_result[1][:] = 0 - assert_true(np.all(staged_result[0] != 0)) + estimator = Estimator() + estimator.fit(X, y) + for func in ['predict', 'decision_function', 'predict_proba']: + staged_func = getattr(estimator, "staged_" + func, None) + if staged_func is None: + # regressor has no staged_predict_proba + continue + with warnings.catch_warnings(record=True): + staged_result = list(staged_func(X)) + staged_result[1][:] = 0 + assert_true(np.all(staged_result[0] != 0)) def test_serialization(): @@ -753,188 +758,187 @@ def test_more_verbose_output(): assert_equal(100, n_lines) -def test_warm_start(): +@pytest.mark.parametrize('Cls', GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start(Cls): # Test if warm start equals fit. 
X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]: - est = Cls(n_estimators=200, max_depth=1) - est.fit(X, y) - - est_ws = Cls(n_estimators=100, max_depth=1, warm_start=True) - est_ws.fit(X, y) - est_ws.set_params(n_estimators=200) - est_ws.fit(X, y) - - if Cls is GradientBoostingRegressor: - assert_array_almost_equal(est_ws.predict(X), est.predict(X)) - else: - # Random state is preserved and hence predict_proba must also be - # same - assert_array_equal(est_ws.predict(X), est.predict(X)) - assert_array_almost_equal(est_ws.predict_proba(X), - est.predict_proba(X)) - - -def test_warm_start_n_estimators(): + est = Cls(n_estimators=200, max_depth=1) + est.fit(X, y) + + est_ws = Cls(n_estimators=100, max_depth=1, warm_start=True) + est_ws.fit(X, y) + est_ws.set_params(n_estimators=200) + est_ws.fit(X, y) + + if Cls is GradientBoostingRegressor: + assert_array_almost_equal(est_ws.predict(X), est.predict(X)) + else: + # Random state is preserved and hence predict_proba must also be + # same + assert_array_equal(est_ws.predict(X), est.predict(X)) + assert_array_almost_equal(est_ws.predict_proba(X), + est.predict_proba(X)) + + +@pytest.mark.parametrize('Cls', GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_n_estimators(Cls): # Test if warm start equals fit - set n_estimators. X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]: - est = Cls(n_estimators=300, max_depth=1) - est.fit(X, y) + est = Cls(n_estimators=300, max_depth=1) + est.fit(X, y) - est_ws = Cls(n_estimators=100, max_depth=1, warm_start=True) - est_ws.fit(X, y) - est_ws.set_params(n_estimators=300) - est_ws.fit(X, y) + est_ws = Cls(n_estimators=100, max_depth=1, warm_start=True) + est_ws.fit(X, y) + est_ws.set_params(n_estimators=300) + est_ws.fit(X, y) assert_array_almost_equal(est_ws.predict(X), est.predict(X)) -def test_warm_start_max_depth(): +@pytest.mark.parametrize('Cls', GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_max_depth(Cls): # Test if possible to fit trees of different depth in ensemble. X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]: - est = Cls(n_estimators=100, max_depth=1, warm_start=True) - est.fit(X, y) - est.set_params(n_estimators=110, max_depth=2) - est.fit(X, y) + est = Cls(n_estimators=100, max_depth=1, warm_start=True) + est.fit(X, y) + est.set_params(n_estimators=110, max_depth=2) + est.fit(X, y) - # last 10 trees have different depth - assert_equal(est.estimators_[0, 0].max_depth, 1) - for i in range(1, 11): - assert_equal(est.estimators_[-i, 0].max_depth, 2) + # last 10 trees have different depth + assert_equal(est.estimators_[0, 0].max_depth, 1) + for i in range(1, 11): + assert_equal(est.estimators_[-i, 0].max_depth, 2) -def test_warm_start_clear(): +@pytest.mark.parametrize('Cls', GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_clear(Cls): # Test if fit clears state. 
X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]: - est = Cls(n_estimators=100, max_depth=1) - est.fit(X, y) + est = Cls(n_estimators=100, max_depth=1) + est.fit(X, y) - est_2 = Cls(n_estimators=100, max_depth=1, warm_start=True) - est_2.fit(X, y) # inits state - est_2.set_params(warm_start=False) - est_2.fit(X, y) # clears old state and equals est + est_2 = Cls(n_estimators=100, max_depth=1, warm_start=True) + est_2.fit(X, y) # inits state + est_2.set_params(warm_start=False) + est_2.fit(X, y) # clears old state and equals est - assert_array_almost_equal(est_2.predict(X), est.predict(X)) + assert_array_almost_equal(est_2.predict(X), est.predict(X)) -def test_warm_start_zero_n_estimators(): +@pytest.mark.parametrize('Cls', GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_zero_n_estimators(Cls): # Test if warm start with zero n_estimators raises error X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]: - est = Cls(n_estimators=100, max_depth=1, warm_start=True) - est.fit(X, y) - est.set_params(n_estimators=0) - assert_raises(ValueError, est.fit, X, y) + est = Cls(n_estimators=100, max_depth=1, warm_start=True) + est.fit(X, y) + est.set_params(n_estimators=0) + assert_raises(ValueError, est.fit, X, y) -def test_warm_start_smaller_n_estimators(): +@pytest.mark.parametrize('Cls', GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_smaller_n_estimators(Cls): # Test if warm start with smaller n_estimators raises error X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]: - est = Cls(n_estimators=100, max_depth=1, warm_start=True) - est.fit(X, y) - est.set_params(n_estimators=99) - assert_raises(ValueError, est.fit, X, y) + est = Cls(n_estimators=100, max_depth=1, warm_start=True) + est.fit(X, y) + est.set_params(n_estimators=99) + assert_raises(ValueError, est.fit, X, y) -def test_warm_start_equal_n_estimators(): +@pytest.mark.parametrize('Cls', GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_equal_n_estimators(Cls): # Test if warm start with equal n_estimators does nothing X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]: - est = Cls(n_estimators=100, max_depth=1) - est.fit(X, y) + est = Cls(n_estimators=100, max_depth=1) + est.fit(X, y) - est2 = clone(est) - est2.set_params(n_estimators=est.n_estimators, warm_start=True) - est2.fit(X, y) + est2 = clone(est) + est2.set_params(n_estimators=est.n_estimators, warm_start=True) + est2.fit(X, y) - assert_array_almost_equal(est2.predict(X), est.predict(X)) + assert_array_almost_equal(est2.predict(X), est.predict(X)) -def test_warm_start_oob_switch(): +@pytest.mark.parametrize('Cls', GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_oob_switch(Cls): # Test if oob can be turned on during warm start. 
X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]: - est = Cls(n_estimators=100, max_depth=1, warm_start=True) - est.fit(X, y) - est.set_params(n_estimators=110, subsample=0.5) - est.fit(X, y) + est = Cls(n_estimators=100, max_depth=1, warm_start=True) + est.fit(X, y) + est.set_params(n_estimators=110, subsample=0.5) + est.fit(X, y) - assert_array_equal(est.oob_improvement_[:100], np.zeros(100)) - # the last 10 are not zeros - assert_array_equal(est.oob_improvement_[-10:] == 0.0, - np.zeros(10, dtype=np.bool)) + assert_array_equal(est.oob_improvement_[:100], np.zeros(100)) + # the last 10 are not zeros + assert_array_equal(est.oob_improvement_[-10:] == 0.0, + np.zeros(10, dtype=np.bool)) -def test_warm_start_oob(): +@pytest.mark.parametrize('Cls', GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_oob(Cls): # Test if warm start OOB equals fit. X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]: - est = Cls(n_estimators=200, max_depth=1, subsample=0.5, - random_state=1) - est.fit(X, y) + est = Cls(n_estimators=200, max_depth=1, subsample=0.5, + random_state=1) + est.fit(X, y) - est_ws = Cls(n_estimators=100, max_depth=1, subsample=0.5, - random_state=1, warm_start=True) - est_ws.fit(X, y) - est_ws.set_params(n_estimators=200) - est_ws.fit(X, y) + est_ws = Cls(n_estimators=100, max_depth=1, subsample=0.5, + random_state=1, warm_start=True) + est_ws.fit(X, y) + est_ws.set_params(n_estimators=200) + est_ws.fit(X, y) - assert_array_almost_equal(est_ws.oob_improvement_[:100], - est.oob_improvement_[:100]) + assert_array_almost_equal(est_ws.oob_improvement_[:100], + est.oob_improvement_[:100]) -def test_warm_start_sparse(): +@pytest.mark.parametrize('Cls', GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_sparse(Cls): # Test that all sparse matrix types are supported X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) sparse_matrix_type = [csr_matrix, csc_matrix, coo_matrix] - for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]: - est_dense = Cls(n_estimators=100, max_depth=1, subsample=0.5, - random_state=1, warm_start=True) - est_dense.fit(X, y) - est_dense.predict(X) - est_dense.set_params(n_estimators=200) - est_dense.fit(X, y) - y_pred_dense = est_dense.predict(X) - - for sparse_constructor in sparse_matrix_type: - X_sparse = sparse_constructor(X) - - est_sparse = Cls(n_estimators=100, max_depth=1, subsample=0.5, - random_state=1, warm_start=True) - est_sparse.fit(X_sparse, y) - est_sparse.predict(X) - est_sparse.set_params(n_estimators=200) - est_sparse.fit(X_sparse, y) - y_pred_sparse = est_sparse.predict(X) - - assert_array_almost_equal(est_dense.oob_improvement_[:100], - est_sparse.oob_improvement_[:100]) - assert_array_almost_equal(y_pred_dense, y_pred_sparse) - - -def test_warm_start_fortran(): + est_dense = Cls(n_estimators=100, max_depth=1, subsample=0.5, + random_state=1, warm_start=True) + est_dense.fit(X, y) + est_dense.predict(X) + est_dense.set_params(n_estimators=200) + est_dense.fit(X, y) + y_pred_dense = est_dense.predict(X) + + for sparse_constructor in sparse_matrix_type: + X_sparse = sparse_constructor(X) + + est_sparse = Cls(n_estimators=100, max_depth=1, subsample=0.5, + random_state=1, warm_start=True) + est_sparse.fit(X_sparse, y) + est_sparse.predict(X) + est_sparse.set_params(n_estimators=200) + est_sparse.fit(X_sparse, y) + y_pred_sparse = est_sparse.predict(X) + + 
assert_array_almost_equal(est_dense.oob_improvement_[:100], + est_sparse.oob_improvement_[:100]) + assert_array_almost_equal(y_pred_dense, y_pred_sparse) + + +@pytest.mark.parametrize('Cls', GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_fortran(Cls): # Test that feeding a X in Fortran-ordered is giving the same results as # in C-ordered X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]: - est_c = Cls(n_estimators=1, random_state=1, warm_start=True) - est_fortran = Cls(n_estimators=1, random_state=1, warm_start=True) + est_c = Cls(n_estimators=1, random_state=1, warm_start=True) + est_fortran = Cls(n_estimators=1, random_state=1, warm_start=True) - est_c.fit(X, y) - est_c.set_params(n_estimators=11) - est_c.fit(X, y) + est_c.fit(X, y) + est_c.set_params(n_estimators=11) + est_c.fit(X, y) - X_fortran = np.asfortranarray(X) - est_fortran.fit(X_fortran, y) - est_fortran.set_params(n_estimators=11) - est_fortran.fit(X_fortran, y) + X_fortran = np.asfortranarray(X) + est_fortran.fit(X_fortran, y) + est_fortran.set_params(n_estimators=11) + est_fortran.fit(X_fortran, y) - assert_array_almost_equal(est_c.predict(X), - est_fortran.predict(X)) + assert_array_almost_equal(est_c.predict(X), est_fortran.predict(X)) def early_stopping_monitor(i, est, locals): @@ -945,40 +949,40 @@ def early_stopping_monitor(i, est, locals): return False -def test_monitor_early_stopping(): +@pytest.mark.parametrize('Cls', GRADIENT_BOOSTING_ESTIMATORS) +def test_monitor_early_stopping(Cls): # Test if monitor return value works. X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]: - est = Cls(n_estimators=20, max_depth=1, random_state=1, subsample=0.5) - est.fit(X, y, monitor=early_stopping_monitor) - assert_equal(est.n_estimators, 20) # this is not altered - assert_equal(est.estimators_.shape[0], 10) - assert_equal(est.train_score_.shape[0], 10) - assert_equal(est.oob_improvement_.shape[0], 10) - - # try refit - est.set_params(n_estimators=30) - est.fit(X, y) - assert_equal(est.n_estimators, 30) - assert_equal(est.estimators_.shape[0], 30) - assert_equal(est.train_score_.shape[0], 30) - - est = Cls(n_estimators=20, max_depth=1, random_state=1, subsample=0.5, - warm_start=True) - est.fit(X, y, monitor=early_stopping_monitor) - assert_equal(est.n_estimators, 20) - assert_equal(est.estimators_.shape[0], 10) - assert_equal(est.train_score_.shape[0], 10) - assert_equal(est.oob_improvement_.shape[0], 10) - - # try refit - est.set_params(n_estimators=30, warm_start=False) - est.fit(X, y) - assert_equal(est.n_estimators, 30) - assert_equal(est.train_score_.shape[0], 30) - assert_equal(est.estimators_.shape[0], 30) - assert_equal(est.oob_improvement_.shape[0], 30) + est = Cls(n_estimators=20, max_depth=1, random_state=1, subsample=0.5) + est.fit(X, y, monitor=early_stopping_monitor) + assert_equal(est.n_estimators, 20) # this is not altered + assert_equal(est.estimators_.shape[0], 10) + assert_equal(est.train_score_.shape[0], 10) + assert_equal(est.oob_improvement_.shape[0], 10) + + # try refit + est.set_params(n_estimators=30) + est.fit(X, y) + assert_equal(est.n_estimators, 30) + assert_equal(est.estimators_.shape[0], 30) + assert_equal(est.train_score_.shape[0], 30) + + est = Cls(n_estimators=20, max_depth=1, random_state=1, subsample=0.5, + warm_start=True) + est.fit(X, y, monitor=early_stopping_monitor) + assert_equal(est.n_estimators, 20) + 
assert_equal(est.estimators_.shape[0], 10) + assert_equal(est.train_score_.shape[0], 10) + assert_equal(est.oob_improvement_.shape[0], 10) + + # try refit + est.set_params(n_estimators=30, warm_start=False) + est.fit(X, y) + assert_equal(est.n_estimators, 30) + assert_equal(est.train_score_.shape[0], 30) + assert_equal(est.estimators_.shape[0], 30) + assert_equal(est.oob_improvement_.shape[0], 30) def test_complete_classification(): @@ -1062,48 +1066,45 @@ def test_zero_estimator_clf(): assert_raises(ValueError, est.fit, X, y) -def test_max_leaf_nodes_max_depth(): +@pytest.mark.parametrize('GBEstimator', GRADIENT_BOOSTING_ESTIMATORS) +def test_max_leaf_nodes_max_depth(GBEstimator): # Test precedence of max_leaf_nodes over max_depth. X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - all_estimators = [GradientBoostingRegressor, - GradientBoostingClassifier] k = 4 - for GBEstimator in all_estimators: - est = GBEstimator(max_depth=1, max_leaf_nodes=k).fit(X, y) - tree = est.estimators_[0, 0].tree_ - assert_greater(tree.max_depth, 1) - est = GBEstimator(max_depth=1).fit(X, y) - tree = est.estimators_[0, 0].tree_ - assert_equal(tree.max_depth, 1) + est = GBEstimator(max_depth=1, max_leaf_nodes=k).fit(X, y) + tree = est.estimators_[0, 0].tree_ + assert_greater(tree.max_depth, 1) + est = GBEstimator(max_depth=1).fit(X, y) + tree = est.estimators_[0, 0].tree_ + assert_equal(tree.max_depth, 1) -def test_min_impurity_split(): + +@pytest.mark.parametrize('GBEstimator', GRADIENT_BOOSTING_ESTIMATORS) +def test_min_impurity_split(GBEstimator): # Test if min_impurity_split of base estimators is set # Regression test for #8006 X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - all_estimators = [GradientBoostingRegressor, GradientBoostingClassifier] - for GBEstimator in all_estimators: - est = GBEstimator(min_impurity_split=0.1) - est = assert_warns_message(DeprecationWarning, "min_impurity_decrease", - est.fit, X, y) - for tree in est.estimators_.flat: - assert_equal(tree.min_impurity_split, 0.1) + est = GBEstimator(min_impurity_split=0.1) + est = assert_warns_message(DeprecationWarning, "min_impurity_decrease", + est.fit, X, y) + for tree in est.estimators_.flat: + assert_equal(tree.min_impurity_split, 0.1) -def test_min_impurity_decrease(): +@pytest.mark.parametrize('GBEstimator', GRADIENT_BOOSTING_ESTIMATORS) +def test_min_impurity_decrease(GBEstimator): X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - all_estimators = [GradientBoostingRegressor, GradientBoostingClassifier] - for GBEstimator in all_estimators: - est = GBEstimator(min_impurity_decrease=0.1) - est.fit(X, y) - for tree in est.estimators_.flat: - # Simply check if the parameter is passed on correctly. Tree tests - # will suffice for the actual working of this param - assert_equal(tree.min_impurity_decrease, 0.1) + est = GBEstimator(min_impurity_decrease=0.1) + est.fit(X, y) + for tree in est.estimators_.flat: + # Simply check if the parameter is passed on correctly. 
Tree tests + # will suffice for the actual working of this param + assert_equal(tree.min_impurity_decrease, 0.1) def test_warm_start_wo_nestimators_change(): @@ -1211,18 +1212,18 @@ def check_sparse_input(EstimatorClass, X, X_sparse, y): @skip_if_32bit -def test_sparse_input(): - ests = (GradientBoostingClassifier, GradientBoostingRegressor) - sparse_matrices = (csr_matrix, csc_matrix, coo_matrix) - +@pytest.mark.parametrize( + 'EstimatorClass', + (GradientBoostingClassifier, GradientBoostingRegressor)) +@pytest.mark.parametrize('sparse_matrix', (csr_matrix, csc_matrix, coo_matrix)) +def test_sparse_input(EstimatorClass, sparse_matrix): y, X = datasets.make_multilabel_classification(random_state=0, n_samples=50, n_features=1, n_classes=20) y = y[:, 0] - for EstimatorClass, sparse_matrix in product(ests, sparse_matrices): - yield check_sparse_input, EstimatorClass, X, sparse_matrix(X), y + check_sparse_input(EstimatorClass, X, sparse_matrix(X), y) def test_gradient_boosting_early_stopping(): From a060efa5d0a2533b8b8bec08ce95bce92bf8b86e Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 7 May 2018 17:52:17 +0300 Subject: [PATCH 13/25] Fix CI --- sklearn/ensemble/tests/test_forest.py | 4 ++-- sklearn/utils/testing.py | 23 +++++++---------------- 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 28d45b3b80f1a..b6db56b0e0980 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -918,13 +918,13 @@ def check_1d_input(name, X, X_2d, y): @pytest.mark.parametrize('name', FOREST_ESTIMATORS) -@ignore_warnings def test_1d_input(name): X = iris.data[:, 0] X_2d = iris.data[:, 0].reshape((-1, 1)) y = iris.target - check_1d_input(name, X, X_2d, y) + with ignore_warnings(): + check_1d_input(name, X, X_2d, y) def check_class_weights(name): diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 561ff22293532..98a5e3e98b00d 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -49,6 +49,9 @@ from sklearn.utils.fixes import signature from sklearn.utils import deprecated +import pytest + + additional_names_in_all = [] try: from nose.tools import raises as _nose_raises @@ -688,16 +691,10 @@ def run_test(*args, **kwargs): return run_test -def skip_if_32bit(func): - """Test decorator that skips tests on 32bit platforms.""" - @wraps(func) - def run_test(*args, **kwargs): - bits = 8 * struct.calcsize("P") - if bits == 32: - raise SkipTest('Test skipped on 32bit platforms.') - else: - return func(*args, **kwargs) - return run_test +skip_if_32bit = pytest.mark.skipif(8 * struct.calcsize("P") == 32, + reason='skipped on 32bit platforms') +skip_travis = pytest.mark.skipif(os.environ.get('TRAVIS') == 'true', + reason='skip on travis') def if_safe_multiprocessing_with_blas(func): @@ -744,12 +741,6 @@ def check_skip_network(): raise SkipTest("Text tutorial requires large dataset download") -def check_skip_travis(): - """Skip test if being run on Travis.""" - if os.environ.get('TRAVIS') == "true": - raise SkipTest("This test needs to be skipped on Travis") - - def _delete_folder(folder_path, warn=False): """Utility function to cleanup a temporary folder if still existing. 
From f96210ae54623e3b1d07dcba64be055d1c72d34d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 7 May 2018 18:13:45 +0300 Subject: [PATCH 14/25] Fix CI --- sklearn/ensemble/tests/test_forest.py | 4 +-- sklearn/mixture/tests/test_gmm.py | 4 +-- sklearn/utils/testing.py | 47 ++++++++++++--------------- sklearn/utils/tests/test_stats.py | 8 +++-- 4 files changed, 30 insertions(+), 33 deletions(-) diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 28d45b3b80f1a..b6db56b0e0980 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -918,13 +918,13 @@ def check_1d_input(name, X, X_2d, y): @pytest.mark.parametrize('name', FOREST_ESTIMATORS) -@ignore_warnings def test_1d_input(name): X = iris.data[:, 0] X_2d = iris.data[:, 0].reshape((-1, 1)) y = iris.target - check_1d_input(name, X, X_2d, y) + with ignore_warnings(): + check_1d_input(name, X, X_2d, y) def check_class_weights(name): diff --git a/sklearn/mixture/tests/test_gmm.py b/sklearn/mixture/tests/test_gmm.py index 9e2288413f4ea..b2fa6880beb4b 100644 --- a/sklearn/mixture/tests/test_gmm.py +++ b/sklearn/mixture/tests/test_gmm.py @@ -162,9 +162,9 @@ def test_GMM_attributes(): with pytest.raises(ValueError): g._set_covars([]) with pytest.raises(ValueError): - g._set_covars( np.zeros((n_components - 2, n_features))) + g._set_covars(np.zeros((n_components - 2, n_features))) with pytest.raises(ValueError): - mixture.GMM( n_components=20, covariance_type='badcovariance_type') + mixture.GMM(n_components=20, covariance_type='badcovariance_type') class GMMTester(): diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 98a5e3e98b00d..4b72c0915bf26 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -697,32 +697,27 @@ def run_test(*args, **kwargs): reason='skip on travis') -def if_safe_multiprocessing_with_blas(func): - """Decorator for tests involving both BLAS calls and multiprocessing. - - Under POSIX (e.g. Linux or OSX), using multiprocessing in conjunction with - some implementation of BLAS (or other libraries that manage an internal - posix thread pool) can cause a crash or a freeze of the Python process. - - In practice all known packaged distributions (from Linux distros or - Anaconda) of BLAS under Linux seems to be safe. So we this problem seems to - only impact OSX users. - - This wrapper makes it possible to skip tests that can possibly cause - this crash under OS X with. - - Under Python 3.4+ it is possible to use the `forkserver` start method - for multiprocessing to avoid this issue. However it can cause pickling - errors on interactively defined functions. It therefore not enabled by - default. - """ - @wraps(func) - def run_test(*args, **kwargs): - if sys.platform == 'darwin': - raise SkipTest( - "Possible multi-process bug with some BLAS") - return func(*args, **kwargs) - return run_test +# Decorator for tests involving both BLAS calls and multiprocessing. +# +# Under POSIX (e.g. Linux or OSX), using multiprocessing in conjunction with +# some implementation of BLAS (or other libraries that manage an internal +# posix thread pool) can cause a crash or a freeze of the Python process. +# +# In practice all known packaged distributions (from Linux distros or +# Anaconda) of BLAS under Linux seems to be safe. So we this problem seems to +# only impact OSX users. +# +# This wrapper makes it possible to skip tests that can possibly cause +# this crash under OS X with. 
+# +# Under Python 3.4+ it is possible to use the `forkserver` start method +# for multiprocessing to avoid this issue. However it can cause pickling +# errors on interactively defined functions. It therefore not enabled by +# default. + +if_safe_multiprocessing_with_blas = pytest.mark.skipif( + sys.platform == 'darwin', + reason="Possible multi-process bug with some BLAS") def clean_warning_registry(): diff --git a/sklearn/utils/tests/test_stats.py b/sklearn/utils/tests/test_stats.py index c1a6ffd907bbe..36e3bf72b609b 100644 --- a/sklearn/utils/tests/test_stats.py +++ b/sklearn/utils/tests/test_stats.py @@ -14,8 +14,10 @@ ) -@ignore_warnings # Test deprecated backport to be removed in 0.21 @pytest.mark.parametrize("values, method, expected", _cases) def test_cases_rankdata(values, method, expected): - r = rankdata(values, method=method) - assert_array_equal(r, expected) + + # Test deprecated backport to be removed in 0.21 + with ignore_warnings(): + r = rankdata(values, method=method) + assert_array_equal(r, expected) From a581f55e65fd18d21411e9c93b89b2854d032684 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 8 May 2018 12:21:22 +0300 Subject: [PATCH 15/25] Ensure sklearn/utils/testing.py doesn't import pytest --- sklearn/utils/testing.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 98a5e3e98b00d..0619bfd5015c7 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -49,9 +49,6 @@ from sklearn.utils.fixes import signature from sklearn.utils import deprecated -import pytest - - additional_names_in_all = [] try: from nose.tools import raises as _nose_raises @@ -691,10 +688,31 @@ def run_test(*args, **kwargs): return run_test -skip_if_32bit = pytest.mark.skipif(8 * struct.calcsize("P") == 32, - reason='skipped on 32bit platforms') -skip_travis = pytest.mark.skipif(os.environ.get('TRAVIS') == 'true', - reason='skip on travis') +try: + import pytest + + skip_if_32bit = pytest.mark.skipif(8 * struct.calcsize("P") == 32, + reason='skipped on 32bit platforms') + skip_travis = pytest.mark.skipif(os.environ.get('TRAVIS') == 'true', + reason='skip on travis') + +except ImportError: + + def skip_if_32bit(func): + """Test decorator that skips tests on 32bit platforms.""" + @wraps(func) + def run_test(*args, **kwargs): + bits = 8 * struct.calcsize("P") + if bits == 32: + raise SkipTest('Test skipped on 32bit platforms.') + else: + return func(*args, **kwargs) + return run_test + + def skip_travis(): + """Skip test if being run on Travis.""" + if os.environ.get('TRAVIS') == "true": + raise SkipTest("This test needs to be skipped on Travis") def if_safe_multiprocessing_with_blas(func): From 501bae7f4222db76aae251833a751bbbd111caf0 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 17 May 2018 13:41:36 +0200 Subject: [PATCH 16/25] Don't use dict(x, **y) --- sklearn/ensemble/tests/test_forest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index b6db56b0e0980..0054707ba5a0d 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -97,7 +97,8 @@ FOREST_ESTIMATORS.update(FOREST_REGRESSORS) FOREST_ESTIMATORS.update(FOREST_TRANSFORMERS) -FOREST_CLASSIFIERS_REGRESSORS = dict(FOREST_CLASSIFIERS, **FOREST_REGRESSORS) +FOREST_CLASSIFIERS_REGRESSORS = FOREST_CLASSIFIERS.copy() +FOREST_CLASSIFIERS_REGRESSORS.update(FOREST_REGRESSORS) 
def check_classification_toy(name): From 0bc2ccb8adcaed3580f8716c801f9249b295198d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 17 May 2018 13:52:32 +0200 Subject: [PATCH 17/25] Remove skip decorators for nose from sklearn/utils/testing.py --- sklearn/utils/testing.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 0619bfd5015c7..6f959687c56ed 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -697,22 +697,7 @@ def run_test(*args, **kwargs): reason='skip on travis') except ImportError: - - def skip_if_32bit(func): - """Test decorator that skips tests on 32bit platforms.""" - @wraps(func) - def run_test(*args, **kwargs): - bits = 8 * struct.calcsize("P") - if bits == 32: - raise SkipTest('Test skipped on 32bit platforms.') - else: - return func(*args, **kwargs) - return run_test - - def skip_travis(): - """Skip test if being run on Travis.""" - if os.environ.get('TRAVIS') == "true": - raise SkipTest("This test needs to be skipped on Travis") + pass def if_safe_multiprocessing_with_blas(func): From 4f4c543509e1a22bdee4c718337bd0a8188d1a8f Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 24 May 2018 10:27:50 +0200 Subject: [PATCH 18/25] Fix Py2 compatibility in test_mldata tmpdir fixture --- sklearn/datasets/tests/test_mldata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/tests/test_mldata.py b/sklearn/datasets/tests/test_mldata.py index 2863290216522..c8b98d8f51f43 100644 --- a/sklearn/datasets/tests/test_mldata.py +++ b/sklearn/datasets/tests/test_mldata.py @@ -18,7 +18,7 @@ @pytest.fixture def tmpdata(tmpdir): - os.makedirs(tmpdir.join('mldata')) + os.makedirs(str(tmpdir.join('mldata'))) yield tmpdir From b4b15d121b1a5d8cc98597b7edae0a27da400ed9 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 24 May 2018 10:27:50 +0200 Subject: [PATCH 19/25] Fix Py2 compatibility in test_mldata tmpdir fixture --- sklearn/datasets/tests/test_mldata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/tests/test_mldata.py b/sklearn/datasets/tests/test_mldata.py index 2863290216522..ea34137979ffe 100644 --- a/sklearn/datasets/tests/test_mldata.py +++ b/sklearn/datasets/tests/test_mldata.py @@ -18,8 +18,8 @@ @pytest.fixture def tmpdata(tmpdir): - os.makedirs(tmpdir.join('mldata')) - yield tmpdir + os.makedirs(str(tmpdir.join('mldata'))) + yield str(tmpdir) def test_mldata_filename(): From 332abb3ce3ac4ad8a5ae9aace41687bd26e71224 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 25 May 2018 22:55:35 +0200 Subject: [PATCH 20/25] Fix Python 2 compatibility --- sklearn/datasets/tests/test_base.py | 32 ++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py index a58e515a8f760..9d7fd4dc80af3 100644 --- a/sklearn/datasets/tests/test_base.py +++ b/sklearn/datasets/tests/test_base.py @@ -40,34 +40,34 @@ def _remove_dir(path): @pytest.fixture(scope="module") def data_home(tmpdir_factory): - return tmpdir_factory.mktemp("scikit_learn_data_home_test") + tmp_file = str(tmpdir_factory.mktemp("scikit_learn_data_home_test")) + yield tmp_file + _remove_dir(tmp_file) @pytest.fixture(scope="module") def load_files_root(tmpdir_factory): - return tmpdir_factory.mktemp("scikit_learn_load_files_test") + tmp_file = str(tmpdir_factory.mktemp("scikit_learn_load_files_test")) + yield tmp_file + 
_remove_dir(tmp_file) @pytest.fixture def test_category_dir_1(load_files_root): - try: - TEST_CATEGORY_DIR1 = tempfile.mkdtemp(dir=load_files_root) - sample_file = tempfile.NamedTemporaryFile(dir=TEST_CATEGORY_DIR1, - delete=False) - sample_file.write(b("Hello World!\n")) - sample_file.close() - yield TEST_CATEGORY_DIR1 - finally: - _remove_dir(TEST_CATEGORY_DIR1) + TEST_CATEGORY_DIR1 = tempfile.mkdtemp(dir=load_files_root) + sample_file = tempfile.NamedTemporaryFile(dir=TEST_CATEGORY_DIR1, + delete=False) + sample_file.write(b("Hello World!\n")) + sample_file.close() + yield str(TEST_CATEGORY_DIR1) + _remove_dir(TEST_CATEGORY_DIR1) @pytest.fixture def test_category_dir_2(load_files_root): - try: - TEST_CATEGORY_DIR2 = tempfile.mkdtemp(dir=load_files_root) - yield TEST_CATEGORY_DIR2 - finally: - _remove_dir(TEST_CATEGORY_DIR2) + TEST_CATEGORY_DIR2 = tempfile.mkdtemp(dir=load_files_root) + yield str(TEST_CATEGORY_DIR2) + _remove_dir(TEST_CATEGORY_DIR2) def test_data_home(data_home): From 9e07655df58013a6f16247eb707162051830a722 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 26 May 2018 00:36:48 +0200 Subject: [PATCH 21/25] PEP8 --- sklearn/metrics/tests/test_pairwise.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 1897af3c88318..6c490903c30fe 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -2,7 +2,6 @@ import numpy as np from numpy import linalg -import pytest from scipy.sparse import dok_matrix, csr_matrix, issparse from scipy.spatial.distance import cosine, cityblock, minkowski, wminkowski From 75f16c55f577cb6f1ce6c5df2c524bdb6f3b0290 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 1 Jun 2018 12:02:30 +0200 Subject: [PATCH 22/25] PEP8 --- sklearn/datasets/tests/test_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py index f375ff244ee73..bf03c4e3075a6 100644 --- a/sklearn/datasets/tests/test_base.py +++ b/sklearn/datasets/tests/test_base.py @@ -43,7 +43,7 @@ def data_home(tmpdir_factory): tmp_file = str(tmpdir_factory.mktemp("scikit_learn_data_home_test")) yield tmp_file _remove_dir(tmp_file) - + @pytest.fixture(scope="module") def load_files_root(tmpdir_factory): @@ -51,7 +51,7 @@ def load_files_root(tmpdir_factory): yield tmp_file _remove_dir(tmp_file) - + @pytest.fixture def test_category_dir_1(load_files_root): test_category_dir1 = tempfile.mkdtemp(dir=load_files_root) From 645fcd56fbbbbf0563c37a71d3386f6923eacf5b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 5 Jun 2018 22:28:01 +0200 Subject: [PATCH 23/25] Hanmin's comments --- sklearn/mixture/tests/test_gmm.py | 13 ++++++------- sklearn/tree/tests/test_tree.py | 5 ++--- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/sklearn/mixture/tests/test_gmm.py b/sklearn/mixture/tests/test_gmm.py index b2fa6880beb4b..134c0493cf558 100644 --- a/sklearn/mixture/tests/test_gmm.py +++ b/sklearn/mixture/tests/test_gmm.py @@ -18,7 +18,7 @@ from sklearn.datasets.samples_generator import make_spd_matrix from sklearn.utils.testing import (assert_true, assert_greater, assert_raise_message, assert_warns_message, - ignore_warnings) + ignore_warnings, assert_raises) from sklearn.metrics.cluster import adjusted_rand_score from sklearn.externals.six.moves import cStringIO as StringIO @@ -159,12 +159,11 @@ def test_GMM_attributes(): covars = (0.1 + 2 * rng.rand(n_components, 
n_features)) ** 2 g.covars_ = covars assert_array_almost_equal(g.covars_, covars) - with pytest.raises(ValueError): - g._set_covars([]) - with pytest.raises(ValueError): - g._set_covars(np.zeros((n_components - 2, n_features))) - with pytest.raises(ValueError): - mixture.GMM(n_components=20, covariance_type='badcovariance_type') + assert_raises(ValueError, g._set_covars, []) + assert_raises(ValueError, g._set_covars, + np.zeros((n_components - 2, n_features))) + assert_raises(ValueError, mixture.GMM, n_components=20, + covariance_type='badcovariance_type') class GMMTester(): diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index f57064f4d76cb..bb117d8a29863 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1547,11 +1547,10 @@ def check_raise_error_on_1d_input(name): assert_raises(ValueError, est.predict, [X]) -# XXX -# @ignore_warnings @pytest.mark.parametrize("name", ALL_TREES) def test_1d_input(name): - check_raise_error_on_1d_input(name) + with ignore_warnings(): + check_raise_error_on_1d_input(name) def _check_min_weight_leaf_split_level(TreeEstimator, X, y, sample_weight): From dd89184401b6cca7824c2aef06d5211b5bddd6e9 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 6 Jun 2018 15:42:47 +0200 Subject: [PATCH 24/25] More parametrizations in sklearn/metrics/tests/ --- sklearn/metrics/tests/test_classification.py | 97 +++--- sklearn/metrics/tests/test_common.py | 314 ++++++++++--------- sklearn/metrics/tests/test_pairwise.py | 204 ++++++------ sklearn/metrics/tests/test_ranking.py | 18 +- 4 files changed, 347 insertions(+), 286 deletions(-) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 1705510cc1ea4..d5489f7c235f0 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -6,6 +6,8 @@ from itertools import product import warnings +import pytest + from sklearn import datasets from sklearn import svm @@ -520,7 +522,8 @@ def test_matthews_corrcoef_multiclass(): assert_almost_equal(mcc, 0.) 
-def test_matthews_corrcoef_overflow(): +@pytest.mark.parametrize('n_points', [100, 10000, 1000000]) +def test_matthews_corrcoef_overflow(n_points): # https://github.com/scikit-learn/scikit-learn/issues/9622 rng = np.random.RandomState(20170906) @@ -543,16 +546,15 @@ def random_ys(n_points): # binary y_pred = (x_pred > 0.5) return y_true, y_pred - for n_points in [100, 10000, 1000000]: - arr = np.repeat([0., 1.], n_points) # binary - assert_almost_equal(matthews_corrcoef(arr, arr), 1.0) - arr = np.repeat([0., 1., 2.], n_points) # multiclass - assert_almost_equal(matthews_corrcoef(arr, arr), 1.0) + arr = np.repeat([0., 1.], n_points) # binary + assert_almost_equal(matthews_corrcoef(arr, arr), 1.0) + arr = np.repeat([0., 1., 2.], n_points) # multiclass + assert_almost_equal(matthews_corrcoef(arr, arr), 1.0) - y_true, y_pred = random_ys(n_points) - assert_almost_equal(matthews_corrcoef(y_true, y_true), 1.0) - assert_almost_equal(matthews_corrcoef(y_true, y_pred), - mcc_safe(y_true, y_pred)) + y_true, y_pred = random_ys(n_points) + assert_almost_equal(matthews_corrcoef(y_true, y_true), 1.0) + assert_almost_equal(matthews_corrcoef(y_true, y_pred), + mcc_safe(y_true, y_pred)) def test_precision_recall_f1_score_multiclass(): @@ -610,18 +612,19 @@ def test_precision_recall_f1_score_multiclass(): assert_array_equal(s, [24, 20, 31]) -def test_precision_refcall_f1_score_multilabel_unordered_labels(): +@pytest.mark.parametrize('average', + ['samples', 'micro', 'macro', 'weighted', None]) +def test_precision_refcall_f1_score_multilabel_unordered_labels(average): # test that labels need not be sorted in the multilabel case y_true = np.array([[1, 1, 0, 0]]) y_pred = np.array([[0, 0, 1, 1]]) - for average in ['samples', 'micro', 'macro', 'weighted', None]: - p, r, f, s = precision_recall_fscore_support( - y_true, y_pred, labels=[3, 0, 1, 2], warn_for=[], average=average) - assert_array_equal(p, 0) - assert_array_equal(r, 0) - assert_array_equal(f, 0) - if average is None: - assert_array_equal(s, [0, 1, 1, 0]) + p, r, f, s = precision_recall_fscore_support( + y_true, y_pred, labels=[3, 0, 1, 2], warn_for=[], average=average) + assert_array_equal(p, 0) + assert_array_equal(r, 0) + assert_array_equal(f, 0) + if average is None: + assert_array_equal(s, [0, 1, 1, 0]) def test_precision_recall_f1_score_binary_averaged(): @@ -1207,7 +1210,9 @@ def test_precision_recall_f1_score_with_an_empty_prediction(): 0.333, 2) -def test_precision_recall_f1_no_labels(): +@pytest.mark.parametrize('beta', [1]) +@pytest.mark.parametrize('average', ["macro", "micro", "weighted", "samples"]) +def test_precision_recall_f1_no_labels(beta, average): y_true = np.zeros((20, 3)) y_pred = np.zeros_like(y_true) @@ -1219,33 +1224,31 @@ def test_precision_recall_f1_no_labels(): # |y_i| = [0, 0, 0] # |y_hat_i| = [0, 0, 0] - for beta in [1]: - p, r, f, s = assert_warns(UndefinedMetricWarning, - precision_recall_fscore_support, - y_true, y_pred, average=None, beta=beta) - assert_array_almost_equal(p, [0, 0, 0], 2) - assert_array_almost_equal(r, [0, 0, 0], 2) - assert_array_almost_equal(f, [0, 0, 0], 2) - assert_array_almost_equal(s, [0, 0, 0], 2) - - fbeta = assert_warns(UndefinedMetricWarning, fbeta_score, - y_true, y_pred, beta=beta, average=None) - assert_array_almost_equal(fbeta, [0, 0, 0], 2) - - for average in ["macro", "micro", "weighted", "samples"]: - p, r, f, s = assert_warns(UndefinedMetricWarning, - precision_recall_fscore_support, - y_true, y_pred, average=average, - beta=beta) - assert_almost_equal(p, 0) - assert_almost_equal(r, 
0) - assert_almost_equal(f, 0) - assert_equal(s, None) - - fbeta = assert_warns(UndefinedMetricWarning, fbeta_score, - y_true, y_pred, - beta=beta, average=average) - assert_almost_equal(fbeta, 0) + p, r, f, s = assert_warns(UndefinedMetricWarning, + precision_recall_fscore_support, + y_true, y_pred, average=None, beta=beta) + assert_array_almost_equal(p, [0, 0, 0], 2) + assert_array_almost_equal(r, [0, 0, 0], 2) + assert_array_almost_equal(f, [0, 0, 0], 2) + assert_array_almost_equal(s, [0, 0, 0], 2) + + fbeta = assert_warns(UndefinedMetricWarning, fbeta_score, + y_true, y_pred, beta=beta, average=None) + assert_array_almost_equal(fbeta, [0, 0, 0], 2) + + p, r, f, s = assert_warns(UndefinedMetricWarning, + precision_recall_fscore_support, + y_true, y_pred, average=average, + beta=beta) + assert_almost_equal(p, 0) + assert_almost_equal(r, 0) + assert_almost_equal(f, 0) + assert_equal(s, None) + + fbeta = assert_warns(UndefinedMetricWarning, fbeta_score, + y_true, y_pred, + beta=beta, average=average) + assert_almost_equal(fbeta, 0) def test_prf_warnings(): diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 680b78c3dd43d..e5141ed84032c 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -2,10 +2,13 @@ from functools import partial from itertools import product +from itertools import chain import numpy as np import scipy.sparse as sp +import pytest + from sklearn.datasets import make_multilabel_classification from sklearn.preprocessing import LabelBinarizer from sklearn.utils.multiclass import type_of_target @@ -415,17 +418,17 @@ def test_symmetry(): msg="%s seems to be symmetric" % name) -@ignore_warnings -def test_sample_order_invariance(): +@pytest.mark.parametrize( + 'name', + set(ALL_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS) +def test_sample_order_invariance(name): random_state = check_random_state(0) y_true = random_state.randint(0, 2, size=(20, )) y_pred = random_state.randint(0, 2, size=(20, )) y_true_shuffle, y_pred_shuffle = shuffle(y_true, y_pred, random_state=0) - for name, metric in ALL_METRICS.items(): - if name in METRIC_UNDEFINED_BINARY_MULTICLASS: - continue - + with ignore_warnings(): + metric = ALL_METRICS[name] assert_almost_equal(metric(y_true, y_pred), metric(y_true_shuffle, y_pred_shuffle), err_msg="%s is not sample order invariant" @@ -472,8 +475,10 @@ def test_sample_order_invariance_multilabel_and_multioutput(): % name) -@ignore_warnings -def test_format_invariance_with_1d_vectors(): +@pytest.mark.parametrize( + 'name', + set(ALL_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS) +def test_format_invariance_with_1d_vectors(name): random_state = check_random_state(0) y1 = random_state.randint(0, 2, size=(20, )) y2 = random_state.randint(0, 2, size=(20, )) @@ -489,9 +494,8 @@ def test_format_invariance_with_1d_vectors(): y1_row = np.reshape(y1_1d, (1, -1)) y2_row = np.reshape(y2_1d, (1, -1)) - for name, metric in ALL_METRICS.items(): - if name in METRIC_UNDEFINED_BINARY_MULTICLASS: - continue + with ignore_warnings(): + metric = ALL_METRICS[name] measure = metric(y1, y2) @@ -551,9 +555,11 @@ def test_format_invariance_with_1d_vectors(): assert_raises(ValueError, metric, y1_row, y2_row) -@ignore_warnings -def test_invariance_string_vs_numbers_labels(): - # Ensure that classification metrics with string labels +@pytest.mark.parametrize( + 'name', + set(CLASSIFICATION_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS) +def test_classification_invariance_string_vs_numbers_labels(name): + 
# Ensure that classification metrics with string labels are invariant random_state = check_random_state(0) y1 = random_state.randint(0, 2, size=(20, )) y2 = random_state.randint(0, 2, size=(20, )) @@ -564,10 +570,8 @@ def test_invariance_string_vs_numbers_labels(): pos_label_str = "spam" labels_str = ["eggs", "spam"] - for name, metric in CLASSIFICATION_METRICS.items(): - if name in METRIC_UNDEFINED_BINARY_MULTICLASS: - continue - + with ignore_warnings(): + metric = CLASSIFICATION_METRICS[name] measure_with_number = metric(y1, y2) # Ugly, but handle case with a pos_label and label @@ -600,7 +604,20 @@ def test_invariance_string_vs_numbers_labels(): err_msg="{0} failed string vs number " "invariance test".format(name)) - for name, metric in THRESHOLDED_METRICS.items(): + +@pytest.mark.parametrize('name', THRESHOLDED_METRICS) +def test_thresholded_invariance_string_vs_numbers_labels(name): + # Ensure that thresholded metrics with string labels are invariant + random_state = check_random_state(0) + y1 = random_state.randint(0, 2, size=(20, )) + y2 = random_state.randint(0, 2, size=(20, )) + + y1_str = np.array(["eggs", "spam"])[y1] + + pos_label_str = "spam" + + with ignore_warnings(): + metric = THRESHOLDED_METRICS[name] if name not in METRIC_UNDEFINED_BINARY: # Ugly, but handle case with a pos_label and label metric_str = metric @@ -623,28 +640,30 @@ def test_invariance_string_vs_numbers_labels(): assert_raises(ValueError, metric, y1_str.astype('O'), y2) -def test_inf_nan_input(): - invalids =[([0, 1], [np.inf, np.inf]), - ([0, 1], [np.nan, np.nan]), - ([0, 1], [np.nan, np.inf])] +invalids = [([0, 1], [np.inf, np.inf]), + ([0, 1], [np.nan, np.nan]), + ([0, 1], [np.nan, np.inf])] + - METRICS = dict() - METRICS.update(THRESHOLDED_METRICS) - METRICS.update(REGRESSION_METRICS) +@pytest.mark.parametrize( + 'metric', + chain(THRESHOLDED_METRICS.values(), REGRESSION_METRICS.values())) +def test_regression_thresholded_inf_nan_input(metric): - for metric in METRICS.values(): - for y_true, y_score in invalids: - assert_raise_message(ValueError, - "contains NaN, infinity", - metric, y_true, y_score) + for y_true, y_score in invalids: + assert_raise_message(ValueError, + "contains NaN, infinity", + metric, y_true, y_score) + +@pytest.mark.parametrize('metric', CLASSIFICATION_METRICS.values()) +def test_classification_inf_nan_input(metric): # Classification metrics all raise a mixed input exception - for metric in CLASSIFICATION_METRICS.values(): - for y_true, y_score in invalids: - assert_raise_message(ValueError, - "Classification metrics can't handle a mix " - "of binary and continuous targets", - metric, y_true, y_score) + for y_true, y_score in invalids: + assert_raise_message(ValueError, + "Classification metrics can't handle a mix " + "of binary and continuous targets", + metric, y_true, y_score) @ignore_warnings @@ -667,45 +686,47 @@ def check_single_sample_multioutput(name): metric(np.array([[i, j]]), np.array([[k, l]])) -def test_single_sample(): - for name in ALL_METRICS: - if (name in METRIC_UNDEFINED_BINARY_MULTICLASS or - name in THRESHOLDED_METRICS): - # Those metrics are not always defined with one sample - # or in multiclass classification - continue +@pytest.mark.parametrize( + 'name', + (set(ALL_METRICS) + # Those metrics are not always defined with one sample + # or in multiclass classification + - METRIC_UNDEFINED_BINARY_MULTICLASS + - set(THRESHOLDED_METRICS))) +def test_single_sample(name): + check_single_sample(name) - yield check_single_sample, name - for name in 
MULTIOUTPUT_METRICS + MULTILABELS_METRICS: - yield check_single_sample_multioutput, name +@pytest.mark.parametrize('name', MULTIOUTPUT_METRICS + MULTILABELS_METRICS) +def test_single_sample_multioutput(name): + check_single_sample_multioutput(name) -def test_multioutput_number_of_output_differ(): +@pytest.mark.parametrize('name', MULTIOUTPUT_METRICS) +def test_multioutput_number_of_output_differ(name): y_true = np.array([[1, 0, 0, 1], [0, 1, 1, 1], [1, 1, 0, 1]]) y_pred = np.array([[0, 0], [1, 0], [0, 0]]) - for name in MULTIOUTPUT_METRICS: - metric = ALL_METRICS[name] - assert_raises(ValueError, metric, y_true, y_pred) + metric = ALL_METRICS[name] + assert_raises(ValueError, metric, y_true, y_pred) -def test_multioutput_regression_invariance_to_dimension_shuffling(): +@pytest.mark.parametrize('name', MULTIOUTPUT_METRICS) +def test_multioutput_regression_invariance_to_dimension_shuffling(name): # test invariance to dimension shuffling random_state = check_random_state(0) y_true = random_state.uniform(0, 2, size=(20, 5)) y_pred = random_state.uniform(0, 2, size=(20, 5)) - for name in MULTIOUTPUT_METRICS: - metric = ALL_METRICS[name] - error = metric(y_true, y_pred) + metric = ALL_METRICS[name] + error = metric(y_true, y_pred) - for _ in range(3): - perm = random_state.permutation(y_true.shape[1]) - assert_almost_equal(metric(y_true[:, perm], y_pred[:, perm]), - error, - err_msg="%s is not dimension shuffling " - "invariant" % name) + for _ in range(3): + perm = random_state.permutation(y_true.shape[1]) + assert_almost_equal(metric(y_true[:, perm], y_pred[:, perm]), + error, + err_msg="%s is not dimension shuffling " + "invariant" % name) @ignore_warnings @@ -747,7 +768,8 @@ def test_multilabel_representation_invariance(): "formats." % name) -def test_raise_value_error_multilabel_sequences(): +@pytest.mark.parametrize('name', MULTILABELS_METRICS) +def test_raise_value_error_multilabel_sequences(name): # make sure the multilabel-sequence format raises ValueError multilabel_sequences = [ [[0, 1]], @@ -757,41 +779,41 @@ def test_raise_value_error_multilabel_sequences(): [()], np.array([[], [1, 2]], dtype='object')] - for name in MULTILABELS_METRICS: - metric = ALL_METRICS[name] - for seq in multilabel_sequences: - assert_raises(ValueError, metric, seq, seq) + metric = ALL_METRICS[name] + for seq in multilabel_sequences: + assert_raises(ValueError, metric, seq, seq) -def test_normalize_option_binary_classification(n_samples=20): +@pytest.mark.parametrize('name', METRICS_WITH_NORMALIZE_OPTION) +def test_normalize_option_binary_classification(name): # Test in the binary case + n_samples = 20 random_state = check_random_state(0) y_true = random_state.randint(0, 2, size=(n_samples, )) y_pred = random_state.randint(0, 2, size=(n_samples, )) - for name in METRICS_WITH_NORMALIZE_OPTION: - metrics = ALL_METRICS[name] - measure = metrics(y_true, y_pred, normalize=True) - assert_greater(measure, 0, - msg="We failed to test correctly the normalize option") - assert_almost_equal(metrics(y_true, y_pred, normalize=False) - / n_samples, measure) + metrics = ALL_METRICS[name] + measure = metrics(y_true, y_pred, normalize=True) + assert_greater(measure, 0, + msg="We failed to test correctly the normalize option") + assert_almost_equal(metrics(y_true, y_pred, normalize=False) + / n_samples, measure) -def test_normalize_option_multiclass_classification(): +@pytest.mark.parametrize('name', METRICS_WITH_NORMALIZE_OPTION) +def test_normalize_option_multiclass_classification(name): # Test in the multiclass case 
random_state = check_random_state(0) y_true = random_state.randint(0, 4, size=(20, )) y_pred = random_state.randint(0, 4, size=(20, )) n_samples = y_true.shape[0] - for name in METRICS_WITH_NORMALIZE_OPTION: - metrics = ALL_METRICS[name] - measure = metrics(y_true, y_pred, normalize=True) - assert_greater(measure, 0, - msg="We failed to test correctly the normalize option") - assert_almost_equal(metrics(y_true, y_pred, normalize=False) - / n_samples, measure) + metrics = ALL_METRICS[name] + measure = metrics(y_true, y_pred, normalize=True) + assert_greater(measure, 0, + msg="We failed to test correctly the normalize option") + assert_almost_equal(metrics(y_true, y_pred, normalize=False) + / n_samples, measure) def test_normalize_option_multilabel_classification(): @@ -886,7 +908,9 @@ def check_averaging(name, y_true, y_true_binarize, y_pred, y_pred_binarize, raise ValueError("Metric is not recorded as having an average option") -def test_averaging_multiclass(n_samples=50, n_classes=3): +@pytest.mark.parametrize('name', METRICS_WITH_AVERAGING) +def test_averaging_multiclass(name): + n_samples, n_classes = 50, 3 random_state = check_random_state(0) y_true = random_state.randint(0, n_classes, size=(n_samples, )) y_pred = random_state.randint(0, n_classes, size=(n_samples, )) @@ -896,12 +920,14 @@ def test_averaging_multiclass(n_samples=50, n_classes=3): y_true_binarize = lb.transform(y_true) y_pred_binarize = lb.transform(y_pred) - for name in METRICS_WITH_AVERAGING: - yield (check_averaging, name, y_true, y_true_binarize, - y_pred, y_pred_binarize, y_score) + check_averaging(name, y_true, y_true_binarize, + y_pred, y_pred_binarize, y_score) -def test_averaging_multilabel(n_classes=5, n_samples=40): +@pytest.mark.parametrize( + 'name', METRICS_WITH_AVERAGING + THRESHOLDED_METRICS_WITH_AVERAGING) +def test_averaging_multilabel(name): + n_samples, n_classes = 40, 5 _, y = make_multilabel_classification(n_features=1, n_classes=n_classes, random_state=5, n_samples=n_samples, allow_unlabeled=False) @@ -911,22 +937,27 @@ def test_averaging_multilabel(n_classes=5, n_samples=40): y_true_binarize = y_true y_pred_binarize = y_pred - for name in METRICS_WITH_AVERAGING + THRESHOLDED_METRICS_WITH_AVERAGING: - yield (check_averaging, name, y_true, y_true_binarize, - y_pred, y_pred_binarize, y_score) + check_averaging(name, y_true, y_true_binarize, + y_pred, y_pred_binarize, y_score) -def test_averaging_multilabel_all_zeroes(): +@pytest.mark.parametrize('name', METRICS_WITH_AVERAGING) +def test_averaging_multilabel_all_zeroes(name): y_true = np.zeros((20, 3)) y_pred = np.zeros((20, 3)) y_score = np.zeros((20, 3)) y_true_binarize = y_true y_pred_binarize = y_pred - for name in METRICS_WITH_AVERAGING: - yield (check_averaging, name, y_true, y_true_binarize, - y_pred, y_pred_binarize, y_score) + check_averaging(name, y_true, y_true_binarize, + y_pred, y_pred_binarize, y_score) + +def test_averaging_binary_multilabel_all_zeroes(): + y_true = np.zeros((20, 3)) + y_pred = np.zeros((20, 3)) + y_true_binarize = y_true + y_pred_binarize = y_pred # Test _average_binary_score for weight.sum() == 0 binary_metric = (lambda y_true, y_score, average="macro": _average_binary_score( @@ -935,16 +966,16 @@ def test_averaging_multilabel_all_zeroes(): y_pred_binarize, is_multilabel=True) -def test_averaging_multilabel_all_ones(): +@pytest.mark.parametrize('name', METRICS_WITH_AVERAGING) +def test_averaging_multilabel_all_ones(name): y_true = np.ones((20, 3)) y_pred = np.ones((20, 3)) y_score = np.ones((20, 3)) y_true_binarize = 
y_true y_pred_binarize = y_pred - for name in METRICS_WITH_AVERAGING: - yield (check_averaging, name, y_true, y_true_binarize, - y_pred, y_pred_binarize, y_score) + check_averaging(name, y_true, y_true_binarize, + y_pred, y_pred_binarize, y_score) @ignore_warnings @@ -1022,54 +1053,64 @@ def check_sample_weight_invariance(name, metric, y1, y2): sample_weight])) -def test_sample_weight_invariance(n_samples=50): +@pytest.mark.parametrize( + 'name', + (set(ALL_METRICS).intersection(REGRESSION_METRICS) + - set(METRICS_WITHOUT_SAMPLE_WEIGHT))) +def test_regression_sample_weight_invariance(name): + n_samples = 50 random_state = check_random_state(0) # regression y_true = random_state.random_sample(size=(n_samples,)) y_pred = random_state.random_sample(size=(n_samples,)) - for name in ALL_METRICS: - if name not in REGRESSION_METRICS: - continue - if name in METRICS_WITHOUT_SAMPLE_WEIGHT: - continue - metric = ALL_METRICS[name] - yield check_sample_weight_invariance, name, metric, y_true, y_pred + metric = ALL_METRICS[name] + check_sample_weight_invariance(name, metric, y_true, y_pred) + +@pytest.mark.parametrize( + 'name', + (set(ALL_METRICS) - set(REGRESSION_METRICS) + - set(METRICS_WITHOUT_SAMPLE_WEIGHT) - set(METRIC_UNDEFINED_BINARY))) +def test_binary_sample_weight_invariance(name): # binary + n_samples = 50 random_state = check_random_state(0) y_true = random_state.randint(0, 2, size=(n_samples, )) y_pred = random_state.randint(0, 2, size=(n_samples, )) y_score = random_state.random_sample(size=(n_samples,)) - for name in ALL_METRICS: - if name in REGRESSION_METRICS: - continue - if (name in METRICS_WITHOUT_SAMPLE_WEIGHT or - name in METRIC_UNDEFINED_BINARY): - continue - metric = ALL_METRICS[name] - if name in THRESHOLDED_METRICS: - yield check_sample_weight_invariance, name, metric, y_true, y_score - else: - yield check_sample_weight_invariance, name, metric, y_true, y_pred + metric = ALL_METRICS[name] + if name in THRESHOLDED_METRICS: + check_sample_weight_invariance(name, metric, y_true, y_score) + else: + check_sample_weight_invariance(name, metric, y_true, y_pred) + +@pytest.mark.parametrize( + 'name', + (set(ALL_METRICS) - set(REGRESSION_METRICS) + - set(METRICS_WITHOUT_SAMPLE_WEIGHT) + - METRIC_UNDEFINED_BINARY_MULTICLASS)) +def test_multiclass_sample_weight_invariance(name): # multiclass + n_samples = 50 random_state = check_random_state(0) y_true = random_state.randint(0, 5, size=(n_samples, )) y_pred = random_state.randint(0, 5, size=(n_samples, )) y_score = random_state.random_sample(size=(n_samples, 5)) - for name in ALL_METRICS: - if name in REGRESSION_METRICS: - continue - if (name in METRICS_WITHOUT_SAMPLE_WEIGHT or - name in METRIC_UNDEFINED_BINARY_MULTICLASS): - continue - metric = ALL_METRICS[name] - if name in THRESHOLDED_METRICS: - yield check_sample_weight_invariance, name, metric, y_true, y_score - else: - yield check_sample_weight_invariance, name, metric, y_true, y_pred + metric = ALL_METRICS[name] + if name in THRESHOLDED_METRICS: + check_sample_weight_invariance(name, metric, y_true, y_score) + else: + check_sample_weight_invariance(name, metric, y_true, y_pred) + +@pytest.mark.parametrize( + 'name', + set(MULTILABELS_METRICS + THRESHOLDED_MULTILABEL_METRICS + + MULTIOUTPUT_METRICS) - set(METRICS_WITHOUT_SAMPLE_WEIGHT)) +def test_multilabel_sample_weight_invariance(name): # multilabel indicator + random_state = check_random_state(0) _, ya = make_multilabel_classification(n_features=1, n_classes=20, random_state=0, n_samples=100, allow_unlabeled=False) @@ -1080,18 
+1121,11 @@ def test_sample_weight_invariance(n_samples=50): y_pred = np.vstack([ya, ya]) y_score = random_state.randint(1, 4, size=y_true.shape) - for name in (MULTILABELS_METRICS + THRESHOLDED_MULTILABEL_METRICS + - MULTIOUTPUT_METRICS): - if name in METRICS_WITHOUT_SAMPLE_WEIGHT: - continue - - metric = ALL_METRICS[name] - if name in THRESHOLDED_METRICS: - yield (check_sample_weight_invariance, name, metric, - y_true, y_score) - else: - yield (check_sample_weight_invariance, name, metric, - y_true, y_pred) + metric = ALL_METRICS[name] + if name in THRESHOLDED_METRICS: + check_sample_weight_invariance(name, metric, y_true, y_score) + else: + check_sample_weight_invariance(name, metric, y_true, y_pred) @ignore_warnings diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 6c490903c30fe..e63219a817bed 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -130,52 +130,52 @@ def test_pairwise_distances(): assert_raises(ValueError, pairwise_distances, X, Y, metric="blah") -# ignore conversion to boolean in pairwise_distances -@ignore_warnings(category=DataConversionWarning) -def test_pairwise_boolean_distance(): +@pytest.mark.parametrize('metric', PAIRWISE_BOOLEAN_FUNCTIONS) +def test_pairwise_boolean_distance(metric): # test that we convert to boolean arrays for boolean distances rng = np.random.RandomState(0) X = rng.randn(5, 4) Y = X.copy() Y[0, 0] = 1 - Y[0, 0] - for metric in PAIRWISE_BOOLEAN_FUNCTIONS: + # ignore conversion to boolean in pairwise_distances + with ignore_warnings(category=DataConversionWarning): for Z in [Y, None]: res = pairwise_distances(X, Z, metric=metric) res[np.isnan(res)] = 0 assert_true(np.sum(res != 0) == 0) -def test_pairwise_precomputed(): - for func in [pairwise_distances, pairwise_kernels]: - # Test correct shape - assert_raises_regexp(ValueError, '.* shape .*', - func, np.zeros((5, 3)), metric='precomputed') - # with two args - assert_raises_regexp(ValueError, '.* shape .*', - func, np.zeros((5, 3)), np.zeros((4, 4)), - metric='precomputed') - # even if shape[1] agrees (although thus second arg is spurious) - assert_raises_regexp(ValueError, '.* shape .*', - func, np.zeros((5, 3)), np.zeros((4, 3)), - metric='precomputed') - - # Test not copied (if appropriate dtype) - S = np.zeros((5, 5)) - S2 = func(S, metric="precomputed") - assert_true(S is S2) - # with two args - S = np.zeros((5, 3)) - S2 = func(S, np.zeros((3, 3)), metric="precomputed") - assert_true(S is S2) - - # Test always returns float dtype - S = func(np.array([[1]], dtype='int'), metric='precomputed') - assert_equal('f', S.dtype.kind) - - # Test converts list to array-like - S = func([[1.]], metric='precomputed') - assert_true(isinstance(S, np.ndarray)) +@pytest.mark.parametrize('func', [pairwise_distances, pairwise_kernels]) +def test_pairwise_precomputed(func): + # Test correct shape + assert_raises_regexp(ValueError, '.* shape .*', + func, np.zeros((5, 3)), metric='precomputed') + # with two args + assert_raises_regexp(ValueError, '.* shape .*', + func, np.zeros((5, 3)), np.zeros((4, 4)), + metric='precomputed') + # even if shape[1] agrees (although thus second arg is spurious) + assert_raises_regexp(ValueError, '.* shape .*', + func, np.zeros((5, 3)), np.zeros((4, 3)), + metric='precomputed') + + # Test not copied (if appropriate dtype) + S = np.zeros((5, 5)) + S2 = func(S, metric="precomputed") + assert_true(S is S2) + # with two args + S = np.zeros((5, 3)) + S2 = func(S, np.zeros((3, 3)), 
metric="precomputed") + assert_true(S is S2) + + # Test always returns float dtype + S = func(np.array([[1]], dtype='int'), metric='precomputed') + assert_equal('f', S.dtype.kind) + + # Test converts list to array-like + S = func([[1.]], metric='precomputed') + assert_true(isinstance(S, np.ndarray)) def check_pairwise_parallel(func, metric, kwds): @@ -230,41 +230,51 @@ def test_pairwise_callable_nonstrict_metric(): assert_equal(pairwise_distances([[1.]], metric=lambda x, y: 5)[0, 0], 5) -def test_pairwise_kernels(): # Test the pairwise_kernels helper function. +# Test with all metrics that should be in PAIRWISE_KERNEL_FUNCTIONS. +@pytest.mark.parametrize( + 'metric', + ["rbf", "laplacian", "sigmoid", "polynomial", "linear", + "chi2", "additive_chi2"]) +def test_pairwise_kernels(metric): + # Test the pairwise_kernels helper function. rng = np.random.RandomState(0) X = rng.random_sample((5, 4)) Y = rng.random_sample((2, 4)) - # Test with all metrics that should be in PAIRWISE_KERNEL_FUNCTIONS. - test_metrics = ["rbf", "laplacian", "sigmoid", "polynomial", "linear", - "chi2", "additive_chi2"] - for metric in test_metrics: - function = PAIRWISE_KERNEL_FUNCTIONS[metric] - # Test with Y=None - K1 = pairwise_kernels(X, metric=metric) - K2 = function(X) - assert_array_almost_equal(K1, K2) - # Test with Y=Y - K1 = pairwise_kernels(X, Y=Y, metric=metric) - K2 = function(X, Y=Y) - assert_array_almost_equal(K1, K2) - # Test with tuples as X and Y - X_tuples = tuple([tuple([v for v in row]) for row in X]) - Y_tuples = tuple([tuple([v for v in row]) for row in Y]) - K2 = pairwise_kernels(X_tuples, Y_tuples, metric=metric) - assert_array_almost_equal(K1, K2) + function = PAIRWISE_KERNEL_FUNCTIONS[metric] + # Test with Y=None + K1 = pairwise_kernels(X, metric=metric) + K2 = function(X) + assert_array_almost_equal(K1, K2) + # Test with Y=Y + K1 = pairwise_kernels(X, Y=Y, metric=metric) + K2 = function(X, Y=Y) + assert_array_almost_equal(K1, K2) + # Test with tuples as X and Y + X_tuples = tuple([tuple([v for v in row]) for row in X]) + Y_tuples = tuple([tuple([v for v in row]) for row in Y]) + K2 = pairwise_kernels(X_tuples, Y_tuples, metric=metric) + assert_array_almost_equal(K1, K2) + + # Test with sparse X and Y + X_sparse = csr_matrix(X) + Y_sparse = csr_matrix(Y) + if metric in ["chi2", "additive_chi2"]: + # these don't support sparse matrices yet + assert_raises(ValueError, pairwise_kernels, + X_sparse, Y=Y_sparse, metric=metric) + return + K1 = pairwise_kernels(X_sparse, Y=Y_sparse, metric=metric) + assert_array_almost_equal(K1, K2) + + +def test_pairwise_kernels_callable(): + # Test the pairwise_kernels helper function + # with a callable function, with given keywords. + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + Y = rng.random_sample((2, 4)) - # Test with sparse X and Y - X_sparse = csr_matrix(X) - Y_sparse = csr_matrix(Y) - if metric in ["chi2", "additive_chi2"]: - # these don't support sparse matrices yet - assert_raises(ValueError, pairwise_kernels, - X_sparse, Y=Y_sparse, metric=metric) - continue - K1 = pairwise_kernels(X_sparse, Y=Y_sparse, metric=metric) - assert_array_almost_equal(K1, K2) - # Test with a callable function, with given keywords. 
metric = callable_rbf_kernel kwds = {'gamma': 0.1} K1 = pairwise_kernels(X, Y=Y, metric=metric, **kwds) @@ -289,27 +299,37 @@ def test_pairwise_kernels_filter_param(): assert_raises(TypeError, pairwise_kernels, X, Y, "rbf", **params) -def test_paired_distances(): +@pytest.mark.parametrize('metric, func', iteritems(PAIRED_DISTANCES)) +def test_paired_distances(metric, func): # Test the pairwise_distance helper function. rng = np.random.RandomState(0) # Euclidean distance should be equivalent to calling the function. X = rng.random_sample((5, 4)) # Euclidean distance, with Y != X. Y = rng.random_sample((5, 4)) - for metric, func in iteritems(PAIRED_DISTANCES): - S = paired_distances(X, Y, metric=metric) - S2 = func(X, Y) - assert_array_almost_equal(S, S2) - S3 = func(csr_matrix(X), csr_matrix(Y)) - assert_array_almost_equal(S, S3) - if metric in PAIRWISE_DISTANCE_FUNCTIONS: - # Check the pairwise_distances implementation - # gives the same value - distances = PAIRWISE_DISTANCE_FUNCTIONS[metric](X, Y) - distances = np.diag(distances) - assert_array_almost_equal(distances, S) - - # Check the callable implementation + + S = paired_distances(X, Y, metric=metric) + S2 = func(X, Y) + assert_array_almost_equal(S, S2) + S3 = func(csr_matrix(X), csr_matrix(Y)) + assert_array_almost_equal(S, S3) + if metric in PAIRWISE_DISTANCE_FUNCTIONS: + # Check the pairwise_distances implementation + # gives the same value + distances = PAIRWISE_DISTANCE_FUNCTIONS[metric](X, Y) + distances = np.diag(distances) + assert_array_almost_equal(distances, S) + + +def test_paired_distances_callable(): + # Test the pairwise_distance helper function + # with the callable implementation + rng = np.random.RandomState(0) + # Euclidean distance should be equivalent to calling the function. + X = rng.random_sample((5, 4)) + # Euclidean distance, with Y != X. 
+ Y = rng.random_sample((5, 4)) + S = paired_distances(X, Y, metric='manhattan') S2 = paired_distances(X, Y, metric=lambda x, y: np.abs(x - y).sum(axis=0)) assert_array_almost_equal(S, S2) @@ -640,25 +660,29 @@ def test_chi_square_kernel(): csr_matrix(X), csr_matrix(Y)) -def test_kernel_symmetry(): +@pytest.mark.parametrize( + 'kernel', + (linear_kernel, polynomial_kernel, rbf_kernel, + laplacian_kernel, sigmoid_kernel, cosine_similarity)) +def test_kernel_symmetry(kernel): # Valid kernels should be symmetric rng = np.random.RandomState(0) X = rng.random_sample((5, 4)) - for kernel in (linear_kernel, polynomial_kernel, rbf_kernel, - laplacian_kernel, sigmoid_kernel, cosine_similarity): - K = kernel(X, X) - assert_array_almost_equal(K, K.T, 15) + K = kernel(X, X) + assert_array_almost_equal(K, K.T, 15) -def test_kernel_sparse(): +@pytest.mark.parametrize( + 'kernel', + (linear_kernel, polynomial_kernel, rbf_kernel, + laplacian_kernel, sigmoid_kernel, cosine_similarity)) +def test_kernel_sparse(kernel): rng = np.random.RandomState(0) X = rng.random_sample((5, 4)) X_sparse = csr_matrix(X) - for kernel in (linear_kernel, polynomial_kernel, rbf_kernel, - laplacian_kernel, sigmoid_kernel, cosine_similarity): - K = kernel(X, X) - K2 = kernel(X_sparse, X_sparse) - assert_array_almost_equal(K, K2) + K = kernel(X, X) + K2 = kernel(X_sparse, X_sparse) + assert_array_almost_equal(K, K2) def test_linear_kernel(): diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index f93c11820943b..28b79e9b8474c 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -176,19 +176,19 @@ def _partial_roc(y_true, y_predict, max_fpr): return 0.5 * (1 + (partial_auc - min_area) / (max_area - min_area)) -def test_roc_curve(): +@pytest.mark.parametrize('drop', [True, False]) +def test_roc_curve(drop): # Test Area under Receiver Operating Characteristic (ROC) curve y_true, _, probas_pred = make_prediction(binary=True) expected_auc = _auc(y_true, probas_pred) - for drop in [True, False]: - fpr, tpr, thresholds = roc_curve(y_true, probas_pred, - drop_intermediate=drop) - roc_auc = auc(fpr, tpr) - assert_array_almost_equal(roc_auc, expected_auc, decimal=2) - assert_almost_equal(roc_auc, roc_auc_score(y_true, probas_pred)) - assert_equal(fpr.shape, tpr.shape) - assert_equal(fpr.shape, thresholds.shape) + fpr, tpr, thresholds = roc_curve(y_true, probas_pred, + drop_intermediate=drop) + roc_auc = auc(fpr, tpr) + assert_array_almost_equal(roc_auc, expected_auc, decimal=2) + assert_almost_equal(roc_auc, roc_auc_score(y_true, probas_pred)) + assert_equal(fpr.shape, tpr.shape) + assert_equal(fpr.shape, thresholds.shape) def test_roc_curve_end_points(): From 588fb4979c95e3cb38d8715f7b27efdb8530741b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 7 Jun 2018 10:14:00 +0200 Subject: [PATCH 25/25] Review comments and use sets in metrics/tests/test_common.py --- sklearn/metrics/tests/test_classification.py | 35 +++++---- sklearn/metrics/tests/test_common.py | 80 ++++++++++---------- 2 files changed, 61 insertions(+), 54 deletions(-) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index d5489f7c235f0..923f60994dac1 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -1216,6 +1216,27 @@ def test_precision_recall_f1_no_labels(beta, average): y_true = np.zeros((20, 3)) y_pred = np.zeros_like(y_true) + p, r, f, s = 
assert_warns(UndefinedMetricWarning, + precision_recall_fscore_support, + y_true, y_pred, average=average, + beta=beta) + assert_almost_equal(p, 0) + assert_almost_equal(r, 0) + assert_almost_equal(f, 0) + assert_equal(s, None) + + fbeta = assert_warns(UndefinedMetricWarning, fbeta_score, + y_true, y_pred, + beta=beta, average=average) + assert_almost_equal(fbeta, 0) + + +def test_precision_recall_f1_no_labels_average_none(): + y_true = np.zeros((20, 3)) + y_pred = np.zeros_like(y_true) + + beta = 1 + # tp = [0, 0, 0] # fn = [0, 0, 0] # fp = [0, 0, 0] @@ -1236,20 +1257,6 @@ def test_precision_recall_f1_no_labels(beta, average): y_true, y_pred, beta=beta, average=None) assert_array_almost_equal(fbeta, [0, 0, 0], 2) - p, r, f, s = assert_warns(UndefinedMetricWarning, - precision_recall_fscore_support, - y_true, y_pred, average=average, - beta=beta) - assert_almost_equal(p, 0) - assert_almost_equal(r, 0) - assert_almost_equal(f, 0) - assert_equal(s, None) - - fbeta = assert_warns(UndefinedMetricWarning, fbeta_score, - y_true, y_pred, - beta=beta, average=average) - assert_almost_equal(fbeta, 0) - def test_prf_warnings(): # average of per-label scores diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index e5141ed84032c..f835fdd507761 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -196,7 +196,7 @@ # is already written. # Those metrics don't support binary inputs -METRIC_UNDEFINED_BINARY = [ +METRIC_UNDEFINED_BINARY = { "samples_f0.5_score", "samples_f1_score", "samples_f2_score", @@ -212,10 +212,10 @@ "label_ranking_loss", "label_ranking_average_precision_score", -] +} # Those metrics don't support multiclass inputs -METRIC_UNDEFINED_MULTICLASS = [ +METRIC_UNDEFINED_MULTICLASS = { "brier_score_loss", "balanced_accuracy_score", @@ -232,24 +232,24 @@ "f1_score", "f2_score", "f0.5_score", -] +} # Metric undefined with "binary" or "multiclass" input -METRIC_UNDEFINED_BINARY_MULTICLASS = set(METRIC_UNDEFINED_BINARY).union( - set(METRIC_UNDEFINED_MULTICLASS)) +METRIC_UNDEFINED_BINARY_MULTICLASS = METRIC_UNDEFINED_BINARY.union( + METRIC_UNDEFINED_MULTICLASS) # Metrics with an "average" argument -METRICS_WITH_AVERAGING = [ +METRICS_WITH_AVERAGING = { "precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score" -] +} # Threshold-based metrics with an "average" argument -THRESHOLDED_METRICS_WITH_AVERAGING = [ +THRESHOLDED_METRICS_WITH_AVERAGING = { "roc_auc_score", "average_precision_score", "partial_roc_auc", -] +} # Metrics with a "pos_label" argument -METRICS_WITH_POS_LABEL = [ +METRICS_WITH_POS_LABEL = { "roc_curve", "brier_score_loss", @@ -265,12 +265,12 @@ "macro_f0.5_score", "macro_f1_score", "macro_f2_score", "macro_precision_score", "macro_recall_score", -] +} # Metrics with a "labels" argument # TODO: Handle multi_class metrics that has a labels argument as well as a # decision function argument. 
e.g hinge_loss -METRICS_WITH_LABELS = [ +METRICS_WITH_LABELS = { "confusion_matrix", "hamming_loss", @@ -287,17 +287,17 @@ "macro_precision_score", "macro_recall_score", "cohen_kappa_score", -] +} # Metrics with a "normalize" option -METRICS_WITH_NORMALIZE_OPTION = [ +METRICS_WITH_NORMALIZE_OPTION = { "accuracy_score", "jaccard_similarity_score", "zero_one_loss", -] +} # Threshold-based metrics with "multilabel-indicator" format support -THRESHOLDED_MULTILABEL_METRICS = [ +THRESHOLDED_MULTILABEL_METRICS = { "log_loss", "unnormalized_log_loss", @@ -310,10 +310,10 @@ "coverage_error", "label_ranking_loss", "label_ranking_average_precision_score", -] +} # Classification metrics with "multilabel-indicator" format -MULTILABELS_METRICS = [ +MULTILABELS_METRICS = { "accuracy_score", "unnormalized_accuracy_score", "hamming_loss", "jaccard_similarity_score", "unnormalized_jaccard_similarity_score", @@ -330,17 +330,17 @@ "samples_f0.5_score", "samples_f1_score", "samples_f2_score", "samples_precision_score", "samples_recall_score", -] +} # Regression metrics with "multioutput-continuous" format support -MULTIOUTPUT_METRICS = [ +MULTIOUTPUT_METRICS = { "mean_absolute_error", "mean_squared_error", "r2_score", "explained_variance_score" -] +} # Symmetric with respect to their input arguments y_true and y_pred # metric(y_true, y_pred) == metric(y_pred, y_true). -SYMMETRIC_METRICS = [ +SYMMETRIC_METRICS = { "accuracy_score", "unnormalized_accuracy_score", "hamming_loss", "jaccard_similarity_score", "unnormalized_jaccard_similarity_score", @@ -356,11 +356,11 @@ "median_absolute_error", "cohen_kappa_score", -] +} # Asymmetric with respect to their input arguments y_true and y_pred # metric(y_true, y_pred) != metric(y_pred, y_true). -NOT_SYMMETRIC_METRICS = [ +NOT_SYMMETRIC_METRICS = { "balanced_accuracy_score", "explained_variance_score", "r2_score", @@ -373,18 +373,18 @@ "macro_f0.5_score", "macro_f2_score", "macro_precision_score", "macro_recall_score", "log_loss", "hinge_loss" -] +} # No Sample weight support -METRICS_WITHOUT_SAMPLE_WEIGHT = [ +METRICS_WITHOUT_SAMPLE_WEIGHT = { "confusion_matrix", # Left this one here because the tests in this file do # not work for confusion_matrix, as its output is a # matrix instead of a number. Testing of # confusion_matrix with sample_weight is in # test_classification.py "median_absolute_error", -] +} @ignore_warnings @@ -395,13 +395,13 @@ def test_symmetry(): y_pred = random_state.randint(0, 2, size=(20, )) # We shouldn't forget any metrics - assert_equal(set(SYMMETRIC_METRICS).union( - NOT_SYMMETRIC_METRICS, THRESHOLDED_METRICS, + assert_equal(SYMMETRIC_METRICS.union( + NOT_SYMMETRIC_METRICS, set(THRESHOLDED_METRICS), METRIC_UNDEFINED_BINARY_MULTICLASS), set(ALL_METRICS)) assert_equal( - set(SYMMETRIC_METRICS).intersection(set(NOT_SYMMETRIC_METRICS)), + SYMMETRIC_METRICS.intersection(NOT_SYMMETRIC_METRICS), set([])) # Symmetric metric @@ -550,7 +550,7 @@ def test_format_invariance_with_1d_vectors(name): # NB: We do not test for y1_row, y2_row as these may be # interpreted as multilabel or multioutput data. 
- if (name not in (MULTIOUTPUT_METRICS + THRESHOLDED_MULTILABEL_METRICS + + if (name not in (MULTIOUTPUT_METRICS | THRESHOLDED_MULTILABEL_METRICS | MULTILABELS_METRICS)): assert_raises(ValueError, metric, y1_row, y2_row) @@ -697,7 +697,7 @@ def test_single_sample(name): check_single_sample(name) -@pytest.mark.parametrize('name', MULTIOUTPUT_METRICS + MULTILABELS_METRICS) +@pytest.mark.parametrize('name', MULTIOUTPUT_METRICS | MULTILABELS_METRICS) def test_single_sample_multioutput(name): check_single_sample_multioutput(name) @@ -925,7 +925,7 @@ def test_averaging_multiclass(name): @pytest.mark.parametrize( - 'name', METRICS_WITH_AVERAGING + THRESHOLDED_METRICS_WITH_AVERAGING) + 'name', METRICS_WITH_AVERAGING | THRESHOLDED_METRICS_WITH_AVERAGING) def test_averaging_multilabel(name): n_samples, n_classes = 40, 5 _, y = make_multilabel_classification(n_features=1, n_classes=n_classes, @@ -1055,8 +1055,8 @@ def check_sample_weight_invariance(name, metric, y1, y2): @pytest.mark.parametrize( 'name', - (set(ALL_METRICS).intersection(REGRESSION_METRICS) - - set(METRICS_WITHOUT_SAMPLE_WEIGHT))) + (set(ALL_METRICS).intersection(set(REGRESSION_METRICS)) + - METRICS_WITHOUT_SAMPLE_WEIGHT)) def test_regression_sample_weight_invariance(name): n_samples = 50 random_state = check_random_state(0) @@ -1070,7 +1070,7 @@ def test_regression_sample_weight_invariance(name): @pytest.mark.parametrize( 'name', (set(ALL_METRICS) - set(REGRESSION_METRICS) - - set(METRICS_WITHOUT_SAMPLE_WEIGHT) - set(METRIC_UNDEFINED_BINARY))) + - METRICS_WITHOUT_SAMPLE_WEIGHT - METRIC_UNDEFINED_BINARY)) def test_binary_sample_weight_invariance(name): # binary n_samples = 50 @@ -1088,7 +1088,7 @@ def test_binary_sample_weight_invariance(name): @pytest.mark.parametrize( 'name', (set(ALL_METRICS) - set(REGRESSION_METRICS) - - set(METRICS_WITHOUT_SAMPLE_WEIGHT) + - METRICS_WITHOUT_SAMPLE_WEIGHT - METRIC_UNDEFINED_BINARY_MULTICLASS)) def test_multiclass_sample_weight_invariance(name): # multiclass @@ -1106,8 +1106,8 @@ def test_multiclass_sample_weight_invariance(name): @pytest.mark.parametrize( 'name', - set(MULTILABELS_METRICS + THRESHOLDED_MULTILABEL_METRICS + - MULTIOUTPUT_METRICS) - set(METRICS_WITHOUT_SAMPLE_WEIGHT)) + (MULTILABELS_METRICS | THRESHOLDED_MULTILABEL_METRICS | + MULTIOUTPUT_METRICS) - METRICS_WITHOUT_SAMPLE_WEIGHT) def test_multilabel_sample_weight_invariance(name): # multilabel indicator random_state = check_random_state(0)
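The last two patches repeat a single transformation across sklearn/metrics/tests: nose-style generator tests that yield (check, args) tuples become pytest.mark.parametrize'd tests, and the metric-name collections become sets so the parametrize argument can be expressed with set algebra. A condensed sketch of that pattern, using made-up registry names rather than the real ALL_METRICS/REGRESSION_METRICS dictionaries:

import pytest

# Hypothetical stand-ins for the metric registries in test_common.py.
ALL_CHECKS = {"accuracy", "recall", "r2"}
UNDEFINED_FOR_BINARY = {"r2"}


def check_metric(name):
    # Stand-in for a check_* helper; a real one would call the metric and
    # assert on its output.
    assert isinstance(name, str)


# Old nose style: one generator test yielding a case per metric name.
#
# def test_metrics():
#     for name in ALL_CHECKS:
#         if name in UNDEFINED_FOR_BINARY:
#             continue
#         yield check_metric, name

# New pytest style: the loop and its skip condition move into the
# parametrize argument as plain set arithmetic.
@pytest.mark.parametrize("name", sorted(ALL_CHECKS - UNDEFINED_FOR_BINARY))
def test_metrics(name):
    check_metric(name)

Each metric now gets its own test id, so one failing parametrization no longer hides the remaining cases the way an early exception inside the yield loop did.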