From 628cec17a822db05514a1f6a0d0b0e77da021bd0 Mon Sep 17 00:00:00 2001
From: Tim Bicker
Date: Tue, 11 Jun 2019 11:39:48 +0200
Subject: [PATCH] refactor tests

---
 sklearn/tests/test_naive_bayes.py | 362 +++++++++++++++---------------
 1 file changed, 178 insertions(+), 184 deletions(-)

diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py
index 684d157617900..77ebb0125529f 100644
--- a/sklearn/tests/test_naive_bayes.py
+++ b/sklearn/tests/test_naive_bayes.py
@@ -117,7 +117,7 @@ def test_gnb_priors():
 def test_gnb_priors_sum_isclose():
     # test whether the class prior sum is properly tested"""
     X = np.array([[-1, -1], [-2, -1], [-3, -2], [-4, -5], [-5, -4],
-                 [1, 1], [2, 1], [3, 2], [4, 4], [5, 5]])
+                  [1, 1], [2, 1], [3, 2], [4, 4], [5, 5]])
     priors = np.array([0.08, 0.14, 0.03, 0.16, 0.11, 0.16, 0.07, 0.14,
                        0.11, 0.0])
     Y = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
@@ -146,7 +146,7 @@ def test_gnb_prior_large_bias():
     assert_equal(clf.predict([[-0.1, -0.1]]), np.array([2]))
 
 
-def test_check_update_with_no_data():
+def test_gnb_check_update_with_no_data():
     """ Test when the partial fit is called without any data"""
     # Create an empty array
     prev_points = 100
@@ -169,126 +169,52 @@ def test_gnb_pfit_wrong_nb_features():
     assert_raises(ValueError, clf.partial_fit, np.hstack((X, X)), y)
 
 
-def test_discrete_prior():
-    # Test whether class priors are properly set.
-    for cls in [BernoulliNB, MultinomialNB]:
-        clf = cls().fit(X2, y2)
-        assert_array_almost_equal(np.log(np.array([2, 2, 2]) / 6.0),
-                                  clf.class_log_prior_, 8)
-
-
-@pytest.mark.parametrize('kind', ('dense', 'sparse'))
-def test_mnnb(kind):
-    # Test Multinomial Naive Bayes classification.
-    # This checks that MultinomialNB implements fit and predict and returns
-    # correct values for a simple toy dataset.
-
-    if kind == 'dense':
-        X = X2
-    elif kind == 'sparse':
-        X = scipy.sparse.csr_matrix(X2)
-
-    # Check the ability to predict the learning set.
-    clf = MultinomialNB()
-    assert_raises(ValueError, clf.fit, -X, y2)
-    y_pred = clf.fit(X, y2).predict(X)
-
-    assert_array_equal(y_pred, y2)
-
-    # Verify that np.log(clf.predict_proba(X)) gives the same results as
-    # clf.predict_log_proba(X)
-    y_pred_proba = clf.predict_proba(X)
-    y_pred_log_proba = clf.predict_log_proba(X)
-    assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8)
+def test_gnb_partial_fit():
+    clf = GaussianNB().fit(X, y)
+    clf_pf = GaussianNB().partial_fit(X, y, np.unique(y))
+    assert_array_almost_equal(clf.theta_, clf_pf.theta_)
+    assert_array_almost_equal(clf.sigma_, clf_pf.sigma_)
+    assert_array_almost_equal(clf.class_prior_, clf_pf.class_prior_)
 
-    # Check that incremental fitting yields the same results
-    clf2 = MultinomialNB()
-    clf2.partial_fit(X[:2], y2[:2], classes=np.unique(y2))
-    clf2.partial_fit(X[2:5], y2[2:5])
-    clf2.partial_fit(X[5:], y2[5:])
+    clf_pf2 = GaussianNB().partial_fit(X[0::2, :], y[0::2], np.unique(y))
+    clf_pf2.partial_fit(X[1::2], y[1::2])
+    assert_array_almost_equal(clf.theta_, clf_pf2.theta_)
+    assert_array_almost_equal(clf.sigma_, clf_pf2.sigma_)
+    assert_array_almost_equal(clf.class_prior_, clf_pf2.class_prior_)
 
-    y_pred2 = clf2.predict(X)
-    assert_array_equal(y_pred2, y2)
-    y_pred_proba2 = clf2.predict_proba(X)
-    y_pred_log_proba2 = clf2.predict_log_proba(X)
-    assert_array_almost_equal(np.log(y_pred_proba2), y_pred_log_proba2, 8)
-    assert_array_almost_equal(y_pred_proba2, y_pred_proba)
-    assert_array_almost_equal(y_pred_log_proba2, y_pred_log_proba)
+def test_gnb_naive_bayes_scale_invariance():
+    # Scaling the data should not change the prediction results
+    iris = load_iris()
+    X, y = iris.data, iris.target
+    labels = [GaussianNB().fit(f * X, y).predict(f * X)
+              for f in [1E-10, 1, 1E10]]
+    assert_array_equal(labels[0], labels[1])
+    assert_array_equal(labels[1], labels[2])
 
-    # Partial fit on the whole data at once should be the same as fit too
-    clf3 = MultinomialNB()
-    clf3.partial_fit(X, y2, classes=np.unique(y2))
-
-    y_pred3 = clf3.predict(X)
-    assert_array_equal(y_pred3, y2)
-    y_pred_proba3 = clf3.predict_proba(X)
-    y_pred_log_proba3 = clf3.predict_log_proba(X)
-    assert_array_almost_equal(np.log(y_pred_proba3), y_pred_log_proba3, 8)
-    assert_array_almost_equal(y_pred_proba3, y_pred_proba)
-    assert_array_almost_equal(y_pred_log_proba3, y_pred_log_proba)
 
+@pytest.mark.parametrize("cls", [MultinomialNB, BernoulliNB])
+def test_discretenb_prior(cls):
+    # Test whether class priors are properly set.
+    clf = cls().fit(X2, y2)
+    assert_array_almost_equal(np.log(np.array([2, 2, 2]) / 6.0),
+                              clf.class_log_prior_, 8)
 
 
-def check_partial_fit(cls):
+@pytest.mark.parametrize("cls", [MultinomialNB, BernoulliNB])
+def test_discretenb_partial_fit(cls):
     clf1 = cls()
-    clf1.fit([[0, 1], [1, 0]], [0, 1])
+    clf1.fit([[0, 1], [1, 0], [1, 1]], [0, 1, 1])
 
     clf2 = cls()
-    clf2.partial_fit([[0, 1], [1, 0]], [0, 1], classes=[0, 1])
+    clf2.partial_fit([[0, 1], [1, 0], [1, 1]], [0, 1, 1], classes=[0, 1])
     assert_array_equal(clf1.class_count_, clf2.class_count_)
-    assert_array_equal(clf1.feature_count_, clf2.feature_count_)
 
     clf3 = cls()
     clf3.partial_fit([[0, 1]], [0], classes=[0, 1])
     clf3.partial_fit([[1, 0]], [1])
+    clf3.partial_fit([[1, 1]], [1])
     assert_array_equal(clf1.class_count_, clf3.class_count_)
-    assert_array_equal(clf1.feature_count_, clf3.feature_count_)
-
-
-def test_mnb_prior_unobserved_targets():
-    # test smoothing of prior for yet unobserved targets
-
-    # Create toy training data
-    X = np.array([[0, 1], [1, 0]])
-    y = np.array([0, 1])
-
-    clf = MultinomialNB()
-
-    assert_no_warnings(
-        clf.partial_fit, X, y, classes=[0, 1, 2]
-    )
-
-    assert clf.predict([[0, 1]]) == 0
-    assert clf.predict([[1, 0]]) == 1
-    assert clf.predict([[1, 1]]) == 0
-
-    # add a training example with previously unobserved class
-    assert_no_warnings(
-        clf.partial_fit, [[1, 1]], [2]
-    )
-
-    assert clf.predict([[0, 1]]) == 0
-    assert clf.predict([[1, 0]]) == 1
-    assert clf.predict([[1, 1]]) == 2
-
-
-@pytest.mark.parametrize("cls", [MultinomialNB, BernoulliNB])
-def test_discretenb_partial_fit(cls):
-    check_partial_fit(cls)
-
-
-def test_gnb_partial_fit():
-    clf = GaussianNB().fit(X, y)
-    clf_pf = GaussianNB().partial_fit(X, y, np.unique(y))
-    assert_array_almost_equal(clf.theta_, clf_pf.theta_)
-    assert_array_almost_equal(clf.sigma_, clf_pf.sigma_)
-    assert_array_almost_equal(clf.class_prior_, clf_pf.class_prior_)
-
-    clf_pf2 = GaussianNB().partial_fit(X[0::2, :], y[0::2], np.unique(y))
-    clf_pf2.partial_fit(X[1::2], y[1::2])
-    assert_array_almost_equal(clf.theta_, clf_pf2.theta_)
-    assert_array_almost_equal(clf.sigma_, clf_pf2.sigma_)
-    assert_array_almost_equal(clf.class_prior_, clf_pf2.class_prior_)
@@ -317,7 +243,7 @@ def test_discretenb_pickle(cls):
 
 
 @pytest.mark.parametrize('cls', [BernoulliNB, MultinomialNB, GaussianNB])
-def test_input_check_fit(cls):
+def test_discretenb_input_check_fit(cls):
     # Test input checks for the fit method
 
     # check shape consistency for number of samples at fit time
@@ -329,7 +255,7 @@ def test_input_check_fit(cls):
 
 
 @pytest.mark.parametrize('cls', [BernoulliNB, MultinomialNB])
-def test_input_check_partial_fit(cls):
+def test_discretenb_input_check_partial_fit(cls):
     # check shape consistency
     assert_raises(ValueError, cls().partial_fit, X2, y2[:-1],
                   classes=np.unique(y2))
@@ -359,7 +285,7 @@ def test_discretenb_predict_proba():
     X_multinomial = [[0, 1], [1, 3], [4, 0]]
 
     # test binary case (1-d output)
-    y = [0, 0, 2] # 2 is regression test for binary case, 02e673
+    y = [0, 0, 2]  # 2 is regression test for binary case, 02e673
     for cls, X in zip([BernoulliNB, MultinomialNB],
                       [X_bernoulli, X_multinomial]):
         clf = cls().fit(X, y)
@@ -429,12 +355,8 @@ def test_discretenb_provide_prior_with_partial_fit(cls):
 
 
 @pytest.mark.parametrize('cls', [BernoulliNB, MultinomialNB])
-def test_sample_weight_multiclass(cls):
+def test_discretenb_sample_weight_multiclass(cls):
     # check shape consistency for number of samples at fit time
-    check_sample_weight_multiclass(cls)
-
-
-def check_sample_weight_multiclass(cls):
     X = [
         [0, 0, 1],
         [0, 1, 1],
         [0, 1, 1],
         [1, 0, 0],
     ]
     y = [0, 1, 1, 2]
     sample_weight = np.array([1, 1, 2, 2], dtype=np.float64)
     sample_weight /= sample_weight.sum()
@@ -456,84 +378,107 @@ def check_sample_weight_multiclass(cls):
     clf = cls().fit(X, y, sample_weight=sample_weight)
     assert_array_equal(clf.predict(X), [0, 1, 1, 2])
 
 
-def test_sample_weight_mnb():
-    clf = MultinomialNB()
-    clf.fit([[1, 2], [1, 2], [1, 0]],
-            [0, 0, 1],
-            sample_weight=[1, 1, 4])
-    assert_array_equal(clf.predict([[1, 0]]), [1])
-    positive_prior = np.exp(clf.intercept_[0])
-    assert_array_almost_equal([1 - positive_prior, positive_prior],
-                              [1 / 3., 2 / 3.])
-
-
-def test_coef_intercept_shape():
+@pytest.mark.parametrize('cls', [BernoulliNB, MultinomialNB])
+def test_discretenb_coef_intercept_shape(cls):
     # coef_ and intercept_ should have shapes as in other linear models.
     # Non-regression test for issue #2127.
     X = [[1, 0, 0], [1, 1, 1]]
     y = [1, 2]  # binary classification
+    clf = cls()
 
-    for clf in [MultinomialNB(), BernoulliNB()]:
-        clf.fit(X, y)
-        assert_equal(clf.coef_.shape, (1, 3))
-        assert_equal(clf.intercept_.shape, (1,))
+    clf.fit(X, y)
+    assert_equal(clf.coef_.shape, (1, 3))
+    assert_equal(clf.intercept_.shape, (1,))
 
 
-def test_check_accuracy_on_digits():
-    # Non regression test to make sure that any further refactoring / optim
-    # of the NB models do not harm the performance on a slightly non-linearly
-    # separable dataset
-    digits = load_digits()
-    X, y = digits.data, digits.target
-    binary_3v8 = np.logical_or(digits.target == 3, digits.target == 8)
-    X_3v8, y_3v8 = X[binary_3v8], y[binary_3v8]
+@pytest.mark.parametrize('kind', ('dense', 'sparse'))
+def test_mnnb(kind):
+    # Test Multinomial Naive Bayes classification.
+    # This checks that MultinomialNB implements fit and predict and returns
+    # correct values for a simple toy dataset.
 
-    # Multinomial NB
-    scores = cross_val_score(MultinomialNB(alpha=10), X, y, cv=10)
-    assert_greater(scores.mean(), 0.86)
+    if kind == 'dense':
+        X = X2
+    elif kind == 'sparse':
+        X = scipy.sparse.csr_matrix(X2)
 
-    scores = cross_val_score(MultinomialNB(alpha=10), X_3v8, y_3v8, cv=10)
-    assert_greater(scores.mean(), 0.94)
+    # Check the ability to predict the learning set.
+    clf = MultinomialNB()
+    assert_raises(ValueError, clf.fit, -X, y2)
+    y_pred = clf.fit(X, y2).predict(X)
 
-    # Bernoulli NB
-    scores = cross_val_score(BernoulliNB(alpha=10), X > 4, y, cv=10)
-    assert_greater(scores.mean(), 0.83)
+    assert_array_equal(y_pred, y2)
 
-    scores = cross_val_score(BernoulliNB(alpha=10), X_3v8 > 4, y_3v8, cv=10)
-    assert_greater(scores.mean(), 0.92)
+    # Verify that np.log(clf.predict_proba(X)) gives the same results as
+    # clf.predict_log_proba(X)
+    y_pred_proba = clf.predict_proba(X)
+    y_pred_log_proba = clf.predict_log_proba(X)
+    assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8)
 
-    # Gaussian NB
-    scores = cross_val_score(GaussianNB(), X, y, cv=10)
-    assert_greater(scores.mean(), 0.77)
+    # Check that incremental fitting yields the same results
+    clf2 = MultinomialNB()
+    clf2.partial_fit(X[:2], y2[:2], classes=np.unique(y2))
+    clf2.partial_fit(X[2:5], y2[2:5])
+    clf2.partial_fit(X[5:], y2[5:])
 
-    scores = cross_val_score(GaussianNB(var_smoothing=0.1), X, y, cv=10)
-    assert_greater(scores.mean(), 0.89)
+    y_pred2 = clf2.predict(X)
+    assert_array_equal(y_pred2, y2)
 
-    scores = cross_val_score(GaussianNB(), X_3v8, y_3v8, cv=10)
-    assert_greater(scores.mean(), 0.86)
+    y_pred_proba2 = clf2.predict_proba(X)
+    y_pred_log_proba2 = clf2.predict_log_proba(X)
+    assert_array_almost_equal(np.log(y_pred_proba2), y_pred_log_proba2, 8)
+    assert_array_almost_equal(y_pred_proba2, y_pred_proba)
+    assert_array_almost_equal(y_pred_log_proba2, y_pred_log_proba)
 
+    # Partial fit on the whole data at once should be the same as fit too
+    clf3 = MultinomialNB()
+    clf3.partial_fit(X, y2, classes=np.unique(y2))
 
-def test_feature_log_prob_bnb():
-    # Test for issue #4268.
-    # Tests that the feature log prob value computed by BernoulliNB when
-    # alpha=1.0 is equal to the expression given in Manning, Raghavan,
-    # and Schuetze's "Introduction to Information Retrieval" book:
-    # https://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html
+    y_pred3 = clf3.predict(X)
+    assert_array_equal(y_pred3, y2)
+    y_pred_proba3 = clf3.predict_proba(X)
+    y_pred_log_proba3 = clf3.predict_log_proba(X)
+    assert_array_almost_equal(np.log(y_pred_proba3), y_pred_log_proba3, 8)
+    assert_array_almost_equal(y_pred_proba3, y_pred_proba)
+    assert_array_almost_equal(y_pred_log_proba3, y_pred_log_proba)
 
-    X = np.array([[0, 0, 0], [1, 1, 0], [0, 1, 0], [1, 0, 1], [0, 1, 0]])
-    Y = np.array([0, 0, 1, 2, 2])
 
-    # Fit Bernoulli NB w/ alpha = 1.0
-    clf = BernoulliNB(alpha=1.0)
-    clf.fit(X, Y)
+def test_mnb_prior_unobserved_targets():
+    # test smoothing of prior for yet unobserved targets
 
-    # Manually form the (log) numerator and denominator that
-    # constitute P(feature presence | class)
-    num = np.log(clf.feature_count_ + 1.0)
-    denom = np.tile(np.log(clf.class_count_ + 2.0), (X.shape[1], 1)).T
+    # Create toy training data
+    X = np.array([[0, 1], [1, 0]])
+    y = np.array([0, 1])
 
-    # Check manual estimate matches
-    assert_array_almost_equal(clf.feature_log_prob_, (num - denom))
+    clf = MultinomialNB()
+
+    assert_no_warnings(
+        clf.partial_fit, X, y, classes=[0, 1, 2]
+    )
+
+    assert clf.predict([[0, 1]]) == 0
+    assert clf.predict([[1, 0]]) == 1
+    assert clf.predict([[1, 1]]) == 0
+
+    # add a training example with previously unobserved class
+    assert_no_warnings(
+        clf.partial_fit, [[1, 1]], [2]
+    )
+
+    assert clf.predict([[0, 1]]) == 0
+    assert clf.predict([[1, 0]]) == 1
+    assert clf.predict([[1, 1]]) == 2
+
+
+def test_mnb_sample_weight():
+    clf = MultinomialNB()
+    clf.fit([[1, 2], [1, 2], [1, 0]],
+            [0, 0, 1],
+            sample_weight=[1, 1, 4])
+    assert_array_equal(clf.predict([[1, 0]]), [1])
+    positive_prior = np.exp(clf.intercept_[0])
+    assert_array_almost_equal([1 - positive_prior, positive_prior],
+                              [1 / 3., 2 / 3.])
 
 
 def test_bnb():
@@ -567,7 +512,8 @@ def test_bnb():
 
     # Check the feature probabilities are correct
     feature_prob = np.array([[0.4, 0.8, 0.2, 0.4, 0.4, 0.2],
-                             [1/3.0, 2/3.0, 2/3.0, 1/3.0, 1/3.0, 2/3.0]])
+                             [1 / 3.0, 2 / 3.0, 2 / 3.0, 1 / 3.0, 1 / 3.0,
+                              2 / 3.0]])
     assert_array_almost_equal(np.exp(clf.feature_log_prob_), feature_prob)
 
     # Testing data point is:
@@ -581,6 +527,29 @@ def test_bnb():
     assert_array_almost_equal(clf.predict_proba(X_test), predict_proba)
 
 
+def test_bnb_feature_log_prob():
+    # Test for issue #4268.
+    # Tests that the feature log prob value computed by BernoulliNB when
+    # alpha=1.0 is equal to the expression given in Manning, Raghavan,
+    # and Schuetze's "Introduction to Information Retrieval" book:
+    # http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html
+
+    X = np.array([[0, 0, 0], [1, 1, 0], [0, 1, 0], [1, 0, 1], [0, 1, 0]])
+    Y = np.array([0, 0, 1, 2, 2])
+
+    # Fit Bernoulli NB w/ alpha = 1.0
+    clf = BernoulliNB(alpha=1.0)
+    clf.fit(X, Y)
+
+    # Manually form the (log) numerator and denominator that
+    # constitute P(feature presence | class)
+    num = np.log(clf.feature_count_ + 1.0)
+    denom = np.tile(np.log(clf.class_count_ + 2.0), (X.shape[1], 1)).T
+
+    # Check manual estimate matches
+    assert_array_almost_equal(clf.feature_log_prob_, (num - denom))
+
+
 def test_cnb():
     # Tests ComplementNB when alpha=1.0 for the toy example in Manning,
     # Raghavan, and Schuetze's "Introduction to Information Retrieval" book:
@@ -647,15 +616,6 @@ def test_cnb():
     assert_array_almost_equal(clf.feature_log_prob_, normed_weights)
 
 
-def test_naive_bayes_scale_invariance():
-    # Scaling the data should not change the prediction results
-    iris = load_iris()
-    X, y = iris.data, iris.target
-    labels = [GaussianNB().fit(f * X, y).predict(f * X)
-              for f in [1E-10, 1, 1E10]]
-    assert_array_equal(labels[0], labels[1])
-    assert_array_equal(labels[1], labels[2])
-
 
 def test_alpha():
     # Setting alpha=0 should not output nan results when p(x_i|y_j)=0 is a case
@@ -670,7 +630,7 @@ def test_alpha():
     nb = MultinomialNB(alpha=0.)
     assert_warns(UserWarning, nb.partial_fit, X, y, classes=[0, 1])
     assert_warns(UserWarning, nb.fit, X, y)
-    prob = np.array([[2./3, 1./3], [0, 1]])
+    prob = np.array([[2. / 3, 1. / 3], [0, 1]])
     assert_array_almost_equal(nb.predict_proba(X), prob)
 
     # Test sparse X
@@ -682,7 +642,7 @@ def test_alpha():
     nb = MultinomialNB(alpha=0.)
     assert_warns(UserWarning, nb.fit, X, y)
-    prob = np.array([[2./3, 1./3], [0, 1]])
+    prob = np.array([[2. / 3, 1. / 3], [0, 1]])
     assert_array_almost_equal(nb.predict_proba(X), prob)
 
     # Test for alpha < 0
@@ -743,3 +703,37 @@ def test_alpha_vector():
     expected_msg = ('alpha should be a scalar or a numpy array '
                     'with shape [n_features]')
     assert_raise_message(ValueError, expected_msg, m_nb.fit, X, y)
+
+
+def test_check_accuracy_on_digits():
+    # Non regression test to make sure that any further refactoring / optim
+    # of the NB models do not harm the performance on a slightly non-linearly
+    # separable dataset
+    digits = load_digits()
+    X, y = digits.data, digits.target
+    binary_3v8 = np.logical_or(digits.target == 3, digits.target == 8)
+    X_3v8, y_3v8 = X[binary_3v8], y[binary_3v8]
+
+    # Multinomial NB
+    scores = cross_val_score(MultinomialNB(alpha=10), X, y, cv=10)
+    assert_greater(scores.mean(), 0.86)
+
+    scores = cross_val_score(MultinomialNB(alpha=10), X_3v8, y_3v8, cv=10)
+    assert_greater(scores.mean(), 0.94)
+
+    # Bernoulli NB
+    scores = cross_val_score(BernoulliNB(alpha=10), X > 4, y, cv=10)
+    assert_greater(scores.mean(), 0.83)
+
+    scores = cross_val_score(BernoulliNB(alpha=10), X_3v8 > 4, y_3v8, cv=10)
+    assert_greater(scores.mean(), 0.92)
+
+    # Gaussian NB
+    scores = cross_val_score(GaussianNB(), X, y, cv=10)
+    assert_greater(scores.mean(), 0.77)
+
+    scores = cross_val_score(GaussianNB(var_smoothing=0.1), X, y, cv=10)
+    assert_greater(scores.mean(), 0.89)
+
+    scores = cross_val_score(GaussianNB(), X_3v8, y_3v8, cv=10)
+    assert_greater(scores.mean(), 0.86)
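
Aside (not part of the patch): the whole refactor above repeats one pattern — standalone check_* helpers and for-loops over estimator classes are folded into pytest-parametrized tests, and test names gain a prefix (gnb_, discretenb_, mnb_, bnb_) for the estimator family they cover. A minimal sketch of that pattern, assuming pytest and scikit-learn are installed; the test name and toy data below are illustrative, not taken from the patch:

    import pytest
    from sklearn.naive_bayes import BernoulliNB, MultinomialNB

    # Parametrizing over the classes makes pytest collect one test case per
    # estimator (reported with ids like [MultinomialNB] and [BernoulliNB]),
    # so a failure in one estimator is reported separately instead of
    # aborting a shared for-loop body, as the pre-refactor helpers did.
    @pytest.mark.parametrize("cls", [MultinomialNB, BernoulliNB])
    def test_discretenb_smoke(cls):
        # Two one-hot samples, one per class; both models should learn them.
        clf = cls().fit([[0, 1], [1, 0]], [0, 1])
        assert clf.predict([[0, 1]])[0] == 0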