[MRG+2] Fix LDA predict_proba() by agamemnonc · Pull Request #11796 · scikit-learn/scikit-learn · GitHub

[MRG+2] Fix LDA predict_proba() #11796


Merged · 26 commits · Mar 7, 2019

Commits:
206f1de  fix LDA predict_proba() to handle binary and multi-class case (agamemnonc, Aug 11, 2018)
824e0cc  test_lda_predict_proba non-regression test (agamemnonc, Aug 21, 2018)
6560db8  pep8 fix (agamemnonc, Aug 21, 2018)
c6e2b62  lda predict_proba refactoring (agamemnonc, Dec 5, 2018)
d7a1226  Typo fix (agamemnonc, Dec 5, 2018)
163683e  Merge master & predict_proba() fix (test previously failed) (agamemnonc, Jan 15, 2019)
85d45b0  flake8 fix (agamemnonc, Jan 16, 2019)
15b59e7  predict_proba check_is_fitted check (agamemnonc, Jan 16, 2019)
cac71cc  update what's new rst file (agamemnonc, Jan 17, 2019)
6626945  Merge branch 'master' of git://github.com/scikit-learn/scikit-learn i… (agamemnonc, Feb 7, 2019)
8144c1a  rename prob to decision (agamemnonc, Feb 7, 2019)
2cf19a5  include additional tests for predict_proba (agamemnonc, Feb 7, 2019)
67ad73e  use allcose vs. assert_array_almost_equal (agamemnonc, Feb 7, 2019)
cb5abc1  fix indent (agamemnonc, Feb 7, 2019)
ef6a570  replace len with size (agamemnonc, Feb 11, 2019)
92f9aa2  explicit computation for binary case (agamemnonc, Feb 11, 2019)
7320883  fix style whats_new rst (agamemnonc, Mar 1, 2019)
9613fea  predict_proba new regression test (agamemnonc, Mar 1, 2019)
55e3d2a  give credit for regression test (agamemnonc, Mar 1, 2019)
2c27e9b  Merge branch 'master' into lda_predict_proba_fix (agamemnonc, Mar 1, 2019)
ce85441  Merge branch 'master' into lda_predict_proba_fix (agamemnonc, Mar 1, 2019)
9d198b1  fix bug for eigen solution (agamemnonc, Mar 4, 2019)
bd4370e  include all three solvers in predict_proba regression test (agamemnonc, Mar 4, 2019)
480e108  update whats_new rst file (agamemnonc, Mar 4, 2019)
06e8572  fix minor formatting issue (agamemnonc, Mar 4, 2019)
625c3f6  use scipy.linalg instead of np.linalg (agamemnonc, Mar 6, 2019)
14 changes: 14 additions & 0 deletions doc/whats_new/v0.21.rst
@@ -17,6 +17,10 @@ parameters, may produce different models from the previous version. This often
 occurs due to changes in the modelling logic (bug fixes or enhancements), or in
 random sampling procedures.

+- :class:`discriminant_analysis.LinearDiscriminantAnalysis` for multiclass
+  classification. |Fix|
+- :class:`discriminant_analysis.LinearDiscriminantAnalysis` with 'eigen'
+  solver. |Fix|
 - :class:`linear_model.BayesianRidge` |Fix|
 - Decision trees and derived ensembles when both `max_depth` and
   `max_leaf_nodes` are set. |Fix|
@@ -107,6 +111,16 @@ Support for Python 3.4 and below has been officially dropped.
   Previously the change was made, but silently. :issue:`11526` by
   :user:`William de Vazelhes<wdevazelhes>`.

+- |Fix| Fixed a bug in :class:`discriminant_analysis.LinearDiscriminantAnalysis`
+  where the predicted probabilities would be incorrectly computed in the
+  multiclass case. :issue:`6848`, by :user:`Agamemnon Krasoulis
+  <agamemnonc>` and `Guillaume Lemaitre <glemaitre>`.
+
+- |Fix| Fixed a bug in :class:`discriminant_analysis.LinearDiscriminantAnalysis`
+  where the predicted probabilities would be incorrectly computed with ``eigen``
+  solver. :issue:`11727`, by :user:`Agamemnon Krasoulis
+  <agamemnonc>`.
+
 :mod:`sklearn.dummy`
 ....................

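Both changelog entries describe the same user-visible symptom: predict_proba output that depended on the solver and, in the multiclass case, did not match the softmax of the decision values. A rough sketch of the behaviour this PR establishes (a hypothetical check written for this summary, not part of the diff; it assumes a scikit-learn build that includes the fix):

```python
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Three well-separated Gaussian blobs -> a multiclass problem.
X, y = make_blobs(n_samples=300, centers=3, random_state=0)

probas = []
for solver in ("svd", "lsqr", "eigen"):
    lda = LinearDiscriminantAnalysis(solver=solver).fit(X, y)
    proba = lda.predict_proba(X)
    # each row is a probability distribution over the three classes
    assert np.allclose(proba.sum(axis=1), 1.0)
    probas.append(proba)

# all three solvers fit the same model, so after the fix their
# probabilities should agree up to small numerical differences
assert np.allclose(probas[0], probas[1], atol=1e-4)
assert np.allclose(probas[0], probas[2], atol=1e-4)
```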
16 changes: 8 additions & 8 deletions sklearn/discriminant_analysis.py
@@ -22,6 +22,7 @@
 from .utils import check_array, check_X_y
 from .utils.validation import check_is_fitted
 from .utils.multiclass import check_classification_targets
+from .utils.extmath import softmax
 from .preprocessing import StandardScaler


@@ -338,7 +339,6 @@ class scatter). This solver supports both classification and
         self.explained_variance_ratio_ = np.sort(evals / np.sum(evals)
                                                  )[::-1][:self._max_components]
         evecs = evecs[:, np.argsort(evals)[::-1]]  # sort eigenvectors
-        evecs /= np.linalg.norm(evecs, axis=0)

         self.scalings_ = evecs
         self.coef_ = np.dot(self.means_, evecs).dot(evecs.T)
@@ -531,14 +531,14 @@ def predict_proba(self, X):
         C : array, shape (n_samples, n_classes)
             Estimated probabilities.
         """
-        prob = self.decision_function(X)
-        expit(prob, out=prob)
-        if len(self.classes_) == 2:  # binary case
-            return np.column_stack([1 - prob, prob])
+        check_is_fitted(self, 'classes_')
+
+        decision = self.decision_function(X)
+        if self.classes_.size == 2:
+            proba = expit(decision)
+            return np.vstack([1 - proba, proba]).T
         else:
-            # OvR normalization, like LibLinear's predict_probability
-            prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
-            return prob
+            return softmax(decision)

     def predict_log_proba(self, X):
         """Estimate log probability.
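For context on the rewritten predict_proba: in the binary case decision_function returns a single score per sample, the log-odds of the positive class, so expit maps it directly to a probability; in the multiclass case it returns one score per class, equal to the log-posteriors up to a shared additive constant, so softmax recovers the posteriors. A minimal NumPy sketch of the same two branches (an illustration of the logic, not scikit-learn's internal code; the real implementation uses sklearn.utils.extmath.softmax):

```python
import numpy as np
from scipy.special import expit


def proba_from_decision(decision):
    """Map LDA decision values to probabilities, as in the fixed code.

    decision: decision_function(X) output, of shape (n_samples,) in the
    binary case and (n_samples, n_classes) otherwise.
    """
    if decision.ndim == 1:
        # binary: decision is the log-odds of the second class
        proba_pos = expit(decision)
        return np.column_stack([1 - proba_pos, proba_pos])
    # multiclass: numerically stable softmax over the class scores
    shifted = decision - decision.max(axis=1, keepdims=True)
    np.exp(shifted, out=shifted)
    shifted /= shifted.sum(axis=1, keepdims=True)
    return shifted
```

The removed branch instead applied expit to every class score and renormalised each row (an OvR-style heuristic borrowed from LibLinear); that transformation is not the softmax of the decision values, which is exactly the multiclass distortion reported in #6848.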
76 changes: 74 additions & 2 deletions sklearn/tests/test_discriminant_analysis.py
@@ -2,6 +2,9 @@

 import pytest

+from numpy.testing import assert_allclose
+from scipy import linalg
+
 from sklearn.exceptions import ChangedBehaviorWarning
 from sklearn.utils import check_random_state
 from sklearn.utils.testing import (assert_array_equal, assert_no_warnings,
@@ -95,6 +98,75 @@ def test_lda_predict():
     assert_raises(ValueError, clf.fit, X, y)


+@pytest.mark.parametrize("n_classes", [2, 3])
+@pytest.mark.parametrize("solver", ["svd", "lsqr", "eigen"])
+def test_lda_predict_proba(solver, n_classes):
+    def generate_dataset(n_samples, centers, covariances, random_state=None):
+        """Generate multivariate normal data given some centers and
+        covariances"""
+        rng = check_random_state(random_state)
+        X = np.vstack([rng.multivariate_normal(mean, cov,
+                                               size=n_samples // len(centers))
+                       for mean, cov in zip(centers, covariances)])
+        y = np.hstack([[clazz] * (n_samples // len(centers))
+                       for clazz in range(len(centers))])
+        return X, y
+
+    blob_centers = np.array([[0, 0], [-10, 40], [-30, 30]])[:n_classes]
+    blob_stds = np.array([[[10, 10], [10, 100]]] * len(blob_centers))
+    X, y = generate_dataset(
+        n_samples=90000, centers=blob_centers, covariances=blob_stds,
+        random_state=42
+    )
+    lda = LinearDiscriminantAnalysis(solver=solver, store_covariance=True,
+                                     shrinkage=None).fit(X, y)
+    # check that the empirical means and covariances are close enough to the
+    # ones used to generate the data
+    assert_allclose(lda.means_, blob_centers, atol=1e-1)
+    assert_allclose(lda.covariance_, blob_stds[0], atol=1)
+
+    # implement the method to compute the probability given in The Elements
+    # of Statistical Learning (cf. p.127, Sect. 4.4.5 "Logistic Regression
+    # or LDA?")
+    precision = linalg.inv(blob_stds[0])
+    alpha_k = []
+    alpha_k_0 = []
+    for clazz in range(len(blob_centers) - 1):
+        alpha_k.append(
+            np.dot(precision,
+                   (blob_centers[clazz] - blob_centers[-1])[:, np.newaxis]))
+        alpha_k_0.append(
+            np.dot(- 0.5 * (blob_centers[clazz] +
+                            blob_centers[-1])[np.newaxis, :], alpha_k[-1]))
+
+    sample = np.array([[-22, 22]])
+
+    def discriminant_func(sample, coef, intercept, clazz):
+        return np.exp(intercept[clazz] + np.dot(sample, coef[clazz]))
+
+    prob = np.array([float(
+        discriminant_func(sample, alpha_k, alpha_k_0, clazz) /
+        (1 + sum([discriminant_func(sample, alpha_k, alpha_k_0, clazz)
+                  for clazz in range(n_classes - 1)]))) for clazz in range(
+                      n_classes - 1)])
+
+    prob_ref = 1 - np.sum(prob)
+
+    # check the consistency of the computed probability
+    # all probabilities should sum to one
+    prob_ref_2 = float(
+        1 / (1 + sum([discriminant_func(sample, alpha_k, alpha_k_0, clazz)
+                      for clazz in range(n_classes - 1)]))
+    )
+
+    assert prob_ref == pytest.approx(prob_ref_2)
+    # check that the probabilities of LDA are close to the theoretical
+    # probabilities
+    assert_allclose(lda.predict_proba(sample),
+                    np.hstack([prob, prob_ref])[np.newaxis],
+                    atol=1e-2)
+
+
 def test_lda_priors():
     # Test priors (negative priors)
     priors = np.array([0.5, -0.5])
@@ -229,7 +301,7 @@ def test_lda_scaling():


 def test_lda_store_covariance():
-    # Test for slover 'lsqr' and 'eigen'
+    # Test for solver 'lsqr' and 'eigen'
     # 'store_covariance' has no effect on 'lsqr' and 'eigen' solvers
     for solver in ('lsqr', 'eigen'):
         clf = LinearDiscriminantAnalysis(solver=solver).fit(X6, y6)
@@ -245,7 +317,7 @@ def test_lda_store_covariance():
             np.array([[0.422222, 0.088889], [0.088889, 0.533333]])
         )

-    # Test for SVD slover, the default is to not set the covariances_ attribute
+    # Test for SVD solver, the default is to not set the covariances_ attribute
     clf = LinearDiscriminantAnalysis(solver='svd').fit(X6, y6)
     assert not hasattr(clf, 'covariance_')
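For reference, the closed-form posteriors that test_lda_predict_proba reconstructs are the ones from The Elements of Statistical Learning, Sect. 4.4.5 (transcribed below; the prior terms log(π_k/π_K) that appear in the book drop out here because the test generates balanced classes):

```latex
% Coefficients built by the test (alpha_k / alpha_k_0) for classes
% k = 1, ..., K-1 against reference class K, shared covariance \Sigma:
\alpha_k = \Sigma^{-1}(\mu_k - \mu_K), \qquad
\alpha_{k0} = -\tfrac{1}{2}(\mu_k + \mu_K)^{\top} \Sigma^{-1} (\mu_k - \mu_K)

% Posteriors compared against lda.predict_proba(sample):
P(y = k \mid x) = \frac{\exp(\alpha_{k0} + \alpha_k^{\top} x)}
    {1 + \sum_{\ell=1}^{K-1} \exp(\alpha_{\ell 0} + \alpha_{\ell}^{\top} x)},
\quad k = 1, \dots, K - 1,
\qquad
P(y = K \mid x) = \frac{1}{1 + \sum_{\ell=1}^{K-1} \exp(\alpha_{\ell 0} + \alpha_{\ell}^{\top} x)}
```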