FIX array_api support for non-integer n_components in PCA (#27431) · REDVM/scikit-learn@3363080 · GitHub
[go: up one dir, main page]

Skip to content

Commit 3363080

Browse files
ogrisel, REDVM
authored and committed
FIX array_api support for non-integer n_components in PCA (scikit-learn#27431)
1 parent 55ff260 commit 3363080

File tree

4 files changed

+58
-22
lines changed

4 files changed

+58
-22
lines changed

doc/whats_new/v1.4.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,7 @@ Changelog
276276
- |Enhancement| :class:`decomposition.PCA` now supports the Array API for the
277277
`full` and `randomized` solvers (with QR power iterations). See
278278
:ref:`array_api` for more details.
279-
:pr:`26315` and :pr:`27098` by :user:`Mateusz Sokół <mtsokol>`,
279+
:pr:`26315`, :pr:`27098` and :pr:`27431` by :user:`Mateusz Sokół <mtsokol>`,
280280
:user:`Olivier Grisel <ogrisel>` and :user:`Edoardo Abati <EdAbati>`.
281281

282282
- |Feature| :class:`decomposition.PCA` now supports :class:`scipy.sparse.sparray`

sklearn/decomposition/_pca.py

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from ..base import _fit_context
2323
from ..utils import check_random_state
2424
from ..utils._arpack import _init_arpack_v0
25-
from ..utils._array_api import get_namespace
25+
from ..utils._array_api import _convert_to_numpy, get_namespace
2626
from ..utils._param_validation import Interval, RealNotInt, StrOptions
2727
from ..utils.deprecation import deprecated
2828
from ..utils.extmath import fast_logdet, randomized_svd, stable_cumsum, svd_flip
@@ -60,6 +60,7 @@ def _assess_dimension(spectrum, rank, n_samples):
6060
Automatic Choice of Dimensionality for PCA. NIPS 2000: 598-604
6161
<https://proceedings.neurips.cc/paper/2000/file/7503cfacd12053d309b6bed5c89de212-Paper.pdf>`_
6262
"""
63+
xp, _ = get_namespace(spectrum)
6364

6465
n_features = spectrum.shape[0]
6566
if not 1 <= rank < n_features:
@@ -73,29 +74,29 @@ def _assess_dimension(spectrum, rank, n_samples):
7374
# small and won't be the max anyway. Also, it can lead to numerical
7475
# issues below when computing pa, in particular in log((spectrum[i] -
7576
# spectrum[j]) because this will take the log of something very small.
76-
return -np.inf
77+
return -xp.inf
7778

7879
pu = -rank * log(2.0)
7980
for i in range(1, rank + 1):
8081
pu += (
8182
gammaln((n_features - i + 1) / 2.0)
82-
- log(np.pi) * (n_features - i + 1) / 2.0
83+
- log(xp.pi) * (n_features - i + 1) / 2.0
8384
)
8485

85-
pl = np.sum(np.log(spectrum[:rank]))
86+
pl = xp.sum(xp.log(spectrum[:rank]))
8687
pl = -pl * n_samples / 2.0
8788

88-
v = max(eps, np.sum(spectrum[rank:]) / (n_features - rank))
89-
pv = -np.log(v) * n_samples * (n_features - rank) / 2.0
89+
v = max(eps, xp.sum(spectrum[rank:]) / (n_features - rank))
90+
pv = -log(v) * n_samples * (n_features - rank) / 2.0
9091

9192
m = n_features * rank - rank * (rank + 1.0) / 2.0
92-
pp = log(2.0 * np.pi) * (m + rank) / 2.0
93+
pp = log(2.0 * xp.pi) * (m + rank) / 2.0
9394

9495
pa = 0.0
95-
spectrum_ = spectrum.copy()
96+
spectrum_ = xp.asarray(spectrum, copy=True)
9697
spectrum_[rank:n_features] = v
9798
for i in range(rank):
98-
for j in range(i + 1, len(spectrum)):
99+
for j in range(i + 1, spectrum.shape[0]):
99100
pa += log(
100101
(spectrum[i] - spectrum[j]) * (1.0 / spectrum_[j] - 1.0 / spectrum_[i])
101102
) + log(n_samples)
@@ -116,7 +117,7 @@ def _infer_dimension(spectrum, n_samples):
116117
ll[0] = -xp.inf # we don't want to return n_components = 0
117118
for rank in range(1, spectrum.shape[0]):
118119
ll[rank] = _assess_dimension(spectrum, rank, n_samples)
119-
return ll.argmax()
120+
return xp.argmax(ll)
120121

121122

122123
class PCA(_BasePCA):
@@ -578,8 +579,24 @@ def _fit_full(self, X, n_components):
578579
# side='right' ensures that number of features selected
579580
# their variance is always greater than n_components float
580581
# passed. More discussion in issue: #15669
581-
ratio_cumsum = stable_cumsum(explained_variance_ratio_)
582-
n_components = xp.searchsorted(ratio_cumsum, n_components, side="right") + 1
582+
if is_array_api_compliant:
583+
# Convert to numpy as xp.cumsum and xp.searchsorted are not
584+
# part of the Array API standard yet:
585+
#
586+
# https://github.com/data-apis/array-api/issues/597
587+
# https://github.com/data-apis/array-api/issues/688
588+
#
589+
# Furthermore, it's not always safe to call them for namespaces
590+
# that already implement them: for instance as
591+
# cupy.searchsorted does not accept a float as second argument.
592+
explained_variance_ratio_np = _convert_to_numpy(
593+
explained_variance_ratio_, xp=xp
594+
)
595+
else:
596+
explained_variance_ratio_np = explained_variance_ratio_
597+
ratio_cumsum = stable_cumsum(explained_variance_ratio_np)
598+
n_components = np.searchsorted(ratio_cumsum, n_components, side="right") + 1
599+
583600
# Compute noise covariance using Probabilistic PCA model
584601
# The sigma2 maximum likelihood (cf. eq. 12.46)
585602
if n_components < min(n_features, n_samples):

sklearn/decomposition/tests/test_pca.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from sklearn.utils._testing import _array_api_for_tests, assert_allclose
2020
from sklearn.utils.estimator_checks import (
2121
_get_check_estimator_ids,
22+
check_array_api_input,
2223
check_array_api_input_and_values,
2324
)
2425
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS
@@ -859,7 +860,7 @@ def check_array_api_get_precision(name, estimator, array_namespace, device, dtyp
859860
"estimator",
860861
[
861862
PCA(n_components=2, svd_solver="full"),
862-
PCA(n_components=2, svd_solver="full", whiten=True),
863+
PCA(n_components=0.1, svd_solver="full", whiten=True),
863864
PCA(
864865
n_components=2,
865866
svd_solver="randomized",
@@ -874,6 +875,28 @@ def test_pca_array_api_compliance(estimator, check, array_namespace, device, dty
874875
check(name, estimator, array_namespace, device=device, dtype=dtype)
875876

876877

878+
@pytest.mark.parametrize(
879+
"array_namespace, device, dtype", yield_namespace_device_dtype_combinations()
880+
)
881+
@pytest.mark.parametrize(
882+
"check",
883+
[check_array_api_input, check_array_api_get_precision],
884+
ids=_get_check_estimator_ids,
885+
)
886+
@pytest.mark.parametrize(
887+
"estimator",
888+
[
889+
# PCA with mle cannot use check_array_api_input_and_values because of
890+
# rounding errors in the noisy (low variance) components.
891+
PCA(n_components="mle", svd_solver="full"),
892+
],
893+
ids=_get_check_estimator_ids,
894+
)
895+
def test_pca_mle_array_api_compliance(estimator, check, array_namespace, device, dtype):
896+
name = estimator.__class__.__name__
897+
check(name, estimator, array_namespace, device=device, dtype=dtype)
898+
899+
877900
def test_array_api_error_and_warnings_on_unsupported_params():
878901
pytest.importorskip("array_api_compat")
879902
xp = pytest.importorskip("numpy.array_api")

sklearn/utils/extmath.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1211,14 +1211,10 @@ def stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08):
12111211
out : ndarray
12121212
Array with the cumulative sums along the chosen axis.
12131213
"""
1214-
xp, _ = get_namespace(arr)
1215-
1216-
out = xp.cumsum(arr, axis=axis, dtype=np.float64)
1217-
expected = xp.sum(arr, axis=axis, dtype=np.float64)
1218-
if not xp.all(
1219-
xp.isclose(
1220-
out.take(-1, axis=axis), expected, rtol=rtol, atol=atol, equal_nan=True
1221-
)
1214+
out = np.cumsum(arr, axis=axis, dtype=np.float64)
1215+
expected = np.sum(arr, axis=axis, dtype=np.float64)
1216+
if not np.allclose(
1217+
out.take(-1, axis=axis), expected, rtol=rtol, atol=atol, equal_nan=True
12221218
):
12231219
warnings.warn(
12241220
(

0 commit comments

Comments (0)