2121 _get_check_estimator_ids ,
2222 check_array_api_input_and_values ,
2323)
24- from sklearn .utils .fixes import CSR_CONTAINERS
24+ from sklearn .utils .fixes import CSC_CONTAINERS , CSR_CONTAINERS
2525
# Iris dataset, loaded once at import time; the tests below read `iris.data`.
iris = datasets .load_iris ()
# `svd_solver` values that the solver-parametrized tests iterate over.
PCA_SOLVERS = ["full" , "arpack" , "randomized" , "auto" ]
2828
# Shape (rows, columns) of the random sparse matrices built by the sparse PCA
# tests. `SPARSE_M` and `SPARSE_N` could be larger, but be aware:
# * SciPy's generation of random sparse matrix can be costly
# * A (SPARSE_M, SPARSE_N) dense array is allocated to compare against
SPARSE_M = 1000  # arbitrary
SPARSE_N = 300  # arbitrary
SPARSE_MAX_COMPONENTS = min(SPARSE_M, SPARSE_N)
34+
35+
36+ def _check_fitted_pca_close (pca1 , pca2 , rtol ):
37+ assert_allclose (pca1 .components_ , pca2 .components_ , rtol = rtol )
38+ assert_allclose (pca1 .explained_variance_ , pca2 .explained_variance_ , rtol = rtol )
39+ assert_allclose (pca1 .singular_values_ , pca2 .singular_values_ , rtol = rtol )
40+ assert_allclose (pca1 .mean_ , pca2 .mean_ , rtol = rtol )
41+ assert_allclose (pca1 .n_components_ , pca2 .n_components_ , rtol = rtol )
42+ assert_allclose (pca1 .n_samples_ , pca2 .n_samples_ , rtol = rtol )
43+ assert_allclose (pca1 .noise_variance_ , pca2 .noise_variance_ , rtol = rtol )
44+ assert_allclose (pca1 .n_features_in_ , pca2 .n_features_in_ , rtol = rtol )
45+
2946
3047@pytest .mark .parametrize ("svd_solver" , PCA_SOLVERS )
3148@pytest .mark .parametrize ("n_components" , range (1 , iris .data .shape [1 ]))
@@ -49,6 +66,118 @@ def test_pca(svd_solver, n_components):
4966 assert_allclose (np .dot (cov , precision ), np .eye (X .shape [1 ]), atol = 1e-12 )
5067
5168
@pytest.mark.parametrize("density", [0.01, 0.1, 0.30])
@pytest.mark.parametrize("n_components", [1, 2, 10])
@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS)
@pytest.mark.parametrize("svd_solver", ["arpack"])
@pytest.mark.parametrize("scale", [1, 10, 100])
def test_pca_sparse(
    global_random_seed, svd_solver, sparse_container, n_components, density, scale
):
    """Check that PCA fitted on sparse input matches PCA fitted on dense input.

    The same random data is fitted once as a sparse container and once as a
    dense array; fitted attributes and transformed outputs must agree within
    the tolerances below.
    """
    # Make sure any tolerance changes pass with SKLEARN_TESTS_GLOBAL_RANDOM_SEED="all"
    rtol = 5e-07
    transform_rtol = 3e-05

    random_state = np.random.default_rng(global_random_seed)
    X = sparse_container(
        sp.sparse.random(
            SPARSE_M,
            SPARSE_N,
            random_state=random_state,
            density=density,
        )
    )
    # Scale the data + vary the column means
    scale_vector = random_state.random(X.shape[1]) * scale
    # Multiplying by a dense vector may hand back a different sparse format
    # (SciPy's compressed formats return COO here), so convert back to make
    # sure the parametrized container is what actually reaches `fit`.
    X = sparse_container(X.multiply(scale_vector))

    pca = PCA(
        n_components=n_components,
        svd_solver=svd_solver,
        random_state=global_random_seed,
    )
    pca.fit(X)

    Xd = X.toarray()
    pcad = PCA(
        n_components=n_components,
        svd_solver=svd_solver,
        random_state=global_random_seed,
    )
    pcad.fit(Xd)

    # Fitted attributes equality
    _check_fitted_pca_close(pca, pcad, rtol=rtol)

    # Test transform on fresh data drawn from the same generator
    X2 = sparse_container(
        sp.sparse.random(
            SPARSE_M,
            SPARSE_N,
            random_state=random_state,
            density=density,
        )
    )
    X2d = X2.toarray()

    # The sparse transform is the reference for both comparisons; compute it once.
    sparse_transformed = pca.transform(X2)
    assert_allclose(sparse_transformed, pca.transform(X2d), rtol=transform_rtol)
    assert_allclose(sparse_transformed, pcad.transform(X2d), rtol=transform_rtol)
125+
126+
@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS)
def test_pca_sparse_fit_transform(global_random_seed, sparse_container):
    """Check that `fit` and `fit_transform` agree on sparse input.

    Two identically configured PCA objects are trained on the same sparse
    matrix, one through `fit` and one through `fit_transform`; their fitted
    attributes and their transformed outputs must match.
    """
    rng = np.random.default_rng(global_random_seed)
    # Both matrices are drawn in sequence from the same generator.
    X, X2 = (
        sparse_container(
            sp.sparse.random(SPARSE_M, SPARSE_N, random_state=rng, density=0.01)
        )
        for _ in range(2)
    )

    pca_params = dict(
        n_components=10, svd_solver="arpack", random_state=global_random_seed
    )
    pca_fit = PCA(**pca_params)
    pca_fit_transform = PCA(**pca_params)

    pca_fit.fit(X)
    transformed_X = pca_fit_transform.fit_transform(X)

    _check_fitted_pca_close(pca_fit, pca_fit_transform, rtol=1e-10)

    transform_rtol = 2e-9
    assert_allclose(
        transformed_X, pca_fit_transform.transform(X), rtol=transform_rtol
    )
    assert_allclose(transformed_X, pca_fit.transform(X), rtol=transform_rtol)
    assert_allclose(
        pca_fit.transform(X2), pca_fit_transform.transform(X2), rtol=transform_rtol
    )
159+
160+
@pytest.mark.parametrize("svd_solver", ["randomized", "full", "auto"])
@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS)
def test_sparse_pca_solver_error(global_random_seed, svd_solver, sparse_container):
    """Check that fitting sparse input with a non-"arpack" solver raises TypeError."""
    import re

    # Same generator type and explicit density as the other sparse PCA tests.
    random_state = np.random.default_rng(global_random_seed)
    X = sparse_container(
        sp.sparse.random(
            SPARSE_M,
            SPARSE_N,
            random_state=random_state,
            density=0.01,
        )
    )
    pca = PCA(n_components=30, svd_solver=svd_solver)
    error_msg = (
        f'PCA only support sparse inputs with the "arpack" solver, while "{svd_solver}"'
        " was passed"
    )
    # `match` is interpreted as a regular expression: escape the text so that
    # it is compared literally.
    with pytest.raises(TypeError, match=re.escape(error_msg)):
        pca.fit(X)
179+
180+
52181def test_no_empty_slice_warning ():
53182 # test if we avoid numpy warnings for computing over empty arrays
54183 n_components = 10
@@ -502,18 +631,6 @@ def test_pca_svd_solver_auto(data, n_components, expected_solver):
502631 assert_allclose (pca_auto .components_ , pca_test .components_ )
503632
504633
505- @pytest .mark .parametrize ("svd_solver" , PCA_SOLVERS )
506- @pytest .mark .parametrize ("csr_container" , CSR_CONTAINERS )
507- def test_pca_sparse_input (svd_solver , csr_container ):
508- X = np .random .RandomState (0 ).rand (5 , 4 )
509- X = csr_container (X )
510- assert sp .sparse .issparse (X )
511-
512- pca = PCA (n_components = 3 , svd_solver = svd_solver )
513- with pytest .raises (TypeError ):
514- pca .fit (X )
515-
516-
517634@pytest .mark .parametrize ("svd_solver" , PCA_SOLVERS )
518635def test_pca_deterministic_output (svd_solver ):
519636 rng = np .random .RandomState (0 )
0 commit comments