21
21
_get_check_estimator_ids ,
22
22
check_array_api_input_and_values ,
23
23
)
24
- from sklearn .utils .fixes import CSR_CONTAINERS
24
+ from sklearn .utils .fixes import CSC_CONTAINERS , CSR_CONTAINERS
25
25
26
26
# Iris dataset; its feature count is used to parametrize `n_components` below.
iris = datasets.load_iris()
# Solver names used to parametrize tests over `svd_solver`.
PCA_SOLVERS = ["full", "arpack", "randomized", "auto"]

# Shape (rows, columns) of the random sparse matrices built by the sparse tests.
# The values are arbitrary and could be larger, but keep in mind that:
# * generating a random sparse matrix with SciPy can be costly
# * a dense (SPARSE_M, SPARSE_N) array is allocated to compare against
SPARSE_M = 1000
SPARSE_N = 300
# Smaller of the two dimensions of the sparse test matrices.
SPARSE_MAX_COMPONENTS = min(SPARSE_M, SPARSE_N)
34
+
35
+
36
+ def _check_fitted_pca_close (pca1 , pca2 , rtol ):
37
+ assert_allclose (pca1 .components_ , pca2 .components_ , rtol = rtol )
38
+ assert_allclose (pca1 .explained_variance_ , pca2 .explained_variance_ , rtol = rtol )
39
+ assert_allclose (pca1 .singular_values_ , pca2 .singular_values_ , rtol = rtol )
40
+ assert_allclose (pca1 .mean_ , pca2 .mean_ , rtol = rtol )
41
+ assert_allclose (pca1 .n_components_ , pca2 .n_components_ , rtol = rtol )
42
+ assert_allclose (pca1 .n_samples_ , pca2 .n_samples_ , rtol = rtol )
43
+ assert_allclose (pca1 .noise_variance_ , pca2 .noise_variance_ , rtol = rtol )
44
+ assert_allclose (pca1 .n_features_in_ , pca2 .n_features_in_ , rtol = rtol )
45
+
29
46
30
47
@pytest .mark .parametrize ("svd_solver" , PCA_SOLVERS )
31
48
@pytest .mark .parametrize ("n_components" , range (1 , iris .data .shape [1 ]))
@@ -49,6 +66,118 @@ def test_pca(svd_solver, n_components):
49
66
assert_allclose (np .dot (cov , precision ), np .eye (X .shape [1 ]), atol = 1e-12 )
50
67
51
68
69
@pytest.mark.parametrize("density", [0.01, 0.1, 0.30])
@pytest.mark.parametrize("n_components", [1, 2, 10])
@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS)
@pytest.mark.parametrize("svd_solver", ["arpack"])
@pytest.mark.parametrize("scale", [1, 10, 100])
def test_pca_sparse(
    global_random_seed, svd_solver, sparse_container, n_components, density, scale
):
    """Check PCA fitted on sparse data against PCA fitted on its dense copy."""
    # Make sure any tolerance changes pass with SKLEARN_TESTS_GLOBAL_RANDOM_SEED="all"
    rtol = 5e-07
    transform_rtol = 3e-05

    rng = np.random.default_rng(global_random_seed)

    def draw_sparse_matrix():
        # Every call consumes from `rng`, so successive draws differ.
        return sparse_container(
            sp.sparse.random(
                SPARSE_M,
                SPARSE_N,
                random_state=rng,
                density=density,
            )
        )

    pca_params = {
        "n_components": n_components,
        "svd_solver": svd_solver,
        "random_state": global_random_seed,
    }

    X_sparse = draw_sparse_matrix()
    # Scale the data + vary the column means
    column_scales = rng.random(X_sparse.shape[1]) * scale
    X_sparse = X_sparse.multiply(column_scales)

    # Fit one estimator on the sparse data ...
    pca_sparse = PCA(**pca_params)
    pca_sparse.fit(X_sparse)

    # ... and an identically configured one on the densified data.
    X_dense = X_sparse.toarray()
    pca_dense = PCA(**pca_params)
    pca_dense.fit(X_dense)

    # Fitted attributes equality
    _check_fitted_pca_close(pca_sparse, pca_dense, rtol=rtol)

    # Test transform on a second, independently drawn matrix
    X2_sparse = draw_sparse_matrix()
    X2_dense = X2_sparse.toarray()

    sparse_projection = pca_sparse.transform(X2_sparse)
    assert_allclose(
        sparse_projection, pca_sparse.transform(X2_dense), rtol=transform_rtol
    )
    assert_allclose(
        sparse_projection, pca_dense.transform(X2_dense), rtol=transform_rtol
    )
125
+
126
+
127
@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS)
def test_pca_sparse_fit_transform(global_random_seed, sparse_container):
    """Check that `fit` and `fit_transform` agree on sparse input."""
    rng = np.random.default_rng(global_random_seed)

    def draw_sparse_matrix():
        # Every call consumes from `rng`, so the two matrices below differ.
        return sparse_container(
            sp.sparse.random(
                SPARSE_M,
                SPARSE_N,
                random_state=rng,
                density=0.01,
            )
        )

    X_train = draw_sparse_matrix()
    X_other = draw_sparse_matrix()

    pca_params = {
        "n_components": 10,
        "svd_solver": "arpack",
        "random_state": global_random_seed,
    }
    pca_fit = PCA(**pca_params)
    pca_fit_transform = PCA(**pca_params)

    pca_fit.fit(X_train)
    projected_train = pca_fit_transform.fit_transform(X_train)

    # Both code paths must yield the same fitted state ...
    _check_fitted_pca_close(pca_fit, pca_fit_transform, rtol=1e-10)
    # ... and the same projections, on the training data and on other data.
    assert_allclose(projected_train, pca_fit_transform.transform(X_train), rtol=2e-9)
    assert_allclose(projected_train, pca_fit.transform(X_train), rtol=2e-9)
    assert_allclose(
        pca_fit.transform(X_other), pca_fit_transform.transform(X_other), rtol=2e-9
    )
159
+
160
+
161
@pytest.mark.parametrize("svd_solver", ["randomized", "full", "auto"])
@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS)
def test_sparse_pca_solver_error(global_random_seed, svd_solver, sparse_container):
    """Check that fitting sparse input with a non-"arpack" solver raises TypeError."""
    rng = np.random.RandomState(global_random_seed)
    sparse_data = sp.sparse.random(
        SPARSE_M,
        SPARSE_N,
        random_state=rng,
    )
    X_sparse = sparse_container(sparse_data)

    estimator = PCA(n_components=30, svd_solver=svd_solver)
    # `pytest.raises(match=...)` searches for this text in the raised message.
    expected_message = (
        f'PCA only support sparse inputs with the "arpack" solver, while "{svd_solver}"'
        " was passed"
    )
    with pytest.raises(TypeError, match=expected_message):
        estimator.fit(X_sparse)
179
+
180
+
52
181
def test_no_empty_slice_warning ():
53
182
# test if we avoid numpy warnings for computing over empty arrays
54
183
n_components = 10
@@ -502,18 +631,6 @@ def test_pca_svd_solver_auto(data, n_components, expected_solver):
502
631
assert_allclose (pca_auto .components_ , pca_test .components_ )
503
632
504
633
505
- @pytest .mark .parametrize ("svd_solver" , PCA_SOLVERS )
506
- @pytest .mark .parametrize ("csr_container" , CSR_CONTAINERS )
507
- def test_pca_sparse_input (svd_solver , csr_container ):
508
- X = np .random .RandomState (0 ).rand (5 , 4 )
509
- X = csr_container (X )
510
- assert sp .sparse .issparse (X )
511
-
512
- pca = PCA (n_components = 3 , svd_solver = svd_solver )
513
- with pytest .raises (TypeError ):
514
- pca .fit (X )
515
-
516
-
517
634
@pytest .mark .parametrize ("svd_solver" , PCA_SOLVERS )
518
635
def test_pca_deterministic_output (svd_solver ):
519
636
rng = np .random .RandomState (0 )
0 commit comments