BUG ensure that parallel/sequential give the same permutation importances · scikit-learn/scikit-learn@a68ba97 · GitHub


Commit a68ba97

glemaitre authored and ogrisel committed
BUG ensure that parallel/sequential give the same permutation importances (#15933)
1 parent 3a5aced commit a68ba97

File tree

3 files changed: +168 -32 lines changed
doc/whats_new/v0.22.rst

+13

@@ -15,12 +15,25 @@ This is a bug-fix release to primarily resolve some packaging issues in version
 Changelog
 ---------
 
+
 :mod:`sklearn.cluster`
 ......................
 
 - |Fix| :class:`KMeans` with ``algorithm="elkan"`` now uses the same stopping
   criterion as with the default ``algorithm="full"``. :pr:`15930` by
   :user:`inder128`.
+
+:mod:`sklearn.inspection`
+.........................
+
+- |Fix| :func:`inspection.permutation_importance` will return the same
+  `importances` when a `random_state` is given for both `n_jobs=1` and
+  `n_jobs>1`, both with shared memory backends (thread-safety) and with
+  isolated memory, process-based backends.
+  Also avoid casting the data as object dtype and avoid a read-only error
+  on large dataframes with `n_jobs>1` as reported in :issue:`15810`.
+  Follow-up of :pr:`15898` by :user:`Shivam Gargsya <shivamgargsya>`.
+  :pr:`15933` by :user:`Guillaume Lemaitre <glemaitre>` and `Olivier Grisel`_.
 
 :mod:`sklearn.metrics`
 ......................
sklearn/inspection/_permutation_importance.py

+29 -32

@@ -4,41 +4,36 @@
 from joblib import delayed
 
 from ..metrics import check_scoring
+from ..utils import Bunch
 from ..utils import check_random_state
 from ..utils import check_array
-from ..utils import Bunch
-
-
-def _safe_column_setting(X, col_idx, values):
-    """Set column on X using `col_idx`"""
-    if hasattr(X, "iloc"):
-        X.iloc[:, col_idx] = values
-    else:
-        X[:, col_idx] = values
-
-
-def _safe_column_indexing(X, col_idx):
-    """Return column from X using `col_idx`"""
-    if hasattr(X, "iloc"):
-        return X.iloc[:, col_idx].values
-    else:
-        return X[:, col_idx]
 
 
 def _calculate_permutation_scores(estimator, X, y, col_idx, random_state,
                                   n_repeats, scorer):
     """Calculate score when `col_idx` is permuted."""
-    original_feature = _safe_column_indexing(X, col_idx).copy()
-    temp = original_feature.copy()
+    random_state = check_random_state(random_state)
 
+    # Work on a copy of X to ensure thread-safety in case of threading-based
+    # parallelism. Furthermore, making a copy is also useful when the joblib
+    # backend is 'loky' (default) or the old 'multiprocessing': in those cases,
+    # if X is large it will automatically be backed by a read-only memory map
+    # (memmap). X.copy() on the other hand is always guaranteed to return a
+    # writable data structure whose columns can be shuffled inplace.
+    X_permuted = X.copy()
     scores = np.zeros(n_repeats)
+    shuffling_idx = np.arange(X.shape[0])
     for n_round in range(n_repeats):
-        random_state.shuffle(temp)
-        _safe_column_setting(X, col_idx, temp)
-        feature_score = scorer(estimator, X, y)
+        random_state.shuffle(shuffling_idx)
+        if hasattr(X_permuted, "iloc"):
+            col = X_permuted.iloc[shuffling_idx, col_idx]
+            col.index = X_permuted.index
+            X_permuted.iloc[:, col_idx] = col
+        else:
+            X_permuted[:, col_idx] = X_permuted[shuffling_idx, col_idx]
+        feature_score = scorer(estimator, X_permuted, y)
         scores[n_round] = feature_score
 
-    _safe_column_setting(X, col_idx, original_feature)
     return scores

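For reference, the shuffling logic introduced above permutes a precomputed index array and writes the reordered column back, with an explicit index re-assignment on the dataframe path so that pandas alignment does not undo the permutation. A small standalone sketch of the same idea (not part of the commit; the data and names are illustrative):

    import numpy as np
    import pandas as pd

    rng = np.random.RandomState(0)
    X_arr = np.arange(12, dtype=float).reshape(6, 2)
    X_df = pd.DataFrame(X_arr.copy(), index=list("abcdef"))
    col_idx = 1

    # Shuffle row positions once, then apply them to the chosen column only.
    shuffling_idx = np.arange(X_arr.shape[0])
    rng.shuffle(shuffling_idx)

    # ndarray path: fancy indexing on the rows of the selected column.
    X_arr[:, col_idx] = X_arr[shuffling_idx, col_idx]

    # DataFrame path: realign the shuffled column on the original index before
    # assigning it back, so index alignment does not cancel the permutation.
    col = X_df.iloc[shuffling_idx, col_idx]
    col.index = X_df.index
    X_df.iloc[:, col_idx] = col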
@@ -104,20 +99,22 @@ def permutation_importance(estimator, X, y, scoring=None, n_repeats=5,
     .. [BRE] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32,
              2001. https://doi.org/10.1023/A:1010933404324
     """
-    if hasattr(X, "iloc"):
-        X = X.copy()  # Dataframe
-    else:
-        X = check_array(X, force_all_finite='allow-nan', dtype=np.object,
-                        copy=True)
-
+    if not hasattr(X, "iloc"):
+        X = check_array(X, force_all_finite='allow-nan', dtype=None)
+
+    # Precompute random seed from the random state to be used
+    # to get a fresh independent RandomState instance for each
+    # parallel call to _calculate_permutation_scores, irrespective of
+    # the fact that variables are shared or not depending on the active
+    # joblib backend (sequential, thread-based or process-based).
     random_state = check_random_state(random_state)
-    scorer = check_scoring(estimator, scoring=scoring)
+    random_seed = random_state.randint(np.iinfo(np.int32).max + 1)
 
+    scorer = check_scoring(estimator, scoring=scoring)
     baseline_score = scorer(estimator, X, y)
-    scores = np.zeros((X.shape[1], n_repeats))
 
     scores = Parallel(n_jobs=n_jobs)(delayed(_calculate_permutation_scores)(
-        estimator, X, y, col_idx, random_state, n_repeats, scorer
+        estimator, X, y, col_idx, random_seed, n_repeats, scorer
     ) for col_idx in range(X.shape[1]))
 
     importances = baseline_score - np.array(scores)

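The seeding strategy above can be illustrated in isolation: drawing a single integer seed from the caller's `RandomState` and handing that seed to every worker lets each call rebuild an identical generator, so the outcome cannot depend on whether joblib shares the generator (threads, sequential) or pickles copies of it (processes). A minimal sketch, assuming only NumPy and joblib (the helper `draw` is hypothetical):

    import numpy as np
    from joblib import Parallel, delayed

    def draw(seed):
        # An integer seed lets each call build its own RandomState, mirroring
        # the check_random_state(random_state) call inside
        # _calculate_permutation_scores.
        return np.random.RandomState(seed).permutation(10)

    master_rng = np.random.RandomState(0)
    seed = master_rng.randint(np.iinfo(np.int32).max + 1)  # drawn once, up front

    # Every call receives the same integer seed, so the output does not depend
    # on the joblib backend (sequential, thread-based or process-based).
    sequential = [draw(seed) for _ in range(3)]
    threaded = Parallel(n_jobs=2, backend="threading")(
        delayed(draw)(seed) for _ in range(3))
    for a, b in zip(sequential, threaded):
        np.testing.assert_array_equal(a, b)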
sklearn/inspection/tests/test_permutation_importance.py

+126

@@ -6,17 +6,23 @@
 from sklearn.compose import ColumnTransformer
 from sklearn.datasets import load_boston
 from sklearn.datasets import load_iris
+from sklearn.datasets import make_classification
 from sklearn.datasets import make_regression
+from sklearn.dummy import DummyClassifier
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.linear_model import LinearRegression
 from sklearn.linear_model import LogisticRegression
 from sklearn.impute import SimpleImputer
 from sklearn.inspection import permutation_importance
 from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import KBinsDiscretizer
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.preprocessing import StandardScaler
 from sklearn.preprocessing import scale
+from sklearn.utils import parallel_backend
+from sklearn.utils._testing import _convert_container
+
 
 @pytest.mark.parametrize("n_jobs", [1, 2])
 def test_permutation_importance_correlated_feature_regression(n_jobs):
@@ -150,3 +156,123 @@ def test_permutation_importance_linear_regresssion():
                                     scoring='neg_mean_squared_error')
     assert_allclose(expected_importances, results.importances_mean,
                     rtol=1e-1, atol=1e-6)
+
+
+def test_permutation_importance_equivalence_sequential_parallel():
+    # regression test to make sure that sequential and parallel calls will
+    # output the same results.
+    X, y = make_regression(n_samples=500, n_features=10, random_state=0)
+    lr = LinearRegression().fit(X, y)
+
+    importance_sequential = permutation_importance(
+        lr, X, y, n_repeats=5, random_state=0, n_jobs=1
+    )
+
+    # First check that the problem is structured enough and that the model is
+    # complex enough to not yield trivial, constant importances:
+    imp_min = importance_sequential['importances'].min()
+    imp_max = importance_sequential['importances'].max()
+    assert imp_max - imp_min > 0.3
+
+    # Then actually check that parallelism does not impact the results,
+    # either with shared memory (threading) or with isolated memory
+    # via process-based parallelism, using the default backend
+    # ('loky' or 'multiprocessing') depending on the joblib version:
+
+    # process-based parallelism (by default):
+    importance_processes = permutation_importance(
+        lr, X, y, n_repeats=5, random_state=0, n_jobs=2)
+    assert_allclose(
+        importance_processes['importances'],
+        importance_sequential['importances']
+    )
+
+    # thread-based parallelism:
+    with parallel_backend("threading"):
+        importance_threading = permutation_importance(
+            lr, X, y, n_repeats=5, random_state=0, n_jobs=2
+        )
+    assert_allclose(
+        importance_threading['importances'],
+        importance_sequential['importances']
+    )
+
+
+@pytest.mark.parametrize("n_jobs", [None, 1, 2])
+def test_permutation_importance_equivalence_array_dataframe(n_jobs):
+    # This test checks that the column shuffling logic has the same behavior
+    # for both a dataframe and a simple numpy array.
+    pd = pytest.importorskip('pandas')
+
+    # regression test to make sure that sequential and parallel calls will
+    # output the same results.
+    X, y = make_regression(n_samples=100, n_features=5, random_state=0)
+    X_df = pd.DataFrame(X)
+
+    # Add a categorical feature that is statistically linked to y:
+    binner = KBinsDiscretizer(n_bins=3, encode="ordinal")
+    cat_column = binner.fit_transform(y.reshape(-1, 1))
+
+    # Concatenate the extra column to the numpy array: integers will be
+    # cast to float values
+    X = np.hstack([X, cat_column])
+    assert X.dtype.kind == "f"
+
+    # Insert extra column as a non-numpy-native dtype (while keeping backward
+    # compat for old pandas versions):
+    if hasattr(pd, "Categorical"):
+        cat_column = pd.Categorical(cat_column.ravel())
+    else:
+        cat_column = cat_column.ravel()
+    new_col_idx = len(X_df.columns)
+    X_df[new_col_idx] = cat_column
+    assert X_df[new_col_idx].dtype == cat_column.dtype
+
+    # Stitch an arbitrary index to the dataframe:
+    X_df.index = np.arange(len(X_df)).astype(str)
+
+    rf = RandomForestRegressor(n_estimators=5, max_depth=3, random_state=0)
+    rf.fit(X, y)
+
+    n_repeats = 3
+    importance_array = permutation_importance(
+        rf, X, y, n_repeats=n_repeats, random_state=0, n_jobs=n_jobs
+    )
+
+    # First check that the problem is structured enough and that the model is
+    # complex enough to not yield trivial, constant importances:
+    imp_min = importance_array['importances'].min()
+    imp_max = importance_array['importances'].max()
+    assert imp_max - imp_min > 0.3
+
+    # Now check that importances computed on the dataframe match the values
+    # of those computed on the array with the same data.
+    importance_dataframe = permutation_importance(
+        rf, X_df, y, n_repeats=n_repeats, random_state=0, n_jobs=n_jobs
+    )
+    assert_allclose(
+        importance_array['importances'],
+        importance_dataframe['importances']
+    )
+
+
+@pytest.mark.parametrize("input_type", ["array", "dataframe"])
+def test_permutation_importance_large_memmaped_data(input_type):
+    # Smoke, non-regression test for:
+    # https://github.com/scikit-learn/scikit-learn/issues/15810
+    n_samples, n_features = int(5e4), 4
+    X, y = make_classification(n_samples=n_samples, n_features=n_features,
+                               random_state=0)
+    assert X.nbytes > 1e6  # trigger joblib memmapping
+
+    X = _convert_container(X, input_type)
+    clf = DummyClassifier(strategy='prior').fit(X, y)
+
+    # Actual smoke test: should not raise any error:
+    n_repeats = 5
+    r = permutation_importance(clf, X, y, n_repeats=n_repeats, n_jobs=2)
+
+    # Auxiliary check: DummyClassifier is feature independent:
+    # permuting a feature should not change the predictions
+    expected_importances = np.zeros((n_features, n_repeats))
+    assert_allclose(expected_importances, r.importances)

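The memmap-related smoke test above relies on joblib's auto-memmapping: with the process-based backends, arrays larger than `max_nbytes` (default `'1M'` for `joblib.Parallel`) are passed to workers as read-only memory maps, so any in-place shuffling has to happen on a copy, which is exactly what the patched `_calculate_permutation_scores` does. A minimal sketch of that constraint (sizes and the helper name `shuffle_column` are illustrative, not part of the commit):

    import numpy as np
    from joblib import Parallel, delayed

    def shuffle_column(X, col_idx):
        # X arrives as a read-only memmap when it is large and the backend is
        # process-based; writing into it directly would raise, so shuffle a
        # writable copy instead.
        X_writable = X.copy()
        np.random.RandomState(0).shuffle(X_writable[:, col_idx])
        return X_writable[:5, col_idx]

    X = np.random.RandomState(0).rand(50_000, 4)  # ~1.6 MB, above the 1 MB default

    # With the default backend ('loky') and n_jobs > 1, joblib memmaps X for
    # the workers; each worker copies it before shuffling, so no error occurs.
    results = Parallel(n_jobs=2)(
        delayed(shuffle_column)(X, col_idx) for col_idx in range(X.shape[1]))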
0 commit comments