BF: fix issue #1127 about MinCovDet breaking with X.shape = (3, 1) · seckcoder/scikit-learn@d65fae7 · GitHub
[go: up one dir, main page]

Skip to content

Commit d65fae7

Browse files
VirgileFritsch authored
and GaelVaroquaux committed
BF: fix issue scikit-learn#1127 about MinCovDet breaking with X.shape = (3, 1)
Note that in such a case, there is no point to be robust so the standard location and variance are returned.
1 parent 890bb8e commit d65fae7

File tree

2 files changed

+35
-16
lines changed

2 files changed

+35
-16
lines changed

sklearn/covariance/robust_covariance.py

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -331,22 +331,32 @@ def fast_mcd(X, support_fraction=None,
331331
# (Rousseeuw, P. J. and Leroy, A. M. (2005) References, in Robust
332332
# Regression and Outlier Detection, John Wiley & Sons, chapter 4)
333333
if n_features == 1:
334-
# find the sample shortest halves
335-
X_sorted = np.sort(np.ravel(X))
336-
diff = X_sorted[n_support:] - X_sorted[:(n_samples - n_support)]
337-
halves_start = np.where(diff == np.min(diff))[0]
338-
# take the middle points' mean to get the robust location estimate
339-
location = 0.5 * (X_sorted[n_support + halves_start]
340-
+ X_sorted[halves_start]).mean()
341-
support = np.zeros(n_samples).astype(bool)
342-
X_centered = X - location
343-
support[np.argsort(np.abs(X - location), axis=0)[:n_support]] = True
344-
covariance = np.asarray([[np.var(X[support])]])
345-
location = np.array([location])
346-
# get precision matrix in an optimized way
347-
precision = pinvh(covariance)
348-
dist = (np.dot(X_centered, precision) \
349-
* (X_centered)).sum(axis=1)
334+
if n_support < n_samples:
335+
# find the sample shortest halves
336+
X_sorted = np.sort(np.ravel(X))
337+
diff = X_sorted[n_support:] - X_sorted[:(n_samples - n_support)]
338+
halves_start = np.where(diff == np.min(diff))[0]
339+
# take the middle points' mean to get the robust location estimate
340+
location = 0.5 * (X_sorted[n_support + halves_start]
341+
+ X_sorted[halves_start]).mean()
342+
support = np.zeros(n_samples, dtype=bool)
343+
X_centered = X - location
344+
support[np.argsort(np.abs(X - location), 0)[:n_support]] = True
345+
covariance = np.asarray([[np.var(X[support])]])
346+
location = np.array([location])
347+
# get precision matrix in an optimized way
348+
precision = pinvh(covariance)
349+
dist = (np.dot(X_centered, precision) \
350+
* (X_centered)).sum(axis=1)
351+
else:
352+
support = np.ones(n_samples, dtype=bool)
353+
covariance = np.asarray([[np.var(X)]])
354+
location = np.asarray([np.mean(X)])
355+
X_centered = X - location
356+
# get precision matrix in an optimized way
357+
precision = pinvh(covariance)
358+
dist = (np.dot(X_centered, precision) \
359+
* (X_centered)).sum(axis=1)
350360

351361
### Starting FastMCD algorithm for p-dimensional case
352362
if (n_samples > 500) and (n_features > 1):

sklearn/covariance/tests/test_robust_covariance.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,15 @@ def launch_mcd_on_dataset(
6868
assert_array_almost_equal(mcd_fit.mahalanobis(data), mcd_fit.dist_)
6969

7070

71+
def test_mcd_issue1127():
72+
# Check that the code does not break with X.shape = (3, 1)
73+
# (i.e. n_support = n_samples)
74+
rnd = np.random.RandomState(0)
75+
X = rnd.normal(size=(3, 1))
76+
mcd = MinCovDet()
77+
mcd.fit(X)
78+
79+
7180
def test_outlier_detection():
7281
rnd = np.random.RandomState(0)
7382
X = rnd.randn(100, 10)

0 commit comments

Comments (0)
0