diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index f2166ac91621c..fdbecc358be35 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1051,20 +1051,25 @@ def check_clustering(name, clusterer_orig): assert_in(pred.dtype, [np.dtype('int32'), np.dtype('int64')]) assert_in(pred2.dtype, [np.dtype('int32'), np.dtype('int64')]) + # Add noise to X to test the possible values of the labels + rng = np.random.RandomState(7) + X_noise = np.concatenate([X, rng.uniform(low=-3, high=3, size=(5, 2))]) + labels = clusterer.fit_predict(X_noise) + # There should be at least one sample in every cluster. Equivalently # labels_ should contain all the consecutive values between its # min and its max. - pred_sorted = np.unique(pred) - assert_array_equal(pred_sorted, np.arange(pred_sorted[0], - pred_sorted[-1] + 1)) + labels_sorted = np.unique(labels) + assert_array_equal(labels_sorted, np.arange(labels_sorted[0], + labels_sorted[-1] + 1)) - # labels_ should be greater than -1 - assert_greater_equal(pred_sorted[0], -1) - # labels_ should be less than n_clusters - 1 + # Labels are expected to start at 0 (no noise) or -1 (if noise) + assert_true(labels_sorted[0] in [0, -1]) + # Labels should be at most n_clusters - 1 if hasattr(clusterer, 'n_clusters'): n_clusters = getattr(clusterer, 'n_clusters') - assert_greater_equal(n_clusters - 1, pred_sorted[-1]) - # else labels_ should be less than max(labels_) which is necessarily true + assert_greater_equal(n_clusters - 1, labels_sorted[-1]) + # else labels should be at most max(labels_) which is necessarily true @ignore_warnings(category=DeprecationWarning)