From aa4b63c3c67110f3c8cd125db2589305b29f1fad Mon Sep 17 00:00:00 2001 From: Chris Date: Wed, 4 Jun 2025 15:48:10 -0500 Subject: [PATCH 1/4] [31462] DummyClassifier strategy that produces randomized probabilities --- .../sklearn.dummy/31462.feature.rst | 5 ++ sklearn/dummy.py | 35 +++++++++--- sklearn/tests/test_dummy.py | 55 +++++++++++++++++++ 3 files changed, 88 insertions(+), 7 deletions(-) create mode 100644 doc/whats_new/upcoming_changes/sklearn.dummy/31462.feature.rst diff --git a/doc/whats_new/upcoming_changes/sklearn.dummy/31462.feature.rst b/doc/whats_new/upcoming_changes/sklearn.dummy/31462.feature.rst new file mode 100644 index 0000000000000..ce9933d803a47 --- /dev/null +++ b/doc/whats_new/upcoming_changes/sklearn.dummy/31462.feature.rst @@ -0,0 +1,5 @@ +- :class:`dummy.DummyClassifier` now supports a new strategy "uniform-proba" that + generates random probability distributions for each sample using a Dirichlet + distribution with all concentration parameters set to 1. This results in uniformly + distributed probability vectors that sum to 1 for each sample. + By :user:`Chris Boseakc ` diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 7d44fa2e473bb..b4c4ec36ccc76 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -57,7 +57,7 @@ class DummyClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator): Parameters ---------- strategy : {"most_frequent", "prior", "stratified", "uniform", \ - "constant"}, default="prior" + "uniform-proba", "constant"}, default="prior" Strategy to use to generate predictions. * "most_frequent": the `predict` method always returns the most @@ -79,6 +79,10 @@ class prior probabilities. * "uniform": generates predictions uniformly at random from the list of unique classes observed in `y`, i.e. each class has equal probability. + * "uniform-proba": generates random probability distributions for each + sample using a Dirichlet distribution with all concentration parameters + set to 1. 
This results in uniformly distributed probability vectors + that sum to 1 for each sample. * "constant": always predicts a constant label that is provided by the user. This is useful for metrics that evaluate a non-majority class. @@ -89,7 +93,8 @@ class prior probabilities. random_state : int, RandomState instance or None, default=None Controls the randomness to generate the predictions when - ``strategy='stratified'`` or ``strategy='uniform'``. + ``strategy='stratified'``, ``strategy='uniform'``, or + ``strategy='uniform-proba'``. Pass an int for reproducible output across multiple function calls. See :term:`Glossary <random_state>`. @@ -147,7 +152,16 @@ class prior probabilities. _parameter_constraints: dict = { "strategy": [ - StrOptions({"most_frequent", "prior", "stratified", "uniform", "constant"}) + StrOptions( + { + "most_frequent", + "prior", + "stratified", + "uniform", + "uniform-proba", + "constant", + } + ) ], "random_state": ["random_state"], "constant": [Integral, str, "array-like", None], @@ -280,7 +294,7 @@ def predict(self, X): class_prior_ = [class_prior_] constant = [constant] # Compute probability only once - if self._strategy == "stratified": + if self._strategy in ("stratified", "uniform-proba"): proba = self.predict_proba(X) if self.n_outputs_ == 1: proba = [proba] @@ -293,10 +307,10 @@ def predict(self, X): elif self._strategy == "stratified": class_prob = class_prior_ - elif self._strategy == "uniform": + elif self._strategy in ("uniform", "uniform-proba"): raise ValueError( "Sparse target prediction is not " - "supported with the uniform strategy" + f"supported with the {self._strategy} strategy" ) elif self._strategy == "constant": @@ -313,7 +327,7 @@ def predict(self, X): [n_samples, 1], ) - elif self._strategy == "stratified": + elif self._strategy in ("stratified", "uniform-proba"): y = np.vstack( [ classes_[k][proba[k].argmax(axis=1)] @@ -387,6 +401,13 @@ def predict_proba(self, X): out = np.ones((n_samples, n_classes_[k]), dtype=np.float64) out 
/= n_classes_[k] + elif self._strategy == "uniform-proba": + # Generate random probability vectors from Dirichlet distribution + # with all concentration parameters set to 1 (uniform) + alpha = np.ones(n_classes_[k]) + out = rs.dirichlet(alpha, size=n_samples) + out = out.astype(np.float64) + elif self._strategy == "constant": ind = np.where(classes_[k] == constant[k]) out = np.zeros((n_samples, n_classes_[k]), dtype=np.float64) diff --git a/sklearn/tests/test_dummy.py b/sklearn/tests/test_dummy.py index 61f1803b7a24f..8a4a0d901c088 100644 --- a/sklearn/tests/test_dummy.py +++ b/sklearn/tests/test_dummy.py @@ -713,3 +713,58 @@ def test_dtype_of_classifier_probas(strategy): probas = model.fit(X, y).predict_proba(X) assert probas.dtype == np.float64 + + +def test_uniform_proba_strategy(global_random_seed): + X = [[0]] * 5 # ignored + y = [1, 2, 1, 1, 2] + clf = DummyClassifier(strategy="uniform-proba", random_state=global_random_seed) + clf.fit(X, y) + + X_test = [[0]] * 100 + y_pred_proba = clf.predict_proba(X_test) + + # Check that probabilities sum to 1 for each sample + assert_array_almost_equal(np.sum(y_pred_proba, axis=1), np.ones(len(X_test))) + + # Check that all probabilities are >= 0 + assert np.all(y_pred_proba >= 0) + + # Check shape + assert y_pred_proba.shape == (len(X_test), len(np.unique(y))) + + # Check that predict returns the class with highest probability + y_pred = clf.predict(X_test) + for i in range(len(X_test)): + assert y_pred[i] == clf.classes_[np.argmax(y_pred_proba[i])] + + _check_predict_proba(clf, X_test, y) + + +def test_uniform_proba_strategy_multioutput(global_random_seed): + X = [[0]] * 5 # ignored + y = np.array([[2, 1], [2, 2], [1, 1], [1, 2], [1, 1]]) + + clf = DummyClassifier(strategy="uniform-proba", random_state=global_random_seed) + clf.fit(X, y) + + X_test = [[0]] * 100 + y_pred = clf.predict(X_test) + y_pred_proba = clf.predict_proba(X_test) + + # For multioutput, predict_proba returns a list of arrays + assert 
isinstance(y_pred_proba, list) + assert len(y_pred_proba) == y.shape[1] + + for k in range(y.shape[1]): + # Check that probabilities sum to 1 for each sample + assert_array_almost_equal(np.sum(y_pred_proba[k], axis=1), np.ones(len(X_test))) + + # Check that all probabilities are >= 0 + assert np.all(y_pred_proba[k] >= 0) + + # Check shape + assert y_pred_proba[k].shape == (len(X_test), len(np.unique(y[:, k]))) + + _check_predict_proba(clf, X_test, y) + _check_behavior_2d(clf) From 8a277496a42335a84df64fe349a00bc1b86c25be Mon Sep 17 00:00:00 2001 From: Chris Date: Wed, 4 Jun 2025 15:48:10 -0500 Subject: [PATCH 2/4] [31462] DummyClassifier strategy that produces randomized probabilities --- .../upcoming_changes/sklearn.dummy/31488.feature.rst | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 doc/whats_new/upcoming_changes/sklearn.dummy/31488.feature.rst diff --git a/doc/whats_new/upcoming_changes/sklearn.dummy/31488.feature.rst b/doc/whats_new/upcoming_changes/sklearn.dummy/31488.feature.rst new file mode 100644 index 0000000000000..ce9933d803a47 --- /dev/null +++ b/doc/whats_new/upcoming_changes/sklearn.dummy/31488.feature.rst @@ -0,0 +1,5 @@ +- :class:`dummy.DummyClassifier` now supports a new strategy "uniform-proba" that + generates random probability distributions for each sample using a Dirichlet + distribution with all concentration parameters set to 1. This results in uniformly + distributed probability vectors that sum to 1 for each sample. 
+ By :user:`Chris Boseakc ` From 7167d18ccc3cf97349d6930ac69f48bb7d99c4dd Mon Sep 17 00:00:00 2001 From: Chris Date: Thu, 5 Jun 2025 09:10:35 -0500 Subject: [PATCH 3/4] changelog --- .../upcoming_changes/sklearn.dummy/31462.feature.rst | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 doc/whats_new/upcoming_changes/sklearn.dummy/31462.feature.rst diff --git a/doc/whats_new/upcoming_changes/sklearn.dummy/31462.feature.rst b/doc/whats_new/upcoming_changes/sklearn.dummy/31462.feature.rst deleted file mode 100644 index ce9933d803a47..0000000000000 --- a/doc/whats_new/upcoming_changes/sklearn.dummy/31462.feature.rst +++ /dev/null @@ -1,5 +0,0 @@ -- :class:`dummy.DummyClassifier` now supports a new strategy "uniform-proba" that - generates random probability distributions for each sample using a Dirichlet - distribution with all concentration parameters set to 1. This results in uniformly - distributed probability vectors that sum to 1 for each sample. - By :user:`Chris Boseakc ` From e39ae04e09ede8b2f03fa48a6b8e53643fbc8598 Mon Sep 17 00:00:00 2001 From: Christopher Boseak Date: Tue, 10 Jun 2025 16:12:38 -0500 Subject: [PATCH 4/4] Update sklearn/tests/test_dummy.py based on suggestion Co-authored-by: Tom McClintock --- sklearn/tests/test_dummy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/tests/test_dummy.py b/sklearn/tests/test_dummy.py index 8a4a0d901c088..b3c2801638133 100644 --- a/sklearn/tests/test_dummy.py +++ b/sklearn/tests/test_dummy.py @@ -715,7 +715,8 @@ def test_dtype_of_classifier_probas(strategy): assert probas.dtype == np.float64 -def test_uniform_proba_strategy(global_random_seed): +def test_uniform_proba_strategy(global_random_seed) -> None: + """Basic checks on uniform probability distributions in the dummy classifier.""" X = [[0]] * 5 # ignored y = [1, 2, 1, 1, 2] clf = DummyClassifier(strategy="uniform-proba", random_state=global_random_seed)