10000 FEA Add DummyClassifier strategy that produces randomized probabilities by cboseak · Pull Request #31488 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

FEA Add DummyClassifier strategy that produces randomized probabilities #31488

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
- :class:`dummy.DummyClassifier` now supports a new strategy "uniform-proba" that
generates random probability distributions for each sample using a Dirichlet
distribution with all concentration parameters set to 1. This results in uniformly
distributed probability vectors that sum to 1 for each sample.
By :user:`Chris Boseak <cboseak>`
35 changes: 28 additions & 7 deletions sklearn/dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ class DummyClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator):
Parameters
----------
strategy : {"most_frequent", "prior", "stratified", "uniform", \
"constant"}, default="prior"
"uniform-proba", "constant"}, default="prior"
Strategy to use to generate predictions.

* "most_frequent": the `predict` method always returns the most
Expand All @@ -79,6 +79,10 @@ class prior probabilities.
* "uniform": generates predictions uniformly at random from the list
of unique classes observed in `y`, i.e. each class has equal
probability.
* "uniform-proba": generates random probability distributions for each
sample using a Dirichlet distribution with all concentration parameters
set to 1. This results in uniformly distributed probability vectors
that sum to 1 for each sample.
* "constant": always predicts a constant label that is provided by
the user. This is useful for metrics that evaluate a non-majority
class.
Expand All @@ -89,7 +93,8 @@ class prior probabilities.

random_state : int, RandomState instance or None, default=None
Controls the randomness to generate the predictions when
``strategy='stratified'`` or ``strategy='uniform'``.
``strategy='stratified'``, ``strategy='uniform'``, or
``strategy='uniform-proba'``.
Pass an int for reproducible output across multiple function calls.
See :term:`Glossary <random_state>`.

Expand Down Expand Up @@ -147,7 +152,16 @@ class prior probabilities.

_parameter_constraints: dict = {
"strategy": [
StrOptions({"most_frequent", "prior", "stratified", "uniform", "constant"})
StrOptions(
{
"most_frequent",
"prior",
"stratified",
"uniform",
"uniform-proba",
"constant",
}
)
],
"random_state": ["random_state"],
"constant": [Integral, str, "array-like", None],
Expand Down Expand Up @@ -280,7 +294,7 @@ def predict(self, X):
class_prior_ = [class_prior_]
constant = [constant]
# Compute probability only once
if self._strategy == "stratified":
if self._strategy in ("stratified", "uniform-proba"):
proba = self.predict_proba(X)
if self.n_outputs_ == 1:
proba = [proba]
Expand All @@ -293,10 +307,10 @@ def predict(self, X):
elif self._strategy == "stratified":
class_prob = class_prior_

elif self._strategy == "uniform":
elif self._strategy in ("uniform", "uniform-proba"):
raise ValueError(
"Sparse target prediction is not "
"supported with the uniform strategy"
f"supported with the {self._strategy} strategy"
)

elif self._strategy == "constant":
Expand All @@ -313,7 +327,7 @@ def predict(self, X):
[n_samples, 1],
)

elif self._strategy == "stratified":
elif self._strategy in ("stratified", "uniform-proba"):
y = np.vstack(
[
classes_[k][proba[k].argmax(axis=1)]
Expand Down Expand Up @@ -387,6 +401,13 @@ def predict_proba(self, X):
out = np.ones((n_samples, n_classes_[k]), dtype=np.float64)
out /= n_classes_[k]

elif self._strategy == "uniform-proba":
# Generate random probability vectors from Dirichlet distribution
# with all concentration parameters set to 1 (uniform)
alpha = np.ones(n_classes_[k])
out = rs.dirichlet(alpha, size=n_samples)
out = out.astype(np.float64)

elif self._strategy == "constant":
ind = np.where(classes_[k] == constant[k])
out = np.zeros((n_samples, n_classes_[k]), dtype=np.float64)
Expand Down
56 changes: 56 additions & 0 deletions sklearn/tests/test_dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -713,3 +713,59 @@ def test_dtype_of_classifier_probas(strategy):
probas = model.fit(X, y).predict_proba(X)

assert probas.dtype == np.float64


def test_uniform_proba_strategy(global_random_seed) -> None:
    """Basic checks on uniform probability distributions in the dummy classifier."""
    X = [[0]] * 5  # features are ignored by the dummy estimator
    y = [1, 2, 1, 1, 2]
    clf = DummyClassifier(strategy="uniform-proba", random_state=global_random_seed)
    clf.fit(X, y)

    X_test = [[0]] * 100
    y_pred_proba = clf.predict_proba(X_test)
    n_samples, n_classes = len(X_test), len(np.unique(y))

    # Each row must be a valid probability distribution: non-negative,
    # summing to one, with one column per class seen during fit.
    assert_array_almost_equal(y_pred_proba.sum(axis=1), np.ones(n_samples))
    assert (y_pred_proba >= 0).all()
    assert y_pred_proba.shape == (n_samples, n_classes)

    # predict must agree with the argmax of predict_proba.
    y_pred = clf.predict(X_test)
    for row, label in zip(y_pred_proba, y_pred):
        assert label == clf.classes_[row.argmax()]

    _check_predict_proba(clf, X_test, y)


def test_uniform_proba_strategy_multioutput(global_random_seed):
    """Check the "uniform-proba" strategy with 2D (multioutput) targets.

    For multioutput `y`, `predict_proba` must return one probability array
    per output column, each row being a valid probability distribution.
    """
    X = [[0]] * 5  # features are ignored by the dummy estimator
    y = np.array([[2, 1], [2, 2], [1, 1], [1, 2], [1, 1]])

    clf = DummyClassifier(strategy="uniform-proba", random_state=global_random_seed)
    clf.fit(X, y)

    X_test = [[0]] * 100
    y_pred_proba = clf.predict_proba(X_test)

    # For multioutput, predict_proba returns a list with one array per output
    assert isinstance(y_pred_proba, list)
    assert len(y_pred_proba) == y.shape[1]

    for k, proba in enumerate(y_pred_proba):
        # Check that probabilities sum to 1 for each sample
        assert_array_almost_equal(np.sum(proba, axis=1), np.ones(len(X_test)))

        # Check that all probabilities are >= 0
        assert np.all(proba >= 0)

        # Check shape: one column per class observed in output k
        assert proba.shape == (len(X_test), len(np.unique(y[:, k])))

    _check_predict_proba(clf, X_test, y)
    _check_behavior_2d(clf)
0