10000 FEA Add DummyClassifier strategy that produces randomized probabilities by cboseak · Pull Request #31488 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

FEA Add DummyClassifier strategy that produces randomized probabilities #31488

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
- :class:`dummy.DummyClassifier` now supports a new strategy "uniform-proba" that
generates random probability distributions for each sample using a Dirichlet
distribution with all concentration parameters set to 1. This results in uniformly
distributed probability vectors that sum to 1 for each sample.
By :user:`Chris Boseak <cboseak>`
35 changes: 28 additions & 7 deletions sklearn/dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ class DummyClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator):
Parameters
----------
strategy : {"most_frequent", "prior", "stratified", "uniform", \
"constant"}, default="prior"
"uniform-proba", "constant"}, default="prior"
Strategy to use to generate predictions.

* "most_frequent": the `predict` method always returns the most
Expand All @@ -79,6 +79,10 @@ class prior probabilities.
* "uniform": generates predictions uniformly at random from the list
of unique classes observed in `y`, i.e. each class has equal
probability.
* "uniform-proba": generates random probability distributions for each
sample using a Dirichlet distribution with all concentration parameters
set to 1. This results in uniformly distributed probability vectors
that sum to 1 for each sample.
* "constant": always predicts a constant label that is provided by
the user. This is useful for metrics that evaluate a non-majority
class.
Expand All @@ -89,7 +93,8 @@ class prior probabilities.

random_state : int, RandomState instance or None, default=None
Controls the randomness to generate the predictions when
``strategy='stratified'`` or ``strategy='uniform'``.
``strategy='stratified'``, ``strategy='uniform'``, or
``strategy='uniform-proba'``.
Pass an int for reproducible output across multiple function calls.
See :term:`Glossary <random_state>`.

Expand Down Expand Up @@ -147,7 +152,16 @@ class prior probabilities.

_parameter_constraints: dict = {
"strategy": [
StrOptions({"most_frequent", "prior", "stratified", "uniform", "constant"})
StrOptions(
{
"most_frequent",
"prior",
"stratified",
"uniform",
"uniform-proba",
"constant",
}
)
],
"random_state": ["random_state"],
"constant": [Integral, str, "array-like", None],
Expand Down Expand Up @@ -280,7 +294,7 @@ def predict(self, X):
class_prior_ = [class_prior_]
constant = [constant]
# Compute probability only once
if self._strategy == "stratified":
if self._strategy in ("stratified", "uniform-proba"):
proba = self.predict_proba(X)
if self.n_outputs_ == 1:
proba = [proba]
Expand All @@ -293,10 +307,10 @@ def predict(self, X):
elif self._strategy == "stratified":
class_prob = class_prior_

elif self._strategy == "uniform":
elif self._strategy in ("uniform", "uniform-proba"):
raise ValueError(
"Sparse target prediction is not "
"supported with the uniform strategy"
f"supported with the {self._strategy} strategy"
)

elif self._strategy == "constant":
Expand All @@ -313,7 +327,7 @@ def predict(self, X):
[n_samples, 1],
)

elif self._strategy == "stratified":
elif self._strategy in ("stratified", "uniform-proba"):
y = np.vstack(
[
classes_[k][proba[k].argmax(axis=1)]
Expand Down Expand Up @@ -387,6 +401,13 @@ def predict_proba(self, X):
out = np.ones((n_samples, n_classes_[k]), dtype=np.float64)
out /= n_classes_[k]

elif self._strategy == "uniform-proba":
# Generate random probability vectors from Dirichlet distribution
# with all concentration parameters set to 1 (uniform)
alpha = np.ones(n_classes_[k])
out = rs.dirichlet(alpha, size=n_samples)
out = out.astype(np.float64)

elif self._strategy == "constant":
ind = np.where(classes_[k] == constant[k])
out = np.zeros((n_samples, n_classes_[k]), dtype=np.float64)
Expand Down
56 changes: 56 additions & 0 deletions sklearn/tests/test_dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -713,3 +713,59 @@ def test_dtype_of_classifier_probas(strategy):
probas = model.fit(X, y).predict_proba(X)

assert probas.dtype == np.float64


def test_uniform_proba_strategy(global_random_seed) -> None:
    """Basic checks on uniform probability distributions in the dummy classifier."""
    X = [[0]] * 5  # features are ignored by the dummy estimator
    y = [1, 2, 1, 1, 2]
    clf = DummyClassifier(strategy="uniform-proba", random_state=global_random_seed)
    clf.fit(X, y)

    X_test = [[0]] * 100
    y_pred_proba = clf.predict_proba(X_test)
    n_samples, n_classes = len(X_test), len(np.unique(y))

    # Each row must be a valid probability distribution: non-negative,
    # summing to one, with one column per class seen during fit.
    assert_array_almost_equal(y_pred_proba.sum(axis=1), np.ones(n_samples))
    assert (y_pred_proba >= 0).all()
    assert y_pred_proba.shape == (n_samples, n_classes)

    # predict must agree with the argmax of predict_proba.
    y_pred = clf.predict(X_test)
    for row, label in zip(y_pred_proba, y_pred):
        assert label == clf.classes_[row.argmax()]

    _check_predict_proba(clf, X_test, y)


def test_uniform_proba_strategy_multioutput(global_random_seed):
    """Check the "uniform-proba" strategy with 2D (multioutput) targets.

    For multioutput `y`, `predict_proba` must return one probability array
    per output column, each row being a valid probability distribution.
    """
    X = [[0]] * 5  # features are ignored by the dummy estimator
    y = np.array([[2, 1], [2, 2], [1, 1], [1, 2], [1, 1]])

    clf = DummyClassifier(strategy="uniform-proba", random_state=global_random_seed)
    clf.fit(X, y)

    X_test = [[0]] * 100
    y_pred_proba = clf.predict_proba(X_test)

    # For multioutput, predict_proba returns a list with one array per output
    assert isinstance(y_pred_proba, list)
    assert len(y_pred_proba) == y.shape[1]

    for k, proba in enumerate(y_pred_proba):
        # Check that probabilities sum to 1 for each sample
        assert_array_almost_equal(np.sum(proba, axis=1), np.ones(len(X_test)))

        # Check that all probabilities are >= 0
        assert np.all(proba >= 0)

        # Check shape: one column per class observed in output k
        assert proba.shape == (len(X_test), len(np.unique(y[:, k])))

    _check_predict_proba(clf, X_test, y)
    _check_behavior_2d(clf)
0