DOC improve the cost-sensitive learning example (#29149) · scikit-learn/scikit-learn@5d92c35 · GitHub
Commit 5d92c35

DOC improve the cost-sensitive learning example (#29149)
Co-authored-by: Christian Lorentzen <lorentzen.ch@gmail.com>
1 parent 7398114 commit 5d92c35

1 file changed

examples/model_selection/plot_cost_sensitive_learning.py

Lines changed: 64 additions & 81 deletions
@@ -172,7 +172,7 @@ def credit_gain_score(y, y_pred, neg_label, pos_label):
     return np.sum(cm * gain_matrix)
 
 
-scoring["cost_gain"] = make_scorer(
+scoring["credit_gain"] = make_scorer(
     credit_gain_score, neg_label=neg_label, pos_label=pos_label
 )
 # %%
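The renaming only changes the key under which the scorer is stored; the mechanics stay the same. A minimal standalone sketch of how `make_scorer` forwards extra keyword arguments to a custom gain function (the data, gain matrix and labels below are hypothetical, not the example's):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, make_scorer
from sklearn.model_selection import train_test_split


def credit_gain_score(y, y_pred, neg_label, pos_label):
    cm = confusion_matrix(y, y_pred, labels=[neg_label, pos_label])
    # Hypothetical gain matrix: rows are true labels, columns are predictions.
    gain_matrix = np.array([[0, -1], [-5, 0]])
    return np.sum(cm * gain_matrix)


X, y = make_classification(n_samples=1_000, weights=[0.9, 0.1], random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
model = LogisticRegression().fit(X_train, y_train)

# Extra keyword arguments given to make_scorer are passed through to the metric.
credit_gain = make_scorer(credit_gain_score, neg_label=0, pos_label=1)
print(f"Gain on the test set: {credit_gain(model, X_test, y_test)}")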
@@ -247,7 +247,7 @@ def credit_gain_score(y, y_pred, neg_label, pos_label):
 # However, we recall that the original aim was to minimize the cost (or maximize the
 # gain) as defined by the business metric. We can compute the value of the business
 # metric:
-print(f"Business defined metric: {scoring['cost_gain'](model, X_test, y_test)}")
+print(f"Business defined metric: {scoring['credit_gain'](model, X_test, y_test)}")
 
 # %%
 # At this stage we don't know if any other cut-off can lead to a greater gain. To find
@@ -272,7 +272,7 @@ def credit_gain_score(y, y_pred, neg_label, pos_label):
 
 tuned_model = TunedThresholdClassifierCV(
     estimator=model,
-    scoring=scoring["cost_gain"],
+    scoring=scoring["credit_gain"],
     store_cv_results=True,  # necessary to inspect all results
 )
 tuned_model.fit(X_train, y_train)
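For context, `TunedThresholdClassifierCV` searches, by cross-validation, for the probability cut-off that maximizes the given scorer. A short sketch reusing the hypothetical data and `credit_gain` scorer from the previous sketch:

from sklearn.model_selection import TunedThresholdClassifierCV

tuned_model = TunedThresholdClassifierCV(
    estimator=LogisticRegression(),
    scoring=credit_gain,
    store_cv_results=True,  # keep the scores of every candidate threshold
).fit(X_train, y_train)

print(f"Best threshold found: {tuned_model.best_threshold_:.3f}")
print(f"Gain on the test set: {credit_gain(tuned_model, X_test, y_test)}")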
@@ -379,7 +379,7 @@ def plot_roc_pr_curves(vanilla_model, tuned_model, *, title):
 #
 # We can now check if choosing this cut-off point leads to a better score on the testing
 # set:
-print(f"Business defined metric: {scoring['cost_gain'](tuned_model, X_test, y_test)}")
+print(f"Business defined metric: {scoring['credit_gain'](tuned_model, X_test, y_test)}")
 
 # %%
 # We observe that tuning the decision threshold almost improves our business gains
@@ -487,7 +487,7 @@ def plot_roc_pr_curves(vanilla_model, tuned_model, *, title):
 fraud = target == 1
 amount_fraud = data["Amount"][fraud]
 _, ax = plt.subplots()
-ax.hist(amount_fraud, bins=100)
+ax.hist(amount_fraud, bins=30)
 ax.set_title("Amount of fraud transaction")
 _ = ax.set_xlabel("Amount (€)")
 
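The only change here is the number of histogram bins. A tiny standalone sketch of the same kind of plot on hypothetical amounts:

import matplotlib.pyplot as plt
import numpy as np

rng = np.random.RandomState(0)
amount_fraud = rng.exponential(scale=100, size=500)  # hypothetical amounts in €

_, ax = plt.subplots()
ax.hist(amount_fraud, bins=30)
ax.set_title("Amount of fraud transaction")
_ = ax.set_xlabel("Amount (€)")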
@@ -500,20 +500,18 @@ def plot_roc_pr_curves(vanilla_model, tuned_model, *, title):
 # a gain of 2% of the amount of the transaction. However, accepting a fraudulent
 # transaction result in a loss of the amount of the transaction. As stated in [2]_, the
 # gain and loss related to refusals (of fraudulent and legitimate transactions) are not
-# trivial to define. Here, we define that a refusal of a legitimate transaction is
-# estimated to a loss of 5€ while the refusal of a fraudulent transaction is estimated
-# to a gain of 50€ and the amount of the transaction. Therefore, we define the
-# following function to compute the total benefit of a given decision:
+# trivial to define. Here, we define that a refusal of a legitimate transaction
+# is estimated to a loss of 5€ while the refusal of a fraudulent transaction is
+# estimated to a gain of 50€. Therefore, we define the following function to
+# compute the total benefit of a given decision:
 
 
 def business_metric(y_true, y_pred, amount):
     mask_true_positive = (y_true == 1) & (y_pred == 1)
     mask_true_negative = (y_true == 0) & (y_pred == 0)
     mask_false_positive = (y_true == 0) & (y_pred == 1)
     mask_false_negative = (y_true == 1) & (y_pred == 0)
-    fraudulent_refuse = (mask_true_positive.sum() * 50) + amount[
-        mask_true_positive
-    ].sum()
+    fraudulent_refuse = mask_true_positive.sum() * 50
     fraudulent_accept = -amount[mask_false_negative].sum()
     legitimate_refuse = mask_false_positive.sum() * -5
     legitimate_accept = (amount[mask_true_negative] * 0.02).sum()
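A quick sanity check of the revised `business_metric` on toy arrays (hypothetical values; this assumes the function, whose remaining lines are outside this hunk, returns the sum of the four terms):

import numpy as np

y_true = np.array([1, 1, 0, 0, 0])
y_pred = np.array([1, 0, 1, 0, 0])
amount = np.array([100.0, 80.0, 30.0, 20.0, 10.0])

# refused fraud: +50, accepted fraud: -80, refused legitimate: -5,
# accepted legitimate: 2% of (20 + 10) = +0.6  =>  total -34.4
print(business_metric(y_true, y_pred, amount))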
@@ -540,7 +538,6 @@ def business_metric(y_true, y_pred, amount):
 amount = credit_card.frame["Amount"].to_numpy()
 
 # %%
-# We first start to train a dummy classifier to have some baseline results.
 from sklearn.model_selection import train_test_split
 
 data_train, data_test, target_train, target_test, amount_train, amount_test = (
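The split itself continues outside this hunk; `train_test_split` accepts any number of aligned arrays, so the amounts can be split together with the features and the target. A plausible sketch (the `stratify`, `test_size` and `random_state` choices are illustrative, not necessarily the example's):

from sklearn.model_selection import train_test_split

data_train, data_test, target_train, target_test, amount_train, amount_test = (
    train_test_split(
        data, target, amount, stratify=target, test_size=0.5, random_state=42
    )
)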
@@ -550,50 +547,44 @@ def business_metric(y_true, y_pred, amount):
 )
 
 # %%
+# We first evaluate some baseline policies to serve as reference. Recall that
+# class "0" is the legitimate class and class "1" is the fraudulent class.
 from sklearn.dummy import DummyClassifier
 
-easy_going_classifier = DummyClassifier(strategy="constant", constant=0)
-easy_going_classifier.fit(data_train, target_train)
-benefit_cost = business_scorer(
-    easy_going_classifier, data_test, target_test, amount=amount_test
+always_accept_policy = DummyClassifier(strategy="constant", constant=0)
+always_accept_policy.fit(data_train, target_train)
+benefit = business_scorer(
+    always_accept_policy, data_test, target_test, amount=amount_test
 )
-print(f"Benefit/cost of our easy-going classifier: {benefit_cost:,.2f}€")
+print(f"Benefit of the 'always accept' policy: {benefit:,.2f}€")
 
 # %%
-# A classifier that predict all transactions as legitimate would create a profit of
-# around 220,000.€ We make the same evaluation for a classifier that predicts all
+# A policy that considers all transactions as legitimate would create a profit of
+# around 220,000€. We make the same evaluation for a classifier that predicts all
 # transactions as fraudulent.
-intolerant_classifier = DummyClassifier(strategy="constant", constant=1)
-intolerant_classifier.fit(data_train, target_train)
-benefit_cost = business_scorer(
-    intolerant_classifier, data_test, target_test, amount=amount_test
+always_reject_policy = DummyClassifier(strategy="constant", constant=1)
+always_reject_policy.fit(data_train, target_train)
+benefit = business_scorer(
+    always_reject_policy, data_test, target_test, amount=amount_test
 )
-print(f"Benefit/cost of our intolerant classifier: {benefit_cost:,.2f}€")
+print(f"Benefit of the 'always reject' policy: {benefit:,.2f}€")
 
-# %%
-# Such a classifier create a loss of around 670,000.€ A predictive model should allow
-# us to make a profit larger than 220,000.€ It is interesting to compare this business
-# metric with another "standard" statistical metric such as the balanced accuracy.
-from sklearn.metrics import get_scorer
-
-balanced_accuracy_scorer = get_scorer("balanced_accuracy")
-print(
-    "Balanced accuracy of our easy-going classifier: "
-    f"{balanced_accuracy_scorer(easy_going_classifier, data_test, target_test):.3f}"
-)
-print(
-    "Balanced accuracy of our intolerant classifier: "
-    f"{balanced_accuracy_scorer(intolerant_classifier, data_test, target_test):.3f}"
-)
 
 # %%
-# This is not a surprise that the balanced accuracy is at 0.5 for both classifiers.
-# However, we need to be careful in the rest of the evaluation: we potentially can
-# obtain a model with a decent balanced accuracy that does not make any profit.
-# In this case, the model would be harmful for our business.
+# Such a policy would entail a catastrophic loss: around 670,000€. This is
+# expected since the vast majority of the transactions are legitimate and the
+# policy would refuse them at a non-trivial cost.
+#
+# A predictive model that adapts the accept/reject decisions on a per
+# transaction basis should ideally allow us to make a profit larger than the
+# 220,000€ of the best of our constant baseline policies.
 #
-# Let's now create a predictive model using a logistic regression without tuning the
-# decision threshold.
+# We start with a logistic regression model with the default decision threshold
+# at 0.5. Here we tune the hyperparameter `C` of the logistic regression with a
+# proper scoring rule (the log loss) to ensure that the model's probabilistic
+# predictions returned by its `predict_proba` method are as accurate as
+# possible, irrespectively of the choice of the value of the decision
+# threshold.
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import GridSearchCV
 from sklearn.pipeline import make_pipeline
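The `business_scorer` used above takes an extra `amount` argument at scoring time. Its definition sits outside this hunk; a plausible construction, assuming scikit-learn's metadata routing is enabled, looks like:

import sklearn
from sklearn.metrics import make_scorer

sklearn.set_config(enable_metadata_routing=True)
# Wrap the business metric and declare that it requests `amount` when scoring.
business_scorer = make_scorer(business_metric).set_score_request(amount=True)

With such a scorer, a call like `business_scorer(always_accept_policy, data_test, target_test, amount=amount_test)` forwards the transaction amounts to `business_metric`.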
@@ -604,21 +595,19 @@ def business_metric(y_true, y_pred, amount):
 model = GridSearchCV(logistic_regression, param_grid, scoring="neg_log_loss").fit(
     data_train, target_train
 )
+model
 
+# %%
 print(
-    "Benefit/cost of our logistic regression: "
+    "Benefit of logistic regression with default threshold: "
     f"{business_scorer(model, data_test, target_test, amount=amount_test):,.2f}€"
 )
-print(
-    "Balanced accuracy of our logistic regression: "
-    f"{balanced_accuracy_scorer(model, data_test, target_test):.3f}"
-)
 
 # %%
-# By observing the balanced accuracy, we see that our predictive model is learning
-# some associations between the features and the target. The business metric also shows
-# that our model is beating the baseline in terms of profit and it would be already
-# beneficial to use it instead of ignoring the fraud detection problem.
+# The business metric shows that our predictive model with a default decision
+# threshold is already winning over the baseline in terms of profit and it would be
+# already beneficial to use it to accept or reject transactions instead of
+# accepting all transactions.
 #
 # Tuning the decision threshold
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
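The `logistic_regression` pipeline and `param_grid` used in the grid search are defined just before this hunk and are unchanged by the commit; a plausible sketch of that part (the preprocessing and grid values are illustrative):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

logistic_regression = make_pipeline(StandardScaler(), LogisticRegression())
param_grid = {"logisticregression__C": np.logspace(-6, 6, 13)}

# Tuning `C` with the (negated) log loss targets the quality of the predicted
# probabilities rather than of the hard predictions at an arbitrary threshold.
model = GridSearchCV(logistic_regression, param_grid, scoring="neg_log_loss").fit(
    data_train, target_train
)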
@@ -642,30 +631,21 @@ def business_metric(y_true, y_pred, amount):
 # automatically dispatching this metadata to the underlying scorer.
 tuned_model.fit(data_train, target_train, amount=amount_train)
 
+# %%
+# We observe that the tuned decision threshold is far away from the default 0.5:
+print(f"Tuned decision threshold: {tuned_model.best_threshold_:.2f}")
+
 # %%
 print(
-    "Benefit/cost of our logistic regression: "
+    "Benefit of logistic regression with a tuned threshold: "
     f"{business_scorer(tuned_model, data_test, target_test, amount=amount_test):,.2f}€"
 )
-print(
-    "Balanced accuracy of our logistic regression: "
-    f"{balanced_accuracy_scorer(tuned_model, data_test, target_test):.3f}"
-)
 
 # %%
-# We observe that tuning the decision threshold increases the expected profit of
-# deploying our model as estimated by the business metric.
-# Eventually, the balanced accuracy also increased. Note that it might not always be
-# the case because the statistical metric is not necessarily a surrogate of the
-# business metric. It is therefore important, whenever possible, optimize the decision
-# threshold with respect to the business metric.
-#
-# Finally, the estimate of the business metric itself can be unreliable, in
-# particular when the number of data points in the minority class is so small.
-# Any business impact estimated by cross-validation of a business metric on
-# historical data (offline evaluation) should ideally be confirmed by A/B testing
-# on live data (online evaluation). Note however that A/B testing models is
-# beyond the scope of the scikit-learn library itself.
+# We observe that tuning the decision threshold increases the expected profit
+# when deploying our model - as indicated by the business metric. It is therefore
+# valuable, whenever possible, to optimize the decision threshold with respect
+# to the business metric.
 #
 # Manually setting the decision threshold instead of tuning it
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
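The `tuned_model` fitted above is constructed just before this hunk; a plausible sketch, given that `amount` is routed to the business scorer during the internal cross-validation, is:

from sklearn.model_selection import TunedThresholdClassifierCV

tuned_model = TunedThresholdClassifierCV(
    estimator=model,  # the grid-searched logistic regression pipeline
    scoring=business_scorer,  # requests `amount`, see the earlier sketch
    thresholds=100,  # number of candidate cut-offs; illustrative value
    n_jobs=2,
)
# `amount` is dispatched to the scorer thanks to metadata routing.
tuned_model.fit(data_train, target_train, amount=amount_train)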
@@ -684,19 +664,22 @@ def business_metric(y_true, y_pred, amount):
 from sklearn.model_selection import FixedThresholdClassifier
 
 model_fixed_threshold = FixedThresholdClassifier(
-    estimator=model, threshold=tuned_model.best_threshold_
+    estimator=model, threshold=tuned_model.best_threshold_, prefit=True
 ).fit(data_train, target_train)
 
 # %%
 business_score = business_scorer(
     model_fixed_threshold, data_test, target_test, amount=amount_test
 )
-print(f"Benefit/cost of our logistic regression: {business_score:,.2f}€")
-print(
-    "Balanced accuracy of our logistic regression: "
-    f"{balanced_accuracy_scorer(model_fixed_threshold, data_test, target_test):.3f}"
-)
+print(f"Benefit of logistic regression with a tuned threshold: {business_score:,.2f}€")
 
 # %%
-# We observe that we obtained the exact same results but the fitting process was much
-# faster since we did not perform any search.
+# We observe that we obtained the exact same results but the fitting process
+# was much faster since we did not perform any hyper-parameter search.
+#
+# Finally, the estimate of the (average) business metric itself can be unreliable, in
+# particular when the number of data points in the minority class is very small.
+# Any business impact estimated by cross-validation of a business metric on
+# historical data (offline evaluation) should ideally be confirmed by A/B testing
+# on live data (online evaluation). Note however that A/B testing models is
+# beyond the scope of the scikit-learn library itself.
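With `prefit=True`, the call to `fit` no longer re-trains the wrapped grid-search model; only the decision cut-off changes. A small usage sketch, reusing `model` and `tuned_model` from the example:

from sklearn.model_selection import FixedThresholdClassifier

manual_model = FixedThresholdClassifier(
    estimator=model, threshold=tuned_model.best_threshold_, prefit=True
).fit(data_train, target_train)  # cheap: the underlying model is reused as-is

# The hard predictions are expected to match those of the tuned model, which is
# the point of the example's "exact same results" remark.
print((manual_model.predict(data_test) == tuned_model.predict(data_test)).mean())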

0 commit comments