@@ -172,7 +172,7 @@ def credit_gain_score(y, y_pred, neg_label, pos_label):
    return np.sum(cm * gain_matrix)


- scoring["cost_gain"] = make_scorer(
+ scoring["credit_gain"] = make_scorer(
    credit_gain_score, neg_label=neg_label, pos_label=pos_label
)
# %%
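For context, only the last line of `credit_gain_score` appears in this hunk. A minimal sketch of how such a gain-based score can be computed by weighting the confusion matrix with a gain matrix; the gain values below are illustrative assumptions, not necessarily the ones used in the example:

import numpy as np
from sklearn.metrics import confusion_matrix


def credit_gain_score(y, y_pred, neg_label, pos_label):
    # Rows of the confusion matrix are observed classes, columns are predicted
    # classes; weight each cell by its business gain (negative values are
    # costs) and sum everything up.
    cm = confusion_matrix(y, y_pred, labels=[neg_label, pos_label])
    gain_matrix = np.array([[0, -1], [-5, 0]])  # assumed illustrative gains
    return np.sum(cm * gain_matrix)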
@@ -247,7 +247,7 @@ def credit_gain_score(y, y_pred, neg_label, pos_label):
# However, we recall that the original aim was to minimize the cost (or maximize the
# gain) as defined by the business metric. We can compute the value of the business
# metric:
- print(f"Business defined metric: {scoring['cost_gain'](model, X_test, y_test)}")
+ print(f"Business defined metric: {scoring['credit_gain'](model, X_test, y_test)}")

# %%
# At this stage we don't know if any other cut-off can lead to a greater gain. To find
@@ -272,7 +272,7 @@ def credit_gain_score(y, y_pred, neg_label, pos_label):

tuned_model = TunedThresholdClassifierCV(
    estimator=model,
-     scoring=scoring["cost_gain"],
+     scoring=scoring["credit_gain"],
    store_cv_results=True,  # necessary to inspect all results
)
tuned_model.fit(X_train, y_train)
@@ -379,7 +379,7 @@ def plot_roc_pr_curves(vanilla_model, tuned_model, *, title):
#
# We can now check if choosing this cut-off point leads to a better score on the testing
# set:
- print(f"Business defined metric: {scoring['cost_gain'](tuned_model, X_test, y_test)}")
+ print(f"Business defined metric: {scoring['credit_gain'](tuned_model, X_test, y_test)}")

# %%
# We observe that tuning the decision threshold almost improves our business gains
@@ -487,7 +487,7 @@ def plot_roc_pr_curves(vanilla_model, tuned_model, *, title):
fraud = target == 1
amount_fraud = data["Amount"][fraud]
_, ax = plt.subplots()
- ax.hist(amount_fraud, bins=100)
+ ax.hist(amount_fraud, bins=30)
ax.set_title("Amount of fraud transaction")
_ = ax.set_xlabel("Amount (€)")

@@ -500,20 +500,18 @@ def plot_roc_pr_curves(vanilla_model, tuned_model, *, title):
# a gain of 2% of the amount of the transaction. However, accepting a fraudulent
# transaction result in a loss of the amount of the transaction. As stated in [2]_, the
# gain and loss related to refusals (of fraudulent and legitimate transactions) are not
- # trivial to define. Here, we define that a refusal of a legitimate transaction is
- # estimated to a loss of 5€ while the refusal of a fraudulent transaction is estimated
- # to a gain of 50€ and the amount of the transaction. Therefore, we define the
- # following function to compute the total benefit of a given decision:
+ # trivial to define. Here, we define that a refusal of a legitimate transaction
+ # is estimated to a loss of 5€ while the refusal of a fraudulent transaction is
+ # estimated to a gain of 50€. Therefore, we define the following function to
+ # compute the total benefit of a given decision:


def business_metric(y_true, y_pred, amount):
    mask_true_positive = (y_true == 1) & (y_pred == 1)
    mask_true_negative = (y_true == 0) & (y_pred == 0)
    mask_false_positive = (y_true == 0) & (y_pred == 1)
    mask_false_negative = (y_true == 1) & (y_pred == 0)
-     fraudulent_refuse = (mask_true_positive.sum() * 50) + amount[
-         mask_true_positive
-     ].sum()
+     fraudulent_refuse = mask_true_positive.sum() * 50
    fraudulent_accept = -amount[mask_false_negative].sum()
    legitimate_refuse = mask_false_positive.sum() * -5
    legitimate_accept = (amount[mask_true_negative] * 0.02).sum()
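The hunk above ends inside `business_metric`, and the diff does not show how this function is wrapped into the `business_scorer` used in the following hunks. A minimal sketch of how such a scorer can be built with scikit-learn's metadata routing, assuming the sample-aligned `amount` array is requested as score-time metadata:

import sklearn
from sklearn.metrics import make_scorer

sklearn.set_config(enable_metadata_routing=True)
# Request the per-transaction `amount` as scoring metadata so that calling
# business_scorer(model, X, y, amount=...) forwards it to business_metric.
business_scorer = make_scorer(business_metric).set_score_request(amount=True)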
@@ -540,7 +538,6 @@ def business_metric(y_true, y_pred, amount):
amount = credit_card.frame["Amount"].to_numpy()

# %%
- # We first start to train a dummy classifier to have some baseline results.
from sklearn.model_selection import train_test_split

data_train, data_test, target_train, target_test, amount_train, amount_test = (
@@ -550,50 +547,44 @@ def business_metric(y_true, y_pred, amount):
)

# %%
+ # We first evaluate some baseline policies to serve as reference. Recall that
+ # class "0" is the legitimate class and class "1" is the fraudulent class.
from sklearn.dummy import DummyClassifier

- easy_going_classifier = DummyClassifier(strategy="constant", constant=0)
- easy_going_classifier.fit(data_train, target_train)
- benefit_cost = business_scorer(
-     easy_going_classifier, data_test, target_test, amount=amount_test
+ always_accept_policy = DummyClassifier(strategy="constant", constant=0)
+ always_accept_policy.fit(data_train, target_train)
+ benefit = business_scorer(
+     always_accept_policy, data_test, target_test, amount=amount_test
)
- print(f"Benefit/cost of our easy-going classifier: {benefit_cost:,.2f}€")
+ print(f"Benefit of the 'always accept' policy: {benefit:,.2f}€")

# %%
- # A classifier that predict all transactions as legitimate would create a profit of
- # around 220,000.€ We make the same evaluation for a classifier that predicts all
+ # A policy that considers all transactions as legitimate would create a profit of
+ # around 220,000€. We make the same evaluation for a classifier that predicts all
# transactions as fraudulent.
- intolerant_classifier = DummyClassifier(strategy="constant", constant=1)
- intolerant_classifier.fit(data_train, target_train)
- benefit_cost = business_scorer(
-     intolerant_classifier, data_test, target_test, amount=amount_test
+ always_reject_policy = DummyClassifier(strategy="constant", constant=1)
+ always_reject_policy.fit(data_train, target_train)
+ benefit = business_scorer(
+     always_reject_policy, data_test, target_test, amount=amount_test
)
- print(f"Benefit/cost of our intolerant classifier: {benefit_cost:,.2f}€")
+ print(f"Benefit of the 'always reject' policy: {benefit:,.2f}€")

- # %%
- # Such a classifier create a loss of around 670,000.€ A predictive model should allow
- # us to make a profit larger than 220,000.€ It is interesting to compare this business
- # metric with another "standard" statistical metric such as the balanced accuracy.
- from sklearn.metrics import get_scorer
-
- balanced_accuracy_scorer = get_scorer("balanced_accuracy")
- print(
-     "Balanced accuracy of our easy-going classifier: "
-     f"{balanced_accuracy_scorer(easy_going_classifier, data_test, target_test):.3f}"
- )
- print(
-     "Balanced accuracy of our intolerant classifier: "
-     f"{balanced_accuracy_scorer(intolerant_classifier, data_test, target_test):.3f}"
- )

# %%
- # This is not a surprise that the balanced accuracy is at 0.5 for both classifiers.
- # However, we need to be careful in the rest of the evaluation: we potentially can
- # obtain a model with a decent balanced accuracy that does not make any profit.
- # In this case, the model would be harmful for our business.
+ # Such a policy would entail a catastrophic loss: around 670,000€. This is
+ # expected since the vast majority of the transactions are legitimate and the
+ # policy would refuse them at a non-trivial cost.
+ #
+ # A predictive model that adapts the accept/reject decisions on a per-transaction
+ # basis should ideally allow us to make a profit larger than the
+ # 220,000€ of the best of our constant baseline policies.
#
- # Let's now create a predictive model using a logistic regression without tuning the
- # decision threshold.
+ # We start with a logistic regression model with the default decision threshold
+ # at 0.5. Here we tune the hyperparameter `C` of the logistic regression with a
+ # proper scoring rule (the log loss) to ensure that the model's probabilistic
+ # predictions returned by its `predict_proba` method are as accurate as
+ # possible, irrespective of the choice of the value of the decision
+ # threshold.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
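The definitions of `logistic_regression` and `param_grid` used by the `GridSearchCV` call in the next hunk fall outside the shown context. A plausible minimal sketch, where the pipeline composition and the grid values are assumptions:

import numpy as np
from sklearn.preprocessing import StandardScaler

# Hypothetical setup: standardize the features, then fit a logistic
# regression; the regularization strength C is searched on a log scale.
logistic_regression = make_pipeline(StandardScaler(), LogisticRegression())
param_grid = {"logisticregression__C": np.logspace(-6, 6, 13)}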
@@ -604,21 +595,19 @@ def business_metric(y_true, y_pred, amount):
model = GridSearchCV(logistic_regression, param_grid, scoring="neg_log_loss").fit(
    data_train, target_train
)
+ model

+ # %%
print(
-     "Benefit/cost of our logistic regression: "
+     "Benefit of logistic regression with default threshold: "
    f"{business_scorer(model, data_test, target_test, amount=amount_test):,.2f}€"
)
- print(
-     "Balanced accuracy of our logistic regression: "
-     f"{balanced_accuracy_scorer(model, data_test, target_test):.3f}"
- )

# %%
- # By observing the balanced accuracy, we see that our predictive model is learning
- # some associations between the features and the target. The business metric also shows
- # that our model is beating the baseline in terms of profit and it would be already
- # beneficial to use it instead of ignoring the fraud detection problem.
+ # The business metric shows that our predictive model with a default decision
+ # threshold is already winning over the baseline in terms of profit and it would
+ # already be beneficial to use it to accept or reject transactions instead of
+ # accepting all transactions.
#
# Tuning the decision threshold
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
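The next hunk fits `tuned_model` with the `amount` metadata routed to the scorer, but the construction of `tuned_model` is not part of the shown hunks. A minimal sketch of how such a model could be set up, assuming only the `estimator` and `scoring` arguments:

from sklearn.model_selection import TunedThresholdClassifierCV

# Cross-validated search for the decision threshold that maximizes the business
# scorer; the `amount` passed to fit() below is routed to that scorer.
tuned_model = TunedThresholdClassifierCV(estimator=model, scoring=business_scorer)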
@@ -642,30 +631,21 @@ def business_metric(y_true, y_pred, amount):
# automatically dispatching this metadata to the underlying scorer.
tuned_model.fit(data_train, target_train, amount=amount_train)

+ # %%
+ # We observe that the tuned decision threshold is far away from the default 0.5:
+ print(f"Tuned decision threshold: {tuned_model.best_threshold_:.2f}")
+
# %%
print(
-     "Benefit/cost of our logistic regression: "
+     "Benefit of logistic regression with a tuned threshold: "
    f"{business_scorer(tuned_model, data_test, target_test, amount=amount_test):,.2f}€"
)
- print(
-     "Balanced accuracy of our logistic regression: "
-     f"{balanced_accuracy_scorer(tuned_model, data_test, target_test):.3f}"
- )

# %%
- # We observe that tuning the decision threshold increases the expected profit of
- # deploying our model as estimated by the business metric.
- # Eventually, the balanced accuracy also increased. Note that it might not always be
- # the case because the statistical metric is not necessarily a surrogate of the
- # business metric. It is therefore important, whenever possible, optimize the decision
- # threshold with respect to the business metric.
- #
- # Finally, the estimate of the business metric itself can be unreliable, in
- # particular when the number of data points in the minority class is so small.
- # Any business impact estimated by cross-validation of a business metric on
- # historical data (offline evaluation) should ideally be confirmed by A/B testing
- # on live data (online evaluation). Note however that A/B testing models is
- # beyond the scope of the scikit-learn library itself.
+ # We observe that tuning the decision threshold increases the expected profit
+ # when deploying our model, as indicated by the business metric. It is therefore
+ # valuable, whenever possible, to optimize the decision threshold with respect
+ # to the business metric.
#
# Manually setting the decision threshold instead of tuning it
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -684,19 +664,22 @@ def business_metric(y_true, y_pred, amount):
from sklearn.model_selection import FixedThresholdClassifier

model_fixed_threshold = FixedThresholdClassifier(
-     estimator=model, threshold=tuned_model.best_threshold_
+     estimator=model, threshold=tuned_model.best_threshold_, prefit=True
).fit(data_train, target_train)

# %%
business_score = business_scorer(
    model_fixed_threshold, data_test, target_test, amount=amount_test
)
- print(f"Benefit/cost of our logistic regression: {business_score:,.2f}€")
- print(
-     "Balanced accuracy of our logistic regression: "
-     f"{balanced_accuracy_scorer(model_fixed_threshold, data_test, target_test):.3f}"
- )
+ print(f"Benefit of logistic regression with a tuned threshold: {business_score:,.2f}€")

# %%
- # We observe that we obtained the exact same results but the fitting process was much
- # faster since we did not perform any search.
+ # We observe that we obtained the exact same results but the fitting process
+ # was much faster since we did not perform any hyper-parameter search.
+ #
+ # Finally, the estimate of the (average) business metric itself can be unreliable, in
+ # particular when the number of data points in the minority class is very small.
+ # Any business impact estimated by cross-validation of a business metric on
+ # historical data (offline evaluation) should ideally be confirmed by A/B testing
+ # on live data (online evaluation). Note however that A/B testing models is
+ # beyond the scope of the scikit-learn library itself.