@@ -199,46 +199,46 @@ def prep_liftstat(model, X_train, y_train, X_test, y_test, targetname, targetval
199
199
200
200
# prepare function
201
201
202
def calc_cumulative_gains(df: pd.DataFrame, actual_col: str, predicted_col: str, probability_col: str):
    """Build a 20-bin cumulative gains / lift table for the predicted-positive rows.

    Parameters
    ----------
    df : pd.DataFrame
        Scored observations. NOTE: the frame is sorted **in place** by
        ``probability_col`` (descending), exactly as before, so the caller's
        object is reordered as a side effect.
    actual_col : str
        Column holding the observed label.
    predicted_col : str
        Column holding the predicted label. Only rows where this column
        compares equal to ``True`` are binned.
    probability_col : str
        Column holding the score used for ranking.

    Returns
    -------
    dict
        ``DataFrame.to_dict()`` of the table, i.e. ``{column: {bin: value}}``
        with one entry per bin (20 bins) for the columns ``NumCases``,
        ``NumCorrectPredictions``, ``RunningCorrect``, ``PercentCorrect``,
        ``CumulativeCorrectBestCase``, ``PercentCorrectBestCase``, ``AvgCase``,
        ``CumulativeAvgCase``, ``PercentAvgCase``, ``NormalisedPercentAvg`` and
        ``NormalisedPercentWithModel``.
    """
    num_bins = 20

    # Kept in place on purpose: callers may rely on the frame being ranked.
    df.sort_values(by=probability_col, ascending=False, inplace=True)

    # Element-wise comparison (not identity), so 1 / True both select the row.
    subset = df[df[predicted_col] == True]  # noqa: E712

    # One flag per row: did the prediction match the observed label?  The
    # per-bin sum of these flags is the number of correct predictions, the same
    # count accuracy_score(..., normalize=False) reports for plain labels.
    hits = (subset[actual_col] == subset[predicted_col]).values

    # Split row *positions* rather than the DataFrame itself: np.array_split on
    # a DataFrame goes through DataFrame.swapaxes, which recent pandas
    # deprecates.  Bin sizes are identical to the previous behaviour.
    rows = [{'NumCases': len(positions),
             'NumCorrectPredictions': int(hits[positions].sum())}
            for positions in np.array_split(np.arange(len(subset)), num_bins)]

    lift = pd.DataFrame(rows)

    # Cumulative Gains Calculation
    total_correct = lift['NumCorrectPredictions'].sum()
    # Computed once instead of once per row.  When there are no correct
    # predictions this is inf (numpy semantics), as in the original arithmetic.
    scale = 100 / total_correct

    lift['RunningCorrect'] = lift['NumCorrectPredictions'].cumsum()
    lift['PercentCorrect'] = scale * lift['RunningCorrect']
    lift['CumulativeCorrectBestCase'] = lift['NumCases'].cumsum()
    # A perfect model captures every correct case as early as possible, so the
    # best-case curve is capped at 100%.
    lift['PercentCorrectBestCase'] = (
        scale * lift['CumulativeCorrectBestCase']).clip(upper=100)
    lift['AvgCase'] = total_correct / len(lift)
    lift['CumulativeAvgCase'] = lift['AvgCase'].cumsum()
    lift['PercentAvgCase'] = scale * lift['CumulativeAvgCase']

    # Lift Chart
    lift['NormalisedPercentAvg'] = 1
    lift['NormalisedPercentWithModel'] = (
        lift['PercentCorrect'] / lift['PercentAvgCase'])

    return lift.to_dict()
202
+ # def calc_cumulative_gains(df: pd.DataFrame, actual_col: str, predicted_col: str, probability_col: str):
203
+ #
204
+ # df.sort_values(by=probability_col, ascending=False, inplace=True)
205
+ #
206
+ # subset = df[df[predicted_col] == True]
207
+ #
208
+ # rows = []
209
+ #
210
+ # for group in np.array_split(subset, 20):
211
+ # score = accuracy_score(group[actual_col].tolist(),
212
+ # group[predicted_col].tolist(),
213
+ # normalize=False)
214
+ #
215
+ # rows.append({'NumCases': len(group),
216
+ # 'NumCorrectPredictions': score})
217
+ #
218
+ # lift = pd.DataFrame(rows)
219
+ #
220
+ # # Cumulative Gains Calculation
221
+ #
222
+ # lift['RunningCorrect'] = lift['NumCorrectPredictions'].cumsum()
223
+ # lift['PercentCorrect'] = lift.apply(
224
+ # lambda x: (100 / lift['NumCorrectPredictions'].sum()) * x['RunningCorrect'], axis=1)
225
+ # lift['CumulativeCorrectBestCase'] = lift['NumCases'].cumsum()
226
+ # lift['PercentCorrectBestCase'] = lift['CumulativeCorrectBestCase'].apply(
227
+ # lambda x: 100 if (100 / lift['NumCorrectPredictions'].sum()) * x > 100 else (100 / lift[
228
+ # 'NumCorrectPredictions'].sum()) * x)
229
+ # lift['AvgCase'] = lift['NumCorrectPredictions'].sum() / len(lift)
230
+ # lift['CumulativeAvgCase'] = lift['AvgCase'].cumsum()
231
+ # lift['PercentAvgCase'] = lift['CumulativeAvgCase'].apply(
232
+ # lambda x: (100 / lift['NumCorrectPredictions'].sum()) * x)
233
+ #
234
+ # # Lift Chart
235
+ # lift['NormalisedPercentAvg'] = 1
236
+ # lift['NormalisedPercentWithModel'] = lift['PercentCorrect'] / \
237
+ # lift['PercentAvgCase']
238
+ #
239
+ # lift_dict = lift.to_dict()
240
+ #
241
+ # return lift_dict
242
242
243
243
with open (templatedir + '/R_HMEQ_lift.json' , 'r' ) as i :
244
244
@@ -374,6 +374,104 @@ def calc_cumulative_gains(df: pd.DataFrame, actual_col: str, predicted_col: str,
374
374
print ("Saved in:" , outdir )
375
375
376
376
377
def lift_statistics(model, train=None, valid=None, test=None, event=None):
    """Compute binned lift statistics for a binary classifier.

    Each supplied dataset is scored with ``model.predict_proba``, ranked by the
    predicted probability of the event class and split into 20 bins; per-bin
    counts and cumulative accuracy are then derived.

    Parameters
    ----------
    model
        Fitted classifier exposing ``classes_`` and ``predict_proba``.
    train, valid, test : tuple, optional
        ``(X, y_true)`` pairs. ``y_true`` must be a categorical pandas Series
        (its ``.cat`` accessor is used) whose categories match
        ``model.classes_``.
    event : optional
        The class treated as the event. ``None`` selects the second class
        (index 1). A value found in ``model.classes_`` is mapped to index 0 if
        it equals the first class, otherwise to index 1. Anything else is
        converted with ``int()`` and used as the index directly.

    Returns
    -------
    dict
        Currently always ``{'data': None}``; the computed tables are not yet
        exposed (see the note at the end of the function).

    Raises
    ------
    ValueError
        If none of ``train``, ``valid`` or ``test`` is provided.
    """
    datasets = (valid, train, test)
    labels = ['VALIDATE', 'TRAIN', 'TEST']

    # At least some combination of datasets must be provided
    if all(d is None for d in datasets):
        raise ValueError("At least one dataset must be provided.")

    # Resolve `event` to the index of its probability column.
    if event is None:
        event = 1
    elif event in model.classes_:
        event = 0 if event == model.classes_[0] else 1
    else:
        event = int(event)

    # TODO: bin rows by percentiles or evenly?
    num_groups = 20
    results = []

    for label, dataset in zip(labels, datasets):
        if dataset is None:
            continue

        X, y_true = dataset

        # An unnamed Series has ``name is None``; fall back to 'Class' in that
        # case too, not only when the attribute is missing altogether.
        target_column = getattr(y_true, 'name', None)
        if target_column is None:
            target_column = 'Class'
        proba_columns = ['P_%s%d' % (target_column, i) for i in (0, 1)]
        event_column = proba_columns[event]

        # We need to re-assign int values to the dataset to ensure they match
        # up to the column order output by the model. Otherwise, output labels
        # will match, but underlying int representations will be off.
        y_true = y_true.cat.reorder_categories(model.classes_)

        # Predicted probability for each class
        y_pred_probs = pd.DataFrame(model.predict_proba(X))

        # Maximum likelihood class for each observation. Taken before the
        # columns are renamed so the result holds the 0 / 1 column positions.
        y_pred_index = y_pred_probs.idxmax(axis=1)
        y_pred_probs.columns = proba_columns

        # Explicitly reset indexes so the three pieces line up row by row.
        df = pd.concat((y_pred_probs.reset_index(drop=True),
                        y_pred_index.reset_index(drop=True),
                        y_true.cat.codes.reset_index(drop=True)), axis=1)
        df.columns = proba_columns + ['predicted', 'target']

        # Sort by highest probability of event (according to model)
        df.sort_values(by=event_column, ascending=False, inplace=True)

        # Split row *positions* rather than the DataFrame itself:
        # np.array_split on a DataFrame goes through DataFrame.swapaxes, which
        # recent pandas deprecates. Bin sizes are the same either way.
        groups = []
        for i, positions in enumerate(
                np.array_split(np.arange(len(df)), num_groups)):
            g = df.iloc[positions]
            groups.append({
                'nobjs': len(g),
                # .sum() counts the rows that are events; .count() would just
                # return the bin size.
                'true_events': int((g['target'] == event).sum()),
                # Number of correct predictions in the bin, i.e. what
                # accuracy_score(..., normalize=False) returns for these codes.
                'accuracy': int((g['target'] == g['predicted']).sum()),
                'sample': i})

        bins = pd.DataFrame(groups)
        bins['total_correct'] = bins['accuracy'].cumsum()
        bins['total_obs'] = bins['nobjs'].cumsum()
        bins['percent_accuracy'] = (
            100 * bins['total_correct'] / bins['total_obs'])

        total_true_events = (df['target'] == event).sum()
        total_samples = len(df)
        true_event_rate = total_true_events / float(total_samples)

        # Deciles of the event probability. Uses the resolved event column
        # instead of a hard-coded 'P_Type1', which only existed when the target
        # happened to be named 'Type'.
        quantile_cutoffs = np.percentile(df[event_column],
                                         np.arange(0, 100, 10))

        results.append((label, bins, true_event_rate, quantile_cutoffs))

    # NOTE(review): the per-dataset tables in `results` are intentionally not
    # returned yet. The output schema is not defined in this part of the file,
    # so the original return value is preserved for existing callers.
    return {'data': None}
473
+
474
+
377
475
def roc_statistics (model , train = None , valid = None , test = None ):
378
476
"""
379
477
0 commit comments