calc lift statistics (incomplete) · Sivateja0689/python-sasctl@c434deb · GitHub
[go: up one dir, main page]

Skip to content

Commit c434deb

Browse files
committed
calc lift statistics (incomplete)
1 parent 69517ba commit c434deb

File tree

2 files changed

+162
-45
lines changed

2 files changed

+162
-45
lines changed

src/sasctl/utils/metrics/compare.py

Lines changed: 138 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -199,46 +199,46 @@ def prep_liftstat(model, X_train, y_train, X_test, y_test, targetname, targetval
199199

200200
# prepare function
201201

202-
def calc_cumulative_gains(df: pd.DataFrame, actual_col: str, predicted_col: str, probability_col: str):
203-
204-
df.sort_values(by=probability_col, ascending=False, inplace=True)
205-
206-
subset = df[df[predicted_col] == True]
207-
208-
rows = []
209-
210-
for group in np.array_split(subset, 20):
211-
score = accuracy_score(group[actual_col].tolist(),
212-
group[predicted_col].tolist(),
213-
normalize=False)
214-
215-
rows.append({'NumCases': len(group),
216-
'NumCorrectPredictions': score})
217-
218-
lift = pd.DataFrame(rows)
219-
220-
# Cumulative Gains Calculation
221-
222-
lift['RunningCorrect'] = lift['NumCorrectPredictions'].cumsum()
223-
lift['PercentCorrect'] = lift.apply(
224-
lambda x: (100 / lift['NumCorrectPredictions'].sum()) * x['RunningCorrect'], axis=1)
225-
lift['CumulativeCorrectBestCase'] = lift['NumCases'].cumsum()
226-
lift['PercentCorrectBestCase'] = lift['CumulativeCorrectBestCase'].apply(
227-
lambda x: 100 if (100 / lift['NumCorrectPredictions'].sum()) * x > 100 else (100 / lift[
228-
'NumCorrectPredictions'].sum()) * x)
229-
lift['AvgCase'] = lift['NumCorrectPredictions'].sum() / len(lift)
230-
lift['CumulativeAvgCase'] = lift['AvgCase'].cumsum()
231-
lift['PercentAvgCase'] = lift['CumulativeAvgCase'].apply(
232-
lambda x: (100 / lift['NumCorrectPredictions'].sum()) * x)
233-
234-
# Lift Chart
235-
lift['NormalisedPercentAvg'] = 1
236-
lift['NormalisedPercentWithModel'] = lift['PercentCorrect'] / \
237-
lift['PercentAvgCase']
238-
239-
lift_dict = lift.to_dict()
240-
241-
return lift_dict
202+
# def calc_cumulative_gains(df: pd.DataFrame, actual_col: str, predicted_col: str, probability_col: str):
203+
#
204+
# df.sort_values(by=probability_col, ascending=False, inplace=True)
205+
#
206+
# subset = df[df[predicted_col] == True]
207+
#
208+
# rows = []
209+
#
210+
# for group in np.array_split(subset, 20):
211+
# score = accuracy_score(group[actual_col].tolist(),
212+
# group[predicted_col].tolist(),
213+
# normalize=False)
214+
#
215+
# rows.append({'NumCases': len(group),
216+
# 'NumCorrectPredictions': score})
217+
#
218+
# lift = pd.DataFrame(rows)
219+
#
220+
# # Cumulative Gains Calculation
221+
#
222+
# lift['RunningCorrect'] = lift['NumCorrectPredictions'].cumsum()
223+
# lift['PercentCorrect'] = lift.apply(
224+
# lambda x: (100 / lift['NumCorrectPredictions'].sum()) * x['RunningCorrect'], axis=1)
225+
# lift['CumulativeCorrectBestCase'] = lift['NumCases'].cumsum()
226+
# lift['PercentCorrectBestCase'] = lift['CumulativeCorrectBestCase'].apply(
227+
# lambda x: 100 if (100 / lift['NumCorrectPredictions'].sum()) * x > 100 else (100 / lift[
228+
# 'NumCorrectPredictions'].sum()) * x)
229+
# lift['AvgCase'] = lift['NumCorrectPredictions'].sum() / len(lift)
230+
# lift['CumulativeAvgCase'] = lift['AvgCase'].cumsum()
231+
# lift['PercentAvgCase'] = lift['CumulativeAvgCase'].apply(
232+
# lambda x: (100 / lift['NumCorrectPredictions'].sum()) * x)
233+
#
234+
# # Lift Chart
235+
# lift['NormalisedPercentAvg'] = 1
236+
# lift['NormalisedPercentWithModel'] = lift['PercentCorrect'] / \
237+
# lift['PercentAvgCase']
238+
#
239+
# lift_dict = lift.to_dict()
240+
#
241+
# return lift_dict
242242

243243
with open(templatedir + '/R_HMEQ_lift.json', 'r') as i:
244244

@@ -374,6 +374,104 @@ def calc_cumulative_gains(df: pd.DataFrame, actual_col: str, predicted_col: str,
374374
print("Saved in:", outdir)
375375

376376

377+
def lift_statistics(model, train=None, valid=None, test=None, event=None):
    """Calculate lift statistics for a binary classification model.

    NOTE: this implementation is incomplete.  Per-dataset statistics are
    computed but are not yet assembled into the returned value.

    Parameters
    ----------
    model
        Fitted classifier exposing ``classes_`` and ``predict_proba(X)``.
    train : tuple, optional
        ``(X, y_true)`` pair for the training data.
    valid : tuple, optional
        ``(X, y_true)`` pair for the validation data.
    test : tuple, optional
        ``(X, y_true)`` pair for the test data.
    event : optional
        Target level treated as the event.  When omitted, class index 1 is
        used.  A value found in ``model.classes_`` maps to index 0 if it
        equals the first class and to index 1 otherwise.  Any other value
        is converted with ``int``.

    Returns
    -------
    dict
        Currently always ``{'data': None}``.

    Raises
    ------
    ValueError
        If none of `train`, `valid` or `test` is provided.

    """
    datasets = (valid, train, test)

    # TODO: placeholders for assembling the final output; not used yet.
    labels = ['VALIDATE', 'TRAIN', 'TEST']
    results = []
    row_count = 0

    # At least some combination of datasets must be provided
    if all(d is None for d in datasets):
        raise ValueError("At least one dataset must be provided.")

    # Translate `event` into the 0 / 1 column index used by the model output.
    if event is None:
        event = 1
    elif event in model.classes_:
        event = 0 if event == model.classes_[0] else 1
    else:
        event = int(event)

    for dataset in datasets:
        if dataset is None:
            continue

        X, y_true = dataset

        target_column = getattr(y_true, 'name', 'Class')
        proba_columns = ['P_%s%d' % (target_column, i) for i in (0, 1)]
        event_column = proba_columns[event]

        # We need to re-assign int values to the dataset to ensure they match
        # up to the column order output by the model.  Otherwise, output labels
        # will match, but underlying int representations will be off.
        y_true = y_true.cat.reorder_categories(model.classes_)

        # Predicted probability for each class
        y_pred_probs = pd.DataFrame(model.predict_proba(X))

        # Maximum likelihood class for each observation
        y_pred_index = y_pred_probs.idxmax(axis=1)

        # Column names default to 0 / 1.  Only rename after calculating idxmax
        # to ensure y_pred_index contains 0 / 1 values.
        y_pred_probs.columns = proba_columns

        # Explicitly reset indexes.  pd.concat(ignore_index=True) didn't work.
        y_pred_probs.reset_index(drop=True, inplace=True)
        y_pred_index.reset_index(drop=True, inplace=True)
        y_true_codes = y_true.cat.codes
        y_true_codes.reset_index(drop=True, inplace=True)

        df = pd.concat((y_pred_probs, y_pred_index, y_true_codes), axis=1)
        df.columns = proba_columns + ['predicted', 'target']

        # Sort by highest probability of event (according to model)
        df.sort_values(by=event_column, ascending=False, inplace=True)

        # TODO: bin rows by percentiles or evenly?
        num_groups = 20
        groups = []
        for i, g in enumerate(np.array_split(df, num_groups)):
            stats = {'nobjs': len(g),
                     # .sum() counts the rows matching the event; .count()
                     # would simply return the number of rows in the bin.
                     'true_events': (g['target'] == event).sum(),
                     'accuracy': accuracy_score(g['target'],
                                                g['predicted'],
                                                normalize=False),
                     'sample': i}
            groups.append(stats)

        t2 = pd.DataFrame(groups)
        t2['total_correct'] = t2['accuracy'].cumsum()
        t2['total_obs'] = t2['nobjs'].cumsum()
        t2['percent_accuracy'] = 100 * t2['total_correct'] / t2['total_obs']

        total_true_events = (df['target'] == event).sum()
        total_samples = len(df)
        true_event_rate = total_true_events / float(total_samples)

        # TODO: work in progress - these values are not consumed yet.
        # Use the event probability column derived from the target name
        # rather than a hard-coded column name.
        actual_value = df['target']
        predict_value = df[event_column]
        num_observations = len(actual_value)
        quantile_cutoff = np.percentile(predict_value, np.arange(0, 100, 10))

    return {'data': None}
473+
474+
377475
def roc_statistics(model, train=None, valid=None, test=None):
378476
"""
379477

tests/unit/test_metrics.py

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -104,9 +104,28 @@ def test_roc_statistics_binary(cancer_dataset):
104104
assert isinstance(stats, dict)
105105

106106
# Should only contain stats for training data
107-
assert len(stats['data']) == 1
107+
assert len(stats['data']) == 3
108108

109-
assert stats['data'][0]['rowNumber'] == 1
110-
assert stats['data'][0]['dataMap']['_DataRole_'] == 'TRAIN'
111-
assert stats['data'][0]['dataMap']['_NObs_'] == X.shape[0]
112-
assert stats['data'][0]['dataMap']['_DIV_'] == X.shape[0]
109+
110+
def test_lift_statistics_binary(cancer_dataset):
    """Check that lift_statistics() returns a dict for a binary classifier."""
    # Skip when scikit-learn is unavailable; the returned module is not needed.
    pytest.importorskip('sklearn')

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    from sasctl.utils.metrics import compare

    # Fixed seeds keep the split and the fitted forest reproducible.
    model = RandomForestClassifier(random_state=0)
    X = cancer_dataset.drop('Type', axis=1)
    y = cancer_dataset['Type']

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    model.fit(X_train, y_train)

    stats = compare.lift_statistics(model, train=(X_train, y_train),
                                    test=(X_test, y_test),
                                    event='malignant')

    assert isinstance(stats, dict)

    # NOTE(review): only train and test data are passed above - confirm that
    # three entries is the intended length of stats['data'].
    assert len(stats['data']) == 3

0 commit comments

Comments
 (0)
0