|
29 | 29 | from sklearn.model_selection import cross_val_score
|
30 | 30 | from sklearn.model_selection import KFold
|
31 | 31 | from sklearn.model_selection import StratifiedKFold
|
| 32 | +from sklearn.model_selection import BinnedStratifiedKFold |
32 | 33 | from sklearn.model_selection import LabelKFold
|
33 | 34 | from sklearn.model_selection import LeaveOneOut
|
34 | 35 | from sklearn.model_selection import LeaveOneLabelOut
|
@@ -140,34 +141,27 @@ def test_cross_validator_with_default_params():
|
140 | 141 | X_1d = np.array([1, 2, 3, 4])
|
141 | 142 | y = np.array([1, 1, 2, 2])
|
142 | 143 | labels = np.array([1, 2, 3, 4])
|
143 | | - loo = LeaveOneOut() |
144 | | - lpo = LeavePOut(p) |
145 | | - kf = KFold(n_folds) |
146 | | - skf = StratifiedKFold(n_folds) |
147 | | - lolo = LeaveOneLabelOut() |
148 | | - lopo = LeavePLabelOut(p) |
149 | | - ss = ShuffleSplit(random_state=0) |
150 | | - ps = PredefinedSplit([1, 1, 2, 2]) # n_splits = np of unique folds = 2 |
151 | | - |
152 | | - loo_repr = "LeaveOneOut()" |
153 | | - lpo_repr = "LeavePOut(p=2)" |
154 | | - kf_repr = "KFold(n_folds=2, random_state=None, shuffle=False)" |
155 | | - skf_repr = "StratifiedKFold(n_folds=2, random_state=None, shuffle=False)" |
156 | | - lolo_repr = "LeaveOneLabelOut()" |
157 | | - lopo_repr = "LeavePLabelOut(n_labels=2)" |
158 | | - ss_repr = ("ShuffleSplit(n_iter=10, random_state=0, test_size=0.1, " |
159 | | - "train_size=None)") |
160 | | - ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))" |
| 144 | + cvs = [ |
| 145 | + (LeaveOneOut(), "LeaveOneOut()", n_samples), |
| 146 | + (LeavePOut(p), "LeavePOut(p=%u)" % p, comb(n_samples, p)), |
| 147 | + (KFold(n_folds), "KFold(n_folds=2, random_state=None, shuffle=False)", n_folds), |
| 148 | + (StratifiedKFold(n_folds), ("StratifiedKFold(n_folds=2, " |
| 149 | + "random_state=None, shuffle=False)"), n_folds), |
| 150 | + (LeaveOneLabelOut(), "LeaveOneLabelOut()", n_unique_labels), |
| 151 | + (LeavePLabelOut(p), "LeavePLabelOut(n_labels=%u)" % p, comb(n_unique_labels, p)), |
| 152 | + (ShuffleSplit(random_state=0), ("ShuffleSplit(n_iter=10, random_state=0, test_size=0.1, " |
| 153 | + "train_size=None)"), n_iter), |
| 154 | + (PredefinedSplit([1, 1, 2, 2]), "PredefinedSplit(test_fold=array([1, 1, 2, 2]))", 2), |
| 155 | + ] |
| 156 | + # for PredefinedSplit: n_splits = number of unique folds = 2 |
161 | 157 |
|
162 | 158 | n_splits = [n_samples, comb(n_samples, p), n_folds, n_folds,
|
163 | 159 | n_unique_labels, comb(n_unique_labels, p), n_iter, 2]
|
164 | 160 |
|
165 | | - for i, (cv, cv_repr) in enumerate(zip( |
166 | | - [loo, lpo, kf, skf, lolo, lopo, ss, ps], |
167 | | - [loo_repr, lpo_repr, kf_repr, skf_repr, lolo_repr, lopo_repr, |
168 | | - ss_repr, ps_repr])): |
| 161 | + for i, (cv, cv_repr, n_splits_) in enumerate(cvs): |
| 162 | + print(cv, cv_repr, n_splits_) |
169 | 163 | # Test if get_n_splits works correctly
|
170 | | - assert_equal(n_splits[i], cv.get_n_splits(X, y, labels)) |
| 164 | + assert_equal(n_splits_, cv.get_n_splits(X, y, labels)) |
171 | 165 |
|
172 | 166 | # Test if the cross-validator works as expected even if
|
173 | 167 | # the data is 1d
|
@@ -379,6 +373,110 @@ def test_stratifiedkfold_balance():
|
379 | 373 | assert_equal(np.sum(sizes), i)
|
380 | 374 |
|
381 | 375 |
|
| 376 | +def test_binnedstratifiedkfold_balance(): |
| 377 | + for i in range(11, 17): |
| 378 | + n_folds = 2 + int(10*np.random.rand()) |
| 379 | + y = np.random.randn(i) |
| 380 | + np.random.shuffle(y) |
| 381 | + sizes = [] |
| 382 | + |
| 383 | + cv = BinnedStratifiedKFold(n_folds, |
| 384 | + shuffle=False, random_state=None) |
| 385 | + bskf = cv.split(y) |
| 386 | + |
| 387 | + bins = np.percentile(y, np.linspace(0, 100, n_folds + 1)) |
| 388 | + for train_index, test_index in bskf: |
| 389 | + sizes.append( |
| 390 | + len(test_index) |
| 391 | + ) |
| 392 | + assert_true((np.max(sizes) - np.min(sizes)) <= 1) |
| 393 | + assert_equal(np.sum(sizes), i) |
| 394 | + |
| 395 | + |
| 396 | +def test_binnedstratifiedkfold_bin_spacing(): |
| 397 | + "check if the binned `y` falls into bins of equal size (+/- 1)" |
| 398 | + for _ in range(10): |
| 399 | + n_folds = 2 + int(10*np.random.rand()) |
| 400 | + y = np.random.randn(30) |
| 401 | + np.random.shuffle(y) |
| 402 | + |
| 403 | + cv = BinnedStratifiedKFold(n_folds=n_folds, shuffle=False, |
| 404 | + random_state=None) |
| 405 | + bskf = cv.split(y) |
| 406 | + # quantile bin edges spanning 0-100% (assumes stratification into n_folds bins) |
| 407 | + bins = np.percentile(y, np.linspace(0, 100, n_folds + 1)) |
| 408 | + |
| 409 | + for train_index, test_index in bskf: |
| 410 | + y_test = y[test_index] |
| 411 | + hist_test, _ = np.histogram(y_test, bins=bins) |
| 412 | + assert_true(all(abs(hist_test - np.mean(hist_test)) <= 1), |
| 413 | + msg="y_test bin occupancies are too ragged") |
| 414 | + |
| 415 | + y_train = y[train_index] |
| 416 | + hist_train, _ = np.histogram(y_train, bins=bins) |
| 417 | + assert_true(all(abs(hist_train - np.mean(hist_train)) <= 1), |
| 418 | + msg="y_train bin occupancies are too ragged") |
| 419 | + |
| 420 | + |
| 421 | +def test_binnedstratifiedkfold_has_more_stable_distribution_moments_between_folds(): |
| 422 | + """check if BinnedStratifiedKFold performs on average better than KFold in terms of |
| 423 | + lower between-fold variance of fold mean(y_test) and fold std(y_test) |
| 424 | + """ |
| 425 | + binned_has_more_stable_std_list = [] |
| 426 | + binned_has_more_stable_mean_list = [] |
| 427 | + |
| 428 | + for trial in range(100): |
| 429 | + n_folds = 2 + int(10*np.random.rand()) |
| 430 | + y = np.random.randn(30) |
| 431 | + np.random.shuffle(y) |
| 432 | + ymeans_binned = [] |
| 433 | + ystds_binned = [] |
| 434 | + |
| 435 | + cv_bs = BinnedStratifiedKFold(n_folds=n_folds, shuffle=False, |
| 436 | + random_state=None) |
| 437 | + bskf = cv_bs.split(y) |
| 438 | + |
| 439 | + cv = KFold(n_folds=n_folds, |
| 440 | + shuffle=True, random_state=None) |
| 441 | + kf = cv.split(y) |
| 442 | + |
| 443 | + # quantile bin edges spanning 0-100%, as in the bin-spacing test above |
| 444 | + bins = np.percentile(y, np.linspace(0, 100, n_folds + 1)) |
| 445 | + |
| 446 | + for train_index, test_index in bskf: |
| 447 | + y_test = y[test_index] |
| 448 | + ymeans_binned.append(y_test.mean()) |
| 449 | + ystds_binned.append(y_test.std()) |
| 450 | + hist_, _ = np.histogram(y[test_index], bins=bins) |
| 451 | + |
| 452 | + assert_true(all(abs(hist_ - np.mean(hist_)) <= 1), |
| 453 | + msg="too ragged bins") |
| 454 | + |
| 455 | + ymeans_regular = [] |
| 456 | + ystds_regular = [] |
| 457 | + for train_index_reg, test_index_reg in kf: |
| 458 | + ymeans_regular.append(y[test_index_reg].mean()) |
| 459 | + ystds_regular.append(y[test_index_reg].std()) |
| 460 | + |
| 461 | + binned_has_more_stable_std = np.std(ystds_regular) > np.std(ystds_binned) |
| 462 | + binned_has_more_stable_std_list.append(binned_has_more_stable_std) |
| 463 | + |
| 464 | + binned_has_more_stable_mean = np.std(ymeans_regular) > np.std(ymeans_binned) |
| 465 | + binned_has_more_stable_mean_list.append(binned_has_more_stable_mean) |
| 466 | + |
| 467 | + binned_has_more_stable_std_fraction = np.mean(binned_has_more_stable_std_list) |
| 468 | + binned_has_more_stable_mean_fraction = np.mean(binned_has_more_stable_mean_list) |
| 469 | + |
| 470 | + assert_greater(binned_has_more_stable_std_fraction, 0.5) |
| 471 | + assert_greater(binned_has_more_stable_mean_fraction, 0.5) |
| 472 | + print(" std(y_test) of BinnedStratifiedKFold was more stable than " |
| 473 | + "that of KFold in\t%.2f%% of cases" % |
| 474 | + (100.0 * binned_has_more_stable_std_fraction)) |
| 475 | + print("mean(y_test) of BinnedStratifiedKFold was more stable than " |
| 476 | + "that of KFold in\t%.2f%% of cases" % |
| 477 | + (100.0 * binned_has_more_stable_mean_fraction)) |
| 478 | + |
| 479 | + |
382 | 480 | def test_shuffle_kfold():
|
383 | 481 | # Check the indices are shuffled properly
|
384 | 482 | kf = KFold(3)
|
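The tests above only exercise the behaviour expected of `BinnedStratifiedKFold`; the class itself lives elsewhere in this branch. For readers unfamiliar with the idea, here is a minimal, hypothetical sketch of the technique the tests assume: quantile-bin the continuous target, then reuse `StratifiedKFold` on the bin labels. The helper name `binned_stratified_splits` is made up for illustration and is not this PR's implementation.

    import numpy as np
    from sklearn.model_selection import StratifiedKFold

    def binned_stratified_splits(y, n_folds=3, n_bins=None, shuffle=False,
                                 random_state=None):
        """Yield (train, test) indices stratified on quantile bins of a continuous y."""
        y = np.asarray(y)
        n_bins = n_folds if n_bins is None else n_bins
        # quantile edges give bins of (almost) equal occupancy over the full y
        edges = np.percentile(y, np.linspace(0, 100, n_bins + 1))
        # assign every sample to its bin via the interior edges
        bin_id = np.digitize(y, edges[1:-1])
        # StratifiedKFold only looks at the labels, so a dummy X is enough
        skf = StratifiedKFold(n_folds, shuffle=shuffle, random_state=random_state)
        return skf.split(np.zeros((len(y), 1)), bin_id)

    y = np.random.randn(30)
    for train_index, test_index in binned_stratified_splits(y, n_folds=3):
        print(len(test_index), y[test_index].mean(), y[test_index].std())

Because every bin contributes roughly equally to every test fold, each fold samples the whole range of y, which is exactly why the tests expect per-fold bin histograms that differ by at most one count and more stable per-fold means and standard deviations than plain KFold.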