import numpy as np
+ import warnings
from .base import is_classifier, clone
from .cross_validation import _check_cv
from .utils import check_arrays
def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10),
                   cv=None, scoring=None, exploit_incremental_learning=False,
                   n_jobs=1, verbose=0):
-     """ TODO document me
+     """Learning curve
+
+     Determines cross-validated training and test scores for different
+     training set sizes.

    Parameters
    ----------
@@ -63,49 +67,26 @@ def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10),
    test_scores : array, shape = [n_ticks,]
        Scores on test set.
    """
-     # TODO tests, doc
    # TODO use verbose argument

-     X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True)
-     # Make a list since we will be iterating multiple times over the folds
-     cv = list(_check_cv(cv, X, y, classifier=is_classifier(estimator)))
-
    if exploit_incremental_learning and not hasattr(estimator, 'partial_fit'):
        raise ValueError('An estimator must support the partial_fit interface '
                         'to exploit incremental learning')

-     # Determine range of number of training samples
+     X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True)
+     # Make a list since we will be iterating multiple times over the folds
+     cv = list(_check_cv(cv, X, y, classifier=is_classifier(estimator)))
+
    n_max_training_samples = cv[0][0].shape[0]
-     n_samples_range = np.asarray(n_samples_range)
-     n_min_required_samples = np.min(n_samples_range)
-     n_max_required_samples = np.max(n_samples_range)
-     if np.issubdtype(n_samples_range.dtype, np.float):
-         if n_min_required_samples <= 0.0 or n_max_required_samples > 1.0:
-             raise ValueError("n_samples_range must be within (0, 1], "
-                              "but is within [%f, %f]."
-                              % (n_min_required_samples,
-                                 n_max_required_samples))
-         n_samples_range = np.unique((n_samples_range *
-                                      n_max_training_samples).astype(np.int))
-         # TODO we could
-         # - print a warning
-         # - *, inverse = np.unique(*, return_inverse=True); return np.take(., inverse)
-         # if there are duplicate elements
-     else:
-         if (n_min_required_samples <= 0 or
-                 n_max_required_samples > n_max_training_samples):
-             raise ValueError("n_samples_range must be within (0, %d], "
-                              "but is within [%d, %d]."
-                              % (n_max_training_samples,
-                                 n_min_required_samples,
-                                 n_max_required_samples))
+     n_samples_range, n_unique_ticks = _translate_n_samples_range(
+         n_samples_range, n_max_training_samples)

    _check_scorable(estimator, scoring=scoring)
    scorer = _deprecate_loss_and_score_funcs(scoring=scoring)

    if exploit_incremental_learning:
+         raise NotImplementedError("Incremental learning is not supported yet")
        # TODO exploit incremental learning
-         pass
    else:
        out = Parallel(
            # TODO use pre_dispatch parameter? what is it good for?
@@ -116,13 +97,47 @@ def learning_curve(estimator, X, y, n_samples_range=np.linspace(0.1, 1.0, 10),
            for n_train_samples in n_samples_range for train, test in cv)

        out = np.array(out)
-         n_unique_ticks = n_samples_range.shape[0]
        n_cv_folds = out.shape[0] / n_unique_ticks
        out = out.reshape(n_unique_ticks, n_cv_folds, 2)
        avg_over_cv = out.mean(axis=1).reshape(n_unique_ticks, 2)

    return n_samples_range, avg_over_cv[:, 0], avg_over_cv[:, 1]
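# A minimal usage sketch of learning_curve. Assumptions not fixed by this
# diff: load_digits and GaussianNB are available from scikit-learn as usual,
# and scoring=None falls back to the estimator's own score method.
if __name__ == '__main__':
    from sklearn.datasets import load_digits
    from sklearn.naive_bayes import GaussianNB

    digits = load_digits()
    sizes, train_scores, test_scores = learning_curve(
        GaussianNB(), digits.data, digits.target,
        n_samples_range=np.linspace(0.1, 1.0, 5), cv=10)
    # All three arrays have shape (n_unique_ticks,); the train/test scores
    # are means over the 10 CV folds (avg_over_cv above).
    print(sizes, train_scores, test_scores)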

+
+ def _translate_n_samples_range(n_samples_range, n_max_training_samples):
+     """Determine range of number of training samples"""
+     n_samples_range = np.asarray(n_samples_range)
+     n_ticks = n_samples_range.shape[0]
+     n_min_required_samples = np.min(n_samples_range)
+     n_max_required_samples = np.max(n_samples_range)
+     if np.issubdtype(n_samples_range.dtype, np.float):
+         if n_min_required_samples <= 0.0 or n_max_required_samples > 1.0:
+             raise ValueError("n_samples_range must be within (0, 1], "
+                              "but is within [%f, %f]."
+                              % (n_min_required_samples,
+                                 n_max_required_samples))
+         n_samples_range = (n_samples_range * n_max_training_samples
+                            ).astype(np.int)
+         n_samples_range = np.clip(n_samples_range, 1, n_max_training_samples)
+     else:
+         if (n_min_required_samples <= 0 or
+                 n_max_required_samples > n_max_training_samples):
+             raise ValueError("n_samples_range must be within (0, %d], "
+                              "but is within [%d, %d]."
+                              % (n_max_training_samples,
+                                 n_min_required_samples,
+                                 n_max_required_samples))
+
+     n_samples_range = np.unique(n_samples_range)
+     n_unique_ticks = n_samples_range.shape[0]
+     if n_ticks > n_unique_ticks:
+         warnings.warn("Number of ticks will be less than the size of "
+                       "'n_samples_range' (%d instead of %d)."
+                       % (n_unique_ticks, n_ticks), RuntimeWarning)
+
+     return n_samples_range, n_unique_ticks
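# A quick sketch of the translation semantics, assuming 100 training samples
# are available in the CV training splits (illustrative values only).
if __name__ == '__main__':
    # Fractions in (0, 1] are scaled to absolute sample counts.
    print(_translate_n_samples_range(np.linspace(0.2, 1.0, 5), 100))
    # -> (array([ 20,  40,  60,  80, 100]), 5)
    # Duplicate ticks collapse via np.unique and emit a RuntimeWarning.
    print(_translate_n_samples_range([0.01, 0.011, 0.5], 100))
    # -> (array([ 1, 50]), 2)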
+
+
def _fit_estimator(base_estimator, X, y, train, test, n_train_samples,
                   scorer, verbose):
    # HACK as long as boolean indices are allowed in cv generators