Merge pull request #10 from glouppe/treeweights · pprett/scikit-learn@3b4bd9f · GitHub

Commit 3b4bd9f

Merge pull request #10 from glouppe/treeweights
Solve conflicts with master
2 parents dd39379 + 719d724 · commit 3b4bd9f

File tree

18 files changed: +257 −110 lines


doc/developers/index.rst

Lines changed: 2 additions & 1 deletion
@@ -145,7 +145,8 @@ rules before submitting a pull request:
 
 You can also check for common programming errors with the following tools:
 
-* Code with a good unittest coverage (at least 80%), check with::
+* Code with a good unittest coverage (at least 90%, better 100%), check
+  with::
 
     $ pip install nose coverage
     $ nosetests --with-coverage path/to/tests_for_package
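
The same check can also be driven from Python; a minimal sketch, assuming nose and coverage are installed as in the snippet above (the test path is the doc's own placeholder):

    import nose

    # Equivalent to: nosetests --with-coverage path/to/tests_for_package
    nose.run(argv=["nosetests", "--with-coverage",
                   "path/to/tests_for_package"])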

doc/modules/multiclass.rst

Lines changed: 7 additions & 6 deletions
@@ -64,7 +64,7 @@ default choice. Below is an example::
   >>> from sklearn.svm import LinearSVC
   >>> iris = datasets.load_iris()
   >>> X, y = iris.data, iris.target
-  >>> OneVsRestClassifier(LinearSVC()).fit(X, y).predict(X)
+  >>> OneVsRestClassifier(LinearSVC(random_state=0)).fit(X, y).predict(X)
   array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

@@ -110,7 +110,7 @@ dataset is used `n_classes` times. Below is an example::
   >>> from sklearn.svm import LinearSVC
   >>> iris = datasets.load_iris()
   >>> X, y = iris.data, iris.target
-  >>> OneVsOneClassifier(LinearSVC()).fit(X, y).predict(X)
+  >>> OneVsOneClassifier(LinearSVC(random_state=0)).fit(X, y).predict(X)
   array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

@@ -139,8 +139,8 @@ At fitting time, one binary classifier per bit in the code book is fitted.
 At prediction time, the classifiers are used to project new points in the
 class space and the class closest to the points is chosen.
 
-In :class:`OutputCodeClassifier`, the `code_size` attribute allows the user to control
-the number of classifiers which will be used. It is a percentage of the
+In :class:`OutputCodeClassifier`, the `code_size` attribute allows the user to
+control the number of classifiers which will be used. It is a percentage of the
 total number of classes.
 
 A number between 0 and 1 will require fewer classifiers than

@@ -162,7 +162,9 @@ Example::
   >>> from sklearn.svm import LinearSVC
   >>> iris = datasets.load_iris()
   >>> X, y = iris.data, iris.target
-  >>> OutputCodeClassifier(LinearSVC(), code_size=2, random_state=0).fit(X, y).predict(X)
+  >>> clf = OutputCodeClassifier(LinearSVC(random_state=0),
+  ...                            code_size=2, random_state=0)
+  >>> clf.fit(X, y).predict(X)
   array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1,

@@ -171,7 +173,6 @@ Example::
          2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 2, 2,
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
 
-
 .. topic:: References:
 
     .. [1] "Solving multiclass learning problems via error-correcting ouput codes",
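
A minimal sketch of why the doctest fixes above seed LinearSVC: liblinear shuffles data internally, so samples near a decision boundary can be predicted differently from run to run, breaking the expected doctest arrays. Only the API shown in the diff is used here:

    from sklearn import datasets
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.svm import LinearSVC

    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    # Pinning random_state makes the solver deterministic, so the
    # predicted array (and hence the doctest output) is stable.
    clf = OneVsRestClassifier(LinearSVC(random_state=0))
    pred = clf.fit(X, y).predict(X)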

doc/modules/tree.rst

Lines changed: 6 additions & 0 deletions
@@ -123,12 +123,18 @@ Once trained, we can export the tree in `Graphviz
 exporter. Below is an example export of a tree trained on the entire
 iris dataset::
 
+    >>> import StringIO
     >>> with open("iris.dot", 'w') as f:
     ...     f = tree.export_graphviz(clf, out_file=f)
 
 Then we can use Graphviz's ``dot`` tool to create a PDF file (or any other
 supported file type): ``dot -Tpdf iris.dot -o iris.pdf``.
 
+::
+
+    >>> import os
+    >>> os.unlink('iris.dot')
+
 Alternatively, if we have Python module ``pydot`` installed, we can generate
 a PDF file (or any other supported file type) directly in Python::
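
A self-contained sketch of the export-then-clean-up flow this hunk documents; `clf` is assumed to be a fitted DecisionTreeClassifier, as earlier in tree.rst:

    import os
    from sklearn import datasets, tree

    iris = datasets.load_iris()
    clf = tree.DecisionTreeClassifier().fit(iris.data, iris.target)
    with open("iris.dot", "w") as f:
        f = tree.export_graphviz(clf, out_file=f)  # writes dot source
    # Outside Python, render with: dot -Tpdf iris.dot -o iris.pdf
    os.unlink("iris.dot")  # the cleanup step the doctest now performs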

doc/whats_new.rst

Lines changed: 4 additions & 0 deletions
@@ -98,6 +98,10 @@ Changelog
 - Fixed a floating point exception in the :ref:`decision trees <tree>`
   module, by Seberg.
 
+- Fix :func:`metrics.roc_curve` fails when y_true has only one class
+  by Wei Li.
+
+
 API changes summary
 -------------------
 - Renamed all occurences of ``n_atoms`` to ``n_components`` for consistency.
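
A hedged illustration of the new changelog entry: a y_true containing a single class is the degenerate input the roc_curve fix addresses. With only the positive class present, the false-positive rate is 0/0, so expect NaNs rather than an exception; the exact output for this input may vary by version, but the point of the fix is that it no longer fails.

    import numpy as np
    from sklearn.metrics import roc_curve

    y_true = np.ones(4)                       # only one class present
    scores = np.array([0.1, 0.4, 0.35, 0.8])
    fpr, tpr, thresholds = roc_curve(y_true, scores)  # no longer fails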

sklearn/cross_validation.py

Lines changed: 27 additions & 28 deletions
@@ -188,7 +188,7 @@ def __repr__(self):
 
     def __len__(self):
         return int(factorial(self.n) / factorial(self.n - self.p)
-            / factorial(self.p))
+                   / factorial(self.p))
 
 
 def _validate_kfold(k, n_samples):

@@ -257,10 +257,10 @@ class KFold(object):
     """
 
     def __init__(self, n, n_folds=3, indices=True, shuffle=False,
-            random_state=None, k=None):
+                 random_state=None, k=None):
        if k is not None:  # pragma: no cover
            warnings.warn("The parameter k was renamed to n_folds and will be"
-                " removed in 0.15.", DeprecationWarning)
+                          " removed in 0.15.", DeprecationWarning)
            n_folds = k
        _validate_kfold(n_folds, n)
        random_state = check_random_state(random_state)

@@ -353,7 +353,7 @@ class StratifiedKFold(object):
     def __init__(self, y, n_folds=3, indices=True, k=None):
         if k is not None:  # pragma: no cover
             warnings.warn("The parameter k was renamed to n_folds and will be"
-                " removed in 0.15.", DeprecationWarning)
+                          " removed in 0.15.", DeprecationWarning)
             n_folds = k
         y = np.asarray(y)
         n = y.shape[0]

@@ -566,8 +566,8 @@ def __repr__(self):
 
     def __len__(self):
         return int(factorial(self.n_unique_labels) /
-            factorial(self.n_unique_labels - self.p) /
-            factorial(self.p))
+                   factorial(self.n_unique_labels - self.p) /
+                   factorial(self.p))
 
 
 class Bootstrap(object):

@@ -647,8 +647,8 @@ def __init__(self, n, n_iter=3, train_size=.5, test_size=None,
                           "be removed in 0.16.", DeprecationWarning)
             n_iter = n_bootstraps
         self.n_iter = n_iter
-        if (isinstance(train_size, numbers.Real) and train_size >= 0.0
-                and train_size <= 1.0):
+        if (isinstance(train_size, numbers.Real) and train_size >= 0.0 and
+                train_size <= 1.0):
             self.train_size = ceil(train_size * n)
         elif isinstance(train_size, numbers.Integral):
             self.train_size = train_size

@@ -683,10 +683,8 @@ def __iter__(self):
                               + self.test_size]
 
         # bootstrap in each split individually
-        train = rng.randint(0, self.train_size,
-                            size=(self.train_size,))
-        test = rng.randint(0, self.test_size,
-                           size=(self.test_size,))
+        train = rng.randint(0, self.train_size, size=(self.train_size,))
+        test = rng.randint(0, self.test_size, size=(self.test_size,))
         yield ind_train[train], ind_test[test]
 
     def __repr__(self):

@@ -772,12 +770,12 @@ class ShuffleSplit(object):
     """
 
     def __init__(self, n, n_iter=10, test_size=0.1, train_size=None,
-            indices=True, random_state=None, n_iterations=None):
+                 indices=True, random_state=None, n_iterations=None):
         self.n = n
         self.n_iter = n_iter
         if n_iterations is not None:  # pragma: no cover
             warnings.warn("n_iterations was renamed to n_iter for consistency "
-                " and will be removed in 0.16.")
+                          " and will be removed in 0.16.")
             self.n_iter = n_iterations
         self.test_size = test_size
         self.train_size = train_size

@@ -956,14 +954,14 @@ class StratifiedShuffleSplit(object):
     """
 
     def __init__(self, y, n_iter=10, test_size=0.1, train_size=None,
-            indices=True, random_state=None, n_iterations=None):
+                 indices=True, random_state=None, n_iterations=None):
 
         self.y = np.array(y)
         self.n = self.y.size
         self.n_iter = n_iter
         if n_iterations is not None:  # pragma: no cover
             warnings.warn("n_iterations was renamed to n_iter for consistency "
-                " and will be removed in 0.16.")
+                          " and will be removed in 0.16.")
             self.n_iter = n_iterations
         self.test_size = test_size
         self.train_size = train_size

@@ -1025,9 +1023,9 @@ def _cross_val_score(estimator, X, y, score_func, train, test, verbose,
                      fit_params):
     """Inner loop for cross validation"""
     n_samples = X.shape[0] if sp.issparse(X) else len(X)
-    fit_params = dict([(k, np.asarray(v)[train]
-                       if hasattr(v, '__len__') and len(v) == n_samples else v)
-                       for k, v in fit_params.items()])
+    fit_params = dict([(k, np.asarray(v)[train] if hasattr(v, '__len__') and
+                        len(v) == n_samples else v)
+                       for k, v in fit_params.items()])
     if getattr(estimator, "_pairwise", False):
         # X is a precomputed square kernel matrix
         if X.shape[0] != X.shape[1]:

@@ -1105,9 +1103,10 @@ def cross_val_score(estimator, X, y=None, score_func=None, cv=None, n_jobs=1,
     # independent, and that it is pickle-able.
     fit_params = fit_params if fit_params is not None else {}
     scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
-        delayed(_cross_val_score)(clone(estimator), X, y, score_func,
-                                  train, test, verbose, fit_params)
-        for train, test in cv)
+        delayed(_cross_val_score)(
+            clone(estimator), X, y, score_func, train, test, verbose,
+            fit_params)
+        for train, test in cv)
     return np.array(scores)
 
 

@@ -1173,8 +1172,8 @@ def check_cv(cv, X=None, y=None, classifier=False):
 
 
 def permutation_test_score(estimator, X, y, score_func, cv=None,
-        n_permutations=100, n_jobs=1, labels=None,
-        random_state=0, verbose=0):
+                           n_permutations=100, n_jobs=1, labels=None,
+                           random_state=0, verbose=0):
     """Evaluate the significance of a cross-validated score with permutations
 
     Parameters

@@ -1249,10 +1248,10 @@ def permutation_test_score(estimator, X, y, score_func, cv=None,
     # independent, and that it is pickle-able.
     score = _permutation_test_score(clone(estimator), X, y, cv, score_func)
     permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
-        delayed(_permutation_test_score)(clone(estimator), X,
-                                         _shuffle(y, labels, random_state),
-                                         cv, score_func)
-        for _ in range(n_permutations))
+        delayed(_permutation_test_score)(
+            clone(estimator), X, _shuffle(y, labels, random_state), cv,
+            score_func)
+        for _ in range(n_permutations))
     permutation_scores = np.array(permutation_scores)
     pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1)
     return score, permutation_scores, pvalue
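
The Parallel/delayed reflows above change indentation only, not dispatch semantics. A standalone sketch of the same call shape, with a hypothetical square function standing in for _cross_val_score:

    from sklearn.externals.joblib import Parallel, delayed

    def square(x):
        return x * x

    # Break after `delayed(func)(`, indent the arguments one level
    # deeper, and keep the generator clause on its own line, as in the
    # reflowed diff above.
    results = Parallel(n_jobs=2)(
        delayed(square)(i)
        for i in range(8))
    assert results == [x * x for x in range(8)]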

sklearn/ensemble/gradient_boosting.py

Lines changed: 4 additions & 1 deletion
@@ -468,7 +468,7 @@ def _fit_stage(self, i, X, X_argsorted, y, y_pred, sample_mask):
             max_depth=self.max_depth,
             min_samples_split=self.min_samples_split,
             min_samples_leaf=self.min_samples_leaf,
-            min_density=0.0,
+            min_density=self.min_density,
             max_features=self.max_features,
             compute_importances=False,
             random_state=self.random_state)

@@ -559,6 +559,9 @@ def fit(self, X, y):
 
         self.random_state = check_random_state(self.random_state)
 
+        # use default min_density (0.1) only for deep trees
+        self.min_density = 0.0 if self.max_depth < 6 else 0.1
+
         # create argsorted X for fast tree induction
         X_argsorted = np.asfortranarray(
             np.argsort(X.T, axis=1).astype(np.int32).T)
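
A sketch of the heuristic added in fit: shallow boosted trees keep min_density at 0.0 (no sample-mask re-packing), while trees of depth 6 or more fall back to the tree module's 0.1 default. The dataset below is only illustrative; the attribute itself comes from the diff:

    from sklearn.datasets import make_hastie_10_2
    from sklearn.ensemble import GradientBoostingClassifier

    X, y = make_hastie_10_2(n_samples=500, random_state=0)
    clf = GradientBoostingClassifier(max_depth=3, n_estimators=10)
    clf.fit(X, y)
    # The derived attribute reflects the heuristic after fitting.
    assert clf.min_density == 0.0  # since max_depth < 6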

sklearn/ensemble/tests/test_gradient_boosting.py

Lines changed: 11 additions & 0 deletions
@@ -475,3 +475,14 @@ def test_mem_layout():
     clf.fit(X, y_)
     assert_array_equal(clf.predict(T), true_result)
     assert_equal(100, len(clf.estimators_))
+
+
+def test_min_density():
+    """Check if min_density is properly set when growing deep trees."""
+    clf = GradientBoostingClassifier(max_depth=6)
+    clf.fit(X, y)
+    assert clf.min_density == 0.1
+
+    clf = GradientBoostingClassifier(max_depth=5)
+    clf.fit(X, y)
+    assert clf.min_density == 0.0

sklearn/feature_extraction/tests/test_image.py

Lines changed: 12 additions & 2 deletions
@@ -246,8 +246,8 @@ def test_extract_patches_strided():
 
     for (image_shape, patch_size, patch_step,
          expected_view, last_patch) in zip(
-            image_shapes, patch_sizes, patch_steps,
-            expected_views, last_patches):
+                image_shapes, patch_sizes, patch_steps, expected_views,
+                last_patches):
         image = np.arange(np.prod(image_shape)).reshape(image_shape)
         patches = extract_patches(image, patch_shape=patch_size,
                                   extraction_step=patch_step)

@@ -261,6 +261,16 @@ def test_extract_patches_strided():
                 image[last_patch_slices].squeeze()).all())
 
 
+def test_extract_patches_square():
+    # test same patch size for all dimensions
+    lena = downsampled_lena
+    i_h, i_w = lena.shape
+    p = 8
+    expected_n_patches = ((i_h - p + 1), (i_w - p + 1))
+    patches = extract_patches(lena, patch_shape=p)
+    assert_true(patches.shape == (expected_n_patches[0], expected_n_patches[1],
+                                  p, p))
+
 if __name__ == '__main__':
     import nose
     nose.runmodule()
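
A sketch of the behavior the new test pins down: passing a single integer as patch_shape applies that size along every dimension. extract_patches is an internal helper in sklearn.feature_extraction.image, so the import path is era-specific:

    import numpy as np
    from sklearn.feature_extraction.image import extract_patches

    image = np.arange(64 * 64).reshape(64, 64)
    patches = extract_patches(image, patch_shape=8)  # same as (8, 8)
    # A strided view: one 8x8 patch per valid top-left corner,
    # (64 - 8 + 1) = 57 positions along each axis.
    assert patches.shape == (57, 57, 8, 8)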

sklearn/grid_search.py

Lines changed: 13 additions & 15 deletions
@@ -77,7 +77,7 @@ def fit_grid_point(X, y, sample_weight, base_clf,
     if verbose > 1:
         start_time = time.time()
         msg = '%s' % (', '.join('%s=%s' % (k, v)
-                      for k, v in clf_params.iteritems()))
+                                for k, v in clf_params.iteritems()))
         print "[GridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.')
 
     X, y = check_arrays(X, y, sparse_format="csr")

@@ -157,7 +157,7 @@ def _check_param_grid(param_grid):
 
         if len(v) == 0:
             raise ValueError("Parameter values should be a non-empty "
-                "list.")
+                             "list.")
 
 
 def _has_one_grid_point(param_grid):

@@ -305,8 +305,7 @@ class GridSearchCV(BaseEstimator, MetaEstimatorMixin):
 
     def __init__(self, estimator, param_grid, loss_func=None, score_func=None,
                  fit_params=None, n_jobs=1, iid=True, refit=True, cv=None,
-                 verbose=0, pre_dispatch='2*n_jobs',
-                 ):
+                 verbose=0, pre_dispatch='2*n_jobs'):
         if not hasattr(estimator, 'fit') or \
            not (hasattr(estimator, 'predict') or hasattr(estimator, 'score')):
             raise TypeError("estimator should a be an estimator implementing"

@@ -396,12 +395,12 @@ def _fit(self, X, y, sample_weight):
 
         pre_dispatch = self.pre_dispatch
         out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
-                       pre_dispatch=pre_dispatch)(
-            delayed(fit_grid_point)(
-                X, y, sample_weight, base_clf, clf_params, train, test,
-                self.loss_func, self.score_func, self.verbose,
-                **self.fit_params)
-            for clf_params in grid for train, test in cv)
+                       pre_dispatch=pre_dispatch)(
+                           delayed(fit_grid_point)(
+                               X, y, sample_weight, base_clf, clf_params,
+                               train, test, self.loss_func, self.score_func,
+                               self.verbose, **self.fit_params)
+                           for clf_params in grid for train, test in cv)
 
         # Out is a list of triplet: score, estimator, n_test_samples
         n_grid_points = len(list(grid))

@@ -415,7 +414,7 @@ def _fit(self, X, y, sample_weight):
             score = 0
             these_points = list()
             for this_score, clf_params, this_n_test_samples in \
-                out[grid_start:grid_start + n_folds]:
+                    out[grid_start:grid_start + n_folds]:
                 these_points.append(this_score)
                 if self.iid:
                     this_score *= this_n_test_samples

@@ -461,10 +460,9 @@ def _fit(self, X, y, sample_weight):
         # Store the computed scores
         # XXX: the name is too specific, it shouldn't have
         # 'grid' in it. Also, we should be retrieving/storing variance
-        self.grid_scores_ = [
-            (clf_params, score, all_scores)
-            for clf_params, (score, _), all_scores
-            in zip(grid, scores, cv_scores)]
+        self.grid_scores_ = [(clf_params, score, all_scores)
+                             for clf_params, (score, _), all_scores
+                             in zip(grid, scores, cv_scores)]
         return self
 
     def score(self, X, y=None):
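
The grid_scores_ reflow keeps each entry as a (params, mean_score, fold_scores) triplet. A hedged usage sketch against this era's API (the sklearn.grid_search module path and the grid_scores_ attribute were both replaced in later releases):

    from sklearn import datasets
    from sklearn.grid_search import GridSearchCV
    from sklearn.svm import LinearSVC

    iris = datasets.load_iris()
    search = GridSearchCV(LinearSVC(random_state=0),
                          param_grid={'C': [0.1, 1.0, 10.0]})
    search.fit(iris.data, iris.target)
    # Unpack the triplets the reflowed list comprehension builds.
    means = [score for params, score, fold_scores in search.grid_scores_]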
