Merge pull request #1 from scikit-learn/master · scikit-learn/scikit-learn@7be35c8

Commit 7be35c8

Merge pull request #1 from scikit-learn/master
Pulling from upstream
2 parents 624c9f4 + cb219b4 commit 7be35c8

13 files changed, 199 additions and 59 deletions

Makefile

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ cython:
 ctags:
 	# make tags for symbol based navigation in emacs and vim
 	# Install with: sudo apt-get install exuberant-ctags
-	$(CTAGS) -R *
+	$(CTAGS) -R sklearn

 doc: inplace
 	$(MAKE) -C doc html

doc/modules/model_evaluation.rst

Lines changed: 2 additions & 0 deletions
@@ -1270,6 +1270,8 @@ implements three such simple strategies for classification:
 - ``stratified`` generates random predictions by respecting the training
   set class distribution.
 - ``most_frequent`` always predicts the most frequent label in the training set.
+- ``prior`` always predicts the class that maximizes the class prior
+  (like ``most_frequent``) and ``predict_proba`` returns the class prior.
 - ``uniform`` generates predictions uniformly at random.
 - ``constant`` always predicts a constant label that is provided by the user.
   A major motivation of this method is F1-scoring, when the positive class

doc/whats_new.rst

Lines changed: 35 additions & 0 deletions
@@ -19,6 +19,9 @@ Enhancements
 - :class:`naive_bayes.GaussianNB` now supports fitting with ``sample_weights``.
   By `Jan Hendrik Metzen`_.

+- :class:`dummy.DummyClassifier` now supports a prior fitting strategy.
+  By `Arnaud Joly`_.
+
 Bug fixes
 .........

@@ -29,6 +32,38 @@ API changes summary
   for retrieving the leaf indices samples are predicted as. By
   `Daniel Galvez`_ and `Gilles Louppe`_.

+.. _changes_0_1_16:
+
+0.16.1
+=======
+
+Changelog
+---------
+
+Bug fixes
+.........
+
+- Allow input data larger than ``block_size`` in
+  :class:`covariance.LedoitWolf` by `Andreas Müller`_.
+
+- Fix a bug in :class:`isotonic.IsotonicRegression` deduplication that
+  caused unstable result in :class:`calibration.CalibratedClassifierCV` by
+  `Jan Hendrik Metzen`_.
+
+- Fix sorting of labels in :func:`preprocessing.label_binarize` by Michael Heilman.
+
+- Fix several stability and convergence issues in
+  :class:`cross_decomposition.CCA` and
+  :class:`cross_decomposition.PLSCanonical` by `Andreas Müller`_.
+
+- Fix a bug in :class:`cluster.KMeans` when ``precompute_distances=False``
+  on fortran-ordered data.
+
+- Fix a speed regression in :class:`ensemble.RandomForestClassifier`'s ``predict``
+  and ``predict_proba`` by `Andreas Müller`_.
+
+- Fix a regression where ``utils.shuffle`` converted lists and dataframes to arrays, by `Olivier Grisel`_.
+
 .. _changes_0_16:

 0.16

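One of the 0.16.1 fixes listed above is easy to sanity-check from user code: after the ``utils.shuffle`` regression fix, shuffling a plain Python list should give back a list rather than a NumPy array. A minimal sketch of that check (the expected output reflects the changelog entry above, not output captured from this exact revision):

from sklearn.utils import shuffle

letters = ["a", "b", "c", "d"]
shuffled = shuffle(letters, random_state=0)
print(type(shuffled))  # expected: <class 'list'>, not numpy.ndarray
print(shuffled)        # the same four items, in shuffled order
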
sklearn/dummy.py

Lines changed: 8 additions & 5 deletions
@@ -33,6 +33,8 @@ class DummyClassifier(BaseEstimator, ClassifierMixin):
           set's class distribution.
         * "most_frequent": always predicts the most frequent label in the
           training set.
+        * "prior": always predicts the class that maximizes the class prior
+          (like "most_frequent") and ``predict_proba`` returns the class prior.
         * "uniform": generates predictions uniformly at random.
         * "constant": always predicts a constant label that is provided by
           the user. This is useful for metrics that evaluate a non-majority
@@ -95,7 +97,7 @@ def fit(self, X, y, sample_weight=None):
             Returns self.
         """
         if self.strategy not in ("most_frequent", "stratified", "uniform",
-                                 "constant"):
+                                 "constant", "prior"):
             raise ValueError("Unknown strategy type.")

         if self.strategy == "uniform" and sp.issparse(y):
@@ -147,8 +149,7 @@ def fit(self, X, y, sample_weight=None):
         return self

     def predict(self, X):
-        """
-        Perform classification on test vectors X.
+        """Perform classification on test vectors X.

         Parameters
         ----------
@@ -188,7 +189,7 @@ def predict(self, X):

         if self.sparse_output_:
             class_prob = None
-            if self.strategy == "most_frequent":
+            if self.strategy in ("most_frequent", "prior"):
                 classes_ = [np.array([cp.argmax()]) for cp in class_prior_]

             elif self.strategy == "stratified":
@@ -204,7 +205,7 @@ def predict(self, X):
             y = random_choice_csc(n_samples, classes_, class_prob,
                                   self.random_state)
         else:
-            if self.strategy == "most_frequent":
+            if self.strategy in ("most_frequent", "prior"):
                 y = np.tile([classes_[k][class_prior_[k].argmax()] for
                              k in range(self.n_outputs_)], [n_samples, 1])

@@ -268,6 +269,8 @@ def predict_proba(self, X):
                 ind = np.ones(n_samples, dtype=int) * class_prior_[k].argmax()
                 out = np.zeros((n_samples, n_classes_[k]), dtype=np.float64)
                 out[:, ind] = 1.0
+            elif self.strategy == "prior":
+                out = np.ones((n_samples, 1)) * class_prior_[k]

             elif self.strategy == "stratified":
                 out = rs.multinomial(1, class_prior_[k], size=n_samples)

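Taken together, the changes above mean the new "prior" strategy predicts the same majority class as "most_frequent", but ``predict_proba`` reports the empirical class prior instead of a one-hot vector. A minimal sketch of that difference (the printed values follow the logic of the diff above and the training class fractions, not output captured from this revision):

import numpy as np
from sklearn.dummy import DummyClassifier

X = np.zeros((4, 1))        # features are ignored by DummyClassifier
y = np.array([1, 2, 1, 1])  # class 1 makes up 3/4 of the training labels

prior = DummyClassifier(strategy="prior").fit(X, y)
freq = DummyClassifier(strategy="most_frequent").fit(X, y)

print(prior.predict(X))        # [1 1 1 1] -- same prediction as most_frequent
print(prior.predict_proba(X))  # rows of [0.75, 0.25] -- the class prior
print(freq.predict_proba(X))   # rows of [1.0, 0.0]   -- one-hot majority class
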
sklearn/ensemble/forest.py

Lines changed: 9 additions & 4 deletions
@@ -365,7 +365,8 @@ def _set_oob_score(self, X, y):
             mask = np.ones(n_samples, dtype=np.bool)
             mask[estimator.indices_] = False
             mask_indices = sample_indices[mask]
-            p_estimator = estimator.predict_proba(X[mask_indices, :])
+            p_estimator = estimator.predict_proba(X[mask_indices, :],
+                                                  check_input=False)

             if self.n_outputs_ == 1:
                 p_estimator = [p_estimator]
@@ -508,7 +509,7 @@ class in a leaf.
         # Parallel loop
         all_proba = Parallel(n_jobs=n_jobs, verbose=self.verbose,
                              backend="threading")(
-            delayed(_parallel_helper)(e, 'predict_proba', X)
+            delayed(_parallel_helper)(e, 'predict_proba', X, check_input=False)
             for e in self.estimators_)

         # Reduce
@@ -614,6 +615,10 @@ def predict(self, X):

         # Check data
         X = check_array(X, dtype=DTYPE, accept_sparse="csr")
+        if issparse(X) and (X.indices.dtype != np.intc or
+                            X.indptr.dtype != np.intc):
+            raise ValueError("No support for np.int64 index based "
+                             "sparse matrices")

         # Assign chunk of trees to jobs
         n_jobs, n_trees, starts = _partition_estimators(self.n_estimators,
@@ -622,7 +627,7 @@ def predict(self, X):
         # Parallel loop
         all_y_hat = Parallel(n_jobs=n_jobs, verbose=self.verbose,
                              backend="threading")(
-            delayed(_parallel_helper)(e, 'predict', X)
+            delayed(_parallel_helper)(e, 'predict', X, check_input=False)
             for e in self.estimators_)

         # Reduce
@@ -642,7 +647,7 @@ def _set_oob_score(self, X, y):
             mask = np.ones(n_samples, dtype=np.bool)
             mask[estimator.indices_] = False
             mask_indices = sample_indices[mask]
-            p_estimator = estimator.predict(X[mask_indices, :])
+            p_estimator = estimator.predict(X[mask_indices, :], check_input=False)

             if self.n_outputs_ == 1:
                 p_estimator = p_estimator[:, np.newaxis]

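These changes address the speed regression noted in the changelog: input validation happens once at the forest level, and each fitted tree is told to skip it via ``check_input=False``. A rough sketch of the pattern (``forest_predict_proba`` and its arguments are illustrative stand-ins, not the actual forest internals, and the sketch assumes single-output classifiers):

import numpy as np
from sklearn.utils import check_array

def forest_predict_proba(estimators, X):
    # Validate the input a single time at the ensemble level ...
    X = check_array(X, accept_sparse="csr")
    # ... then let every fitted tree reuse the already-validated array;
    # check_input=False skips repeated dtype/layout conversions per estimator.
    all_proba = [est.predict_proba(X, check_input=False) for est in estimators]
    return np.mean(all_proba, axis=0)
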
sklearn/neighbors/base.py

Lines changed: 2 additions & 1 deletion
@@ -335,7 +335,8 @@ class from an array representing our data set and ask who's
         train_size = self._fit_X.shape[0]
         if n_neighbors > train_size:
             raise ValueError(
-                "Expected n_neighbors <= %d. Got %d" %
+                "Expected n_neighbors <= n_samples, "
+                " but n_samples = %d, n_neighbors = %d" %
                 (train_size, n_neighbors)
             )
         n_samples, _ = X.shape

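A quick way to see the clearer error message in action (a sketch; the printed wording should match the new string in the diff above):

from sklearn.neighbors import NearestNeighbors

nn = NearestNeighbors(n_neighbors=5).fit([[0.0], [1.0], [2.0]])  # only 3 samples
try:
    nn.kneighbors([[0.5]])
except ValueError as exc:
    print(exc)  # Expected n_neighbors <= n_samples,  but n_samples = 3, n_neighbors = 5
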
sklearn/pipeline.py

Lines changed: 20 additions & 0 deletions
@@ -183,6 +183,26 @@ def predict(self, X):
             Xt = transform.transform(Xt)
         return self.steps[-1][-1].predict(Xt)

+    @if_delegate_has_method(delegate='_final_estimator')
+    def fit_predict(self, X, y=None, **fit_params):
+        """Applies fit_predict of last step in pipeline after transforms.
+
+        Applies fit_transforms of a pipeline to the data, followed by the
+        fit_predict method of the final estimator in the pipeline. Valid
+        only if the final estimator implements fit_predict.
+
+        Parameters
+        ----------
+        X : iterable
+            Training data. Must fulfill input requirements of first step of
+            the pipeline.
+        y : iterable, default=None
+            Training targets. Must fulfill label requirements for all steps
+            of the pipeline.
+        """
+        Xt, fit_params = self._pre_transform(X, y, **fit_params)
+        return self.steps[-1][-1].fit_predict(Xt, y, **fit_params)
+
     @if_delegate_has_method(delegate='_final_estimator')
     def predict_proba(self, X):
         """Applies transforms to the data, and the predict_proba method of the

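The test added further below exercises the same thing; as a plain usage sketch, the new method fit-transforms the intermediate steps and then calls ``fit_predict`` on the final estimator, which allows one-call clustering pipelines (StandardScaler and KMeans are just one example of a transformer plus a final estimator that implements ``fit_predict``):

from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X = load_iris().data
pipe = Pipeline([("scaler", StandardScaler()),
                 ("kmeans", KMeans(random_state=0))])
labels = pipe.fit_predict(X)  # scale, then cluster, in a single call
print(labels[:10])
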
sklearn/tests/test_dummy.py

Lines changed: 38 additions & 30 deletions
@@ -1,5 +1,5 @@
 from __future__ import division
-import warnings
+
 import numpy as np
 import scipy.sparse as sp

@@ -11,17 +11,17 @@
 from sklearn.utils.testing import assert_raises
 from sklearn.utils.testing import assert_true
 from sklearn.utils.testing import assert_warns_message
+from sklearn.utils.testing import ignore_warnings
 from sklearn.utils.stats import _weighted_percentile

 from sklearn.dummy import DummyClassifier, DummyRegressor


+@ignore_warnings
 def _check_predict_proba(clf, X, y):
     proba = clf.predict_proba(X)
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-        # We know that we can have division by zero
-        log_proba = clf.predict_log_proba(X)
+    # We know that we can have division by zero
+    log_proba = clf.predict_log_proba(X)

     y = np.atleast_1d(y)
     if y.ndim == 1:
@@ -38,10 +38,8 @@ def _check_predict_proba(clf, X, y):
         assert_equal(proba[k].shape[0], n_samples)
         assert_equal(proba[k].shape[1], len(np.unique(y[:, k])))
         assert_array_equal(proba[k].sum(axis=1), np.ones(len(X)))
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            # We know that we can have division by zero
-            assert_array_equal(np.log(proba[k]), log_proba[k])
+        # We know that we can have division by zero
+        assert_array_equal(np.log(proba[k]), log_proba[k])


 def _check_behavior_2d(clf):
@@ -85,17 +83,25 @@ def _check_equality_regressor(statistic, y_learn, y_pred_learn,
                               y_pred_test)


-def test_most_frequent_strategy():
+def test_most_frequent_and_prior_strategy():
     X = [[0], [0], [0], [0]]  # ignored
     y = [1, 2, 1, 1]

-    clf = DummyClassifier(strategy="most_frequent", random_state=0)
-    clf.fit(X, y)
-    assert_array_equal(clf.predict(X), np.ones(len(X)))
-    _check_predict_proba(clf, X, y)
+    for strategy in ("most_frequent", "prior"):
+        clf = DummyClassifier(strategy=strategy, random_state=0)
+        clf.fit(X, y)
+        assert_array_equal(clf.predict(X), np.ones(len(X)))
+        _check_predict_proba(clf, X, y)
+
+        if strategy == "prior":
+            assert_array_equal(clf.predict_proba(X[0]),
+                               clf.class_prior_.reshape((1, -1)))
+        else:
+            assert_array_equal(clf.predict_proba(X[0]),
+                               clf.class_prior_.reshape((1, -1)) > 0.5)


-def test_most_frequent_strategy_multioutput():
+def test_most_frequent_and_prior_strategy_multioutput():
     X = [[0], [0], [0], [0]]  # ignored
     y = np.array([[1, 0],
                   [2, 0],
@@ -104,13 +110,14 @@ def test_most_frequent_strategy_multioutput():

     n_samples = len(X)

-    clf = DummyClassifier(strategy="most_frequent", random_state=0)
-    clf.fit(X, y)
-    assert_array_equal(clf.predict(X),
-                       np.hstack([np.ones((n_samples, 1)),
-                                  np.zeros((n_samples, 1))]))
-    _check_predict_proba(clf, X, y)
-    _check_behavior_2d(clf)
+    for strategy in ("prior", "most_frequent"):
+        clf = DummyClassifier(strategy=strategy, random_state=0)
+        clf.fit(X, y)
+        assert_array_equal(clf.predict(X),
+                           np.hstack([np.ones((n_samples, 1)),
+                                      np.zeros((n_samples, 1))]))
+        _check_predict_proba(clf, X, y)
+        _check_behavior_2d(clf)


 def test_stratified_strategy():
@@ -555,7 +562,7 @@ def test_stratified_strategy_sparse_target():
         assert_almost_equal(p[4], 1. / 5, decimal=1)


-def test_most_frequent_strategy_sparse_target():
+def test_most_frequent_and_prior_strategy_sparse_target():
     X = [[0]] * 5  # ignored
     y = sp.csc_matrix(np.array([[1, 0],
                                 [1, 3],
@@ -564,13 +571,14 @@ def test_most_frequent_strategy_sparse_target():
                                 [1, 0]]))

     n_samples = len(X)
-    clf = DummyClassifier(strategy="most_frequent", random_state=0)
-    clf.fit(X, y)
-
-    y_pred = clf.predict(X)
-    assert_true(sp.issparse(y_pred))
-    assert_array_equal(y_pred.toarray(), np.hstack([np.ones((n_samples, 1)),
-                                                    np.zeros((n_samples, 1))]))
+    y_expected = np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))])
+    for strategy in ("most_frequent", "prior"):
+        clf = DummyClassifier(strategy=strategy, random_state=0)
+        clf.fit(X, y)
+
+        y_pred = clf.predict(X)
+        assert_true(sp.issparse(y_pred))
+        assert_array_equal(y_pred.toarray(), y_expected)


 def test_dummy_regressor_sample_weight(n_samples=10):

sklearn/tests/test_pipeline.py

Lines changed: 32 additions & 1 deletion
@@ -5,7 +5,7 @@
 from scipy import sparse

 from sklearn.externals.six.moves import zip
-from sklearn.utils.testing import assert_raises
+from sklearn.utils.testing import assert_raises, assert_raises_regex
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_false
 from sklearn.utils.testing import assert_true
@@ -17,6 +17,7 @@
 from sklearn.svm import SVC
 from sklearn.linear_model import LogisticRegression
 from sklearn.linear_model import LinearRegression
+from sklearn.cluster import KMeans
 from sklearn.feature_selection import SelectKBest, f_classif
 from sklearn.decomposition import PCA, RandomizedPCA, TruncatedSVD
 from sklearn.datasets import load_iris
@@ -202,6 +203,36 @@ def test_pipeline_methods_preprocessing_svm():
     pipe.score(X, y)


+def test_fit_predict_on_pipeline():
+    # test that the fit_predict method is implemented on a pipeline
+    # test that the fit_predict on pipeline yields same results as applying
+    # transform and clustering steps separately
+    iris = load_iris()
+    scaler = StandardScaler()
+    km = KMeans(random_state=0)
+
+    # first compute the transform and clustering step separately
+    scaled = scaler.fit_transform(iris.data)
+    separate_pred = km.fit_predict(scaled)
+
+    # use a pipeline to do the transform and clustering in one step
+    pipe = Pipeline([('scaler', scaler), ('Kmeans', km)])
+    pipeline_pred = pipe.fit_predict(iris.data)
+
+    assert_array_almost_equal(pipeline_pred, separate_pred)
+
+
+def test_fit_predict_on_pipeline_without_fit_predict():
+    # tests that a pipeline does not have fit_predict method when final
+    # step of pipeline does not have fit_predict defined
+    scaler = StandardScaler()
+    pca = PCA()
+    pipe = Pipeline([('scaler', scaler), ('pca', pca)])
+    assert_raises_regex(AttributeError,
+                        "'PCA' object has no attribute 'fit_predict'",
+                        getattr, pipe, 'fit_predict')
+
+
 def test_feature_union():
     # basic sanity check for feature union
     iris = load_iris()
