Merge branch 'main' into validations_ridgecv · scikit-learn/scikit-learn@b95f434 · GitHub
[go: up one dir, main page]

Skip to content

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit b95f434

Browse files
author
ArturoAmorQ
committed
Merge branch 'main' into validations_ridgecv
# Please enter a commit message to explain why this merge is necessary,
2 parents c364401 + 5856205 commit b95f434

18 files changed

+246
-50
lines changed

.circleci/config.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,9 @@ jobs:
5959
- MATPLOTLIB_VERSION: 'latest'
6060
- CYTHON_VERSION: 'latest'
6161
- SCIKIT_IMAGE_VERSION: 'latest'
62-
- SPHINX_VERSION: 'min'
62+
# Bump the Sphinx version from time to time. Avoid the latest Sphinx
63+
# version, which tends to break things slightly too often.
64+
- SPHINX_VERSION: 4.2.0
6365
- PANDAS_VERSION: 'latest'
6466
- SPHINX_GALLERY_VERSION: 'latest'
6567
- NUMPYDOC_VERSION: 'latest'

build_tools/generate_authors_table.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,14 @@ def get_contributors():
4343
# get core devs and triage team
4444
core_devs = []
4545
triage_team = []
46-
for team_id, lst in zip((11523, 3593183), (core_devs, triage_team)):
46+
comm_team = []
47+
core_devs_id = 11523
48+
triage_team_id = 3593183
49+
comm_team_id = 5368696
50+
for team_id, lst in zip(
51+
(core_devs_id, triage_team_id, comm_team_id),
52+
(core_devs, triage_team, comm_team),
53+
):
4754
for page in [1, 2]: # 30 per page
4855
reply = get(f"https://api.github.com/teams/{team_id}/members?page={page}")
4956
lst.extend(reply.json())
@@ -59,6 +66,7 @@ def get_contributors():
5966
# keep only the logins
6067
core_devs = set(c["login"] for c in core_devs)
6168
triage_team = set(c["login"] for c in triage_team)
69+
comm_team = set(c["login"] for c in comm_team)
6270
members = set(c["login"] for c in members)
6371

6472
# add missing contributors with GitHub accounts
@@ -75,13 +83,15 @@ def get_contributors():
7583
core_devs = [get_profile(login) for login in core_devs]
7684
emeritus = [get_profile(login) for login in emeritus]
7785
triage_team = [get_profile(login) for login in triage_team]
86+
comm_team = [get_profile(login) for login in comm_team]
7887

7988
# sort by last name
8089
core_devs = sorted(core_devs, key=key)
8190
emeritus = sorted(emeritus, key=key)
8291
triage_team = sorted(triage_team, key=key)
92+
comm_team = sorted(comm_team, key=key)
8393

84-
return core_devs, emeritus, triage_team
94+
return core_devs, emeritus, triage_team, comm_team
8595

8696

8797
def get_profile(login):
@@ -145,7 +155,7 @@ def generate_list(contributors):
145155

146156
if __name__ == "__main__":
147157

148-
core_devs, emeritus, triage_team = get_contributors()
158+
core_devs, emeritus, triage_team, comm_team = get_contributors()
149159

150160
with open(REPO_FOLDER / "doc" / "authors.rst", "w+") as rst_file:
151161
rst_file.write(generate_table(core_devs))
@@ -155,3 +165,6 @@ def generate_list(contributors):
155165

156166
with open(REPO_FOLDER / "doc" / "triage_team.rst", "w+") as rst_file:
157167
rst_file.write(generate_table(triage_team))
168+
169+
with open(REPO_FOLDER / "doc" / "communication_team.rst", "w+") as rst_file:
170+
rst_file.write(generate_table(comm_team))

doc/about.rst

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ been leading the development.
1818

1919
Governance
2020
----------
21+
2122
The decision making process and governance structure of scikit-learn is laid
2223
out in the :ref:`governance document <governance>`.
2324

@@ -47,6 +48,15 @@ maintenance:
4748

4849
.. include:: triage_team.rst
4950

51+
Communication Team
52+
------------------
53+
54+
The following people help with :ref:`communication around scikit-learn
55+
<communication_team>`.
56+
57+
.. include:: communication_team.rst
58+
59+
5060
Emeritus Core Developers
5161
------------------------
5262

doc/communication_team.rst

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
.. raw :: html
2+
3+
<!-- Generated by generate_authors_table.py -->
4+
<div class="sk-authors-container">
5+
<style>
6+
img.avatar {border-radius: 10px;}
7+
</style>
8+
<div>
9+
<a href='https://github.com/reshamas'><img src='https://avatars.githubusercontent.com/u/2507232?v=4' class='avatar' /></a> <br />
10+
<p>Reshama Shaikh</p>
11+
</div>
12+
<div>
13+
<a href='https://github.com/laurburke'><img src='https://avatars.githubusercontent.com/u/35973528?v=4' class='avatar' /></a> <br />
14+
<p>Lauren Burke</p>
15+
</div>
16+
</div>

doc/governance.rst

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,25 @@ Every new triager will be announced in the mailing list.
4848
Triagers are welcome to participate in `monthly core developer meetings
4949
<https://github.com/scikit-learn/administrative/tree/master/meeting_notes>`_.
5050

51+
.. _communication_team:
52+
53+
Communication team
54+
-------------------
55+
56+
Members of the communication team help with outreach and communication
57+
for scikit-learn. The goal of the team is to develop public awareness of
58+
scikit-learn, of its features and usage, as well as branding.
59+
60+
For this, they can operate the scikit-learn accounts on various social
61+
networks and produce materials.
62+
63+
Every new communicator will be announced in the mailing list.
64+
Communicators are welcome to participate in `monthly core developer meetings
65+
<https://github.com/scikit-learn/administrative/tree/master/meeting_notes>`_.
66+
5167
Core developers
5268
---------------
69+
5370
Core developers are community members who have shown that they are dedicated to
5471
the continued development of the project through ongoing engagement with the
5572
community. They have shown they can be trusted to maintain scikit-learn with

doc/whats_new/v1.0.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,15 @@ Changelog
2828
and :class:`decomposition.MiniBatchSparsePCA` to be convex and match the referenced
2929
article. :pr:`19210` by :user:`Jérémie du Boisberranger <jeremiedbb>`.
3030

31+
:mod:`sklearn.metrics`
32+
......................
33+
34+
- |Fix| All :class:`sklearn.metrics.DistanceMetric` subclasses now correctly support
35+
read-only buffer attributes.
36+
This fixes a regression introduced in 1.0.0 with respect to 0.24.2.
37+
:pr:`21694` by :user:`Julien Jerphanion <jjerphan>`.
38+
39+
3140
:mod:`sklearn.preprocessing`
3241
............................
3342

doc/whats_new/v1.1.rst

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,10 @@ Changelog
9292
- |Fix| :class:`decomposition.FastICA` now validates input parameters in `fit` instead of `__init__`.
9393
:pr:`21432` by :user:`Hannah Bohle <hhnnhh>` and :user:`Maren Westermann <marenwestermann>`.
9494

95+
- |Fix| :class:`decomposition.KernelPCA` now validates input parameters in
96+
`fit` instead of `__init__`.
97+
:pr:`21567` by :user:`Maggie Chege <MaggieChege>`.
98+
9599
- |API| Adds :term:`get_feature_names_out` to all transformers in the
96100
:mod:`~sklearn.decomposition` module:
97101
:class:`~sklearn.decomposition.DictionaryLearning`,
@@ -186,13 +190,6 @@ Changelog
186190
instead of `__init__`.
187191
:pr:`21434` by :user:`Krum Arnaudov <krumeto>`.
188192

189-
190-
:mod:`sklearn.decomposition.KernelPCA`
191-
......................................
192-
- |Fix| :class:`decomposition.KernelPCA` now validates input parameters in
193-
`fit` instead of `__init__`.
194-
:pr:`21567` by :user:`Maggie Chege <MaggieChege>`.
195-
196193
:mod:`sklearn.svm`
197194
..................
198195

examples/cluster/plot_birch_vs_minibatchkmeans.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,16 @@
55
66
This example compares the timing of BIRCH (with and without the global
77
clustering step) and MiniBatchKMeans on a synthetic dataset having
8-
100,000 samples and 2 features generated using make_blobs.
8+
25,000 samples and 2 features generated using make_blobs.
99
10-
If ``n_clusters`` is set to None, the data is reduced from 100,000
10+
Both ``MiniBatchKMeans`` and ``BIRCH`` are very scalable algorithms and could
11+
run efficiently on hundreds of thousands or even millions of datapoints. We
12+
chose to limit the dataset size of this example in the interest of keeping
13+
our Continuous Integration resource usage reasonable but the interested
14+
reader might enjoy editing this script to rerun it with a larger value for
15+
``n_samples``.
16+
17+
If ``n_clusters`` is set to None, the data is reduced from 25,000
1118
samples to a set of 158 clusters. This can be viewed as a preprocessing
1219
step before the final (global) clustering step that further reduces these
1320
158 clusters to 100 clusters.
@@ -18,6 +25,7 @@
1825
# Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
1926
# License: BSD 3 clause
2027

28+
from joblib import cpu_count
2129
from itertools import cycle
2230
from time import time
2331
import numpy as np
@@ -32,10 +40,10 @@
3240
xx = np.linspace(-22, 22, 10)
3341
yy = np.linspace(-22, 22, 10)
3442
xx, yy = np.meshgrid(xx, yy)
35-
n_centres = np.hstack((np.ravel(xx)[:, np.newaxis], np.ravel(yy)[:, np.newaxis]))
43+
n_centers = np.hstack((np.ravel(xx)[:, np.newaxis], np.ravel(yy)[:, np.newaxis]))
3644

3745
# Generate blobs to do a comparison between MiniBatchKMeans and BIRCH.
38-
X, y = make_blobs(n_samples=100000, centers=n_centres, random_state=0)
46+
X, y = make_blobs(n_samples=25000, centers=n_centers, random_state=0)
3947

4048
# Use all colors that matplotlib provides by default.
4149
colors_ = cycle(colors.cnames.keys())
@@ -78,7 +86,7 @@
7886
mbk = MiniBatchKMeans(
7987
init="k-means++",
8088
n_clusters=100,
81-
batch_size=100,
89+
batch_size=256 * cpu_count(),
8290
n_init=10,
8391
max_no_improvement=10,
8492
verbose=0,

sklearn/calibration.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1099,8 +1099,8 @@ def plot(self, *, ax=None, name=None, ref_line=True, **kwargs):
10991099
ax.plot([0, 1], [0, 1], "k:", label=ref_line_label)
11001100
self.line_ = ax.plot(self.prob_pred, self.prob_true, "s-", **line_kwargs)[0]
11011101

1102-
if "label" in line_kwargs:
1103-
ax.legend(loc="lower right")
1102+
# We always have to show the legend for at least the reference line
1103+
ax.legend(loc="lower right")
11041104

11051105
xlabel = f"Mean predicted probability {info_pos_label}"
11061106
ylabel = f"Fraction of positives {info_pos_label}"

sklearn/datasets/tests/test_base.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -341,7 +341,10 @@ def test_load_boston_alternative():
341341
boston_sklearn = load_boston()
342342

343343
data_url = "http://lib.stat.cmu.edu/datasets/boston"
344-
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
344+
try:
345+
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
346+
except ConnectionError as e:
347+
pytest.xfail(f"The dataset can't be downloaded. Got exception: {e}")
345348
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
346349
target = raw_df.values[1::2, 2]
347350

sklearn/metrics/_dist_metrics.pyx

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ cdef DTYPE_t INF = np.inf
2929

3030
from ..utils._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t, DTYPECODE
3131
from ..utils._typedefs import DTYPE, ITYPE
32-
32+
from ..utils._readonly_array_wrapper import ReadonlyArrayWrapper
3333

3434
######################################################################
3535
# newObj function
@@ -214,8 +214,8 @@ cdef class DistanceMetric:
214214
set state for pickling
215215
"""
216216
self.p = state[0]
217-
self.vec = state[1]
218-
self.mat = state[2]
217+
self.vec = ReadonlyArrayWrapper(state[1])
218+
self.mat = ReadonlyArrayWrapper(state[2])
219219
if self.__class__.__name__ == "PyFuncDistance":
220220
self.func = state[3]
221221
self.kwargs = state[4]
@@ -444,7 +444,7 @@ cdef class SEuclideanDistance(DistanceMetric):
444444
D(x, y) = \sqrt{ \sum_i \frac{ (x_i - y_i) ^ 2}{V_i} }
445445
"""
446446
def __init__(self, V):
447-
self.vec = np.asarray(V, dtype=DTYPE)
447+
self.vec = ReadonlyArrayWrapper(np.asarray(V, dtype=DTYPE))
448448
self.size = self.vec.shape[0]
449449
self.p = 2
450450

@@ -605,7 +605,7 @@ cdef class WMinkowskiDistance(DistanceMetric):
605605
raise ValueError("WMinkowskiDistance requires finite p. "
606606
"For p=inf, use ChebyshevDistance.")
607607
self.p = p
608-
self.vec = np.asarray(w, dtype=DTYPE)
608+
self.vec = ReadonlyArrayWrapper(np.asarray(w, dtype=DTYPE))
609609
self.size = self.vec.shape[0]
610610

611611
def _validate_data(self, X):
@@ -665,7 +665,7 @@ cdef class MahalanobisDistance(DistanceMetric):
665665
if VI.ndim != 2 or VI.shape[0] != VI.shape[1]:
666666
raise ValueError("V/VI must be square")
667667

668-
self.mat = np.asarray(VI, dtype=float, order='C')
668+
self.mat = ReadonlyArrayWrapper(np.asarray(VI, dtype=float, order='C'))
669669

670670
self.size = self.mat.shape[0]
671671

sklearn/metrics/tests/test_dist_metrics.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,11 +158,16 @@ def check_pdist_bool(metric, D_true):
158158
assert_array_almost_equal(D12, D_true)
159159

160160

161+
@pytest.mark.parametrize("use_read_only_kwargs", [True, False])
161162
@pytest.mark.parametrize("metric", METRICS_DEFAULT_PARAMS)
162-
def test_pickle(metric):
163+
def test_pickle(use_read_only_kwargs, metric):
163164
argdict = METRICS_DEFAULT_PARAMS[metric]
164165
keys = argdict.keys()
165166
for vals in itertools.product(*argdict.values()):
167+
if use_read_only_kwargs:
168+
for val in vals:
169+
if isinstance(val, np.ndarray):
170+
val.setflags(write=False)
166171
kwargs = dict(zip(keys, vals))
167172
check_pickle(metric, kwargs)
168173

@@ -242,3 +247,20 @@ def custom_metric(x, y):
242247
pyfunc = DistanceMetric.get_metric("pyfunc", func=custom_metric)
243248
eucl = DistanceMetric.get_metric("euclidean")
244249
assert_array_almost_equal(pyfunc.pairwise(X), eucl.pairwise(X) ** 2)
250+
251+
252+
def test_readonly_kwargs():
253+
# Non-regression test for:
254+
# https://github.com/scikit-learn/scikit-learn/issues/21685
255+
256+
rng = check_random_state(0)
257+
258+
weights = rng.rand(100)
259+
VI = rng.rand(10, 10)
260+
weights.setflags(write=False)
261+
VI.setflags(write=False)
262+
263+
# These distance metrics must support read-only buffers.
264+
DistanceMetric.get_metric("seuclidean", V=weights)
265+
DistanceMetric.get_metric("wminkowski", p=1, w=weights)
266+
DistanceMetric.get_metric("mahalanobis", VI=VI)

0 commit comments

Comments
 (0)
0