MAINT cleanup utils.__init__: move safe_sqr and _approximate_mode into extmath (#28481) · scikit-learn/scikit-learn@f323eb4 · GitHub

Commit f323eb4

jeremiedbb and lesteve authored
MAINT cleanup utils.__init__: move safe_sqr and _approximate_mode into extmath (#28481)
Co-authored-by: Loïc Estève <loic.esteve@ymail.com>
1 parent e125e67 commit f323eb4

5 files changed: +126 −125 lines changed

sklearn/model_selection/_split.py

Lines changed: 1 addition & 1 deletion

@@ -24,13 +24,13 @@
 from scipy.special import comb

 from ..utils import (
-    _approximate_mode,
     _safe_indexing,
     check_random_state,
     indexable,
     metadata_routing,
 )
 from ..utils._param_validation import Interval, RealNotInt, validate_params
+from ..utils.extmath import _approximate_mode
 from ..utils.metadata_routing import _MetadataRequester
 from ..utils.multiclass import type_of_target
 from ..utils.validation import _num_samples, check_array, column_or_1d
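For context on this one-line change: _split.py keeps using _approximate_mode exactly as before; only the import path moves to sklearn.utils.extmath. The snippet below is a minimal sketch (not the actual splitter code) of the kind of per-class allocation the stratified splitters perform with it; the class counts and test size are made-up values.

# Minimal sketch, not taken from _split.py: how a stratified splitter can use
# _approximate_mode (now imported from sklearn.utils.extmath) to decide how
# many test samples to take from each class.
import numpy as np
from sklearn.utils.extmath import _approximate_mode

class_counts = np.array([60, 30, 10])  # hypothetical per-class populations
n_test = 25                            # hypothetical number of test samples

per_class = _approximate_mode(class_counts, n_test, rng=0)
# per_class sums exactly to n_test; ties between equal fractional remainders
# (here the 0.5 left over by each of the two smaller classes) are broken via rng.
assert per_class.sum() == n_test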

sklearn/utils/__init__.py

Lines changed: 2 additions & 104 deletions

@@ -24,6 +24,7 @@
 from .class_weight import compute_class_weight, compute_sample_weight
 from .deprecation import deprecated
 from .discovery import all_estimators
+from .extmath import _approximate_mode, safe_sqr
 from .fixes import parse_version, threadpool_info
 from .murmurhash import murmurhash3_32
 from .validation import (
@@ -76,6 +77,7 @@
     "estimator_html_repr",
     "Bunch",
     "metadata_routing",
+    "safe_sqr",
 ]

 IS_PYPY = platform.python_implementation() == "PyPy"
@@ -778,41 +780,6 @@ def shuffle(*arrays, random_state=None, n_samples=None):
     )


-def safe_sqr(X, *, copy=True):
-    """Element wise squaring of array-likes and sparse matrices.
-
-    Parameters
-    ----------
-    X : {array-like, ndarray, sparse matrix}
-
-    copy : bool, default=True
-        Whether to create a copy of X and operate on it or to perform
-        inplace computation (default behaviour).
-
-    Returns
-    -------
-    X ** 2 : element wise square
-        Return the element-wise square of the input.
-
-    Examples
-    --------
-    >>> from sklearn.utils import safe_sqr
-    >>> safe_sqr([1, 2, 3])
-    array([1, 4, 9])
-    """
-    X = check_array(X, accept_sparse=["csr", "csc", "coo"], ensure_2d=False)
-    if issparse(X):
-        if copy:
-            X = X.copy()
-        X.data **= 2
-    else:
-        if copy:
-            X = X**2
-        else:
-            X **= 2
-    return X
-
-
 def _chunk_generator(gen, chunksize):
     """Chunk generator, ``gen`` into lists of length ``chunksize``. The last
     chunk may have a length less than ``chunksize``."""
@@ -1183,72 +1150,3 @@ def is_scalar_nan(x):
         and isinstance(x, numbers.Real)
         and math.isnan(x)
     )
-
-
-def _approximate_mode(class_counts, n_draws, rng):
-    """Computes approximate mode of multivariate hypergeometric.
-
-    This is an approximation to the mode of the multivariate
-    hypergeometric given by class_counts and n_draws.
-    It shouldn't be off by more than one.
-
-    It is the mostly likely outcome of drawing n_draws many
-    samples from the population given by class_counts.
-
-    Parameters
-    ----------
-    class_counts : ndarray of int
-        Population per class.
-    n_draws : int
-        Number of draws (samples to draw) from the overall population.
-    rng : random state
-        Used to break ties.
-
-    Returns
-    -------
-    sampled_classes : ndarray of int
-        Number of samples drawn from each class.
-        np.sum(sampled_classes) == n_draws
-
-    Examples
-    --------
-    >>> import numpy as np
-    >>> from sklearn.utils import _approximate_mode
-    >>> _approximate_mode(class_counts=np.array([4, 2]), n_draws=3, rng=0)
-    array([2, 1])
-    >>> _approximate_mode(class_counts=np.array([5, 2]), n_draws=4, rng=0)
-    array([3, 1])
-    >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),
-    ...                   n_draws=2, rng=0)
-    array([0, 1, 1, 0])
-    >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),
-    ...                   n_draws=2, rng=42)
-    array([1, 1, 0, 0])
-    """
-    rng = check_random_state(rng)
-    # this computes a bad approximation to the mode of the
-    # multivariate hypergeometric given by class_counts and n_draws
-    continuous = class_counts / class_counts.sum() * n_draws
-    # floored means we don't overshoot n_samples, but probably undershoot
-    floored = np.floor(continuous)
-    # we add samples according to how much "left over" probability
-    # they had, until we arrive at n_samples
-    need_to_add = int(n_draws - floored.sum())
-    if need_to_add > 0:
-        remainder = continuous - floored
-        values = np.sort(np.unique(remainder))[::-1]
-        # add according to remainder, but break ties
-        # randomly to avoid biases
-        for value in values:
-            (inds,) = np.where(remainder == value)
-            # if we need_to_add less than what's in inds
-            # we draw randomly from them.
-            # if we need to add more, we add them all and
-            # go to the next value
-            add_now = min(len(inds), need_to_add)
-            inds = rng.choice(inds, size=add_now, replace=False)
-            floored[inds] += 1
-            need_to_add -= add_now
-            if need_to_add == 0:
-                break
-    return floored.astype(int)
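Although the definitions are removed from sklearn/utils/__init__.py, the new import at the top of the module plus the "safe_sqr" entry added to __all__ keep the old public import path working. A quick sanity check, assuming an environment with this commit installed:

# Both import paths should resolve to the same function after this commit,
# since sklearn/utils/__init__.py now re-exports safe_sqr from extmath.
from sklearn.utils import safe_sqr as sqr_from_utils
from sklearn.utils.extmath import safe_sqr as sqr_from_extmath

assert sqr_from_utils is sqr_from_extmath
print(sqr_from_utils([1, 2, 3]))  # array([1, 4, 9]), as in the docstring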

sklearn/utils/extmath.py

Lines changed: 105 additions & 2 deletions

@@ -21,10 +21,9 @@

 from ..utils import deprecated
 from ..utils._param_validation import Interval, StrOptions, validate_params
-from . import check_random_state
 from ._array_api import _is_numpy_namespace, device, get_namespace
 from .sparsefuncs_fast import csr_row_norms
-from .validation import check_array
+from .validation import check_array, check_random_state


 def squared_norm(x):
@@ -1282,3 +1281,107 @@ def _nanaverage(a, weights=None):
     except ZeroDivisionError:
         # this is when all weights are zero, then ignore them
         return np.average(a)
+
+
+def safe_sqr(X, *, copy=True):
+    """Element wise squaring of array-likes and sparse matrices.
+
+    Parameters
+    ----------
+    X : {array-like, ndarray, sparse matrix}
+
+    copy : bool, default=True
+        Whether to create a copy of X and operate on it or to perform
+        inplace computation (default behaviour).
+
+    Returns
+    -------
+    X ** 2 : element wise square
+        Return the element-wise square of the input.
+
+    Examples
+    --------
+    >>> from sklearn.utils import safe_sqr
+    >>> safe_sqr([1, 2, 3])
+    array([1, 4, 9])
+    """
+    X = check_array(X, accept_sparse=["csr", "csc", "coo"], ensure_2d=False)
+    if sparse.issparse(X):
+        if copy:
+            X = X.copy()
+        X.data **= 2
+    else:
+        if copy:
+            X = X**2
+        else:
+            X **= 2
+    return X
+
+
+def _approximate_mode(class_counts, n_draws, rng):
+    """Computes approximate mode of multivariate hypergeometric.
+
+    This is an approximation to the mode of the multivariate
+    hypergeometric given by class_counts and n_draws.
+    It shouldn't be off by more than one.
+
+    It is the mostly likely outcome of drawing n_draws many
+    samples from the population given by class_counts.
+
+    Parameters
+    ----------
+    class_counts : ndarray of int
+        Population per class.
+    n_draws : int
+        Number of draws (samples to draw) from the overall population.
+    rng : random state
+        Used to break ties.
+
+    Returns
+    -------
+    sampled_classes : ndarray of int
+        Number of samples drawn from each class.
+        np.sum(sampled_classes) == n_draws
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.utils.extmath import _approximate_mode
+    >>> _approximate_mode(class_counts=np.array([4, 2]), n_draws=3, rng=0)
+    array([2, 1])
+    >>> _approximate_mode(class_counts=np.array([5, 2]), n_draws=4, rng=0)
+    array([3, 1])
+    >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),
+    ...                   n_draws=2, rng=0)
+    array([0, 1, 1, 0])
+    >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),
+    ...                   n_draws=2, rng=42)
+    array([1, 1, 0, 0])
+    """
+    rng = check_random_state(rng)
+    # this computes a bad approximation to the mode of the
+    # multivariate hypergeometric given by class_counts and n_draws
+    continuous = class_counts / class_counts.sum() * n_draws
+    # floored means we don't overshoot n_samples, but probably undershoot
+    floored = np.floor(continuous)
+    # we add samples according to how much "left over" probability
+    # they had, until we arrive at n_samples
+    need_to_add = int(n_draws - floored.sum())
+    if need_to_add > 0:
+        remainder = continuous - floored
+        values = np.sort(np.unique(remainder))[::-1]
+        # add according to remainder, but break ties
+        # randomly to avoid biases
+        for value in values:
+            (inds,) = np.where(remainder == value)
+            # if we need_to_add less than what's in inds
+            # we draw randomly from them.
+            # if we need to add more, we add them all and
+            # go to the next value
+            add_now = min(len(inds), need_to_add)
+            inds = rng.choice(inds, size=add_now, replace=False)
+            floored[inds] += 1
+            need_to_add -= add_now
+            if need_to_add == 0:
+                break
+    return floored.astype(int)
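The allocation performed by the relocated _approximate_mode is a largest-remainder scheme: compute the exact fractional allocation, floor it, then hand out the remaining draws to the classes with the biggest remainders, breaking ties at random. A small worked example, mirroring the docstring case class_counts=[5, 2], n_draws=4 where no tie-breaking is needed:

import numpy as np

class_counts = np.array([5, 2])
n_draws = 4

continuous = class_counts / class_counts.sum() * n_draws  # [2.857..., 1.142...]
floored = np.floor(continuous)                             # [2., 1.] -> one draw short
need_to_add = int(n_draws - floored.sum())                 # 1
remainder = continuous - floored                           # [0.857..., 0.142...]

# The single leftover draw goes to the class with the largest remainder,
# so the result matches the docstring output: array([3, 1]).
floored[np.argmax(remainder)] += need_to_add
print(floored.astype(int))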

sklearn/utils/tests/test_extmath.py

Lines changed: 18 additions & 0 deletions

@@ -22,6 +22,7 @@
     skip_if_32bit,
 )
 from sklearn.utils.extmath import (
+    _approximate_mode,
     _deterministic_vector_sign_flip,
     _incremental_mean_and_var,
     _randomized_eigsh,
@@ -1062,3 +1063,20 @@ def test_safe_sparse_dot_dense_output(dense_output):
     if dense_output:
         expected = expected.toarray()
     assert_allclose_dense_sparse(actual, expected)
+
+
+def test_approximate_mode():
+    """Make sure sklearn.utils.extmath._approximate_mode returns valid
+    results for cases where "class_counts * n_draws" is enough
+    to overflow 32-bit signed integer.
+
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/20774
+    """
+    X = np.array([99000, 1000], dtype=np.int32)
+    ret = _approximate_mode(class_counts=X, n_draws=25000, rng=0)
+
+    # Draws 25% of the total population, so in this case a fair draw means:
+    # 25% * 99.000 = 24.750
+    # 25% * 1.000 = 250
+    assert_array_equal(ret, [24750, 250])
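The relocated non-regression test keeps guarding the overflow scenario from issue #20774: with int32 inputs, an intermediate product on the scale of class_counts * n_draws no longer fits in a 32-bit signed integer. A quick arithmetic check of the numbers used in the test:

import numpy as np

# 99000 * 25000 = 2,475,000,000, which exceeds the int32 maximum below,
# so a naive int32 intermediate product would overflow.
print(99000 * 25000)           # 2475000000
print(np.iinfo(np.int32).max)  # 2147483647

# A fair 25% draw from [99000, 1000] is therefore expected to be [24750, 250].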

sklearn/utils/tests/test_utils.py

Lines changed: 0 additions & 18 deletions

@@ -11,7 +11,6 @@
 from sklearn import config_context
 from sklearn.externals._packaging.version import parse as parse_version
 from sklearn.utils import (
-    _approximate_mode,
     _determine_key_type,
     _get_column_indices,
     _is_polars_df,
@@ -706,23 +705,6 @@ def test_is_scalar_nan(value, result):
     assert isinstance(is_scalar_nan(value), bool)


-def test_approximate_mode():
-    """Make sure sklearn.utils._approximate_mode returns valid
-    results for cases where "class_counts * n_draws" is enough
-    to overflow 32-bit signed integer.
-
-    Non-regression test for:
-    https://github.com/scikit-learn/scikit-learn/issues/20774
-    """
-    X = np.array([99000, 1000], dtype=np.int32)
-    ret = _approximate_mode(class_counts=X, n_draws=25000, rng=0)
-
-    # Draws 25% of the total population, so in this case a fair draw means:
-    # 25% * 99.000 = 24.750
-    # 25% * 1.000 = 250
-    assert_array_equal(ret, [24750, 250])
-
-
 def dummy_func():
     pass

0 commit comments