TST Extend tests for `scipy.sparse.*array` in `test_glm.py` by ivirshup · Pull Request #27107 · scikit-learn/scikit-learn
Closed
36 changes: 27 additions & 9 deletions sklearn/linear_model/_glm/tests/test_glm.py
@@ -10,7 +10,7 @@
 import pytest
 import scipy
 from numpy.testing import assert_allclose
-from scipy import linalg
+from scipy import linalg, sparse
 from scipy.optimize import minimize, root

 from sklearn._loss import HalfBinomialLoss, HalfPoissonLoss, HalfTweedieLoss
@@ -29,6 +29,7 @@
 from sklearn.linear_model._linear_loss import LinearModelLoss
 from sklearn.metrics import d2_tweedie_score, mean_poisson_deviance
 from sklearn.model_selection import train_test_split
+from sklearn.utils.fixes import CSR_CONTAINERS

 SOLVERS = ["lbfgs", "newton-cholesky"]
@@ -73,8 +74,9 @@ def regression_data():
             # TweedieRegressor(power=0, link="log"),  # too difficult
             TweedieRegressor(power=1.5),
         ],
+        CSR_CONTAINERS + [np.array],
     ),
-    ids=lambda param: f"{param[0]}-{param[1]}",
+    ids=lambda param: f"{param[0]}-{param[1]}-{param[2]}",
 )
 def glm_dataset(global_random_seed, request):
     """Dataset with GLM solutions, well conditioned X.
@@ -117,7 +119,7 @@ def glm_dataset(global_random_seed, request):
     l2_reg_strength : float
         Always equal 1.
     """
-    data_type, model = request.param
+    data_type, model, matrix_class = request.param
    # Make larger dim more than double as big as the smaller one.
    # This helps when constructing singular matrices like (X, X).
     if data_type == "long":
@@ -137,6 +139,7 @@
     U, s, Vt = linalg.svd(X, full_matrices=False)
     assert np.all(s > 1e-3)  # to be sure
     assert np.max(s) / np.min(s) < 100  # condition number of X
+    X = matrix_class(X)

     if data_type == "long":
         coef_unpenalized = rng.uniform(low=1, high=3, size=n_features)
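For context on the new parametrization: `CSR_CONTAINERS` in `sklearn.utils.fixes` holds the CSR container types available in the installed SciPy (the legacy `csr_matrix`, plus `csr_array` where SciPy provides it). A minimal sketch of the same idea, assuming only that much (this is not the actual `sklearn.utils.fixes` source):

# Sketch of the CSR_CONTAINERS idea: gather the CSR container classes the
# installed SciPy provides, so the fixture can build X as a csr_matrix, a
# csr_array, or a dense ndarray.
import numpy as np
from scipy import sparse

CSR_CONTAINERS = [sparse.csr_matrix]
if hasattr(sparse, "csr_array"):  # sparse arrays ship with newer SciPy
    CSR_CONTAINERS.append(sparse.csr_array)

X = np.arange(6.0).reshape(2, 3)
for matrix_class in CSR_CONTAINERS + [np.array]:
    Xc = matrix_class(X)  # the same conversion the fixture applies
    print(type(Xc).__name__)  # csr_matrix, csr_array, ndarray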
@@ -270,7 +273,10 @@ def test_glm_regression_hstacked_X(solver, fit_intercept, glm_dataset):

     model = clone(model).set_params(**params)
     X = X[:, :-1]  # remove intercept
-    X = 0.5 * np.concatenate((X, X), axis=1)
+    if sparse.issparse(X):
+        X = np.multiply(sparse.hstack((X, X)), 0.5)
+    else:
+        X = 0.5 * np.concatenate((X, X), axis=1)
     assert np.linalg.matrix_rank(X) <= min(n_samples, n_features - 1)
     if fit_intercept:
         coef = coef_with_intercept

Review comment (Member), on the new `np.multiply` line:

The following looks simpler:

Suggested change:
-        X = np.multiply(sparse.hstack((X, X)), 0.5)
+        X = 0.5 * sparse.hstack((X, X))
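A quick sanity check of the reviewer's suggestion (my own sketch, not part of the PR): the scalar-multiplied sparse stack yields the same values as the dense concatenate path it stands in for.

# Equivalence check for the suggested simplification (illustration only).
import numpy as np
from scipy import sparse

Xd = np.arange(6.0).reshape(2, 3)
Xs = sparse.csr_array(Xd)

dense = 0.5 * np.concatenate((Xd, Xd), axis=1)
suggested = 0.5 * sparse.hstack((Xs, Xs))  # reviewer's simpler form
np.testing.assert_allclose(suggested.toarray(), dense)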
@@ -313,10 +319,14 @@ def test_glm_regression_vstacked_X(solver, fit_intercept, glm_dataset):
         tol=1e-12,
         max_iter=1000,
     )
+    if sparse.issparse(X):
+        vstack = sparse.vstack
+    else:
+        vstack = np.vstack

     model = clone(model).set_params(**params)
     X = X[:, :-1]  # remove intercept
-    X = np.concatenate((X, X), axis=0)
+    X = vstack((X, X))
     assert np.linalg.matrix_rank(X) <= min(n_samples, n_features)
     y = np.r_[y, y]
     if fit_intercept:
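The dispatch pattern used here and in the hunks below, isolated as a standalone sketch (the helper name is hypothetical): pick the stacking function that matches the container type of X, since `np.vstack`/`np.hstack` do not stack SciPy sparse inputs correctly.

# Hypothetical helper illustrating the dispatch in this diff: choose
# sparse.vstack for sparse containers and np.vstack for ndarrays.
import numpy as np
from scipy import sparse

def double_rows(X):
    vstack = sparse.vstack if sparse.issparse(X) else np.vstack
    return vstack((X, X))

X = np.ones((2, 3))
assert double_rows(X).shape == (4, 3)                    # dense path
assert double_rows(sparse.csr_array(X)).shape == (4, 3)  # sparse path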
@@ -436,21 +446,25 @@ def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_dataset):
         tol=1e-12,
         max_iter=1000,
     )
+    if sparse.issparse(X):
+        hstack = sparse.hstack
+    else:
+        hstack = np.hstack

     model = clone(model).set_params(**params)
     if fit_intercept:
         intercept = coef[-1]
         coef = coef[:-1]
         if n_samples > n_features:
             X = X[:, :-1]  # remove intercept
-            X = 0.5 * np.concatenate((X, X), axis=1)
+            X = 0.5 * hstack((X, X))
         else:
             # To know the minimum norm solution, we keep one intercept column and do
             # not divide by 2. Later on, we must take special care.
-            X = np.c_[X[:, :-1], X[:, :-1], X[:, -1]]
+            X = hstack((X[:, :-1], X[:, :-1], X[:, [-1]]))
     else:
         intercept = 0
-        X = 0.5 * np.concatenate((X, X), axis=1)
+        X = 0.5 * hstack((X, X))
     assert np.linalg.matrix_rank(X) <= min(n_samples, n_features)

     with warnings.catch_warnings():
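Why the hstacked variants multiply by 0.5: with X' = 0.5 * [X, X] and duplicated coefficients w' = [w, w], the linear predictor is unchanged, so the stacked problem has a known solution. A small numeric check of that identity (my own illustration, not PR code):

# Identity behind the hstacked tests: (0.5 * [X, X]) @ [w, w] == X @ w,
# because each duplicated block contributes half of the original product.
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(5, 3))
w = rng.normal(size=3)

X2 = 0.5 * np.concatenate((X, X), axis=1)
w2 = np.concatenate((w, w))
np.testing.assert_allclose(X2 @ w2, X @ w)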
@@ -522,6 +536,10 @@ def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_dataset):
         tol=1e-12,
         max_iter=1000,
     )
+    if sparse.issparse(X):
+        vstack = sparse.vstack
+    else:
+        vstack = np.vstack

     model = clone(model).set_params(**params)
     if fit_intercept:
@@ -530,7 +548,7 @@
         coef = coef[:-1]
     else:
         intercept = 0
-    X = np.concatenate((X, X), axis=0)
+    X = vstack((X, X))
     assert np.linalg.matrix_rank(X) <= min(n_samples, n_features)
     y = np.r_[y, y]