FEA Add Nonnegative LinearRegression (#17578) · sstalley/scikit-learn@075d424 · GitHub

Commit 075d424

cmarmo, NicolasHug, and jknox13 authored
FEA Add Nonnegative LinearRegression (scikit-learn#17578)
Co-authored-by: Nicolas Hug <contact@nicolas-hug.com>
Co-authored-by: Joseph Knox <jknox13@uw.edu>
Co-authored-by: Joseph Knox <joseph.edward.knox@gmail.com>
1 parent febdd19 commit 075d424

File tree

6 files changed: +197 −6 lines changed

doc/modules/linear_model.rst

Lines changed: 15 additions & 0 deletions
@@ -43,6 +43,8 @@ and will store the coefficients :math:`w` of the linear model in its

     >>> from sklearn import linear_model
     >>> reg = linear_model.LinearRegression()
+    >>> reg.fit ([[0, 0], [1, 1], [2, 2]], [0, 1, 2])
+    LinearRegression()
     >>> reg.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2])
     LinearRegression()
     >>> reg.coef_
@@ -61,6 +63,19 @@ example, when data are collected without an experimental design.

 * :ref:`sphx_glr_auto_examples_linear_model_plot_ols.py`

+Non-Negative Least Squares
+--------------------------
+
+It is possible to constrain all the coefficients to be non-negative, which may
+be useful when they represent some physical or naturally non-negative
+quantities (e.g., frequency counts or prices of goods).
+:class:`LinearRegression` accepts a boolean ``positive``
+parameter: when set to ``True``, `Non-Negative Least Squares
+<https://en.wikipedia.org/wiki/Non-negative_least_squares>`_ are then applied.
+
+.. topic:: Examples:
+
+   * :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`

 Ordinary Least Squares Complexity
 ---------------------------------
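
Since the new ``positive`` option simply routes the fit through SciPy's NNLS solver (see the implementation diff in sklearn/linear_model/_base.py below), the two should agree exactly on dense, single-output problems when no intercept is fit. A minimal sketch, not part of the commit, assuming a scikit-learn version that includes this feature:

    import numpy as np
    from scipy import optimize
    from sklearn.linear_model import LinearRegression

    rng = np.random.RandomState(0)
    X = rng.rand(100, 5)
    y = X @ np.array([1.0, 0.0, 2.0, 0.0, 3.0]) + 0.01 * rng.randn(100)

    # fit_intercept=False avoids centering, so the comparison with raw
    # scipy.optimize.nnls is exact
    reg = LinearRegression(positive=True, fit_intercept=False).fit(X, y)
    coef_nnls, _ = optimize.nnls(X, y)

    assert np.allclose(reg.coef_, coef_nnls)
    assert np.all(reg.coef_ >= 0)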

doc/tutorial/statistical_inference/supervised_learning.rst

Lines changed: 1 addition & 1 deletion
@@ -173,7 +173,7 @@ Linear models: :math:`y = X\beta + \epsilon`
     >>> regr = linear_model.LinearRegression()
     >>> regr.fit(diabetes_X_train, diabetes_y_train)
     LinearRegression()
-    >>> print(regr.coef_)    # doctest: +SKIP
+    >>> print(regr.coef_)
     [   0.30349955 -237.63931533  510.53060544  327.73698041 -814.13170937
       492.81458798  102.84845219  184.60648906  743.51961675   76.09517222]
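
For context, ``diabetes_X_train`` and ``diabetes_y_train`` come from earlier in that tutorial. A self-contained reproduction of the doctest above, assuming the tutorial's convention of holding out the last 20 samples:

    from sklearn import datasets, linear_model

    diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)
    # hold out the last 20 samples, as the tutorial does
    diabetes_X_train = diabetes_X[:-20]
    diabetes_y_train = diabetes_y[:-20]

    regr = linear_model.LinearRegression()
    regr.fit(diabetes_X_train, diabetes_y_train)
    print(regr.coef_)  # ten coefficients, one per feature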

doc/whats_new/v0.24.rst

Lines changed: 8 additions & 0 deletions
@@ -203,6 +203,14 @@ Changelog
 - |Enhancement| :class:`isotonic.IsotonicRegression` now accepts 2darray with 1 feature as
   input array. :pr:`17379` by :user:`Jiaxiang <fujiaxiang>`.

+:mod:`sklearn.linear_model`
+...........................
+
+- |Feature| :class:`linear_model.LinearRegression` now forces coefficients
+  to be all positive when ``positive`` is set to ``True``.
+  :pr:`17578` by :user:`Joseph Knox <jknox13>`, :user:`Nelle Varoquaux <NelleV>`
+  and :user:`Chiara Marmo <cmarmo>`.
+
 :mod:`sklearn.manifold`
 .......................

examples/linear_model/plot_nnls.py

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
+"""
+==========================
+Non-negative least squares
+==========================
+
+In this example, we fit a linear model with positive constraints on the
+regression coefficients and compare the estimated coefficients to a classic
+linear regression.
+"""
+print(__doc__)
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.metrics import r2_score
+
+# %%
+# Generate some random data
+np.random.seed(42)
+
+n_samples, n_features = 200, 50
+X = np.random.randn(n_samples, n_features)
+true_coef = 3 * np.random.randn(n_features)
+# Threshold coefficients to render them non-negative
+true_coef[true_coef < 0] = 0
+y = np.dot(X, true_coef)
+
+# Add some noise
+y += 5 * np.random.normal(size=(n_samples, ))
+
+# %%
+# Split the data in train set and test set
+from sklearn.model_selection import train_test_split
+
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
+
+# %%
+# Fit the Non-Negative least squares.
+from sklearn.linear_model import LinearRegression
+
+reg_nnls = LinearRegression(positive=True)
+y_pred_nnls = reg_nnls.fit(X_train, y_train).predict(X_test)
+r2_score_nnls = r2_score(y_test, y_pred_nnls)
+print("NNLS R2 score", r2_score_nnls)
+
+# %%
+# Fit an OLS.
+reg_ols = LinearRegression()
+y_pred_ols = reg_ols.fit(X_train, y_train).predict(X_test)
+r2_score_ols = r2_score(y_test, y_pred_ols)
+print("OLS R2 score", r2_score_ols)
+
+
+# %%
+# Comparing the regression coefficients between OLS and NNLS, we can observe
+# they are highly correlated (the dashed line is the identity relation),
+# but the non-negative constraint shrinks some to 0.
+# The Non-Negative Least squares inherently yield sparse results.
+
+fig, ax = plt.subplots()
+ax.plot(reg_ols.coef_, reg_nnls.coef_, linewidth=0, marker=".")
+
+low_x, high_x = ax.get_xlim()
+low_y, high_y = ax.get_ylim()
+low = max(low_x, low_y)
+high = min(high_x, high_y)
+ax.plot([low, high], [low, high], ls="--", c=".3", alpha=.5)
+ax.set_xlabel("OLS regression coefficients", fontweight="bold")
+ax.set_ylabel("NNLS regression coefficients", fontweight="bold")
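
The sparsity claim in the final comment is easy to verify numerically, since NNLS is an active-set method and drives inactive coefficients to exactly zero. A hypothetical check, reusing ``true_coef`` and ``reg_nnls`` from the script above:

    # count coefficients driven exactly to 0.0 by the active-set solver
    print("zero coefficients in true_coef:", np.sum(true_coef == 0))
    print("zero coefficients in NNLS fit: ", np.sum(reg_nnls.coef_ == 0))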

sklearn/linear_model/_base.py

Lines changed: 25 additions & 4 deletions
@@ -20,6 +20,7 @@
 import numpy as np
 import scipy.sparse as sp
 from scipy import linalg
+from scipy import optimize
 from scipy import sparse
 from scipy.special import expit
 from joblib import Parallel, delayed

@@ -419,6 +420,12 @@ class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel):
         ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
         for more details.

+    positive : bool, default=False
+        When set to ``True``, forces the coefficients to be positive. This
+        option is only supported for dense arrays.
+
+        .. versionadded:: 0.24
+
     Attributes
     ----------
     coef_ : array of shape (n_features, ) or (n_targets, n_features)

@@ -451,7 +458,8 @@ class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel):
     Notes
     -----
     From the implementation point of view, this is just plain Ordinary
-    Least Squares (scipy.linalg.lstsq) wrapped as a predictor object.
+    Least Squares (scipy.linalg.lstsq) or Non Negative Least Squares
+    (scipy.optimize.nnls) wrapped as a predictor object.

     Examples
     --------

@@ -472,11 +480,12 @@ class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel):
     """
     @_deprecate_positional_args
     def __init__(self, *, fit_intercept=True, normalize=False, copy_X=True,
-                 n_jobs=None):
+                 n_jobs=None, positive=False):
         self.fit_intercept = fit_intercept
         self.normalize = normalize
         self.copy_X = copy_X
         self.n_jobs = n_jobs
+        self.positive = positive

     def fit(self, X, y, sample_weight=None):
         """

@@ -502,7 +511,10 @@ def fit(self, X, y, sample_weight=None):
         """

         n_jobs_ = self.n_jobs
-        X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'],
+
+        accept_sparse = False if self.positive else ['csr', 'csc', 'coo']
+
+        X, y = self._validate_data(X, y, accept_sparse=accept_sparse,
                                    y_numeric=True, multi_output=True)

         if sample_weight is not None:

@@ -518,7 +530,16 @@ def fit(self, X, y, sample_weight=None):
         # Sample weight can be implemented via a simple rescaling.
         X, y = _rescale_data(X, y, sample_weight)

-        if sp.issparse(X):
+        if self.positive:
+            if y.ndim < 2:
+                self.coef_, self._residues = optimize.nnls(X, y)
+            else:
+                # scipy.optimize.nnls cannot handle y with shape (M, K)
+                outs = Parallel(n_jobs=n_jobs_)(
+                    delayed(optimize.nnls)(X, y[:, j])
+                    for j in range(y.shape[1]))
+                self.coef_, self._residues = map(np.vstack, zip(*outs))
+        elif sp.issparse(X):
             X_offset_scale = X_offset / X_scale

             def matvec(b):
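
The key change in ``fit`` is the new ``self.positive`` branch: ``scipy.optimize.nnls`` only accepts a 1-D right-hand side, so multi-output targets are solved one column at a time in parallel and the per-column solutions are stacked. The same dispatch as a standalone function — the name ``nnls_multi_output`` is hypothetical, for illustration only:

    import numpy as np
    from joblib import Parallel, delayed
    from scipy import optimize

    def nnls_multi_output(X, Y, n_jobs=None):
        # min ||X w - Y||^2 subject to w >= 0, one NNLS problem per
        # column of Y, mirroring the branch added to LinearRegression.fit
        if Y.ndim < 2:
            return optimize.nnls(X, Y)  # (coef, residual) for a single target
        outs = Parallel(n_jobs=n_jobs)(
            delayed(optimize.nnls)(X, Y[:, j]) for j in range(Y.shape[1]))
        # stack per-column results: coef becomes (n_targets, n_features)
        coef, residues = map(np.vstack, zip(*outs))
        return coef, residues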

sklearn/linear_model/tests/test_base.py

Lines changed: 81 additions & 1 deletion
@@ -13,13 +13,13 @@
 from sklearn.utils._testing import assert_array_equal
 from sklearn.utils._testing import assert_almost_equal
 from sklearn.utils._testing import assert_allclose
+from sklearn.utils import check_random_state
 from sklearn.utils.fixes import parse_version

 from sklearn.linear_model import LinearRegression
 from sklearn.linear_model._base import _preprocess_data
 from sklearn.linear_model._base import _rescale_data
 from sklearn.linear_model._base import make_dataset
-from sklearn.utils import check_random_state
 from sklearn.datasets import make_sparse_uncorrelated
 from sklearn.datasets import make_regression
 from sklearn.datasets import load_iris

@@ -94,6 +94,18 @@ def test_linear_regression_sample_weights():
     assert_almost_equal(inter1, coefs2[0])


+def test_raises_value_error_if_positive_and_sparse():
+    error_msg = ('A sparse matrix was passed, '
+                 'but dense data is required.')
+    # X must not be sparse if positive == True
+    X = sparse.eye(10)
+    y = np.ones(10)
+
+    reg = LinearRegression(positive=True)
+
+    with pytest.raises(TypeError, match=error_msg):
+        reg.fit(X, y)
+
 def test_raises_value_error_if_sample_weights_greater_than_1d():
     # Sample weights must be either scalar or 1D

@@ -206,6 +218,74 @@ def test_linear_regression_sparse_multiple_outcome(random_state=0):
     assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3)


+def test_linear_regression_positive():
+    # Test nonnegative LinearRegression on a simple dataset.
+    X = [[1], [2]]
+    y = [1, 2]
+
+    reg = LinearRegression(positive=True)
+    reg.fit(X, y)
+
+    assert_array_almost_equal(reg.coef_, [1])
+    assert_array_almost_equal(reg.intercept_, [0])
+    assert_array_almost_equal(reg.predict(X), [1, 2])
+
+    # test it also for degenerate input
+    X = [[1]]
+    y = [0]
+
+    reg = LinearRegression(positive=True)
+    reg.fit(X, y)
+    assert_allclose(reg.coef_, [0])
+    assert_allclose(reg.intercept_, [0])
+    assert_allclose(reg.predict(X), [0])
+
+
+def test_linear_regression_positive_multiple_outcome(random_state=0):
+    # Test multiple-outcome nonnegative linear regressions
+    random_state = check_random_state(random_state)
+    X, y = make_sparse_uncorrelated(random_state=random_state)
+    Y = np.vstack((y, y)).T
+    n_features = X.shape[1]
+
+    ols = LinearRegression(positive=True)
+    ols.fit(X, Y)
+    assert ols.coef_.shape == (2, n_features)
+    assert np.all(ols.coef_ >= 0.)
+    Y_pred = ols.predict(X)
+    ols.fit(X, y.ravel())
+    y_pred = ols.predict(X)
+    assert_allclose(np.vstack((y_pred, y_pred)).T, Y_pred)
+
+
+def test_linear_regression_positive_vs_nonpositive():
+    # Test differences with LinearRegression when positive=False.
+    X, y = make_sparse_uncorrelated(random_state=0)
+
+    reg = LinearRegression(positive=True)
+    reg.fit(X, y)
+    regn = LinearRegression(positive=False)
+    regn.fit(X, y)
+
+    assert np.mean((reg.coef_ - regn.coef_)**2) > 1e-3
+
+
+def test_linear_regression_positive_vs_nonpositive_when_positive():
+    # Test LinearRegression fitted coefficients
+    # when the problem is positive.
+    n_samples = 200
+    n_features = 4
+    X = rng.rand(n_samples, n_features)
+    y = X[:, 0] + 2 * X[:, 1] + 3 * X[:, 2] + 1.5 * X[:, 3]
+
+    reg = LinearRegression(positive=True)
+    reg.fit(X, y)
+    regn = LinearRegression(positive=False)
+    regn.fit(X, y)
+
+    assert np.mean((reg.coef_ - regn.coef_)**2) < 1e-6
+
+
 def test_linear_regression_pd_sparse_dataframe_warning():
     pd = pytest.importorskip('pandas')
     # restrict the pd versions < '0.24.0' as they have a bug in is_sparse func
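
Note that ``rng`` in the last test refers to a module-level random state already defined near the top of test_base.py, which is why the hunk does not introduce it. The property these last two tests pin down — NNLS and OLS disagree in general but coincide when the unconstrained solution is already nonnegative — can be seen directly with SciPy; a hypothetical illustration, not part of the commit:

    import numpy as np
    from scipy import linalg, optimize

    rng = np.random.RandomState(0)
    X = rng.rand(200, 4)
    y = X @ np.array([1.0, 2.0, 3.0, 1.5])  # noiseless, all-positive truth

    coef_ols = linalg.lstsq(X, y)[0]    # unconstrained least squares
    coef_nnls = optimize.nnls(X, y)[0]  # nonnegativity-constrained

    # the constraint is inactive here, so the two solutions coincide
    assert np.allclose(coef_ols, coef_nnls)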
