ENH add verbosity to newton-cg solver (#27526) · scikit-learn/scikit-learn@3ee60a7 · GitHub

Commit 3ee60a7

lorentzenchr, glemaitre, and ogrisel authored
ENH add verbosity to newton-cg solver (#27526)
Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
1 parent 556e0cf commit 3ee60a7

File tree

6 files changed: +229 -14 lines changed


doc/whats_new/v1.5.rst

Lines changed: 5 additions & 0 deletions
@@ -264,6 +264,11 @@ Changelog
 :mod:`sklearn.linear_model`
 ...........................
 
+- |Enhancement| Solver `"newton-cg"` in :class:`linear_model.LogisticRegression` and
+  :class:`linear_model.LogisticRegressionCV` now emits information when `verbose` is
+  set to positive values.
+  :pr:`27526` by :user:`Christian Lorentzen <lorentzenchr>`.
+
 - |Fix| :class:`linear_model.ElasticNet`, :class:`linear_model.ElasticNetCV`,
   :class:`linear_model.Lasso` and :class:`linear_model.LassoCV` now explicitly don't
   accept large sparse data formats. :pr:`27576` by :user:`Stefanie Senger
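
As a usage sketch of the new behaviour (not part of this commit; the dataset and settings are arbitrary), the extra output is reachable from the public estimator:

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    X, y = make_classification(n_samples=200, n_features=5, random_state=0)

    # verbose=1 prints the outer Newton-CG iterations and the convergence check;
    # verbose>=2 additionally prints the inner CG and line-search details.
    clf = LogisticRegression(solver="newton-cg", verbose=1, max_iter=100).fit(X, y)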

sklearn/linear_model/_glm/_newton_solver.py

Lines changed: 1 addition & 1 deletion
@@ -230,7 +230,7 @@ def line_search(self, X, y, sample_weight):
         is_verbose = self.verbose >= 2
         if is_verbose:
             print("  Backtracking Line Search")
-            print(f"    eps=10 * finfo.eps={eps}")
+            print(f"    eps=16 * finfo.eps={eps}")
 
         for i in range(21):  # until and including t = beta**20 ~ 1e-6
             self.coef = self.coef_old + t * self.coef_newton
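
For reference, a quick sketch (not from the commit) of the value the corrected message reports:

    import numpy as np

    # 16 times the float64 machine epsilon, as printed by the line search
    eps = 16 * np.finfo(np.float64).eps
    print(eps)  # 16 * 2.220446049250313e-16, roughly 3.55e-15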

sklearn/linear_model/_logistic.py

Lines changed: 8 additions & 1 deletion
@@ -477,7 +477,14 @@ def _logistic_regression_path(
             l2_reg_strength = 1.0 / (C * sw_sum)
             args = (X, target, sample_weight, l2_reg_strength, n_threads)
             w0, n_iter_i = _newton_cg(
-                hess, func, grad, w0, args=args, maxiter=max_iter, tol=tol
+                grad_hess=hess,
+                func=func,
+                grad=grad,
+                x0=w0,
+                args=args,
+                maxiter=max_iter,
+                tol=tol,
+                verbose=verbose,
             )
         elif solver == "newton-cholesky":
             l2_reg_strength = 1.0 / (C * sw_sum)
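
Because `_logistic_regression_path` now forwards `verbose` to `_newton_cg`, the new output is also reachable from LogisticRegressionCV, as the changelog entry above notes. A hedged sketch (dataset and settings are arbitrary); `verbose >= 2` additionally triggers the inner CG and line-search messages added in sklearn/utils/optimize.py below:

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegressionCV

    X, y = make_classification(n_samples=300, n_features=10, random_state=42)

    # the solver progress messages are printed during fit
    clf = LogisticRegressionCV(solver="newton-cg", Cs=3, cv=3, verbose=2).fit(X, y)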

sklearn/linear_model/tests/test_logistic.py

Lines changed: 1 addition & 1 deletion
@@ -1156,7 +1156,7 @@ def test_logreg_predict_proba_multinomial():
     [
         (
             "newton-cg",
-            "newton-cg failed to converge. Increase the number of iterations.",
+            "newton-cg failed to converge.* Increase the number of iterations.",
         ),
         (
             "liblinear",

sklearn/utils/optimize.py

Lines changed: 87 additions & 11 deletions
@@ -27,7 +27,9 @@ class _LineSearchError(RuntimeError):
     pass
 
 
-def _line_search_wolfe12(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwargs):
+def _line_search_wolfe12(
+    f, fprime, xk, pk, gfk, old_fval, old_old_fval, verbose=0, **kwargs
+):
     """
     Same as line_search_wolfe1, but fall back to line_search_wolfe2 if
     suitable step length is not found, and raise an exception if a
@@ -39,24 +41,44 @@ def _line_search_wolfe12(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwargs):
         If no suitable step size is found.
 
     """
+    is_verbose = verbose >= 2
+    eps = 16 * np.finfo(np.asarray(old_fval).dtype).eps
+    if is_verbose:
+        print("  Line Search")
+        print(f"    eps=16 * finfo.eps={eps}")
+        print("    try line search wolfe1")
+
     ret = line_search_wolfe1(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwargs)
 
+    if is_verbose:
+        _not_ = "not " if ret[0] is None else ""
+        print("    wolfe1 line search was " + _not_ + "successful")
+
     if ret[0] is None:
         # Have a look at the line_search method of our NewtonSolver class. We borrow
         # the logic from there
         # Deal with relative loss differences around machine precision.
         args = kwargs.get("args", tuple())
         fval = f(xk + pk, *args)
-        eps = 16 * np.finfo(np.asarray(old_fval).dtype).eps
         tiny_loss = np.abs(old_fval * eps)
         loss_improvement = fval - old_fval
         check = np.abs(loss_improvement) <= tiny_loss
+        if is_verbose:
+            print(
+                "    check loss |improvement| <= eps * |loss_old|:"
+                f" {np.abs(loss_improvement)} <= {tiny_loss} {check}"
+            )
         if check:
             # 2.1 Check sum of absolute gradients as alternative condition.
             sum_abs_grad_old = scipy.linalg.norm(gfk, ord=1)
             grad = fprime(xk + pk, *args)
             sum_abs_grad = scipy.linalg.norm(grad, ord=1)
             check = sum_abs_grad < sum_abs_grad_old
+            if is_verbose:
+                print(
+                    "    check sum(|gradient|) < sum(|gradient_old|): "
+                    f"{sum_abs_grad} < {sum_abs_grad_old} {check}"
+                )
             if check:
                 ret = (
                     1.0,  # step size
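
A numerical sketch (values invented) of the first fallback check above, which tests whether the loss change after the full Newton step is within machine-precision noise of the previous loss:

    import numpy as np

    old_fval = 1.2345            # loss before the trial step
    fval = old_fval + 1e-15      # loss after the step: the change is only noise
    eps = 16 * np.finfo(np.asarray(old_fval).dtype).eps

    tiny_loss = np.abs(old_fval * eps)
    loss_improvement = fval - old_fval
    print(np.abs(loss_improvement) <= tiny_loss)  # True, the change is within noise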
@@ -72,17 +94,22 @@ def _line_search_wolfe12(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwargs):
         # TODO: It seems that the new check for the sum of absolute gradients above
         # catches all cases that, earlier, ended up here. In fact, our tests never
         # trigger this "if branch" here and we can consider to remove it.
+        if is_verbose:
+            print("    last resort: try line search wolfe2")
         ret = line_search_wolfe2(
             f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwargs
         )
+        if is_verbose:
+            _not_ = "not " if ret[0] is None else ""
+            print("    wolfe2 line search was " + _not_ + "successful")
 
     if ret[0] is None:
         raise _LineSearchError()
 
     return ret
 
 
-def _cg(fhess_p, fgrad, maxiter, tol):
+def _cg(fhess_p, fgrad, maxiter, tol, verbose=0):
     """
     Solve iteratively the linear system 'fhess_p . xsupi = fgrad'
     with a conjugate gradient descent.
@@ -107,30 +134,51 @@ def _cg(fhess_p, fgrad, maxiter, tol):
     xsupi : ndarray of shape (n_features,) or (n_features + 1,)
         Estimated solution.
     """
+    eps = 16 * np.finfo(np.float64).eps
     xsupi = np.zeros(len(fgrad), dtype=fgrad.dtype)
-    ri = np.copy(fgrad)
+    ri = np.copy(fgrad)  # residual = fgrad - fhess_p @ xsupi
     psupi = -ri
     i = 0
     dri0 = np.dot(ri, ri)
-    # We also track of |p_i|^2.
+    # We also keep track of |p_i|^2.
     psupi_norm2 = dri0
+    is_verbose = verbose >= 2
 
     while i <= maxiter:
         if np.sum(np.abs(ri)) <= tol:
+            if is_verbose:
+                print(
+                    f"  Inner CG solver iteration {i} stopped with\n"
+                    f"    sum(|residuals|) <= tol: {np.sum(np.abs(ri))} <= {tol}"
+                )
             break
 
         Ap = fhess_p(psupi)
         # check curvature
         curv = np.dot(psupi, Ap)
-        if 0 <= curv <= 16 * np.finfo(np.float64).eps * psupi_norm2:
+        if 0 <= curv <= eps * psupi_norm2:
             # See https://arxiv.org/abs/1803.02924, Algo 1 Capped Conjugate Gradient.
+            if is_verbose:
+                print(
+                    f"  Inner CG solver iteration {i} stopped with\n"
+                    f"    tiny_|p| = eps * ||p||^2, eps = {eps}, "
+                    f"squared L2 norm ||p||^2 = {psupi_norm2}\n"
+                    f"    curvature <= tiny_|p|: {curv} <= {eps * psupi_norm2}"
+                )
             break
         elif curv < 0:
             if i > 0:
+                if is_verbose:
+                    print(
+                        f"  Inner CG solver iteration {i} stopped with negative "
+                        f"curvature, curvature = {curv}"
+                    )
                 break
             else:
                 # fall back to steepest descent direction
                 xsupi += dri0 / curv * psupi
+                if is_verbose:
+                    print("  Inner CG solver iteration 0 fell back to steepest descent")
                 break
         alphai = dri0 / curv
         xsupi += alphai * psupi
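
A sketch (toy matrix, not from the commit) of the curvature test that the new messages report: the curvature p·Hp of the current CG direction is compared against eps * ||p||^2 with eps = 16 times the float64 machine epsilon, and negative curvature stops the inner solve:

    import numpy as np

    eps = 16 * np.finfo(np.float64).eps
    H = np.array([[2.0, 0.0], [0.0, -1.0]])  # indefinite Hessian
    p = np.array([0.1, 1.0])                 # current CG direction

    curv = p @ H @ p                         # 0.02 - 1.0 = -0.98
    psupi_norm2 = p @ p                      # ||p||^2 = 1.01
    print(curv < 0)                          # True -> negative curvature branch
    print(0 <= curv <= eps * psupi_norm2)    # False -> capped-CG branch not taken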
@@ -142,7 +190,11 @@ def _cg(fhess_p, fgrad, maxiter, tol):
         psupi_norm2 = dri1 + betai**2 * psupi_norm2
         i = i + 1
         dri0 = dri1  # update np.dot(ri,ri) for next time.
-
+    if is_verbose and i > maxiter:
+        print(
+            f"  Inner CG solver stopped reaching maxiter={i - 1} with "
+            f"sum(|residuals|) = {np.sum(np.abs(ri))}"
+        )
     return xsupi
 
 
@@ -157,6 +209,7 @@ def _newton_cg(
     maxinner=200,
     line_search=True,
     warn=True,
+    verbose=0,
 ):
     """
     Minimization of scalar function of one or more variables using the
@@ -210,6 +263,10 @@ def _newton_cg(
     if line_search:
         old_fval = func(x0, *args)
         old_old_fval = None
+    else:
+        old_fval = 0
+
+    is_verbose = verbose > 0
 
     # Outer loop: our Newton iteration
     while k < maxiter:
@@ -218,7 +275,13 @@ def _newton_cg(
         fgrad, fhess_p = grad_hess(xk, *args)
 
         absgrad = np.abs(fgrad)
-        if np.max(absgrad) <= tol:
+        max_absgrad = np.max(absgrad)
+        check = max_absgrad <= tol
+        if is_verbose:
+            print(f"Newton-CG iter = {k}")
+            print("  Check Convergence")
+            print(f"    max |gradient| <= tol: {max_absgrad} <= {tol} {check}")
+        if check:
             break
 
         maggrad = np.sum(absgrad)
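
A minimal sketch (made-up numbers) of the convergence check printed above, which uses the maximum absolute gradient entry:

    import numpy as np

    fgrad = np.array([2e-5, -7e-6, 1e-6])  # gradient at the current iterate
    tol = 1e-4

    max_absgrad = np.max(np.abs(fgrad))
    check = max_absgrad <= tol
    print(f"    max |gradient| <= tol: {max_absgrad} <= {tol} {check}")
    # prints:     max |gradient| <= tol: 2e-05 <= 0.0001 True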
@@ -227,14 +290,22 @@ def _newton_cg(
 
         # Inner loop: solve the Newton update by conjugate gradient, to
         # avoid inverting the Hessian
-        xsupi = _cg(fhess_p, fgrad, maxiter=maxinner, tol=termcond)
+        xsupi = _cg(fhess_p, fgrad, maxiter=maxinner, tol=termcond, verbose=verbose)
 
         alphak = 1.0
 
         if line_search:
             try:
                 alphak, fc, gc, old_fval, old_old_fval, gfkp1 = _line_search_wolfe12(
-                    func, grad, xk, xsupi, fgrad, old_fval, old_old_fval, args=args
+                    func,
+                    grad,
+                    xk,
+                    xsupi,
+                    fgrad,
+                    old_fval,
+                    old_old_fval,
+                    verbose=verbose,
+                    args=args,
                 )
             except _LineSearchError:
                 warnings.warn("Line Search failed")
@@ -245,9 +316,14 @@ def _newton_cg(
 
     if warn and k >= maxiter:
         warnings.warn(
-            "newton-cg failed to converge. Increase the number of iterations.",
+            (
+                f"newton-cg failed to converge at loss = {old_fval}. Increase the"
+                " number of iterations."
+            ),
             ConvergenceWarning,
         )
+    elif is_verbose:
+        print(f"  Solver did converge at loss = {old_fval}.")
     return xk, k
 
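
An end-to-end sketch of the private helper on a toy quadratic (the problem, tolerances and starting point are made up; the keyword names follow the call added in sklearn/linear_model/_logistic.py above). With verbose=2 the inner CG and line-search messages are printed as well:

    import numpy as np
    from sklearn.utils.optimize import _newton_cg

    A = np.array([[3.0, 1.0], [1.0, 2.0]])  # symmetric positive definite Hessian
    b = np.array([1.0, -1.0])

    def func(x):
        # f(x) = 0.5 x'Ax - b'x, minimized where Ax = b
        return 0.5 * x @ A @ x - b @ x

    def grad(x):
        return A @ x - b

    def grad_hess(x):
        # gradient plus a callable computing Hessian-vector products
        return grad(x), lambda p: A @ p

    xk, n_iter = _newton_cg(
        grad_hess=grad_hess, func=func, grad=grad, x0=np.zeros(2),
        maxiter=20, tol=1e-8, verbose=2,
    )
    print(xk, np.linalg.solve(A, b))  # both close to the exact minimizer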
