@@ -24,6 +24,9 @@ class ProxNewton(BaseSolver):
     tol : float, default 1e-4
         Tolerance for convergence.
 
+    fit_intercept : bool, default True
+        If ``True``, fits an unpenalized intercept.
+
     verbose : bool, default False
         Amount of verbosity. 0/False is silent.
 
@@ -53,7 +56,8 @@ def __init__(self, p0=10, max_iter=20, max_pn_iter=1000, tol=1e-4,
 
     def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None):
         n_samples, n_features = X.shape
-        w = np.zeros(n_features) if w_init is None else w_init
+        fit_intercept = self.fit_intercept
+        w = np.zeros(n_features + fit_intercept) if w_init is None else w_init
         Xw = np.zeros(n_samples) if Xw_init is None else Xw_init
         all_features = np.arange(n_features)
         stop_crit = 0.
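A side note on the new allocation: np.zeros(n_features + fit_intercept) relies on Python's bool-to-int coercion, so the coefficient vector gains exactly one extra slot when the intercept is fitted, and that last slot, w[-1], holds the intercept. A minimal sketch of the shape logic, with made-up values that are not part of the patch:

# illustrative only: a Python bool behaves as 0/1 in integer arithmetic
import numpy as np

n_features, fit_intercept = 5, True
w = np.zeros(n_features + fit_intercept)
print(w.shape)  # (6,) -- the last entry w[-1] is reserved for the intercept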
@@ -63,20 +67,38 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None):
         if is_sparse:
             X_bundles = (X.data, X.indptr, X.indices)
 
+        if len(w) != n_features + self.fit_intercept:
+            if self.fit_intercept:
+                val_error_message = (
+                    "w should be of size n_features + 1 when using fit_intercept=True: "
+                    f"expected {n_features + 1}, got {len(w)}.")
+            else:
+                val_error_message = (
+                    "w should be of size n_features: "
+                    f"expected {n_features}, got {len(w)}.")
+            raise ValueError(val_error_message)
+
         for t in range(self.max_iter):
             # compute scores
             if is_sparse:
                 grad = _construct_grad_sparse(
-                    *X_bundles, y, w, Xw, datafit, all_features)
+                    *X_bundles, y, w[:n_features], Xw, datafit, all_features)
             else:
-                grad = _construct_grad(X, y, w, Xw, datafit, all_features)
+                grad = _construct_grad(X, y, w[:n_features], Xw, datafit, all_features)
 
-            opt = penalty.subdiff_distance(w, grad, all_features)
+            opt = penalty.subdiff_distance(w[:n_features], grad, all_features)
+
+            # optimality of intercept
+            if fit_intercept:
+                # gradient w.r.t. intercept (constant feature of ones)
+                intercept_opt = np.abs(np.sum(datafit.raw_grad(y, Xw)))
+            else:
+                intercept_opt = 0.
 
             # check convergence
-            stop_crit = np.max(opt)
+            stop_crit = max(np.max(opt), intercept_opt)
             if self.verbose:
-                p_obj = datafit.value(y, w, Xw) + penalty.value(w)
+                p_obj = datafit.value(y, w, Xw) + penalty.value(w[:n_features])
                 print(
                     "Iteration {}: {:.10f}, ".format(t + 1, p_obj) +
                     "stopping crit: {:.2e}".format(stop_crit)
@@ -101,20 +123,22 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None):
                 # find descent direction
                 if is_sparse:
                     delta_w_ws, X_delta_w_ws = _descent_direction_s(
-                        *X_bundles, y, w, Xw, grad_ws, datafit,
+                        *X_bundles, y, w, Xw, fit_intercept, grad_ws, datafit,
                         penalty, ws, tol=EPS_TOL*tol_in)
                 else:
                     delta_w_ws, X_delta_w_ws = _descent_direction(
-                        X, y, w, Xw, grad_ws, datafit, penalty, ws, tol=EPS_TOL*tol_in)
+                        X, y, w, Xw, fit_intercept, grad_ws, datafit,
+                        penalty, ws, tol=EPS_TOL*tol_in)
 
                 # backtracking line search with inplace update of w, Xw
                 if is_sparse:
                     grad_ws[:] = _backtrack_line_search_s(
-                        *X_bundles, y, w, Xw, datafit, penalty, delta_w_ws,
-                        X_delta_w_ws, ws)
+                        *X_bundles, y, w, Xw, fit_intercept, datafit, penalty,
+                        delta_w_ws, X_delta_w_ws, ws)
                 else:
                     grad_ws[:] = _backtrack_line_search(
-                        X, y, w, Xw, datafit, penalty, delta_w_ws, X_delta_w_ws, ws)
+                        X, y, w, Xw, fit_intercept, datafit, penalty,
+                        delta_w_ws, X_delta_w_ws, ws)
 
                 # check convergence
                 opt_in = penalty.subdiff_distance(w, grad_ws, ws)
@@ -138,7 +162,7 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None):
 
 
 @njit
-def _descent_direction(X, y, w_epoch, Xw_epoch, grad_ws, datafit,
+def _descent_direction(X, y, w_epoch, Xw_epoch, fit_intercept, grad_ws, datafit,
                        penalty, ws, tol):
     # Given:
     #   1) b = \nabla F(X w_epoch)
@@ -152,11 +176,16 @@ def _descent_direction(X, y, w_epoch, Xw_epoch, grad_ws, datafit,
     for idx, j in enumerate(ws):
         lipschitz[idx] = raw_hess @ X[:, j] ** 2
 
-    # for a less costly stopping criterion, we do no compute the exact gradient,
-    # but store each coordinate-wise gradient every time we upate one coordinate:
+    # for a less costly stopping criterion, we do not compute the exact gradient,
+    # but store each coordinate-wise gradient every time we update one coordinate
     past_grads = np.zeros(len(ws))
     X_delta_w_ws = np.zeros(X.shape[0])
-    w_ws = w_epoch[ws]
+    ws_intercept = np.append(ws, -1) if fit_intercept else ws
+    w_ws = w_epoch[ws_intercept]
+
+    if fit_intercept:
+        lipschitz_intercept = np.sum(raw_hess)
+        grad_intercept = np.sum(datafit.raw_grad(y, Xw_epoch))
 
     for cd_iter in range(MAX_CD_ITER):
         for idx, j in enumerate(ws):
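Both the intercept_opt check in solve and grad_intercept here rest on the same identity: treating the intercept b as a constant feature of ones, the datafit gradient with respect to b is the sum of the per-sample raw gradients. A small self-contained check of that identity, using a logistic loss only as a stand-in for a skglm datafit (none of the names below come from the patch):

# finite-difference check of d(datafit)/d(intercept) == sum of per-sample raw gradients
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((20, 5))
y = rng.choice([-1.0, 1.0], size=20)
w, b = rng.standard_normal(5), 0.3

def loss(b):
    z = y * (X @ w + b)
    return np.mean(np.log(1 + np.exp(-z)))

def raw_grad(b):
    # per-sample derivative of the loss w.r.t. (Xw + b)_i, akin to datafit.raw_grad
    z = y * (X @ w + b)
    return -y / (1 + np.exp(z)) / len(y)

grad_b = np.sum(raw_grad(b))
fd = (loss(b + 1e-6) - loss(b - 1e-6)) / 2e-6
print(grad_b, fd)  # the two values should agree closely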
@@ -174,22 +203,35 @@ def _descent_direction(X, y, w_epoch, Xw_epoch, grad_ws, datafit,
             if w_ws[idx] != old_w_idx:
                 X_delta_w_ws += (w_ws[idx] - old_w_idx) * X[:, j]
 
+        if fit_intercept:
+            past_grads_intercept = grad_intercept + raw_hess @ X_delta_w_ws
+            old_intercept = w_ws[-1]
+            w_ws[-1] -= past_grads_intercept / lipschitz_intercept
+
+            if w_ws[-1] != old_intercept:
+                X_delta_w_ws += w_ws[-1] - old_intercept
+
         if cd_iter % 5 == 0:
             # TODO: can be improved by passing in w_ws but breaks for WeightedL1
             current_w = w_epoch.copy()
-            current_w[ws] = w_ws
+            current_w[ws_intercept] = w_ws
             opt = penalty.subdiff_distance(current_w, past_grads, ws)
-            if np.max(opt) <= tol:
+            stop_crit = np.max(opt)
+
+            if fit_intercept:
+                stop_crit = max(stop_crit, np.abs(past_grads_intercept))
+
+            if stop_crit <= tol:
                 break
 
     # descent direction
-    return w_ws - w_epoch[ws], X_delta_w_ws
+    return w_ws - w_epoch[ws_intercept], X_delta_w_ws
 
 
-# sparse version of _compute_descent_direction
+# sparse version of _descent_direction
 @njit
 def _descent_direction_s(X_data, X_indptr, X_indices, y, w_epoch,
-                         Xw_epoch, grad_ws, datafit, penalty, ws, tol):
+                         Xw_epoch, fit_intercept, grad_ws, datafit, penalty, ws, tol):
     raw_hess = datafit.raw_hessian(y, Xw_epoch)
 
     lipschitz = np.zeros(len(ws))
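The intercept update added in _descent_direction is a Newton-style coordinate step on an unpenalized coordinate: its curvature is lipschitz_intercept = sum(raw_hess) and its gradient at the current point is grad_intercept + raw_hess @ X_delta_w_ws, so no proximal step is applied, unlike the penalized coordinates above it. A hedged, self-contained illustration with a least-squares stand-in for the datafit, where this step is exact (all names below are mine, not from the patch):

# least-squares stand-in: raw_grad_i = (Xw + b - y)_i / n, raw_hess_i = 1 / n
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((30, 4))
y = rng.standard_normal(30)
w, b = rng.standard_normal(4), 0.0
n = len(y)

raw_grad = (X @ w + b - y) / n          # per-sample gradient w.r.t. Xw + b
raw_hess = np.full(n, 1 / n)            # per-sample second derivative

grad_intercept = np.sum(raw_grad)
lipschitz_intercept = np.sum(raw_hess)
b_new = b - grad_intercept / lipschitz_intercept

print(np.mean(y - X @ w - b_new))       # ~0: the intercept matches the residual mean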
@@ -201,7 +243,12 @@ def _descent_direction_s(X_data, X_indptr, X_indices, y, w_epoch,
     # see _descent_direction() comment
     past_grads = np.zeros(len(ws))
     X_delta_w_ws = np.zeros(len(y))
-    w_ws = w_epoch[ws]
+    ws_intercept = np.append(ws, -1) if fit_intercept else ws
+    w_ws = w_epoch[ws_intercept]
+
+    if fit_intercept:
+        lipschitz_intercept = np.sum(raw_hess)
+        grad_intercept = np.sum(datafit.raw_grad(y, Xw_epoch))
 
     for cd_iter in range(MAX_CD_ITER):
         for idx, j in enumerate(ws):
@@ -224,39 +271,57 @@ def _descent_direction_s(X_data, X_indptr, X_indices, y, w_epoch,
                 _update_X_delta_w(X_data, X_indptr, X_indices, X_delta_w_ws,
                                   w_ws[idx] - old_w_idx, j)
 
+        if fit_intercept:
+            past_grads_intercept = grad_intercept + raw_hess @ X_delta_w_ws
+            old_intercept = w_ws[-1]
+            w_ws[-1] -= past_grads_intercept / lipschitz_intercept
+
+            if w_ws[-1] != old_intercept:
+                X_delta_w_ws += w_ws[-1] - old_intercept
+
         if cd_iter % 5 == 0:
             # TODO: could be improved by passing in w_ws
             current_w = w_epoch.copy()
-            current_w[ws] = w_ws
+            current_w[ws_intercept] = w_ws
             opt = penalty.subdiff_distance(current_w, past_grads, ws)
-            if np.max(opt) <= tol:
+            stop_crit = np.max(opt)
+
+            if fit_intercept:
+                stop_crit = max(stop_crit, np.abs(past_grads_intercept))
+
+            if stop_crit <= tol:
                 break
 
     # descent direction
-    return w_ws - w_epoch[ws], X_delta_w_ws
+    return w_ws - w_epoch[ws_intercept], X_delta_w_ws
 
 
 @njit
-def _backtrack_line_search(X, y, w, Xw, datafit, penalty, delta_w_ws,
+def _backtrack_line_search(X, y, w, Xw, fit_intercept, datafit, penalty, delta_w_ws,
                            X_delta_w_ws, ws):
     # 1) find step in [0, 1] such that:
     #    penalty(w + step * delta_w) - penalty(w) +
     #    step * \nabla datafit(w + step * delta_w) @ delta_w < 0
     # ref: https://www.di.ens.fr/~aspremon/PDF/ENSAE/Newton.pdf
     # 2) inplace update of w and Xw and return grad_ws of the last w and Xw
     step, prev_step = 1., 0.
+    n_features = X.shape[1]
+    ws_intercept = np.append(ws, -1) if fit_intercept else ws
     # TODO: could be improved by passing in w[ws]
-    old_penalty_val = penalty.value(w)
+    old_penalty_val = penalty.value(w[:n_features])
 
     # try step = 1, 1/2, 1/4, ...
     for _ in range(MAX_BACKTRACK_ITER):
-        w[ws] += (step - prev_step) * delta_w_ws
+        w[ws_intercept] += (step - prev_step) * delta_w_ws
         Xw += (step - prev_step) * X_delta_w_ws
 
-        grad_ws = _construct_grad(X, y, w, Xw, datafit, ws)
+        grad_ws = _construct_grad(X, y, w[:n_features], Xw, datafit, ws)
         # TODO: could be improved by passing in w[ws]
-        stop_crit = penalty.value(w) - old_penalty_val
-        stop_crit += step * grad_ws @ delta_w_ws
+        stop_crit = penalty.value(w[:n_features]) - old_penalty_val
+        stop_crit += step * grad_ws @ delta_w_ws[:len(ws)]
+
+        if fit_intercept:
+            stop_crit += step * delta_w_ws[-1] * np.sum(datafit.raw_grad(y, Xw))
 
         if stop_crit < 0:
             break
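In the line search, the intercept only contributes through the directional derivative of the datafit, since it carries no penalty. With delta_b denoting the last entry of delta_w_ws (my notation, not the patch's), the sufficient-decrease test quoted in the function's comment extends to

    \mathrm{penalty}(w + \mathrm{step}\,\delta w) - \mathrm{penalty}(w) + \mathrm{step}\Big[\sum_{j \in ws} \nabla_j F\,\delta w_j + \delta b \sum_i \mathrm{raw\_grad}_i\Big] < 0,

which is what the two stop_crit increments above compute: the working-set term via grad_ws @ delta_w_ws[:len(ws)] and the intercept term via delta_w_ws[-1] * np.sum(datafit.raw_grad(y, Xw)).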
@@ -272,21 +337,26 @@ def _backtrack_line_search(X, y, w, Xw, datafit, penalty, delta_w_ws,
 
 # sparse version of _backtrack_line_search
 @njit
-def _backtrack_line_search_s(X_data, X_indptr, X_indices, y, w, Xw, datafit,
-                             penalty, delta_w_ws, X_delta_w_ws, ws):
+def _backtrack_line_search_s(X_data, X_indptr, X_indices, y, w, Xw, fit_intercept,
+                             datafit, penalty, delta_w_ws, X_delta_w_ws, ws):
     step, prev_step = 1., 0.
+    n_features = len(X_indptr) - 1
+    ws_intercept = np.append(ws, -1) if fit_intercept else ws
     # TODO: could be improved by passing in w[ws]
-    old_penalty_val = penalty.value(w)
+    old_penalty_val = penalty.value(w[:n_features])
 
     for _ in range(MAX_BACKTRACK_ITER):
-        w[ws] += (step - prev_step) * delta_w_ws
+        w[ws_intercept] += (step - prev_step) * delta_w_ws
         Xw += (step - prev_step) * X_delta_w_ws
 
         grad_ws = _construct_grad_sparse(X_data, X_indptr, X_indices,
-                                         y, w, Xw, datafit, ws)
+                                         y, w[:n_features], Xw, datafit, ws)
         # TODO: could be improved by passing in w[ws]
-        stop_crit = penalty.value(w) - old_penalty_val
-        stop_crit += step * grad_ws.T @ delta_w_ws
+        stop_crit = penalty.value(w[:n_features]) - old_penalty_val
+        stop_crit += step * grad_ws.T @ delta_w_ws[:len(ws)]
+
+        if fit_intercept:
+            stop_crit += step * delta_w_ws[-1] * np.sum(datafit.raw_grad(y, Xw))
 
         if stop_crit < 0:
             break