@@ -681,11 +681,10 @@ def update_gradient_hessian(self, X, y, sample_weight):
             hessian_out=self.h,
             n_threads=self.n_threads,
         )
-        # For non-canonical link functions and far away from the optimum, we take
-        # care that the hessian is at least non-negative. Tiny positive values are set
-        # to zero, too.
+        # For non-canonical link functions and far away from the optimum, we take care
+        # that the hessian is positive.
         eps = 16 * np.finfo(y.dtype).eps
-        self.h[self.h <= eps] = 0
+        self.h[self.h <= eps] = eps

         n_features = X.shape[1]
         # This duplicates a bit of code from LinearModelLoss.gradient.
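A minimal standalone sketch (not part of the diff) of the clipping introduced above: raising tiny or negative Hessian entries to eps keeps 1 / sqrt(h) finite, whereas the previous zeroing forced special-casing of later divisions. The array h is a made-up stand-in for the solver's per-sample Hessian values.

# Sketch, assuming a plain NumPy array in place of the solver's self.h.
import numpy as np

h = np.array([2.0, 1e-20, -1e-8, 0.5])  # hypothetical per-sample Hessian values
eps = 16 * np.finfo(h.dtype).eps

h_clipped = h.copy()
h_clipped[h_clipped <= eps] = eps        # strictly positive everywhere
print(1.0 / np.sqrt(h_clipped))          # finite, no division-by-zero handling needed

h_zeroed = h.copy()
h_zeroed[h_zeroed <= eps] = 0            # old behaviour: zeros must be special-cased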
@@ -704,13 +703,8 @@ def inner_solve(self, X, y, sample_weight):
         """
         n_samples, n_features = X.shape
         sqrt_h = np.sqrt(self.h)
-        # Take care of h = 0. Tiny h are already set to 0.
-        # If h = 0 we can exclude the corresponding row of X such that the value of b
-        # becomes irrelevant. We set it -g as if h = 1.
-        g_over_h_sqrt = self.g
-        g_over_h_sqrt[sqrt_h > 0] /= sqrt_h[sqrt_h > 0]

-        b = np.r_[-g_over_h_sqrt, -self.sqrt_P * self.coef[:n_features]]
+        b = np.r_[-self.g / sqrt_h, -self.sqrt_P * self.coef[:n_features]]

         if self.linear_loss.fit_intercept:
             n_dof = n_features + 1
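Because h is now strictly positive, the right-hand side of the least-squares system can be built with a plain elementwise division and no masking. A small sketch under made-up data; g, h, sqrt_P and coef are illustrative stand-ins for the solver's attributes, not the actual objects.

# Sketch of assembling b = [-g / sqrt(h), -sqrt_P * coef] with synthetic data.
import numpy as np

rng = np.random.default_rng(0)
n_samples, n_features = 5, 3
g = rng.normal(size=n_samples)                # stand-in per-sample gradient values
h = np.maximum(rng.random(n_samples), 1e-12)  # strictly positive Hessian values
sqrt_P = 1e-3                                 # stand-in square root of the penalty
coef = rng.normal(size=n_features)

sqrt_h = np.sqrt(h)
b = np.r_[-g / sqrt_h, -sqrt_P * coef]        # no masking of zero entries needed
print(b.shape)                                # (n_samples + n_features,)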
@@ -766,6 +760,7 @@ def rmatvec(x):
             damp=0,
             atol=self.tol / (max(1, self.A_norm) * max(1, self.r_norm)),
             btol=self.tol,
+            maxiter=max(n_samples, n_dof),  # default is min(A.shape)
             show=self.verbose >= 3,
         )
         # We store the estimated Frobenius norm of A and norm of residual r in
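For context, scipy.sparse.linalg.lsmr stops after maxiter iterations and defaults to maxiter = min(A.shape), which can be too small for ill-conditioned systems. The sketch below, on a made-up dense system, shows passing an explicit maxiter and how istop == 7 signals that the iteration limit was reached.

# Sketch of an LSMR call with an explicit maxiter, using synthetic data.
import numpy as np
from scipy.sparse.linalg import lsmr

rng = np.random.default_rng(0)
A = rng.normal(size=(100, 5))
b = rng.normal(size=100)

result = lsmr(A, b, damp=0, atol=1e-8, btol=1e-8, maxiter=max(A.shape))
x, istop, itn, normr, normar, norma, conda, normx = result
print(istop, itn)  # istop == 7 would mean maxiter was reached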
@@ -780,15 +775,28 @@ def rmatvec(x):
             conda,
             normx,
         ) = result
-        # LSMR reached maxiter.
-        eps = 4 * np.finfo(self.gradient.dtype).eps
-        if istop == 7:
+        # LSMR reached maxiter or produced an excessively large Newton step.
+        # Note: We could detect too large steps by comparing norm(coef_newton) = normx
+        # with norm(gradient). Instead of the gradient, we use the already available
+        # condition number of A.
+        if istop == 7 or normx > 1e2 * conda:
             if self.gradient_step == 0:
                 # We only need to throw this warning once.
+                if istop == 7:
+                    msg = (
+                        f"The inner solver of {self.__class__.__name__} reached "
+                        f"maxiter={itn} before the other stopping conditions were "
+                        f"satisfied at iteration #{self.iteration}. "
+                    )
+                else:
+                    msg = (
+                        f"The inner solver of {self.__class__.__name__} produced an"
+                        " excessively large Newton step at iteration"
+                        f" #{self.iteration}. "
+                    )
                 warnings.warn(
-                    f"The inner solver of {self.__class__.__name__} reached "
-                    "maxiter={itn} before the other stopping conditions were "
-                    "satisfied at iteration #{self.iteration}. It will now try a "
+                    msg
+                    + "It will now try a "
                     "simple gradient step. "
                     "Note that this warning is only raised once, the problem may, "
                     " however, occur in several or all iterations. Set verbose >= 1"