@@ -765,6 +765,10 @@ fortran_int_max(fortran_int x, fortran_int y) {
         INIT_OUTER_LOOP_5\
         npy_intp s5 = *steps++;
 
+#define INIT_OUTER_LOOP_7  \
+        INIT_OUTER_LOOP_6\
+        npy_intp s6 = *steps++;
+
 #define BEGIN_OUTER_LOOP_2 \
     for (N_ = 0;\
          N_ < dN;\
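As a reading aid, here is a rough expansion sketch (not part of the patch) of what INIT_OUTER_LOOP_7 unrolls to inside a gufunc body once the chained macros are substituted; the names dN, N_ and s0..s6 come from the existing INIT_OUTER_LOOP_* definitions in this file:

    /* Expansion sketch only: INIT_OUTER_LOOP_7 chains down to
     * INIT_OUTER_LOOP_1 and reads one outer stride per gufunc argument. */
    npy_intp dN = *dimensions++;    /* outer (broadcast) loop length */
    npy_intp N_;                    /* outer loop counter            */
    npy_intp s0 = *steps++;         /* outer stride of args[0]       */
    npy_intp s1 = *steps++;
    npy_intp s2 = *steps++;
    npy_intp s3 = *steps++;
    npy_intp s4 = *steps++;
    npy_intp s5 = *steps++;
    npy_intp s6 = *steps++;         /* new: outer stride of args[6]  */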
@@ -805,6 +809,17 @@ fortran_int_max(fortran_int x, fortran_int y) {
              args[4] += s4,\
              args[5] += s5) {
 
+#define BEGIN_OUTER_LOOP_7 \
+    for (N_ = 0;\
+         N_ < dN;\
+         N_++, args[0] += s0,\
+             args[1] += s1,\
+             args[2] += s2,\
+             args[3] += s3,\
+             args[4] += s4,\
+             args[5] += s5,\
+             args[6] += s6) {
+
 #define END_OUTER_LOOP  }
 
 static NPY_INLINE void
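A minimal usage sketch, mirroring how @TYPE@_lstsq further down in this diff pairs the new macros with END_OUTER_LOOP:

    /* Usage sketch: a seven-argument inner loop, advancing
     * args[0]..args[6] by their outer strides on every iteration. */
    INIT_OUTER_LOOP_7
    BEGIN_OUTER_LOOP_7
        /* per-broadcast-item work on args[0] .. args[6] goes here */
    END_OUTER_LOOP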
@@ -836,6 +851,7 @@ update_pointers(npy_uint8** bases, ptrdiff_t* offsets, size_t count)
    #typ = float, double, COMPLEX_t, DOUBLECOMPLEX_t#
    #copy = scopy, dcopy, ccopy, zcopy#
    #nan = s_nan, d_nan, c_nan, z_nan#
+   #zero = s_zero, d_zero, c_zero, z_zero#
  */
 static NPY_INLINE void *
 linearize_@TYPE@_matrix(void *dst_in,
@@ -949,6 +965,23 @@ nan_@TYPE@_matrix(void *dst_in, const LINEARIZE_DATA_t* data)
         }
     }
 
+static NPY_INLINE void
+zero_@TYPE@_matrix(void *dst_in, const LINEARIZE_DATA_t* data)
+{
+    @typ@ *dst = (@typ@ *) dst_in;
+
+    int i, j;
+    for (i = 0; i < data->rows; i++) {
+        @typ@ *cp = dst;
+        ptrdiff_t cs = data->column_strides/sizeof(@typ@);
+        for (j = 0; j < data->columns; ++j) {
+            *cp = @zero@;
+            cp += cs;
+        }
+        dst += data->row_strides/sizeof(@typ@);
+    }
+}
+
 /**end repeat**/
 
 /* identity square matrix generation */
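The /**begin repeat**/ block instantiates this helper once per type. For illustration only, the FLOAT instantiation would come out roughly as below (s_zero is the single-precision zero constant already defined elsewhere in this file):

    /* Illustrative expansion of the template above for @TYPE@ = FLOAT,
     * @typ@ = float, @zero@ = s_zero (sketch, not part of the patch). */
    static NPY_INLINE void
    zero_FLOAT_matrix(void *dst_in, const LINEARIZE_DATA_t* data)
    {
        float *dst = (float *) dst_in;

        int i, j;
        for (i = 0; i < data->rows; i++) {
            float *cp = dst;
            ptrdiff_t cs = data->column_strides/sizeof(float);
            for (j = 0; j < data->columns; ++j) {
                *cp = s_zero;                        /* zero one element */
                cp += cs;                            /* next column      */
            }
            dst += data->row_strides/sizeof(float);  /* next row         */
        }
    }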
@@ -3196,6 +3229,12 @@ init_@lapack_func@(GELSD_PARAMS_t *params,
    #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
    #REALTYPE=FLOAT,DOUBLE,FLOAT,DOUBLE#
    #lapack_func=sgelsd,dgelsd,cgelsd,zgelsd#
+   #dot_func=sdot,ddot,cdotc,zdotc#
+   #typ = npy_float, npy_double, npy_cfloat, npy_cdouble#
+   #basetyp = npy_float, npy_double, npy_float, npy_double#
+   #ftyp = fortran_real, fortran_doublereal,
+           fortran_complex, fortran_doublecomplex#
+   #cmplx = 0, 0, 1, 1#
  */
 static inline void
 release_@lapack_func@(GELSD_PARAMS_t* params)
@@ -3206,42 +3245,84 @@ release_@lapack_func@(GELSD_PARAMS_t* params)
     memset(params, 0, sizeof(*params));
 }
 
+/** Compute the squared l2 norm of a contiguous vector */
+static @basetyp@
+@TYPE@_abs2(@typ@ *p, npy_intp n) {
+    npy_intp i;
+    @basetyp@ res = 0;
+    for (i = 0; i < n; i++) {
+        @typ@ el = p[i];
+#if @cmplx@
+        res += el.real*el.real + el.imag*el.imag;
+#else
+        res += el*el;
+#endif
+    }
+    return res;
+}
+
 static void
 @TYPE@_lstsq(char **args, npy_intp *dimensions, npy_intp *steps,
              void *NPY_UNUSED(func))
 {
     GELSD_PARAMS_t params;
     int error_occurred = get_fp_invalid_and_clear();
     fortran_int n, m, nrhs;
-    INIT_OUTER_LOOP_6
+    fortran_int excess;
+
+    INIT_OUTER_LOOP_7
 
     m = (fortran_int)dimensions[0];
     n = (fortran_int)dimensions[1];
     nrhs = (fortran_int)dimensions[2];
+    excess = m - n;
 
     if (init_@lapack_func@(&params, m, n, nrhs)) {
-        LINEARIZE_DATA_t a_in, b_in, x_out, s_out;
+        LINEARIZE_DATA_t a_in, b_in, x_out, s_out, r_out;
 
         init_linearize_data(&a_in, n, m, steps[1], steps[0]);
         init_linearize_data_ex(&b_in, nrhs, m, steps[3], steps[2], fortran_int_max(n, m));
-        init_linearize_data(&x_out, nrhs, fortran_int_max(n, m), steps[5], steps[4]);
-        init_linearize_data(&s_out, 1, fortran_int_min(n, m), 1, steps[6]);
+        init_linearize_data_ex(&x_out, nrhs, n, steps[5], steps[4], fortran_int_max(n, m));
+        init_linearize_data(&r_out, 1, nrhs, 1, steps[6]);
+        init_linearize_data(&s_out, 1, fortran_int_min(n, m), 1, steps[7]);
 
-        BEGIN_OUTER_LOOP_6
+        BEGIN_OUTER_LOOP_7
             int not_ok;
             linearize_@TYPE@_matrix(params.A, args[0], &a_in);
             linearize_@TYPE@_matrix(params.B, args[1], &b_in);
             params.RCOND = args[2];
             not_ok = call_@lapack_func@(&params);
             if (!not_ok) {
                 delinearize_@TYPE@_matrix(args[3], params.B, &x_out);
-                *(npy_int*) args[4] = params.RANK;
-                delinearize_@REALTYPE@_matrix(args[5], params.S, &s_out);
+                *(npy_int*) args[5] = params.RANK;
+                delinearize_@REALTYPE@_matrix(args[6], params.S, &s_out);
+
+                /* Note that linalg.lstsq discards this when excess == 0 */
+                if (excess >= 0 && params.RANK == n) {
+                    /* Compute the residuals as the square sum of each column */
+                    int i;
+                    char *resid = args[4];
+                    @ftyp@ *components = (@ftyp@ *)params.B + n;
+                    for (i = 0; i < nrhs; i++) {
+                        @ftyp@ *vector = components + i*m;
+                        /* Numpy and fortran floating types are the same size,
+                         * so this cast is safe */
+                        @basetyp@ abs2 = @TYPE@_abs2((@typ@ *)vector, excess);
+                        memcpy(
+                            resid + i*r_out.column_strides,
+                            &abs2, sizeof(abs2));
+                    }
+                }
+                else {
+                    /* Note that this is always discarded by linalg.lstsq */
+                    nan_@REALTYPE@_matrix(args[4], &r_out);
+                }
             } else {
                 error_occurred = 1;
                 nan_@TYPE@_matrix(args[3], &x_out);
-                *(npy_int*) args[4] = -1;
-                nan_@REALTYPE@_matrix(args[5], &s_out);
+                nan_@REALTYPE@_matrix(args[4], &r_out);
+                *(npy_int*) args[5] = -1;
+                nan_@REALTYPE@_matrix(args[6], &s_out);
             }
         END_OUTER_LOOP
 
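The residual branch above relies on the documented behaviour of xGELSD: when m >= n and the effective rank equals n, rows n..m-1 of each solution column left in B hold the part of the right-hand side outside the column space of A, so the squared l2 norm of that tail is the residual sum of squares for that column. A plain double-precision sketch of the same computation, without the .src templating (column-major storage with leading dimension m is assumed, matching the linearized LAPACK buffer; the helper name is illustrative only):

    /* Sketch only: residual sum of squares per right-hand side, taken
     * from the m x nrhs buffer B as returned by dgelsd (column-major,
     * leading dimension m). Valid when excess = m - n >= 0 and the
     * reported rank equals n. */
    static void
    residuals_from_gelsd_B(const double *B, int m, int n, int nrhs,
                           double *resid)
    {
        int i, j;
        int excess = m - n;
        for (i = 0; i < nrhs; i++) {
            /* rows n..m-1 of column i */
            const double *tail = B + i * m + n;
            double abs2 = 0.0;
            for (j = 0; j < excess; j++) {
                abs2 += tail[j] * tail[j];
            }
            resid[i] = abs2;
        }
    }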
@@ -3389,12 +3470,12 @@ static char svd_1_3_types[] = {
     NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE
 };
 
-/* A, b, rcond, x, rank, s */
+/* A, b, rcond, x, resid, rank, s, */
 static char lstsq_types[] = {
-    NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_INT, NPY_FLOAT,
-    NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_INT, NPY_DOUBLE,
-    NPY_CFLOAT, NPY_CFLOAT, NPY_FLOAT, NPY_CFLOAT, NPY_INT, NPY_FLOAT,
-    NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE, NPY_INT, NPY_DOUBLE
+    NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_INT, NPY_FLOAT,
+    NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_INT, NPY_DOUBLE,
+    NPY_CFLOAT, NPY_CFLOAT, NPY_FLOAT, NPY_CFLOAT, NPY_FLOAT, NPY_INT, NPY_FLOAT,
+    NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_INT, NPY_DOUBLE,
 };
 
 typedef struct gufunc_descriptor_struct {
@@ -3590,19 +3671,19 @@ GUFUNC_DESCRIPTOR_t gufunc_descriptors[] = {
     },
     {
         "lstsq_m",
-        "(m,n),(m,nrhs),()->(n,nrhs),(),(m)",
+        "(m,n),(m,nrhs),()->(n,nrhs),(nrhs),(),(m)",
         "least squares on the last two dimensions and broadcast to the rest. \n"\
         "For m <= n. \n",
-        4, 3, 3,
+        4, 3, 4,
         FUNC_ARRAY_NAME(lstsq),
         lstsq_types
     },
     {
         "lstsq_n",
-        "(m,n),(m,nrhs),()->(m,nrhs),(),(n)",
+        "(m,n),(m,nrhs),()->(n,nrhs),(nrhs),(),(n)",
         "least squares on the last two dimensions and broadcast to the rest. \n"\
-        "For m >= n. \n",
-        4, 3, 3,
+        "For m >= n, meaning that residuals are produced. \n",
+        4, 3, 4,
         FUNC_ARRAY_NAME(lstsq),
         lstsq_types
     }