8000 MAINT: lstsq: compute residuals inside the ufunc by eric-wieser · Pull Request #10890 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

MAINT: lstsq: compute residuals inside the ufunc #10890

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 20, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 2 additions & 10 deletions numpy/linalg/linalg.py
Original file line number Diff line number Diff line change
Expand Up @@ -2036,17 +2036,9 @@ def lstsq(a, b, rcond="warn"):
else:
gufunc = _umath_linalg.lstsq_n

signature = 'DDd->Did' if isComplexType(t) else 'ddd->did'
signature = 'DDd->Ddid' if isComplexType(t) else 'ddd->ddid'
extobj = get_linalg_error_extobj(_raise_linalgerror_lstsq)
b_out, rank, s = gufunc(a, b, rcond, signature=signature, extobj=extobj)

# b_out contains both the solution and the components of the residuals
x = b_out[...,:n,:]
r_parts = b_out[...,n:,:]
if isComplexType(t):
resids = sum(abs(r_parts)**2, axis=-2)
else:
resids = sum(r_parts**2, axis=-2)
x, resids, rank, s = gufunc(a, b, rcond, signature=signature, extobj=extobj)

# remove the axis we added
if is_1d:
Expand Down
119 changes: 100 additions & 19 deletions numpy/linalg/umath_linalg.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -765,6 +765,10 @@ fortran_int_max(fortran_int x, fortran_int y) {
INIT_OUTER_LOOP_5\
npy_intp s5 = *steps++;

#define INIT_OUTER_LOOP_7 \
INIT_OUTER_LOOP_6\
npy_intp s6 = *steps++;

#define BEGIN_OUTER_LOOP_2 \
for (N_ = 0;\
N_ < dN;\
Expand Down Expand Up @@ -805,6 +809,17 @@ fortran_int_max(fortran_int x, fortran_int y) {
args[4] += s4,\
args[5] += s5) {

#define BEGIN_OUTER_LOOP_7 \
for (N_ = 0;\
N_ < dN;\
N_++, args[0] += s0,\
args[1] += s1,\
args[2] += s2,\
args[3] += s3,\
args[4] += s4,\
args[5] += s5,\
args[6] += s6) {

#define END_OUTER_LOOP }

static NPY_INLINE void
Expand Down Expand Up @@ -836,6 +851,7 @@ update_pointers(npy_uint8** bases, ptrdiff_t* offsets, size_t count)
#typ = float, double, COMPLEX_t, DOUBLECOMPLEX_t#
#copy = scopy, dcopy, ccopy, zcopy#
#nan = s_nan, d_nan, c_nan, z_nan#
#zero = s_zero, d_zero, c_zero, z_zero#
*/
static NPY_INLINE void *
linearize_@TYPE@_matrix(void *dst_in,
Expand Down Expand Up @@ -949,6 +965,23 @@ nan_@TYPE@_matrix(void *dst_in, const LINEARIZE_DATA_t* data)
}
}

static NPY_INLINE void
zero_@TYPE@_matrix(void *dst_in, const LINEARIZE_DATA_t* data)
{
@typ@ *dst = (@typ@ *) dst_in;

int i, j;
for (i = 0; i < data->rows; i++) {
@typ@ *cp = dst;
ptrdiff_t cs = data->column_strides/sizeof(@typ@);
for (j = 0; j < data->columns; ++j) {
*cp = @zero@;
cp += cs;
}
dst += data->row_strides/sizeof(@typ@);
}
}

/**end repeat**/

/* identity square matrix generation */
Expand Down Expand Up @@ -3196,6 +3229,12 @@ init_@lapack_func@(GELSD_PARAMS_t *params,
#TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
#REALTYPE=FLOAT,DOUBLE,FLOAT,DOUBLE#
#lapack_func=sgelsd,dgelsd,cgelsd,zgelsd#
#dot_func=sdot,ddot,cdotc,zdotc#
#typ = npy_float, npy_double, npy_cfloat, npy_cdouble#
#basetyp = npy_float, npy_double, npy_float, npy_double#
#ftyp = fortran_real, fortran_doublereal,
fortran_complex, fortran_doublecomplex#
#cmplx = 0, 0, 1, 1#
*/
static inline void
release_@lapack_func@(GELSD_PARAMS_t* params)
Expand All @@ -3206,42 +3245,84 @@ release_@lapack_func@(GELSD_PARAMS_t* params)
memset(params, 0, sizeof(*params));
}

/** Compute the squared l2 norm of a contiguous vector */
static @basetyp@
@TYPE@_abs2(@typ@ *p, npy_intp n) {
npy_intp i;
@basetyp@ res = 0;
for (i = 0; i < n; i++) {
@typ@ el = p[i];
#if @cmplx@
res += el.real*el.real + el.imag*el.imag;
#else
res += el*el;
#endif
}
return res;
}

static void
@TYPE@_lstsq(char **args, npy_intp *dimensions, npy_intp *steps,
void *NPY_UNUSED(func))
{
GELSD_PARAMS_t params;
int error_occurred = get_fp_invalid_and_clear();
fortran_int n, m, nrhs;
INIT_OUTER_LOOP_6
fortran_int excess;

INIT_OUTER_LOOP_7

m = (fortran_int)dimensions[0];
n = (fortran_int)dimensions[1];
nrhs = (fortran_int)dimensions[2];
excess = m - n;

if (init_@lapack_func@(&params, m, n, nrhs)) {
LINEARIZE_DATA_t a_in, b_in, x_out, s_out;
LINEARIZE_DATA_t a_in, b_in, x_out, s_out, r_out;

init_linearize_data(&a_in, n, m, steps[1], steps[0]);
init_linearize_data_ex(&b_in, nrhs, m, steps[3], steps[2], fortran_int_max(n, m));
init_linearize_data(&x_out, nrhs, fortran_int_max(n, m), steps[5], steps[4]);
init_linearize_data(&s_out, 1, fortran_int_min(n, m), 1, steps[6]);
init_linearize_data_ex(&x_out, nrhs, n, steps[5], steps[4], fortran_int_max(n, m));
init_linearize_data(&r_out, 1, nrhs, 1, steps[6]);
init_linearize_data(&s_out, 1, fortran_int_min(n, m), 1, steps[7]);

BEGIN_OUTER_LOOP_6
BEGIN_OUTER_LOOP_7
int not_ok;
linearize_@TYPE@_matrix(params.A, args[0], &a_in);
linearize_@TYPE@_matrix(params.B, args[1], &b_in);
params.RCOND = args[2];
not_ok = call_@lapack_func@(&params);
if (!not_ok) {
delinearize_@TYPE@_matrix(args[3], params.B, &x_out);
*(npy_int*) args[4] = params.RANK;
delinearize_@REALTYPE@_matrix(args[5], params.S, &s_out);
*(npy_int*) args[5] = params.RANK;
delinearize_@REALTYPE@_matrix(args[6], params.S, &s_out);

/* Note that linalg.lstsq discards this when excess == 0 */
if (excess >= 0 && params.RANK == n) {
/* Compute the residuals as the square sum of each column */
int i;
char *resid = args[4];
@ftyp@ *components = (@ftyp@ *)params.B + n;
for (i = 0; i < nrhs; i++) {
@ftyp@ *vector = components + i*m;
/* Numpy and fortran floating types are the same size,
* so this case is safe */
@basetyp@ abs2 = @TYPE@_abs2((@typ@ *)vector, excess);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Am not sure this is worth splitting out as a function: it feels to me like it just becomes less readable.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We're already at 5 levels of indentation here - I didn't want to introduce a 6th as well as #ifdefs.

To me, it seems clearer to try and fold the #ifdefs into the function definitions as much as possible.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Splitting the function also makes it easier to drop in a lapack/blas implementation of them if someone finds that's faster (and cares)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, it was not a big deal.

memcpy(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My poor knowledge of C does not really help here, but memcpy feels very odd. Can one not just assign?

Copy link
Member Author
@eric-wieser eric-wieser Apr 20, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If resid + i*r_out.column_strides is not aligned, there's no guarantee that the write will succeed. Unless there's some gufunc magic wrapping us that ensures this is the case, memcpy is safer.

delinearize_@TYPE@_matrix uses memcpy, which is where we copy between numpy and the other lapack buffers - so memcpy is at least consistent.

Also, something-something-strict-aliasing.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I fear I remain confused, but fine to go with something that works!

Copy link
Member Author
@eric-wieser eric-wieser Apr 20, A3D4 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A simple example: This will not work on some platforms:

char bytes[8] = {0};
(uint32_t *)&bytes[1] = 0x12345678;  // unaligned 32-bit write may fail
assert((uint32_t *)&bytes[0] == 0x00123456);  // assuming big endian

But using memcpy would work every time

Relatedly: https://www.alfonsobeato.net/arm/how-to-access-safely-unaligned-data/

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, thanks that makes it clearer (link is a good one too)

resid + i*r_out.column_strides,
&abs2, sizeof(abs2));
}
}
else {
/* Note that this is always discarded by linalg.lstsq */
nan_@REALTYPE@_matrix(args[4], &r_out);
}
} else {
error_occurred = 1;
nan_@TYPE@_matrix(args[3], &x_out);
*(npy_int*) args[4] = -1;
nan_@REALTYPE@_matrix(args[5], &s_out);
nan_@REALTYPE@_matrix(args[4], &r_out);
*(npy_int*) args[5] = -1;
nan_@REALTYPE@_matrix(args[6], &s_out);
}
END_OUTER_LOOP

Expand Down Expand Up @@ -3389,12 +3470,12 @@ static char svd_1_3_types[] = {
NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE
};

/* A, b, rcond, x, rank, s */
/* A, b, rcond, x, resid, rank, s, */
static char lstsq_types[] = {
NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_INT, NPY_FLOAT,
NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_INT, NPY_DOUBLE,
NPY_CFLOAT, NPY_CFLOAT, NPY_FLOAT, NPY_CFLOAT, NPY_INT, NPY_FLOAT,
NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE, NPY_INT, NPY_DOUBLE
NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_INT, NPY_FLOAT,
NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_INT, NPY_DOUBLE,
NPY_CFLOAT, NPY_CFLOAT, NPY_FLOAT, NPY_CFLOAT, NPY_FLOAT, NPY_INT, NPY_FLOAT,
NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_INT, NPY_DOUBLE,
};

typedef struct gufunc_descriptor_struct {
Expand Down Expand Up @@ -3590,19 +3671,19 @@ GUFUNC_DESCRIPTOR_t gufunc_descriptors [] = {
},
{
"lstsq_m",
"(m,n),(m,nrhs),()->(n,nrhs),(),(m)",
"(m,n),(m,nrhs),()->(n,nrhs),(nrhs),(),(m)",
"least squares on the last two dimensions and broadcast to the rest. \n"\
"For m <= n. \n",
4, 3, 3,
4, 3, 4,
FUNC_ARRAY_NAME(lstsq),
lstsq_types
},
{
"lstsq_n",
"(m,n),(m,nrhs),()->(m,nrhs),(),(n)",
"(m,n),(m,nrhs),()->(n,nrhs),(nrhs),(),(n)",
"least squares on the last two dimensions and broadcast to the rest. \n"\
"For m >= n. \n",
4, 3, 3,
"For m >= n, meaning that residuals are produced. \n",
4, 3, 4,
FUNC_ARRAY_NAME(lstsq),
lstsq_types
}
Expand Down
0