8000 ENH: added functionality nancov to numpy · numpy/numpy@7620967 · GitHub
[go: up one dir, main page]

Skip to content

Commit 7620967

Browse files
author
David Freese
committed
ENH: added functionality nancov to numpy
Adds a function nancov to nanfunctions in numpy that mimics cov, while ignoring nan values. Implements two different ways for ignoring nans. The default eliminates nan observations across all variables, while the pairwise option only eliminates nan observations in respective pairs of variables. The bias option was dropped leaving ddof as the primary method of handling bias in the estimate. fweights and aweights are supported.
1 parent 2748d2e commit 7620967

File tree

5 files changed

+343
-2
lines changed

5 files changed

+343
-2
lines changed

doc/release/1.14.0-notes.rst

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -461,6 +461,18 @@ small relative to the potential improvement in speed.
461461
------------------------------------------
462462
``f2py`` now allows for the allocation of arrays of dimension 0. This allows
463463
for more consistent handling of corner cases downstream.
464+
The ``repr`` of ``np.polynomial`` classes is more explicit
465+
----------------------------------------------------------
466+
It now shows the domain and window parameters as keyword arguments to make
467+
them more clear::
468+
469+
>>> np.polynomial.Polynomial(range(4))
470+
Polynomial([ 0., 1., 2., 3.], domain=[-1, 1], window=[-1, 1])
471+
472+
f2py now handles arrays of dimension 0
473+
--------------------------------------
474+
f2py now allows for the allocation of arrays of dimension 0. This allows for
475+
more consistent handling of corner cases downstream.
464476

465477
``numpy.distutils`` supports using MSVC and mingw64-gfortran together
466478
---------------------------------------------------------------------

doc/release/1.15.0-notes.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ These compute the greatest common divisor, and lowest common multiple,
5858
respectively. These work on all the numpy integer types, as well as the
5959
builtin arbitrary-precision `Decimal` and `long` types.
6060

61+
New nanfunction ``nancov`` added
62+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
63+
Nanfunction ``nancov`` has been added to compute ``cov`` by ignoring nans.
6164

6265
Improvements
6366
============

doc/source/reference/routines.statistics.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ Correlating
4343
corrcoef
4444
correlate
4545
cov
46+
nancov
4647

4748
Histograms
4849
----------

numpy/lib/nanfunctions.py

Lines changed: 244 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
- `nanstd` -- standard deviation of non-NaN values
1818
- `nanmedian` -- median of non-NaN values
1919
- `nanpercentile` -- qth percentile of non-NaN values
20+
- `nancov` -- covariance matrix of non-Nan values
2021
2122
"""
2223
from __future__ import division, absolute_import, print_function
@@ -29,7 +30,7 @@
2930
__all__ = [
3031
'nansum', 'nanmax', 'nanmin', 'nanargmax', 'nanargmin', 'nanmean',
3132
'nanmedian', 'nanpercentile', 'nanvar', 'nanstd', 'nanprod',
32-
'nancumsum', 'nancumprod'
33+
'nancumsum', 'nancumprod', 'nancov'
3334
]
3435

3536

@@ -1441,3 +1442,245 @@ def nanstd(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
14411442
else:
14421443
std = var.dtype.type(np.sqrt(var))
14431444
return std
1445+
1446+
1447+
def nancov(m, y=None, rowvar=1, ddof=1, fweights=None, aweights=None,
1448+
pairwise=False):
1449+
"""
1450+
Estimate a covariance matrix, given data, ignoring observations containing
1451+
NaN values.
1452+
1453+
Covariance indicates the level to which two variables vary together.
1454+
If we examine N-dimensional samples, :math:`X = [x_1, x_2, ... x_N]^T`,
1455+
then the covariance matrix element :math:`C_{ij}` is the covariance of
1456+
:math:`x_i` and :math:`x_j`. The element :math:`C_{ii}` is the variance
1457+
of :math:`x_i`.
1458+
1459+
.. versionadded:: 1.14.0
1460+
1461+
Parameters
1462+
----------
1463+
m : array_like
1464+
A 1-D or 2-D array containing multiple variables and observations.
1465+
Each row of `m` represents a variable, and each column a single
1466+
observation of all those variables. Also see `rowvar` below.
1467+
y : array_like, optional
1468+
An additional set of variables and observations. `y` has the same
1469+
form as that of `m`.
1470+
rowvar : int, optional
1471+
If `rowvar` is non-zero (default), then each row represents a
1472+
variable, with observations in the columns. Otherwise, the relationship
1473+
is transposed: each column represents a variable, while the rows
1474+
contain observations.
1475+
ddof : int, optional
1476+
Normalization is by ``(N - ddof)``, where ``N`` is
1477+
the number of observations. The default value is ``1``.
1478+
fweights : array_like, int, optional
1479+
1-D array of integer freguency weights; the number of times each
1480+
observation vector should be repeated.
1481+
aweights : array_like, optional
1482+
1-D array of observation vector weights. These relative weights are
1483+
typically large for observations considered "important" and smaller for
1484+
observations considered less "important". If ``ddof=0`` the array of
1485+
weights can be used to assign probabilities to observation vectors.
1486+
pairwise : bool, optional
1487+
If ``False`` then a NaN value causes that observation to be eliminated
1488+
across all variables. If ``True`` then each pair of variables is
1489+
compared separately. Observations with NaN values in the pair are
1490+
removed, but do not affect other pairs. The returned covariance matrix
1491+
may no longer be positive semi-definite in this case. False is
1492+
equivalent to the ``na.or.complete`` use flag for cov in R, while
1493+
``True`` is equivalent to ``pairwise.complete.obs``.
1494+
1495+
Returns
1496+
-------
1497+
out : ndarray
1498+
The covariance matrix of the variables.
1499+
1500+
See Also
1501+
--------
1502+
corrcoef : Normalized covariance matrix
1503+
1504+
Examples
1505+
--------
1506+
Consider two variables, :math:`x_0` and :math:`x_1`, which
1507+
correlate perfectly, but in opposite directions:
1508+
1509+
>>> x = np.array([[0, 2], [1, 1], [np.nan, 1], [2, 0]]).T
1510+
>>> x
1511+
array([[0, 1, nan, 2],
1512+
[2, 1, 1, 0]])
1513+
1514+
Note how :math:`x_0` increases while :math:`x_1` decreases. The covariance
1515+
matrix shows this clearly:
1516+
1517+
>>> np.nancov(x)
1518+
array([[ 1., -1.],
1519+
[-1., 1.]])
1520+
1521+
Note that element :math:`C_{0,1}`, which shows the correlation between
1522+
:math:`x_0` and :math:`x_1`, is negative.
1523+
1524+
Further, note how `x` and `y` are combined:
1525+
1526+
>>> x = [-2.1, -1, np.nan, 4.3]
1527+
>>> y = [3, 1.1, 1, 0.12]
1528+
>>> X = np.vstack((x,y))
1529+
>>> print np.nancov(X)
1530+
[[ 11.71 -4.286 ]
1531+
[ -4.286 2.14413333]]
1532+
>>> print np.nancov(x, y)
1533+
[[ 11.71 -4.286 ]
1534+
[ -4.286 2.14413333]]
1535+
>>> print np.nancov(x)
1536+
11.71
1537+
1538+
Finally it can be seen how the pairwise option handles nan values
1539+
differently than when when the pairwise option is disabled (default).
1540+
1541+
>>> X = np.array([[ 1., 4., 8., np.nan, 16.],
1542+
... [ 0., 2., 4., 6., 4.],
1543+
... [-1., 3., np.nan, np.nan, 4.]])
1544+
>>> np.nancov(X)
1545+
array([[ 63. , 15. , 16.5],
1546+
[ 15. , 4. , 5. ],
1547+
[ 16.5, 5. , 7. ]])
1548+
>>> np.nancov(X, pairwise=True)
1549+
array([[ 42.25, 10.5 , 16.5 ],
1550+
[ 10.5 , 5.2 , 5. ],
1551+
[ 16.5 , 5. , 7. ]])
1552+
"""
1553+
# Check inputs
1554+
if ddof is not None and ddof != int(ddof):
1555+
raise ValueError(
1556+
"ddof must be integer")
1557+
1558+
m = np.asarray(m)
1559+
if m.ndim > 2:
1560+
raise ValueError("m has more than 2 dimensions")
1561+
1562+
if y is None:
1563+
dtype = np.result_type(m, np.float64)
1564+
else:
1565+
y = np.asarray(y)
1566+
if y.ndim > 2:
1567+
raise ValueError("y has more than 2 dimensions")
1568+
dtype = np.result_type(m, y, np.float64)
1569+
X = np.array(m, ndmin=2, dtype=dtype)
1570+
1571+
if X.shape[0] == 1:
1572+
rowvar = 1
1573+
if rowvar:
1574+
axis = 0
1575+
else:
1576+
axis = 1
1577+
1578+
if y is not None:
1579+
y = np.array(y, copy=False, ndmin=2, dtype=dtype)
1580+
X = np.concatenate((X, y), axis)
1581+
1582+
if axis:
1583+
n = X.shape[0]
1584+
else:
1585+
n = X.shape[1]
1586+
1587+
# Get the product of frequencies and weights
1588+
if fweights is not None:
1589+
fweights = np.asarray(fweights, dtype=np.float64)
1590+
if not np.all(fweights == np.around(fweights)):
1591+
raise TypeError(
1592+
"fweights must be integer")
1593+
if fweights.ndim > 1:
1594+
raise RuntimeError(
1595+
"cannot handle multidimensional fweights")
1596+
if fweights.shape[0] != n:
1597+
raise RuntimeError(
1598+
"incompatible numbers of samples and fweights")
1599+
if any(fweights < 0):
1600+
raise ValueError(
1601+
"fweights cannot be negative")
1602+
if aweights is not None:
1603+
aweights = np.asarray(aweights, dtype=np.float64)
1604+
if aweights.ndim > 1:
1605+
raise RuntimeError(
1606+
"cannot handle multidimensional aweights")
1607+
if aweights.shape[0] != n:
1608+
raise RuntimeError(
1609+
"incompatible numbers of samples and aweights")
1610+
if any(aweights < 0):
1611+
raise ValueError(
1612+
"aweights cannot be negative")
1613+
1614+
if pairwise:
1615+
w = None
1616+
if fweights is not None:
1617+
w = fweights
1618+
if aweights is not None:
1619+
if w is None:
1620+
w = awei 10000 ghts
1621+
else:
1622+
w *= aweights
1623+
1624+
nan_vals = np.isnan(X)
1625+
if axis:
1626+
one_array = np.ones(X.T.shape)
1627+
one_array[nan_vals.T] = np.nan
1628+
else:
1629+
one_array = np.ones(X.shape)
1630+
one_array[nan_vals] = np.nan
1631+
# each pair may cause a unique mean for the variable, so we create
1632+
# pair_array which has the correctly nan-ed values.
1633+
if axis:
1634+
pair_array = X[:, None, :].swapaxes(0, 2) * one_array[None, :, :]
1635+
else:
1636+
pair_array = X[:, None, :] * one_array[None, :, :]
1637+
1638+
if w is None:
1639+
pair_means = nanmean(pair_array, axis=2)
1640+
pair_w_sum = nansum(~np.isnan(pair_array), axis=2)
1641+
else:
1642+
pair_w = w[None, None, :] * ~np.isnan(pair_array)
1643+
pair_w_sum = np.nansum(pair_w, axis=2)
1644+
pair_means = nansum(w[None, None, :] * pair_array,
1645+
axis=2) / pair_w_sum
1646+
pair_array -= pair_means[:, :, None]
1647+
1648+
if w is None:
1649+
dotted = pair_array * pair_array.swapaxes(0, 1).conj()
1650+
else:
1651+
dotted = pair_array * \
1652+
(w[None, None, :] * pair_array).swapaxes(0, 1).conj()
1653+
1654+
c = nansum(dotted, axis=2)
1655+
1656+
if aweights is None:
1657+
fact = pair_w_sum - ddof
1658+
else:
1659+
ddof_weight = nansum(aweights[None, None, :] * pair_w, axis=2)
1660+
fact = pair_w_sum - ddof * ddof_weight / pair_w_sum
1661+
1662+
if np.any(fact <= 0):
1663+
warnings.warn("Degrees of freedom <= 0 for a slice",
1664+
RuntimeWarning, stacklevel=2)
1665+
fact[fact <= 0] = 0.0
1666+
1667+
c *= 1. / fact.astype(np.float64)
1668+
return c
1669+
else:
1670+
# "Complete" version for handling nans where a nan value in any
1671+
# variable causes the observation to be removed from all variables.
1672+
1673+
# Find observations with nans
1674+
nan_obvs = np.any(np.isnan(X), axis=axis)
1675+
1676+
if fweights is not None:
1677+
fweights = fweights[~nan_obvs]
1678+
if aweights is not None:
1679+
aweights = aweights[~nan_obvs]
1680+
1681+
if axis:
1682+
X_nonan = X[~nan_obvs, :]
1683+
else:
1684+
X_nonan = X[:, ~nan_obvs]
1685+
return np.cov(X_nonan, rowvar=rowvar, ddof=ddof, fweights=fweights,
1686+
aweights=aweights)

numpy/lib/tests/test_nanfunctions.py

Lines changed: 83 additions & 1 deletion
936
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
import numpy as np
66
from numpy.testing import (
77
run_module_suite, assert_, assert_equal, assert_almost_equal,
8-
assert_no_warnings, assert_raises, assert_array_equal, suppress_warnings
8+
assert_no_warnings, assert_raises, assert_array_equal, suppress_warnings,
9+
assert_allclose,
910
)
1011

1112

@@ -888,5 +889,86 @@ def test_multiple_percentiles(self):
888889
assert_equal(np.nanpercentile(megamat, perc, axis=(1, 2)).shape, (2, 3, 6))
889890

890891

892+
class TestNanFunctions_Cov(object):
893+
x = np.array([[1, 2, 10, 3]])
894+
y = np.array([[0, 1, np.nan, 0]])
895+
X = np.vstack((x, y))
896+
X_no_nan_complete = X[:, (0, 1, 3)]
897+
898+
fweights = np.array([1, 2, 3, 1])
899+
aweights = np.array([0.9, 1, 0.8, 1.1])
900+
mask = ~np.isnan(y).reshape(-1,)
901+
fweights_no_nan_complete = fweights[mask]
902+
aweights_no_nan_complete = aweights[mask]
903+
904+
def test_basic(self):
905+
assert_equal(np.nancov(self.X), np.cov(self.X_no_nan_complete))
906+
assert_equal(np.nancov(self.X.T, rowvar=0),
907+
np.cov(self.X_no_nan_complete))
908+
909+
def test_xy(self):
910+
assert_equal(np.nancov(self.x, self.y),
911+
np.cov(self.X_no_nan_complete))
912+
913+
def test_complex(self):
914+
x = np.array([[1, 2, 10, 3], [1j, 2j, np.nan, 3j]])
915+
assert_allclose(np.nancov(x), np.array([[1., -1.j], [1.j, 1.]]))
916+
917+
def test_weights(self):
918+
val = np.nancov(self.X,
919+
fweights=self.fweights,
920+
aweights=self.aweights)
921+
nonan_val = np.cov(self.X_no_nan_complete,
922+
fweights=self.fweights_no_nan_complete,
923+
aweights=self.aweights_no_nan_complete)
924+
assert_equal(val, nonan_val)
925+
926+
def test_empty(self):
927+
with warnings.catch_warnings(record=True):
928+
warnings.simplefilter('always', RuntimeWarning)
929+
assert_array_equal(np.nancov(np.array([])), np.nan)
930+
assert_array_equal(np.nancov(np.array([]).reshape(0, 2)),
931+
np.array([]).reshape(0, 0))
932+
assert_array_equal(np.nancov(np.array([]).reshape(2, 0)),
933+
np.array([[np.nan, np.nan], [np.nan, np.nan]]))
934+
935+
def test_wrong_ddof(self):
+
x = np.array([[0, 2], [np.nan, 1], [2, 0]]).T
937+
with warnings.catch_warnings(record=True) as w:
938+
warnings.simplefilter('always')
939+
assert_(np.isinf(np.nancov(x, ddof=3)).all())
940+
assert_(len(w) == 2)
941+
# Degree of freedom <= 0
942+
assert_(issubclass(w[0].category, RuntimeWarning))
943+
# Divide by zero error
944+
assert_(issubclass(w[1].category, RuntimeWarning))
945+
946+
def test_pairwise(self):
947+
x = np.array([[ 1., 2., 10., np.nan, 5.],
948+
[ 4., 10., 13., np.nan, 8.],
949+
[-1., -2., np.nan, np.nan, -5.]])
950+
fweights = np.array([1, 2, 3, 1, 1])
951+
aweights = np.array([0.9, 1.0, 0.8, 1.1, 1.0])
952+
val_weight = np.zeros((3, 3))
953+
val = np.zeros((3, 3))
954+
for ii in range(3):
955+
for jj in range(ii, 3):
956+
val[ii, jj] = np.nancov(x[ii, :], x[jj, :])[0, 1]
957+
val[jj, ii] = val[ii, jj]
958+
val_weight[ii, jj] = np.nancov(x[ii, :], x[jj, :],
959+
fweights=fweights,
960+
aweights=aweights)[0, 1]
961+
val_weight[jj, ii] = val_weight[ii, jj]
962+
assert_allclose(val, np.nancov(x, pairwise=True))
963+
assert_allclose(val_weight, np.nancov(x,
964+
fweights=fweights,
965+
aweights=aweights,
966+
pairwise=True))
967+
assert_allclose(val_weight, np.nancov(x.T,
968+
rowvar=0,
969+
fweights=fweights,
970+
aweights=aweights,
971+
pairwise=True))
972+
891973
if __name__ == "__main__":
892974
run_module_suite()

0 commit comments

Comments
 (0)
0