|
17 | 17 | - `nanstd` -- standard deviation of non-NaN values
|
18 | 18 | - `nanmedian` -- median of non-NaN values
|
19 | 19 | - `nanpercentile` -- qth percentile of non-NaN values
|
| 20 | +- `nancov` -- covariance matrix of non-NaN values |
20 | 21 |
|
21 | 22 | """
|
22 | 23 | from __future__ import division, absolute_import, print_function
|
|
# Public API of this module: the names exported by ``from <module> import *``.
__all__ = [
    'nansum', 'nanmax', 'nanmin', 'nanargmax', 'nanargmin', 'nanmean',
    'nanmedian', 'nanpercentile', 'nanvar', 'nanstd', 'nanprod',
    'nancumsum', 'nancumprod', 'nancov'
    ]
|
34 | 35 |
|
35 | 36 |
|
@@ -1441,3 +1442,245 @@ def nanstd(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
|
1441 | 1442 | else:
|
1442 | 1443 | std = var.dtype.type(np.sqrt(var))
|
1443 | 1444 | return std
|
| 1445 | + |
| 1446 | + |
def nancov(m, y=None, rowvar=1, ddof=1, fweights=None, aweights=None,
           pairwise=False):
    """
    Estimate a covariance matrix, given data, ignoring observations containing
    NaN values.

    Covariance indicates the level to which two variables vary together.
    If we examine N-dimensional samples, :math:`X = [x_1, x_2, ... x_N]^T`,
    then the covariance matrix element :math:`C_{ij}` is the covariance of
    :math:`x_i` and :math:`x_j`. The element :math:`C_{ii}` is the variance
    of :math:`x_i`.

    .. versionadded:: 1.14.0

    Parameters
    ----------
    m : array_like
        A 1-D or 2-D array containing multiple variables and observations.
        Each row of `m` represents a variable, and each column a single
        observation of all those variables. Also see `rowvar` below.
    y : array_like, optional
        An additional set of variables and observations. `y` has the same
        form as that of `m`.
    rowvar : int, optional
        If `rowvar` is non-zero (default), then each row represents a
        variable, with observations in the columns. Otherwise, the
        relationship is transposed: each column represents a variable, while
        the rows contain observations.
    ddof : int, optional
        Normalization is by ``(N - ddof)``, where ``N`` is
        the number of observations. The default value is ``1``.
    fweights : array_like, int, optional
        1-D array of integer frequency weights; the number of times each
        observation vector should be repeated.
    aweights : array_like, optional
        1-D array of observation vector weights. These relative weights are
        typically large for observations considered "important" and smaller for
        observations considered less "important". If ``ddof=0`` the array of
        weights can be used to assign probabilities to observation vectors.
    pairwise : bool, optional
        If ``False`` then a NaN value causes that observation to be eliminated
        across all variables. If ``True`` then each pair of variables is
        compared separately. Observations with NaN values in the pair are
        removed, but do not affect other pairs. The returned covariance matrix
        may no longer be positive semi-definite in this case. False is
        equivalent to the ``na.or.complete`` use flag for cov in R, while
        ``True`` is equivalent to ``pairwise.complete.obs``.

    Returns
    -------
    out : ndarray
        The covariance matrix of the variables.

    See Also
    --------
    corrcoef : Normalized covariance matrix

    Examples
    --------
    Consider two variables, :math:`x_0` and :math:`x_1`, which
    correlate perfectly, but in opposite directions:

    >>> x = np.array([[0, 2], [1, 1], [np.nan, 1], [2, 0]]).T
    >>> x
    array([[  0.,   1.,  nan,   2.],
           [  2.,   1.,   1.,   0.]])

    Note how :math:`x_0` increases while :math:`x_1` decreases. The covariance
    matrix shows this clearly:

    >>> np.nancov(x)
    array([[ 1., -1.],
           [-1.,  1.]])

    Note that element :math:`C_{0,1}`, which shows the correlation between
    :math:`x_0` and :math:`x_1`, is negative.

    Further, note how `x` and `y` are combined:

    >>> x = [-2.1, -1, np.nan, 4.3]
    >>> y = [3, 1.1, 1, 0.12]
    >>> X = np.vstack((x,y))
    >>> print(np.nancov(X))
    [[ 11.71       -4.286     ]
     [ -4.286       2.14413333]]
    >>> print(np.nancov(x, y))
    [[ 11.71       -4.286     ]
     [ -4.286       2.14413333]]
    >>> print(np.nancov(x))
    11.71

    Finally it can be seen how the pairwise option handles nan values
    differently than when the pairwise option is disabled (default).

    >>> X = np.array([[ 1.,  4.,  8., np.nan, 16.],
    ...               [ 0.,  2.,  4.,  6.,  4.],
    ...               [-1.,  3., np.nan, np.nan,  4.]])
    >>> np.nancov(X)
    array([[ 63. ,  15. ,  16.5],
           [ 15. ,   4. ,   5. ],
           [ 16.5,   5. ,   7. ]])
    >>> np.nancov(X, pairwise=True)
    array([[ 42.25,  10.5 ,  16.5 ],
           [ 10.5 ,   5.2 ,   5.  ],
           [ 16.5 ,   5.  ,   7.  ]])
    """
    # Check inputs
    if ddof is not None and ddof != int(ddof):
        raise ValueError(
            "ddof must be integer")

    m = np.asarray(m)
    if m.ndim > 2:
        raise ValueError("m has more than 2 dimensions")

    if y is None:
        dtype = np.result_type(m, np.float64)
    else:
        y = np.asarray(y)
        if y.ndim > 2:
            raise ValueError("y has more than 2 dimensions")
        dtype = np.result_type(m, y, np.float64)
    X = np.array(m, ndmin=2, dtype=dtype)

    # A single row is always one variable, regardless of `rowvar`.
    if X.shape[0] == 1:
        rowvar = 1
    if rowvar:
        axis = 0
    else:
        axis = 1

    if y is not None:
        y = np.array(y, copy=False, ndmin=2, dtype=dtype)
        X = np.concatenate((X, y), axis)

    # Number of observations per variable.  Note: with `rowvar` true,
    # `axis` is 0 (falsy), so the count comes from X.shape[1].
    if axis:
        n = X.shape[0]
    else:
        n = X.shape[1]

    # Validate the frequency and observation weights.
    if fweights is not None:
        fweights = np.asarray(fweights, dtype=np.float64)
        if not np.all(fweights == np.around(fweights)):
            raise TypeError(
                "fweights must be integer")
        if fweights.ndim > 1:
            raise RuntimeError(
                "cannot handle multidimensional fweights")
        if fweights.shape[0] != n:
            raise RuntimeError(
                "incompatible numbers of samples and fweights")
        if np.any(fweights < 0):
            raise ValueError(
                "fweights cannot be negative")
    if aweights is not None:
        aweights = np.asarray(aweights, dtype=np.float64)
        if aweights.ndim > 1:
            raise RuntimeError(
                "cannot handle multidimensional aweights")
        if aweights.shape[0] != n:
            raise RuntimeError(
                "incompatible numbers of samples and aweights")
        if np.any(aweights < 0):
            raise ValueError(
                "aweights cannot be negative")

    if pairwise:
        # Combined per-observation weight: product of fweights and aweights.
        # Use `w * aweights` (not `*=`) so the caller's fweights array is
        # never mutated in place.
        w = None
        if fweights is not None:
            w = fweights
        if aweights is not None:
            if w is None:
                w = aweights
            else:
                w = w * aweights

        nan_vals = np.isnan(X)
        # one_array is 1.0 where an observation is valid and NaN where it is
        # missing, oriented so variables run along axis 0.
        if axis:
            one_array = np.ones(X.T.shape)
            one_array[nan_vals.T] = np.nan
        else:
            one_array = np.ones(X.shape)
            one_array[nan_vals] = np.nan
        # Each pair may cause a unique mean for the variable, so we create
        # pair_array which has the correctly nan-ed values:
        # pair_array[i, j, :] is variable i's data with NaN wherever
        # variable i or variable j is missing.
        if axis:
            pair_array = X[:, None, :].swapaxes(0, 2) * one_array[None, :, :]
        else:
            pair_array = X[:, None, :] * one_array[None, :, :]

        if w is None:
            pair_means = np.nanmean(pair_array, axis=2)
            # Summing the boolean validity mask counts valid observations.
            pair_w_sum = np.nansum(~np.isnan(pair_array), axis=2)
        else:
            pair_w = w[None, None, :] * ~np.isnan(pair_array)
            pair_w_sum = np.nansum(pair_w, axis=2)
            pair_means = np.nansum(w[None, None, :] * pair_array,
                                   axis=2) / pair_w_sum
        # Center each pair slice on its own (pairwise) mean.
        pair_array -= pair_means[:, :, None]

        # Weighted cross products of deviations; conj() handles complex input.
        if w is None:
            dotted = pair_array * pair_array.swapaxes(0, 1).conj()
        else:
            dotted = pair_array * \
                (w[None, None, :] * pair_array).swapaxes(0, 1).conj()

        c = np.nansum(dotted, axis=2)

        # Normalization factor, mirroring np.cov's weighted ddof handling.
        if aweights is None:
            fact = pair_w_sum - ddof
        else:
            ddof_weight = np.nansum(aweights[None, None, :] * pair_w, axis=2)
            fact = pair_w_sum - ddof * ddof_weight / pair_w_sum

        if np.any(fact <= 0):
            warnings.warn("Degrees of freedom <= 0 for a slice",
                          RuntimeWarning, stacklevel=2)
            fact[fact <= 0] = 0.0

        c *= 1. / fact.astype(np.float64)
        return c
    else:
        # "Complete" version for handling nans where a nan value in any
        # variable causes the observation to be removed from all variables;
        # the actual covariance is then delegated to np.cov.

        # Find observations with nans
        nan_obvs = np.any(np.isnan(X), axis=axis)

        if fweights is not None:
            fweights = fweights[~nan_obvs]
        if aweights is not None:
            aweights = aweights[~nan_obvs]

        if axis:
            X_nonan = X[~nan_obvs, :]
        else:
            X_nonan = X[:, ~nan_obvs]
        return np.cov(X_nonan, rowvar=rowvar, ddof=ddof, fweights=fweights,
                      aweights=aweights)
0 commit comments