Adds cummulative operators to API by pwolfram · Pull Request #812 · pydata/xarray · GitHub
[go: up one dir, main page]

Skip to content

Adds cummulative operators to API #812

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Oct 3, 2016
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Adds nancumsum, nancumprod to xarray functions
  • Loading branch information
pwolfram committed Oct 3, 2016
commit 129c807be4df7f07dce5825af3982ffd8052895b
4 changes: 4 additions & 0 deletions doc/api-hidden.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@
Dataset.round
Dataset.real
Dataset.T
Dataset.cumsum
Dataset.cumprod

DataArray.ndim
DataArray.shape
Expand Down Expand Up @@ -87,6 +89,8 @@
DataArray.round
DataArray.real
DataArray.T
DataArray.cumsum
DataArray.cumprod

ufuncs.angle
ufuncs.arccos
Expand Down
4 changes: 4 additions & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@ Computation
:py:attr:`~Dataset.round`
:py:attr:`~Dataset.real`
:py:attr:`~Dataset.T`
:py:attr:`~Dataset.cumsum`
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just a nit, these probably belong under the "Aggregation" heading above

:py:attr:`~Dataset.cumprod`

**Grouped operations**:
:py:attr:`~core.groupby.DatasetGroupBy.assign`
Expand Down Expand Up @@ -286,6 +288,8 @@ Computation
:py:attr:`~DataArray.round`
:py:attr:`~DataArray.real`
:py:attr:`~DataArray.T`
:py:attr:`~DataArray.cumsum`
:py:attr:`~DataArray.cumprod`

**Grouped operations**:
:py:attr:`~core.groupby.DataArrayGroupBy.assign_coords`
Expand Down
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,10 @@ Enhancements
which to concatenate.
By `Stephan Hoyer <https://github.com/shoyer>`_.

- Adds DataArray and Dataset methods :py:meth:`~xarray.DataArray.cumsum` and
:py:meth:`~xarray.DataArray.cumprod`. By `Phillip J. Wolfram
<https://github.com/pwolfram>`_.

Bug fixes
~~~~~~~~~
- ``groupby_bins`` now restores empty bins by default (:issue:`1019`).
Expand Down
14 changes: 14 additions & 0 deletions xarray/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,13 @@ def wrapped_func(self, dim=None, axis=None, keep_attrs=False,
and 'axis' arguments can be supplied. If neither are supplied, then
`{name}` is calculated over axes."""

_cum_extra_args_docstring = \
"""dim : str or sequence of str, optional
Dimension over which to apply `{name}`.
axis : int or sequence of int, optional
Axis over which to apply `{name}`. Only one of the 'dim'
and 'axis' arguments can be supplied."""


class ImplementsDatasetReduce(object):
@classmethod
Expand All @@ -51,6 +58,13 @@ def wrapped_func(self, dim=None, keep_attrs=False, **kwargs):
Dimension(s) over which to apply `func`. By default `func` is
applied over all dimensions."""

_cum_extra_args_docstring = \
"""dim : str or sequence of str, optional
Dimension over which to apply `{name}`.
axis : int or sequence of int, optional
Axis over which to apply `{name}`. Only one o 8000 f the 'dim'
and 'axis' arguments can be supplied."""


class ImplementsRollingArrayReduce(object):
@classmethod
Expand Down
78 changes: 56 additions & 22 deletions xarray/core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,11 @@
REDUCE_METHODS = ['all', 'any']
NAN_REDUCE_METHODS = ['argmax', 'argmin', 'max', 'min', 'mean', 'prod', 'sum',
'std', 'var', 'median']
NAN_CUM_METHODS = ['cumsum', 'cumprod']
BOTTLENECK_ROLLING_METHODS = {'move_sum': 'sum', 'move_mean': 'mean',
'move_std': 'std', 'move_min': 'min',
'move_max': 'max'}
# TODO: wrap cumprod/cumsum, take, dot, sort
# TODO: wrap take, dot, sort


def _dask_or_eager_func(name, eager_module=np, list_of_args=False,
Expand Down Expand Up @@ -201,6 +202,30 @@ def func(self, *args, **kwargs):
func.__doc__ = f.__doc__
return func

_CUM_DOCSTRING_TEMPLATE = \
"""Apply `{name}` along some dimension of {cls}.

Parameters
----------
{extra_args}
skipna : bool, optional
If True, skip missing values (as marked by NaN). By default, only
skips missing values for float dtypes; other dtypes either do not
have a sentinel missing value (int) or skipna=True has not been
implemented (object, datetime64 or timedelta64).
keep_attrs : bool, optional
If True, the attributes (`attrs`) will be copied from the original
object to the new one. If False (default), the new object will be
returned without attributes.
**kwargs : dict
Additional keyword arguments passed on to `{name}`.

Returns
-------
cumvalue : {cls}
New {cls} object with `{name}` applied to its data along the
indicated dimension.
"""

_REDUCE_DOCSTRING_TEMPLATE = \
"""Reduce this {cls}'s data by applying `{name}` along some
Expand Down Expand Up @@ -274,7 +299,9 @@ def _ignore_warnings_if(condition):
yield


def _create_nan_agg_method(name, numeric_only=False, coerce_strings=False):
def _create_nan_agg_method(name, numeric_only=False, np_compat=False,
no_bottleneck=False, coerce_strings=False,
keep_dims=False):
def f(values, axis=None, skipna=None, **kwargs):
# ignore keyword args inserted by np.mean and other numpy aggregators
# automatically:
Expand All @@ -292,14 +319,17 @@ def f(values, axis=None, skipna=None, **kwargs):
'skipna=True not yet implemented for %s with dtype %s'
% (name, values.dtype))
nanname = 'nan' + name
if isinstance(axis, tuple) or not values.dtype.isnative:
if isinstance(axis, tuple) or not values.dtype.isnative or no_bottleneck:
# bottleneck can't handle multiple axis arguments or non-native
# endianness
eager_module = np
if np_compat:
eager_module = npcompat
else:
eager_module = np
else:
eager_module = bn
func = _dask_or_eager_func(nanname, eager_module)
using_numpy_nan_func = eager_module is np
using_numpy_nan_func = eager_module is np or eager_module is npcompat
else:
func = _dask_or_eager_func(name)
using_numpy_nan_func = False
Expand All @@ -312,10 +342,12 @@ def f(values, axis=None, skipna=None, **kwargs):
else:
assert using_numpy_nan_func
msg = ('%s is not available with skipna=False with the '
'installed version of numpy; upgrade to numpy 1.9 '
'installed version of numpy; upgrade to numpy 1.12 '
'or newer to use skipna=True or skipna=None' % name)
raise NotImplementedError(msg)
f.numeric_only = numeric_only
f.keep_dims = keep_dims
f.__name__ = name
return f


Expand All @@ -328,28 +360,18 @@ def f(values, axis=None, skipna=None, **kwargs):
# NaN-skipping aggregation functions built by _create_nan_agg_method.
std = _create_nan_agg_method('std', numeric_only=True)
var = _create_nan_agg_method('var', numeric_only=True)
median = _create_nan_agg_method('median', numeric_only=True)

# prod/cumprod/cumsum force the numpy/npcompat path (no_bottleneck=True,
# np_compat=True) since bottleneck is bypassed for them; the cumulative ops
# set keep_dims=True because their output has the same shape as the input,
# so no dimension is dropped by Variable.reduce.
prod = _create_nan_agg_method('prod', numeric_only=True, np_compat=True,
                              no_bottleneck=True)
cumprod = _create_nan_agg_method('cumprod', numeric_only=True, np_compat=True,
                                 no_bottleneck=True, keep_dims=True)
cumsum = _create_nan_agg_method('cumsum', numeric_only=True, np_compat=True,
                                no_bottleneck=True, keep_dims=True)

# Raises NotImplementedError (with the given message) when handed a dask
# array, used for the skipna=True code paths that dask does not support yet.
_fail_on_dask_array_input_skipna = partial(
    _fail_on_dask_array_input,
    msg='%r with skipna=True is not yet implemented on dask arrays')


_prod = _dask_or_eager_func('prod')


def prod(values, axis=None, skipna=None, **kwargs):
if skipna or (skipna is None and values.dtype.kind == 'f'):
if values.dtype.kind not in ['i', 'f']:
raise NotImplementedError(
'skipna=True not yet implemented for prod with dtype %s'
% values.dtype)
_fail_on_dask_array_input_skipna(values)
return npcompat.nanprod(values, axis=axis, **kwargs)
return _prod(values, axis=axis, **kwargs)
prod.numeric_only = True


def first(values, axis, skipna=None):
"""Return the first non-NA elements in this array along the given axis
"""
Expand Down Expand Up @@ -384,6 +406,17 @@ def inject_reduce_methods(cls):
extra_args=cls._reduce_extra_args_docstring)
setattr(cls, name, func)

def inject_cum_methods(cls):
    """Attach the skipna-aware cumulative methods (cumsum, cumprod) to *cls*.

    Each method in ``NAN_CUM_METHODS`` is looked up in this module's
    namespace, wrapped via the class's ``_reduce_method`` hook, given a
    docstring rendered from ``_CUM_DOCSTRING_TEMPLATE``, and bound onto the
    class under its own name.
    """
    for method_name in NAN_CUM_METHODS:
        impl = globals()[method_name]
        wrapped = cls._reduce_method(
            impl, True, getattr(impl, 'numeric_only', False))
        wrapped.__name__ = method_name
        wrapped.__doc__ = _CUM_DOCSTRING_TEMPLATE.format(
            name=method_name, cls=cls.__name__,
            extra_args=cls._cum_extra_args_docstring)
        setattr(cls, method_name, wrapped)


def op_str(name):
return '__%s__' % name
Expand Down Expand Up @@ -454,6 +487,7 @@ def inject_all_ops_and_reduce_methods(cls, priority=50, array_only=True):
setattr(cls, name, _values_method_wrapper(name))

inject_reduce_methods(cls)
inject_cum_methods(cls)


def inject_bottleneck_rolling_methods(cls):
Expand Down
17 changes: 13 additions & 4 deletions xarray/core/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -896,15 +896,24 @@ def reduce(self, func, dim=None, axis=None, keep_attrs=False,
if dim is not None and axis is not None:
raise ValueError("cannot supply both 'axis' and 'dim' arguments")

if getattr(func, 'keep_dims', False):
if dim is None and axis is None:
raise ValueError("must supply either single 'dim' or 'axis' argument to %s"
% (func.__name__))

if dim is not None:
axis = self.get_axis_num(dim)
data = func(self.data if allow_lazy else self.values,
axis=axis, **kwargs)

removed_axes = (range(self.ndim) if axis is None
else np.atleast_1d(axis) % self.ndim)
dims = [dim for n, dim in enumerate(self.dims)
if n not in removed_axes]
if getattr(data, 'shape', ()) == self.shape:
dims = self.dims
else:
removed_axes = (range(self.ndim) if axis is None
else np.atleast_1d(axis) % self.ndim)
dims = [adim for n, adim in enumerate(self.dims)
if n not in removed_axes]


attrs = self._attrs if keep_attrs else None

Expand Down
2 changes: 0 additions & 2 deletions xarray/test/test_dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,6 @@ def test_reduce(self):
self.assertLazyAndAllClose(u.argmax(dim='x'), v.argmax(dim='x'))
self.assertLazyAndAllClose((u > 1).any(), (v > 1).any())
self.assertLazyAndAllClose((u < 1).all('x'), (v < 1).all('x'))
with self.assertRaisesRegexp(NotImplementedError, 'dask'):
v.prod()
with self.assertRaisesRegexp(NotImplementedError, 'dask'):
v.median()

Expand Down
22 changes: 22 additions & 0 deletions xarray/test/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -1089,6 +1089,28 @@ def test_dropna(self):
expected = arr[:, 1:]
self.assertDataArrayIdentical(actual, expected)

def test_cumops(self):
    """cumsum/cumprod along each dimension match hand-computed results."""
    coords = {'x': [-1, -2], 'y': ['ab', 'cd', 'ef'],
              'lat': (['x', 'y'], [[1, 2, 3], [-1, -2, -3]]),
              'c': -999}
    orig = DataArray([[-1, 0, 1], [-3, 0, 3]], coords, dims=['x', 'y'])

    # (operation, dimension, expected values) — same cases, same order as
    # asserting each call individually.
    cases = [
        ('cumsum', 'x', [[-1, 0, 1], [-4, 0, 4]]),
        ('cumsum', 'y', [[-1, -1, 0], [-3, -3, 0]]),
        ('cumprod', 'x', [[-1, 0, 1], [3, 0, 3]]),
        ('cumprod', 'y', [[-1, 0, 0], [-3, 0, 0]]),
    ]
    for op, dim, values in cases:
        actual = getattr(orig, op)(dim)
        expected = DataArray(values, coords, dims=['x', 'y'])
        self.assertDataArrayIdentical(expected, actual)

def test_reduce(self):
coords = {'x': [-1, -2], 'y': ['ab', 'cd', 'ef'],
'lat': (['x', 'y'], [[1, 2, 3], [-1, -2, -3]]),
Expand Down
19 changes: 19 additions & 0 deletions xarray/test/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2421,6 +2421,25 @@ def test_reduce_bad_dim(self):
with self.assertRaisesRegexp(ValueError, 'Dataset does not contain'):
ds = data.mean(dim='bad_dim')

def test_reduce_cumsum_test_dims(self):
    """cumsum/cumprod require exactly one dim/axis and keep all dimensions.

    Fix: removed a leftover debug ``print`` from the dimension-check loop.
    """
    data = create_test_data()
    for cumfunc in ['cumsum', 'cumprod']:
        # No dim/axis, or more than one dim, must be rejected.
        with self.assertRaisesRegexp(
                ValueError, "must supply either single 'dim' or 'axis'"):
            ds = getattr(data, cumfunc)()
        with self.assertRaisesRegexp(
                ValueError, "must supply either single 'dim' or 'axis'"):
            ds = getattr(data, cumfunc)(dim=['dim1', 'dim2'])
        with self.assertRaisesRegexp(ValueError, 'Dataset does not contain'):
            ds = getattr(data, cumfunc)(dim='bad_dim')

        # Cumulative ops keep every dimension of the input dataset.
        for reduct, expected in [('dim1', ['dim1', 'dim2', 'dim3', 'time']),
                                 ('dim2', ['dim1', 'dim2', 'dim3', 'time']),
                                 ('dim3', ['dim1', 'dim2', 'dim3', 'time']),
                                 ('time', ['dim1', 'dim2', 'dim3'])]:
            actual = getattr(data, cumfunc)(dim=reduct).dims
            self.assertItemsEqual(actual, expected)

def test_reduce_non_numeric(self):
data1 = create_test_data(seed=44)
data2 = create_test_data(seed=44)
Expand Down
4 changes: 4 additions & 0 deletions xarray/test/test_variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -974,6 +974,10 @@ def test_reduce_funcs(self):
self.assertVariableIdentical(np.mean(v), Variable([], 2))

self.assertVariableIdentical(v.prod(), Variable([], 6))
self.assertVariableIdentical(v.cumsum(axis=0),
Variable('x', np.array([1, 1, 3, 6])))
self.assertVariableIdentical(v.cumprod(axis=0),
Variable('x', np.array([1, 1, 2, 6])))
self.assertVariableIdentical(v.var(), Variable([], 2.0 / 3))

if LooseVersion(np.__version__) < '1.9':
Expand Down
0