Keep dtype whenever possible; add _update_array; docstring fixes

pandas-dev · h-vetinari · Nov 15, 2018 · Oct 16, 2018 · Nov 15, 2018 · Nov 8, 2018
commit f120d65568c9c26f510e2548d8beafbc039acdbb
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -4178,7 +4178,10 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
         """
         Modify in place using non-NA values from another DataFrame.
 
-        Aligns on indices. There is no return value.
+        Series/DataFrame will be aligned on indexes, and whenever possible,
+        the dtype of the individual Series of the caller will be preserved.
+
+        There is no return value.
 
         Parameters
         ----------
@@ -4198,7 +4201,7 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
             * False: only update values that are NA in
               the original DataFrame.
 
-        filter_func : callable(1d-array) -> boolean 1d-array, optional
+        filter_func : callable(1d-array) -> bool 1d-array, optional
             Can choose to replace values other than NA. Return True for values
             that should be updated.
         errors : {'raise', 'ignore'}, default 'ignore'
@@ -4208,7 +4211,7 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
         Raises
         ------
         ValueError
-            When `raise_conflict` is True and there's overlapping non-NA data.
+            When `errors='raise'` and there's overlapping non-NA data.
 
         Returns
         -------
@@ -4275,10 +4278,10 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
         >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
         >>> df.update(new_df)
         >>> df
-           A      B
-        0  1    4.0
-        1  2  500.0
-        2  3    6.0
+           A    B
+        0  1    4
+        1  2  500
+        2  3    6
         """
         from pandas import Series, DataFrame
         # TODO: Support other joins
@@ -4292,14 +4295,20 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
             this = self.values
             that = other.values
 
-            # missing.update_array returns an np.ndarray
-            updated_values = missing.update_array(this, that,
+            # will return None if "this" remains unchanged
+            updated_array = missing._update_array(this, that,
                                                   overwrite=overwrite,
                                                   filter_func=filter_func,
                                                   errors=errors)
             # don't overwrite unnecessarily
-            if updated_values is not None:
-                self._update_inplace(Series(updated_values, index=self.index))
+            if updated_array is not None:
+                # avoid unnecessary upcasting (introduced by alignment)
+                try:
+                    updated = Series(updated_array, index=self.index,
+                                     dtype=this.dtype)
+                except ValueError:
+                    updated = Series(updated_array, index=self.index)
+                self._update_inplace(updated)
         else:  # DataFrame
             if not isinstance(other, ABCDataFrame):
                 other = DataFrame(other)
@@ -4310,11 +4319,23 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
                 this = self[col].values
                 that = other[col].values
 
-                updated = missing.update_array(this, that, overwrite=overwrite,
-                                           filter_func=filter_func,
-                                           errors=errors)
+                # will return None if "this" remains unchanged
+                updated_array = missing._update_array(this, that,
+                                                      overwrite=overwrite,
+                                                      filter_func=filter_func,
+                                                      errors=errors)
                 # don't overwrite unnecessarily
-                if updated is not None:
+                if updated_array is not None:
+                    # no problem to set DataFrame column with array
+                    updated = updated_array
+
+                    if updated_array.dtype != this.dtype:
+                        # avoid unnecessary upcasting (introduced by alignment)
+                        try:
+                            updated = Series(updated_array, index=self.index,
+                                             dtype=this.dtype)
+                        except ValueError:
+                            pass
                     self[col] = updated
 
     def filter(self, items=None, like=None, regex=None, axis=None):

diff --git a/pandas/core/missing.py b/pandas/core/missing.py
@@ -106,15 +106,25 @@ def update_array(this, that, overwrite=True, filter_func=None,
 
     Returns
     -------
-    updated : np.ndarray (one-dimensional) or None
-        The updated array. Return None if `this` remains unchanged
+    updated : np.ndarray (one-dimensional)
+        The updated array.
 
     See Also
     --------
     Series.update : Similar method for `Series`.
     DataFrame.update : Similar method for `DataFrame`.
     dict.update : Similar method for `dict`.
     """
+    updated = _update_array(this, that, overwrite=overwrite,
+                           filter_func=filter_func, errors=errors)
+    return this if updated is None else updated
+
+
+def _update_array(this, that, overwrite=True, filter_func=None,
+                  errors='ignore'):
+    """
+    Same as update_array, except we return None if `this` is not updated.
+    """
     import pandas.core.computation.expressions as expressions
 
     if filter_func is not None:

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -2390,7 +2390,10 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
         """
         Modify Series in place using non-NA values from passed Series.
 
-        Aligns on index.
+        Series will be aligned on indexes, and whenever possible, the dtype of
+        the caller will be preserved.
+
+        There is no return value.
 
         Parameters
         ----------
@@ -2411,7 +2414,7 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
               the original DataFrame.
 
             .. versionadded:: 0.24.0
-        filter_func : callable(1d-array) -> boolean 1d-array, optional
+        filter_func : callable(1d-array) -> bool 1d-array, optional
             Can choose to replace values other than NA. Return True for values
             that should be updated.
 
@@ -2422,10 +2425,19 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
 
             .. versionadded:: 0.24.0
 
+        Raises
+        ------
+        ValueError
+            When `errors='raise'` and there's overlapping non-NA data.
+
+        Returns
+        -------
+        Nothing, the Series is modified inplace.
+
         See Also
         --------
         DataFrame.update : Similar method for `DataFrame`.
-        dict.update : Similar method for `dict`
+        dict.update : Similar method for `dict`.
 
         Examples
         --------
@@ -2459,10 +2471,10 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
         >>> s = pd.Series([1, 2, 3])
         >>> s.update(pd.Series([4, np.nan, 6]))
         >>> s
-        0    4.0
-        1    2.0
-        2    6.0
-        dtype: float64
+        0    4
+        1    2
+        2    6
+        dtype: int64
         """
         super(Series, self).update(other, join=join, overwrite=overwrite,
                                    filter_func=filter_func,

diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py
@@ -279,6 +279,25 @@ def test_update_dtypes(self):
                              columns=['A', 'B', 'bool1', 'bool2'])
         assert_frame_equal(df, expected)
 
+        df = DataFrame([[10, 100], [11, 101], [12, 102]], columns=['A', 'B'])
+        other = DataFrame([[61, 601], [63, 603]], columns=['A', 'B'],
+                          index=[1, 3])
+        df.update(other)
+
+        expected = DataFrame([[10, 100], [61, 601], [12, 102]],
+                             columns=['A', 'B'])
+        assert_frame_equal(df, expected)
+
+        # we always try to keep original dtype, even if other has different one
+        df.update(other.astype(float))
+        assert_frame_equal(df, expected)
+
+        # if keeping the dtype is not possible, we allow upcasting
+        df.update(other + 0.1)
+        expected = DataFrame([[10., 100.], [61.1, 601.1], [12., 102.]],
+                             columns=['A', 'B'])
+        assert_frame_equal(df, expected)
+
     def test_update_nooverwrite(self):
         df = DataFrame([[1.5, nan, 3.],
                         [1.5, nan, 3.],

diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py
@@ -10,7 +10,7 @@
 import pandas as pd
 from pandas import DataFrame, DatetimeIndex, Series, compat, date_range
 import pandas.util.testing as tm
-from pandas.util.testing import assert_series_equal
+from pandas.util.testing import assert_series_equal, assert_frame_equal
 
 
 class TestSeriesCombine():
@@ -105,8 +105,8 @@ def test_combine_first(self):
         assert_series_equal(s, result)
 
     def test_update(self):
-        s = Series([1.5, nan, 3., 4., nan])
-        s2 = Series([nan, 3.5, nan, 5.])
+        s = Series([1.5, np.nan, 3., 4., np.nan])
+        s2 = Series([np.nan, 3.5, np.nan, 5.])
         s.update(s2)
 
         expected = Series([1.5, 3.5, 3., 5., np.nan])
@@ -116,8 +116,35 @@ def test_update(self):
         df = DataFrame([{"a": 1}, {"a": 3, "b": 2}])
         df['c'] = np.nan
 
-        # this will fail as long as series is a sub-class of ndarray
-        # df['c'].update(Series(['foo'],index=[0])) #####
+        df['c'].update(Series(['foo'], index=[0]))
+        expected = DataFrame([[1, np.nan, 'foo'], [3, 2., np.nan]],
+                             columns=['a', 'b', 'c'])
+        assert_frame_equal(df, expected)
+
+    def test_update_dtypes(self):
+        s = Series([1., 2., False, True])
+
+        other = Series([45])
+        s.update(other)
+
+        expected = Series([45., 2., False, True])
+        assert_series_equal(s, expected)
+
+        s = Series([10, 11, 12])
+        other = Series([61, 63], index=[1, 3])
+        s.update(other)
+
+        expected = Series([10, 61, 12])
+        assert_series_equal(s, expected)
+
+        # we always try to keep original dtype, even if other has different one
+        s.update(other.astype(float))
+        assert_series_equal(s, expected)
+
+        # if keeping the dtype is not possible, we allow upcasting
+        s.update(other + 0.1)
+        expected = Series([10., 61.1, 12.])
+        assert_series_equal(s, expected)
 
     def test_update_nooverwrite(self):
         s = Series([0, 1, 2, np.nan, np.nan, 5, 6, np.nan])
@@ -129,6 +156,8 @@ def test_update_nooverwrite(self):
         assert_series_equal(s, expected)
 
     def test_update_filtered(self):
+        # on some platforms (e.g. Windows), np.arange defaults to int32,
+        # but pandas default (e.g. for "expected" below) is int64
         s = Series(np.arange(8), dtype='int64')
         other = Series(np.arange(8), dtype='int64') + 10