API: Unify .update to generic by h-vetinari · Pull Request #23192 · pandas-dev/pandas · GitHub
[go: up one dir, main page]

Skip to content

API: Unify .update to generic #23192

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 10 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Keep dtype whenever possible; add _update_array; docstring fixes
  • Loading branch information
h-vetinari committed Nov 15, 2018
commit f120d65568c9c26f510e2548d8beafbc039acdbb
51 changes: 36 additions & 15 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -4178,7 +4178,10 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
"""
Modify in place using non-NA values from another DataFrame.

Aligns on indices. There is no return value.
Series/DataFrame will be aligned on indexes, and whenever possible,
the dtype of the individual Series of the caller will be preserved.

There is no return value.

Parameters
----------
Expand All @@ -4198,7 +4201,7 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
* False: only update values that are NA in
the original DataFrame.

filter_func : callable(1d-array) -> boolean 1d-array, optional
filter_func : callable(1d-array) -> bool 1d-array, optional
Can choose to replace values other than NA. Return True for values
that should be updated.
errors : {'raise', 'ignore'}, default 'ignore'
Expand All @@ -4208,7 +4211,7 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
Raises
------
ValueError
When `raise_conflict` is True and there's overlapping non-NA data.
When `errors='raise'` and there's overlapping non-NA data.

Returns
-------
Expand Down Expand Up @@ -4275,10 +4278,10 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
>>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
>>> df.update(new_df)
>>> df
A B
0 1 4.0
1 2 500.0
2 3 6.0
A B
0 1 4
1 2 500
2 3 6
Copy link
Contributor Author
@h-vetinari h-vetinari Nov 9, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't obvious from the diff (due to the code moving files), but now this keeps the dtype - yay!

"""
from pandas import Series, DataFrame
# TODO: Support other joins
Expand All @@ -4292,14 +4295,20 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
this = self.values
that = other.values

# missing.update_array returns an np.ndarray
updated_values = missing.update_array(this, that,
# will return None if "this" remains unchanged
updated_array = missing._update_array(this, that,
overwrite=overwrite,
filter_func=filter_func,
errors=errors)
# don't overwrite unnecessarily
if updated_values is not None:
self._update_inplace(Series(updated_values, index=self.index))
if updated_array is not None:
# avoid unnecessary upcasting (introduced by alignment)
try:
updated = Series(updated_array, index=self.index,
dtype=this.dtype)
except ValueError:
updated = Series(updated_array, index=self.index)
self._update_inplace(updated)
else: # DataFrame
if not isinstance(other, ABCDataFrame):
other = DataFrame(other)
Expand All @@ -4310,11 +4319,23 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
this = self[col].values
that = other[col].values

updated = missing.update_array(this, that, overwrite=overwrite,
filter_func=filter_func,
errors=errors)
# will return None if "this" remains unchanged
updated_array = missing._update_array(this, that,
overwrite=overwrite,
filter_func=filter_func,
errors=errors)
# don't overwrite unnecessarily
if updated is not None:
if updated_array is not None:
# no problem to set DataFrame column with array
updated = updated_array

if updated_array.dtype != this.dtype:
# avoid unnecessary upcasting (introduced by alignment)
try:
updated = Series(updated_array, index=self.index,
dtype=this.dtype)
except ValueError:
pass
self[col] = updated

def filter(self, items=None, like=None, regex=None, axis=None):
Expand Down
14 changes: 12 additions & 2 deletions pandas/core/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,15 +106,25 @@ def update_array(this, that, overwrite=True, filter_func=None,

Returns
-------
updated : np.ndarray (one-dimensional) or None
The updated array. Return None if `this` remains unchanged
updated : np.ndarray (one-dimensional)
The updated array.

See Also
--------
Series.update : Similar method for `Series`.
DataFrame.update : Similar method for `DataFrame`.
dict.update : Similar method for `dict`.
"""
updated = _update_array(this, that, overwrite=overwrite,
filter_func=filter_func, errors=errors)
return this if updated is None else updated
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback
You asked me to move the function here and make it public as update_array. This had the problem that it had the fallback of returning None in case there were no changes, which is necessary to avoid pointlessly updating the columns in NDFrame.update.

However, I was not comfortable with a public function having such unexpected behaviour, so I made the function I used originally private again (but still in core.missing) and just made a thin wrapper around it public, that only filters out the None-returning case.



def _update_array(this, that, overwrite=True, filter_func=None,
errors='ignore'):
"""
Same as update_array, except we return None if `this` is not updated.
"""
import pandas.core.computation.expressions as expressions

if filter_func is not None:
Expand Down
26 changes: 19 additions & 7 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2390,7 +2390,10 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
"""
Modify Series in place using non-NA values from passed Series.

Aligns on index.
Series will be aligned on indexes, and whenever possible, the dtype of
the caller will be preserved.

There is no return value.

Parameters
----------
Expand All @@ -2411,7 +2414,7 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
the original DataFrame.

.. versionadded:: 0.24.0
filter_func : callable(1d-array) -> boolean 1d-array, optional
filter_func : callable(1d-array) -> bool 1d-array, optional
Can choose to replace values other than NA. Return True for values
that should be updated.

Expand All @@ -2422,10 +2425,19 @@ def update(self, other, join='left', overwrite=True, filter_func=None,

.. versionadded:: 0.24.0

Raises
------
ValueError
When `errors='raise'` and there's overlapping non-NA data.

Returns
-------
Nothing, the Series is modified inplace.

See Also
--------
DataFrame.update : Similar method for `DataFrame`.
dict.update : Similar method for `dict`
dict.update : Similar method for `dict`.

Examples
--------
Expand Down Expand Up @@ -2459,10 +2471,10 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
>>> s = pd.Series([1, 2, 3])
>>> s.update(pd.Series([4, np.nan, 6]))
>>> s
0 4.0
1 2.0
2 6.0
dtype: float64
0 4
1 2
2 6
dtype: int64
"""
super(Series, self).update(other, join=join, overwrite=overwrite,
filter_func=filter_func,
Expand Down
19 changes: 19 additions & 0 deletions pandas/tests/frame/test_combine_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,25 @@ def test_update_dtypes(self):
columns=['A', 'B', 'bool1', 'bool2'])
assert_frame_equal(df, expected)

df = DataFrame([[10, 100], [11, 101], [12, 102]], columns=['A', 'B'])
other = DataFrame([[61, 601], [63, 603]], columns=['A', 'B'],
index=[1, 3])
df.update(other)

expected = DataFrame([[10, 100], [61, 601], [12, 102]],
columns=['A', 'B'])
assert_frame_equal(df, expected)

# we always try to keep original dtype, even if other has different one
df.update(other.astype(float))
assert_frame_equal(df, expected)

# if keeping the dtype is not possible, we allow upcasting
df.update(other + 0.1)
expected = DataFrame([[10., 100.], [61.1, 601.1], [12., 102.]],
columns=['A', 'B'])
assert_frame_equal(df, expected)

def test_update_nooverwrite(self):
df = DataFrame([[1.5, nan, 3.],
[1.5, nan, 3.],
Expand Down
39 changes: 34 additions & 5 deletions pandas/tests/series/test_combine_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import pandas as pd
from pandas import DataFrame, DatetimeIndex, Series, compat, date_range
import pandas.util.testing as tm
from pandas.util.testing import assert_series_equal
from pandas.util.testing import assert_series_equal, assert_frame_equal


class TestSeriesCombine():
Expand Down Expand Up @@ -105,8 +105,8 @@ def test_combine_first(self):
assert_series_equal(s, result)

def test_update(self):
s = Series([1.5, nan, 3., 4., nan])
s2 = Series([nan, 3.5, nan, 5.])
s = Series([1.5, np.nan, 3., 4., np.nan])
s2 = Series([np.nan, 3.5, np.nan, 5.])
s.update(s2)

expected = Series([1.5, 3.5, 3., 5., np.nan])
Expand All @@ -116,8 +116,35 @@ def test_update(self):
df = DataFrame([{"a": 1}, {"a": 3, "b": 2}])
df['c'] = np.nan

# this will fail as long as series is a sub-class of ndarray
# df['c'].update(Series(['foo'],index=[0])) #####
df['c'].update(Series(['foo'], index=[0]))
expected = DataFrame([[1, np.nan, 'foo'], [3, 2., np.nan]],
columns=['a', 'b', 'c'])
assert_frame_equal(df, expected)

def test_update_dtypes(self):
s = Series([1., 2., False, True])

other = Series([45])
s.update(other)

expected = Series([45., 2., False, True])
assert_series_equal(s, expected)

s = Series([10, 11, 12])
other = Series([61, 63], index=[1, 3])
s.update(other)

expected = Series([10, 61, 12])
assert_series_equal(s, expected)

# we always try to keep original dtype, even if other has different one
s.update(other.astype(float))
assert_series_equal(s, expected)

# if keeping the dtype is not possible, we allow upcasting
s.update(other + 0.1)
expected = Series([10., 61.1, 12.])
assert_series_equal(s, expected)

def test_update_nooverwrite(self):
s = Series([0, 1, 2, np.nan, np.nan, 5, 6, np.nan])
Expand All @@ -129,6 +156,8 @@ def test_update_nooverwrite(self):
assert_series_equal(s, expected)

def test_update_filtered(self):
# on platforms whose default integer is 32-bit (e.g. Windows), np.arange
# yields int32 for small values, but the pandas default (e.g. for
# "expected" below) is int64
s = Series(np.arange(8), dtype='int64')
other = Series(np.arange(8), dtype='int64') + 10

Expand Down
0