10000 ExtensionArray.take default implementation by TomAugspurger · Pull Request #20814 · pandas-dev/pandas · GitHub
[go: up one dir, main page]

Skip to content

ExtensionArray.take default implementation #20814

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 32 commits into from
Apr 27, 2018
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
c449afd
Bounds checking
TomAugspurger Apr 25, 2018
82cad8b
Stale comment
TomAugspurger Apr 25, 2018
d5470a0
Fixed reorder
TomAugspurger Apr 25, 2018
bbcbf19
Cleanup
TomAugspurger Apr 25, 2018
1a4d987
Pass an array
TomAugspurger Apr 25, 2018
fc729d6
Fixed editor
TomAugspurger Apr 25, 2018
74b2c09
Added verisonadded
TomAugspurger Apr 26, 2018
5db6624
Doc and move tests
TomAugspurger Apr 26, 2018
741f284
Merge remote-tracking branch 'upstream/master' into ea-take
TomAugspurger Apr 26, 2018
fbc4425
Updates
TomAugspurger Apr 26, 2018
f3b91ca
doc fixup
TomAugspurger Apr 26, 2018
eecd632
Skip categorical take tests
TomAugspurger Apr 26, 2018
9a6c7d4
Doc updates
TomAugspurger Apr 27, 2018
7c4f625
Merge remote-tracking branch 'upstream/master' into ea-take
TomAugspurger Apr 27, 2018
eb43fa4
Really truly fix it hopefully.
TomAugspurger Apr 27, 2018
6858409
Added note
TomAugspurger Apr 27, 2018
ec0cecd
Updates
TomAugspurger Apr 27, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
More internals hacking
  • Loading branch information
TomAugspurger committed Apr 25, 2018
commit eba137f8c7ae45288200566aebbf32229c69720c
3 changes: 1 addition & 2 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,8 +537,7 @@ def _values_for_take(self):
@Appender(_take_docstring)
def take(self, indexer, fill_value=_default_fill_value):
# type: (Sequence[int], Optional[Any]) -> ExtensionArray
if fill_value is np.nan:
import pdb; pdb.set_trace()
# assert fill_value is not np.nan
from pandas.core.algorithms import take

data = self._values_for_take()
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/dtypes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class _DtypeOpsMixin(object):

# na_value is the default NA value to use for this type. This is used in
# e.g. ExtensionArray.take.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the "used in EA.take" is misleading. In many cases, it will not be used in take, as this na_value is your "boxed" scalar NA type, not the underlying physical NA value.

na_value = np.nan
na_value = np.nan # TODO: change to _na_value
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove the TODO?


def __eq__(self, other):
"""Check whether 'other' is equal to self.
Expand Down
51 changes: 48 additions & 3 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import warnings

import copy
from warnings import catch_warnings
import inspect
Expand Down Expand Up @@ -82,7 +83,7 @@
from pandas.util._decorators import cache_readonly
from pandas.util._validators import validate_bool_kwarg
from pandas import compat
from pandas.compat import range, map, zip, u
from pandas.compat import range, map, zip, u, _default_fill_value


class Block(PandasObject):
Expand Down Expand Up @@ -1888,6 +1889,10 @@ def _holder(self):
# For extension blocks, the holder is values-dependent.
return type(self.values)

@property
def fill_value(self):
return self.values.dtype.na_value # TODO: change to _na_value

@property
def _can_hold_na(self):
# The default ExtensionArray._can_hold_na is True
Expand Down Expand Up @@ -4386,6 +4391,8 @@ def reindex_indexer(self, new_axis, indexer, axis, fill_value=None,

pandas-indexer with -1's only.
"""
# TODO: see if we can make fill_value be {col -> fill_value}
# maybe earlier...
if indexer is None:
if new_axis is self.axes[axis] and not copy:
return self
Expand All @@ -4408,8 +4415,10 @@ def reindex_indexer(self, new_axis, indexer, axis, fill_value=None,
new_blocks = self._slice_take_blocks_ax0(indexer,
fill_tuple=(fill_value,))
else:
if fill_value is None:
fill_value = _default_fill_value
new_blocks = [blk.take_nd(indexer, axis=axis, fill_tuple=(
fill_value if fill_value is not None else blk.fill_value,))
fill_value if fill_value is not _default_fill_value else blk.fill_value,))
for blk in self.blocks]

new_axes = list(self.axes)
Expand All @@ -4436,6 +4445,9 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None):
if self._is_single_block:
blk = self.blocks[0]

if allow_fill and fill_tuple[0] is _default_fill_value:
fill_tuple = (blk.fill_value,)

if sl_type in ('slice', 'mask'):
return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))]
elif not allow_fill or self.ndim == 1:
Expand Down Expand Up @@ -5404,6 +5416,25 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):
elif is_uniform_join_units(join_units):
b = join_units[0].block.concat_same_type(
[ju.block for ju in join_units], placement=placement)
elif is_uniform_reindexer(join_units):
old_block = join_units[0].block

new_values = concatenate_join_units(join_units, concat_axis,
copy=copy)
if new_values.ndim == 2:
# XXX: categorical returns a categorical here
# EA returns a 2d ndarray
# need to harmoinze these to always be EAs?
assert new_values.shape[0] == 1
new_values = new_values[0]

assert isinstance(old_block._holder, ABCExtensionArray)

b = old_block.make_block_same_class(
old_block._holder._from_sequence(new_values),
placement=placement
)

else:
b = make_block(
concatenate_join_units(join_units, concat_axis, copy=copy),
Expand Down Expand Up @@ -5434,6 +5465,13 @@ def is_uniform_join_units(join_units):
len(join_units) > 1)


def is_uniform_reindexer(join_units):
# For when we know we can reindex without changing type
return (
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a docstring to indicate what you mean with "uniform reindex"

I understand it as "this are uniform blocks that will keep being uniform after reindexing", correct?
I am only wondering, does that not depend on the fill_value used? Or is this code path only taken in case of default filling (I don't think you can specify the filling with things like concat and merge, as opposed to reindex itself)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this are uniform blocks that will keep being uniform after reindexing

Exactly. That's the intent anyway...

It probably does depend on the fill_value. I need to better understand get_empty_dtype_and_na

all(ju.block and ju.block.is_extension for ju in join_units)
)


def get_empty_dtype_and_na(join_units):
"""
Return dtype and N/A values to use when concatenating specified units.
Expand Down Expand Up @@ -5461,12 +5499,15 @@ def get_empty_dtype_and_na(join_units):

upcast_classes = defaultdict(list)
null_upcast_classes = defaultdict(list)

for dtype, unit in zip(dtypes, join_units):
if dtype is None:
continue

if is_categorical_dtype(dtype):
upcast_cls = 'category'
elif is_extension_array_dtype(dtype):
upcast_cls = 'extension'
elif is_datetimetz(dtype):
upcast_cls = 'datetimetz'
elif issubclass(dtype.type, np.bool_):
Expand Down Expand Up @@ -5496,6 +5537,8 @@ def get_empty_dtype_and_na(join_units):
# create the result
77D0 if 'object' in upcast_classes:
return np.dtype(np.object_), np.nan
elif 'extension' in upcast_classes:
return np.dtype(np.object_), None
elif 'bool' in upcast_classes:
if has_none_blocks:
return np.dtype(np.object_), np.nan
Expand Down Expand Up @@ -5755,7 +5798,9 @@ def dtype(self):
if self.block is None:
raise AssertionError("Block is None, no dtype")

if not self.needs_filling:
if not self.needs_filling or self.block.is_extension:
# ExtensionDtypes by definition can hold their
# NA value.
return self.block.dtype
else:
return _get_dtype(maybe_promote(self.block.dtype,
Expand Down
0