More internals hacking

pandas-dev · jorisvandenbossche · Apr 27, 2018 · Apr 24, 2018 · Apr 24, 2018 · Apr 24, 2018
commit eba137f8c7ae45288200566aebbf32229c69720c
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -537,8 +537,7 @@ def _values_for_take(self):
     @Appender(_take_docstring)
     def take(self, indexer, fill_value=_default_fill_value):
         # type: (Sequence[int], Optional[Any]) -> ExtensionArray
-        if fill_value is np.nan:
-            import pdb; pdb.set_trace()
+        # assert fill_value is not np.nan
         from pandas.core.algorithms import take
 
         data = self._values_for_take()

diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py
@@ -18,7 +18,7 @@ class _DtypeOpsMixin(object):
 
     # na_value is the default NA value to use for this type. This is used in
     # e.g. ExtensionArray.take.
-    na_value = np.nan
+    na_value = np.nan  # TODO: change to _na_value
 
     def __eq__(self, other):
         """Check whether 'other' is equal to self.

diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -1,4 +1,5 @@
 import warnings
+
 import copy
 from warnings import catch_warnings
 import inspect
@@ -82,7 +83,7 @@
 from pandas.util._decorators import cache_readonly
 from pandas.util._validators import validate_bool_kwarg
 from pandas import compat
-from pandas.compat import range, map, zip, u
+from pandas.compat import range, map, zip, u, _default_fill_value
 
 
 class Block(PandasObject):
@@ -1888,6 +1889,10 @@ def _holder(self):
         # For extension blocks, the holder is values-dependent.
         return type(self.values)
 
+    @property
+    def fill_value(self):
+        return self.values.dtype.na_value  # TODO: change to _na_value
+
     @property
     def _can_hold_na(self):
         # The default ExtensionArray._can_hold_na is True
@@ -4386,6 +4391,8 @@ def reindex_indexer(self, new_axis, indexer, axis, fill_value=None,
 
         pandas-indexer with -1's only.
         """
+        # TODO: see if we can make fill_value be {col -> fill_value}
+        # maybe earlier...
         if indexer is None:
             if new_axis is self.axes[axis] and not copy:
                 return self
@@ -4408,8 +4415,10 @@ def reindex_indexer(self, new_axis, indexer, axis, fill_value=None,
             new_blocks = self._slice_take_blocks_ax0(indexer,
                                                      fill_tuple=(fill_value,))
         else:
+            if fill_value is None:
+                fill_value = _default_fill_value
             new_blocks = [blk.take_nd(indexer, axis=axis, fill_tuple=(
-                fill_value if fill_value is not None else blk.fill_value,))
+                fill_value if fill_value is not _default_fill_value else blk.fill_value,))
                 for blk in self.blocks]
 
         new_axes = list(self.axes)
@@ -4436,6 +4445,9 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None):
         if self._is_single_block:
             blk = self.blocks[0]
 
+            if allow_fill and fill_tuple[0] is _default_fill_value:
+                fill_tuple = (blk.fill_value,)
+
             if sl_type in ('slice', 'mask'):
                 return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))]
             elif not allow_fill or self.ndim == 1:
@@ -5404,6 +5416,25 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):
         elif is_uniform_join_units(join_units):
             b = join_units[0].block.concat_same_type(
                 [ju.block for ju in join_units], placement=placement)
+        elif is_uniform_reindexer(join_units):
+            old_block = join_units[0].block
+
+            new_values = concatenate_join_units(join_units, concat_axis,
+                                                copy=copy)
+            if new_values.ndim == 2:
+                # XXX: categorical returns a categorical here
+                # EA returns a 2d ndarray
+                # need to harmoinze these to always be EAs?
+                assert new_values.shape[0] == 1
+                new_values = new_values[0]
+
+            assert isinstance(old_block._holder, ABCExtensionArray)
+
+            b = old_block.make_block_same_class(
+                old_block._holder._from_sequence(new_values),
+                placement=placement
+            )
+
         else:
             b = make_block(
                 concatenate_join_units(join_units, concat_axis, copy=copy),
@@ -5434,6 +5465,13 @@ def is_uniform_join_units(join_units):
         len(join_units) > 1)
 
 
+def is_uniform_reindexer(join_units):
+    # For when we know we can reindex without changing type
+    return (
+        all(ju.block and ju.block.is_extension for ju in join_units)
+    )
+
+
 def get_empty_dtype_and_na(join_units):
     """
     Return dtype and N/A values to use when concatenating specified units.
@@ -5461,12 +5499,15 @@ def get_empty_dtype_and_na(join_units):
 
     upcast_classes = defaultdict(list)
     null_upcast_classes = defaultdict(list)
+
     for dtype, unit in zip(dtypes, join_units):
         if dtype is None:
             continue
 
         if is_categorical_dtype(dtype):
             upcast_cls = 'category'
+        elif is_extension_array_dtype(dtype):
+            upcast_cls = 'extension'
         elif is_datetimetz(dtype):
             upcast_cls = 'datetimetz'
         elif issubclass(dtype.type, np.bool_):
@@ -5496,6 +5537,8 @@ def get_empty_dtype_and_na(join_units):
     # create the result
     if 'object' in upcast_classes:
         return np.dtype(np.object_), np.nan
+    elif 'extension' in upcast_classes:
+        return np.dtype(np.object_), None
     elif 'bool' in upcast_classes:
         if has_none_blocks:
             return np.dtype(np.object_), np.nan
@@ -5755,7 +5798,9 @@ def dtype(self):
         if self.block is None:
             raise AssertionError("Block is None, no dtype")
 
-        if not self.needs_filling:
+        if not self.needs_filling or self.block.is_extension:
+            # ExtensionDtypes by definition can hold their
+            # NA value.
             return self.block.dtype
         else:
             return _get_dtype(maybe_promote(self.block.dtype,