8000 ENH: general concat with ExtensionArrays through find_common_type by jorisvandenbossche · Pull Request #33607 · pandas-dev/pandas · GitHub

ENH: general concat with ExtensionArrays through find_common_type #33607


Merged · 14 commits · May 2, 2020
Changes from 1 commit
handle sparse
jorisvandenbossche committed Apr 17, 2020
commit bb398e78df218e16704806cd28da883203908c30
22 changes: 1 addition & 21 deletions pandas/core/arrays/sparse/array.py
@@ -952,27 +952,7 @@ def copy(self):

     @classmethod
     def _concat_same_type(cls, to_concat):
-        fill_values = [x.fill_value for x in to_concat]
-
-        fill_value = fill_values[0]
-
-        # np.nan isn't a singleton, so we may end up with multiple
-        # NaNs here, so we ignore the all-NA case too.
-        if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
-            warnings.warn(
-                "Concatenating sparse arrays with multiple fill "
-                f"values: '{fill_values}'. Picking the first and "
-                "converting the rest.",
-                PerformanceWarning,
-                stacklevel=6,
-            )
-        keep = to_concat[0]
-        to_concat2 = [keep]
-
-        for arr in to_concat[1:]:
-            to_concat2.append(cls(np.asarray(arr), fill_value=fill_value))
-
-        to_concat = to_concat2
+        fill_value = to_concat[0].fill_value

         values = []
         length = 0
26 changes: 24 additions & 2 deletions pandas/core/arrays/sparse/dtype.py
@@ -1,11 +1,13 @@
 """Sparse Dtype"""

 import re
-from typing import TYPE_CHECKING, Any, Tuple, Type
+from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Type
+import warnings

 import numpy as np

-from pandas._typing import Dtype
+from pandas._typing import Dtype, DtypeObj
+from pandas.errors import PerformanceWarning

 from pandas.core.dtypes.base import ExtensionDtype
 from pandas.core.dtypes.cast import astype_nansafe
@@ -352,3 +354,23 @@ def _subtype_with_str(self):
         if isinstance(self.fill_value, str):
             return type(self.fill_value)
         return self.subtype
+
+    def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
+
+        fill_values = [x.fill_value for x in dtypes if isinstance(x, SparseDtype)]
+        fill_value = fill_values[0]
+
+        # np.nan isn't a singleton, so we may end up with multiple
+        # NaNs here, so we ignore the all-NA case too.
+        if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
+            warnings.warn(
+                "Concatenating sparse arrays with multiple fill "
+                f"values: '{fill_values}'. Picking the first and "
+                "converting the rest.",
+                PerformanceWarning,
+                stacklevel=6,
+            )
+
+        # TODO also handle non-numpy other dtypes
+        np_dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
+        return SparseDtype(np.find_common_type(np_dtypes, []), fill_value=fill_value)
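
With this hook in place, find_common_type can combine sparse dtypes without any special-casing in the concat code itself. A quick sketch of the behaviour this enables (outputs and warning text are illustrative, not part of the diff):

import numpy as np
import pandas as pd
from pandas.arrays import SparseArray

a = pd.Series(SparseArray([0, 1, 2], fill_value=0))        # Sparse[int64, 0]
b = pd.Series(SparseArray([0.0, 1.5], fill_value=np.nan))  # Sparse[float64, nan]

# np.find_common_type merges the subtypes (int64 + float64 -> float64);
# the differing fill values trigger the PerformanceWarning above and the
# first fill value (0) is kept.
result = pd.concat([a, b], ignore_index=True)
print(result.dtype)  # Sparse[float64, 0]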
1 change: 1 addition & 0 deletions pandas/core/dtypes/base.py
@@ -348,6 +348,7 @@ def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
         # - do we guarantee that `dtypes` is already deduplicated? (list of uniques)
         # - do we call this method if `len(dtypes) == 1`, or does this method
         #   need to handle that case
+        # - does this method need to handle "non-fully-initialized" dtypes?
         if len(set(dtypes)) == 1:
             # only itself
             return self
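
For third-party extension dtypes, the default shown above only covers the trivial all-same-dtype case; a subclass can override the hook (still named _get_common_type at this commit) to combine with dtypes it knows about. A hypothetical sketch — UnitDtype and its semantics are invented for illustration, and a real dtype would also need construct_array_type and an array class:

from typing import List, Optional

import numpy as np

from pandas._typing import DtypeObj
from pandas.api.extensions import ExtensionDtype


class UnitDtype(ExtensionDtype):
    """Toy float64-backed dtype that carries a measurement unit."""

    type = float

    def __init__(self, unit: str = "m"):
        self.unit = unit

    @property
    def name(self) -> str:
        return f"unit[{self.unit}]"

    def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
        # Combine with same-unit UnitDtypes and with plain float64;
        # returning None tells pandas to fall back to object dtype.
        if all(
            (isinstance(t, UnitDtype) and t.unit == self.unit)
            or t == np.dtype("float64")
            for t in dtypes
        ):
            return self
        return None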
83 changes: 27 additions & 56 deletions pandas/core/dtypes/concat.py
Expand Up @@ -4,6 +4,8 @@

import numpy as np

from pandas._typing import ArrayLike, DtypeObj

from pandas.core.dtypes.cast import find_common_type
from pandas.core.dtypes.common import (
is_bool_dtype,
Expand Down Expand Up @@ -62,6 +64,30 @@ def get_dtype_kinds(l):
return typs


+def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:

Contributor: why is this private? everything else is not, so this seems odd

Member Author (jorisvandenbossche): This is an internal helper function for this file, which we indicate by using a leading underscore (the other functions here are used elsewhere in the pandas codebase).

"""
Helper function for `arr.astype(common_type)` but handling all special
cases.
"""

Member: do you have a plan in mind to make this function unnecessary? it is special-casing pandas-internal EAs, which i think we want to avoid

(Not a blocker for this PR because AFAICT this is necessary and we do this in the status quo anyway, just curious)

Member Author: Yeah, this is indeed only needed to preserve value-specific behaviour, not because I like it :)

Ideally we get rid of this, but I am not sure that is entirely possible with the numpy integer dtypes, since whether int can be preserved will always depend on the presence of NaNs. The way to get consistent behaviour here is probably the nullable integer dtypes.
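
The value-dependent behaviour discussed above is easiest to see with concrete values (a sketch, not part of the diff; the exact exception message varies by pandas version):

import numpy as np
import pandas as pd

cat = pd.Categorical([1, 2, None])   # int64 categories, one missing value

try:
    cat.astype(np.dtype("int64"))    # NaN cannot become an integer...
except ValueError:
    result = cat.astype(object)      # ...so the helper falls back to object
print(result)                        # object ndarray: [1, 2, nan]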

+    if (
+        is_categorical_dtype(arr.dtype)
+        and isinstance(dtype, np.dtype)
+        and np.issubdtype(dtype, np.integer)
+    ):
+        # problem case: categorical of int -> gives int as result dtype,
+        # but categorical can contain NAs -> fall back to object dtype
+        try:
+            return arr.astype(dtype, copy=False)
+        except ValueError:
+            return arr.astype(object, copy=False)
+
+    if is_extension_array_dtype(dtype):
+        if isinstance(arr, np.ndarray):
+            # numpy's astype cannot handle ExtensionDtypes
+            return array(arr, dtype=dtype, copy=False)
+    return arr.astype(dtype, copy=False)
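
Hypothetical direct calls, only to make the helper's contract concrete (it is file-private at this commit, not public API):

import numpy as np
import pandas as pd
from pandas.core.dtypes.concat import _cast_to_common_type

# ndarray -> extension target: numpy's astype can't produce an EA,
# so the helper routes through pd.array instead.
ea = _cast_to_common_type(np.array([1, 2, 3]), pd.Int64Dtype())
print(type(ea).__name__)  # IntegerArray

# everything else is a plain astype with copy=False
f8 = _cast_to_common_type(np.array([1, 2, 3]), np.dtype("float64"))
print(f8.dtype)  # float64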


 def concat_compat(to_concat, axis: int = 0):
     """
     provide concatenation of an array of arrays each of which is a single
@@ -106,27 +132,7 @@ def is_nonempty(x) -> bool:
     if any_ea and axis == 0:
         if not single_dtype:
             target_dtype = find_common_type([x.dtype for x in to_concat])
-
-            def cast(arr, dtype):
-                if (
-                    is_categorical_dtype(arr.dtype)
-                    and isinstance(dtype, np.dtype)
-                    and np.issubdtype(dtype, np.integer)
-                ):
-                    # problem case: categorical of int -> gives int as result dtype,
-                    # but categorical can contain NAs -> fall back to object dtype
-                    try:
-                        return arr.astype(dtype, copy=False)
-                    except ValueError:
-                        return arr.astype(object, copy=False)
-
-                if is_extension_array_dtype(dtype):
-                    if isinstance(arr, np.ndarray):
-                        # numpy's astype cannot handle ExtensionDtypes
-                        return array(arr, dtype=dtype, copy=False)
-                return arr.astype(dtype, copy=False)
-
-            to_concat = [cast(arr, target_dtype) for arr in to_concat]
+            to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat]

         if isinstance(to_concat[0], ExtensionArray):
             cls = type(to_concat[0])
@@ -137,10 +143,6 @@ def cast(arr, dtype):
     elif _contains_datetime or "timedelta" in typs or _contains_period:
         return concat_datetime(to_concat, axis=axis, typs=typs)

Member: if we do the DTA/TDA casting above, and do isinstance(obj, ExtensionArray) checks, can all of the dt64/td64 cases be handled by the EA code above?

Member Author: I don't think so, because they are not using ExtensionDtype.
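
The point is easy to verify (a sketch, not part of the diff): plain datetime64/timedelta64 data carries a numpy dtype, so dtype-based extension checks do not catch it and the separate concat_datetime branch stays necessary.

import numpy as np
from pandas.api.types import is_extension_array_dtype

for d in (np.dtype("M8[ns]"), np.dtype("m8[ns]")):
    # numpy dtypes, not ExtensionDtype instances
    print(d, isinstance(d, np.dtype), is_extension_array_dtype(d))
# datetime64[ns] True False
# timedelta64[ns] True False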


-    # these are mandated to handle empties as well
-    elif "sparse" in typs:
-        return _concat_sparse(to_concat, axis=axis, typs=typs)
-
     elif any_ea and axis == 1:
         to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat]
         return np.concatenate(to_concat, axis=axis)
@@ -394,34 +396,3 @@ def _wrap_datetimelike(arr):
     if isinstance(arr, np.ndarray) and arr.dtype.kind in ["m", "M"]:
         arr = pd_array(arr)
     return arr
-
-
-def _concat_sparse(to_concat, axis=0, typs=None):
-    """
-    provide concatenation of an sparse/dense array of arrays each of which is a
-    single dtype
-
-    Parameters
-    ----------
-    to_concat : array of arrays
-    axis : axis to provide concatenation
-    typs : set of to_concat dtypes
-
-    Returns
-    -------
-    a single array, preserving the combined dtypes
-    """
-    from pandas.core.arrays import SparseArray
-
-    fill_values = [x.fill_value for x in to_concat if isinstance(x, SparseArray)]
-    fill_value = fill_values[0]
-
-    # TODO: Fix join unit generation so we aren't passed this.
-    to_concat = [
-        x
-        if isinstance(x, SparseArray)
-        else SparseArray(x.squeeze(), fill_value=fill_value)
-        for x in to_concat
-    ]
-
-    return SparseArray._concat_same_type(to_concat)
2 changes: 1 addition & 1 deletion pandas/tests/internals/test_internals.py
@@ -584,7 +584,7 @@ def test_interleave_dtype(self, mgr_string, dtype):
         mgr = create_mgr("a: complex")
         assert mgr.as_array().dtype == "complex"
         mgr = create_mgr("a: f8; b: category")
-        assert mgr.as_array().dtype == "object"
+        assert mgr.as_array().dtype == "f8"
         mgr = create_mgr("a: M8[ns]; b: category")
         assert mgr.as_array().dtype == "object"
         mgr = create_mgr("a: M8[ns]; b: bool")
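
The updated assertion captures the user-visible change: a float column interleaved with an integer-backed categorical now yields the common numeric dtype instead of object. A sketch, assuming numeric categories as in the test fixture:

import pandas as pd

df = pd.DataFrame({"a": [1.5, 2.5], "b": pd.Categorical([1, 2])})
# Before this PR: object; after: the int64 categories combine with the
# float64 column to give a float64 interleaved array.
print(df.to_numpy().dtype)  # float64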