8000 ENH: general concat with ExtensionArrays through find_common_type by jorisvandenbossche · Pull Request #33607 · pandas-dev/pandas · GitHub

ENH: general concat with ExtensionArrays through find_common_type #33607


Merged · 14 commits · May 2, 2020
Changes from 1 commit
handle sparse
jorisvandenbossche committed Apr 17, 2020
commit bb398e78df218e16704806cd28da883203908c30
22 changes: 1 addition & 21 deletions pandas/core/arrays/sparse/array.py
@@ -952,27 +952,7 @@ def copy(self):

     @classmethod
     def _concat_same_type(cls, to_concat):
-        fill_values = [x.fill_value for x in to_concat]
-
-        fill_value = fill_values[0]
-
-        # np.nan isn't a singleton, so we may end up with multiple
-        # NaNs here, so we ignore the all-NA case too.
-        if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
-            warnings.warn(
-                "Concatenating sparse arrays with multiple fill "
-                f"values: '{fill_values}'. Picking the first and "
-                "converting the rest.",
-                PerformanceWarning,
-                stacklevel=6,
-            )
-        keep = to_concat[0]
-        to_concat2 = [keep]
-
-        for arr in to_concat[1:]:
-            to_concat2.append(cls(np.asarray(arr), fill_value=fill_value))
-
-        to_concat = to_concat2
+        fill_value = to_concat[0].fill_value

         values = []
         length = 0
26 changes: 24 additions & 2 deletions pandas/core/arrays/sparse/dtype.py
@@ -1,11 +1,13 @@
 """Sparse Dtype"""

 import re
-from typing import TYPE_CHECKING, Any, Tuple, Type
+from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Type
+import warnings

 import numpy as np

-from pandas._typing import Dtype
+from pandas._typing import Dtype, DtypeObj
+from pandas.errors import PerformanceWarning

 from pandas.core.dtypes.base import ExtensionDtype
 from pandas.core.dtypes.cast import astype_nansafe
@@ -352,3 +354,23 @@ def _subtype_with_str(self):
         if isinstance(self.fill_value, str):
             return type(self.fill_value)
         return self.subtype
+
+    def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
+
+        fill_values = [x.fill_value for x in dtypes if isinstance(x, SparseDtype)]
+        fill_value = fill_values[0]
+
+        # np.nan isn't a singleton, so we may end up with multiple
+        # NaNs here, so we ignore the all-NA case too.
+        if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
+            warnings.warn(
+                "Concatenating sparse arrays with multiple fill "
+                f"values: '{fill_values}'. Picking the first and "
+                "converting the rest.",
+                PerformanceWarning,
+                stacklevel=6,
+            )
+
+        # TODO also handle non-numpy other dtypes
+        np_dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
+        return SparseDtype(np.find_common_type(np_dtypes, []), fill_value=fill_value)
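
With this hook in place, find_common_type can combine sparse dtypes without any special-casing in the concat code itself. A quick sketch of the behaviour this enables (outputs and warning text are illustrative, not part of the diff):

import numpy as np
import pandas as pd
from pandas.arrays import SparseArray

a = pd.Series(SparseArray([0, 1, 2], fill_value=0))        # Sparse[int64, 0]
b = pd.Series(SparseArray([0.0, 1.5], fill_value=np.nan))  # Sparse[float64, nan]

# np.find_common_type merges the subtypes (int64 + float64 -> float64);
# the differing fill values trigger the PerformanceWarning above and the
# first fill value (0) is kept.
result = pd.concat([a, b], ignore_index=True)
print(result.dtype)  # Sparse[float64, 0]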
1 change: 1 addition & 0 deletions pandas/core/dtypes/base.py
@@ -348,6 +348,7 @@ def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
         # - do we guarantee that `dtypes` is already deduplicated? (list of uniques)
         # - do we call this method if `len(dtypes) == 1`, or does this method
         #   need to handle that case
+        # - does this method need to handle "non-fully-initialized" dtypes?
         if len(set(dtypes)) == 1:
             # only itself
             return self
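
For third-party extension dtypes, the default shown above only covers the trivial all-same-dtype case; a subclass can override the hook (still named _get_common_type at this commit) to combine with dtypes it knows about. A hypothetical sketch — UnitDtype and its semantics are invented for illustration, and a real dtype would also need construct_array_type and an array class:

from typing import List, Optional

import numpy as np

from pandas._typing import DtypeObj
from pandas.api.extensions import ExtensionDtype


class UnitDtype(ExtensionDtype):
    """Toy float64-backed dtype that carries a measurement unit."""

    type = float

    def __init__(self, unit: str = "m"):
        self.unit = unit

    @property
    def name(self) -> str:
        return f"unit[{self.unit}]"

    def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
        # Combine with same-unit UnitDtypes and with plain float64;
        # returning None tells pandas to fall back to object dtype.
        if all(
            (isinstance(t, UnitDtype) and t.unit == self.unit)
            or t == np.dtype("float64")
            for t in dtypes
        ):
            return self
        return None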
83 changes: 27 additions & 56 deletions pandas/core/dtypes/concat.py
Expand Up @@ -4,6 +4,8 @@

import numpy as np

from pandas._typing import ArrayLike, DtypeObj

from pandas.core.dtypes.cast import find_common_type
from pandas.core.dtypes.common import (
is_bool_dtype,
Expand Down Expand Up @@ -62,6 +64,30 @@ def get_dtype_kinds(l):
return typs


+def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:

Contributor: why is this private? everything else is not, so this seems odd

Member Author (jorisvandenbossche): This is an internal helper function for this file, which we indicate by using a leading underscore (the other functions here are used elsewhere in the pandas codebase).

"""
Helper function for `arr.astype(common_type)` but handling all special
cases.
"""

Member: do you have a plan in mind to make this function unnecessary? it is special-casing pandas-internal EAs, which i think we want to avoid

(Not a blocker for this PR because AFAICT this is necessary and we do this in the status quo anyway, just curious)

Member Author: Yeah, this is indeed only needed to preserve value-specific behaviour, not because I like it :)

Ideally we get rid of this, but I am not sure that is entirely possible with the numpy integer dtypes, since whether int can be preserved will always depend on the presence of NaNs. The way to get consistent behaviour here is probably the nullable integer dtypes.
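
The value-dependent behaviour discussed above is easiest to see with concrete values (a sketch, not part of the diff; the exact exception message varies by pandas version):

import numpy as np
import pandas as pd

cat = pd.Categorical([1, 2, None])   # int64 categories, one missing value

try:
    cat.astype(np.dtype("int64"))    # NaN cannot become an integer...
except ValueError:
    result = cat.astype(object)      # ...so the helper falls back to object
print(result)                        # object ndarray: [1, 2, nan]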

+    if (
+        is_categorical_dtype(arr.dtype)
+        and isinstance(dtype, np.dtype)
+        and np.issubdtype(dtype, np.integer)
+    ):
+        # problem case: categorical of int -> gives int as result dtype,
+        # but categorical can contain NAs -> fall back to object dtype
+        try:
+            return arr.astype(dtype, copy=False)
+        except ValueError:
+            return arr.astype(object, copy=False)
+
+    if is_extension_array_dtype(dtype):
+        if isinstance(arr, np.ndarray):
+            # numpy's astype cannot handle ExtensionDtypes
+            return array(arr, dtype=dtype, copy=False)
+    return arr.astype(dtype, copy=False)
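
Hypothetical direct calls, only to make the helper's contract concrete (it is file-private at this commit, not public API):

import numpy as np
import pandas as pd
from pandas.core.dtypes.concat import _cast_to_common_type

# ndarray -> extension target: numpy's astype can't produce an EA,
# so the helper routes through pd.array instead.
ea = _cast_to_common_type(np.array([1, 2, 3]), pd.Int64Dtype())
print(type(ea).__name__)  # IntegerArray

# everything else is a plain astype with copy=False
f8 = _cast_to_common_type(np.array([1, 2, 3]), np.dtype("float64"))
print(f8.dtype)  # float64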


 def concat_compat(to_concat, axis: int = 0):
     """
     provide concatenation of an array of arrays each of which is a single
@@ -106,27 +132,7 @@ def is_nonempty(x) -> bool:
     if any_ea and axis == 0:
         if not single_dtype:
             target_dtype = find_common_type([x.dtype for x in to_concat])
-
-            def cast(arr, dtype):
-                if (
-                    is_categorical_dtype(arr.dtype)
-                    and isinstance(dtype, np.dtype)
-                    and np.issubdtype(dtype, np.integer)
-                ):
-                    # problem case: categorical of int -> gives int as result dtype,
-                    # but categorical can contain NAs -> fall back to object dtype
-                    try:
-                        return arr.astype(dtype, copy=False)
-                    except ValueError:
-                        return arr.astype(object, copy=False)
-
-                if is_extension_array_dtype(dtype):
-                    if isinstance(arr, np.ndarray):
-                        # numpy's astype cannot handle ExtensionDtypes
-                        return array(arr, dtype=dtype, copy=False)
-                return arr.astype(dtype, copy=False)
-
-            to_concat = [cast(arr, target_dtype) for arr in to_concat]
+            to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat]

         if isinstance(to_concat[0], ExtensionArray):
             cls = type(to_concat[0])
@@ -137,10 +143,6 @@ def cast(arr, dtype):
     elif _contains_datetime or "timedelta" in typs or _contains_period:
         return concat_datetime(to_concat, axis=axis, typs=typs)

Member: if we do the DTA/TDA casting above, and do isinstance(obj, ExtensionArray) checks, can all of the dt64/td64 cases be handled by the EA code above?

Member Author: I don't think so, because they are not using ExtensionDtype.
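
The point is easy to verify (a sketch, not part of the diff): plain datetime64/timedelta64 data carries a numpy dtype, so dtype-based extension checks do not catch it and the separate concat_datetime branch stays necessary.

import numpy as np
from pandas.api.types import is_extension_array_dtype

for d in (np.dtype("M8[ns]"), np.dtype("m8[ns]")):
    # numpy dtypes, not ExtensionDtype instances
    print(d, isinstance(d, np.dtype), is_extension_array_dtype(d))
# datetime64[ns] True False
# timedelta64[ns] True False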


-    # these are mandated to handle empties as well
-    elif "sparse" in typs:
-        return _concat_sparse(to_concat, axis=axis, typs=typs)
-
     elif any_ea and axis == 1:
         to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat]
         return np.concatenate(to_concat, axis=axis)
@@ -394,34 +396,3 @@ def _wrap_datetimelike(arr):
     if isinstance(arr, np.ndarray) and arr.dtype.kind in ["m", "M"]:
         arr = pd_array(arr)
     return arr
-
-
-def _concat_sparse(to_concat, axis=0, typs=None):
-    """
-    provide concatenation of an sparse/dense array of arrays each of which is a
-    single dtype
-
-    Parameters
-    ----------
-    to_concat : array of arrays
-    axis : axis to provide concatenation
-    typs : set of to_concat dtypes
-
-    Returns
-    -------
-    a single array, preserving the combined dtypes
-    """
-    from pandas.core.arrays import SparseArray
-
-    fill_values = [x.fill_value for x in to_concat if isinstance(x, SparseArray)]
-    fill_value = fill_values[0]
-
-    # TODO: Fix join unit generation so we aren't passed this.
-    to_concat = [
-        x
-        if isinstance(x, SparseArray)
-        else SparseArray(x.squeeze(), fill_value=fill_value)
-        for x in to_concat
-    ]
-
-    return SparseArray._concat_same_type(to_concat)
2 changes: 1 addition & 1 deletion pandas/tests/internals/test_internals.py
@@ -584,7 +584,7 @@ def test_interleave_dtype(self, mgr_string, dtype):
         mgr = create_mgr("a: complex")
         assert mgr.as_array().dtype == "complex"
         mgr = create_mgr("a: f8; b: category")
-        assert mgr.as_array().dtype == "object"
+        assert mgr.as_array().dtype == "f8"
         mgr = create_mgr("a: M8[ns]; b: category")
         assert mgr.as_array().dtype == "object"
         mgr = create_mgr("a: M8[ns]; b: bool")
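
The updated assertion captures the user-visible change: a float column interleaved with an integer-backed categorical now yields the common numeric dtype instead of object. A sketch, assuming numeric categories as in the test fixture:

import pandas as pd

df = pd.DataFrame({"a": [1.5, 2.5], "b": pd.Categorical([1, 2])})
# Before this PR: object; after: the int64 categories combine with the
# float64 column to give a float64 interleaved array.
print(df.to_numpy().dtype)  # float64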