-
-
Notifications
You must be signed in to change notification settings - Fork 18.7k
ENH: general concat with ExtensionArrays through find_common_type #33607
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
3464e95
b1d9d68
bb398e7
83fdc91
7f2ac2a
d0f90de
2d5fcb0
a68206b
fc98b65
b072591
91c984a
2a2b9d5
8893165
e19e3ef
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,8 @@ | |
|
||
import numpy as np | ||
|
||
from pandas._typing import ArrayLike, DtypeObj | ||
|
||
from pandas.core.dtypes.cast import find_common_type | ||
from pandas.core.dtypes.common import ( | ||
is_bool_dtype, | ||
|
@@ -62,6 +64,30 @@ def get_dtype_kinds(l): | |
return typs | ||
|
||
|
||
def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: | ||
""" | ||
Helper function for `arr.astype(common_type)` but handling all special | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
cases. | ||
""" | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you have a plan in mind to make this function unnecessary? It is special-casing pandas-internal EAs, which I think we want to avoid. (Not a blocker for this PR because AFAICT this is necessary and we do this in the status quo anyway, just curious.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, so this is indeed only needed to preserve value-based specific behaviour, not because I like this :) Ideally we get rid of this, but I am not sure it is entirely possible with the numpy integer dtypes, since it will always depend on the presence of NaNs whether it can preserve int or not. The way to go here is probably the nullable integer dtypes to have consistent behaviour. |
||
if ( | ||
is_categorical_dtype(arr.dtype) | ||
and isinstance(dtype, np.dtype) | ||
and np.issubdtype(dtype, np.integer) | ||
): | ||
# problem case: categorical of int -> gives int as result dtype, | ||
# but categorical can contain NAs -> fall back to object dtype | ||
try: | ||
return arr.astype(dtype, copy=False) | ||
except ValueError: | ||
return arr.astype(object, copy=False) | ||
|
||
if is_extension_array_dtype(dtype): | ||
if isinstance(arr, np.ndarray): | ||
# numpy's astype cannot handle ExtensionDtypes | ||
return array(arr, dtype=dtype, copy=False) | ||
return arr.astype(dtype, copy=False) | ||
|
||
|
||
def concat_compat(to_concat, axis: int = 0): | ||
""" | ||
provide concatenation of an array of arrays each of which is a single | ||
|
@@ -106,27 +132,7 @@ def is_nonempty(x) -> bool: | |
if any_ea and axis == 0: | ||
if not single_dtype: | ||
target_dtype = find_common_type([x.dtype for x in to_concat]) | ||
|
||
def cast(arr, dtype): | ||
if ( | ||
is_categorical_dtype(arr.dtype) | ||
and isinstance(dtype, np.dtype) | ||
and np.issubdtype(dtype, np.integer) | ||
): | ||
# problem case: categorical of int -> gives int as result dtype, | ||
# but categorical can contain NAs -> fall back to object dtype | ||
try: | ||
return arr.astype(dtype, copy=False) | ||
except ValueError: | ||
return arr.astype(object, copy=False) | ||
|
||
if is_extension_array_dtype(dtype): | ||
if isinstance(arr, np.ndarray): | ||
# numpy's astype cannot handle ExtensionDtypes | ||
return array(arr, dtype=dtype, copy=False) | ||
return arr.astype(dtype, copy=False) | ||
|
||
to_concat = [cast(arr, target_dtype) for arr in to_concat] | ||
to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] | ||
|
||
if isinstance(to_concat[0], ExtensionArray): | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
cls = type(to_concat[0]) | ||
|
@@ -137,10 +143,6 @@ def cast(arr, dtype): | |
elif _contains_datetime or "timedelta" in typs or _contains_period: | ||
return concat_datetime(to_concat, axis=axis, typs=typs) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we do the DTA/TDA casting above, and do […]
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think so, because they are not using ExtensionDtype. |
||
|
||
# these are mandated to handle empties as well | ||
elif "sparse" in typs: | ||
return _concat_sparse(to_concat, axis=axis, typs=typs) | ||
|
||
elif any_ea and axis == 1: | ||
to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat] | ||
return np.concatenate(to_concat, axis=axis) | ||
|
@@ -394,34 +396,3 @@ def _wrap_datetimelike(arr): | |
if isinstance(arr, np.ndarray) and arr.dtype.kind in ["m", "M"]: | ||
arr = pd_array(arr) | ||
return arr | ||
|
||
|
||
def _concat_sparse(to_concat, axis=0, typs=None): | ||
""" | ||
provide concatenation of an sparse/dense array of arrays each of which is a | ||
single dtype | ||
|
||
Parameters | ||
---------- | ||
to_concat : array of arrays | ||
axis : axis to provide concatenation | ||
typs : set of to_concat dtypes | ||
|
||
Returns | ||
------- | ||
a single array, preserving the combined dtypes | ||
""" | ||
from pandas.core.arrays import SparseArray | ||
|
||
fill_values = [x.fill_value for x in to_concat if isinstance(x, SparseArray)] | ||
fill_value = fill_values[0] | ||
|
||
# TODO: Fix join unit generation so we aren't passed this. | ||
to_concat = [ | ||
x | ||
if isinstance(x, SparseArray) | ||
else SparseArray(x.squeeze(), fill_value=fill_value) | ||
for x in to_concat | ||
] | ||
|
||
return SparseArray._concat_same_type(to_concat) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is this private? Everything else is not, so this seems odd.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is an internal helper function for this file, which we indicate by using a leading underscore (the other functions here are used elsewhere in the pandas codebase).