BUG: fix concat of Sparse with non-sparse dtypes by jorisvandenbossche · Pull Request #34338 · pandas-dev/pandas · GitHub
[go: up one dir, main page]

Skip to content

BUG: fix concat of Sparse with non-sparse dtypes #34338

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fix _get_common_dtype to preserve sparse/categorical behaviour
  • Loading branch information
jorisvandenbossche committed May 23, 2020
commit 29e63a562ce74ffe2187ed57ae673b30ab5a8c5f
3 changes: 2 additions & 1 deletion pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -1063,7 +1063,8 @@ def astype(self, dtype=None, copy=True):
"""
dtype = self.dtype.update_dtype(dtype)
subtype = dtype._subtype_with_str
sp_values = astype_nansafe(self.sp_values, subtype, copy=copy)
# TODO copy=False is broken for astype_nansafe with int -> float
sp_values = astype_nansafe(self.sp_values, subtype, copy=True)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Opened #34456 for this (and added link in the comment)

if sp_values is self.sp_values and copy:
sp_values = sp_values.copy()

Expand Down
8 changes: 7 additions & 1 deletion pandas/core/arrays/sparse/dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,13 @@ def _subtype_with_str(self):
return self.subtype
8000
def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
# TODO for now only handle SparseDtypes and numpy dtypes => extend
# with other compatible extension dtypes
if any(
isinstance(x, ExtensionDtype) and not isinstance(x, SparseDtype)
for x in dtypes
):
return None

fill_values = [x.fill_value for x in dtypes if isinstance(x, SparseDtype)]
fill_value = fill_values[0]
Expand All @@ -371,6 +378,5 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
stacklevel=6,
)

# TODO also handle non-numpy other dtypes
np_dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
return SparseDtype(np.find_common_type(np_dtypes, []), fill_value=fill_value)
6 changes: 6 additions & 0 deletions pandas/core/dtypes/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,12 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
except ValueError:
return arr.astype(object, copy=False)

if is_sparse(arr) and not is_sparse(dtype):
# problem case: SparseArray.astype(dtype) doesn't return the specified
# dtype exactly, but converts it to Sparse[dtype] -> so first manually
# convert to a dense array
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Opened #34457 for this

return arr.to_dense().astype(dtype, copy=False)

if (
isinstance(arr, np.ndarray)
and arr.dtype.kind in ["m", "M"]
Expand Down
4 changes: 4 additions & 0 deletions pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -642,6 +642,8 @@ def _is_boolean(self) -> bool:
return is_bool_dtype(self.categories)

def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
from pandas.core.arrays.sparse import SparseDtype

# check if we have all categorical dtype with identical categories
if all(isinstance(x, CategoricalDtype) for x in dtypes):
first = dtypes[0]
Expand All @@ -658,6 +660,8 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
elif any(non_init_cats):
return None

# categorical is aware of Sparse -> extract sparse subdtypes
dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
# extract the categories' dtype
non_cat_dtypes = [
x.categories.dtype if isinstance(x, CategoricalDtype) else x for x in dtypes
Expand Down
0