BUG: DataFrame.append with timedelta64 (#39574) · pandas-dev/pandas@bcf2406 · GitHub
[go: up one dir, main page]

Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit bcf2406

Browse files
authored
BUG: DataFrame.append with timedelta64 (#39574)
1 parent 934fc81 commit bcf2406

File tree

5 files changed

+103
-132
lines changed

5 files changed

+103
-132
lines changed

doc/source/whatsnew/v1.3.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,7 @@ Reshaping
431431
- Bug in :meth:`DataFrame.apply` would give incorrect results when used with a string argument and ``axis=1`` when the axis argument was not supported and now raises a ``ValueError`` instead (:issue:`39211`)
432432
- Bug in :meth:`DataFrame.sort_values` not reshaping index correctly after sorting on columns, when ``ignore_index=True`` (:issue:`39464`)
433433
- Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`39454`)
434+
- Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``datetime64`` and ``timedelta64`` dtypes (:issue:`39574`)
434435

435436
Sparse
436437
^^^^^^

pandas/core/dtypes/concat.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
6161
return arr.astype(dtype, copy=False)
6262

6363

64-
def concat_compat(to_concat, axis: int = 0):
64+
def concat_compat(to_concat, axis: int = 0, ea_compat_axis: bool = False):
6565
"""
6666
provide concatenation of an array of arrays each of which is a single
6767
'normalized' dtypes (in that for example, if it's object, then it is a
@@ -72,6 +72,9 @@ def concat_compat(to_concat, axis: int = 0):
7272
----------
7373
to_concat : array of arrays
7474
axis : axis to provide concatenation
75+
ea_compat_axis : bool, default False
76+
For ExtensionArray compat, behave as if axis == 1 when determining
77+
whether to drop empty arrays.
7578
7679
Returns
7780
-------
@@ -91,7 +94,8 @@ def is_nonempty(x) -> bool:
9194
# marginal given that it would still require shape & dtype calculation and
9295
# np.concatenate which has them both implemented is compiled.
9396
non_empties = [x for x in to_concat if is_nonempty(x)]
94-
if non_empties and axis == 0:
97+
if non_empties and axis == 0 and not ea_compat_axis:
98+
# ea_compat_axis see GH#39574
9599
to_concat = non_empties
96100

97101
kinds = {obj.dtype.kind for obj in to_concat}

pandas/core/internals/concat.py

Lines changed: 44 additions & 118 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
from __future__ import annotations
22

3-
from collections import defaultdict
43
import copy
54
import itertools
6-
from typing import TYPE_CHECKING, Dict, List, Sequence, cast
5+
from typing import TYPE_CHECKING, Dict, List, Sequence
76

87
import numpy as np
98

@@ -14,16 +13,13 @@
1413
from pandas.core.dtypes.cast import ensure_dtype_can_hold_na, find_common_type
1514
from pandas.core.dtypes.common import (
1615
is_categorical_dtype,
17-
is_datetime64_dtype,
1816
is_datetime64tz_dtype,
17+
is_dtype_equal,
1918
is_extension_array_dtype,
20-
is_float_dtype,
21-
is_numeric_dtype,
2219
is_sparse,
23-
is_timedelta64_dtype,
2420
)
2521
from pandas.core.dtypes.concat import concat_compat
26-
from pandas.core.dtypes.missing import isna_all
22+
from pandas.core.dtypes.missing import is_valid_na_for_dtype, isna_all
2723

2824
import pandas.core.algorithms as algos
2925
from pandas.core.arrays import DatetimeArray, ExtensionArray
@@ -33,7 +29,6 @@
3329

3430
if TYPE_CHECKING:
3531
from pandas import Index
36-
from pandas.core.arrays.sparse.dtype import SparseDtype
3732

3833

3934
def concatenate_block_managers(
@@ -232,6 +227,29 @@ def dtype(self):
232227
return blk.dtype
233228
return ensure_dtype_can_hold_na(blk.dtype)
234229

230+
def is_valid_na_for(self, dtype: DtypeObj) -> bool:
231+
"""
232+
Check that we are all-NA of a type/dtype that is compatible with this dtype.
233+
Augments `self.is_na` with an additional check of the type of NA values.
234+
"""
235+
if not self.is_na:
236+
return False
237+
if self.block is None:
238+
return True
239+
240+
if self.dtype == object:
241+
values = self.block.values
242+
return all(is_valid_na_for_dtype(x, dtype) for x in values.ravel(order="K"))
243+
244+
if self.dtype.kind == dtype.kind == "M" and not is_dtype_equal(
245+
self.dtype, dtype
246+
):
247+
# fill_values match but we should not cast self.block.values to dtype
248+
return False
249+
250+
na_value = self.block.fill_value
251+
return is_valid_na_for_dtype(na_value, dtype)
252+
235253
@cache_readonly
236254
def is_na(self) -> bool:
237255
if self.block is None:
@@ -262,7 +280,7 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
262280
else:
263281
fill_value = upcasted_na
264282

265-
if self.is_na:
283+
if self.is_valid_na_for(empty_dtype):
266284
blk_dtype = getattr(self.block, "dtype", None)
267285

268286
if blk_dtype == np.dtype(object):
@@ -276,10 +294,9 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
276294
if is_datetime64tz_dtype(blk_dtype) or is_datetime64tz_dtype(
277295
empty_dtype
278296
):
279-
if self.block is None:
280-
# TODO(EA2D): special case unneeded with 2D EAs
281-
i8values = np.full(self.shape[1], fill_value.value)
282-
return DatetimeArray(i8values, dtype=empty_dtype)
297+
# TODO(EA2D): special case unneeded with 2D EAs
298+
i8values = np.full(self.shape[1], fill_value.value)
299+
return DatetimeArray(i8values, dtype=empty_dtype)
283300
elif is_categorical_dtype(blk_dtype):
284301
pass
285302
elif is_extension_array_dtype(blk_dtype):
@@ -295,6 +312,8 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
295312
empty_arr, allow_fill=True, fill_value=fill_value
296313
)
297314
else:
315+
# NB: we should never get here with empty_dtype integer or bool;
316+
# if we did, the missing_arr.fill would cast to gibberish
298317
missing_arr = np.empty(self.shape, dtype=empty_dtype)
299318
missing_arr.fill(fill_value)
300319
return missing_arr
@@ -362,14 +381,12 @@ def _concatenate_join_units(
362381
# concatting with at least one EA means we are concatting a single column
363382
# the non-EA values are 2D arrays with shape (1, n)
364383
to_concat = [t if isinstance(t, ExtensionArray) else t[0, :] for t in to_concat]
365-
concat_values = concat_compat(to_concat, axis=0)
366-
if not isinstance(concat_values, ExtensionArray) or (
367-
isinstance(concat_values, DatetimeArray) and concat_values.tz is None
368-
):
384+
concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True)
385+
if not is_extension_array_dtype(concat_values.dtype):
369386
# if the result of concat is not an EA but an ndarray, reshape to
370387
# 2D to put it a non-EA Block
371-
# special case DatetimeArray, which *is* an EA, but is put in a
372-
# consolidated 2D block
388+
# special case DatetimeArray/TimedeltaArray, which *is* an EA, but
389+
# is put in a consolidated 2D block
373390
concat_values = np.atleast_2d(concat_values)
374391
else:
375392
concat_values = concat_compat(to_concat, axis=concat_axis)
@@ -419,108 +436,17 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj:
419436
return empty_dtype
420437

421438
has_none_blocks = any(unit.block is None for unit in join_units)
422-
dtypes = [None if unit.block is None else unit.dtype for unit in join_units]
423439

424-
filtered_dtypes = [
440+
dtypes = [
425441
unit.dtype for unit in join_units if unit.block is not None and not unit.is_na
426442
]
427-
if not len(filtered_dtypes):
428-
filtered_dtypes = [unit.dtype for unit in join_units if unit.block is not None]
429-
dtype_alt = find_common_type(filtered_dtypes)
430-
431-
upcast_classes = _get_upcast_classes(join_units, dtypes)
432-
433-
if is_extension_array_dtype(dtype_alt):
434-
return dtype_alt
435-
elif dtype_alt == object:
436-
return dtype_alt
437-
438-
# TODO: de-duplicate with maybe_promote?
439-
# create the result
440-
if "extension" in upcast_classes:
441-
return np.dtype("object")
442-
elif "bool" in upcast_classes:
443-
if has_none_blocks:
444-
return np.dtype(np.object_)
445-
else:
446-
return np.dtype(np.bool_)
447-
elif "datetimetz" in upcast_classes:
448-
# GH-25014. We use NaT instead of iNaT, since this eventually
449-
# ends up in DatetimeArray.take, which does not allow iNaT.
450-
dtype = upcast_classes["datetimetz"]
451-
return dtype[0]
452-
elif "datetime" in upcast_classes:
453-
return np.dtype("M8[ns]")
454-
elif "timedelta" in upcast_classes:
455-
return np.dtype("m8[ns]")
456-
else:
457-
try:
458-
common_dtype = np.find_common_type(upcast_classes, [])
459-
except TypeError:
460-
# At least one is an ExtensionArray
461-
return np.dtype(np.object_)
462-
else:
463-
if is_float_dtype(common_dtype):
464-
return common_dtype
465-
elif is_numeric_dtype(common_dtype):
466-
if has_none_blocks:
467-
return np.dtype(np.float64)
468-
else:
469-
return common_dtype
470-
471-
msg = "invalid dtype determination in get_concat_dtype"
472-
raise AssertionError(msg)
473-
474-
475-
def _get_upcast_classes(
476-
join_units: Sequence[JoinUnit],
477-
dtypes: Sequence[DtypeObj],
478-
) -> Dict[str, List[DtypeObj]]:
479-
"""Create mapping between upcast class names and lists of dtypes."""
480-
upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list)
481-
null_upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list)
482-
for dtype, unit in zip(dtypes, join_units):
483-
if dtype is None:
484-
continue
485-
486-
upcast_cls = _select_upcast_cls_from_dtype(dtype)
487-
# Null blocks should not influence upcast class selection, unless there
488-
# are only null blocks, when same upcasting rules must be applied to
489-
# null upcast classes.
490-
if unit.is_na:
491-
null_upcast_classes[upcast_cls].append(dtype)
492-
else:
493-
upcast_classes[upcast_cls].append(dtype)
494-
495-
if not upcast_classes:
496-
upcast_classes = null_upcast_classes
497-
498-
return upcast_classes
499-
500-
501-
def _select_upcast_cls_from_dtype(dtype: DtypeObj) -> str:
502-
"""Select upcast class name based on dtype."""
503-
if is_categorical_dtype(dtype):
504-
return "extension"
505-
elif is_datetime64tz_dtype(dtype):
506-
return "datetimetz"
507-
elif is_extension_array_dtype(dtype):
508-
return "extension"
509-
elif issubclass(dtype.type, np.bool_):
510-
return "bool"
511-
elif issubclass(dtype.type, np.object_):
512-
return "object"
513-
elif is_datetime64_dtype(dtype):
514-
return "datetime"
515-
elif is_timedelta64_dtype(dtype):
516-
return "timedelta"
517-
elif is_sparse(dtype):
518-
dtype = cast("SparseDtype", dtype)
519-
return dtype.subtype.name
520-
elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
521-
return dtype.name
522-
else:
523-
return "float"
443+
if not len(dtypes):
444+
dtypes = [unit.dtype for unit in join_units if unit.block is not None]
445+
446+
dtype = find_common_type(dtypes)
447+
if has_none_blocks:
448+
dtype = ensure_dtype_can_hold_na(dtype)
449+
return dtype
524450

525451

526452
def _is_uniform_join_units(join_units: List[JoinUnit]) -> bool:

pandas/tests/reshape/concat/test_append.py

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -334,9 +334,9 @@ def test_append_missing_column_proper_upcast(self, sort):
334334
def test_append_empty_frame_to_series_with_dateutil_tz(self):
335335
# GH 23682
336336
date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc())
337-
s = Series({"date": date, "a": 1.0, "b": 2.0})
337+
ser = Series({"date": date, "a": 1.0, "b": 2.0})
338338
df = DataFrame(columns=["c", "d"])
339-
result_a = df.append(s, ignore_index=True)
339+
result_a = df.append(ser, ignore_index=True)
340340
expected = DataFrame(
341341
[[np.nan, np.nan, 1.0, 2.0, date]], columns=["c", "d", "a", "b", "date"]
342342
)
@@ -350,13 +350,12 @@ def test_append_empty_frame_to_series_with_dateutil_tz(self):
350350
)
351351
expected["c"] = expected["c"].astype(object)
352352
expected["d"] = expected["d"].astype(object)
353-
354-
result_b = result_a.append(s, ignore_index=True)
353+
result_b = result_a.append(ser, ignore_index=True)
355354
tm.assert_frame_equal(result_b, expected)
356355

357356
# column order is different
358357
expected = expected[["c", "d", "date", "a", "b"]]
359-
result = df.append([s, s], ignore_index=True)
358+
result = df.append([ser, ser], ignore_index=True)
360359
tm.assert_frame_equal(result, expected)
361360

362361
def test_append_empty_tz_frame_with_datetime64ns(self):
@@ -378,12 +377,27 @@ def test_append_empty_tz_frame_with_datetime64ns(self):
378377
@pytest.mark.parametrize(
379378
"dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
380379
)
381-
def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str):
380+
@pytest.mark.parametrize("val", [1, "NaT"])
381+
def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str, val):
382382
# https://github.com/pandas-dev/pandas/issues/35460
383383
df = DataFrame(columns=["a"]).astype(dtype_str)
384384

385-
other = DataFrame({"a": [np.timedelta64("NaT", "ns")]})
385+
other = DataFrame({"a": [np.timedelta64(val, "ns")]})
386386
result = df.append(other, ignore_index=True)
387387

388388
expected = other.astype(object)
389389
tm.assert_frame_equal(result, expected)
390+
391+
@pytest.mark.parametrize(
392+
"dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
393+
)
394+
@pytest.mark.parametrize("val", [1, "NaT"])
395+
def test_append_frame_with_timedelta64ns_nat(self, dtype_str, val):
396+
# https://github.com/pandas-dev/pandas/issues/35460
397+
df = DataFrame({"a": pd.array([1], dtype=dtype_str)})
398+
399+
other = DataFrame({"a": [np.timedelta64(val, "ns")]})
400+
result = df.append(other, ignore_index=True)
401+
402+
expected = DataFrame({"a": [df.iloc[0, 0], other.iloc[0, 0]]}, dtype=object)
403+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments (0)