[BUG] RandomOverSampler crashes on a timedelta64 column that contains only NaTs

Hi!

Describe the bug

The RandomOverSampler crashes with a `TypeError: Cannot cast DatetimeArray to dtype timedelta64[ns]` error when the dataframe contains a timedelta64[ns] column in which every value is NaT.

If at least one of the elements in the column is an actual timedelta64 value, no error occurs.

Steps/Code to Reproduce

from datetime import timedelta
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler

X = pd.DataFrame(
    {
        'col_str': ["abc", "def", "xyz"],
        'col_timedelta': pd.to_timedelta([np.nan, np.nan, np.nan])
    }
)

display(X.info())

# Data columns (total 2 columns):
#  #   Column         Non-Null Count  Dtype          
# ---  ------         --------------  -----          
#  0   col_str        3 non-null      object         
#  1   col_timedelta  0 non-null      timedelta64[ns]
# dtypes: object(1), timedelta64[ns](1)
# memory usage: 176.0+ bytes

y = [0, 0, 1]

RandomOverSampler().fit_resample(X, y)

Expected Results

No error should occur, and the column in the returned dataframe should keep the dtype timedelta64[ns].

Actual Results

Here is the traceback:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)

...

File /code_location/venv/lib/python3.10/site-packages/imblearn/base.py:208, in BaseSampler.fit_resample(self, X, y)
    187 """Resample the dataset.
    188 
    189 Parameters
   (...)
    205     The corresponding label of `X_resampled`.
    206 """
    207 self._validate_params()
--> 208 return super().fit_resample(X, y)

File /code_location/venv/lib/python3.10/site-packages/imblearn/base.py:118, in SamplerMixin.fit_resample(self, X, y)
    112 output = self._fit_resample(X, y)
    114 y_ = (
    115     label_binarize(output[1], classes=np.unique(y)) if binarize_y else output[1]
    116 )
--> 118 X_, y_ = arrays_transformer.transform(output[0], y_)
    119 return (X_, y_) if len(output) == 2 else (X_, y_, output[2])

File /code_location/venv/lib/python3.10/site-packages/imblearn/utils/_validation.py:39, in ArraysTransformer.transform(self, X, y)
     38 def transform(self, X, y):
---> 39     X = self._transfrom_one(X, self.x_props)
     40     y = self._transfrom_one(y, self.y_props)
     41     if self.x_props["type"].lower() == "dataframe" and self.y_props[
     42         "type"
     43     ].lower() in {"series", "dataframe"}:
     44         # We lost the y.index during resampling. We can safely use X.index to align
     45         # them.

File /code_location/venv/lib/python3.10/site-packages/imblearn/utils/_validation.py:65, in ArraysTransformer._transfrom_one(self, array, props)
     62     import pandas as pd
     64     ret = pd.DataFrame(array, columns=props["columns"])
---> 65     ret = ret.astype(props["dtypes"])
     66 elif type_ == "series":
     67     import pandas as pd

File /code_location/venv/lib/python3.10/site-packages/pandas/core/generic.py:6305, in NDFrame.astype(self, dtype, copy, errors)
   6303 else:
   6304     try:
-> 6305         res_col = col.astype(dtype=cdt, copy=copy, errors=errors)
   6306     except ValueError as ex:
   6307         ex.args = (
   6308             f"{ex}: Error while type casting for column '{col_name}'",
   6309         )

File /code_location/venv/lib/python3.10/site-packages/pandas/core/generic.py:6324, in NDFrame.astype(self, dtype, copy, errors)
   6317     results = [
   6318         self.iloc[:, i].astype(dtype, copy=copy)
   6319         for i in range(len(self.columns))
   6320     ]
   6322 else:
   6323     # else, only a single dtype is given
-> 6324     new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
   6325     return self._constructor(new_data).__finalize__(self, method="astype")
   6327 # GH 33113: handle empty frame or series

File /code_location/venv/lib/python3.10/site-packages/pandas/core/internals/managers.py:451, in BaseBlockManager.astype(self, dtype, copy, errors)
    448 elif using_copy_on_write():
    449     copy = False
--> 451 return self.apply(
    452     "astype",
    453     dtype=dtype,
    454     copy=copy,
    455     errors=errors,
    456     using_cow=using_copy_on_write(),
    457 )

File /code_location/venv/lib/python3.10/site-packages/pandas/core/internals/managers.py:352, in BaseBlockManager.apply(self, f, align_keys, **kwargs)
    350         applied = b.apply(f, **kwargs)
    351     else:
--> 352         applied = getattr(b, f)(**kwargs)
    353     result_blocks = extend_blocks(applied, result_blocks)
    355 out = type(self).from_blocks(result_blocks, self.axes)

File /code_location/venv/lib/python3.10/site-packages/pandas/core/internals/blocks.py:511, in Block.astype(self, dtype, copy, errors, using_cow)
    491 """
    492 Coerce to the new dtype.
    493 
   (...)
    507 Block
    508 """
    509 values = self.values
--> 511 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
    513 new_values = maybe_coerce_values(new_values)
    515 refs = None

File /code_location/venv/lib/python3.10/site-packages/pandas/core/dtypes/astype.py:242, in astype_array_safe(values, dtype, copy, errors)
    239     dtype = dtype.numpy_dtype
    241 try:
--> 242     new_values = astype_array(values, dtype, copy=copy)
    243 except (ValueError, TypeError):
    244     # e.g. _astype_nansafe can fail on object-dtype of strings
    245     #  trying to convert to float
    246     if errors == "ignore":

File /code_location/venv/lib/python3.10/site-packages/pandas/core/dtypes/astype.py:184, in astype_array(values, dtype, copy)
    180     return values
    182 if not isinstance(values, np.ndarray):
    183     # i.e. ExtensionArray
--> 184     values = values.astype(dtype, copy=copy)
    186 else:
    187     values = _astype_nansafe(values, dtype, copy=copy)

File /code_location/venv/lib/python3.10/site-packages/pandas/core/arrays/datetimes.py:701, in DatetimeArray.astype(self, dtype, copy)
    699 elif is_period_dtype(dtype):
    700     return self.to_period(freq=dtype.freq)
--> 701 return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy)

File /code_location/venv/lib/python3.10/site-packages/pandas/core/arrays/datetimelike.py:487, in DatetimeLikeArrayMixin.astype(self, dtype, copy)
    480 elif (
    481     is_datetime_or_timedelta_dtype(dtype)
    482     and not is_dtype_equal(self.dtype, dtype)
    483 ) or is_float_dtype(dtype):
    484     # disallow conversion between datetime/timedelta,
    485     # and conversions for any datetimelike to float
    486     msg = f"Cannot cast {type(self).__name__} to dtype {dtype}"
--> 487     raise TypeError(msg)
    488 else:
    489     return np.asarray(self, dtype=dtype)

TypeError: Cannot cast DatetimeArray to dtype timedelta64[ns]

Thanks for your support!

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Describe the bug

Steps/Code to Reproduce

Expected Results

Actual Results

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Description

Describe the bug

Steps/Code to Reproduce

Expected Results

Actual Results

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions