PERF: Add cache keyword to to_datetime (#11665) by mroeschke · Pull Request #17077 · pandas-dev/pandas · GitHub
[go: up one dir, main page]

Skip to content

PERF: Add cache keyword to to_datetime (#11665) #17077

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 25 commits into from
Nov 11, 2017
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
< 8000 /div>
Prev Previous commit
Next Next commit
Move caching function outside to_datetime
  • Loading branch information
mroeschke committed Nov 11, 2017
commit 9486df3ef84cf87cad98c18181e9853def28c649
2 changes: 1 addition & 1 deletion pandas/core/indexes/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,7 +339,7 @@ def __new__(cls, data=None,
if not (is_datetime64_dtype(data) or is_datetimetz(data) or
is_integer_dtype(data)):
data = tools.to_datetime(data, dayfirst=dayfirst,
yearfirst=yearfirst, cache=False)
yearfirst=yearfirst)

if issubclass(data.dtype.type, np.datetime64) or is_datetimetz(data):

Expand Down
90 changes: 38 additions & 52 deletions pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,35 @@ def _guess_datetime_format_for_array(arr, **kwargs):
if len(non_nan_elements):
return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)

def _maybe_cache(arg, format, cache, tz, _convert_listlike):
"""Create a cache of unique dates from an array of dates"""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a Parameters section that lists what the args are?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And a Returns section as well.

from pandas import Series
cache_array = Series()
if cache:
# Perform a quicker unique check
from pandas import Index
if not Index(arg).is_unique:
unique_dates = algorithms.unique(arg)
cache_dates = _convert_listlike(unique_dates, True, format,
tz=tz)
cache_array = Series(cache_dates, index=unique_dates)
return cache_array

def _convert_and_box_cache(arg, cache_array, box, name=None):
"""Convert array of dates with a cache and box the result"""
from pandas import Series
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can import from pandas here directly

from pandas.core.indexes.datetimes import DatetimeIndex
result = Series(arg).map(cache_array)
if box:
result = DatetimeIndex(result, name=name)
else:
result = result.values
return result

def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
utc=None, box=True, format=None, exact=True,
unit=None, infer_datetime_format=False, origin='unix',
cache=True):
cache=False):
"""
Convert argument to datetime.

Expand Down Expand Up @@ -310,51 +334,6 @@ def _convert_listlike(arg, box, format, name=None, tz=tz):
except (ValueError, TypeError):
raise e

def _maybe_convert_cache(arg, cache, box, format, name=None, tz=tz):
"""
Try to convert the datetimelike arg using
a cache of converted dates.

Parameters
----------
arg : integer, float, string, datetime, list, tuple, 1-d array, Series
Datetime argument to convert
cache : boolean
If True, try to convert the dates with a cache
If False, short circuit and return None
Flag whether to cache the converted dates
box : boolean
If True, return a DatetimeIndex
if False, return an ndarray of values
tz : String or None
'utc' if UTC=True was passed else None
name : String, default None
DatetimeIndex name
Returns
-------
Series if original argument was a Series
DatetimeIndex if box=True and original argument was not a Series
ndarray if box=False and original argument was not a Series
None if the conversion failed
"""
if cache and is_list_like(arg) and len(arg) >= 1000:
# Perform a quicker unique check
from pandas import Index
if not Index(arg).is_unique:
unique_dates = algorithms.unique(arg)
from pandas import Series
cache_dates = _convert_listlike(unique_dates, True, format,
tz=tz)
convert_cache = Series(cache_dates, index=unique_dates)
result = Series(arg, name=name).map(convert_cache)
if isinstance(arg, Series):
return result
elif box:
return DatetimeIndex(result, name=name)
else:
return result.values
return None

if arg is None:
return None

Expand Down Expand Up @@ -419,20 +398,27 @@ def _maybe_convert_cache(arg, cache, box, format, name=None, tz=tz):
if isinstance(arg, tslib.Timestamp):
result = arg
elif isinstance(arg, ABCSeries):
result = _maybe_convert_cache(arg, cache, box, format, name=arg.name)
if result is None:
cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike)
if not cache_array.empty:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this still looks pretty duplicative, but I guess ok for now.

result = arg.map(cache_array)
else:
from pandas import Series
values = _convert_listlike(arg._values, True, format)
result = Series(values, index=arg.index, name=arg.name)
elif isinstance(arg, (ABCDataFrame, MutableMapping)):
result = _assemble_from_unit_mappings(arg, errors=errors)
elif isinstance(arg, ABCIndexClass):
result = _maybe_convert_cache(arg, cache, box, format, name=arg.name)
if result is None:
cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike)
if not cache_array.empty:
result = _convert_and_box_cache(arg, cache_array, box,
name=arg.name)
else:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why can't you handle these cases (list-like/index) inside _maybe_convert_cache? (I am talking about the else/box part.)

result = _convert_listlike(arg, box, format, name=arg.name)
elif is_list_like(arg):
result = _maybe_convert_cache(arg, cache, box, format)
if result is None:
cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike)
if not cache_array.empty:
result = _convert_and_box_cache(arg, cache_array, box)
else:
result = _convert_listlike(arg, box, format)
else:
result = _convert_listlike(np.array([arg]), box, format)[0]
Expand Down
6 changes: 4 additions & 2 deletions pandas/tests/indexes/datetimes/test_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -1224,7 +1224,8 @@ def test_parsers(self, cache):
assert result3 is tslib.NaT
assert result4 is tslib.NaT

def test_parsers_dayfirst_yearfirst(self):
@pytest.mark.parametrize('cache', [True, False])
def test_parsers_dayfirst_yearfirst(self, cache):
# OK
# 2.5.1 10-11-12 [dayfirst=0, yearfirst=0] -> 2012-10-11 00:00:00
# 2.5.2 10-11-12 [dayfirst=0, yearfirst=1] -> 2012-10-11 00:00:00
Expand Down Expand Up @@ -1373,7 +1374,8 @@ def test_parsers_time(self):
assert isinstance(res, list)
assert res == expected_arr

def test_parsers_timezone_minute_offsets_roundtrip(self):
@pytest.mark.parametrize('cache', [True, False])
def test_parsers_timezone_minute_offsets_roundtrip(self, cache):
# GH11708
base = to_datetime("2013-01-01 00:00:00", cache=cache)
dt_strings = [
Expand Down
0