8000 PERF: Add cache keyword to to_datetime (#11665) by mroeschke · Pull Request #17077 · pandas-dev/pandas · GitHub
[go: up one dir, main page]

Skip to content

PERF: Add cache keyword to to_datetime (#11665) #17077

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 25 commits into from
Nov 11, 2017
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Move box logic into maybe_convert_cache
  • Loading branch information
mroeschke committed Nov 11, 2017
commit a093b88d98fab606f06d1e9a7d2fcf17d00d1434
62 changes: 37 additions & 25 deletions pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
origin.

.. versionadded: 0.20.0
cache : boolean, default False
cache : boolean, default True
If True, use a cache of unique, converted dates to apply the datetime
conversion. Produces signficant speed-ups when parsing duplicate dates.

Expand Down Expand Up @@ -310,16 +310,32 @@ def _convert_listlike(arg, box, format, name=None, tz=tz):
except (ValueError, TypeError):
raise e

def _maybe_convert_cache(arg, cache, tz):
"""Try to convert the datetimelike arg using
a cache of converted dates.

arg: datetimelike arg from to_datetime
cache: bool whether to convert using a cache

Returns:
Series of converted datetime arg or
None if the conversion failed
def _maybe_convert_cache(arg, cache, box, format, name=None, tz=tz):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would move this outside to_datetime, making this function huge.

"""
Try to convert the datetimelike arg using
a cache of converted dates.

Parameters
----------
arg : integer, float, string, datetime, list, tuple, 1-d array, Series
Datetime argument to convert
cache : boolean
If True, try to convert the dates with a cache
If False, short circuit and return None
Flag whether to cache the converted dates
box : boolean
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

boxing should be done outside

If True, return a DatetimeIndex
if False, return an ndarray of values
tz : String or None
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same with these args. The wrapping should be done outside, _maybe_cache is a raw function that can always just return a ndarray. Then wrap it after its called. otherwise the logic is getting too cumbersome / duplicated.

'utc' if UTC=True was passed else None
name : String, default None
DatetimeIndex name
Returns
-------
Series if original argument was a Series
DatetimeIndex if box=True and original argument was not a Series
ndarray if box=False and original argument was not a Series
None if the conversion failed
"""
if cache and is_list_like(arg) and len(arg) >= 1000:
unique_dates = algorithms.unique(arg)
Expand All @@ -328,9 +344,13 @@ def _maybe_convert_cache(arg, cache, tz):
cache_dates = _convert_listlike(unique_dates, True, format,
tz=tz)
convert_cache = Series(cache_dates, index=unique_dates)
if not isinstance(arg, Series):
arg = Series(arg)
return arg.map(convert_cache)
result = Series(arg, name=name).map(convert_cache)
if isinstance(arg, Series):
return result
elif box:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if u use _convert_list_like where u return None and do the boxing outside (iow where this is called) then you can make this simpler and avoid splitting logic

now it’s ok 2 places

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good call on return _convert_list_like instead of None.

However if I move the boxing outside this function, I think the code will be less DRY since I'll have to repeat the boxing logic twice for Indexes and list-like arguments. So essentially if:

def _maybe_convert_cache(...):
    if cache:
        ...
        return Series(arg, name=name).map(convert_cache)
    else: 
        return _convert_cache(...)

then the logic further down will be:

elif isinstance(arg, ABCIndexClass):
    result = _maybe_convert_cache(...)
    if box:
        return DatetimeIndex(...)
    else:
        return result.values
elif is_list_like(arg):
    result = _maybe_convert_cache(...)
    if box:
        return DatetimeIndex(...)
    else:
        return result.values

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see my comments above, this needs re-factoring.

return DatetimeIndex(result, name=name)
else:
return result.values
return None

if arg is None:
Expand Down Expand Up @@ -397,29 +417,21 @@ def _maybe_convert_cache(arg, cache, tz):
if isinstance(arg, tslib.Timestamp):
result = arg
elif isinstance(arg, ABCSeries):
result = _maybe_convert_cache(arg, cache, tz)
result = _maybe_convert_cache(arg, cache, box, format, name=arg.name)
if result is None:
from pandas import Series
values = _convert_listlike(arg._values, True, format)
result = Series(values, index=arg.index, name=arg.name)
elif isinstance(arg, (ABCDataFrame, MutableMapping)):
result = _assemble_from_unit_mappings(arg, errors=errors)
elif isinstance(arg, ABCIndexClass):
result = _maybe_convert_cache(arg, cache, tz)
result = _maybe_convert_cache(arg, cache, box, format, name=arg.name)
if result is None:
result = _convert_listlike(arg, box, format, name=arg.name)
else:
result = result.values
if box:
result = DatetimeIndex(result, tz=tz, name=arg.name)
elif is_list_like(arg):
result = _maybe_convert_cache(arg, cache, tz)
result = _maybe_convert_cache(arg, cache, box, format)
if result is None:
result = _convert_listlike(arg, box, format)
else:
result = result.values
if box:
result = DatetimeIndex(result, tz=tz)
else:
result = _convert_listlike(np.array([arg]), box, format)[0]

Expand Down
0