-
-
Notifications
You must be signed in to change notification settings - Fork 18.7k
PERF: Add cache keyword to to_datetime (#11665) #17077
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
7f67ac9
a7c65f7
243349a
d154a6d
b5e71d2
fb2e831
33c79d3
dcaafb6
04df9d9
34b468f
d287cc6
1bf4c9d
3ffdd46
a093b88
d1fc211
9486df3
d059d44
82f36d3
76547e1
590c9cc
9a985ac
85a1f2d
49f5850
07fa22d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -112,7 +112,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, | |
origin. | ||
|
||
.. versionadded: 0.20.0 | ||
cache : boolean, default False | ||
cache : boolean, default True | ||
If True, use a cache of unique, converted dates to apply the datetime | ||
conversion. Produces signficant speed-ups when parsing duplicate dates. | ||
|
||
|
@@ -310,16 +310,32 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): | |
except (ValueError, TypeError): | ||
raise e | ||
|
||
def _maybe_convert_cache(arg, cache, tz): | ||
"""Try to convert the datetimelike arg using | ||
a cache of converted dates. | ||
|
||
arg: datetimelike arg from to_datetime | ||
cache: bool whether to convert using a cache | ||
|
||
Returns: | ||
Series of converted datetime arg or | ||
None if the conversion failed | ||
def _maybe_convert_cache(arg, cache, box, format, name=None, tz=tz): | ||
""" | ||
Try to convert the datetimelike arg using | ||
a cache of converted dates. | ||
|
||
Parameters | ||
---------- | ||
arg : integer, float, string, datetime, list, tuple, 1-d array, Series | ||
Datetime argument to convert | ||
cache : boolean | ||
If True, try to convert the dates with a cache | ||
If False, short circuit and return None | ||
Flag whether to cache the converted dates | ||
box : boolean | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. boxing should be done outside |
||
If True, return a DatetimeIndex | ||
if False, return an ndarray of values | ||
tz : String or None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same with these args. The wrapping should be done outside, |
||
'utc' if UTC=True was passed else None | ||
name : String, default None | ||
DatetimeIndex name | ||
Returns | ||
------- | ||
Series if original argument was a Series | ||
DatetimeIndex if box=True and original argument was not a Series | ||
ndarray if box=False and original argument was not a Series | ||
None if the conversion failed | ||
""" | ||
if cache and is_list_like(arg) and len(arg) >= 1000: | ||
unique_dates = algorithms.unique(arg) | ||
|
@@ -328,9 +344,13 @@ def _maybe_convert_cache(arg, cache, tz): | |
cache_dates = _convert_listlike(unique_dates, True, format, | ||
tz=tz) | ||
convert_cache = Series(cache_dates, index=unique_dates) | ||
if not isinstance(arg, Series): | ||
arg = Series(arg) | ||
return arg.map(convert_cache) | ||
result = Series(arg, name=name).map(convert_cache) | ||
if isinstance(arg, Series): | ||
return result | ||
elif box: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if u use _convert_list_like where u return None and do the boxing outside (iow where this is called) then you can make this simpler and avoid splitting logic now it’s ok 2 places There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good call on return _convert_list_like instead of None. However if I move the boxing outside this function, I think the code will be less DRY since I'll have to repeat the boxing logic twice for Indexes and list-like arguments. So essentially if:
then the logic further down will be:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. see my comments above, this needs re-factoring. |
||
return DatetimeIndex(result, name=name) | ||
else: | ||
return result.values | ||
return None | ||
|
||
if arg is None: | ||
|
@@ -397,29 +417,21 @@ def _maybe_convert_cache(arg, cache, tz): | |
if isinstance(arg, tslib.Timestamp): | ||
result = arg | ||
elif isinstance(arg, ABCSeries): | ||
result = _maybe_convert_cache(arg, cache, tz) | ||
result = _maybe_convert_cache(arg, cache, box, format, name=arg.name) | ||
if result is None: | ||
from pandas import Series | ||
values = _convert_listlike(arg._values, True, format) | ||
result = Series(values, index=arg.index, name=arg.name) | ||
elif isinstance(arg, (ABCDataFrame, MutableMapping)): | ||
result = _assemble_from_unit_mappings(arg, errors=errors) | ||
elif isinstance(arg, ABCIndexClass): | ||
result = _maybe_convert_cache(arg, cache, tz) | ||
result = _maybe_convert_cache(arg, cache, box, format, name=arg.name) | ||
if result is None: | ||
result = _convert_listlike(arg, box, format, name=arg.name) | ||
else: | ||
result = result.values | ||
if box: | ||
result = DatetimeIndex(result, tz=tz, name=arg.name) | ||
elif is_list_like(arg): | ||
result = _maybe_convert_cache(arg, cache, tz) | ||
result = _maybe_convert_cache(arg, cache, box, format) | ||
if result is None: | ||
result = _convert_listlike(arg, box, format) | ||
else: | ||
result = result.values | ||
if box: | ||
result = DatetimeIndex(result, tz=tz) | ||
else: | ||
result = _convert_listlike(np.array([arg]), box, format)[0] | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would move this outside
to_datetime
, making this function huge.