PERF: Add cache keyword to to_datetime (#11665) by mroeschke · Pull Request #17077 · pandas-dev/pandas · GitHub
[go: up one dir, main page]

Skip to content

PERF: Add cache keyword to to_datetime (#11665) #17077

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 25 commits into from
Nov 11, 2017
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Move docs and adjust test
  • Loading branch information
mroeschke committed Nov 11, 2017
commit 49f5850148ea5f0904bb2817e2dbc0eca99d3516
55 changes: 33 additions & 22 deletions pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,21 +36,26 @@ def _guess_datetime_format_for_array(arr, **kwargs):
return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)


def _maybe_cache(arg, format, cache, tz, _convert_listlike):
def _maybe_cache(arg, format, cache, tz, convert_listlike):
"""
Create a cache of unique dates from an array of dates

Parameters
----------
arg : integer, float, string, datetime, list, tuple, 1-d array of dates
format : string, strftime to parse time
cache: boolean, whether to convert with cache
tz: string, timezone of the dates
_convert_listlike: function, conversion function to apply on dates
arg : integer, float, string, datetime, list, tuple, 1-d array, Series
format : string
Strftime format to parse time
cache : boolean
True attempts to create a cache of converted values
tz : string
Timezone of the dates
convert_listlike : function
Conversion function to apply on dates

Returns
-------
cache_array: Series, cache of converted, unique dates, can be empty
cache_array : Series
Cache of converted, unique dates. Can be empty
"""
from pandas import Series
cache_array = Series()
Expand All @@ -59,37 +64,43 @@ def _maybe_cache(arg, format, cache, tz, _convert_listlike):
from pandas import Index
if not Index(arg).is_unique:
unique_dates = algorithms.unique(arg)
cache_dates = _convert_listlike(unique_dates, True, format,
tz=tz)
cache_dates = convert_listlike(unique_dates, True, format, tz=tz)
cache_array = Series(cache_dates, index=unique_dates)
return cache_array


def _convert_and_box_cache(arg, cache_array, box, errors, tz, name=None):
def _convert_and_box_cache(arg, cache_array, box, errors, name=None):
"""
Convert array of dates with a cache and box the result

Parameters
----------
arg : integer, float, string, datetime, list, tuple, 1-d array of dates
cache_array: Series, cache of converted, unique dates
box: boolean, True boxes result as an Index-like
errors: string, 'ignore' plus box=True will convert result to Index
tz: string, timezone of the dates
name: string, default None. name for a DatetimeIndex
arg : integer, float, string, datetime, list, tuple, 1-d array, Series
cache_array : Series
Cache of converted, unique dates
box : boolean
True boxes result as an Index-like, False returns an ndarray
errors : string
'ignore' plus box=True will convert result to Index
name : string, default None
Name for a DatetimeIndex

Returns
-------
result: Index-like if box=True else array-like of converted dates
result : datetime of converted dates
Returns:

- Index-like if box=True
- ndarray if box=False
"""
from pandas import Series, DatetimeIndex, Index
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same

result = Series(arg).map(cache_array)
if box:
if errors == 'ignore':
result = Index(result)
return Index(result)
else:
result = DatetimeIndex(result, tz=tz, name=name)
return result
return DatetimeIndex(result, name=name)
return result.values


def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
Expand Down Expand Up @@ -443,14 +454,14 @@ def _convert_listlike(arg, box, format, name=None, tz=tz):
elif isinstance(arg, ABCIndexClass):
cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike)
if not cache_array.empty:
result = _convert_and_box_cache(arg, cache_array, box, errors, tz,
result = _convert_and_box_cache(arg, cache_array, box, errors,
name=arg.name)
else:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why can't you handle these cases (list-like/index) inside _maybe_convert_cache? (I am talking about the else/box part.)

result = _convert_listlike(arg, box, format, name=arg.name)
elif is_list_like(arg):
cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike)
if not cache_array.empty:
result = _convert_and_box_cache(arg, cache_array, box, errors, tz)
result = _convert_and_box_cache(arg, cache_array, box, errors)
else:
result = _convert_listlike(arg, box, format)
else:
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/indexes/datetimes/test_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,8 @@ def test_to_datetime_cache(self, utc, format, box, constructor):
date = '20130101 00:00:00'
test_dates = [date] * 10**5
data = constructor(test_dates)
result = pd.to_datetime(data, utc=utc, format=format, box=box)
result = pd.to_datetime(data, utc=utc, format=format, box=box,
cache=True)
expected = pd.to_datetime(data, utc=utc, format=format, box=box,
cache=False)
if box:
Expand Down