PERF: Index._shallow_copy shares _cache with copies of self by topper-123 · Pull Request #36840 · pandas-dev/pandas · GitHub
[go: up one dir, main page]

Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
8000
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
PERF: share _cache, don't share _id
  • Loading branch information
topper-123 committed Oct 4, 2020
commit ffa9eeeb03fbfaead1263fea98f8047964fca570
5 changes: 3 additions & 2 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,9 @@ Performance improvements
- ``Styler`` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`)
- Performance improvement in :meth:`pd.to_datetime` with non-ns time unit for ``float`` ``dtype`` columns (:issue:`20445`)
- Performance improvement in setting values on a :class:`IntervalArray` (:issue:`36310`)
- Performance improvement for the :meth:`~Index.equals` method on all index classes, when compared to copies of the same index (:issue:`36840`)
- The internal index method :meth:`~Index._shallow_copy` now makes the new index and the original index share cached attributes,
so an attribute that has been cached on either index does not need to be created again on the other. This can speed up operations that depend on creating copies of existing indexes (:issue:`36840`)


.. ---------------------------------------------------------------------------

Expand Down Expand Up @@ -359,7 +361,6 @@ Indexing
- Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order. (:issue:`35584`)
- Bug in :meth:`Index.get_indexer` and :meth:`Index.get_indexer_non_unique` where int64 arrays are returned instead of intp. (:issue:`36359`)
- Bug in :meth:`DataFrame.sort_index` where parameter ascending passed as a list on a single level index gives wrong result. (:issue:`32334`)
- Bug in :meth:`Index.equals`, where it required the name to be equal. This method should not compare names for equality (:issue:`36840`)

Missing
^^^^^^^
Expand Down
3 changes: 1 addition & 2 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,8 +566,7 @@ def _shallow_copy(self, values=None, name: Label = no_default):
return self._simple_new(values, name=name)

result = self._simple_new(self._values, name=name)
result._cache = self._cache.copy()
result._id = self._id
result._cache = self._cache
return result

def is_(self, other) -> bool:
Expand Down
3 changes: 1 addition & 2 deletions pandas/core/indexes/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -679,8 +679,7 @@ def _shallow_copy(self, values=None, name: Label = lib.no_default):
return self._simple_new(values, name=name)

result = self._simple_new(self._data, name=name)
result._cache = self._cache.copy()
result._id = self._id
result._cache = self._cache
return result

# --------------------------------------------------------------------
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/indexes/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,12 +335,12 @@ def _shallow_copy(
self, values: Optional[IntervalArray] = None, name: Label = lib.no_default
):
name = self.name if name is lib.no_default else name

if values is not None:
return self._simple_new(values, name=name)

result = self._simple_new(self._data, name=name)
result._cache = self._cache.copy()
result._id = self._id
result._cache = self._cache
return result

@cache_readonly
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1089,7 +1089,6 @@ def _shallow_copy(
)
result._cache = self._cache.copy()
result._cache.pop("levels", None) # GH32669
result._id = self._id
return result

def symmetric_difference(self, other, result_name=None, sort=None):
Expand Down Expand Up @@ -1193,6 +1192,7 @@ def __array__(self, dtype=None) -> np.ndarray:
def view(self, cls=None):
""" this is defined as a copy with the same identity """
result = self.copy()
result._id = self._id
return result

@doc(Index.__contains__)
Expand Down
3 changes: 1 addition & 2 deletions pandas/core/indexes/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,8 +265,7 @@ def _shallow_copy(self, values=None, name: Label = no_default):
return self._simple_new(values, name=name)

result = self._simple_new(self._data, name=name)
result._cache = self._cache.copy()
result._id = self._id
result._cache = self._cache
return result

def _maybe_convert_timedelta(self, other):
Expand Down
3 changes: 1 addition & 2 deletions pandas/core/indexes/range.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,8 +401,7 @@ def _shallow_copy(self, values=None, name: Label = no_default):
return Int64Index._simple_new(values, name=name)

result = self._simple_new(self._range, name=name)
result._cache = self._cache.copy()
result._id = self._id
result._cache = self._cache
return result

@doc(Int64Index.copy)
Expand Down
26 changes: 7 additions & 19 deletions pandas/tests/indexes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -936,33 +936,21 @@ def test_contains_requires_hashable_raises(self):
{} in idx._engine

def test_copy_copies_cache(self):
# GH32898
# GH32898, GH36840
idx = self.create_index()
idx.get_loc(idx[0]) # populates the _cache.
copy = idx.copy()

# check that the copied cache is a copy of the original
assert idx._cache == copy._cache
assert idx._cache is not copy._cache
# cache values should reference the same object
for key, val in idx._cache.items():
assert copy._cache[key] is val, key
assert copy._cache is idx._cache

def test_shallow_copy_copies_cache(self):
# GH32669
# GH32669, GH36840
idx = self.create_index()
idx.get_loc(idx[0]) # populates the _cache.
shallow_copy = idx._shallow_copy()

# check that the shallow_copied cache is a copy of the original
assert idx._cache == shallow_copy._cache
assert idx._cache is not shallow_copy._cache
# cache values should reference the same object
for key, val in idx._cache.items():
assert shallow_copy._cache[key] is val, key
assert shallow_copy._cache is idx._cache

def test_shallow_copy_copies_id(self):
# GH36840
idx = self.create_index()
shallow_copy = idx._shallow_copy()
assert idx._id is shallow_copy._id
shallow_copy = idx._shallow_copy(idx._data)
assert shallow_copy._cache is not idx._cache
assert shallow_copy._cache == {}
2 changes: 1 addition & 1 deletion pandas/tests/indexes/datetimes/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -931,7 +931,7 @@ def test_is_(self):
dti = date_range(start="1/1/2005", end="12/1/2005", freq="M")
assert dti.is_(dti)
assert dti.is_(dti.view())
assert dti.is_(dti.copy())
assert not dti.is_(dti.copy())

def test_index_cast_datetime64_other_units(self):
arr = np.arange(0, 100, 10, dtype=np.int64).view("M8[D]")
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/indexes/multi/test_equivalence.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ def test_is_():
assert mi2.is_(mi)
assert mi.is_(mi2)

assert mi.is_(mi.set_names(["C", "D"]))
assert not mi.is_(mi.set_names(["C", "D"]))
mi2 = mi.view()
mi2.set_names(["E", "F"], inplace=True)
assert mi.is_(mi2)
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -523,8 +523,8 @@ def test_is_(self):
assert ind.is_(ind)
assert ind.is_(ind.view().view().view().view())
assert not ind.is_(Index(range(10)))
assert ind.is_(ind.copy())
assert ind.is_(ind.copy(deep=False))
assert not ind.is_(ind.copy())
assert not ind.is_(ind.copy(deep=False))
assert not ind.is_(ind[:])
assert not ind.is_(np.array(range(10)))

Expand Down
0