-
-
Notifications
You must be signed in to change notification settings - Fork 18.7k
PERF: speed up CategoricalIndex.get_loc #23235
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
import numpy as np | ||
import pytest | ||
|
||
from pandas._libs import index as libindex | ||
|
||
|
||
def _engine_type_name(engine_and_dtype):
    """Return the engine class name of an ``(engine_type, dtype)`` pair."""
    engine_type, _ = engine_and_dtype
    return engine_type.__name__


@pytest.fixture(
    params=[
        (libindex.Int64Engine, np.int64),
        (libindex.Int32Engine, np.int32),
        (libindex.Int16Engine, np.int16),
        (libindex.Int8Engine, np.int8),
        (libindex.UInt64Engine, np.uint64),
        (libindex.UInt32Engine, np.uint32),
        (libindex.UInt16Engine, np.uint16),
        (libindex.UInt8Engine, np.uint8),
        (libindex.Float64Engine, np.float64),
        (libindex.Float32Engine, np.float32),
    ],
    ids=_engine_type_name,
)
def numeric_indexing_engine_type_and_dtype(request):
    """
    Yield each numeric ``libindex`` engine class with its numpy dtype.

    The parametrized value is an ``(engine_type, dtype)`` tuple; the test id
    is the name of the engine class.
    """
    return request.param
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,168 @@ | ||
import numpy as np | ||
|
||
import pandas.util.testing as tm | ||
from pandas import compat | ||
from pandas._libs import algos as libalgos, index as libindex | ||
|
||
|
||
class TestNumericEngine(object): | ||
def test_is_monotonic(self, numeric_indexing_engine_type_and_dtype): | ||
engine_type, dtype = numeric_indexing_engine_type_and_dtype | ||
num = 1000 | ||
arr = np.array([1] * num + [2] * num + [3] * num, dtype=dtype) | ||
|
||
# monotonic increasing | ||
engine = engine_type(lambda: arr, len(arr)) | ||
assert engine.is_monotonic_increasing is True | ||
assert engine.is_monotonic_decreasing is False | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You are adding a ton of tests here, which is good, but I suspect a lot of this indexing on the engines is already tested elsewhere; can you remove it where appropriate? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Int(32|16|8)Engine, UInt(32|16|8)Engine and Float32Engine are all new, and of these new ones, only Int(32|16|8)Engine are used in code elsewhere (in CategoricalIndex). This means that unexpected failures could happen in those unused engines... I could remove tests for Int(8|16|32|64)Engine and (UInt|Float)64Engine. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not what I mean. I think we have explicit tests for these engines elsewhere; see if you can find them and consolidate them here. You have a nice set of unit tests, but we want to avoid some duplication elsewhere. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I can't find any tests explicitly for these engines. There are lots of tests for e.g. Index.is_monotonic etc., but that's a different set of tests IMO, and we should always have tests for the public APIs. I've searched for "_engine" and "engine" in the pandas/tests directory, but I'm coming up short. I've found tests for various parser engines, but I can't find any for indexing engines (except one insignificant one in multi/test_contains.py). If you really think there are tests for indexing engines, could you point me to an example? |
||
|
||
# monotonic decreasing | ||
engine = engine_type(lambda: arr[::-1], len(arr)) | ||
assert engine.is_monotonic_increasing is False | ||
assert engine.is_monotonic_decreasing is True | ||
|
||
# neither monotonic increasing or decreasing | ||
arr = np.array([1] * num + [2] * num + [1] * num, dtype=dtype) | ||
engine = engine_type(lambda: arr[::-1], len(arr)) | ||
assert engine.is_monotonic_increasing is False | ||
assert engine.is_monotonic_decreasing is False | ||
|
||
def test_is_unique(self, numeric_indexing_engine_type_and_dtype):
    """Check ``is_unique`` for a duplicate-free and a duplicated array."""
    engine_type, dtype = numeric_indexing_engine_type_and_dtype

    # every value occurs exactly once
    distinct = np.array([1, 3, 2], dtype=dtype)
    distinct_engine = engine_type(lambda: distinct, len(distinct))
    assert distinct_engine.is_unique is True

    # the value 1 occurs twice
    repeated = np.array([1, 2, 1], dtype=dtype)
    repeated_engine = engine_type(lambda: repeated, len(repeated))
    assert repeated_engine.is_unique is False
|
||
def test_get_loc(self, numeric_indexing_engine_type_and_dtype):
    """
    Check ``get_loc(2)`` on unique, monotonic and non-monotonic values.

    The expected results are a position, a slice and a boolean mask,
    respectively.
    """
    engine_type, dtype = numeric_indexing_engine_type_and_dtype
    num = 1000

    # unique values: expect the single position of 2
    unique_vals = np.array([1, 2, 3], dtype=dtype)
    engine = engine_type(lambda: unique_vals, len(unique_vals))
    assert engine.get_loc(2) == 1

    # monotonic with duplicates: expect the slice spanning the run of 2s
    sorted_vals = np.array([1] * num + [2] * num + [3] * num, dtype=dtype)
    engine = engine_type(lambda: sorted_vals, len(sorted_vals))
    assert engine.get_loc(2) == slice(num, 2 * num)

    # duplicates that are not monotonic: expect a mask marking every 2
    cycled_vals = np.array([1, 2, 3] * num, dtype=dtype)
    engine = engine_type(lambda: cycled_vals, len(cycled_vals))
    expected = np.array([False, True, False] * num, dtype=bool)
    mask = engine.get_loc(2)
    assert (mask == expected).all()
|
||
def test_get_backfill_indexer(self, numeric_indexing_engine_type_and_dtype):
    """Compare ``get_backfill_indexer`` with ``libalgos.backfill``."""
    engine_type, dtype = numeric_indexing_engine_type_and_dtype

    values = np.array([1, 5, 10], dtype=dtype)
    engine = engine_type(lambda: values, len(values))

    # targets 0..11 lie before, between and after the engine's values
    targets = np.array(compat.range(12), dtype=dtype)
    actual = engine.get_backfill_indexer(targets)

    tm.assert_numpy_array_equal(actual, libalgos.backfill(values, targets))
|
||
def test_get_pad_indexer(self, numeric_indexing_engine_type_and_dtype):
    """Compare ``get_pad_indexer`` with ``libalgos.pad``."""
    engine_type, dtype = numeric_indexing_engine_type_and_dtype

    values = np.array([1, 5, 10], dtype=dtype)
    engine = engine_type(lambda: values, len(values))

    # targets 0..11 lie before, between and after the engine's values
    targets = np.array(compat.range(12), dtype=dtype)
    actual = engine.get_pad_indexer(targets)

    tm.assert_numpy_array_equal(actual, libalgos.pad(values, targets))
|
||
|
||
class TestObjectEngine(object):
    """Tests for ``libindex.ObjectEngine`` built on single-letter strings."""

    # engine class under test and the dtype used for every array it is given
    engine_type = libindex.ObjectEngine
    dtype = np.object_
    # small unique, sorted sample shared by the uniqueness and get_loc tests
    values = list('abc')

    def test_is_monotonic(self):
        """Check both monotonicity flags for three orderings of the data."""
        num = 1000
        # three sorted runs 'a' < 'b' < 'c'
        arr = np.array(['a'] * num + ['b'] * num + ['c'] * num,
                       dtype=self.dtype)

        # monotonic increasing
        engine = self.engine_type(lambda: arr, len(arr))
        assert engine.is_monotonic_increasing is True
        assert engine.is_monotonic_decreasing is False

        # monotonic decreasing
        engine = self.engine_type(lambda: arr[::-1], len(arr))
        assert engine.is_monotonic_increasing is False
        assert engine.is_monotonic_decreasing is True

        # neither monotonic increasing or decreasing
        arr = np.array(['a'] * num + ['b'] * num + ['a'] * num,
                       dtype=self.dtype)
        engine = self.engine_type(lambda: arr[::-1], len(arr))
        assert engine.is_monotonic_increasing is False
        assert engine.is_monotonic_decreasing is False

    def test_is_unique(self):
        """Check ``is_unique`` for a duplicate-free and a duplicated array."""
        # unique
        arr = np.array(self.values, dtype=self.dtype)
        engine = self.engine_type(lambda: arr, len(arr))
        assert engine.is_unique is True

        # not unique
        arr = np.array(['a', 'b', 'a'], dtype=self.dtype)
        engine = self.engine_type(lambda: arr, len(arr))
        assert engine.is_unique is False

    def test_get_loc(self):
        """
        Check ``get_loc('b')`` on unique, monotonic and non-monotonic values.

        The expected results are a position, a slice and a boolean mask,
        respectively.
        """
        # unique
        arr = np.array(self.values, dtype=self.dtype)
        engine = self.engine_type(lambda: arr, len(arr))
        assert engine.get_loc('b') == 1

        # monotonic
        num = 1000
        arr = np.array(['a'] * num + ['b'] * num + ['c'] * num,
                       dtype=self.dtype)
        engine = self.engine_type(lambda: arr, len(arr))
        assert engine.get_loc('b') == slice(1000, 2000)

        # not monotonic
        arr = np.array(self.values * num, dtype=self.dtype)
        engine = self.engine_type(lambda: arr, len(arr))
        expected = np.array([False, True, False] * num, dtype=bool)
        result = engine.get_loc('b')
        assert (result == expected).all()

    def test_get_backfill_indexer(self):
        """Compare ``get_backfill_indexer`` with ``libalgos.backfill_object``."""
        arr = np.array(['a', 'e', 'j'], dtype=self.dtype)
        engine = self.engine_type(lambda: arr, len(arr))

        new = np.array(list('abcdefghij'), dtype=self.dtype)
        result = engine.get_backfill_indexer(new)

        expected = libalgos.backfill_object(arr, new)
        tm.assert_numpy_array_equal(result, expected)

    def test_get_pad_indexer(self):
        """Compare ``get_pad_indexer`` with ``libalgos.pad_object``."""
        arr = np.array(['a', 'e', 'j'], dtype=self.dtype)
        engine = self.engine_type(lambda: arr, len(arr))

        new = np.array(list('abcdefghij'), dtype=self.dtype)
        result = engine.get_pad_indexer(new)

        expected = libalgos.pad_object(arr, new)
        tm.assert_numpy_array_equal(result, expected)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a way to combine these into a single set of tests, without regard to whether the engine is object or numeric? This is a pretty duplicative set of tests.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Following the discussion below, I could just remove the tests for ObjectEngine. ObjectEngine is used in lots of other places.