10000 PERF: speed up CategoricalIndex.get_loc by topper-123 · Pull Request #23235 · pandas-dev/pandas · GitHub
[go: up one dir, main page]

Skip to content

PERF: speed up CategoricalIndex.get_loc #23235

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Oct 26, 2018
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
add tests for engines and add is_monotonic_uint(32|16)
  • Loading branch information
topper-123 committed Oct 23, 2018
commit a519cc4380470f5e3a67c28a84a47e8f33f77855
9 changes: 5 additions & 4 deletions asv_bench/benchmarks/indexing_engines.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import numpy as np

from pandas._libs import index as li
from pandas._libs import index as libindex


def _get_numeric_engines():
Expand All @@ -11,8 +11,9 @@ def _get_numeric_engines():
('UInt16engine', np.uint16), ('UInt8Engine', np.uint8),
('Float64Engine', np.float64), ('Float32Engine', np.float32),
]
return [(getattr(li, engine_name), dtype)
for engine_name, dtype in engine_names if hasattr(li, engine_name)]
return [(getattr(libindex, engine_name), dtype)
for engine_name, dtype in engine_names
if hasattr(libindex, engine_name)]


class NumericEngineIndexing(object):
Expand Down Expand Up @@ -55,7 +56,7 @@ def setup(self, index_type):
'non_monotonic': np.array(list('abc') * N, dtype=object),
}[index_type]

self.data = li.ObjectEngine(lambda: arr, len(arr))
self.data = libindex.ObjectEngine(lambda: arr, len(arr))
# code belows avoids populating the mapping etc. while timing.
self.data.get_loc('b')

Expand Down
15 changes: 15 additions & 0 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,8 @@ ctypedef fused algos_t:
int16_t
int8_t
uint64_t
uint32_t
uint16_t
uint8_t


Expand Down Expand Up @@ -462,7 +464,12 @@ pad_float32 = pad["float32_t"]
pad_object = pad["object"]
pad_int64 = pad["int64_t"]
pad_int32 = pad["int32_t"]
pad_int16 = pad["int16_t"]
pad_int8 = pad["int8_t"]
pad_uint64 = pad["uint64_t"]
pad_uint32 = pad["uint32_t"]
pad_uint16 = pad["uint16_t"]
pad_uint8 = pad["uint8_t"]
pad_bool = pad["uint8_t"]


Expand Down Expand Up @@ -656,7 +663,12 @@ backfill_float32 = backfill["float32_t"]
backfill_object = backfill["object"]
backfill_int64 = backfill["int64_t"]
backfill_int32 = backfill["int32_t"]
backfill_int16 = backfill["int16_t"]
backfill_int8 = backfill["int8_t"]
backfill_uint64 = backfill["uint64_t"]
backfill_uint32 = backfill["uint32_t"]
backfill_uint16 = backfill["uint16_t"]
backfill_uint8 = backfill["uint8_t"]
backfill_bool = backfill["uint8_t"]


Expand Down Expand Up @@ -872,6 +884,9 @@ is_monotonic_int32 = is_monotonic["int32_t"]
is_monotonic_int16 = is_monotonic["int16_t"]
is_monotonic_int8 = is_monotonic["int8_t"]
is_monotonic_uint64 = is_monotonic["uint64_t"]
is_monotonic_uint32 = is_monotonic["uint32_t"]
is_monotonic_uint16 = is_monotonic["uint16_t"]
is_monotonic_uint8 = is_monotonic["uint8_t"]
is_monotonic_bool = is_monotonic["uint8_t"]


Expand Down
3 changes: 3 additions & 0 deletions pandas/_libs/algos_common_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,9 @@ dtypes = [('float64', 'FLOAT64', 'float64'),
('int16', 'INT16', 'int16'),
('int32', 'INT32', 'int32'),
('int64', 'INT64', 'int64'),
('uint8', 'UINT8', 'uint8'),
('uint16', 'UINT16', 'uint16'),
('uint32', 'UINT32', 'uint32'),
('uint64', 'UINT64', 'uint64'),
# ('platform_int', 'INT', 'int_'),
# ('object', 'OBJECT', 'object_'),
Expand Down
24 changes: 22 additions & 2 deletions pandas/tests/indexes/test_category.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
# -*- coding: utf-8 -*-

import pytest
import numpy as np

import pandas.util.testing as tm
from pandas.core.indexes.api import Index, CategoricalIndex
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas._libs import index as libindex
from .common import Base

from pandas.compat import range, PY3

import numpy as np

from pandas import Categorical, IntervalIndex, compat
from pandas.util.testing import assert_almost_equal
import pandas.core.config as cf
Expand Down Expand Up @@ -1117,3 +1117,23 @@ def test_take_invalid_kwargs(self):
msg = "the 'mode' parameter is not supported"
tm.assert_raises_regex(ValueError, msg, idx.take,
indices, mode='clip')

@pytest.mark.parametrize('dtype, engine_type', [
(np.int8, libindex.Int8Engine),
(np.int16, libindex.Int16Engine),
(np.int32, libindex.Int32Engine),
(np.int64, libindex.Int64Engine),
])
def test_engine_type(self, dtype, engine_type):
if dtype != np.int64:
# num. of uniques required to push CategoricalIndex.codes to a
# dtype (128 categories required for .codes dtype to be int16 etc.)
num_uniques = {np.int8: 1, np.int16: 128, np.int32: 32768}[dtype]
ci = pd.CategoricalIndex(range(num_uniques))
else:
# having 2**32 - 2**31 categories would be very memory-intensive,
# so we cheat a bit with the dtype
ci = pd.CategoricalIndex(range(32768)) # == 2**16 - 2**(16 - 1)
ci.values._codes = ci.values._codes.astype('int64')
assert np.issubdtype(ci.codes.dtype, dtype)
assert isinstance(ci._engine, engine_type)
20 changes: 20 additions & 0 deletions pandas/tests/indexing/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import numpy as np
import pytest

from pandas._libs import index as libindex


@pytest.fixture(params=[
(libindex.Int64Engine, np.int64),
(libindex.Int32Engine, np.int32),
(libindex.Int16Engine, np.int16),
(libindex.Int8Engine, np.int8),
(libindex.UInt64Engine, np.uint64),
(libindex.UInt32Engine, np.uint32),
(libindex.UInt16Engine, np.uint16),
(libindex.UInt8Engine, np.uint8),
(libindex.Float64Engine, np.float64),
(libindex.Float32Engine, np.float32),
], ids=lambda x: x[0].__name__)
def numeric_indexing_engine_type_and_dtype(request):
return request.param
168 changes: 168 additions & 0 deletions pandas/tests/indexing/test_indexing_engines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
import numpy as np

import pandas.util.testing as tm
from pandas import compat
from pandas._libs import algos as libalgos, index as libindex


Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there a way to just combine these into a single set of tests w/o regards to whether this is object / numeric. this is pretty duplicative set of tests

F438 Copy link
Contributor Author
@topper-123 topper-123 Oct 23, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Following the discussion below, I could just remove the tests for ObjectEngine. ObjectEngine is used in lots of other places.

class TestNumericEngine(object):
def test_is_monotonic(self, numeric_indexing_engine_type_and_dtype):
engine_type, dtype = numeric_indexing_engine_type_and_dtype
num = 1000
arr = np.array([1] * num + [2] * num + [3] * num, dtype=dtype)

# monotonic increasing
engine = engine_type(lambda: arr, len(arr))
assert engine.is_monotonic_increasing is True
assert engine.is_monotonic_decreasing is False
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you are adding a ton of tests here which is good. but i suspect a lot of this indexing on the engines is already tested elsewhere, can you remove it where appropriate.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Int(32|16|8)Engine, UInt(32|16|8)Engine and Float(32|16|8)Engine are all new, and of these new ones, only Int(32|16|8)Engine are used on code elsewhere (In CategoricalIndex).

This means that unexpected failures could happen in those unused engines...

I could remove tests for Int(8|16|32|64)Engine and (UInt|Float)64Engine.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not what I mean. I think we have explict test for these engines elsewhere, see if you can find them an dconsolidate them here. You have a nice set of unit tests, but we want to avoid some duplicaton elsehwere

Copy link
Contributor Author
@topper-123 topper-123 Oct 23, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can't find any tests explicitly for these engines. Lots of tests for e.g. Index.is_monotonic etc. but that's a differerent set of test IMO and we should always have tests for the public APIs.

I've searched for "_engine" and "engine" in the pandas/tests directory, but I'm coming up short. I've found tests for various parser engines, but I can't find any for indexing engines (except one insignificant one in multi/test_contains.py).

If you really think there are tests for indexing engines, could you point me to an example?


# monotonic decreasing
engine = engine_type(lambda: arr[::-1], len(arr))
assert engine.is_monotonic_increasing is False
assert engine.is_monotonic_decreasing is True

# neither monotonic increasing or decreasing
arr = np.array([1] * num + [2] * num + [1] * num, dtype=dtype)
engine = engine_type(lambda: arr[::-1], len(arr))
assert engine.is_monotonic_increasing is False
assert engine.is_monotonic_decreasing is False

def test_is_unique(self, numeric_indexing_engine_type_and_dtype):
engine_type, dtype = numeric_indexing_engine_type_and_dtype

# unique
arr = np.array([1, 3, 2], dtype=dtype)
engine = engine_type(lambda: arr, len(arr))
assert engine.is_unique is True

# not unique
arr = np.array([1, 2, 1], dtype=dtype)
engine = engine_type(lambda: arr, len(arr))
assert engine.is_unique is False

def test_get_loc(self, numeric_indexing_engine_type_and_dtype):
engine_type, dtype = numeric_indexing_engine_type_and_dtype

# unique
arr = np.array([1, 2, 3], dtype=dtype)
engine = engine_type(lambda: arr, len(arr))
assert engine.get_loc(2) == 1

# monotonic
num = 1000
arr = np.array([1] * num + [2] * num + [3] * num, dtype=dtype)
engine = engine_type(lambda: arr, len(arr))
assert engine.get_loc(2) == slice(1000, 2000)

# not monotonic
arr = np.array([1, 2, 3] * num, dtype=dtype)
engine = engine_type(lambda: arr, len(arr))
expected = np.array([False, True, False] * num, dtype=bool)
result = engine.get_loc(2)
assert (result == expected).all()

def test_get_backfill_indexer(
self, numeric_indexing_engine_type_and_dtype):
engine_type, dtype = numeric_indexing_engine_type_and_dtype

arr = np.array([1, 5, 10], dtype=dtype)
engine = engine_type(lambda: arr, len(arr))

new = np.array(compat.range(12), dtype=dtype)
result = engine.get_backfill_indexer(new)

expected = libalgos.backfill(arr, new)
tm.assert_numpy_array_equal(result, expected)

def test_get_pad_indexer(
self, numeric_indexing_engine_type_and_dtype):
engine_type, dtype = numeric_indexing_engine_type_and_dtype

arr = np.array([1, 5, 10], dtype=dtype)
engine = engine_type(lambda: arr, len(arr))

new = np.array(compat.range(12), dtype=dtype)
result = engine.get_pad_indexer(new)

expected = libalgos.pad(arr, new)
tm.assert_numpy_array_equal(result, expected)


class TestObjectEngine(object):
engine_type = libindex.ObjectEngine
dtype = np.object_
values = list('abc')

def test_is_monotonic(self):

num = 1000
arr = np.array(['a'] * num + ['a'] * num + ['c'] * num,
dtype=self.dtype)

# monotonic increasing
engine = self.engine_type(lambda: arr, len(arr))
assert engine.is_monotonic_increasing is True
assert engine.is_monotonic_decreasing is False

# monotonic decreasing
engine = self.engine_type(lambda: arr[::-1], len(arr))
assert engine.is_monotonic_increasing is False
assert engine.is_monotonic_decreasing is True

# neither monotonic increasing or decreasing
arr = np.array(['a'] * num + ['b'] * num + ['a'] * num,
dtype=self.dtype)
engine = self.engine_type(lambda: arr[::-1], len(arr))
assert engine.is_monotonic_increasing is False
assert engine.is_monotonic_decreasing is False

def test_is_unique(self):
# unique
arr = np.array(self.values, dtype=self.dtype)
engine = self.engine_type(lambda: arr, len(arr))
assert engine.is_unique is True

# not unique
arr = np.array(['a', 'b', 'a'], dtype=self.dtype)
engine = self.engine_type(lambda: arr, len(arr))
assert engine.is_unique is False

def test_get_loc(self):
# unique
arr = np.array(self.values, dtype=self.dtype)
engine = self.engine_type(lambda: arr, len(arr))
assert engine.get_loc('b') == 1

# monotonic
num = 1000
arr = np.array(['a'] * num + ['b'] * num + ['c'] * num,
dtype=self.dtype)
engine = self.engine_type(lambda: arr, len(arr))
assert engine.get_loc('b') == slice(1000, 2000)

# not monotonic
arr = np.array(self.values * num, dtype=self.dtype)
engine = self.engine_type(lambda: arr, len(arr))
expected = np.array([False, True, False] * num, dtype=bool)
result = engine.get_loc('b')
assert (result == expected).all()

def test_get_backfill_indexer(self):
arr = np.array(['a', 'e', 'j'], dtype=self.dtype)
engine = self.engine_type(lambda: arr, len(arr))

new = np.array(list('abcdefghij'), dtype=self.dtype)
result = engine.get_backfill_indexer(new)

expected = libalgos.backfill_object(arr, new)
tm.assert_numpy_array_equal(result, expected)

def test_get_pad_indexer(self):
arr = np.array(['a', 'e', 'j'], dtype=self.dtype)
engine = self.engine_type(lambda: arr, len(arr))

new = np.array(list('abcdefghij'), dtype=self.dtype)
result = engine.get_pad_indexer(new)

expected = libalgos.pad_object(arr, new)
tm.assert_numpy_array_equal(result, expected)
0