-
-
Notifications
You must be signed in to change notification settings - Fork 18.7k
BUG: CategoricalIndex.searchsorted doesn't return a scalar if input was scalar #21019
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
bd3d440
c4249fa
1c25d65
d4e9879
25b5fd7
04ca52f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1289,6 +1289,9 @@ Indexing | |
- Bug in performing in-place operations on a ``DataFrame`` with a duplicate ``Index`` (:issue:`17105`) | ||
- Bug in :meth:`IntervalIndex.get_loc` and :meth:`IntervalIndex.get_indexer` when used with an :class:`IntervalIndex` containing a single interval (:issue:`17284`, :issue:`20921`) | ||
- Bug in ``.loc`` with a ``uint64`` indexer (:issue:`20722`) | ||
- Bug in ``CategoricalIndex.searchsorted`` where the method didn't return a scalar when the input values was scalar (:issue:`21019`) | ||
- Bug in ``CategoricalIndex`` where slicing beyond the range of the data raised a KeyError (:issue:`21019`) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. use double backticks on KeyError There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i see you added to 0.23.1 below, ok, make these changes there and revert this one |
||
|
||
MultiIndex | ||
^^^^^^^^^^ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1342,6 +1342,8 @@ def searchsorted(self, value, side='left', sorter=None): | |
|
||
if -1 in values_as_codes: | ||
raise ValueError("Value(s) to be inserted must be in categories.") | ||
if is_scalar(value): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. would rather do this in use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok I c, change here is ok There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As I found out in #21699, numpy.searchsorted doesn't like python ints, but needs numpy ints to archieve its speed. >>> n = 1_000_000
>>> c = pd.Categorical(list('a' * n + 'b' * n + 'c' * n), ordered=True)
>>> %timeit c.codes.searchsorted(1) # python int
7 ms ± 24.7 µs per loop
>>> c.codes.dtype
int8
>>> %timeit c.codes.searchsorted(np.int8(1))
2.46 µs ± 82.4 ns per loop So the scalar version should be |
||
values_as_codes = np.asscalar(values_as_codes) | ||
|
||
return self.codes.searchsorted(values_as_codes, side=side, | ||
sorter=sorter) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -432,13 +432,14 @@ def get_loc(self, key, method=None): | |
>>> monotonic_index.get_loc('b') | ||
slice(1, 3, None) | ||
|
||
>>> non_monotonic_index = p.dCategoricalIndex(list('abcb')) | ||
>>> non_monotonic_index = pd.CategoricalIndex(list('abcb')) | ||
>>> non_monotonic_index.get_loc('b') | ||
array([False, True, False, True], dtype=bool) | ||
""" | ||
codes = self.categories.get_loc(key) | ||
if (codes == -1): | ||
raise KeyError(key) | ||
try: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is redundant, KeyError is already raised by get_loc |
||
codes = self.categories.get_loc(key) | ||
except KeyError: | ||
raise KeyError("Category `{}` unknown".format(key)) | ||
return self._engine.get_loc(codes) | ||
|
||
def get_value(self, series, key): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -86,9 +86,9 @@ def test_searchsorted(self): | |
# Searching for single item argument, side='left' (default) | ||
res_cat = c1.searchsorted('apple') | ||
res_ser = s1.searchsorted('apple') | ||
exp = np.array([2], dtype=np.intp) | ||
tm.assert_numpy_array_equal(res_cat, exp) | ||
tm.assert_numpy_array_equal(res_ser, exp) | ||
exp = np.int64(2) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hmm, odd that this doesn't fail, this should be a platform indexer ( |
||
assert res_cat == exp | ||
assert res_ser == exp | ||
|
||
# Searching for single item array, side='left' (default) | ||
res_cat = c1.searchsorted(['bread']) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -627,15 +627,52 @@ def test_reindexing(self): | |
lambda: self.df2.reindex(['a'], limit=2)) | ||
|
||
def test_loc_slice(self): | ||
# slicing | ||
# not implemented ATM | ||
# GH9748 | ||
# Raises KeyError since the left slice 'a' is not unique | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add this issue reference here (gh-....) |
||
pytest.raises(KeyError, lambda: self.df.loc["a":"b"]) | ||
result = self.df.loc["b":"c"] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this tests is the same as the 3rd case? if so do we need both? (or if not can you move them together and comment) |
||
|
||
pytest.raises(TypeError, lambda: self.df.loc[1:5]) | ||
expected = DataFrame( | ||
{"A": [2, 3, 4]}, | ||
index=CategoricalIndex( | ||
["b", "b", "c"], name="B", categories=list("cab") | ||
), | ||
) | ||
|
||
assert_frame_equal(result, expected) | ||
|
||
ordered_df = DataFrame( | ||
{"A": range(0, 6)}, | ||
index=CategoricalIndex(list("aabcde"), name="B", ordered=True), | ||
) | ||
|
||
result = ordered_df.loc["a":"b"] | ||
expected = DataFrame( | ||
{"A": range(0, 3)}, | ||
index=CategoricalIndex( | ||
list("aab"), categories=list("abcde"), name="B", ordered=True | ||
), | ||
) | ||
assert_frame_equal(result, expected) | ||
|
||
# This should select the entire dataframe | ||
result = ordered_df.loc["a":"e"] | ||
assert_frame_equal(result, ordered_df) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you also test the left edge as well |
||
df_slice = ordered_df.loc["a":"b"] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this result looks suspect. both a and b are in the categories and its ordered? also, don't use label based indexers to select the expected, rather use |
||
# Although the edge is not within the slice, this should fall back | ||
# to searchsorted slicing since the category is known | ||
result = df_slice.loc["a":"e"] | ||
assert_frame_equal(result, df_slice) | ||
|
||
# result = df.loc[1:5] | ||
# expected = df.iloc[[1,2,3,4]] | ||
# assert_frame_equal(result, expected) | ||
# If the categorical is not sorted and the requested edge | ||
# is not in the slice we cannot perform slicing | ||
df_slice.index = df_slice.index.as_unordered() | ||
with pytest.raises(KeyError): | ||
df_slice.loc["a":"e"] | ||
|
||
with pytest.raises(KeyError): | ||
# If the category is not known, there is nothing we can do | ||
ordered_df.loc["a":"z"] | ||
|
||
def test_boolean_selection(self): | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
use the
:func:
syntaxdidn't -> did not