categorical: searchsorted returns a scalar if input was scalar

pandas-dev · fjetter · May 13, 2018 · May 13, 2018 · May 25, 2018 · May 25, 2018
commit bd3d440c6c0047141c802551c93184e7c531df7e
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -1289,6 +1289,9 @@ Indexing
 - Bug in performing in-place operations on a ``DataFrame`` with a duplicate ``Index`` (:issue:`17105`)
 - Bug in :meth:`IntervalIndex.get_loc` and :meth:`IntervalIndex.get_indexer` when used with an :class:`IntervalIndex` containing a single interval (:issue:`17284`, :issue:`20921`)
 - Bug in ``.loc`` with a ``uint64`` indexer (:issue:`20722`)
+- Bug in ``CategoricalIndex.searchsorted`` where the method did not return a scalar when the input value was a scalar (:issue:`21019`)
+- Bug in ``CategoricalIndex`` where slicing beyond the range of the data raised a ``KeyError`` (:issue:`21019`)
+
 
 MultiIndex
 ^^^^^^^^^^

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -1342,6 +1342,8 @@ def searchsorted(self, value, side='left', sorter=None):
 
         if -1 in values_as_codes:
             raise ValueError("Value(s) to be inserted must be in categories.")
+        if is_scalar(value):
+            values_as_codes = np.asscalar(values_as_codes)
 
         return self.codes.searchsorted(values_as_codes, side=side,
                                        sorter=sorter)

diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
@@ -432,13 +432,14 @@ def get_loc(self, key, method=None):
         >>> monotonic_index.get_loc('b')
         slice(1, 3, None)
 
-        >>> non_monotonic_index = p.dCategoricalIndex(list('abcb'))
+        >>> non_monotonic_index = pd.CategoricalIndex(list('abcb'))
         >>> non_monotonic_index.get_loc('b')
         array([False,  True, False,  True], dtype=bool)
         """
-        codes = self.categories.get_loc(key)
-        if (codes == -1):
-            raise KeyError(key)
+        try:
+            codes = self.categories.get_loc(key)
+        except KeyError:
+            raise KeyError("Category `{}` unknown".format(key))
         return self._engine.get_loc(codes)
 
     def get_value(self, series, key):

diff --git a/pandas/tests/categorical/test_analytics.py b/pandas/tests/categorical/test_analytics.py
@@ -86,9 +86,9 @@ def test_searchsorted(self):
         # Searching for single item argument, side='left' (default)
         res_cat = c1.searchsorted('apple')
         res_ser = s1.searchsorted('apple')
-        exp = np.array([2], dtype=np.intp)
-        tm.assert_numpy_array_equal(res_cat, exp)
-        tm.assert_numpy_array_equal(res_ser, exp)
+        exp = np.int64(2)
+        assert res_cat == exp
+        assert res_ser == exp
 
         # Searching for single item array, side='left' (default)
         res_cat = c1.searchsorted(['bread'])

diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py
@@ -627,15 +627,52 @@ def test_reindexing(self):
                       lambda: self.df2.reindex(['a'], limit=2))
 
     def test_loc_slice(self):
-        # slicing
-        # not implemented ATM
-        # GH9748
+        # Raises KeyError since the left slice 'a' is not unique
+        pytest.raises(KeyError, lambda: self.df.loc["a":"b"])
+        result = self.df.loc["b":"c"]
 
-        pytest.raises(TypeError, lambda: self.df.loc[1:5])
+        expected = DataFrame(
+            {"A": [2, 3, 4]},
+            index=CategoricalIndex(
+                ["b", "b", "c"], name="B", categories=list("cab")
+            ),
+        )
+
+        assert_frame_equal(result, expected)
+
+        ordered_df = DataFrame(
+            {"A": range(0, 6)},
+            index=CategoricalIndex(list("aabcde"), name="B", ordered=True),
+        )
+
+        result = ordered_df.loc["a":"b"]
+        expected = DataFrame(
+            {"A": range(0, 3)},
+            index=CategoricalIndex(
+                list("aab"), categories=list("abcde"), name="B", ordered=True
+            ),
+        )
+        assert_frame_equal(result, expected)
+
+        # This should select the entire dataframe
+        result = ordered_df.loc["a":"e"]
+        assert_frame_equal(result, ordered_df)
+
+        df_slice = ordered_df.loc["a":"b"]
+        # Although the edge is not within the slice, this should fall back
+        # to searchsorted slicing since the category is known
+        result = df_slice.loc["a":"e"]
+        assert_frame_equal(result, df_slice)
 
-        # result = df.loc[1:5]
-        # expected = df.iloc[[1,2,3,4]]
-        # assert_frame_equal(result, expected)
+        # If the categorical is unordered and the requested edge
+        # is not in the slice we cannot perform slicing
+        df_slice.index = df_slice.index.as_unordered()
+        with pytest.raises(KeyError):
+            df_slice.loc["a":"e"]
+
+        with pytest.raises(KeyError):
+            # If the category is not known, there is nothing we can do
+            ordered_df.loc["a":"z"]
 
     def test_boolean_selection(self):