BUG: Maintain column order with groupby.nth

pandas-dev · jreback · Nov 20, 2018 · Sep 23, 2018 · Oct 9, 2018 · Oct 13, 2018
commit bc68c371c255e64802d4e30cf9682943cbb99761
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -1323,6 +1323,7 @@ Groupby/Resample/Rolling
 - Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` when resampling by a weekly offset (``'W'``) across a DST transition (:issue:`9119`, :issue:`21459`)
 - Bug in :meth:`DataFrame.expanding` in which the ``axis`` argument was not being respected during aggregations (:issue:`23372`)
 - Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.transform` which caused missing values when the input function can accept a :class:`DataFrame` but renames it (:issue:`23455`).
+- Bug in :meth:`pandas.core.groupby.GroupBy.nth` where column order was not always preserved (:issue:`20760`)
 
 Reshaping
 ^^^^^^^^^

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -493,7 +493,8 @@ def _set_group_selection(self):
 
         if len(groupers):
             # GH12839 clear selected obj cache when group selection changes
-            self._group_selection = ax.difference(Index(groupers)).tolist()
+            self._group_selection = ax.difference(Index(groupers),
+                                                  sort=False).tolist()
             self._reset_cache('_selected_obj')
 
     def _set_result_index_ordered(self, result):

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -2910,17 +2910,20 @@ def intersection(self, other):
             taken.name = None
         return taken
 
-    def difference(self, other):
+    def difference(self, other, sort=True):
         """
         Return a new Index with elements from the index that are not in
         `other`.
 
         This is the set difference of two Index objects.
-        It's sorted if sorting is possible.
 
         Parameters
         ----------
         other : Index or array-like
+        sort : bool, default True
+            Sort the result if possible; otherwise keep the order of `self`.
+
+            .. versionadded:: 0.24.0
 
         Returns
         -------
@@ -2929,10 +2932,12 @@ def difference(self, other):
         Examples
         --------
 
-        >>> idx1 = pd.Index([1, 2, 3, 4])
+        >>> idx1 = pd.Index([2, 1, 3, 4])
         >>> idx2 = pd.Index([3, 4, 5, 6])
         >>> idx1.difference(idx2)
         Int64Index([1, 2], dtype='int64')
+        >>> idx1.difference(idx2, sort=False)
+        Int64Index([2, 1], dtype='int64')
 
         """
         self._assert_can_do_setop(other)
@@ -2951,10 +2956,11 @@ def difference(self, other):
         label_diff = np.setdiff1d(np.arange(this.size), indexer,
                                   assume_unique=True)
         the_diff = this.values.take(label_diff)
-        try:
-            the_diff = sorting.safe_sort(the_diff)
-        except TypeError:
-            pass
+        if sort:
+            try:
+                the_diff = sorting.safe_sort(the_diff)
+            except TypeError:
+                pass
 
         return this._shallow_copy(the_diff, name=result_name, freq=None)
 

diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py
@@ -390,3 +390,27 @@ def test_nth_empty():
                                           names=['a', 'b']),
                          columns=['c'])
     assert_frame_equal(result, expected)
+
+
+def test_nth_column_order():
+    # GH 20760
+    # Check that nth preserves column order
+    df = DataFrame([[1, 'b', 100],
+                    [1, 'a', 50],
+                    [1, 'a', np.nan],
+                    [2, 'c', 200],
+                    [2, 'd', 150]],
+                   columns=['A', 'C', 'B'])
+    result = df.groupby('A').nth(0)
+    expected = DataFrame([['b', 100.0],
+                          ['c', 200.0]],
+                         columns=['C', 'B'],
+                         index=Index([1, 2], name='A'))
+    assert_frame_equal(result, expected)
+
+    result = df.groupby('A').nth(-1, dropna='any')
+    expected = DataFrame([['a', 50.0],
+                          ['d', 150.0]],
+                         columns=['C', 'B'],
+                         index=Index([1, 2], name='A'))
+    assert_frame_equal(result, expected)