10000 API: better error-handling for df.set_index by h-vetinari · Pull Request #22486 · pandas-dev/pandas · GitHub
[go: up one dir, main page]

Skip to content

API: better error-handling for df.set_i 8000 ndex #22486

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Oct 19, 2018
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Allow list-likes (review jreback)
  • Loading branch information
h-vetinari committed Sep 26, 2018
commit fa68660f8feed9d6d084c00b084fc8fe466a7e8d
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -551,7 +551,7 @@ Other API Changes
- :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`)
- :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`)
- :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`)
- :meth:`DataFrame.set_index` now raises a ``TypeError`` for incorrect types, has an improved ``KeyError`` message, and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`)
- :meth:`DataFrame.set_index` now allows all one-dimensional list-likes, raises a ``TypeError`` for incorrect types, has an improved ``KeyError`` message, and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`)

.. _whatsnew_0240.deprecations:

Expand Down
38 changes: 22 additions & 16 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@
is_sequence,
is_named_tuple)
from pandas.core.dtypes.concat import _get_sliced_frame_result_type
from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass, ABCMultiIndex
from pandas.core.dtypes.generic import (ABCSeries, ABCIndexClass,
ABCMultiIndex, ABCDataFrame)
from pandas.core.dtypes.missing import isna, notna


Expand Down Expand Up @@ -3915,16 +3916,18 @@ def set_index(self, keys, drop=True, append=False, inplace=False,

missing = []
for x in keys:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you rename x -> col for consistency with the loop below

if not (is_scalar(x) or isinstance(x, tuple)):
if not isinstance(x, (ABCSeries, ABCIndexClass, ABCMultiIndex,
list, np.ndarray)):
raise TypeError('The parameter "keys" may only contain a '
'combination of the following: valid '
'column keys, Series, Index, MultiIndex, '
'list or np.ndarray')
else:
if x not in self:
missing.append(x)
if (is_scalar(x) or isinstance(x, tuple)) and x in self:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a comment here

continue
elif is_scalar(x) and x not in self:
# a tuple gets tried as column key first;
# otherwise considered as a list-like; i.e. not missing
missing.append(x)
elif (not is_list_like(x) or isinstance(x, set)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shouldn't not is_like(x) or getattr(x, 'ndim', 1) > 1 work here

or isinstance(x, ABCDataFrame)
or (isinstance(x, np.ndarray) and x.ndim > 1)):
raise TypeError('The parameter "keys" may only contain a '
'combination of valid column keys and '
'one-dimensional list-likes')

if missing:
raise KeyError('{}'.format(missing))
Expand All @@ -3950,16 +3953,19 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
for n in range(col.nlevels):
arrays.append(col._get_level_values(n))
names.extend(col.names)
elif isinstance(col, ABCIndexClass):
# Index but not MultiIndex (treated above)
elif isinstance(col, (ABCIndexClass, ABCSeries)):
# if Index then not MultiIndex (treated above)
arrays.append(col)
names.append(col.name)
elif isinstance(col, ABCSeries):
arrays.append(col._values)
names.append(col.name)
elif isinstance(col, (list, np.ndarray)):
arrays.append(col)
names.append(None)
elif (is_list_like(col)
and not (isinstance(col, tuple) and col in self)):
# all other list-likes (but avoid valid column keys)
col = list(col) # ensure iterator do not get read twice etc.
arrays.append(col)
names.append(None)
# from here, col can only be a column label
else:
arrays.append(frame[col]._values)
Expand Down
7 changes: 4 additions & 3 deletions pandas/tests/frame/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,12 +180,13 @@ def frame_of_index_cols():
"""
Fixture for DataFrame of columns that can be used for indexing

Columns are ['A', 'B', 'C', 'D', 'E']; 'A' & 'B' contain duplicates (but
are jointly unique), the rest are unique.
Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')];
'A' & 'B' contain duplicates (but are jointly unique), the rest are unique.
"""
df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'],
'B': ['one', 'two', 'three', 'one', 'two'],
'C': ['a', 'b', 'c', 'd', 'e'],
'D': np.random.randn(5),
'E': np.random.randn(5)})
'E': np.random.randn(5),
('tuple', 'as', 'label'): np.random.randn(5)})
return df
55 changes: 36 additions & 19 deletions pandas/tests/frame/test_alter_axes.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ def test_set_index_cast(self):
tm.assert_frame_equal(df, df2)

# A has duplicate values, C does not
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']])
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'],
('tuple', 'as', 'label')])
@pytest.mark.parametrize('inplace', [True, False])
@pytest.mark.parametrize('drop', [True, False])
def test_set_index_drop_inplace(self, frame_of_index_cols,
Expand All @@ -72,7 +73,8 @@ def test_set_index_drop_inplace(self, frame_of_index_cols,
tm.assert_frame_equal(result, expected)

# A has duplicate values, C does not
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']])
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'],
('tuple', 'as', 'label')])
@pytest.mark.parametrize('drop', [True, False])
def test_set_index_append(self, frame_of_index_cols, drop, keys):
df = frame_of_index_cols
Expand All @@ -88,7 +90,8 @@ def test_set_index_append(self, frame_of_index_cols, drop, keys):
tm.assert_frame_equal(result, expected)

# A has duplicate values, C does not
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']])
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'],
('tuple', 'as', 'label')])
@pytest.mark.parametrize('drop', [True, False])
def test_set_index_append_to_multiindex(self, frame_of_index_cols,
drop, keys):
Expand All @@ -114,8 +117,10 @@ def test_set_index_after_mutation(self):
tm.assert_frame_equal(result, expected)

# MultiIndex constructor does not work directly on Series -> lambda
# Add list-of-list constructor because list is ambiguous -> lambda
# also test index name if append=True (name is duplicate here for B)
@pytest.mark.parametrize('box', [Series, Index, np.array,
list, tuple, iter, lambda x: [list(x)],
lambda x: MultiIndex.from_arrays([x])])
@pytest.mark.parametrize('append, index_name', [(True, None),
(True, 'B'), (True, 'test'), (False, None)])
Expand All @@ -126,21 +131,29 @@ def test_set_index_pass_single_array(self, frame_of_index_cols,
df.index.name = index_name

key = box(df['B'])
# np.array and list "forget" the name of B
name = [None if box in [np.array, list] else 'B']
if box == list:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why exactly is a list interpreted differently here? this makes this test really really odd. I am worried something changed here, as this appears to work just fine in the master.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You will see above (line 123) that list wasn't tested - for precisely this reason. df.set_index(['A', 'B']) interprets a A & B as column keys, so (assuming this df was length 2) it would not use the list ['A', 'B'] as the index. To do that, one would have to pass [['A', 'B']]. This PR proposes to add tests for the current behaviour.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback
In case the above was not very clearly worded, this corresponds exactly to behaviour on master:

>>> df = pd.DataFrame(np.random.randn(2,3))
>>> df.set_index(['A', 'B'])
KeyError: 'A'
>>> df.set_index([['A', 'B']])
          0         1         2
A  1.962370 -1.150770  0.843600
B -0.417274  0.509781 -0.697802

# list of strings gets interpreted as list of keys
msg = "['one', 'two', 'three', 'one', 'two']"
with tm.assert_raises_regex(KeyError, msg):
df.set_index(key, drop=drop, append=append)
else:
# np.array/tuple/iter/list-of-list "forget" the name of B
name_mi = getattr(key, 'names', None)
name = [getattr(key, 'name', None)] if name_mi is None else name_mi

result = df.set_index(key, drop=drop, append=append)
result = df.set_index(key, drop=drop, append=append)

# only valid column keys are dropped
# since B is always passed as array above, nothing is dropped
expected = df.set_index(['B'], drop=False, append=append)
expected.index.names = [index_name] + name if append else name
# only valid column keys are dropped
# since B is always passed as array above, nothing is dropped
expected = df.set_index(['B'], drop=False, append=append)
expected.index.names = [index_name] + name if append else name

tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result, expected)

# MultiIndex constructor does not work directly on Series -> lambda
# also test index name if append=True (name is duplicate here for A & B)
@pytest.mark.parametrize('box', [Series, Index, np.array, list,
@pytest.mark.parametrize('box', [Series, Index, np.array,
list, tuple, iter,
lambda x: MultiIndex.from_arrays([x])])
@pytest.mark.parametrize('append, index_name',
[(True, None), (True, 'A'), (True, 'B'),
Expand All @@ -152,8 +165,8 @@ def test_set_index_pass_arrays(self, frame_of_index_cols,
df.index.name = index_name

keys = ['A', box(df['B'])]
# np.array and list "forget" the name of B
names = ['A', None if box in [np.array, list] else 'B']
# np.array/list/tuple/iter "forget" the name of B
names = ['A', None if box in [np.array, list, tuple, iter] else 'B']

result = df.set_index(keys, drop=drop, append=append)

Expand All @@ -168,10 +181,12 @@ def test_set_index_pass_arrays(self, frame_of_index_cols,
# MultiIndex constructor does not work directly on Series -> lambda
# We also emulate a "constructor" for the label -> lambda
# also test index name if append=True (name is duplicate here for A)
@pytest.mark.parametrize('box2', [Series, Index, np.array, list,
@pytest.mark.parametrize('box2', [Series, Index, np.array,
list, tuple, iter,
lambda x: MultiIndex.from_arrays([x]),
lambda x: x.name])
@pytest.mark.parametrize('box1', [Series, Index, np.array, list,
@pytest.mark.parametrize('box1', [Series, Index, np.array,
list, tuple, iter,
lambda x: MultiIndex.from_arrays([x]),
lambda x: x.name])
@pytest.mark.parametrize('append, index_name', [(True, None),
Expand All @@ -182,6 +197,10 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop,
df = frame_of_index_cols
df.index.name = index_name

keys = [box1(df['A']), box2(df['A'])]
result = df.set_index(keys, drop=drop, append=append)

# if either box was iter, the content has been consumed; re-read it
keys = [box1(df['A']), box2(df['A'])]

# to test against already-tested behaviour, we add sequentially,
Expand All @@ -192,8 +211,6 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop,
and keys[1] is 'A')
else drop), append=append)
expected = expected.set_index([keys[1]], drop=drop, append=True)

result = df.set_index(keys, drop=drop, append=append)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize('append', [True, False])
Expand Down Expand Up @@ -227,7 +244,7 @@ def test_set_index_raise(self, frame_of_index_cols, drop, append):
df = frame_of_index_cols

with tm.assert_raises_regex(KeyError, "['foo', 'bar', 'baz']"):
# column names are A-E
# column names are A-E, as well as one tuple
df.set_index(['foo', 'bar', 'baz'], drop=drop, append=append)

# non-existent key in list with arrays
Expand Down
0