-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
API: better error-handling for df.set_i 8000 ndex #22486
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
28ec3a9
8dd1453
18d3464
fa68660
6d3dfa1
159ecf5
46f8dc7
3a6469d
c3cdde2
dd8d000
47c4e74
06ed33a
9871be2
9da6e6c
361a905
42d5f2a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -60,7 +60,8 @@ | |
is_sequence, | ||
is_named_tuple) | ||
from pandas.core.dtypes.concat import _get_sliced_frame_result_type | ||
from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass, ABCMultiIndex | ||
from pandas.core.dtypes.generic import (ABCSeries, ABCIndexClass, | ||
ABCMultiIndex, ABCDataFrame) | ||
from pandas.core.dtypes.missing import isna, notna | ||
|
||
|
||
|
@@ -3915,16 +3916,18 @@ def set_index(self, keys, drop=True, append=False, inplace=False, | |
|
||
missing = [] | ||
for x in keys: | ||
if not (is_scalar(x) or isinstance(x, tuple)): | ||
if not isinstance(x, (ABCSeries, ABCIndexClass, ABCMultiIndex, | ||
list, np.ndarray)): | ||
raise TypeError('The parameter "keys" may only contain a ' | ||
'combination of the following: valid ' | ||
'column keys, Series, Index, MultiIndex, ' | ||
'list or np.ndarray') | ||
else: | ||
if x not in self: | ||
missing.append(x) | ||
if (is_scalar(x) or isinstance(x, tuple)) and x in self: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add a comment here |
||
continue | ||
elif is_scalar(x) and x not in self: | ||
# a tuple gets tried as column key first; | ||
# otherwise considered as a list-like; i.e. not missing | ||
missing.append(x) | ||
elif (not is_list_like(x) or isinstance(x, set) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. shouldn't |
||
or isinstance(x, ABCDataFrame) | ||
or (isinstance(x, np.ndarray) and x.ndim > 1)): | ||
raise TypeError('The parameter "keys" may only contain a ' | ||
'combination of valid column keys and ' | ||
'one-dimensional list-likes') | ||
|
||
if missing: | ||
raise KeyError('{}'.format(missing)) | ||
|
@@ -3950,16 +3953,19 @@ def set_index(self, keys, drop=True, append=False, inplace=False, | |
for n in range(col.nlevels): | ||
arrays.append(col._get_level_values(n)) | ||
names.extend(col.names) | ||
elif isinstance(col, ABCIndexClass): | ||
# Index but not MultiIndex (treated above) | ||
elif isinstance(col, (ABCIndexClass, ABCSeries)): | ||
# if Index then not MultiIndex (treated above) | ||
arrays.append(col) | ||
names.append(col.name) | ||
elif isinstance(col, ABCSeries): | ||
arrays.append(col._values) | ||
names.append(col.name) | ||
elif isinstance(col, (list, np.ndarray)): | ||
arrays.append(col) | ||
names.append(None) | ||
elif (is_list_like(col) | ||
and not (isinstance(col, tuple) and col in self)): | ||
# all other list-likes (but avoid valid column keys) | ||
col = list(col) # ensure iterator do not get read twice etc. | ||
arrays.append(col) | ||
names.append(None) | ||
# from here, col can only be a column label | ||
else: | ||
arrays.append(frame[col]._values) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -49,7 +49,8 @@ def test_set_index_cast(self): | |
tm.assert_frame_equal(df, df2) | ||
|
||
# A has duplicate values, C does not | ||
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']]) | ||
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'], | ||
('tuple', 'as', 'label')]) | ||
@pytest.mark.parametrize('inplace', [True, False]) | ||
@pytest.mark.parametrize('drop', [True, False]) | ||
def test_set_index_drop_inplace(self, frame_of_index_cols, | ||
|
@@ -72,7 +73,8 @@ def test_set_index_drop_inplace(self, frame_of_index_cols, | |
tm.assert_frame_equal(result, expected) | ||
|
||
# A has duplicate values, C does not | ||
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']]) | ||
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'], | ||
('tuple', 'as', 'label')]) | ||
@pytest.mark.parametrize('drop', [True, False]) | ||
def test_set_index_append(self, frame_of_index_cols, drop, keys): | ||
df = frame_of_index_cols | ||
|
@@ -88,7 +90,8 @@ def test_set_index_append(self, frame_of_index_cols, drop, keys): | |
tm.assert_frame_equal(result, expected) | ||
|
||
# A has duplicate values, C does not | ||
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']]) | ||
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'], | ||
('tuple', 'as', 'label')]) | ||
@pytest.mark.parametrize('drop', [True, False]) | ||
def test_set_index_append_to_multiindex(self, frame_of_index_cols, | ||
drop, keys): | ||
|
@@ -114,8 +117,10 @@ def test_set_index_after_mutation(self): | |
tm.assert_frame_equal(result, expected) | ||
|
||
# MultiIndex constructor does not work directly on Series -> lambda | ||
# Add list-of-list constructor because list is ambiguous -> lambda | ||
# also test index name if append=True (name is duplicate here for B) | ||
@pytest.mark.parametrize('box', [Series, Index, np.array, | ||
list, tuple, iter, lambda x: [list(x)], | ||
lambda x: MultiIndex.from_arrays([x])]) | ||
@pytest.mark.parametrize('append, index_name', [(True, None), | ||
(True, 'B'), (True, 'test'), (False, None)]) | ||
|
@@ -126,21 +131,29 @@ def test_set_index_pass_single_array(self, frame_of_index_cols, | |
df.index.name = index_name | ||
|
||
key = box(df['B']) | ||
# np.array and list "forget" the name of B | ||
name = [None if box in [np.array, list] else 'B'] | ||
if box == list: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why exactly is a list interpreted differently here? this makes this test really really odd. I am worried something changed here, as this appears to work just fine in the master. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You will see above (line 123) that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jreback
|
||
# list of strings gets interpreted as list of keys | ||
msg = "['one', 'two', 'three', 'one', 'two']" | ||
with tm.assert_raises_regex(KeyError, msg): | ||
df.set_index(key, drop=drop, append=append) | ||
else: | ||
# np.array/tuple/iter/list-of-list "forget" the name of B | ||
name_mi = getattr(key, 'names', None) | ||
name = [getattr(key, 'name', None)] if name_mi is None else name_mi | ||
|
||
result = df.set_index(key, drop=drop, append=append) | ||
result = df.set_index(key, drop=drop, append=append) | ||
|
||
# only valid column keys are dropped | ||
# since B is always passed as array above, nothing is dropped | ||
expected = df.set_index(['B'], drop=False, append=append) | ||
expected.index.names = [index_name] + name if append else name | ||
# only valid column keys are dropped | ||
# since B is always passed as array above, nothing is dropped | ||
expected = df.set_index(['B'], drop=False, append=append) | ||
expected.index.names = [index_name] + name if append else name | ||
|
||
tm.assert_frame_equal(result, expected) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
# MultiIndex constructor does not work directly on Series -> lambda | ||
# also test index name if append=True (name is duplicate here for A & B) | ||
@pytest.mark.parametrize('box', [Series, Index, np.array, list, | ||
@pytest.mark.parametrize('box', [Series, Index, np.array, | ||
list, tuple, iter, | ||
lambda x: MultiIndex.from_arrays([x])]) | ||
@pytest.mark.parametrize('append, index_name', | ||
[(True, None), (True, 'A'), (True, 'B'), | ||
|
@@ -152,8 +165,8 @@ def test_set_index_pass_arrays(self, frame_of_index_cols, | |
df.index.name = index_name | ||
|
||
keys = ['A', box(df['B'])] | ||
# np.array and list "forget" the name of B | ||
names = ['A', None if box in [np.array, list] else 'B'] | ||
# np.array/list/tuple/iter "forget" the name of B | ||
names = ['A', None if box in [np.array, list, tuple, iter] else 'B'] | ||
|
||
result = df.set_index(keys, drop=drop, append=append) | ||
|
||
|
@@ -168,10 +181,12 @@ def test_set_index_pass_arrays(self, frame_of_index_cols, | |
# MultiIndex constructor does not work directly on Series -> lambda | ||
# We also emulate a "constructor" for the label -> lambda | ||
# also test index name if append=True (name is duplicate here for A) | ||
@pytest.mark.parametrize('box2', [Series, Index, np.array, list, | ||
@pytest.mark.parametrize('box2', [Series, Index, np.array, | ||
list, tuple, iter, | ||
lambda x: MultiIndex.from_arrays([x]), | ||
lambda x: x.name]) | ||
@pytest.mark.parametrize('box1', [Series, Index, np.array, list, | ||
@pytest.mark.parametrize('box1', [Series, Index, np.array, | ||
list, tuple, iter, | ||
lambda x: MultiIndex.from_arrays([x]), | ||
lambda x: x.name]) | ||
@pytest.mark.parametrize('append, index_name', [(True, None), | ||
|
@@ -182,6 +197,10 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, | |
df = frame_of_index_cols | ||
df.index.name = index_name | ||
|
||
keys = [box1(df['A']), box2(df['A'])] | ||
result = df.set_index(keys, drop=drop, append=append) | ||
|
||
# if either box was iter, the content has been consumed; re-read it | ||
keys = [box1(df['A']), box2(df['A'])] | ||
|
||
# to test against already-tested behaviour, we add sequentially, | ||
|
@@ -192,8 +211,6 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, | |
and keys[1] is 'A') | ||
else drop), append=append) | ||
expected = expected.set_index([keys[1]], drop=drop, append=True) | ||
|
||
result = df.set_index(keys, drop=drop, append=append) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
@pytest.mark.parametrize('append', [True, False]) | ||
|
@@ -227,7 +244,7 @@ def test_set_index_raise(self, frame_of_index_cols, drop, append): | |
df = frame_of_index_cols | ||
|
||
with tm.assert_raises_regex(KeyError, "['foo', 'bar', 'baz']"): | ||
# column names are A-E | ||
# column names are A-E, as well as one tuple | ||
df.set_index(['foo', 'bar', 'baz'], drop=drop, append=append) | ||
|
||
# non-existent key in list with arrays | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you rename x -> col for consistency with the loop below