8000 ENH: Allow for join between two multi-index dataframe instances by harisbal · Pull Request #20356 · pandas-dev/pandas · GitHub
[go: up one dir, main page]

Skip to content

ENH: Allow for join between two multi-index dataframe instances #20356

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 25 commits into from
Nov 15, 2018
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
b581789
Allow for join between two multi-index dataframe instances
Sep 19, 2018
2d61a12
Merge remote-tracking branch 'upstream/master' into multi-index-join
harisbal Sep 19, 2018
4d4acc5
Merge remote-tracking branch 'upstream/master' into multi-index-join
harisbal Oct 7, 2018
66d82fb
Review
harisbal Oct 8, 2018
c091bb4
Merge remote-tracking branch 'upstream/master' into multi-index-join
Oct 8, 2018
d56ebcd
Second review
harisbal Oct 9, 2018
0cdad73
Merge remote-tracking branch 'upstream/master' into multi-index-join
Oct 9, 2018
c2a65aa
Merge remote-tracking branch 'upstream/master' into multi-index-join
harisbal Oct 10, 2018
571fdf7
Merge remote-tracking branch 'upstream/master' into multi-index-join
Nov 1, 2018
ae2d8ad
Review
harisbal Nov 1, 2018
405c1a4
Merge remote-tracking branch 'upstream/master' into multi-index-join
harisbal Nov 1, 2018
1d2d9f3
Fix ci
harisbal Nov 3, 2018
f0ac24d
Merge branch 'master' into multi-index-join
Nov 3, 2018
5ac40ff
Merge remote-tracking branch 'upstream/master' into multi-index-join
harisbal Nov 3, 2018
8000 be862c7
Update v0.24.0.txt
harisbal Nov 4, 2018
e10cbde
Update docstring _restore_dropped_levels_multijoin
harisbal Nov 4, 2018
06d48d0
Update docstring _restore_dropped_levels_multijoin
harisbal Nov 4, 2018
f54c151
Merge remote-tracking branch 'upstream/master' into multi-index-join
Nov 4, 2018
c75108d
Merge remote-tracking branch 'origin/multi-index-join' into multi-ind…
harisbal Nov 5, 2018
c690260
Merge remote-tracking branch 'upstream/master' into multi-index-join
harisbal Nov 6, 2018
4092b34
updated comments
harisbal Nov 6, 2018
cfd5fcc
Refactoring
harisbal Nov 6, 2018
6c8131d
Review
harisbal Nov 10, 2018
ecaf515
Merge remote-tracking branch 'upstream/master' into multi-index-join
harisbal Nov 10, 2018
8b5d0aa
Merge remote-tracking branch 'upstream/master' into harisbal-multi-in…
TomAugspurger Nov 14, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Review
  • Loading branch information
harisbal authored and harisbal committed Oct 8, 2018
commit 66d82fb1adabf9bb5b34841ca6d78f10d0dfa009
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ This is the same behavior as ``Series.values`` for categorical data. See
Joining with two multi-indexes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

As of Pandas 0.24.0 the :func:`Dataframe.join` can be used to join multi-indexed ``Dataframe`` instances on the overlaping index levels (:issue:`20356`)
As of Pandas 0.24.0 the :func:`Dataframe.join` can be used to join multi-indexed ``Dataframe`` instances on the overlaping index levels (:issue:`6360`)

See the :ref:`Merge, join, and concatenate
<merging.Join_with_two_multi_indexes>` documentation section.
Expand Down
35 changes: 19 additions & 16 deletions 8000 pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3912,7 +3912,7 @@ def join(self, other, how='left', level=None, return_indexers=False,

def _join_multi(self, other, how, return_indexers=True):
from .multi import MultiIndex
from pandas.core.reshape.merge import _complete_multilevel_join
from pandas.core.reshape.merge import _restore_dropped_levels_multijoin

# figure out join names
self_names = set(com._not_none(*self.names))
Expand All @@ -3928,27 +3928,30 @@ def _join_multi(self, other, how, return_indexers=True):

if self_is_mi and other_is_mi:

# Drop the non matching levels
ldrop_levels = list(set(self_names) - set(overlap))
rdrop_levels = list(set(other_names) - set(overlap))
# Drop the non-matching levels from left and right respectively
ldrop_names = list(set(self_names) - set(overlap))
rdrop_names = list(set(other_names) - set(overlap))

self_jnlevels = self.droplevel(ldrop_levels)
other_jnlevels = other.droplevel(rdrop_levels)

if not (self_jnlevels.is_unique and other_jnlevels.is_unique):
raise ValueError("Join on level between two MultiIndex objects"
"is ambiguous")

dropped_levels = ldrop_levels + rdrop_levels
self_jnlevels = self.droplevel(ldrop_names)
other_jnlevels = other.droplevel(rdrop_names)

# Join left and right
# Join on same leveled multi-index frames is supported
join_idx, lidx, ridx = self_jnlevels.join(other_jnlevels, how,
return_indexers=True)

levels, labels, names = _complete_multilevel_join(self, other, how,
dropped_levels,
join_idx,
lidx, ridx)
# Restore the dropped levels
# Returned index level order is
# common levels, ldrop_names, rdrop_names
dropped_names = ldrop_names + rdrop_names

levels, labels, names = (
_restore_dropped_levels_multijoin(self, other,
dropped_names,
join_idx,
lidx, ridx))

# Re-create the multi-index
multi_join_idx = MultiIndex(levels=levels, labels=labels,
names=names, verify_integrity=False)

Expand Down
42 changes: 24 additions & 18 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1141,12 +1141,12 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner',
return join_func(lkey, rkey, count, **kwargs)


def _complete_multilevel_join(left, right, how, dropped_levels,
join_idx, lidx, ridx):
def _restore_dropped_levels_multijoin(left, right, dropped_level_names,
join_idx, lidx, ridx):
"""
*this is an internal non-public method*

Returns the levels, labels and names of a multilevel to multilevel join
Returns the levels, labels and names of a multil-index to multi-index join.
Depending on the type of join, this method restores the appropriate
dropped levels of the joined multi-index. The method relies on lidx, ridx
which hold the index positions of left and right, where a join was feasible
Expand All @@ -1157,19 +1157,18 @@ def _complete_multilevel_join(left, right, how, dropped_levels,
left index
right : Index
right index
dropped_level_names : str array
list of non-common levels
join_idx : Index
the index of the join between the common levels of left and right
how : {'left', 'right', 'outer', 'inner'}
lidx : intp array
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't do these types formally, but I think something like ndarray[intp] will eventually be the correct type here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you update

left indexer
right : intp array
right indexer
dropped_levels : str array
list of non-common levels

Returns
-------
levels : intp array
levels : intp ndarray
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are these the correct return types? At a glance, it looks like they'd be FrozenLists

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this correct?

levels of combined multiindexes
labels : str array
labels of combined multiindexes
Expand All @@ -1178,12 +1177,20 @@ def _complete_multilevel_join(left, right, how, dropped_levels,

"""

# Convert to 1 level multi-index if not
if not isinstance(join_idx, MultiIndex):
levels = [join_idx.values]
labels = [list(range(0, len(join_idx)))]
names = [join_idx.name]
join_idx = MultiIndex(levels=levels, labels=labels,
names=names, verify_integrity=False)

join_levels = join_idx.levels
join_labels = join_idx.labels
join_names = join_idx.names

# lidx and ridx hold the indexes where the join occured
# for left and right respectively. If left (right) is None it means that
# for left and right respectively. If left (right) is None then
# the join occured on all indices of left (right)
if lidx is None:
lidx = range(0, len(left))
Expand All @@ -1192,27 +1199,26 @@ def _complete_multilevel_join(left, right, how, dropped_levels,
ridx = range(0, len(right))

# Iterate through the levels that must be restored
for dl in dropped_levels:
if dl in left.names:
for dropped_level_name in dropped_level_names:
if dropped_level_name in left.names:
idx = left
indexer = lidx
else:
idx = right
indexer = ridx

# The index of the level name to be restored
name_idx = idx.names.index(dl)
name_idx = idx.names.index(dropped_level_name)

restore_levels = idx.levels[name_idx].values
restore_labels = idx.labels[name_idx]

join_levels = join_levels.__add__([restore_levels])
join_names = join_names.__add__([dl])

# Inject -1 in the labels list where a join was not possible
# IOW indexer[i]=-1
labels = [restore_labels[i] if i != -1 else -1 for i in indexer]
join_labels = join_labels.__add__([labels])
labels = idx.labels[name_idx]
restore_labels = [labels[i] if i != -1 else -1 for i in indexer]

join_levels = join_levels.__add__([restore_levels])
join_labels = join_labels.__add__([restore_labels])
join_names = join_names.__add__([dropped_level_name])

return join_levels, join_labels, join_names

Expand Down
129 changes: 77 additions & 52 deletions pandas/tests/reshape/merge/test_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,47 +13,50 @@
from pandas.compat import lzip
from pandas.core.reshape.concat import concat
from pandas.core.reshape.merge import merge
from pandas.util.testing import assert_frame_equal


@pytest.fixture
def left():
# a little relevant example with NAs
key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
'qux', 'snap']
key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
'three', 'one']

data = np.random.randn(len(key1))
return DataFrame({'key1': key1, 'key2': key2, 'data': data})


@pytest.fixture
def right():
index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
['one', 'two', 'three']],
labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
[0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
names=['first', 'second'])

return DataFrame(np.random.randn(10, 3), index=index,
columns=['j_one', 'j_two', 'j_three'])


class TestMergeMulti(object):

def setup_method(self):
self.index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
['one', 'two', 'three']],
labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
[0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
names=['first', 'second'])
self.to_join = DataFrame(np.random.randn(10, 3), index=self.index,
columns=['j_one', 'j_two', 'j_three'])

# a little relevant example with NAs
key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
'qux', 'snap']
key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
'three', 'one']

data = np.random.randn(len(key1))
self.data = DataFrame({'key1': key1, 'key2': key2,
'data': data})

def test_merge_on_multikey(self):
joined = self.data.join(self.to_join, on=['key1', 'key2'])

join_key = Index(lzip(self.data['key1'], self.data['key2']))
indexer = self.to_join.index.get_indexer(join_key)
ex_values = self.to_join.values.take(indexer, axis=0)
def test_merge_on_multikey(self, left, right):
joined = left.join(right, on=['key1', 'key2'])

join_key = Index(lzip(left['key1'], left['key2']))
indexer = right.index.get_indexer(join_key)
ex_values = right.values.take(indexer, axis=0)
ex_values[indexer == -1] = np.nan
expected = self.data.join(DataFrame(ex_values,
columns=self.to_join.columns))
expected = left.join(DataFrame(ex_values, columns=right.columns))

# TODO: columns aren't in the same order yet
assert_frame_equal(joined, expected.loc[:, joined.columns])
tm.assert_frame_equal(joined, expected.loc[:, joined.columns])

left = self.data.join(self.to_join, on=['key1', 'key2'], sort=True)
left = left.join(right, on=['key1', 'key2'], sort=True)
right = expected.loc[:, joined.columns].sort_values(['key1', 'key2'],
kind='mergesort')
assert_frame_equal(left, right)
tm.assert_frame_equal(left, right)

def test_left_join_multi_index(self):
icols = ['1st', '2nd', '3rd']
Expand Down Expand Up @@ -119,18 +122,18 @@ def run_asserts(left, right):

run_asserts(left, right)

def test_merge_right_vs_left(self):
def test_merge_right_vs_left(self, left, right):
# compare left vs right merge with multikey
for sort in [False, True]:
merged1 = self.data.merge(self.to_join, left_on=['key1', 'key2'],
right_index=True, how='left', sort=sort)
merged1 = left.merge(right, left_on=['key1', 'key2'],
right_index=True, how='left', sort=sort)

merged2 = self.to_join.merge(self.data, right_on=['key1', 'key2'],
left_index=True, how='right',
sort=sort)
merged2 = right.merge(left, right_on=['key1', 'key2'],
left_index=True, how='right',
sort=sort)

merged2 = merged2.loc[:, merged1.columns]
assert_frame_equal(merged1, merged2)
tm.assert_frame_equal(merged1, merged2)

def test_compress_group_combinations(self):

Expand Down Expand Up @@ -393,15 +396,13 @@ def test_join_multi_levels(self):
'nl0000289965']))
.set_index(['household_id', 'asset_id'])
.reindex(columns=['male', 'wealth', 'name', 'share']))
assert_frame_equal(result, expected)

assert_frame_equal(result, expected)
tm.assert_frame_equal(result, expected)

# equivalency
result2 = (merge(household.reset_index(), portfolio.reset_index(),
on=['household_id'], how='inner')
.set_index(['household_id', 'asset_id']))
assert_frame_equal(result2, expected)
result = (merge(household.reset_index(), portfolio.reset_index(),
on=['household_id'], how='inner')
.set_index(['household_id', 'a EED3 sset_id']))
tm.assert_frame_equal(result, expected)

result = household.join(portfolio, how='outer')
expected = (concat([
Expand All @@ -412,7 +413,7 @@ def test_join_multi_levels(self):
[(4, np.nan)],
names=['household_id', 'asset_id'])))
], axis=0, sort=True).reindex(columns=expected.columns))
assert_frame_equal(result, expected)
tm.assert_frame_equal(result, expected)

# invalid cases
household.index.name = 'foo'
Expand Down Expand Up @@ -471,7 +472,7 @@ def test_join_multi_levels2(self):
result = (merge(household.reset_index(), log_return.reset_index(),
on=['asset_id'], how='inner')
.set_index(['household_id', 'asset_id', 't']))
assert_frame_equal(result, expected)
tm.assert_frame_equal(result, expected)

expected = (
DataFrame(dict(
Expand All @@ -496,7 +497,7 @@ def test_join_multi_levels2(self):
on=['asset_id'], how='outer')
.set_index(['household_id', 'asset_id', 't']))

assert_frame_equal(result, expected)
tm.assert_frame_equal(result, expected)


@pytest.fixture
Expand Down Expand Up @@ -564,17 +565,17 @@ def test_join_multi_empty_frames(self, left_multi, right_multi, join_type,
result = left_multi.join(right_multi, how=join_type).sort_index()
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("klass", [None, np.asarray, Series, Index])
def test_merge_datetime_index(self, klass):
@pytest.mark.parametrize("box", [None, np.asarray, Series, Index])
def test_merge_datetime_index(self, box):
# see gh-19038
df = DataFrame([1, 2, 3],
["2016-01-01", "2017-01-01", "2018-01-01"],
columns=["a"])
df.index = pd.to_datetime(df.index)
on_vector = df.index.year

if klass is not None:
on_vector = klass(on_vector)
if box is not None:
on_vector = box(on_vector)

expected = DataFrame(
OrderedDict([
Expand All @@ -596,3 +597,27 @@ def test_merge_datetime_index(self, klass):

result = df.merge(df, on=[df.index.year], how="inner")
tm.assert_frame_equal(result, expected)

def test_single_common_level(self):
index_left = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'),
('K1', 'X2')],
names=['key', 'X'])

left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=index_left)

index_right = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'),
('K2', 'Y2'), ('K2', 'Y3')],
names=['key', 'Y'])

right = pd.DataFrame({'C': ['C0' 1CF5 ;, 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']},
index=index_right)

result = left.join(right)
expected = (pd.merge(left.reset_index(), right.reset_index(),
on=['key'], how='inner')
.set_index(['key', 'X', 'Y']))

tm.assert_frame_equal(result, expected)
0