Skip to content

BUG: Join behaved like an inner join when only one side had a MultiIndex #37211

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,7 @@ Reshaping
^^^^^^^^^
- Bug in :meth:`DataFrame.unstack` with missing levels led to incorrect index names (:issue:`37510`)
- Bug in :func:`concat` incorrectly casting to ``object`` dtype in some cases when one or more of the operands is empty (:issue:`38843`)
- Bug in :func:`join` where the result behaved like an inner join when one side had a regular :class:`Index` and the other side had a :class:`MultiIndex` (:issue:`34292`)
-


Expand Down
86 changes: 32 additions & 54 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3863,66 +3863,44 @@ def _join_multi(self, other, how, return_indexers=True):
if not overlap:
raise ValueError("cannot join with no overlapping index names")

if isinstance(self, MultiIndex) and isinstance(other, MultiIndex):

# Drop the non-matching levels from left and right respectively
ldrop_names = sorted(self_names - overlap, key=self_names_order)
rdrop_names = sorted(other_names - overlap, key=other_names_order)

# if only the order differs
if not len(ldrop_names + rdrop_names):
self_jnlevels = self
other_jnlevels = other.reorder_levels(self.names)
else:
self_jnlevels = self.droplevel(ldrop_names)
other_jnlevels = other.droplevel(rdrop_names)

# Join left and right
# Join on same leveled multi-index frames is supported
join_idx, lidx, ridx = self_jnlevels.join(
other_jnlevels, how, return_indexers=True
)

# Restore the dropped levels
# Returned index level order is
# common levels, ldrop_names, rdrop_names
dropped_names = ldrop_names + rdrop_names

levels, codes, names = restore_dropped_levels_multijoin(
self, other, dropped_names, join_idx, lidx, ridx
)

# Re-create the multi-index
multi_join_idx = MultiIndex(
levels=levels, codes=codes, names=names, verify_integrity=False
)

multi_join_idx = multi_join_idx.remove_unused_levels()
# Drop the non-matching levels from left and right respectively
ldrop_names = sorted(self_names - overlap, key=self_names_order)
rdrop_names = sorted(other_names - overlap, key=other_names_order)

# if only the order differs
if not len(ldrop_names + rdrop_names):
self_jnlevels = self
other_jnlevels = other.reorder_levels(self.names)
else:
self_jnlevels = self.droplevel(ldrop_names)
other_jnlevels = other.droplevel(rdrop_names)

if return_indexers:
return multi_join_idx, lidx, ridx
else:
return multi_join_idx
# Join left and right
# Join on same leveled multi-index frames is supported
join_idx, lidx, ridx = self_jnlevels.join(
other_jnlevels, how, return_indexers=True
)

jl = list(overlap)[0]
# Restore the dropped levels
# Returned index level order is
# common levels, ldrop_names, rdrop_names
dropped_names = ldrop_names + rdrop_names

# Case where only one index is multi
# make the indices into mi's that match
flip_order = False
if isinstance(self, MultiIndex):
self, other = other, self
flip_order = True
# flip if join method is right or left
how = {"right": "left", "left": "right"}.get(how, how)
levels, codes, names = restore_dropped_levels_multijoin(
self, other, dropped_names, join_idx, lidx, ridx
)

level = other.names.index(jl)
result = self._join_level(
other, level, how=how, return_indexers=return_indexers
# Re-create the multi-index
multi_join_idx = MultiIndex(
levels=levels, codes=codes, names=names, verify_integrity=False
)

if flip_order and isinstance(result, tuple):
return result[0], result[2], result[1]
return result
multi_join_idx = multi_join_idx.remove_unused_levels()

if return_indexers:
return multi_join_idx, lidx, ridx
else:
return multi_join_idx

@final
def _join_non_unique(self, other, how="left", return_indexers=False):
Expand Down
29 changes: 21 additions & 8 deletions pandas/tests/indexes/multi/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,9 @@ def test_join_multi():

# inner
jidx, lidx, ridx = midx.join(idx, how="inner", return_indexers=True)
exp_idx = pd.MultiIndex.from_product([np.arange(4), [1, 2]], names=["a", "b"])
exp_lidx = np.array([1, 2, 5, 6, 9, 10, 13, 14], dtype=np.intp)
exp_ridx = np.array([0, 1, 0, 1, 0, 1, 0, 1], dtype=np.intp)
exp_idx = pd.MultiIndex.from_product([[1, 2], np.arange(4)], names=["b", "a"])
exp_lidx = np.array([1, 5, 9, 13, 2, 6, 10, 14], dtype=np.intp)
exp_ridx = np.array([0, 0, 0, 0, 1, 1, 1, 1], dtype=np.intp)
tm.assert_index_equal(jidx, exp_idx)
tm.assert_numpy_array_equal(lidx, exp_lidx)
tm.assert_numpy_array_equal(ridx, exp_ridx)
Expand All @@ -71,15 +71,17 @@ def test_join_multi():
# keep MultiIndex
jidx, lidx, ridx = midx.join(idx, how="left", return_indexers=True)
exp_ridx = np.array(
[-1, 0, 1, -1, -1, 0, 1, -1, -1, 0, 1, -1, -1, 0, 1, -1], dtype=np.intp
[-1, -1, -1, -1, 0, 0, 0, 0, 1, 1, 1, 1, -1, -1, -1, -1], dtype=np.intp
)
tm.assert_index_equal(jidx, midx)
assert lidx is None
exp_idx = midx.copy()
exp_idx.names = ["b", "a"]
tm.assert_index_equal(jidx, exp_idx)
# assert lidx is None
tm.assert_numpy_array_equal(ridx, exp_ridx)
# flip
jidx, ridx, lidx = idx.join(midx, how="right", return_indexers=True)
tm.assert_index_equal(jidx, midx)
assert lidx is None
tm.assert_index_equal(jidx, exp_idx)
# assert lidx is None
tm.assert_numpy_array_equal(ridx, exp_ridx)


Expand Down Expand Up @@ -113,3 +115,14 @@ def test_join_multi_return_indexers():

result = midx1.join(midx2, return_indexers=False)
tm.assert_index_equal(result, midx1)


def test_join_multi_and_index():
    """Check a left join of a flat Index onto a MultiIndex sharing level "a".

    The expected result keeps left key 2, which has no match on the right,
    with NaN in level "b", and drops key 3, which exists only on the right.
    """
    # GH#34292
    left = Index([1, 2], name="a")
    right = pd.MultiIndex.from_tuples(
        [(1, 4), (3, 0), (1, 5)],
        names=["a", "b"],
    )

    joined = left.join(right, how="left")

    wanted = pd.MultiIndex.from_tuples(
        [(1, 4), (1, 5), (2, np.nan)],
        names=["a", "b"],
    )
    tm.assert_index_equal(joined, wanted)