Skip to content

Commit e99e5ab

Browse files
authored
BUG: Fix duplicates in intersection of multiindexes (#36927)
1 parent 7b400b3 commit e99e5ab

File tree

9 files changed

+59
-10
lines changed

9 files changed

+59
-10
lines changed

doc/source/whatsnew/v1.1.5.rst

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@ Fixed regressions
2323
- Fixed regression in :meth:`DataFrame.groupby` aggregation with out-of-bounds datetime objects in an object-dtype column (:issue:`36003`)
2424
- Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`)
2525
- Fixed regression in :meth:`DataFrame.fillna` not filling ``NaN`` after other operations such as :meth:`DataFrame.pivot` (:issue:`36495`).
26+
- Fixed regression in :meth:`MultiIndex.intersection` returning duplicates when at least one of the indexes had duplicates (:issue:`36915`)
2627

2728
.. ---------------------------------------------------------------------------
2829

pandas/core/indexes/base.py

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2822,7 +2822,7 @@ def intersection(self, other, sort=False):
28222822
self._assert_can_do_setop(other)
28232823
other = ensure_index(other)
28242824

2825-
if self.equals(other):
2825+
if self.equals(other) and not self.has_duplicates:
28262826
return self._get_reconciled_name_object(other)
28272827

28282828
if not is_dtype_equal(self.dtype, other.dtype):
@@ -2847,7 +2847,7 @@ def _intersection(self, other, sort=False):
28472847
except TypeError:
28482848
pass
28492849
else:
2850-
return result
2850+
return algos.unique1d(result)
28512851

28522852
try:
28532853
indexer = Index(rvals).get_indexer(lvals)
@@ -2858,11 +2858,14 @@ def _intersection(self, other, sort=False):
28582858
indexer = algos.unique1d(Index(rvals).get_indexer_non_unique(lvals)[0])
28592859
indexer = indexer[indexer != -1]
28602860

2861-
result = other.take(indexer)._values
2861+
result = other.take(indexer).unique()._values
28622862

28632863
if sort is None:
28642864
result = algos.safe_sort(result)
28652865

2866+
# Intersection has to be unique
2867+
assert algos.unique(result).shape == result.shape
2868+
28662869
return result
28672870

28682871
def difference(self, other, sort=None):

pandas/core/indexes/multi.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3601,6 +3601,8 @@ def intersection(self, other, sort=False):
36013601
other, result_names = self._convert_can_do_setop(other)
36023602

36033603
if self.equals(other):
3604+
if self.has_duplicates:
3605+
return self.unique().rename(result_names)
36043606
return self.rename(result_names)
36053607

36063608
if not is_object_dtype(other.dtype):
@@ -3619,10 +3621,12 @@ def intersection(self, other, sort=False):
36193621
uniq_tuples = None # flag whether _inner_indexer was successful
36203622
if self.is_monotonic and other.is_monotonic:
36213623
try:
3622-
uniq_tuples = self._inner_indexer(lvals, rvals)[0]
3623-
sort = False # uniq_tuples is already sorted
3624+
inner_tuples = self._inner_indexer(lvals, rvals)[0]
3625+
sort = False # inner_tuples is already sorted
36243626
except TypeError:
36253627
pass
3628+
else:
3629+
uniq_tuples = algos.unique(inner_tuples)
36263630

36273631
if uniq_tuples is None:
36283632
other_uniq = set(rvals)

pandas/core/ops/__init__.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -311,7 +311,10 @@ def should_reindex_frame_op(
311311
# TODO: any other cases we should handle here?
312312
cols = left.columns.intersection(right.columns)
313313

314-
if len(cols) and not (cols.equals(left.columns) and cols.equals(right.columns)):
314+
# Intersection is always unique so we have to check the unique columns
315+
left_uniques = left.columns.unique()
316+
right_uniques = right.columns.unique()
317+
if len(cols) and not (cols.equals(left_uniques) and cols.equals(right_uniques)):
315318
# TODO: is there a shortcut available when len(cols) == 0?
316319
return True
317320

pandas/core/reshape/merge.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1271,7 +1271,9 @@ def _validate_specification(self):
12711271
raise MergeError("Must pass left_on or left_index=True")
12721272
else:
12731273
# use the common columns
1274-
common_cols = self.left.columns.intersection(self.right.columns)
1274+
left_cols = self.left.columns
1275+
right_cols = self.right.columns
1276+
common_cols = left_cols.intersection(right_cols)
12751277
if len(common_cols) == 0:
12761278
raise MergeError(
12771279
"No common columns to perform merge on. "
@@ -1280,7 +1282,10 @@ def _validate_specification(self):
12801282
f"left_index={self.left_index}, "
12811283
f"right_index={self.right_index}"
12821284
)
1283-
if not common_cols.is_unique:
1285+
if (
1286+
not left_cols.join(common_cols, how="inner").is_unique
1287+
or not right_cols.join(common_cols, how="inner").is_unique
1288+
):
12841289
raise MergeError(f"Data columns not unique: {repr(common_cols)}")
12851290
self.left_on = self.right_on = common_cols
12861291
elif self.on is not None:

pandas/tests/indexes/base_class/test_setops.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -141,7 +141,7 @@ def test_intersection_str_dates(self, sort):
141141

142142
@pytest.mark.parametrize(
143143
"index2,expected_arr",
144-
[(Index(["B", "D"]), ["B"]), (Index(["B", "D", "A"]), ["A", "B", "A"])],
144+
[(Index(["B", "D"]), ["B"]), (Index(["B", "D", "A"]), ["A", "B"])],
145145
)
146146
def test_intersection_non_monotonic_non_unique(self, index2, expected_arr, sort):
147147
# non-monotonic non-unique

pandas/tests/indexes/multi/test_setops.py

Lines changed: 23 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -378,3 +378,26 @@ def test_setops_disallow_true(method):
378378

379379
with pytest.raises(ValueError, match="The 'sort' keyword only takes"):
380380
getattr(idx1, method)(idx2, sort=True)
381+
382+
383+
@pytest.mark.parametrize(
384+
("tuples", "exp_tuples"),
385+
[
386+
([("val1", "test1")], [("val1", "test1")]),
387+
([("val1", "test1"), ("val1", "test1")], [("val1", "test1")]),
388+
(
389+
[("val2", "test2"), ("val1", "test1")],
390+
[("val2", "test2"), ("val1", "test1")],
391+
),
392+
],
393+
)
394+
def test_intersect_with_duplicates(tuples, exp_tuples):
395+
# GH#36915
396+
left = MultiIndex.from_tuples(tuples, names=["first", "second"])
397+
right = MultiIndex.from_tuples(
398+
[("val1", "test1"), ("val1", "test1"), ("val2", "test2")],
399+
names=["first", "second"],
400+
)
401+
result = left.intersection(right)
402+
expected = MultiIndex.from_tuples(exp_tuples, names=["first", "second"])
403+
tm.assert_index_equal(result, expected)

pandas/tests/indexes/test_setops.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -120,6 +120,16 @@ def test_dunder_inplace_setops_deprecated(index):
120120
index ^= index
121121

122122

123+
@pytest.mark.parametrize("values", [[1, 2, 2, 3], [3, 3]])
124+
def test_intersection_duplicates(values):
125+
# GH#31326
126+
a = pd.Index(values)
127+
b = pd.Index([3, 3])
128+
result = a.intersection(b)
129+
expected = pd.Index([3])
130+
tm.assert_index_equal(result, expected)
131+
132+
123133
class TestSetOps:
124134
# Set operation tests shared by all indexes in the `index` fixture
125135
@pytest.mark.parametrize("case", [0.5, "xxx"])

pandas/tests/reshape/merge/test_merge.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -753,7 +753,7 @@ def test_overlapping_columns_error_message(self):
753753

754754
# #2649, #10639
755755
df2.columns = ["key1", "foo", "foo"]
756-
msg = r"Data columns not unique: Index\(\['foo', 'foo'\], dtype='object'\)"
756+
msg = r"Data columns not unique: Index\(\['foo'\], dtype='object'\)"
757757
with pytest.raises(MergeError, match=msg):
758758
merge(df, df2)
759759

0 commit comments

Comments
 (0)