Skip to content

Commit f1cff7f

Browse files
committed
ENH: add return_inverse to df.duplicated
1 parent a620e72 commit f1cff7f

File tree

2 files changed

+134
-4
lines changed

2 files changed

+134
-4
lines changed

pandas/core/frame.py

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4345,7 +4345,7 @@ def drop_duplicates(self, subset=None, keep='first', inplace=False):
43454345
else:
43464346
return self[-duplicated]
43474347

4348-
def duplicated(self, subset=None, keep='first'):
4348+
def duplicated(self, subset=None, keep='first', return_inverse=False):
43494349
"""
43504350
Return boolean Series denoting duplicate rows, optionally only
43514351
considering certain columns
@@ -4360,15 +4360,24 @@ def duplicated(self, subset=None, keep='first'):
43604360
first occurrence.
43614361
- ``last`` : Mark duplicates as ``True`` except for the
43624362
last occurrence.
4363-
- False : Mark all duplicates as ``True``.
4363+
- False : Mark all duplicates as ``True``. This option is not
4364+
compatible with ``return_inverse``.
4365+
return_inverse boolean, default False
4366+
Determines whether the mapping from unique elements to the original
4367+
index should be returned. If true, the output is a tuple.
43644368
43654369
Returns
43664370
-------
4367-
duplicated : Series
4371+
duplicated : Series or tuple of Series if return_inverse is True
43684372
"""
43694373
from pandas.core.sorting import get_group_index
43704374
from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT
43714375

4376+
if return_inverse and not keep:
4377+
raise ValueError("The parameters return_inverse=True and "
4378+
"keep=False cannot be used together (impossible "
4379+
"to calculate an inverse when discarding values)")
4380+
43724381
def f(vals):
43734382
labels, shape = algorithms.factorize(
43744383
vals, size_hint=min(len(self), _SIZE_HINT_LIMIT))
@@ -4393,7 +4402,22 @@ def f(vals):
43934402
labels, shape = map(list, zip(*map(f, vals)))
43944403

43954404
ids = get_group_index(labels, shape, sort=False, xnull=False)
4396-
return Series(duplicated_int64(ids, keep), index=self.index)
4405+
isdup = Series(duplicated_int64(ids, keep), index=self.index)
4406+
if not return_inverse:
4407+
return isdup
4408+
4409+
if keep == 'first':
4410+
# o2u: original indices to indices of ARRAY of unique values
4411+
# u2o: reduplication from array of unique values to original array
4412+
_, o2u, u2o = np.unique(ids, return_inverse=True,
4413+
return_index=True)
4414+
inv = Series(self.index[o2u][u2o], index=self.index)
4415+
elif keep == 'last':
4416+
ids = ids[::-1] # np.unique takes first occurrence as unique value
4417+
_, o2u, u2o = np.unique(ids, return_inverse=True,
4418+
return_index=True)
4419+
inv = Series(self.index[::-1][o2u][u2o][::-1], index=self.index)
4420+
return isdup, inv
43974421

43984422
# ----------------------------------------------------------------------
43994423
# Sorting

pandas/tests/frame/test_duplicates.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
# -*- coding: utf-8 -*-
2+
3+
import pytest
4+
5+
import numpy as np
6+
from pandas import Series, DataFrame
7+
8+
from pandas.util.testing import assert_series_equal, assert_frame_equal
9+
import pandas.util.testing as tm
10+
11+
12+
class TestDataFrameDuplicated(object):
13+
14+
def test_duplicated_keep(self):
15+
df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']})
16+
17+
# keep = 'first'
18+
exp = Series([False, False, True, False, True])
19+
assert_series_equal(df.duplicated(keep='first'), exp)
20+
21+
# keep = 'last'
22+
exp = Series([True, True, False, False, False])
23+
assert_series_equal(df.duplicated(keep='last'), exp)
24+
25+
# keep = False
26+
exp = Series([True, True, True, False, True])
27+
assert_series_equal(df.duplicated(keep=False), exp)
28+
29+
def test_duplicated_nan_none(self):
30+
# np.nan and None are considered equal
31+
df = DataFrame({'C': [np.nan, 3, 3, None, np.nan]}, dtype=object)
32+
33+
# keep = 'first'
34+
exp = Series([False, False, True, True, True])
35+
assert_series_equal(df.duplicated(keep='first'), exp)
36+
37+
# keep = 'last'
38+
exp = Series([True, True, False, True, False])
39+
assert_series_equal(df.duplicated(keep='last'), exp)
40+
41+
# keep = False
42+
exp = Series([True] * 5)
43+
assert_series_equal(df.duplicated(keep=False), exp)
44+
45+
@pytest.mark.parametrize('keep', ['first', 'last', False])
46+
@pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A'])
47+
def test_duplicated_subset(self, subset, keep):
48+
df = DataFrame({'A': [0, 1, 1, 2, 0],
49+
'B': ['a', 'b', 'b', 'c', 'a'],
50+
'C': [np.nan, 3, 3, None, np.nan]})
51+
52+
if subset is None:
53+
subset = list(df.columns)
54+
55+
exp = df[subset].duplicated(keep=keep).rename(name=None)
56+
assert_series_equal(df.duplicated(keep=keep, subset=subset), exp)
57+
58+
def test_duplicated_inverse(self):
    """Check ``return_inverse`` on a small frame with a non-default index.

    The boolean mask must be unaffected by ``return_inverse``, the inverse
    must be expressed in terms of the frame's own index labels, and the
    original frame must be recoverable from the de-duplicated rows plus
    the inverse.
    """
    labels = [1, 4, 9, 16, 25]
    frame = DataFrame({'A': [0, 1, 1, 2, 0],
                       'B': ['a', 'b', 'b', 'c', 'a']},
                      index=labels)

    # For each ``keep``: the index label of the retained row per position.
    cases = [
        ('first', [1, 4, 4, 16, 1]),
        ('last', [25, 9, 9, 16, 25]),
    ]
    for keep, inverse_labels in cases:
        expected_mask = frame.duplicated(keep=keep)
        expected_inverse = Series(inverse_labels, index=labels)

        mask, inverse = frame.duplicated(keep=keep, return_inverse=True)
        assert_series_equal(mask, expected_mask)
        assert_series_equal(inverse, expected_inverse)

        # Rebuild the original frame from the retained rows.
        kept = frame.loc[~expected_mask]
        rebuilt = kept.reindex(inverse.values).set_index(inverse.index)
        assert_frame_equal(rebuilt, frame)

    # keep=False discards every duplicated row, so no inverse exists.
    rgx = 'The parameters return_inverse=True and keep=False cannot be.*'
    with tm.assert_raises_regex(ValueError, rgx):
        frame.duplicated(keep=False, return_inverse=True)
89+
90+
@pytest.mark.parametrize('keep', ['first', 'last'])
91+
@pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A'])
92+
def test_duplicated_inverse_large(self, subset, keep):
    """Check ``return_inverse`` on a larger, shuffled frame.

    Parameters
    ----------
    subset : None, label or list of labels
        Columns to consider when identifying duplicates.
    keep : {'first', 'last'}
        Which occurrence is treated as the non-duplicate.
    """
    # A locally seeded generator keeps the data (and thus any failure)
    # reproducible without touching the global numpy random state.
    rng = np.random.RandomState(0)

    # unsorted index important to check 'first'/'last' functionality
    df = DataFrame(rng.randint(0, 10, (10000, 3)),
                   columns=list('ABC')).sample(5000, random_state=rng)

    exp_isdup = df.duplicated(keep=keep, subset=subset)
    tst_isdup, inv = df.duplicated(keep=keep, subset=subset,
                                   return_inverse=True)
    assert_series_equal(tst_isdup, exp_isdup)

    # reconstruction can only succeed if all columns are taken into account
    if subset is None:
        unique = df.loc[~exp_isdup]
        reconstr = unique.reindex(inv.values).set_index(inv.index)
        assert_frame_equal(reconstr, df)

0 commit comments

Comments
 (0)