Commit f82e3f8

ENH: add return_inverse to df.duplicated
1 parent 04caa56 commit f82e3f8

2 files changed, +134 -4 lines changed


pandas/core/frame.py

+28 -4

@@ -4347,7 +4347,7 @@ def drop_duplicates(self, subset=None, keep='first', inplace=False):
         else:
             return self[-duplicated]

-    def duplicated(self, subset=None, keep='first'):
+    def duplicated(self, subset=None, keep='first', return_inverse=False):
         """
         Return boolean Series denoting duplicate rows, optionally only
         considering certain columns
@@ -4362,15 +4362,24 @@ def duplicated(self, subset=None, keep='first'):
               first occurrence.
             - ``last`` : Mark duplicates as ``True`` except for the
               last occurrence.
-            - False : Mark all duplicates as ``True``.
+            - False : Mark all duplicates as ``True``. This option is not
+              compatible with ``return_inverse``.
+        return_inverse : boolean, default False
+            Determines whether the mapping from unique elements to the original
+            index should be returned. If true, the output is a tuple.

         Returns
         -------
-        duplicated : Series
+        duplicated : Series or tuple of Series if return_inverse is True
         """
         from pandas.core.sorting import get_group_index
         from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT

+        if return_inverse and not keep:
+            raise ValueError("The parameters return_inverse=True and "
+                             "keep=False cannot be used together (impossible "
+                             "to calculate an inverse when discarding values)")
+
         def f(vals):
             labels, shape = algorithms.factorize(
                 vals, size_hint=min(len(self), _SIZE_HINT_LIMIT))
@@ -4395,7 +4404,22 @@ def f(vals):
         labels, shape = map(list, zip(*map(f, vals)))

         ids = get_group_index(labels, shape, sort=False, xnull=False)
-        return Series(duplicated_int64(ids, keep), index=self.index)
+        isdup = Series(duplicated_int64(ids, keep), index=self.index)
+        if not return_inverse:
+            return isdup
+
+        if keep == 'first':
+            # o2u: original indices to indices of ARRAY of unique values
+            # u2o: reduplication from array of unique values to original array
+            _, o2u, u2o = np.unique(ids, return_inverse=True,
+                                    return_index=True)
+            inv = Series(self.index[o2u][u2o], index=self.index)
+        elif keep == 'last':
+            ids = ids[::-1]  # np.unique takes first occurrence as unique value
+            _, o2u, u2o = np.unique(ids, return_inverse=True,
+                                    return_index=True)
+            inv = Series(self.index[::-1][o2u][u2o][::-1], index=self.index)
+        return isdup, inv

     # ----------------------------------------------------------------------
     # Sorting
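
For orientation, a usage sketch of the behaviour this change targets. The return_inverse keyword only exists with this patch applied (it is not part of released pandas), and the concrete values shown simply mirror the expectations encoded in the new test module below.

from pandas import DataFrame

# Same toy frame as in the new tests; the non-default index labels matter,
# because the inverse has to carry index labels rather than positions.
df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
               index=[1, 4, 9, 16, 25])

# With return_inverse=True (only available with this patch), a tuple is
# returned: the usual boolean Series plus a Series mapping every row to the
# index label of the occurrence that is kept.
isdup, inv = df.duplicated(keep='first', return_inverse=True)
# isdup.values -> [False, False, True, False, True]
# inv.values   -> [1, 4, 4, 16, 1]

# Round trip: re-expanding the de-duplicated rows through the inverse
# reconstructs the original frame, which is what the tests assert.
unique = df.loc[~isdup]
reconstructed = unique.reindex(inv.values).set_index(inv.index)
assert reconstructed.equals(df)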

pandas/tests/frame/test_duplicates.py

+106
@@ -0,0 +1,106 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+
+import numpy as np
+from pandas import Series, DataFrame
+
+from pandas.util.testing import assert_series_equal, assert_frame_equal
+import pandas.util.testing as tm
+
+
+class TestDataFrameDuplicated(object):
+
+    def test_duplicated_keep(self):
+        df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']})
+
+        # keep = 'first'
+        exp = Series([False, False, True, False, True])
+        assert_series_equal(df.duplicated(keep='first'), exp)
+
+        # keep = 'last'
+        exp = Series([True, True, False, False, False])
+        assert_series_equal(df.duplicated(keep='last'), exp)
+
+        # keep = False
+        exp = Series([True, True, True, False, True])
+        assert_series_equal(df.duplicated(keep=False), exp)
+
+    def test_duplicated_nan_none(self):
+        # np.nan and None are considered equal
+        df = DataFrame({'C': [np.nan, 3, 3, None, np.nan]}, dtype=object)
+
+        # keep = 'first'
+        exp = Series([False, False, True, True, True])
+        assert_series_equal(df.duplicated(keep='first'), exp)
+
+        # keep = 'last'
+        exp = Series([True, True, False, True, False])
+        assert_series_equal(df.duplicated(keep='last'), exp)
+
+        # keep = False
+        exp = Series([True] * 5)
+        assert_series_equal(df.duplicated(keep=False), exp)
+
+    @pytest.mark.parametrize('keep', ['first', 'last', False])
+    @pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A'])
+    def test_duplicated_subset(self, subset, keep):
+        df = DataFrame({'A': [0, 1, 1, 2, 0],
+                        'B': ['a', 'b', 'b', 'c', 'a'],
+                        'C': [np.nan, 3, 3, None, np.nan]})
+
+        if subset is None:
+            subset = list(df.columns)
+
+        exp = df[subset].duplicated(keep=keep).rename(name=None)
+        assert_series_equal(df.duplicated(keep=keep, subset=subset), exp)
+
+    def test_duplicated_inverse(self):
+        # check that return_inverse kwarg does not affect outcome;
+        # index of inverse must be correctly transformed as well
+        idx = [1, 4, 9, 16, 25]
+        df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
+                       index=idx)
+
+        # keep = 'first'
+        exp_isdup = df.duplicated(keep='first')
+        exp_inv = Series([1, 4, 4, 16, 1], index=idx)
+        tst_isdup, tst_inv = df.duplicated(keep='first', return_inverse=True)
+        assert_series_equal(tst_isdup, exp_isdup)
+        assert_series_equal(tst_inv, exp_inv)
+        unique = df.loc[~exp_isdup]
+        reconstr = unique.reindex(tst_inv.values).set_index(tst_inv.index)
+        assert_frame_equal(reconstr, df)
+
+        # keep = 'last'
+        exp_isdup = df.duplicated(keep='last')
+        exp_inv = Series([25, 9, 9, 16, 25], index=idx)
+        tst_isdup, tst_inv = df.duplicated(keep='last', return_inverse=True)
+        assert_series_equal(tst_isdup, exp_isdup)
+        assert_series_equal(tst_inv, exp_inv)
+        unique = df.loc[~exp_isdup]
+        reconstr = unique.reindex(tst_inv.values).set_index(tst_inv.index)
+        assert_frame_equal(reconstr, df)
+
+        # keep = False
+        rgx = 'The parameters return_inverse=True and keep=False cannot be.*'
+        with tm.assert_raises_regex(ValueError, rgx):
+            df.duplicated(keep=False, return_inverse=True)
+
+    @pytest.mark.parametrize('keep', ['first', 'last'])
+    @pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A'])
+    def test_duplicated_inverse_large(self, subset, keep):
+        # unsorted index important to check 'first'/'last' functionality
+        df = DataFrame(np.random.randint(0, 10, (10000, 3)),
+                       columns=list('ABC')).sample(5000)
+
+        exp_isdup = df.duplicated(keep=keep, subset=subset)
+        tst_isdup, inv = df.duplicated(keep=keep, subset=subset,
+                                       return_inverse=True)
+        assert_series_equal(tst_isdup, exp_isdup)
+
+        # reconstruction can only succeed if all columns are taken into account
+        if subset is None:
+            unique = df.loc[~exp_isdup]
+            reconstr = unique.reindex(inv.values).set_index(inv.index)
+            assert_frame_equal(reconstr, df)
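
The inverse itself comes from the np.unique call in the frame.py hunk above. A standalone sketch of that mechanism, using toy group ids and index labels chosen to match the test fixture; these concrete values are illustrative stand-ins, not produced by the commit's code.

import numpy as np

ids = np.array([10, 20, 20, 30, 10])   # stand-in group ids: rows 1/2 and 0/4 are duplicates
index = np.array([1, 4, 9, 16, 25])    # the frame's index labels

# keep='first': return_index gives, per unique id, the position of its first
# occurrence; return_inverse maps every row back to its slot in the uniques.
_, first_pos, inv_pos = np.unique(ids, return_index=True, return_inverse=True)
print(index[first_pos][inv_pos])             # [ 1  4  4 16  1]

# keep='last': reverse before calling np.unique (which keeps first
# occurrences), then undo the reversal on the result.
_, first_pos, inv_pos = np.unique(ids[::-1], return_index=True,
                                  return_inverse=True)
print(index[::-1][first_pos][inv_pos][::-1])  # [25  9  9 16 25]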
