Skip to content

Commit 740df13

Browse files
committed
Review feedback; refactor tests; add whatsnew
1 parent f1cff7f commit 740df13

File tree

4 files changed

+176
-140
lines changed

4 files changed

+176
-140
lines changed

doc/source/whatsnew/v0.24.0.txt

+33-1
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,44 @@ v0.24.0
88
New features
99
~~~~~~~~~~~~
1010

11-
- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`)
11+
.. _whatsnew_0240.enhancements.duplicated_inverse:
12+
13+
``DataFrame.duplicated`` has gained the ``return_inverse`` kwarg
14+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
15+
16+
Previously, there was no way to determine how duplicate rows in a ``DataFrame`` got mapped to the deduplicated, unique subset. This made it hard to push
17+
information that was calculated on the deduplicated values (e.g. aggregation) back to the original dataset. And while the ``numpy.unique`` method provides such a
18+
``return_inverse`` kwarg, it fails to work with ``object`` data.
19+
20+
The method has now gained a ``return_inverse`` keyword -- specifying ``return_inverse=True`` will change the output from a single Series to a tuple of two Series:
21+
22+
.. ipython:: python
23+
24+
df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
25+
index=[1, 4, 9, 16, 25])
26+
df
27+
isdup, inv = df.duplicated(return_inverse=True)
28+
isdup
29+
inv
30+
31+
This allows reconstructing the original DataFrame as follows:
32+
33+
.. ipython:: python
34+
35+
unique = df.loc[~isdup] # same as df.drop_duplicates()
36+
unique
37+
reconstruct = unique.reindex(inv.values).set_index(inv.index)
38+
reconstruct.equals(df)
39+
40+
The keyword works as expected for ``keep='first'|'last'``, but cannot be used together with ``keep=False`` (since discarding all duplicates makes it impossible
41+
to construct an inverse).
1242

1343
.. _whatsnew_0240.enhancements.other:
1444

1545
Other Enhancements
1646
^^^^^^^^^^^^^^^^^^
47+
48+
- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`)
1749
- :func:`to_datetime` now supports the ``%Z`` and ``%z`` directive when passed into ``format`` (:issue:`13486`)
1850
- :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether NaN/NaT values should be considered (:issue:`17534`)
1951
- :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. (:issue:`21227`)

pandas/core/frame.py

+11-2
Original file line numberDiff line numberDiff line change
@@ -4362,10 +4362,12 @@ def duplicated(self, subset=None, keep='first', return_inverse=False):
43624362
last occurrence.
43634363
- False : Mark all duplicates as ``True``. This option is not
43644364
compatible with ``return_inverse``.
4365-
return_inverse boolean, default False
4365+
return_inverse : boolean, default False
43664366
Determines whether the mapping from unique elements to the original
43674367
index should be returned. If true, the output is a tuple.
43684368
4369+
.. versionadded:: 0.24.0
4370+
43694371
Returns
43704372
-------
43714373
duplicated : Series or tuple of Series if return_inverse is True
@@ -4413,9 +4415,16 @@ def f(vals):
44134415
return_index=True)
44144416
inv = Series(self.index[o2u][u2o], index=self.index)
44154417
elif keep == 'last':
4416-
ids = ids[::-1] # np.unique takes first occurrence as unique value
4418+
# np.unique takes first occurrence as unique value,
4419+
# so we flip ids so that the first occurrence becomes the last
4420+
ids = ids[::-1]
44174421
_, o2u, u2o = np.unique(ids, return_inverse=True,
44184422
return_index=True)
4423+
# the values in the ids-array correspond(ed) to self.index -
4424+
# by flipping ids around, we need to do the same for self.index,
4425+
# ___because o2u and u2o are relative to that order___.
4426+
# Finally, to fit with 'index=self.index' in the constructor,
4427+
# we need to flip the values around one last time
44194428
inv = Series(self.index[::-1][o2u][u2o][::-1], index=self.index)
44204429
return isdup, inv
44214430

pandas/tests/frame/test_analytics.py

+132-31
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from numpy.random import randn
1313
import numpy as np
1414

15-
from pandas.compat import lrange, PY35
15+
from pandas.compat import lrange, PY35, string_types
1616
from pandas import (compat, isna, notna, DataFrame, Series,
1717
MultiIndex, date_range, Timestamp, Categorical,
1818
_np_version_under1p12,
@@ -1523,6 +1523,137 @@ def test_isin_empty_datetimelike(self):
15231523
# ----------------------------------------------------------------------
15241524
# Row deduplication
15251525

1526+
@pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']])
1527+
def test_duplicated_with_misspelled_column_name(self, subset):
1528+
# GH 19730
1529+
df = pd.DataFrame({'A': [0, 0, 1],
1530+
'B': [0, 0, 1],
1531+
'C': [0, 0, 1]})
1532+
1533+
with pytest.raises(KeyError):
1534+
df.duplicated(subset)
1535+
1536+
with pytest.raises(KeyError):
1537+
df.drop_duplicates(subset)
1538+
1539+
@pytest.mark.slow
1540+
def test_duplicated_do_not_fail_on_wide_dataframes(self):
1541+
# gh-21524
1542+
# Given the wide dataframe with a lot of columns
1543+
# with different (important!) values
1544+
data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
1545+
for i in range(100)}
1546+
df = pd.DataFrame(data).T
1547+
result = df.duplicated()
1548+
1549+
# Then duplicates produce the bool pd.Series as a result
1550+
# and don't fail during calculation.
1551+
# Actual values don't matter here, though usually
1552+
# it's all False in this case
1553+
assert isinstance(result, pd.Series)
1554+
assert result.dtype == np.bool
1555+
1556+
@pytest.mark.parametrize('keep, expected', [
1557+
('first', Series([False, False, True, False, True])),
1558+
('last', Series([True, True, False, False, False])),
1559+
(False, Series([True, True, True, False, True]))
1560+
])
1561+
def test_duplicated_keep(self, keep, expected):
1562+
df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']})
1563+
1564+
result = df.duplicated(keep=keep)
1565+
tm.assert_series_equal(result, expected)
1566+
1567+
@pytest.mark.parametrize('keep, expected', [
1568+
('first', Series([False, False, True, True, True])),
1569+
('last', Series([True, True, False, True, False])),
1570+
(False, Series([True] * 5))
1571+
])
1572+
def test_duplicated_nan_none(self, keep, expected):
1573+
# np.nan and None are considered equal
1574+
df = DataFrame({'C': [np.nan, 3, 3, None, np.nan]}, dtype=object)
1575+
1576+
result = df.duplicated(keep=keep)
1577+
tm.assert_series_equal(result, expected)
1578+
1579+
@pytest.mark.parametrize('keep', ['first', 'last', False])
1580+
@pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A'])
1581+
def test_duplicated_subset(self, subset, keep):
1582+
df = DataFrame({'A': [0, 1, 1, 2, 0],
1583+
'B': ['a', 'b', 'b', 'c', 'a'],
1584+
'C': [np.nan, 3, 3, None, np.nan]})
1585+
1586+
if subset is None:
1587+
subset = list(df.columns)
1588+
1589+
expected = df[subset].duplicated(keep=keep).rename(name=None)
1590+
result = df.duplicated(keep=keep, subset=subset)
1591+
tm.assert_series_equal(result, expected)
1592+
1593+
def test_duplicated_inverse(self):
1594+
# check that return_inverse kwarg does not affect outcome;
1595+
# index of inverse must be correctly transformed as well
1596+
idx = [1, 4, 9, 16, 25]
1597+
df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
1598+
index=idx)
1599+
1600+
# keep = 'first'
1601+
expected_isdup = df.duplicated(keep='first')
1602+
expected_inv = Series([1, 4, 4, 16, 1], index=idx)
1603+
result_isdup, result_inv = df.duplicated(keep='first',
1604+
return_inverse=True)
1605+
tm.assert_series_equal(result_isdup, expected_isdup)
1606+
tm.assert_series_equal(result_inv, expected_inv)
1607+
1608+
# test that inv works (and fits together with expected_isdup)
1609+
unique = df.loc[~expected_isdup]
1610+
reconstr = unique.reindex(result_inv).set_index(result_inv.index)
1611+
tm.assert_frame_equal(reconstr, df)
1612+
1613+
# keep = 'last'
1614+
expected_isdup = df.duplicated(keep='last')
1615+
expected_inv = Series([25, 9, 9, 16, 25], index=idx)
1616+
result_isdup, result_inv = df.duplicated(keep='last',
1617+
return_inverse=True)
1618+
tm.assert_series_equal(result_isdup, expected_isdup)
1619+
tm.assert_series_equal(result_inv, expected_inv)
1620+
1621+
# test that inv works (and fits together with expected_isdup)
1622+
unique = df.loc[~expected_isdup]
1623+
reconstr = unique.reindex(result_inv).set_index(result_inv.index)
1624+
tm.assert_frame_equal(reconstr, df)
1625+
1626+
def test_duplicated_inverse_raises(self):
1627+
df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']})
1628+
1629+
# keep = False
1630+
rgx = 'The parameters return_inverse=True and keep=False cannot be.*'
1631+
with tm.assert_raises_regex(ValueError, rgx):
1632+
df.duplicated(keep=False, return_inverse=True)
1633+
1634+
@pytest.mark.parametrize('keep', ['first', 'last'])
1635+
@pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A'])
1636+
def test_duplicated_inverse_large(self, subset, keep):
1637+
# unsorted index important to check 'first'/'last' functionality
1638+
df = DataFrame(np.random.randint(0, 10, (10000, 3)),
1639+
columns=list('ABC')).sample(5000)
1640+
1641+
expected_isdup = df.duplicated(keep=keep, subset=subset)
1642+
result_isdup, inv = df.duplicated(keep=keep, subset=subset,
1643+
return_inverse=True)
1644+
tm.assert_series_equal(result_isdup, expected_isdup)
1645+
1646+
if subset is None:
1647+
subset = list(df.columns)
1648+
elif isinstance(subset, string_types):
1649+
# need to have a DataFrame, not a Series
1650+
# -> select columns with singleton list, not string
1651+
subset = [subset]
1652+
1653+
unique = df.loc[~expected_isdup, subset]
1654+
reconstr = unique.reindex(inv.values).set_index(inv.index)
1655+
tm.assert_frame_equal(reconstr, df[subset])
1656+
15261657
def test_drop_duplicates(self):
15271658
df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar',
15281659
'foo', 'bar', 'bar', 'foo'],
@@ -1618,36 +1749,6 @@ def test_drop_duplicates(self):
16181749
for keep in ['first', 'last', False]:
16191750
assert df.duplicated(keep=keep).sum() == 0
16201751

1621-
@pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']])
1622-
def test_duplicated_with_misspelled_column_name(self, subset):
1623-
# GH 19730
1624-
df = pd.DataFrame({'A': [0, 0, 1],
1625-
'B': [0, 0, 1],
1626-
'C': [0, 0, 1]})
1627-
1628-
with pytest.raises(KeyError):
1629-
df.duplicated(subset)
1630-
1631-
with pytest.raises(KeyError):
1632-
df.drop_duplicates(subset)
1633-
1634-
@pytest.mark.slow
1635-
def test_duplicated_do_not_fail_on_wide_dataframes(self):
1636-
# gh-21524
1637-
# Given the wide dataframe with a lot of columns
1638-
# with different (important!) values
1639-
data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
1640-
for i in range(100)}
1641-
df = pd.DataFrame(data).T
1642-
result = df.duplicated()
1643-
1644-
# Then duplicates produce the bool pd.Series as a result
1645-
# and don't fail during calculation.
1646-
# Actual values doesn't matter here, though usually
1647-
# it's all False in this case
1648-
assert isinstance(result, pd.Series)
1649-
assert result.dtype == np.bool
1650-
16511752
def test_drop_duplicates_with_duplicate_column_names(self):
16521753
# GH17836
16531754
df = DataFrame([

pandas/tests/frame/test_duplicates.py

-106
This file was deleted.

0 commit comments

Comments
 (0)