Skip to content

Commit b08dc3d

Browse files
committed
Incorporate review feedback
1 parent 36b03a1 commit b08dc3d

File tree

4 files changed

+24
-42
lines changed

4 files changed

+24
-42
lines changed

asv_bench/benchmarks/frame_methods.py

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -431,22 +431,13 @@ def setup(self, keep, return_inverse):
431431
self.df3 = df3
432432

433433
def time_frame_duplicated(self, keep, return_inverse):
434-
if return_inverse:
435-
self.df.duplicated(keep=keep, return_inverse=return_inverse)
436-
else:
437-
self.df.duplicated(keep=keep)
434+
self.df.duplicated(keep=keep, return_inverse=return_inverse)
438435

439436
def time_frame_duplicated_wide(self, keep, return_inverse):
440-
if return_inverse:
441-
self.df2.duplicated(keep=keep, return_inverse=return_inverse)
442-
else:
443-
self.df2.duplicated(keep=keep)
437+
self.df2.duplicated(keep=keep, return_inverse=return_inverse)
444438

445439
def time_frame_duplicated_mixed(self, keep, return_inverse):
446-
if return_inverse:
447-
self.df3.duplicated(keep=keep, return_inverse=return_inverse)
448-
else:
449-
self.df3.duplicated(keep=keep)
440+
self.df3.duplicated(keep=keep, return_inverse=return_inverse)
450441

451442

452443
class XS(object):

doc/source/whatsnew/v0.23.1.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
.. _whatsnew_0231:
22

3-
v0.23.1 (June 12, 2018)
4-
-----------------------
3+
v0.23.1
4+
-------
55

66
This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes
77
and bug fixes. We recommend that all users upgrade to this version.

doc/source/whatsnew/v0.24.0.txt

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,29 +13,25 @@ New features
1313
``DataFrame.duplicated`` has gained the ``return_inverse`` kwarg
1414
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1515

16-
Previously, there was no way to determine how duplicate rows in a ``DataFrame`` got mapped to the deduplicated, unique subset. This made it hard to push back
17-
information that was calculated on the deduplicated values (e.g. aggregation) back to the original dataset. And while the ``numpy.unique``-method provides such a
18-
``return_inverse``-kwarg, it fails to work with ``object`` data.
19-
20-
Therefore, the ``duplicated``-method has now gained a ``return_inverse`` keyword. Specifying ``return_inverse=True`` will change the output from a single Series
21-
to a tuple of two Series (in the following example, the index is not just a simple order, to illustrate that the inverse correctly takes it into account):
16+
The ``DataFrame.duplicated`` method has gained a ``return_inverse`` keyword. Specifying ``return_inverse=True`` will change the output from a single Series
17+
to a tuple of two Series, where the second Series maps each label of the original index to the index label of the corresponding row in the deduplicated, unique subset:
2218

2319
.. ipython:: python
2420

2521
df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
2622
index=[1, 4, 9, 16, 25])
2723
df
28-
isdup, inv = df.duplicated(return_inverse=True) # default: keep='first'
29-
isdup
30-
inv
24+
isduplicate, inverse = df.duplicated(return_inverse=True) # default: keep='first'
25+
isduplicate
26+
inverse
3127

3228
This allows the original DataFrame to be reconstructed as follows:
3329

3430
.. ipython:: python
3531

36-
unique = df.loc[~isdup] # same as df.drop_duplicates()
32+
unique = df.loc[~isduplicate] # same as df.drop_duplicates()
3733
unique
38-
reconstruct = unique.reindex(inv.values).set_index(inv.index)
34+
reconstruct = unique.reindex(inverse.values).set_index(inverse.index)
3935
reconstruct.equals(df)
4036

4137
The keyword works as expected for ``keep='first'|'last'``, but cannot be used together with ``keep=False`` (since discarding all duplicates makes it impossible

pandas/tests/frame/test_analytics.py

Lines changed: 12 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1585,35 +1585,30 @@ def test_duplicated_subset(self, subset, keep):
15851585

15861586
if subset is None:
15871587
subset = list(df.columns)
1588+
elif isinstance(subset, string_types):
1589+
# need to have a DataFrame, not a Series
1590+
# -> select columns with singleton list, not string
1591+
subset = [subset]
15881592

15891593
expected = df[subset].duplicated(keep=keep).rename(name=None)
15901594
result = df.duplicated(keep=keep, subset=subset)
15911595
tm.assert_series_equal(result, expected)
15921596

1593-
def test_duplicated_inverse(self):
1597+
@pytest.mark.parametrize('keep, expected_inv_values', [
1598+
('first', [1, 4, 4, 16, 1]),
1599+
('last', [25, 9, 9, 16, 25])
1600+
])
1601+
def test_duplicated_inverse(self, keep, expected_inv_values):
15941602
# check that return_inverse kwarg does not affect outcome;
15951603
# index of inverse must be correctly transformed as well
15961604
idx = [1, 4, 9, 16, 25]
15971605
df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
15981606
index=idx)
15991607

16001608
# keep is parametrized ('first' or 'last')
1601-
expected_isdup = df.duplicated(keep='first')
1602-
expected_inv = Series([1, 4, 4, 16, 1], index=idx)
1603-
result_isdup, result_inv = df.duplicated(keep='first',
1604-
return_inverse=True)
1605-
tm.assert_series_equal(result_isdup, expected_isdup)
1606-
tm.assert_series_equal(result_inv, expected_inv)
1607-
1608-
# test that inv works (and fits together with expected_isdup)
1609-
unique = df.loc[~expected_isdup]
1610-
reconstr = unique.reindex(result_inv).set_index(result_inv.index)
1611-
tm.assert_frame_equal(reconstr, df)
1612-
1613-
# keep = 'last'
1614-
expected_isdup = df.duplicated(keep='last')
1615-
expected_inv = Series([25, 9, 9, 16, 25], index=idx)
1616-
result_isdup, result_inv = df.duplicated(keep='last',
1609+
expected_isdup = df.duplicated(keep=keep)
1610+
expected_inv = Series(expected_inv_values, index=idx)
1611+
result_isdup, result_inv = df.duplicated(keep=keep,
16171612
return_inverse=True)
16181613
tm.assert_series_equal(result_isdup, expected_isdup)
16191614
tm.assert_series_equal(result_inv, expected_inv)

0 commit comments

Comments
 (0)