Skip to content

ENH: Add duplicated/drop_duplicates to Index #7979

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 15, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,8 @@ Reindexing / Selection / Label manipulation

Series.align
Series.drop
Series.drop_duplicates
Series.duplicated
Series.equals
Series.first
Series.head
Expand Down Expand Up @@ -1165,6 +1167,8 @@ Modifying and Computations
Index.diff
Index.sym_diff
Index.drop
Index.drop_duplicates
Index.duplicated
Index.equals
Index.factorize
Index.identical
Expand Down
9 changes: 9 additions & 0 deletions doc/source/v0.15.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,15 @@ API changes

- Histogram from ``DataFrame.plot`` with ``kind='hist'`` (:issue:`7809`), See :ref:`the docs<visualization.hist>`.

- ``Index`` now supports ``duplicated`` and ``drop_duplicates``. (:issue:`4060`)

.. ipython:: python

idx = Index([1, 2, 3, 4, 1, 2])
idx
idx.duplicated()
idx.drop_duplicates()

.. _whatsnew_0150.dt:

.dt accessor
Expand Down
61 changes: 60 additions & 1 deletion pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,14 @@
from pandas.core import common as com
import pandas.core.nanops as nanops
import pandas.tslib as tslib
import pandas.lib as lib
from pandas.util.decorators import Appender, cache_readonly


_shared_docs = dict()
_indexops_doc_kwargs = dict(klass='IndexOpsMixin', inplace='')


class StringMixin(object):

"""implements string methods so long as object defines a `__unicode__`
Expand Down Expand Up @@ -474,12 +480,66 @@ def searchsorted(self, key, side='left'):
#### needs tests/doc-string
return self.values.searchsorted(key, side=side)

_shared_docs['drop_duplicates'] = (
"""Return %(klass)s with duplicate values removed

Parameters
----------
take_last : boolean, default False
Take the last observed index in a group. Default first
%(inplace)s

Returns
-------
deduplicated : %(klass)s
""")

@Appender(_shared_docs['drop_duplicates'] % _indexops_doc_kwargs)
def drop_duplicates(self, take_last=False, inplace=False):
    # The user-facing docstring is supplied by the Appender decorator,
    # so only implementation notes live here.
    # Boolean mask of the entries to keep: everything ``duplicated``
    # did not flag.
    keep = ~self.duplicated(take_last=take_last).values
    deduplicated = self[keep]
    if not inplace:
        return deduplicated
    # In-place handling is delegated to the ``_update_inplace`` hook.
    return self._update_inplace(deduplicated)

_shared_docs['duplicated'] = (
"""Return boolean %(klass)s denoting duplicate values

Parameters
----------
take_last : boolean, default False
Take the last observed index in a group. Default first

Returns
-------
duplicated : %(klass)s
""")

@Appender(_shared_docs['duplicated'] % _indexops_doc_kwargs)
def duplicated(self, take_last=False):
keys = com._ensure_object(self.values)
duplicated = lib.duplicated(keys, take_last=take_last)
try:
return self._constructor(duplicated,
index=self.index).__finalize__(self)
except AttributeError:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is very awkward to do. Maybe just put the immutable definition in base and override it in Series? That is probably simpler.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, fixed to centralize the logic in IndexOpsMixin. Even though _update_inplace is defined in both IndexOpsMixin and Index, it will never be called in the drop_duplicates case (Index.drop_duplicates blocks the inplace keyword, and overriding it is better for a proper docstring).

from pandas.core.index import Index
return Index(duplicated)

#----------------------------------------------------------------------
# unbox reductions

all = _unbox(np.ndarray.all)
any = _unbox(np.ndarray.any)

#----------------------------------------------------------------------
# abstracts

def _update_inplace(self, result):
raise NotImplementedError


class DatetimeIndexOpsMixin(object):
""" common ops mixin to support a unified inteface datetimelike Index """

Expand All @@ -497,7 +557,6 @@ def _box_values(self, values):
"""
apply box func to passed values
"""
import pandas.lib as lib
return lib.map_infer(values, self._box_func)

@cache_readonly
Expand Down
17 changes: 16 additions & 1 deletion pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import pandas.algos as _algos
import pandas.index as _index
from pandas.lib import Timestamp, is_datetime_array
from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin
from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs
from pandas.util.decorators import Appender, cache_readonly, deprecate
from pandas.core.common import isnull, array_equivalent
import pandas.core.common as com
Expand All @@ -30,6 +30,8 @@

_unsortable_types = frozenset(('mixed', 'mixed-integer'))

_index_doc_kwargs = dict(klass='Index', inplace='')


def _try_get_item(x):
try:
Expand Down Expand Up @@ -209,6 +211,10 @@ def _simple_new(cls, values, name=None, **kwargs):
result._reset_identity()
return result

def _update_inplace(self, result):
# guard when called from IndexOpsMixin
raise TypeError("Index can't be updated inplace")

def is_(self, other):
"""
More flexible, faster check like ``is`` but that works through views
Expand Down Expand Up @@ -2019,6 +2025,15 @@ def drop(self, labels):
raise ValueError('labels %s not contained in axis' % labels[mask])
return self.delete(indexer)

@Appender(_shared_docs['drop_duplicates'] % _index_doc_kwargs)
def drop_duplicates(self, take_last=False):
    # There is no ``inplace`` keyword in this signature, so the inherited
    # implementation always runs with its default (non-inplace) behaviour.
    deduplicated = super(Index, self).drop_duplicates(take_last=take_last)
    # Pass the inherited result back through the constructor.
    return self._constructor(deduplicated)

@Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
def duplicated(self, take_last=False):
    # Straight delegation to the inherited implementation; the decorator
    # attaches the docstring rendered with the Index keyword substitutions.
    flags = super(Index, self).duplicated(take_last=take_last)
    return flags

@classmethod
def _add_numeric_methods_disabled(cls):
""" add in numeric methods to disable """
Expand Down
49 changes: 12 additions & 37 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,13 @@

__all__ = ['Series']


_shared_doc_kwargs = dict(
axes='index',
klass='Series',
axes_single_arg="{0,'index'}"
axes_single_arg="{0,'index'}",
inplace="""inplace : boolean, default False
If True, performs operation inplace and returns None."""
)


Expand Down Expand Up @@ -265,6 +268,9 @@ def _set_subtyp(self, is_all_dates):
else:
object.__setattr__(self, '_subtyp', 'series')

def _update_inplace(self, result):
    """Apply ``result`` in place by delegating to
    ``generic.NDFrame._update_inplace``.
    """
    # NDFrame's implementation is named explicitly rather than reached
    # through ``super()``.
    ndframe_update = generic.NDFrame._update_inplace
    return ndframe_update(self, result)

# ndarray compatibility
@property
def dtype(self):
Expand Down Expand Up @@ -1114,45 +1120,14 @@ def mode(self):
from pandas.core.algorithms import mode
return mode(self)

@Appender(base._shared_docs['drop_duplicates'] % _shared_doc_kwargs)
def drop_duplicates(self, take_last=False, inplace=False):
"""
Return Series with duplicate values removed

Parameters
----------
take_last : boolean, default False
Take the last observed index in a group. Default first
inplace : boolean, default False
If True, performs operation inplace and returns None.

Returns
-------
deduplicated : Series
"""
duplicated = self.duplicated(take_last=take_last)
result = self[-duplicated]
if inplace:
return self._update_inplace(result)
else:
return result
return super(Series, self).drop_duplicates(take_last=take_last,
inplace=inplace)

@Appender(base._shared_docs['duplicated'] % _shared_doc_kwargs)
def duplicated(self, take_last=False):
"""
Return boolean Series denoting duplicate values

Parameters
----------
take_last : boolean, default False
Take the last observed index in a group. Default first

Returns
-------
duplicated : Series
"""
keys = _ensure_object(self.values)
duplicated = lib.duplicated(keys, take_last=take_last)
return self._constructor(duplicated,
index=self.index).__finalize__(self)
return super(Series, self).duplicated(take_last=take_last)

def idxmin(self, axis=None, out=None, skipna=True):
"""
Expand Down
70 changes: 68 additions & 2 deletions pandas/tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,9 +339,13 @@ def test_value_counts_unique_nunique(self):
# freq must be specified because repeat makes freq ambiguous
expected_index = o[::-1]
o = klass(np.repeat(values, range(1, len(o) + 1)), freq=o.freq)
else:
elif isinstance(o, Index):
expected_index = values[::-1]
o = klass(np.repeat(values, range(1, len(o) + 1)))
else:
expected_index = values[::-1]
idx = np.repeat(o.index.values, range(1, len(o) + 1))
o = klass(np.repeat(values, range(1, len(o) + 1)), index=idx)

expected_s = Series(range(10, 0, -1), index=expected_index, dtype='int64')
tm.assert_series_equal(o.value_counts(), expected_s)
Expand Down Expand Up @@ -374,11 +378,16 @@ def test_value_counts_unique_nunique(self):

# create repeated values, 'n'th element is repeated by n+1 times
if isinstance(o, PeriodIndex):
# freq must be specified because repeat makes freq ambiguous
expected_index = o
o = klass(np.repeat(values, range(1, len(o) + 1)), freq=o.freq)
else:
elif isinstance(o, Index):
expected_index = values
o = klass(np.repeat(values, range(1, len(o) + 1)))
else:
expected_index = values
idx = np.repeat(o.index.values, range(1, len(o) + 1))
o = klass(np.repeat(values, range(1, len(o) + 1)), index=idx)

expected_s_na = Series(list(range(10, 2, -1)) +[3], index=expected_index[9:0:-1], dtype='int64')
expected_s = Series(list(range(10, 2, -1)), index=expected_index[9:1:-1], dtype='int64')
Expand Down Expand Up @@ -571,6 +580,63 @@ def test_factorize(self):
expected = o[5:].append(o[:5])
self.assertTrue(uniques.equals(expected))

def test_duplicated_drop_duplicates(self):
    """Exercise ``duplicated`` and ``drop_duplicates`` on every object in
    ``self.objs``.

    Each object is checked as-is (expected to contain no duplicates) and
    again after two of its entries have been appended a second time, both
    with the default and with ``take_last=True``.
    """
    # GH 4060
    for original in self.objs:
        # Every position once, followed by positions 5 and 3 again, so
        # the two trailing entries repeat earlier values.
        positions = list(range(len(original))) + [5, 3]

        if isinstance(original, Index):
            # original doesn't have duplicates
            expected = Index([False] * len(original))
            tm.assert_index_equal(original.duplicated(), expected)
            result = original.drop_duplicates()
            tm.assert_index_equal(result, original)
            self.assertFalse(result is original)

            # create repeated values: positions 3 and 5 appear twice
            idx = original[positions]
            expected = Index([False] * len(original) + [True, True])
            tm.assert_index_equal(idx.duplicated(), expected)
            tm.assert_index_equal(idx.drop_duplicates(), original)

            # with take_last the *earlier* occurrences are the ones flagged
            last_base = [False] * len(idx)
            last_base[3] = True
            last_base[5] = True
            expected = Index(last_base)
            tm.assert_index_equal(idx.duplicated(take_last=True), expected)
            tm.assert_index_equal(idx.drop_duplicates(take_last=True),
                                  idx[~np.array(last_base)])

            # Raw string: ``\(`` is a regex escape, not a string escape.
            with tm.assertRaisesRegexp(
                    TypeError,
                    r"drop_duplicates\(\) got an unexpected keyword argument"):
                idx.drop_duplicates(inplace=True)

        else:
            # original doesn't have duplicates
            expected = Series([False] * len(original), index=original.index)
            tm.assert_series_equal(original.duplicated(), expected)
            result = original.drop_duplicates()
            tm.assert_series_equal(result, original)
            self.assertFalse(result is original)

            # create repeated values: positions 3 and 5 appear twice
            idx = original.index[positions]
            values = original.values[positions]
            s = Series(values, index=idx)

            expected = Series([False] * len(original) + [True, True],
                              index=idx)
            tm.assert_series_equal(s.duplicated(), expected)
            tm.assert_series_equal(s.drop_duplicates(), original)

            # with take_last the *earlier* occurrences are the ones flagged
            last_base = [False] * len(idx)
            last_base[3] = True
            last_base[5] = True
            expected = Series(last_base, index=idx)
            tm.assert_series_equal(s.duplicated(take_last=True), expected)
            tm.assert_series_equal(s.drop_duplicates(take_last=True),
                                   s[~np.array(last_base)])

            # the in-place variant must leave ``s`` equal to the original
            s.drop_duplicates(inplace=True)
            tm.assert_series_equal(s, original)


class TestDatetimeIndexOps(Ops):
tz = [None, 'UTC', 'Asia/Tokyo', 'US/Eastern',
Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/test_multilevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -2031,6 +2031,20 @@ def test_duplicate_mi(self):
result = df.loc[('foo','bar')]
assert_frame_equal(result,expected)

def test_duplicated_drop_duplicates(self):
    """Check ``duplicated`` / ``drop_duplicates`` on a MultiIndex whose
    first and fourth entries are the same tuple.
    """
    # GH 4060
    idx = MultiIndex.from_arrays(([1, 2, 3, 1, 2, 3], [1, 1, 1, 1, 2, 2]))

    # default: the later occurrence (position 3) is flagged and dropped
    mask_first = Index([False, False, False, True, False, False])
    tm.assert_index_equal(idx.duplicated(), mask_first)
    kept_first = MultiIndex.from_arrays(([1, 2, 3, 2, 3], [1, 1, 1, 2, 2]))
    tm.assert_index_equal(idx.drop_duplicates(), kept_first)

    # take_last=True: the earlier occurrence (position 0) is flagged
    mask_last = Index([True, False, False, False, False, False])
    tm.assert_index_equal(idx.duplicated(take_last=True), mask_last)
    kept_last = MultiIndex.from_arrays(([2, 3, 1, 2, 3], [1, 1, 1, 2, 2]))
    tm.assert_index_equal(idx.drop_duplicates(take_last=True), kept_last)

def test_multiindex_set_index(self):
# segfault in #3308
d = {'t1': [2, 2.5, 3], 't2': [4, 5, 6]}
Expand Down