Skip to content

API: add "level=" argument to MultiIndex.unique() #17897

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.22.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ Other Enhancements

- Better support for :func:`DataFrame.style.to_excel` output with the ``xlsxwriter`` engine. (:issue:`16149`)
- :func:`pandas.tseries.frequencies.to_offset` now accepts leading '+' signs e.g. '+1h'. (:issue:`18171`)
- :func:`MultiIndex.unique` now supports the ``level=`` argument, to get unique values from a specific index level (:issue:`17896`)
- :class:`pandas.io.formats.style.Styler` now has method ``hide_index()`` to determine whether the index will be rendered in output (:issue:`14194`)
- :class:`pandas.io.formats.style.Styler` now has method ``hide_columns()`` to determine whether columns will be hidden in output (:issue:`14194`)

Expand Down
28 changes: 26 additions & 2 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3757,8 +3757,32 @@ def drop(self, labels, errors='raise'):
indexer = indexer[~mask]
return self.delete(indexer)

@Appender(base._shared_docs['unique'] % _index_doc_kwargs)
def unique(self):
# Shared docstring template for the ``unique`` methods of the index classes.
# It is %-interpolated with ``_index_doc_kwargs`` and attached to each
# ``unique(self, level=None)`` implementation through ``Appender``.
_index_shared_docs['index_unique'] = (
"""
Return unique values in the index. Uniques are returned in order
of appearance, this does NOT sort.

Parameters
----------
level : int or str, optional, default None
Only return values from specified level (for MultiIndex)

.. versionadded:: 0.22.0

Returns
-------
Index without duplicates

See Also
--------
unique
Series.unique
""")

@Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs)
def unique(self, level=None):
    # ``level`` is only checked for validity here; it does not change
    # which values are returned.
    if level is not None:
        self._validate_index_level(level)
    # Uniqueness is delegated to the parent implementation and the result
    # is re-wrapped through ``_shallow_copy``.
    return self._shallow_copy(super(Index, self).unique())

Expand Down
6 changes: 4 additions & 2 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,8 +378,10 @@ def is_monotonic_increasing(self):
def is_monotonic_decreasing(self):
return Index(self.codes).is_monotonic_decreasing

@Appender(base._shared_docs['unique'] % _index_doc_kwargs)
def unique(self):
@Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs)
def unique(self, level=None):
if level is not None:
self._validate_index_level(level)
result = base.IndexOpsMixin.unique(self)
# CategoricalIndex._shallow_copy uses keeps original categories
# and ordered if not otherwise specified
Expand Down
23 changes: 18 additions & 5 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -908,7 +908,7 @@ def _try_mi(k):

raise InvalidIndexError(key)

def _get_level_values(self, level):
def _get_level_values(self, level, unique=False):
"""
Return vector of label values for requested level,
equal to the length of the index
Expand All @@ -918,17 +918,21 @@ def _get_level_values(self, level):
Parameters
----------
level : int level
unique : bool, default False
if True, drop duplicated values

Returns
-------
values : ndarray
"""

unique = self.levels[level]
values = self.levels[level]
labels = self.labels[level]
filled = algos.take_1d(unique._values, labels,
fill_value=unique._na_value)
values = unique._shallow_copy(filled)
if unique:
labels = algos.unique(labels)
filled = algos.take_1d(values._values, labels,
fill_value=values._na_value)
values = values._shallow_copy(filled)
return values

def get_level_values(self, level):
Expand Down Expand Up @@ -967,6 +971,15 @@ def get_level_values(self, level):
values = self._get_level_values(level)
return values

@Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs)
def unique(self, level=None):
    # No level requested: defer to the inherited implementation.
    if level is None:
        return super(MultiIndex, self).unique()

    # Resolve the level to its number, then fetch that level's values
    # with duplicates dropped.
    level_number = self._get_level_number(level)
    return self._get_level_values(level=level_number, unique=True)

def format(self, space=2, sparsify=None, adjoin=True, names=False,
na_rep=None, formatter=None):
if len(self) == 0:
Expand Down
21 changes: 21 additions & 0 deletions pandas/tests/indexes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,27 @@ def test_duplicates(self, indices):
assert not idx.is_unique
assert idx.has_duplicates

def test_unique(self, indices):
    # MultiIndex is tested separately; CategoricalIndex is skipped
    # because categories change (GH 18291)
    if isinstance(indices, (MultiIndex, CategoricalIndex)):
        return

    # GH 17896
    expected = indices.drop_duplicates()
    for level in (0, indices.name, None):
        tm.assert_index_equal(indices.unique(level=level), expected)

    # Levels that do not exist must be rejected
    for bad_level in (3, 'wrong'):
        with pytest.raises((IndexError, KeyError)):
            indices.unique(level=bad_level)

def test_unique_na(self):
    # NaN must be kept once, at its first position of appearance
    with_duplicates = pd.Index([2, np.nan, 2, 1], name='my_index')
    tm.assert_index_equal(with_duplicates.unique(),
                          pd.Index([2, np.nan, 1], name='my_index'))

def test_get_unique_index(self, indices):
# MultiIndex tested separately
if not len(indices) or isinstance(indices, MultiIndex):
Expand Down
32 changes: 24 additions & 8 deletions pandas/tests/indexes/test_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -963,19 +963,21 @@ def test_get_level_values(self):
exp = CategoricalIndex([1, 2, 3, 1, 2, 3])
tm.assert_index_equal(index.get_level_values(1), exp)

def test_get_level_values_na(self):
@pytest.mark.xfail(reason='GH 17924 (returns Int64Index with float data)')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we xfail this? (I mean, why don't we keep asserting for now that it is float)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The assertions compare a correctly built index (float64 data in a Float64Index) with an invalid index (float64 data in an Int64Index, due to #17924). Maybe I'm missing something, but it's not obvious to me what we would assert instead - plus, I see added value in xfailing on buggy behavior.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK

def test_get_level_values_int_with_na(self):
arrays = [['a', 'b', 'b'], [1, np.nan, 2]]
index = pd.MultiIndex.from_arrays(arrays)
values = index.get_level_values(1)
expected = np.array([1, np.nan, 2])
tm.assert_numpy_array_equal(values.values.astype(float), expected)
result = index.get_level_values(1)
expected = Index([1, np.nan, 2])
tm.assert_index_equal(result, expected)

arrays = [['a', 'b', 'b'], [np.nan, np.nan, 2]]
index = pd.MultiIndex.from_arrays(arrays)
values = index.get_level_values(1)
expected = np.array([np.nan, np.nan, 2])
tm.assert_numpy_array_equal(values.values.astype(float), expected)
result = index.get_level_values(1)
expected = Index([np.nan, np.nan, 2])
tm.assert_index_equal(result, expected)

def test_get_level_values_na(self):
arrays = [[np.nan, np.nan, np.nan], ['a', np.nan, 1]]
index = pd.MultiIndex.from_arrays(arrays)
result = index.get_level_values(0)
Expand All @@ -990,7 +992,7 @@ def test_get_level_values_na(self):
index = pd.MultiIndex.from_arrays(arrays)
values = index.get_level_values(1)
expected = pd.DatetimeIndex([0, 1, pd.NaT])
tm.assert_numpy_array_equal(values.values, expected.values)
tm.assert_index_equal(values, expected)

arrays = [[], []]
index = pd.MultiIndex.from_arrays(arrays)
Expand Down Expand Up @@ -2277,6 +2279,20 @@ def test_unique(self):
exp = pd.MultiIndex.from_arrays([['a'], ['a']])
tm.assert_index_equal(res, exp)

@pytest.mark.parametrize('level', [0, 'first', 1, 'second'])
def test_unique_level(self, level):
    # GH #17896 - with level= argument
    tm.assert_index_equal(self.index.unique(level=level),
                          self.index.get_level_values(level).unique())

    # With already unique level
    unique_mi = pd.MultiIndex.from_arrays([[1, 3, 2, 4], [1, 3, 2, 5]],
                                          names=['first', 'second'])
    tm.assert_index_equal(unique_mi.unique(level=level),
                          unique_mi.get_level_values(level))

def test_unique_datetimelike(self):
idx1 = pd.DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-01',
'2015-01-01', 'NaT', 'NaT'])
Expand Down