Skip to content

added support for selecting multiple nth values #7910

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 4, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions doc/source/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -869,7 +869,7 @@ This shows the first or last n rows from each group.
Taking the nth row of each group
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

To select from a DataFrame or Series the nth item, use the nth method. This is a reduction method, and will return a single row (or no row) per group:
To select from a DataFrame or Series the nth item, use the nth method. This is a reduction method, and will return a single row (or no row) per group if you pass an int for n:

.. ipython:: python

Expand All @@ -880,7 +880,7 @@ To select from a DataFrame or Series the nth item, use the nth method. This is a
g.nth(-1)
g.nth(1)

If you want to select the nth not-null method, use the ``dropna`` kwarg. For a DataFrame this should be either ``'any'`` or ``'all'`` just like you would pass to dropna, for a Series this just needs to be truthy.
If you want to select the nth not-null item, use the ``dropna`` kwarg. For a DataFrame this should be either ``'any'`` or ``'all'`` just like you would pass to dropna, for a Series this just needs to be truthy.

.. ipython:: python

Expand All @@ -904,6 +904,15 @@ As with other methods, passing ``as_index=False``, will achieve a filtration, wh
g.nth(0)
g.nth(-1)

You can also select multiple rows from each group by specifying multiple nth values as a list of ints.

.. ipython:: python

business_dates = date_range(start='4/1/2014', end='6/30/2014', freq='B')
df = DataFrame(1, index=business_dates, columns=['a', 'b'])
# get the first, 4th, and last date index for each month
df.groupby((df.index.year, df.index.month)).nth([0, 3, -1])

Enumerate group items
~~~~~~~~~~~~~~~~~~~~~

Expand Down
46 changes: 36 additions & 10 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -756,12 +756,21 @@ def ohlc(self):

def nth(self, n, dropna=None):
"""
Take the nth row from each group.
Take the nth row from each group if n is an int, or a subset of rows
if n is a list of ints.

If dropna, will not show nth non-null row, dropna is either
If dropna, will take the nth non-null row, dropna is either
Truthy (if a Series) or 'all', 'any' (if a DataFrame); this is equivalent
to calling dropna(how=dropna) before the groupby.

Parameters
----------
n : int or list of ints
a single nth value for the row or a list of nth values
dropna : None or str, optional
apply the specified dropna operation before counting which row is
the nth row. Needs to be None, 'any' or 'all'

Examples
--------
>>> df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
Expand Down Expand Up @@ -789,19 +798,36 @@ def nth(self, n, dropna=None):
5 NaN

"""
if isinstance(n, int):
nth_values = [n]
elif isinstance(n, (set, list, tuple)):
nth_values = list(set(n))
if dropna is not None:
raise ValueError("dropna option with a list of nth values is not supported")
else:
raise TypeError("n needs to be an int or a list/set/tuple of ints")

m = self.grouper._max_groupsize
# filter out values that are outside [-m, m)
pos_nth_values = [i for i in nth_values if i >= 0 and i < m]
neg_nth_values = [i for i in nth_values if i < 0 and i >= -m]

self._set_selection_from_grouper()
if not dropna: # good choice
m = self.grouper._max_groupsize
if n >= m or n < -m:
if not pos_nth_values and not neg_nth_values:
# no valid nth values
return self._selected_obj.loc[[]]

rng = np.zeros(m, dtype=bool)
if n >= 0:
rng[n] = True
is_nth = self._cumcount_array(rng)
else:
rng[- n - 1] = True
is_nth = self._cumcount_array(rng, ascending=False)
for i in pos_nth_values:
rng[i] = True
is_nth = self._cumcount_array(rng)

if neg_nth_values:
rng = np.zeros(m, dtype=bool)
for i in neg_nth_values:
rng[- i - 1] = True
is_nth |= self._cumcount_array(rng, ascending=False)

result = self._selected_obj[is_nth]

Expand Down
24 changes: 24 additions & 0 deletions pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,30 @@ def test_nth(self):
expected = g.B.first()
assert_series_equal(result,expected)

# test multiple nth values
df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]],
columns=['A', 'B'])
g = df.groupby('A')

assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index('A'))
assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index('A'))
assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index('A'))
assert_frame_equal(g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index('A'))
assert_frame_equal(g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
assert_frame_equal(g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index('A'))
assert_frame_equal(g.nth([3, 4]), df.loc[[],['B']])

business_dates = pd.date_range(start='4/1/2014', end='6/30/2014', freq='B')
df = DataFrame(1, index=business_dates, columns=['a', 'b'])
# get the first, fourth and last two business days for each month
result = df.groupby((df.index.year, df.index.month)).nth([0, 3, -2, -1])
expected_dates = pd.to_datetime(['2014/4/1', '2014/4/4', '2014/4/29', '2014/4/30',
'2014/5/1', '2014/5/6', '2014/5/29', '2014/5/30',
'2014/6/2', '2014/6/5', '2014/6/27', '2014/6/30'])
expected = DataFrame(1, columns=['a', 'b'], index=expected_dates)
assert_frame_equal(result, expected)

def test_grouper_index_types(self):
# related GH5375
# groupby misbehaving when using a Floatlike index
Expand Down