diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index c5a22d8766a96..36d21bb338202 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -110,6 +110,30 @@ Example: s.rolling(3).rank(method="max") +.. _whatsnew_140.enhancements.groupby_indexing: + +Groupby positional indexing +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +It is now possible to specify positional ranges relative to the ends of each group. + +Negative arguments for :meth:`.GroupBy.head` and :meth:`.GroupBy.tail` now work correctly and result in ranges relative to the end and start of each group, respectively. +Previously, negative arguments returned empty frames. + +.. ipython:: python + + df = pd.DataFrame([["g", "g0"], ["g", "g1"], ["g", "g2"], ["g", "g3"], + ["h", "h0"], ["h", "h1"]], columns=["A", "B"]) + df.groupby("A").head(-1) + + +:meth:`.GroupBy.nth` now accepts a slice or list of integers and slices. + +.. ipython:: python + + df.groupby("A").nth(slice(1, -1)) + df.groupby("A").nth([slice(None, 1), slice(-1, None)]) + .. _whatsnew_140.enhancements.other: Other enhancements diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 9ca05e05fc09a..0a397672ad2cd 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -46,6 +46,7 @@ class providing the base-class of operations. ArrayLike, IndexLabel, NDFrameT, + PositionalIndexer, RandomState, Scalar, T, @@ -65,6 +66,7 @@ class providing the base-class of operations. is_bool_dtype, is_datetime64_dtype, is_float_dtype, + is_integer, is_integer_dtype, is_numeric_dtype, is_object_dtype, @@ -97,6 +99,7 @@ class providing the base-class of operations. numba_, ops, ) +from pandas.core.groupby.indexing import GroupByIndexingMixin from pandas.core.indexes.api import ( CategoricalIndex, Index, @@ -555,7 +558,7 @@ def f(self): ] -class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT]): +class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin): _group_selection: IndexLabel | None = None _apply_allowlist: frozenset[str] = frozenset() _hidden_attrs = PandasObject._hidden_attrs | { @@ -2445,11 +2448,12 @@ def backfill(self, limit=None): @Substitution(name="groupby") @Substitution(see_also=_common_see_also) def nth( - self, n: int | list[int], dropna: Literal["any", "all", None] = None + self, + n: PositionalIndexer | tuple, + dropna: Literal["any", "all", None] = None, ) -> NDFrameT: """ - Take the nth row from each group if n is an int, or a subset of rows - if n is a list of ints. + Take the nth row from each group if n is an int, otherwise a subset of rows. If dropna, will take the nth non-null row, dropna is either 'all' or 'any'; this is equivalent to calling dropna(how=dropna) @@ -2457,11 +2461,15 @@ def nth( Parameters ---------- - n : int or list of ints - A single nth value for the row or a list of nth values. + n : int, slice or list of ints and slices + A single nth value for the row or a list of nth values or slices. + + .. versionchanged:: 1.4.0 + Added slice and lists containiing slices. + dropna : {'any', 'all', None}, default None Apply the specified dropna operation before counting which row is - the nth row. + the nth row. Only supported if n is an int. Returns ------- @@ -2496,6 +2504,12 @@ def nth( 1 2.0 2 3.0 2 5.0 + >>> g.nth(slice(None, -1)) + B + A + 1 NaN + 1 2.0 + 2 3.0 Specifying `dropna` allows count ignoring ``NaN`` @@ -2520,25 +2534,9 @@ def nth( 1 1 2.0 4 2 5.0 """ - valid_containers = (set, list, tuple) - if not isinstance(n, (valid_containers, int)): - raise TypeError("n needs to be an int or a list/set/tuple of ints") - if not dropna: - - if isinstance(n, int): - nth_values = [n] - elif isinstance(n, valid_containers): - nth_values = list(set(n)) - - nth_array = np.array(nth_values, dtype=np.intp) with self._group_selection_context(): - - mask_left = np.in1d(self._cumcount_array(), nth_array) - mask_right = np.in1d( - self._cumcount_array(ascending=False) + 1, -nth_array - ) - mask = mask_left | mask_right + mask = self._make_mask_from_positional_indexer(n) ids, _, _ = self.grouper.group_info @@ -2546,7 +2544,6 @@ def nth( mask = mask & (ids != -1) out = self._mask_selected_obj(mask) - if not self.as_index: return out @@ -2563,19 +2560,20 @@ def nth( return out.sort_index(axis=self.axis) if self.sort else out # dropna is truthy - if isinstance(n, valid_containers): - raise ValueError("dropna option with a list of nth values is not supported") + if not is_integer(n): + raise ValueError("dropna option only supported for an integer argument") if dropna not in ["any", "all"]: # Note: when agg-ing picker doesn't raise this, just returns NaN raise ValueError( - "For a DataFrame groupby, dropna must be " + "For a DataFrame or Series groupby.nth, dropna must be " "either None, 'any' or 'all', " f"(was passed {dropna})." ) # old behaviour, but with all and any support for DataFrames. # modified in GH 7559 to have better perf + n = cast(int, n) max_len = n if n >= 0 else -1 - n dropped = self.obj.dropna(how=dropna, axis=self.axis) @@ -3301,11 +3299,16 @@ def head(self, n=5): from the original DataFrame with original index and order preserved (``as_index`` flag is ignored). - Does not work for negative values of `n`. + Parameters + ---------- + n : int + If positive: number of entries to include from start of each group. + If negative: number of entries to exclude from end of each group. Returns ------- Series or DataFrame + Subset of original Series or DataFrame as determined by n. %(see_also)s Examples -------- @@ -3317,12 +3320,11 @@ def head(self, n=5): 0 1 2 2 5 6 >>> df.groupby('A').head(-1) - Empty DataFrame - Columns: [A, B] - Index: [] + A B + 0 1 2 """ self._reset_group_selection() - mask = self._cumcount_array() < n + mask = self._make_mask_from_positional_indexer(slice(None, n)) return self._mask_selected_obj(mask) @final @@ -3336,11 +3338,16 @@ def tail(self, n=5): from the original DataFrame with original index and order preserved (``as_index`` flag is ignored). - Does not work for negative values of `n`. + Parameters + ---------- + n : int + If positive: number of entries to include from end of each group. + If negative: number of entries to exclude from start of each group. Returns ------- Series or DataFrame + Subset of original Series or DataFrame as determined by n. %(see_also)s Examples -------- @@ -3352,12 +3359,16 @@ def tail(self, n=5): 1 a 2 3 b 2 >>> df.groupby('A').tail(-1) - Empty DataFrame - Columns: [A, B] - Index: [] + A B + 1 a 2 + 3 b 2 """ self._reset_group_selection() - mask = self._cumcount_array(ascending=False) < n + if n: + mask = self._make_mask_from_positional_indexer(slice(-n, None)) + else: + mask = self._make_mask_from_positional_indexer([]) + return self._mask_selected_obj(mask) @final diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py new file mode 100644 index 0000000000000..4b3bb6bc0aa50 --- /dev/null +++ b/pandas/core/groupby/indexing.py @@ -0,0 +1,283 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Iterable, + cast, +) + +import numpy as np + +from pandas._typing import PositionalIndexer +from pandas.util._decorators import ( + cache_readonly, + doc, +) + +from pandas.core.dtypes.common import ( + is_integer, + is_list_like, +) + +if TYPE_CHECKING: + from pandas import ( + DataFrame, + Series, + ) + from pandas.core.groupby import groupby + + +class GroupByIndexingMixin: + """ + Mixin for adding ._positional_selector to GroupBy. + """ + + @cache_readonly + def _positional_selector(self) -> GroupByPositionalSelector: + """ + Return positional selection for each group. + + ``groupby._positional_selector[i:j]`` is similar to + ``groupby.apply(lambda x: x.iloc[i:j])`` + but much faster and preserves the original index and order. + + ``_positional_selector[]`` is compatible with and extends :meth:`~GroupBy.head` + and :meth:`~GroupBy.tail`. For example: + + - ``head(5)`` + - ``_positional_selector[5:-5]`` + - ``tail(5)`` + + together return all the rows. + + Allowed inputs for the index are: + + - An integer valued iterable, e.g. ``range(2, 4)``. + - A comma separated list of integers and slices, e.g. ``5``, ``2, 4``, ``2:4``. + + The output format is the same as :meth:`~GroupBy.head` and + :meth:`~GroupBy.tail`, namely + a subset of the ``DataFrame`` or ``Series`` with the index and order preserved. + + Returns + ------- + Series + The filtered subset of the original Series. + DataFrame + The filtered subset of the original DataFrame. + + See Also + -------- + DataFrame.iloc : Purely integer-location based indexing for selection by + position. + GroupBy.head : Return first n rows of each group. + GroupBy.tail : Return last n rows of each group. + GroupBy.nth : Take the nth row from each group if n is an int, or a + subset of rows, if n is a list of ints. + + Notes + ----- + - The slice step cannot be negative. + - If the index specification results in overlaps, the item is not duplicated. + - If the index specification changes the order of items, then + they are returned in their original order. + By contrast, ``DataFrame.iloc`` can change the row order. + - ``groupby()`` parameters such as as_index and dropna are ignored. + + The differences between ``_positional_selector[]`` and :meth:`~GroupBy.nth` + with ``as_index=False`` are: + + - Input to ``_positional_selector`` can include + one or more slices whereas ``nth`` + just handles an integer or a list of integers. + - ``_positional_selector`` can accept a slice relative to the + last row of each group. + - ``_positional_selector`` does not have an equivalent to the + ``nth()`` ``dropna`` parameter. + + Examples + -------- + >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], + ... columns=["A", "B"]) + >>> df.groupby("A")._positional_selector[1:2] + A B + 1 a 2 + 4 b 5 + + >>> df.groupby("A")._positional_selector[1, -1] + A B + 1 a 2 + 2 a 3 + 4 b 5 + """ + if TYPE_CHECKING: + groupby_self = cast(groupby.GroupBy, self) + else: + groupby_self = self + + return GroupByPositionalSelector(groupby_self) + + def _make_mask_from_positional_indexer( + self, + arg: PositionalIndexer | tuple, + ) -> np.ndarray: + if is_list_like(arg): + if all(is_integer(i) for i in cast(Iterable, arg)): + mask = self._make_mask_from_list(cast(Iterable[int], arg)) + else: + mask = self._make_mask_from_tuple(cast(tuple, arg)) + + elif isinstance(arg, slice): + mask = self._make_mask_from_slice(arg) + elif is_integer(arg): + mask = self._make_mask_from_int(cast(int, arg)) + else: + raise TypeError( + f"Invalid index {type(arg)}. " + "Must be integer, list-like, slice or a tuple of " + "integers and slices" + ) + + if isinstance(mask, bool): + if mask: + mask = self._ascending_count >= 0 + else: + mask = self._ascending_count < 0 + + return cast(np.ndarray, mask) + + def _make_mask_from_int(self, arg: int) -> np.ndarray: + if arg >= 0: + return self._ascending_count == arg + else: + return self._descending_count == (-arg - 1) + + def _make_mask_from_list(self, args: Iterable[int]) -> bool | np.ndarray: + positive = [arg for arg in args if arg >= 0] + negative = [-arg - 1 for arg in args if arg < 0] + + mask: bool | np.ndarray = False + + if positive: + mask |= np.isin(self._ascending_count, positive) + + if negative: + mask |= np.isin(self._descending_count, negative) + + return mask + + def _make_mask_from_tuple(self, args: tuple) -> bool | np.ndarray: + mask: bool | np.ndarray = False + + for arg in args: + if is_integer(arg): + mask |= self._make_mask_from_int(cast(int, arg)) + elif isinstance(arg, slice): + mask |= self._make_mask_from_slice(arg) + else: + raise ValueError( + f"Invalid argument {type(arg)}. Should be int or slice." + ) + + return mask + + def _make_mask_from_slice(self, arg: slice) -> bool | np.ndarray: + start = arg.start + stop = arg.stop + step = arg.step + + if step is not None and step < 0: + raise ValueError(f"Invalid step {step}. Must be non-negative") + + mask: bool | np.ndarray = True + + if step is None: + step = 1 + + if start is None: + if step > 1: + mask &= self._ascending_count % step == 0 + + elif start >= 0: + mask &= self._ascending_count >= start + + if step > 1: + mask &= (self._ascending_count - start) % step == 0 + + else: + mask &= self._descending_count < -start + + offset_array = self._descending_count + start + 1 + limit_array = ( + self._ascending_count + self._descending_count + (start + 1) + ) < 0 + offset_array = np.where(limit_array, self._ascending_count, offset_array) + + mask &= offset_array % step == 0 + + if stop is not None: + if stop >= 0: + mask &= self._ascending_count < stop + else: + mask &= self._descending_count >= -stop + + return mask + + @cache_readonly + def _ascending_count(self) -> np.ndarray: + if TYPE_CHECKING: + groupby_self = cast(groupby.GroupBy, self) + else: + groupby_self = self + + return groupby_self._cumcount_array() + + @cache_readonly + def _descending_count(self) -> np.ndarray: + if TYPE_CHECKING: + groupby_self = cast(groupby.GroupBy, self) + else: + groupby_self = self + + return groupby_self._cumcount_array(ascending=False) + + +@doc(GroupByIndexingMixin._positional_selector) +class GroupByPositionalSelector: + def __init__(self, groupby_object: groupby.GroupBy): + self.groupby_object = groupby_object + + def __getitem__(self, arg: PositionalIndexer | tuple) -> DataFrame | Series: + """ + Select by positional index per group. + + Implements GroupBy._positional_selector + + Parameters + ---------- + arg : PositionalIndexer | tuple + Allowed values are: + - int + - int valued iterable such as list or range + - slice with step either None or positive + - tuple of integers and slices + + Returns + ------- + Series + The filtered subset of the original groupby Series. + DataFrame + The filtered subset of the original groupby DataFrame. + + See Also + -------- + DataFrame.iloc : Integer-location based indexing for selection by position. + GroupBy.head : Return first n rows of each group. + GroupBy.tail : Return last n rows of each group. + GroupBy._positional_selector : Return positional selection for each group. + GroupBy.nth : Take the nth row from each group if n is an int, or a + subset of rows, if n is a list of ints. + """ + self.groupby_object._reset_group_selection() + mask = self.groupby_object._make_mask_from_positional_indexer(arg) + return self.groupby_object._mask_selected_obj(mask) diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index d699d05963b46..622c56d707ead 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -116,6 +116,27 @@ def three_group(): ) +@pytest.fixture() +def slice_test_df(): + data = [ + [0, "a", "a0_at_0"], + [1, "b", "b0_at_1"], + [2, "a", "a1_at_2"], + [3, "b", "b1_at_3"], + [4, "c", "c0_at_4"], + [5, "a", "a2_at_5"], + [6, "a", "a3_at_6"], + [7, "a", "a4_at_7"], + ] + df = DataFrame(data, columns=["Index", "Group", "Value"]) + return df.set_index("Index") + + +@pytest.fixture() +def slice_test_grouped(slice_test_df): + return slice_test_df.groupby("Group", as_index=False) + + @pytest.fixture(params=sorted(reduction_kernels)) def reduction_func(request): """ diff --git a/pandas/tests/groupby/test_indexing.py b/pandas/tests/groupby/test_indexing.py new file mode 100644 index 0000000000000..b9f71fd4ed96a --- /dev/null +++ b/pandas/tests/groupby/test_indexing.py @@ -0,0 +1,287 @@ +# Test GroupBy._positional_selector positional grouped indexing GH#42864 + +import random + +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize( + "arg, expected_rows", + [ + [0, [0, 1, 4]], + [2, [5]], + [5, []], + [-1, [3, 4, 7]], + [-2, [1, 6]], + [-6, []], + ], +) +def test_int(slice_test_df, slice_test_grouped, arg, expected_rows): + # Test single integer + result = slice_test_grouped._positional_selector[arg] + expected = slice_test_df.iloc[expected_rows] + + tm.assert_frame_equal(result, expected) + + +def test_slice(slice_test_df, slice_test_grouped): + # Test single slice + result = slice_test_grouped._positional_selector[0:3:2] + expected = slice_test_df.iloc[[0, 1, 4, 5]] + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "arg, expected_rows", + [ + [[0, 2], [0, 1, 4, 5]], + [[0, 2, -1], [0, 1, 3, 4, 5, 7]], + [range(0, 3, 2), [0, 1, 4, 5]], + [{0, 2}, [0, 1, 4, 5]], + ], + ids=[ + "list", + "negative", + "range", + "set", + ], +) +def test_list(slice_test_df, slice_test_grouped, arg, expected_rows): + # Test lists of integers and integer valued iterables + result = slice_test_grouped._positional_selector[arg] + expected = slice_test_df.iloc[expected_rows] + + tm.assert_frame_equal(result, expected) + + +def test_ints(slice_test_df, slice_test_grouped): + # Test tuple of ints + result = slice_test_grouped._positional_selector[0, 2, -1] + expected = slice_test_df.iloc[[0, 1, 3, 4, 5, 7]] + + tm.assert_frame_equal(result, expected) + + +def test_slices(slice_test_df, slice_test_grouped): + # Test tuple of slices + result = slice_test_grouped._positional_selector[:2, -2:] + expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] + + tm.assert_frame_equal(result, expected) + + +def test_mix(slice_test_df, slice_test_grouped): + # Test mixed tuple of ints and slices + result = slice_test_grouped._positional_selector[0, 1, -2:] + expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "arg, expected_rows", + [ + [0, [0, 1, 4]], + [[0, 2, -1], [0, 1, 3, 4, 5, 7]], + [(slice(None, 2), slice(-2, None)), [0, 1, 2, 3, 4, 6, 7]], + ], +) +def test_as_index(slice_test_df, arg, expected_rows): + # Test the default as_index behaviour + result = slice_test_df.groupby("Group", sort=False)._positional_selector[arg] + expected = slice_test_df.iloc[expected_rows] + + tm.assert_frame_equal(result, expected) + + +def test_doc_examples(): + # Test the examples in the documentation + df = pd.DataFrame( + [["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], columns=["A", "B"] + ) + + grouped = df.groupby("A", as_index=False) + + result = grouped._positional_selector[1:2] + expected = pd.DataFrame([["a", 2], ["b", 5]], columns=["A", "B"], index=[1, 4]) + + tm.assert_frame_equal(result, expected) + + result = grouped._positional_selector[1, -1] + expected = pd.DataFrame( + [["a", 2], ["a", 3], ["b", 5]], columns=["A", "B"], index=[1, 2, 4] + ) + + tm.assert_frame_equal(result, expected) + + +@pytest.fixture() +def multiindex_data(): + ndates = 100 + nitems = 20 + dates = pd.date_range("20130101", periods=ndates, freq="D") + items = [f"item {i}" for i in range(nitems)] + + data = {} + for date in dates: + nitems_for_date = nitems - random.randint(0, 12) + levels = [ + (item, random.randint(0, 10000) / 100, random.randint(0, 10000) / 100) + for item in items[:nitems_for_date] + ] + levels.sort(key=lambda x: x[1]) + data[date] = levels + + return data + + +def _make_df_from_data(data): + rows = {} + for date in data: + for level in data[date]: + rows[(date, level[0])] = {"A": level[1], "B": level[2]} + + df = pd.DataFrame.from_dict(rows, orient="index") + df.index.names = ("Date", "Item") + return df + + +def test_multiindex(multiindex_data): + # Test the multiindex mentioned as the use-case in the documentation + df = _make_df_from_data(multiindex_data) + result = df.groupby("Date", as_index=False).nth(slice(3, -3)) + + sliced = {date: multiindex_data[date][3:-3] for date in multiindex_data} + expected = _make_df_from_data(sliced) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("arg", [1, 5, 30, 1000, -1, -5, -30, -1000]) +@pytest.mark.parametrize("method", ["head", "tail"]) +@pytest.mark.parametrize("simulated", [True, False]) +def test_against_head_and_tail(arg, method, simulated): + # Test gives the same results as grouped head and tail + n_groups = 100 + n_rows_per_group = 30 + + data = { + "group": [ + f"group {g}" for j in range(n_rows_per_group) for g in range(n_groups) + ], + "value": [ + f"group {g} row {j}" + for j in range(n_rows_per_group) + for g in range(n_groups) + ], + } + df = pd.DataFrame(data) + grouped = df.groupby("group", as_index=False) + size = arg if arg >= 0 else n_rows_per_group + arg + + if method == "head": + result = grouped._positional_selector[:arg] + + if simulated: + indices = [] + for j in range(size): + for i in range(n_groups): + if j * n_groups + i < n_groups * n_rows_per_group: + indices.append(j * n_groups + i) + + expected = df.iloc[indices] + + else: + expected = grouped.head(arg) + + else: + result = grouped._positional_selector[-arg:] + + if simulated: + indices = [] + for j in range(size): + for i in range(n_groups): + if (n_rows_per_group + j - size) * n_groups + i >= 0: + indices.append((n_rows_per_group + j - size) * n_groups + i) + + expected = df.iloc[indices] + + else: + expected = grouped.tail(arg) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("start", [None, 0, 1, 10, -1, -10]) +@pytest.mark.parametrize("stop", [None, 0, 1, 10, -1, -10]) +@pytest.mark.parametrize("step", [None, 1, 5]) +def test_against_df_iloc(start, stop, step): + # Test that a single group gives the same results as DataFame.iloc + n_rows = 30 + + data = { + "group": ["group 0"] * n_rows, + "value": list(range(n_rows)), + } + df = pd.DataFrame(data) + grouped = df.groupby("group", as_index=False) + + result = grouped._positional_selector[start:stop:step] + expected = df.iloc[start:stop:step] + + tm.assert_frame_equal(result, expected) + + +def test_series(): + # Test grouped Series + ser = pd.Series([1, 2, 3, 4, 5], index=["a", "a", "a", "b", "b"]) + grouped = ser.groupby(level=0) + result = grouped._positional_selector[1:2] + expected = pd.Series([2, 5], index=["a", "b"]) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("step", [1, 2, 3, 4, 5]) +def test_step(step): + # Test slice with various step values + data = [["x", f"x{i}"] for i in range(5)] + data += [["y", f"y{i}"] for i in range(4)] + data += [["z", f"z{i}"] for i in range(3)] + df = pd.DataFrame(data, columns=["A", "B"]) + + grouped = df.groupby("A", as_index=False) + + result = grouped._positional_selector[::step] + + data = [["x", f"x{i}"] for i in range(0, 5, step)] + data += [["y", f"y{i}"] for i in range(0, 4, step)] + data += [["z", f"z{i}"] for i in range(0, 3, step)] + + index = [0 + i for i in range(0, 5, step)] + index += [5 + i for i in range(0, 4, step)] + index += [9 + i for i in range(0, 3, step)] + + expected = pd.DataFrame(data, columns=["A", "B"], index=index) + + tm.assert_frame_equal(result, expected) + + +@pytest.fixture() +def column_group_df(): + return pd.DataFrame( + [[0, 1, 2, 3, 4, 5, 6], [0, 0, 1, 0, 1, 0, 2]], + columns=["A", "B", "C", "D", "E", "F", "G"], + ) + + +def test_column_axis(column_group_df): + g = column_group_df.groupby(column_group_df.iloc[1], axis=1) + result = g._positional_selector[1:-1] + expected = column_group_df.iloc[:, [1, 3]] + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 8742135da59e5..a5cb511763eee 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -270,7 +270,7 @@ def test_nth(): result = s.groupby(g, sort=False).nth(0, dropna="all") tm.assert_series_equal(result, expected) - with pytest.raises(ValueError, match="For a DataFrame groupby"): + with pytest.raises(ValueError, match="For a DataFrame"): s.groupby(g, sort=False).nth(0, dropna=True) # doc example @@ -517,11 +517,11 @@ def test_nth_multi_index_as_expected(): @pytest.mark.parametrize( "op, n, expected_rows", [ - ("head", -1, []), + ("head", -1, [0]), ("head", 0, []), ("head", 1, [0, 2]), ("head", 7, [0, 1, 2]), - ("tail", -1, []), + ("tail", -1, [1]), ("tail", 0, []), ("tail", 1, [1, 2]), ("tail", 7, [0, 1, 2]), @@ -543,11 +543,11 @@ def test_groupby_head_tail(op, n, expected_rows, columns, as_index): @pytest.mark.parametrize( "op, n, expected_cols", [ - ("head", -1, []), + ("head", -1, [0]), ("head", 0, []), ("head", 1, [0, 2]), ("head", 7, [0, 1, 2]), - ("tail", -1, []), + ("tail", -1, [1]), ("tail", 0, []), ("tail", 1, [1, 2]), ("tail", 7, [0, 1, 2]), @@ -708,6 +708,46 @@ def test_groupby_last_first_nth_with_none(method, nulls_fixture): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "arg, expected_rows", + [ + [slice(None, 3, 2), [0, 1, 4, 5]], + [slice(None, -2), [0, 2, 5]], + [[slice(None, 2), slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]], + [[0, 1, slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]], + ], +) +def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows): + # Test slices GH #42947 + + result = slice_test_grouped.nth(arg) + expected = slice_test_df.iloc[expected_rows] + + tm.assert_frame_equal(result, expected) + + +def test_invalid_argument(slice_test_grouped): + # Test for error on invalid argument + + with pytest.raises(TypeError, match="Invalid index"): + slice_test_grouped.nth(3.14) + + +def test_negative_step(slice_test_grouped): + # Test for error on negative slice step + + with pytest.raises(ValueError, match="Invalid step"): + slice_test_grouped.nth(slice(None, None, -1)) + + +def test_np_ints(slice_test_df, slice_test_grouped): + # Test np ints work + + result = slice_test_grouped.nth(np.array([0, 1])) + expected = slice_test_df.iloc[[0, 1, 2, 3, 4]] + tm.assert_frame_equal(result, expected) + + def test_groupby_nth_with_column_axis(): # GH43926 df = DataFrame(