From a9f56fc3b0b72775bb98a582656c71a798349240 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Tue, 28 Jul 2020 22:14:01 +0100 Subject: [PATCH 1/9] mutated=T/F now works for axis 0 or 1 --- pandas/core/groupby/ops.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 3aaeef3b63760..2cb4674b2e33a 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -211,7 +211,7 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0): # group might be modified group_axes = group.axes res = f(group) - if not _is_indexed_like(res, group_axes): + if not _is_indexed_like(res, group_axes, axis): mutated = True result_values.append(res) @@ -897,13 +897,13 @@ def agg_series( return grouper.get_result() -def _is_indexed_like(obj, axes) -> bool: +def _is_indexed_like(obj, axes, axis: int) -> bool: if isinstance(obj, Series): if len(axes) > 1: return False - return obj.index.equals(axes[0]) + return obj.axes[axis].equals(axes[axis]) elif isinstance(obj, DataFrame): - return obj.index.equals(axes[0]) + return obj.axes[axis].equals(axes[axis]) return False From 4ea646171672643aea1b1e84118dd366cfe15163 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Tue, 28 Jul 2020 23:47:48 +0100 Subject: [PATCH 2/9] 1 new test, remove xfail from existing test --- pandas/tests/groupby/test_apply.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 5a1268bfb03db..570ebc35d5c99 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -63,11 +63,6 @@ def test_apply_trivial(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail( - reason="GH#20066; function passed into apply " - "returns a DataFrame with the same index " - "as the one to create GroupBy object." 
-) def test_apply_trivial_fail(): # GH 20066 # trivial apply fails if the constant dataframe has the same index @@ -1014,3 +1009,23 @@ def test_apply_with_timezones_aware(): result2 = df2.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy()) tm.assert_frame_equal(result1, result2) + + +def test_apply_by_cols_equals_apply_by_rows_transposed(): + # GH 16646 + # Operating on the columns, or transposing and operating on the rows + # should give the same result. There was previously a bug where the + # by_rows operation would work fine, but by_cols would throw a ValueError + + df = pd.DataFrame( + np.random.random([6, 4]), + columns=pd.MultiIndex.from_product([["A", "B"], [1, 2]]), + ) + + by_rows = df.T.groupby(axis=0, level=0).apply( + lambda x: x.droplevel(axis=0, level=0) + ) + by_cols = df.groupby(axis=1, level=0).apply(lambda x: x.droplevel(axis=1, level=0)) + + tm.assert_frame_equal(by_cols, by_rows.T) + tm.assert_frame_equal(by_cols, df) From 741d8fecd349f02cca2985a90d37c6db9b45a483 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Wed, 29 Jul 2020 00:01:27 +0100 Subject: [PATCH 3/9] whatsnew plust update old test --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/tests/groupby/test_function.py | 5 +---- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index a49b29d691692..06e53bb947c96 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1131,6 +1131,7 @@ Groupby/resample/rolling - Bug in :meth:`core.groupby.DataFrameGroupBy.transform` when ``func='nunique'`` and columns are of type ``datetime64``, the result would also be of type ``datetime64`` instead of ``int64`` (:issue:`35109`) - Bug in :meth:`DataFrame.groupby` raising an ``AttributeError`` when selecting a column and aggregating with ``as_index=False`` (:issue:`35246`). 
- Bug in :meth:`DataFrameGroupBy.first` and :meth:`DataFrameGroupBy.last` that would raise an unnecessary ``ValueError`` when grouping on multiple ``Categoricals`` (:issue:`34951`) +- Bug in :meth:`DataFrameGroupBy.apply` that would some times throw an erroneous ``ValueError: cannot reindex from a duplicate axis`` (:issue:`16646`) Reshaping ^^^^^^^^^ diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index e693962e57ac3..a4ea0340990fe 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -940,10 +940,7 @@ def test_frame_describe_multikey(tsframe): groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) result = groupedT.describe() expected = tsframe.describe().T - expected.index = pd.MultiIndex( - levels=[[0, 1], expected.index], - codes=[[0, 0, 1, 1], range(len(expected.index))], - ) + tm.assert_frame_equal(result, expected) From 364339c4807d336e269afcefe937c770b544cfcc Mon Sep 17 00:00:00 2001 From: smithto1 Date: Wed, 29 Jul 2020 10:59:27 +0100 Subject: [PATCH 4/9] update on v1.2.0 --- doc/source/whatsnew/v1.1.0.rst | 1 - doc/source/whatsnew/v1.2.0.rst | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 06e53bb947c96..a49b29d691692 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1131,7 +1131,6 @@ Groupby/resample/rolling - Bug in :meth:`core.groupby.DataFrameGroupBy.transform` when ``func='nunique'`` and columns are of type ``datetime64``, the result would also be of type ``datetime64`` instead of ``int64`` (:issue:`35109`) - Bug in :meth:`DataFrame.groupby` raising an ``AttributeError`` when selecting a column and aggregating with ``as_index=False`` (:issue:`35246`). 
- Bug in :meth:`DataFrameGroupBy.first` and :meth:`DataFrameGroupBy.last` that would raise an unnecessary ``ValueError`` when grouping on multiple ``Categoricals`` (:issue:`34951`) -- Bug in :meth:`DataFrameGroupBy.apply` that would some times throw an erroneous ``ValueError: cannot reindex from a duplicate axis`` (:issue:`16646`) Reshaping ^^^^^^^^^ diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 2066858e5de86..e9c03bcc7ca70 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -132,6 +132,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ +- Bug in :meth:`DataFrameGroupBy.apply` that would some times throw an erroneous ``ValueError: cannot reindex from a duplicate axis`` (:issue:`16646`) - - From 3c7f8903936a817ae13e3cc42efe22f317ae9ff1 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Wed, 29 Jul 2020 11:08:30 +0100 Subject: [PATCH 5/9] updated test comment --- pandas/tests/groupby/test_apply.py | 5 ++--- pandas/tests/groupby/test_function.py | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 570ebc35d5c99..73c72a8a14b03 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -64,9 +64,8 @@ def test_apply_trivial(): def test_apply_trivial_fail(): - # GH 20066 - # trivial apply fails if the constant dataframe has the same index - # with the one used to create GroupBy object. 
+ # GH 35441 + # test passes, xfail removed df = pd.DataFrame( {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index a4ea0340990fe..fc9adefe51f16 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -940,7 +940,6 @@ def test_frame_describe_multikey(tsframe): groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) result = groupedT.describe() expected = tsframe.describe().T - tm.assert_frame_equal(result, expected) From 3d4431fc7f3807985e73bf50c377740c206e0004 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Fri, 31 Jul 2020 10:34:02 +0100 Subject: [PATCH 6/9] addressing comments --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/tests/groupby/test_apply.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e9c03bcc7ca70..ea2fcd037a029 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -132,7 +132,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- Bug in :meth:`DataFrameGroupBy.apply` that would some times throw an erroneous ``ValueError: cannot reindex from a duplicate axis`` (:issue:`16646`) +- Bug in :meth:`DataFrameGroupBy.apply` that would sometimes throw an erroneous ``ValueError`` if the grouping axis had duplicate entries (:issue:`16646`) - - diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 73c72a8a14b03..a341342cd808f 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -65,7 +65,6 @@ def test_apply_trivial(): def test_apply_trivial_fail(): # GH 35441 - # test passes, xfail removed df = pd.DataFrame( {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], From 0204cdec5353c5de3b94ff28263a8b83cc89315e Mon Sep 17 
00:00:00 2001 From: smithto1 Date: Sat, 1 Aug 2020 23:53:00 +0100 Subject: [PATCH 7/9] revert comment --- pandas/tests/groupby/test_apply.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index a341342cd808f..a35f43d6fc7a6 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -64,7 +64,7 @@ def test_apply_trivial(): def test_apply_trivial_fail(): - # GH 35441 + # GH 20066 df = pd.DataFrame( {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], From ea1348003dea41035136eec12e5cabb4b82ad30b Mon Sep 17 00:00:00 2001 From: smithto1 Date: Sun, 2 Aug 2020 19:41:36 +0100 Subject: [PATCH 8/9] amend comment to start tests --- pandas/tests/groupby/test_apply.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index a35f43d6fc7a6..56c5fc8fb2c8e 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1010,7 +1010,7 @@ def test_apply_with_timezones_aware(): def test_apply_by_cols_equals_apply_by_rows_transposed(): - # GH 16646 + # GH #16646 # Operating on the columns, or transposing and operating on the rows # should give the same result. 
There was previously a bug where the # by_rows operation would work fine, but by_cols would throw a ValueError From 5122d1502c4e108c93058715bfdeef197a9f5064 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Wed, 5 Aug 2020 18:45:42 +0100 Subject: [PATCH 9/9] restart tests --- pandas/tests/groupby/test_apply.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index e37c4466132e0..665cd12225ad7 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1040,7 +1040,7 @@ def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp(): def test_apply_by_cols_equals_apply_by_rows_transposed(): - # GH #16646 + # GH 16646 # Operating on the columns, or transposing and operating on the rows # should give the same result. There was previously a bug where the # by_rows operation would work fine, but by_cols would throw a ValueError