From 79a58db38c108a355f1dfcca6142ed49e1a96060 Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Tue, 20 Oct 2020 03:30:22 +0200 Subject: [PATCH 1/3] Backport PR #37198: BUG: Regression in Resample.apply raised error when apply affected only a Series --- doc/source/whatsnew/v1.1.4.rst | 1 + pandas/core/resample.py | 3 ++- pandas/tests/resample/test_resampler_grouper.py | 15 +++++++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.4.rst b/doc/source/whatsnew/v1.1.4.rst index ad59711b90f6e..5303e8a00b53a 100644 --- a/doc/source/whatsnew/v1.1.4.rst +++ b/doc/source/whatsnew/v1.1.4.rst @@ -20,6 +20,7 @@ Fixed regressions - Fixed regression in :class:`RollingGroupby` with ``sort=False`` not being respected (:issue:`36889`) - Fixed regression in :meth:`Series.astype` converting ``None`` to ``"nan"`` when casting to string (:issue:`36904`) - Fixed regression in :class:`RollingGroupby` causing a segmentation fault with Index of dtype object (:issue:`36727`) +- Fixed regression in :meth:`DataFrame.resample(...).apply(...)` raising ``AttributeError`` when input was a :class:`DataFrame` and only a :class:`Series` was evaluated (:issue:`36951`) ..
--------------------------------------------------------------------------- diff --git a/pandas/core/resample.py b/pandas/core/resample.py index bfdfc65723433..0dfbf96947c33 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -365,8 +365,9 @@ def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): result = grouped._aggregate_item_by_item(how, *args, **kwargs) else: result = grouped.aggregate(how, *args, **kwargs) - except DataError: + except (DataError, AttributeError, KeyError): # we have a non-reducing function; try to evaluate + # alternatively we want to evaluate only a column of the input result = grouped.apply(how, *args, **kwargs) except ValueError as err: if "Must produce aggregated value" in str(err): diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index b36b11582c1ec..a7232dd5f8a1e 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -346,3 +346,18 @@ def test_median_duplicate_columns(): result = df.resample("5s").median() expected.columns = result.columns tm.assert_frame_equal(result, expected) + + +def test_apply_to_one_column_of_df(): + # GH: 36951 + df = pd.DataFrame( + {"col": range(10), "col1": range(10, 20)}, + index=pd.date_range("2012-01-01", periods=10, freq="20min"), + ) + result = df.resample("H").apply(lambda group: group.col.sum()) + expected = pd.Series( + [3, 12, 21, 9], index=pd.date_range("2012-01-01", periods=4, freq="H") + ) + tm.assert_series_equal(result, expected) + result = df.resample("H").apply(lambda group: group["col"].sum()) + tm.assert_series_equal(result, expected) From 1dc0795bfc8f832cddb575e9eacc59effe3ca7a9 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Tue, 25 Aug 2020 23:21:44 -0400 Subject: [PATCH 2/3] CLN/BUG: Clean/Simplify _wrap_applied_output (#35792) --- doc/source/whatsnew/v1.1.4.rst | 1 + 
pandas/core/groupby/generic.py | 90 +++++++++--------------------- pandas/core/indexes/api.py | 7 ++- pandas/tests/groupby/test_apply.py | 7 ++- 4 files changed, 34 insertions(+), 71 deletions(-) diff --git a/doc/source/whatsnew/v1.1.4.rst b/doc/source/whatsnew/v1.1.4.rst index eb68ca38ea5b6..3e4e6f530c7a7 100644 --- a/doc/source/whatsnew/v1.1.4.rst +++ b/doc/source/whatsnew/v1.1.4.rst @@ -35,6 +35,7 @@ Bug fixes - Bug in :meth:`Series.isin` and :meth:`DataFrame.isin` raising a ``ValueError`` when the target was read-only (:issue:`37174`) - Bug in :meth:`GroupBy.fillna` that introduced a performance regression after 1.0.5 (:issue:`36757`) - Bug in :meth:`DataFrame.info` was raising a ``KeyError`` when the DataFrame has integer column names (:issue:`37245`) +- Bug in :meth:`DataFrameGroupby.apply` would drop a :class:`CategoricalIndex` when grouped on (:issue:`35792`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 08c988fa05b6a..2e35bb94dfff6 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1219,57 +1219,25 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: return self.obj._constructor(index=keys) - key_names = self.grouper.names - # GH12824 first_not_none = next(com.not_none(*values), None) if first_not_none is None: - # GH9684. If all values are None, then this will throw an error. - # We'd prefer it return an empty dataframe. + # GH9684 - All values are None, return an empty frame. 
return self.obj._constructor() elif isinstance(first_not_none, DataFrame): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) else: - if len(self.grouper.groupings) > 1: - key_index = self.grouper.result_index - - else: - ping = self.grouper.groupings[0] - if len(keys) == ping.ngroups: - key_index = ping.group_index - key_index.name = key_names[0] - - key_lookup = Index(keys) - indexer = key_lookup.get_indexer(key_index) - - # reorder the values - values = [values[i] for i in indexer] - - # update due to the potential reorder - first_not_none = next(com.not_none(*values), None) - else: - - key_index = Index(keys, name=key_names[0]) - - # don't use the key indexer - if not self.as_index: - key_index = None + key_index = self.grouper.result_index if self.as_index else None - # make Nones an empty object - if first_not_none is None: - return self.obj._constructor() - elif isinstance(first_not_none, NDFrame): + if isinstance(first_not_none, Series): # this is to silence a DeprecationWarning # TODO: Remove when default dtype of empty Series is object kwargs = first_not_none._construct_axes_dict() - if isinstance(first_not_none, Series): - backup = create_series_with_explicit_dtype( - **kwargs, dtype_if_empty=object - ) - else: - backup = first_not_none._constructor(**kwargs) + backup = create_series_with_explicit_dtype( + **kwargs, dtype_if_empty=object + ) values = [x if (x is not None) else backup for x in values] @@ -1278,7 +1246,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if isinstance(v, (np.ndarray, Index, Series)) or not self.as_index: if isinstance(v, Series): applied_index = self._selected_obj._get_axis(self.axis) - all_indexed_same = all_indexes_same([x.index for x in values]) + all_indexed_same = all_indexes_same((x.index for x in values)) singular_series = len(values) == 1 and applied_index.nlevels == 1 # GH3596 @@ -1310,7 +1278,6 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): 
# GH 8467 return self._concat_objects(keys, values, not_indexed_same=True) - if self.axis == 0 and isinstance(v, ABCSeries): # GH6124 if the list of Series have a consistent name, # then propagate that name to the result. index = v.index.copy() @@ -1323,34 +1290,27 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(names) == 1: index.name = list(names)[0] - # normally use vstack as its faster than concat - # and if we have mi-columns - if ( - isinstance(v.index, MultiIndex) - or key_index is None - or isinstance(key_index, MultiIndex) - ): - stacked_values = np.vstack([np.asarray(v) for v in values]) - result = self.obj._constructor( - stacked_values, index=key_index, columns=index - ) - else: - # GH5788 instead of stacking; concat gets the - # dtypes correct - from pandas.core.reshape.concat import concat - - result = concat( - values, - keys=key_index, - names=key_index.names, - axis=self.axis, - ).unstack() - result.columns = index - elif isinstance(v, ABCSeries): + # Combine values + # vstack+constructor is faster than concat and handles MI-columns stacked_values = np.vstack([np.asarray(v) for v in values]) + + if self.axis == 0: + index = key_index + columns = v.index.copy() + if columns.name is None: + # GH6124 - propagate name of Series when it's consistent + names = {v.name for v in values} + if len(names) == 1: + columns.name = list(names)[0] + else: + index = v.index + columns = key_index + stacked_values = stacked_values.T + result = self.obj._constructor( - stacked_values.T, index=v.index, columns=key_index + stacked_values, index=index, columns=columns ) + elif not self.as_index: # We add grouping column below, so create a frame here result = DataFrame( diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 4c5a70f4088ee..678753f684141 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -298,15 +298,16 @@ def all_indexes_same(indexes): Parameters ---------- - indexes : list of 
Index objects + indexes : iterable of Index objects Returns ------- bool True if all indexes contain the same elements, False otherwise. """ - first = indexes[0] - for index in indexes[1:]: + itr = iter(indexes) + first = next(itr) + for index in itr: if not first.equals(index): return False return True diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 5a1268bfb03db..2af495a170bee 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -868,13 +868,14 @@ def test_apply_multi_level_name(category): b = [1, 2] * 5 if category: b = pd.Categorical(b, categories=[1, 2, 3]) + expected_index = pd.CategoricalIndex([1, 2], categories=[1, 2, 3], name="B") + else: + expected_index = pd.Index([1, 2], name="B") df = pd.DataFrame( {"A": np.arange(10), "B": b, "C": list(range(10)), "D": list(range(10))} ).set_index(["A", "B"]) result = df.groupby("B").apply(lambda x: x.sum()) - expected = pd.DataFrame( - {"C": [20, 25], "D": [20, 25]}, index=pd.Index([1, 2], name="B") - ) + expected = pd.DataFrame({"C": [20, 25], "D": [20, 25]}, index=expected_index) tm.assert_frame_equal(result, expected) assert df.index.names == ["A", "B"] From c4f6c1be18abea72b93bae5db905610928bacc26 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 26 Oct 2020 12:03:38 +0000 Subject: [PATCH 3/3] Revert "CLN/BUG: Clean/Simplify _wrap_applied_output (#35792)" This reverts commit 1dc0795bfc8f832cddb575e9eacc59effe3ca7a9. 
--- doc/source/whatsnew/v1.1.4.rst | 1 - pandas/core/groupby/generic.py | 90 +++++++++++++++++++++--------- pandas/core/indexes/api.py | 7 +-- pandas/tests/groupby/test_apply.py | 7 +-- 4 files changed, 71 insertions(+), 34 deletions(-) diff --git a/doc/source/whatsnew/v1.1.4.rst b/doc/source/whatsnew/v1.1.4.rst index 3e4e6f530c7a7..eb68ca38ea5b6 100644 --- a/doc/source/whatsnew/v1.1.4.rst +++ b/doc/source/whatsnew/v1.1.4.rst @@ -35,7 +35,6 @@ Bug fixes - Bug in :meth:`Series.isin` and :meth:`DataFrame.isin` raising a ``ValueError`` when the target was read-only (:issue:`37174`) - Bug in :meth:`GroupBy.fillna` that introduced a performance regression after 1.0.5 (:issue:`36757`) - Bug in :meth:`DataFrame.info` was raising a ``KeyError`` when the DataFrame has integer column names (:issue:`37245`) -- Bug in :meth:`DataFrameGroupby.apply` would drop a :class:`CategoricalIndex` when grouped on (:issue:`35792`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 2e35bb94dfff6..08c988fa05b6a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1219,25 +1219,57 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: return self.obj._constructor(index=keys) + key_names = self.grouper.names + # GH12824 first_not_none = next(com.not_none(*values), None) if first_not_none is None: - # GH9684 - All values are None, return an empty frame. + # GH9684. If all values are None, then this will throw an error. + # We'd prefer it return an empty dataframe. 
return self.obj._constructor() elif isinstance(first_not_none, DataFrame): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) else: - key_index = self.grouper.result_index if self.as_index else None + if len(self.grouper.groupings) > 1: + key_index = self.grouper.result_index + + else: + ping = self.grouper.groupings[0] + if len(keys) == ping.ngroups: + key_index = ping.group_index + key_index.name = key_names[0] + + key_lookup = Index(keys) + indexer = key_lookup.get_indexer(key_index) + + # reorder the values + values = [values[i] for i in indexer] + + # update due to the potential reorder + first_not_none = next(com.not_none(*values), None) + else: + + key_index = Index(keys, name=key_names[0]) + + # don't use the key indexer + if not self.as_index: + key_index = None - if isinstance(first_not_none, Series): + # make Nones an empty object + if first_not_none is None: + return self.obj._constructor() + elif isinstance(first_not_none, NDFrame): # this is to silence a DeprecationWarning # TODO: Remove when default dtype of empty Series is object kwargs = first_not_none._construct_axes_dict() - backup = create_series_with_explicit_dtype( - **kwargs, dtype_if_empty=object - ) + if isinstance(first_not_none, Series): + backup = create_series_with_explicit_dtype( + **kwargs, dtype_if_empty=object + ) + else: + backup = first_not_none._constructor(**kwargs) values = [x if (x is not None) else backup for x in values] @@ -1246,7 +1278,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if isinstance(v, (np.ndarray, Index, Series)) or not self.as_index: if isinstance(v, Series): applied_index = self._selected_obj._get_axis(self.axis) - all_indexed_same = all_indexes_same((x.index for x in values)) + all_indexed_same = all_indexes_same([x.index for x in values]) singular_series = len(values) == 1 and applied_index.nlevels == 1 # GH3596 @@ -1278,6 +1310,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): 
# GH 8467 return self._concat_objects(keys, values, not_indexed_same=True) + if self.axis == 0 and isinstance(v, ABCSeries): # GH6124 if the list of Series have a consistent name, # then propagate that name to the result. index = v.index.copy() @@ -1290,27 +1323,34 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(names) == 1: index.name = list(names)[0] - # Combine values - # vstack+constructor is faster than concat and handles MI-columns - stacked_values = np.vstack([np.asarray(v) for v in values]) - - if self.axis == 0: - index = key_index - columns = v.index.copy() - if columns.name is None: - # GH6124 - propagate name of Series when it's consistent - names = {v.name for v in values} - if len(names) == 1: - columns.name = list(names)[0] + # normally use vstack as its faster than concat + # and if we have mi-columns + if ( + isinstance(v.index, MultiIndex) + or key_index is None + or isinstance(key_index, MultiIndex) + ): + stacked_values = np.vstack([np.asarray(v) for v in values]) + result = self.obj._constructor( + stacked_values, index=key_index, columns=index + ) else: - index = v.index - columns = key_index - stacked_values = stacked_values.T - + # GH5788 instead of stacking; concat gets the + # dtypes correct + from pandas.core.reshape.concat import concat + + result = concat( + values, + keys=key_index, + names=key_index.names, + axis=self.axis, + ).unstack() + result.columns = index + elif isinstance(v, ABCSeries): + stacked_values = np.vstack([np.asarray(v) for v in values]) result = self.obj._constructor( - stacked_values, index=index, columns=columns + stacked_values.T, index=v.index, columns=key_index ) - elif not self.as_index: # We add grouping column below, so create a frame here result = DataFrame( diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 678753f684141..4c5a70f4088ee 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -298,16 +298,15 @@ def 
all_indexes_same(indexes): Parameters ---------- - indexes : iterable of Index objects + indexes : list of Index objects Returns ------- bool True if all indexes contain the same elements, False otherwise. """ - itr = iter(indexes) - first = next(itr) - for index in itr: + first = indexes[0] + for index in indexes[1:]: if not first.equals(index): return False return True diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 2af495a170bee..5a1268bfb03db 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -868,14 +868,13 @@ def test_apply_multi_level_name(category): b = [1, 2] * 5 if category: b = pd.Categorical(b, categories=[1, 2, 3]) - expected_index = pd.CategoricalIndex([1, 2], categories=[1, 2, 3], name="B") - else: - expected_index = pd.Index([1, 2], name="B") df = pd.DataFrame( {"A": np.arange(10), "B": b, "C": list(range(10)), "D": list(range(10))} ).set_index(["A", "B"]) result = df.groupby("B").apply(lambda x: x.sum()) - expected = pd.DataFrame({"C": [20, 25], "D": [20, 25]}, index=expected_index) + expected = pd.DataFrame( + {"C": [20, 25], "D": [20, 25]}, index=pd.Index([1, 2], name="B") + ) tm.assert_frame_equal(result, expected) assert df.index.names == ["A", "B"]