diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index b7280a9f7db3c..449099e5ce073 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1219,171 +1219,107 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
         if len(keys) == 0:
             return self.obj._constructor(index=keys)
 
-        key_names = self.grouper.names
-
-        # GH12824
+        # GH12824 - If first value is None, can't assume all are None
         first_not_none = next(com.not_none(*values), None)
 
         if first_not_none is None:
-            # GH9684. If all values are None, then this will throw an error.
-            # We'd prefer it return an empty dataframe.
+            # GH9684 - All values are None, return an empty frame.
             return self.obj._constructor()
-        elif isinstance(first_not_none, DataFrame):
+
+        if isinstance(first_not_none, DataFrame):
             return self._concat_objects(keys, values, not_indexed_same=not_indexed_same)
-        else:
-            if len(self.grouper.groupings) > 1:
-                key_index = self.grouper.result_index
 
+        key_index = self.grouper.result_index if self.as_index else None
+
+        if not isinstance(first_not_none, (Series, np.ndarray, Index)):
+
+            # values are not series or array-like but scalars
+            # self._selection_name not passed through to Series as the
+            # result should not take the name of original selection
+            # of columns
+            if self.as_index:
+                return self.obj._constructor_sliced(values, index=key_index)
             else:
-                ping = self.grouper.groupings[0]
-                if len(keys) == ping.ngroups:
-                    key_index = ping.group_index
-                    key_index.name = key_names[0]
+                result = DataFrame(values, index=key_index, columns=[self._selection])
+                self._insert_inaxis_grouper_inplace(result)
+                return result
 
-                    key_lookup = Index(keys)
-                    indexer = key_lookup.get_indexer(key_index)
+        elif not isinstance(first_not_none, Series):
 
-                    # reorder the values
-                    values = [values[i] for i in indexer]
+            # GH1738: values is list of arrays of unequal lengths
+            # TODO: sure this is right? we used to do this
+            # after raising AttributeError above
+            return self.obj._constructor_sliced(
+                values, index=key_index, name=self._selection_name
+            )
 
-                    # update due to the potential reorder
-                    first_not_none = next(com.not_none(*values), None)
-                else:
+        # this is to silence a DeprecationWarning
+        # TODO: Replace when default dtype of empty Series is object
+        # with backup = first_not_none._constructor(**kwargs)
+        kwargs = first_not_none._construct_axes_dict()
+        backup = create_series_with_explicit_dtype(**kwargs, dtype_if_empty=object)
+        values = [x if (x is not None) else backup for x in values]
+
+        v = values[0]
+        all_indexed_same = all_indexes_same((x.index for x in values))
+
+        # GH3596 - provide a reduction (Frame -> Series) if groups are unique
+        if self.squeeze:
+            # assign the name to this series
+            applied_index = self._selected_obj._get_axis(self.axis)
+            if len(values) == 1 and applied_index.nlevels == 1:
+                v.name = keys[0]
+
+                # GH2893
+                # we have series in the values array, we want to
+                # produce a series:
+                # if any of the sub-series are not indexed the same
+                # OR we don't have a multi-index and we have only a
+                # single values
+                return self._concat_objects(
+                    keys, values, not_indexed_same=not_indexed_same
+                )
 
-                    key_index = Index(keys, name=key_names[0])
+            # still a series
+            # path added as of GH 5545
+            elif all_indexed_same:
+                from pandas.core.reshape.concat import concat
 
-            # don't use the key indexer
-            if not self.as_index:
-                key_index = None
+                return concat(values)
 
-            # make Nones an empty object
-            if first_not_none is None:
-                return self.obj._constructor()
-            elif isinstance(first_not_none, NDFrame):
+        if not all_indexed_same:
+            # GH 8467
+            return self._concat_objects(keys, values, not_indexed_same=True)
 
-                # this is to silence a DeprecationWarning
-                # TODO: Remove when default dtype of empty Series is object
-                kwargs = first_not_none._construct_axes_dict()
-                if isinstance(first_not_none, Series):
-                    backup = create_series_with_explicit_dtype(
-                        **kwargs, dtype_if_empty=object
-                    )
-                else:
-                    backup = first_not_none._constructor(**kwargs)
-
-                values = [x if (x is not None) else backup for x in values]
-
-                v = values[0]
-
-                if isinstance(v, (np.ndarray, Index, Series)) or not self.as_index:
-                    if isinstance(v, Series):
-                        applied_index = self._selected_obj._get_axis(self.axis)
-                        all_indexed_same = all_indexes_same([x.index for x in values])
-                        singular_series = len(values) == 1 and applied_index.nlevels == 1
-
-                        # GH3596
-                        # provide a reduction (Frame -> Series) if groups are
-                        # unique
-                        if self.squeeze:
-                            # assign the name to this series
-                            if singular_series:
-                                values[0].name = keys[0]
-
-                                # GH2893
-                                # we have series in the values array, we want to
-                                # produce a series:
-                                # if any of the sub-series are not indexed the same
-                                # OR we don't have a multi-index and we have only a
-                                # single values
-                                return self._concat_objects(
-                                    keys, values, not_indexed_same=not_indexed_same
-                                )
-
-                            # still a series
-                            # path added as of GH 5545
-                            elif all_indexed_same:
-                                from pandas.core.reshape.concat import concat
-
-                                return concat(values)
-
-                        if not all_indexed_same:
-                            # GH 8467
-                            return self._concat_objects(keys, values, not_indexed_same=True)
-
-                    if self.axis == 0 and isinstance(v, ABCSeries):
-                        # GH6124 if the list of Series have a consistent name,
-                        # then propagate that name to the result.
-                        index = v.index.copy()
-                        if index.name is None:
-                            # Only propagate the series name to the result
-                            # if all series have a consistent name. If the
-                            # series do not have a consistent name, do
-                            # nothing.
-                            names = {v.name for v in values}
-                            if len(names) == 1:
-                                index.name = list(names)[0]
-
-                        # normally use vstack as its faster than concat
-                        # and if we have mi-columns
-                        if (
-                            isinstance(v.index, MultiIndex)
-                            or key_index is None
-                            or isinstance(key_index, MultiIndex)
-                        ):
-                            stacked_values = np.vstack([np.asarray(v) for v in values])
-                            result = self.obj._constructor(
-                                stacked_values, index=key_index, columns=index
-                            )
-                        else:
-                            # GH5788 instead of stacking; concat gets the
-                            # dtypes correct
-                            from pandas.core.reshape.concat import concat
-
-                            result = concat(
-                                values,
-                                keys=key_index,
-                                names=key_index.names,
-                                axis=self.axis,
-                            ).unstack()
-                            result.columns = index
-                    elif isinstance(v, ABCSeries):
-                        stacked_values = np.vstack([np.asarray(v) for v in values])
-                        result = self.obj._constructor(
-                            stacked_values.T, index=v.index, columns=key_index
-                        )
-                    elif not self.as_index:
-                        # We add grouping column below, so create a frame here
-                        result = DataFrame(
-                            values, index=key_index, columns=[self._selection]
-                        )
-                    else:
-                        # GH#1738: values is list of arrays of unequal lengths
-                        # fall through to the outer else clause
-                        # TODO: sure this is right? we used to do this
-                        # after raising AttributeError above
-                        return self.obj._constructor_sliced(
-                            values, index=key_index, name=self._selection_name
-                        )
+        stacked_values = np.vstack([np.asarray(v) for v in values])
 
-                    # if we have date/time like in the original, then coerce dates
-                    # as we are stacking can easily have object dtypes here
-                    so = self._selected_obj
-                    if so.ndim == 2 and so.dtypes.apply(needs_i8_conversion).any():
-                        result = _recast_datetimelike_result(result)
-                    else:
-                        result = result._convert(datetime=True)
+        if self.axis == 0:
+            index = key_index
+            columns = v.index.copy()
+            if columns.name is None:
+                # GH6124 - propagate name of Series when it's consistent
+                names = {v.name for v in values}
+                if len(names) == 1:
+                    columns.name = list(names)[0]
+        else:
+            index = v.index
+            columns = key_index
+            stacked_values = stacked_values.T
+
+        result = self.obj._constructor(stacked_values, index=index, columns=columns)
 
-                    if not self.as_index:
-                        self._insert_inaxis_grouper_inplace(result)
+        # if we have date/time like in the original, then coerce dates
+        # as we are stacking can easily have object dtypes here
+        so = self._selected_obj
+        if so.ndim == 2 and so.dtypes.apply(needs_i8_conversion).any():
+            result = _recast_datetimelike_result(result)
+        else:
+            result = result._convert(datetime=True)
 
-                    return self._reindex_output(result)
+        if not self.as_index:
+            self._insert_inaxis_grouper_inplace(result)
 
-            # values are not series or array-like but scalars
-            else:
-                # self._selection_name not passed through to Series as the
-                # result should not take the name of original selection
-                # of columns
-                return self.obj._constructor_sliced(values, index=key_index)
+        return self._reindex_output(result)
 
     def _transform_general(
         self, func, *args, engine="cython", engine_kwargs=None, **kwargs
diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py
index 30cc8cf480dcf..d352b001f5d2a 100644
--- a/pandas/core/indexes/api.py
+++ b/pandas/core/indexes/api.py
@@ -297,15 +297,16 @@ def all_indexes_same(indexes):
 
     Parameters
     ----------
-    indexes : list of Index objects
+    indexes : iterable of Index objects
 
     Returns
    -------
     bool
         True if all indexes contain the same elements, False otherwise.
     """
-    first = indexes[0]
-    for index in indexes[1:]:
+    itr = iter(indexes)
+    first = next(itr)
+    for index in itr:
         if not first.equals(index):
             return False
     return True
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
index ee38722ffb8ce..6b3ad9844ad76 100644
--- a/pandas/tests/groupby/test_apply.py
+++ b/pandas/tests/groupby/test_apply.py
@@ -861,13 +861,15 @@ def test_apply_multi_level_name(category):
     b = [1, 2] * 5
     if category:
         b = pd.Categorical(b, categories=[1, 2, 3])
+        expected_index = pd.CategoricalIndex([1, 2], categories=[1, 2, 3], name="B")
+    else:
+        expected_index = pd.Index([1, 2], name="B")
     df = pd.DataFrame(
         {"A": np.arange(10), "B": b, "C": list(range(10)), "D": list(range(10))}
     ).set_index(["A", "B"])
     result = df.groupby("B").apply(lambda x: x.sum())
-    expected = pd.DataFrame(
-        {"C": [20, 25], "D": [20, 25]}, index=pd.Index([1, 2], name="B")
-    )
+
+    expected = pd.DataFrame({"C": [20, 25], "D": [20, 25]}, index=expected_index)
     tm.assert_frame_equal(result, expected)
     assert df.index.names == ["A", "B"]