BUG: Groupby with as_index=False raises error when type is Category (#34767)

rhshadrach · web-flow · commit 0f179707302f · 2020-06-15T08:29:18.000-04:00
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -676,6 +676,25 @@ Using :meth:`DataFrame.groupby` with ``as_index=False`` and the function ``idxma
 
    df.groupby("a", as_index=False).nunique()
 
+The method :meth:`core.DataFrameGroupBy.size` previously ignored ``as_index=False``. Now the grouping columns are returned as columns, making the result a :class:`DataFrame` instead of a :class:`Series`. (:issue:`32599`)
+
+*Previous behavior*:
+
+.. code-block:: ipython
+
+   In [3]: df.groupby("a", as_index=False).size()
+   Out[3]:
+   a
+   x    2
+   y    2
+   dtype: int64
+
+*New behavior*:
+
+.. ipython:: python
+
+   df.groupby("a", as_index=False).size()
+
 .. _whatsnew_110.api_breaking.apply_applymap_first_once:
 
 apply and applymap on ``DataFrame`` evaluates first row/column only once
@@ -995,6 +1014,7 @@ Groupby/resample/rolling
 - Bug in :meth:`SeriesGroupBy.agg` where any column name was accepted in the named aggregation of ``SeriesGroupBy`` previously. The behaviour now allows only ``str`` and callables else would raise ``TypeError``. (:issue:`34422`)
 - Bug in :meth:`DataFrame.groupby` lost index, when one of the ``agg`` keys referenced an empty list (:issue:`32580`)
 
+
 Reshaping
 ^^^^^^^^^
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -5450,7 +5450,7 @@ def value_counts(
         if subset is None:
             subset = self.columns.tolist()
 
-        counts = self.groupby(subset).size()
+        counts = self.groupby(subset).grouper.size()
 
         if sort:
             counts = counts.sort_values(ascending=ascending)
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -942,9 +942,9 @@ def _transform_should_cast(self, func_nm: str) -> bool:
         bool
             Whether transform should attempt to cast the result of aggregation
         """
-        return (self.size().fillna(0) > 0).any() and (
-            func_nm not in base.cython_cast_blacklist
-        )
+        filled_series = self.grouper.size().fillna(0)
+        assert filled_series is not None
+        return filled_series.gt(0).any() and func_nm not in base.cython_cast_blacklist
 
     def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs):
         output: Dict[base.OutputKey, np.ndarray] = {}
@@ -1507,14 +1507,15 @@ def sem(self, ddof: int = 1):
 
     @Substitution(name="groupby")
     @Appender(_common_see_also)
-    def size(self):
+    def size(self) -> FrameOrSeriesUnion:
         """
         Compute group sizes.
 
         Returns
         -------
-        Series
-            Number of rows in each group.
+        DataFrame or Series
+            Number of rows in each group as a Series if as_index is True
+            or a DataFrame if as_index is False.
         """
         result = self.grouper.size()
 
@@ -1523,6 +1524,10 @@ def size(self):
             result = self._obj_1d_constructor(result, name=self.obj.name)
         else:
             result = self._obj_1d_constructor(result)
+
+        if not self.as_index:
+            result = result.rename("size").reset_index()
+
         return self._reindex_output(result, fill_value=0)
 
     @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -668,11 +668,14 @@ def test_ops_not_as_index(reduction_func):
     if reduction_func in ("corrwith",):
         pytest.skip("Test not applicable")
 
-    if reduction_func in ("nth", "ngroup", "size",):
+    if reduction_func in ("nth", "ngroup",):
         pytest.skip("Skip until behavior is determined (GH #5755)")
 
     df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"])
-    expected = getattr(df.groupby("a"), reduction_func)().reset_index()
+    expected = getattr(df.groupby("a"), reduction_func)()
+    if reduction_func == "size":
+        expected = expected.rename("size")
+    expected = expected.reset_index()
 
     g = df.groupby("a", as_index=False)
 
diff --git a/pandas/tests/groupby/test_size.py b/pandas/tests/groupby/test_size.py
@@ -44,3 +44,19 @@ def test_size_period_index():
     grp = ser.groupby(level="A")
     result = grp.size()
     tm.assert_series_equal(result, ser)
+
+
+@pytest.mark.parametrize("as_index", [True, False])
+def test_size_on_categorical(as_index):
+    df = DataFrame([[1, 1], [2, 2]], columns=["A", "B"])
+    df["A"] = df["A"].astype("category")
+    result = df.groupby(["A", "B"], as_index=as_index).size()
+
+    expected = DataFrame(
+        [[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]], columns=["A", "B", "size"],
+    )
+    expected["A"] = expected["A"].astype("category")
+    if as_index:
+        expected = expected.set_index(["A", "B"])["size"].rename(None)
+
+    tm.assert_equal(result, expected)