[BUG] Aggregated bool has inconsistent dtype

rhshadrach · rhshadrach · commit 38c6864185e3 · 2020-03-24T19:06:13.000-04:00
Addresses: GH7001
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -30,6 +30,7 @@
     ensure_int64,
     ensure_object,
     ensure_str,
+    groupby_result_dtype,
     is_bool,
     is_bool_dtype,
     is_complex,
@@ -172,7 +173,9 @@ def maybe_downcast_to_dtype(result, dtype):
     return result
 
 
-def maybe_downcast_numeric(result, dtype, do_round: bool = False):
+def maybe_downcast_numeric(
+    result, dtype, do_round: bool = False, how: str = "",
+):
     """
     Subset of maybe_downcast_to_dtype restricted to numeric dtypes.
 
@@ -181,6 +184,7 @@ def maybe_downcast_numeric(result, dtype, do_round: bool = False):
     result : ndarray or ExtensionArray
     dtype : np.dtype or ExtensionDtype
     do_round : bool
+    how : str
 
     Returns
     -------
@@ -195,6 +199,8 @@ def maybe_downcast_numeric(result, dtype, do_round: bool = False):
         #  earlier
         result = np.array(result)
 
+    dtype = groupby_result_dtype(dtype, how)
+
     def trans(x):
         if do_round:
             return x.round()
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
@@ -1788,3 +1788,27 @@ def pandas_dtype(dtype) -> DtypeObj:
         raise TypeError(f"dtype '{dtype}' not understood")
 
     return npdtype
+
+
+def groupby_result_dtype(dtype, how) -> DtypeObj:
+    """
+    Get the desired dtype of a groupby result from the input dtype
+    and the name of the operation being performed.
+
+    Parameters
+    ----------
+    dtype : np.dtype, ExtensionDtype or type
+        The input dtype for the groupby.
+    how : str
+        Name of the operation, e.g. "sum" or "cumsum".
+
+    Returns
+    -------
+    np.dtype, ExtensionDtype or type
+        int64 for bool input with "add", "cumsum" or "sum"; else ``dtype``.
+    """
+    # Compare with ``==`` so ``bool``/``np.bool_`` match too, and spell the
+    # dtype as ``np.bool_`` because the ``np.bool`` alias is deprecated.
+    if how in ("add", "cumsum", "sum") and dtype == np.dtype(np.bool_):
+        return np.dtype(np.int64)
+    return dtype
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -526,7 +526,7 @@ def _transform_fast(self, result, func_nm: str) -> Series:
         cast = self._transform_should_cast(func_nm)
         out = algorithms.take_1d(result._values, ids)
         if cast:
-            out = self._try_cast(out, self.obj)
+            out = self._try_cast(out, self.obj, how=func_nm)
         return Series(out, index=self.obj.index, name=self.obj.name)
 
     def filter(self, func, dropna=True, *args, **kwargs):
@@ -1073,7 +1073,7 @@ def _cython_agg_blocks(
 
             if result is not no_result:
                 # see if we can cast the block back to the original dtype
-                result = maybe_downcast_numeric(result, block.dtype)
+                result = maybe_downcast_numeric(result, block.dtype, how=how)
 
                 if block.is_extension and isinstance(result, np.ndarray):
                     # e.g. block.values was an IntegerArray
@@ -1460,7 +1460,7 @@ def _transform_fast(self, result: DataFrame, func_nm: str) -> DataFrame:
             # TODO: we have no test cases that get here with EA dtypes;
             #  try_cast may not be needed if EAs never get here
             if cast:
-                res = self._try_cast(res, obj.iloc[:, i])
+                res = self._try_cast(res, obj.iloc[:, i], how=func_nm)
             output.append(res)
 
         return DataFrame._from_arrays(output, columns=result.columns, index=obj.index)
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -42,6 +42,7 @@ class providing the base-class of operations.
 from pandas.core.dtypes.cast import maybe_downcast_to_dtype
 from pandas.core.dtypes.common import (
     ensure_float,
+    groupby_result_dtype,
     is_datetime64_dtype,
     is_extension_array_dtype,
     is_integer_dtype,
@@ -792,7 +793,7 @@ def _cumcount_array(self, ascending: bool = True):
         rev[sorter] = np.arange(count, dtype=np.intp)
         return out[rev].astype(np.int64, copy=False)
 
-    def _try_cast(self, result, obj, numeric_only: bool = False):
+    def _try_cast(self, result, obj, numeric_only: bool = False, how: str = ""):
         """
         Try to cast the result to our obj original type,
         we may have roundtripped through object in the mean-time.
@@ -806,6 +807,8 @@ def _try_cast(self, result, obj, numeric_only: bool = False):
         else:
             dtype = obj.dtype
 
+        dtype = groupby_result_dtype(dtype, how)
+
         if not is_scalar(result):
             if is_extension_array_dtype(dtype) and dtype.kind != "M":
                 # The function can return something of any type, so check
@@ -852,7 +855,7 @@ def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs):
                 continue
 
             if self._transform_should_cast(how):
-                result = self._try_cast(result, obj)
+                result = self._try_cast(result, obj, how=how)
 
             key = base.OutputKey(label=name, position=idx)
             output[key] = result
@@ -895,12 +898,12 @@ def _cython_agg_general(
                 assert len(agg_names) == result.shape[1]
                 for result_column, result_name in zip(result.T, agg_names):
                     key = base.OutputKey(label=result_name, position=idx)
-                    output[key] = self._try_cast(result_column, obj)
+                    output[key] = self._try_cast(result_column, obj, how=how)
                     idx += 1
             else:
                 assert result.ndim == 1
                 key = base.OutputKey(label=name, position=idx)
-                output[key] = self._try_cast(result, obj)
+                output[key] = self._try_cast(result, obj, how=how)
                 idx += 1
 
         if len(output) == 0:
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -7,6 +7,8 @@
 
 from pandas.errors import PerformanceWarning
 
+from pandas.core.dtypes.common import is_integer_dtype
+
 import pandas as pd
 from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv
 import pandas._testing as tm
@@ -2057,3 +2059,25 @@ def test_groups_repr_truncates(max_seq_items, expected):
 
         result = df.groupby(np.array(df.a)).groups.__repr__()
         assert result == expected
+
+
+def test_bool_agg_dtype():
+    # GH 7001
+    # Summing bools within groups must give an integer dtype, whether the
+    # reduction/cumulation is called directly or named via agg/transform.
+    frame = pd.DataFrame({"a": [1, 1], "b": [False, True]})
+    series = frame.set_index("a")["b"]
+
+    # None means "call the method directly on the groupby object"
+    for dispatcher in (None, "agg", "transform"):
+        for op in ("sum", "cumsum"):
+            for data in (frame, series):
+                grouped = data.groupby("a")
+                if dispatcher is None:
+                    outcome = getattr(grouped, op)()
+                else:
+                    outcome = getattr(grouped, dispatcher)(op)
+                if data is frame:
+                    # DataFrame results keep "b" as a column
+                    outcome = outcome["b"]
+                assert is_integer_dtype(outcome.dtype)