pandas-dev · topper-123 · Apr 17, 2023 · Apr 17, 2023 · Apr 17, 2023 · Apr 17, 2023
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -14,12 +14,39 @@ including other versions of pandas.
 Enhancements
 ~~~~~~~~~~~~
 
-.. _whatsnew_210.enhancements.enhancement1:
+.. _whatsnew_210.enhancements.better_dtype_inference_for_frame_reductions:
+
+Better dtype inference when doing reductions on DataFrames of nullable arrays
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Dtype inference when doing reductions on DataFrames with nullable arrays has been improved (:issue:`52707`).
+
+*Previous behavior*:
+
+.. code-block:: ipython
+
+   In [1]: df = pd.DataFrame({"a": [1], "b": [pd.NA]}, dtype="Int64")
+   In [2]: df.sum()
+   a    1
+   b    0
+   dtype: int64
+   In [3]: df.sum(min_count=1)
+   a       1
+   b    <NA>
+   dtype: object
+
+With the new behavior, the result keeps the nullable ``Int64`` dtype in both cases:
+
+*New behavior*:
+
+.. ipython:: python
+
+   df = pd.DataFrame({"a": [1], "b": [pd.NA]}, dtype="Int64")
+   df.sum()
+   df.sum(min_count=1)
 
-enhancement1
-^^^^^^^^^^^^
 
-.. _whatsnew_210.enhancements.enhancement2:
+.. _whatsnew_210.enhancements.map_works_for_all_array_types:
 
 ``map(func, na_action="ignore")`` now works for all array types
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -49,6 +49,7 @@
 from pandas._libs.hashtable import duplicated
 from pandas._libs.lib import (
     NoDefault,
+    infer_dtype,
     is_range_indexer,
     no_default,
 )
@@ -94,6 +95,7 @@
     is_dataclass,
     is_dict_like,
     is_dtype_equal,
+    is_extension_array_dtype,
     is_float,
     is_float_dtype,
     is_hashable,
@@ -10899,14 +10901,29 @@ def _get_data() -> DataFrame:
         #  simple case where we can use BlockManager.reduce
         res = df._mgr.reduce(blk_func)
         out = df._constructor(res).iloc[0]
+        mgr_dtypes = df._mgr.get_dtypes().tolist()
+        if out.dtype != object:
+            # e.g. if data dtype is UInt8 and out.dtype is uint64, then common is UInt64
+            mgr_dtypes.append(out.dtype)
+        common_dtype = find_common_type(mgr_dtypes) if mgr_dtypes else None
+        is_ext_dtype = common_dtype is not None and is_extension_array_dtype(
+            common_dtype
+        )
+
         if out_dtype is not None:
             out = out.astype(out_dtype)
+        elif is_ext_dtype and out.dtype == common_dtype.type:
+            out = out.astype(common_dtype)
         elif (df._mgr.get_dtypes() == object).any():
             out = out.astype(object)
-        elif len(self) == 0 and name in ("sum", "prod"):
-            # Even if we are object dtype, follow numpy and return
-            #  float64, see test_apply_funcs_over_empty
-            out = out.astype(np.float64)
+        elif is_ext_dtype and out.dtype == object:
+            inferred_dtype = infer_dtype(out)
+            if isna(out).all():
+                out = out.astype(common_dtype)
+            elif inferred_dtype == "integer":
+                out = out.astype("Int64")
+            elif inferred_dtype == "float":
+                out = out.astype("Float64")
 
         return out
 
@@ -11157,11 +11174,6 @@ def idxmin(
         )
         indices = res._values
 
-        # indices will always be np.ndarray since axis is not None and
-        # values is a 2d array for DataFrame
-        # error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
-        assert isinstance(indices, np.ndarray)  # for mypy
-
         index = data._get_axis(axis)
         result = [index[i] if i >= 0 else np.nan for i in indices]
         final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis))
@@ -11182,11 +11194,6 @@ def idxmax(
         )
         indices = res._values
 
-        # indices will always be np.ndarray since axis is not None and
-        # values is a 2d array for DataFrame
-        # error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
-        assert isinstance(indices, np.ndarray)  # for mypy
-
         index = data._get_axis(axis)
         result = [index[i] if i >= 0 else np.nan for i in indices]
         final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis))

diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
@@ -6,7 +6,10 @@
 import numpy as np
 import pytest
 
-from pandas.compat import is_platform_windows
+from pandas.compat import (
+    IS64,
+    is_platform_windows,
+)
 import pandas.util._test_decorators as td
 
 import pandas as pd
@@ -29,6 +32,8 @@
     nanops,
 )
 
+is_windows_or_is32 = is_platform_windows() or not IS64
+
 
 def assert_stat_op_calc(
     opname,
@@ -917,7 +922,7 @@ def test_mean_extensionarray_numeric_only_true(self):
         arr = np.random.randint(1000, size=(10, 5))
         df = DataFrame(arr, dtype="Int64")
         result = df.mean(numeric_only=True)
-        expected = DataFrame(arr).mean()
+        expected = DataFrame(arr, dtype="Float64").mean()
         tm.assert_series_equal(result, expected)
 
     def test_stats_mixed_type(self, float_string_frame):
@@ -1544,6 +1549,100 @@ def test_reduction_timedelta_smallest_unit(self):
         tm.assert_series_equal(result, expected)
 
 
+class TestEmptyDataFrameReductions:
+    @pytest.mark.parametrize(
+        "opname, dtype, exp_value, exp_dtype",
+        [
+            ("sum", np.int8, 0, np.int64),
+            ("prod", np.int8, 1, np.int_),
+            ("sum", np.int64, 0, np.int64),
+            ("prod", np.int64, 1, np.int64),
+            ("sum", np.uint8, 0, np.int64),
+            ("prod", np.uint8, 1, np.uint),
+            ("sum", np.uint64, 0, np.int64),
+            ("prod", np.uint64, 1, np.uint64),
+            ("sum", np.float32, 0, np.float32),
+            ("prod", np.float32, 1, np.float32),
+            ("sum", np.float64, 0, np.float64),
+        ],
+    )
+    def test_df_empty_min_count_0(self, opname, dtype, exp_value, exp_dtype):
+        df = DataFrame({0: [], 1: []}, dtype=dtype)
+        result = getattr(df, opname)(min_count=0)
+
+        expected = Series([exp_value, exp_value], dtype=exp_dtype)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "opname, dtype, exp_dtype",
+        [
+            ("sum", np.int8, np.float64),
+            ("prod", np.int8, np.float64),
+            ("sum", np.int64, np.float64),
+            ("prod", np.int64, np.float64),
+            ("sum", np.uint8, np.float64),
+            ("prod", np.uint8, np.float64),
+            ("sum", np.uint64, np.float64),
+            ("prod", np.uint64, np.float64),
+            ("sum", np.float32, np.float32),
+            ("prod", np.float32, np.float32),
+            ("sum", np.float64, np.float64),
+        ],
+    )
+    def test_df_empty_min_count_1(self, opname, dtype, exp_dtype):
+        df = DataFrame({0: [], 1: []}, dtype=dtype)
+        result = getattr(df, opname)(min_count=1)
+
+        expected = Series([np.nan, np.nan], dtype=exp_dtype)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "opname, dtype, exp_value, exp_dtype",
+        [
+            ("sum", "Int8", 0, ("Int32" if is_windows_or_is32 else "Int64")),
+            ("prod", "Int8", 1, ("Int32" if is_windows_or_is32 else "Int64")),
+            ("sum", "Int64", 0, "Int64"),
+            ("prod", "Int64", 1, "Int64"),
+            ("sum", "UInt8", 0, ("UInt32" if is_windows_or_is32 else "UInt64")),
+            ("prod", "UInt8", 1, ("UInt32" if is_windows_or_is32 else "UInt64")),
+            ("sum", "UInt64", 0, "UInt64"),
+            ("prod", "UInt64", 1, "UInt64"),
+            ("sum", "Float32", 0, "Float32"),
+            ("prod", "Float32", 1, "Float32"),
+            ("sum", "Float64", 0, "Float64"),
+        ],
+    )
+    def test_df_empty_nullable_min_count_0(self, opname, dtype, exp_value, exp_dtype):
+        df = DataFrame({0: [], 1: []}, dtype=dtype)
+        result = getattr(df, opname)(min_count=0)
+
+        expected = Series([exp_value, exp_value], dtype=exp_dtype)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "opname, dtype, exp_dtype",
+        [
+            ("sum", "Int8", "Int8"),
+            ("prod", "Int8", "Int8"),
+            ("sum", "Int64", "Int64"),
+            ("prod", "Int64", "Int64"),
+            ("sum", "UInt8", "UInt8"),
+            ("prod", "UInt8", "UInt8"),
+            ("sum", "UInt64", "UInt64"),
+            ("prod", "UInt64", "UInt64"),
+            ("sum", "Float32", "Float32"),
+            ("prod", "Float32", "Float32"),
+            ("sum", "Float64", "Float64"),
+        ],
+    )
+    def test_df_empty_nullable_min_count_1(self, opname, dtype, exp_dtype):
+        df = DataFrame({0: [], 1: []}, dtype=dtype)
+        result = getattr(df, opname)(min_count=1)
+
+        expected = Series([pd.NA, pd.NA], dtype=exp_dtype)
+        tm.assert_series_equal(result, expected)
+
+
 class TestNuisanceColumns:
     @pytest.mark.parametrize("method", ["any", "all"])
     def test_any_all_categorical_dtype_nuisance_column(self, method):
@@ -1678,7 +1777,9 @@ def test_minmax_extensionarray(method, numeric_only):
     df = DataFrame({"Int64": ser})
     result = getattr(df, method)(numeric_only=numeric_only)
     expected = Series(
-        [getattr(int64_info, method)], index=Index(["Int64"], dtype="object")
+        [getattr(int64_info, method)],
+        index=Index(["Int64"], dtype="object"),
+        dtype=pd.Int64Dtype(),
     )
     tm.assert_series_equal(result, expected)
 

diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
@@ -945,7 +945,7 @@ def test_apply_multi_level_name(category):
         b = pd.Categorical(b, categories=[1, 2, 3])
         expected_index = pd.CategoricalIndex([1, 2, 3], categories=[1, 2, 3], name="B")
-        # GH#40669 - summing an empty frame gives float dtype
+        # GH#40669 - summing an empty frame no longer upcasts to float dtype
-        expected_values = [20.0, 25.0, 0.0]
+        expected_values = [20, 25, 0]
     else:
         expected_index = Index([1, 2], name="B")
         expected_values = [20, 25]