From ef927e275b15300558299f28b9b8a9504856c150 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 22 Oct 2022 11:39:53 -0700 Subject: [PATCH 1/2] DEPR: DataFrame.median/mean with numeric_only=None and dt64 columns --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/frame.py | 24 ------------------------ pandas/tests/frame/test_reductions.py | 22 ++++++++++------------ 3 files changed, 11 insertions(+), 36 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index e281e250d608e..87b1e371ab4d8 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -185,6 +185,7 @@ Removal of prior version deprecations/changes - Removed :attr:`Rolling.is_datetimelike` (:issue:`38963`) - Removed deprecated :meth:`Timedelta.delta`, :meth:`Timedelta.is_populated`, and :attr:`Timedelta.freq` (:issue:`46430`, :issue:`46476`) - Removed the ``numeric_only`` keyword from :meth:`Categorical.min` and :meth:`Categorical.max` in favor of ``skipna`` (:issue:`48821`) +- Changed behavior of :meth:`DataFrame.median` and :meth:`DataFrame.mean` with ``numeric_only=None`` to not exclude datetime-like columns. This note will be irrelevant once the ``numeric_only=None`` deprecation is enforced (:issue:`29941`) - Removed :func:`is_extension_type` in favor of :func:`is_extension_array_dtype` (:issue:`29457`) - Removed :meth:`Index.get_value` (:issue:`33907`) - Remove :meth:`DataFrameGroupBy.pad` and :meth:`DataFrameGroupBy.backfill` (:issue:`45076`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fb333aff66b72..ede84aad504d3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -123,7 +123,6 @@ is_1d_only_ea_dtype, is_bool_dtype, is_dataclass, - is_datetime64_any_dtype, is_dict_like, is_dtype_equal, is_extension_array_dtype, @@ -10789,29 +10788,6 @@ def _reduce( assert filter_type is None or filter_type == "bool", filter_type out_dtype = "bool" if filter_type == "bool" else None - if numeric_only is None and name in ["mean", 
"median"]: - own_dtypes = [arr.dtype for arr in self._mgr.arrays] - - dtype_is_dt = np.array( - [is_datetime64_any_dtype(dtype) for dtype in own_dtypes], - dtype=bool, - ) - if dtype_is_dt.any(): - warnings.warn( - "DataFrame.mean and DataFrame.median with numeric_only=None " - "will include datetime64 and datetime64tz columns in a " - "future version.", - FutureWarning, - stacklevel=find_stack_level(), - ) - # Non-copy equivalent to - # dt64_cols = self.dtypes.apply(is_datetime64_any_dtype) - # cols = self.columns[~dt64_cols] - # self = self[cols] - predicate = lambda x: not is_datetime64_any_dtype(x.dtype) - mgr = self._mgr._get_data_subset(predicate) - self = type(self)(mgr) - # TODO: Make other agg func handle axis=None properly GH#21597 axis = self._get_axis_number(axis) labels = self._get_agg_axis(axis) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 6654ecec78c94..0ab61a9097ab8 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -73,14 +73,13 @@ def assert_stat_op_calc( f = getattr(frame, opname) if check_dates: - expected_warning = FutureWarning if opname in ["mean", "median"] else None df = DataFrame({"b": date_range("1/1/2001", periods=2)}) - with tm.assert_produces_warning(expected_warning): + with tm.assert_produces_warning(None): result = getattr(df, opname)() assert isinstance(result, Series) df["a"] = range(len(df)) - with tm.assert_produces_warning(expected_warning): + with tm.assert_produces_warning(None): result = getattr(df, opname)() assert isinstance(result, Series) assert len(result) @@ -390,9 +389,8 @@ def test_nunique(self): def test_mean_mixed_datetime_numeric(self, tz): # https://github.com/pandas-dev/pandas/issues/24752 df = DataFrame({"A": [1, 1], "B": [Timestamp("2000", tz=tz)] * 2}) - with tm.assert_produces_warning(FutureWarning): - result = df.mean() - expected = Series([1.0], index=["A"]) + result = df.mean() + expected = Series([1.0, 
Timestamp("2000", tz=tz)], index=["A", "B"]) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("tz", [None, "UTC"]) @@ -400,11 +398,11 @@ def test_mean_excludes_datetimes(self, tz): # https://github.com/pandas-dev/pandas/issues/24752 # Our long-term desired behavior is unclear, but the behavior in # 0.24.0rc1 was buggy. + # As of 2.0 with numeric_only=None we do *not* drop datetime columns df = DataFrame({"A": [Timestamp("2000", tz=tz)] * 2}) - with tm.assert_produces_warning(FutureWarning): - result = df.mean() + result = df.mean() - expected = Series(dtype=np.float64) + expected = Series([Timestamp("2000", tz=tz)], index=["A"]) tm.assert_series_equal(result, expected) def test_mean_mixed_string_decimal(self): @@ -857,6 +855,7 @@ def test_mean_corner(self, float_frame, float_string_frame): def test_mean_datetimelike(self): # GH#24757 check that datetimelike are excluded by default, handled # correctly with numeric_only=True + # As of 2.0, datetimelike are *not* excluded with numeric_only=None df = DataFrame( { @@ -870,10 +869,9 @@ def test_mean_datetimelike(self): expected = Series({"A": 1.0}) tm.assert_series_equal(result, expected) - with tm.assert_produces_warning(FutureWarning): - # in the future datetime columns will be included + with tm.assert_produces_warning(FutureWarning, match="Select only valid"): result = df.mean() - expected = Series({"A": 1.0, "C": df.loc[1, "C"]}) + expected = Series({"A": 1.0, "B": df.loc[1, "B"], "C": df.loc[1, "C"]}) tm.assert_series_equal(result, expected) def test_mean_datetimelike_numeric_only_false(self): From 0ae823b95939e5df00b53969ea68b96cdd304a84 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 29 Oct 2022 09:54:23 -0700 Subject: [PATCH 2/2] rename test, edit comment --- pandas/tests/frame/test_reductions.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 0ab61a9097ab8..92cdaf3319195 100644 --- 
a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -394,10 +394,9 @@ def test_mean_mixed_datetime_numeric(self, tz): tm.assert_series_equal(result, expected) @pytest.mark.parametrize("tz", [None, "UTC"]) - def test_mean_excludes_datetimes(self, tz): + def test_mean_includes_datetimes(self, tz): # https://github.com/pandas-dev/pandas/issues/24752 - # Our long-term desired behavior is unclear, but the behavior in - # 0.24.0rc1 was buggy. + # Behavior in 0.24.0rc1 was buggy. # As of 2.0 with numeric_only=None we do *not* drop datetime columns df = DataFrame({"A": [Timestamp("2000", tz=tz)] * 2}) result = df.mean()