Skip to content

DEPR: numeric_only in Series and SeriesGroupBy consistency #47561

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,7 @@ Other enhancements
- :class:`.DataError`, :class:`.SpecificationError`, :class:`.SettingWithCopyError`, :class:`.SettingWithCopyWarning`, :class:`.NumExprClobberingError`, :class:`.UndefinedVariableError`, and :class:`.IndexingError` are now exposed in ``pandas.errors`` (:issue:`27656`)
- Added ``check_like`` argument to :func:`testing.assert_series_equal` (:issue:`47247`)
- Allow reading compressed SAS files with :func:`read_sas` (e.g., ``.sas7bdat.gz`` files)
- :class:`Series` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`)

.. ---------------------------------------------------------------------------
.. _whatsnew_150.notable_bug_fixes:
Expand Down Expand Up @@ -766,7 +767,8 @@ Other Deprecations
- Deprecated the argument ``na_sentinel`` in :func:`factorize`, :meth:`Index.factorize`, and :meth:`.ExtensionArray.factorize`; pass ``use_na_sentinel=True`` instead to use the sentinel ``-1`` for NaN values and ``use_na_sentinel=False`` instead of ``na_sentinel=None`` to encode NaN values (:issue:`46910`)
- Deprecated :meth:`DataFrameGroupBy.transform` not aligning the result when the UDF returned DataFrame (:issue:`45648`)
- Clarified warning from :func:`to_datetime` when delimited dates can't be parsed in accordance with specified ``dayfirst`` argument (:issue:`46210`)

- Deprecated :class:`Series` and :class:`Resampler` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) raising a ``NotImplementedError`` when the dtype is non-numeric and ``numeric_only=True`` is provided; this will raise a ``TypeError`` in a future version (:issue:`47500`)
- Deprecated :meth:`Series.rank` returning an empty result when the dtype is non-numeric and ``numeric_only=True`` is provided; this will raise a ``TypeError`` in a future version (:issue:`47500`)

.. ---------------------------------------------------------------------------
.. _whatsnew_150.performance:
Expand Down
9 changes: 9 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -8689,6 +8689,15 @@ def ranker(data):
)

if numeric_only:
if self.ndim == 1 and not is_numeric_dtype(self.dtype):
# GH#47500
warnings.warn(
f"Calling Series.rank with numeric_only={numeric_only} and dtype "
f"{self.dtype} is deprecated and will raise a TypeError in a "
"future version of pandas",
category=FutureWarning,
stacklevel=find_stack_level(),
)
data = self._get_numeric_data()
else:
data = self
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1145,7 +1145,7 @@ def _cython_transform(
) -> DataFrame:
assert axis == 0 # handled by caller
# TODO: no tests with self.ndim == 1 for DataFrameGroupBy
numeric_only_bool = self._resolve_numeric_only(numeric_only, axis)
numeric_only_bool = self._resolve_numeric_only(how, numeric_only, axis)

# With self.axis == 0, we have multi-block tests
# e.g. test_rank_min_int, test_cython_transform_frame
Expand Down
67 changes: 48 additions & 19 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1291,7 +1291,7 @@ def _wrap_applied_output(
raise AbstractMethodError(self)

def _resolve_numeric_only(
self, numeric_only: bool | lib.NoDefault, axis: int
self, how: str, numeric_only: bool | lib.NoDefault, axis: int
) -> bool:
"""
Determine subclass-specific default value for 'numeric_only'.
Expand Down Expand Up @@ -1328,6 +1328,20 @@ def _resolve_numeric_only(
else:
numeric_only = False

if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype):
# GH#47500
how = "sum" if how == "add" else how
warnings.warn(
f"{type(self).__name__}.{how} called with "
f"numeric_only={numeric_only} and dtype {self.obj.dtype}. This will "
"raise a TypeError in a future version of pandas",
category=FutureWarning,
stacklevel=find_stack_level(),
)
raise NotImplementedError(
f"{type(self).__name__}.{how} does not implement numeric_only"
)

return numeric_only

def _maybe_warn_numeric_only_depr(
Expand Down Expand Up @@ -1704,7 +1718,7 @@ def _cython_agg_general(
):
# Note: we never get here with how="ohlc" for DataFrameGroupBy;
# that goes through SeriesGroupBy
numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0)
numeric_only_bool = self._resolve_numeric_only(how, numeric_only, axis=0)

data = self._get_data_to_aggregate()
is_ser = data.ndim == 1
Expand Down Expand Up @@ -2100,7 +2114,7 @@ def mean(
2 4.0
Name: B, dtype: float64
"""
numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0)
numeric_only_bool = self._resolve_numeric_only("mean", numeric_only, axis=0)

if maybe_use_numba(engine):
from pandas.core._numba.kernels import sliding_mean
Expand Down Expand Up @@ -2134,7 +2148,7 @@ def median(self, numeric_only: bool | lib.NoDefault = lib.no_default):
Series or DataFrame
Median of values within each group.
"""
numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0)
numeric_only_bool = self._resolve_numeric_only("median", numeric_only, axis=0)

result = self._cython_agg_general(
"median",
Expand Down Expand Up @@ -2196,10 +2210,15 @@ def std(
return np.sqrt(self._numba_agg_general(sliding_var, engine_kwargs, ddof))
else:
# Resolve numeric_only so that var doesn't warn
numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0)
if numeric_only_bool and self.obj.ndim == 1:
raise NotImplementedError(
f"{type(self).__name__}.std does not implement numeric_only."
numeric_only_bool = self._resolve_numeric_only("std", numeric_only, axis=0)
if (
numeric_only_bool
and self.obj.ndim == 1
and not is_numeric_dtype(self.obj.dtype)
):
raise TypeError(
f"{type(self).__name__}.std called with "
f"numeric_only={numeric_only} and dtype {self.obj.dtype}"
)
result = self._get_cythonized_result(
libgroupby.group_var,
Expand Down Expand Up @@ -2264,7 +2283,7 @@ def var(

return self._numba_agg_general(sliding_var, engine_kwargs, ddof)
else:
numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0)
numeric_only_bool = self._resolve_numeric_only("var", numeric_only, axis=0)
if ddof == 1:
return self._cython_agg_general(
"var",
Expand Down Expand Up @@ -2304,10 +2323,15 @@ def sem(self, ddof: int = 1, numeric_only: bool | lib.NoDefault = lib.no_default
Standard error of the mean of values within each group.
"""
# Resolve numeric_only so that std doesn't warn
numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0)
if numeric_only_bool and self.obj.ndim == 1:
raise NotImplementedError(
f"{type(self).__name__}.sem does not implement numeric_only."
numeric_only_bool = self._resolve_numeric_only("sem", numeric_only, axis=0)
if (
numeric_only_bool
and self.obj.ndim == 1
and not is_numeric_dtype(self.obj.dtype)
):
raise TypeError(
f"{type(self).__name__}.sem called with "
f"numeric_only={numeric_only} and dtype {self.obj.dtype}"
)
result = self.std(ddof=ddof, numeric_only=numeric_only_bool)
self._maybe_warn_numeric_only_depr("sem", result, numeric_only)
Expand Down Expand Up @@ -3179,10 +3203,15 @@ def quantile(
a 2.0
b 3.0
"""
numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0)
if numeric_only_bool and self.obj.ndim == 1:
raise NotImplementedError(
f"{type(self).__name__}.quantile does not implement numeric_only"
numeric_only_bool = self._resolve_numeric_only("quantile", numeric_only, axis=0)
if (
numeric_only_bool
and self.obj.ndim == 1
and not is_numeric_dtype(self.obj.dtype)
):
raise TypeError(
f"{type(self).__name__}.quantile called with "
f"numeric_only={numeric_only} and dtype {self.obj.dtype}"
)

def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, np.dtype | None]:
Expand Down Expand Up @@ -3671,7 +3700,8 @@ def _get_cythonized_result(
-------
`Series` or `DataFrame` with filled values
"""
numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0)
how = base_func.__name__
numeric_only_bool = self._resolve_numeric_only(how, numeric_only, axis=0)

if post_processing and not callable(post_processing):
raise ValueError("'post_processing' must be a callable!")
Expand All @@ -3682,7 +3712,6 @@ def _get_cythonized_result(

ids, _, ngroups = grouper.group_info

how = base_func.__name__
base_func = partial(base_func, labels=ids)

def blk_func(values: ArrayLike) -> ArrayLike:
Expand Down
10 changes: 9 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@
is_integer,
is_iterator,
is_list_like,
is_numeric_dtype,
is_object_dtype,
is_scalar,
pandas_dtype,
Expand Down Expand Up @@ -4616,10 +4617,17 @@ def _reduce(

else:
# dispatch to numpy arrays
if numeric_only:
if numeric_only and not is_numeric_dtype(self.dtype):
kwd_name = "numeric_only"
if name in ["any", "all"]:
kwd_name = "bool_only"
# GH#47500 - change to TypeError to match other methods
warnings.warn(
f"Calling Series.{name} with {kwd_name}={numeric_only} and "
f"dtype {self.dtype} will raise a TypeError in the future",
FutureWarning,
stacklevel=find_stack_level(),
)
raise NotImplementedError(
f"Series.{name} does not implement {kwd_name}."
)
Expand Down
10 changes: 6 additions & 4 deletions pandas/tests/groupby/aggregate/test_cython.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,9 @@ def test_cython_agg_boolean():
def test_cython_agg_nothing_to_agg():
frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})

with pytest.raises(NotImplementedError, match="does not implement"):
frame.groupby("a")["b"].mean(numeric_only=True)
with tm.assert_produces_warning(FutureWarning, match="This will raise a TypeError"):
with pytest.raises(NotImplementedError, match="does not implement"):
frame.groupby("a")["b"].mean(numeric_only=True)

with pytest.raises(TypeError, match="Could not convert (foo|bar)*"):
frame.groupby("a")["b"].mean()
Expand All @@ -114,8 +115,9 @@ def test_cython_agg_nothing_to_agg_with_dates():
"dates": pd.date_range("now", periods=50, freq="T"),
}
)
with pytest.raises(NotImplementedError, match="does not implement"):
frame.groupby("b").dates.mean(numeric_only=True)
with tm.assert_produces_warning(FutureWarning, match="This will raise a TypeError"):
with pytest.raises(NotImplementedError, match="does not implement"):
frame.groupby("b").dates.mean(numeric_only=True)


def test_cython_agg_frame_columns():
Expand Down
Loading