Skip to content

Commit 009fe99

Browse files
natmokval and mroeschke authored
CLN: enforce deprecation of the Series[categorical].replace special-casing (#58270)
* enforce depr behavior df.replace / s.replace with CategoricalDtype
* fixup tests in frame/methods/test_replace.py
* fixup tests in arrays/categorical/test_replace.py and pandas/tests/copy_view/test_replace.py
* add a note to v3.0.0
* remove _replace and special-casing, fix tests
* fix tests
* Adjust tests and clarify whatsnew
* Fix pre-commit

---------

Co-authored-by: Matthew Roeschke <[email protected]>
1 parent 1a95c79 commit 009fe99

File tree

7 files changed

+96
-269
lines changed

7 files changed

+96
-269
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,7 @@ Other Removals
381381
- Enforced deprecation of strings ``T``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`57627`)
382382
- Enforced deprecation of strings ``T``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`57627`)
383383
- Enforced deprecation of the behavior of :func:`concat` when ``len(keys) != len(objs)`` would truncate to the shorter of the two. Now this raises a ``ValueError`` (:issue:`43485`)
384+
- Enforced deprecation of the behavior of :meth:`DataFrame.replace` and :meth:`Series.replace` with :class:`CategoricalDtype` that would introduce new categories. (:issue:`58270`)
384385
- Enforced deprecation of values "pad", "ffill", "bfill", and "backfill" for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` (:issue:`57869`)
385386
- Enforced deprecation removing :meth:`Categorical.to_list`, use ``obj.tolist()`` instead (:issue:`51254`)
386387
- Enforced silent-downcasting deprecation for :ref:`all relevant methods <whatsnew_220.silent_downcasting>` (:issue:`54710`)

pandas/core/arrays/categorical.py

Lines changed: 0 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
cast,
1111
overload,
1212
)
13-
import warnings
1413

1514
import numpy as np
1615

@@ -23,7 +22,6 @@
2322
)
2423
from pandas._libs.arrays import NDArrayBacked
2524
from pandas.compat.numpy import function as nv
26-
from pandas.util._exceptions import find_stack_level
2725
from pandas.util._validators import validate_bool_kwarg
2826

2927
from pandas.core.dtypes.cast import (
@@ -2673,62 +2671,6 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
26732671
code_values = code_values[null_mask | (code_values >= 0)]
26742672
return algorithms.isin(self.codes, code_values)
26752673

2676-
@overload
2677-
def _replace(self, *, to_replace, value, inplace: Literal[False] = ...) -> Self: ...
2678-
2679-
@overload
2680-
def _replace(self, *, to_replace, value, inplace: Literal[True]) -> None: ...
2681-
2682-
def _replace(self, *, to_replace, value, inplace: bool = False) -> Self | None:
2683-
from pandas import Index
2684-
2685-
orig_dtype = self.dtype
2686-
2687-
inplace = validate_bool_kwarg(inplace, "inplace")
2688-
cat = self if inplace else self.copy()
2689-
2690-
mask = isna(np.asarray(value))
2691-
if mask.any():
2692-
removals = np.asarray(to_replace)[mask]
2693-
removals = cat.categories[cat.categories.isin(removals)]
2694-
new_cat = cat.remove_categories(removals)
2695-
NDArrayBacked.__init__(cat, new_cat.codes, new_cat.dtype)
2696-
2697-
ser = cat.categories.to_series()
2698-
ser = ser.replace(to_replace=to_replace, value=value)
2699-
2700-
all_values = Index(ser)
2701-
2702-
# GH51016: maintain order of existing categories
2703-
idxr = cat.categories.get_indexer_for(all_values)
2704-
locs = np.arange(len(ser))
2705-
locs = np.where(idxr == -1, locs, idxr)
2706-
locs = locs.argsort()
2707-
2708-
new_categories = ser.take(locs)
2709-
new_categories = new_categories.drop_duplicates(keep="first")
2710-
index_categories = Index(new_categories)
2711-
new_codes = recode_for_categories(
2712-
cat._codes, all_values, index_categories, copy=False
2713-
)
2714-
new_dtype = CategoricalDtype(index_categories, ordered=self.dtype.ordered)
2715-
NDArrayBacked.__init__(cat, new_codes, new_dtype)
2716-
2717-
if new_dtype != orig_dtype:
2718-
warnings.warn(
2719-
# GH#55147
2720-
"The behavior of Series.replace (and DataFrame.replace) with "
2721-
"CategoricalDtype is deprecated. In a future version, replace "
2722-
"will only be used for cases that preserve the categories. "
2723-
"To change the categories, use ser.cat.rename_categories "
2724-
"instead.",
2725-
FutureWarning,
2726-
stacklevel=find_stack_level(),
2727-
)
2728-
if not inplace:
2729-
return cat
2730-
return None
2731-
27322674
# ------------------------------------------------------------------------
27332675
# String methods interface
27342676
def _str_map(

pandas/core/internals/blocks.py

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,6 @@
100100
)
101101
from pandas.core.array_algos.transforms import shift
102102
from pandas.core.arrays import (
103-
Categorical,
104103
DatetimeArray,
105104
ExtensionArray,
106105
IntervalArray,
@@ -696,14 +695,6 @@ def replace(
696695
# go through replace_list
697696
values = self.values
698697

699-
if isinstance(values, Categorical):
700-
# TODO: avoid special-casing
701-
# GH49404
702-
blk = self._maybe_copy(inplace)
703-
values = cast(Categorical, blk.values)
704-
values._replace(to_replace=to_replace, value=value, inplace=True)
705-
return [blk]
706-
707698
if not self._can_hold_element(to_replace):
708699
# We cannot hold `to_replace`, so we know immediately that
709700
# replacing it is a no-op.
@@ -803,14 +794,6 @@ def replace_list(
803794
"""
804795
values = self.values
805796

806-
if isinstance(values, Categorical):
807-
# TODO: avoid special-casing
808-
# GH49404
809-
blk = self._maybe_copy(inplace)
810-
values = cast(Categorical, blk.values)
811-
values._replace(to_replace=src_list, value=dest_list, inplace=True)
812-
return [blk]
813-
814797
# Exclude anything that we know we won't contain
815798
pairs = [
816799
(x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x)

pandas/tests/arrays/categorical/test_replace.py

Lines changed: 39 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -6,106 +6,66 @@
66

77

88
@pytest.mark.parametrize(
9-
"to_replace,value,expected,flip_categories",
9+
"to_replace,value,expected",
1010
[
1111
# one-to-one
12-
(1, 2, [2, 2, 3], False),
13-
(1, 4, [4, 2, 3], False),
14-
(4, 1, [1, 2, 3], False),
15-
(5, 6, [1, 2, 3], False),
12+
(4, 1, [1, 2, 3]),
13+
(3, 1, [1, 2, 1]),
1614
# many-to-one
17-
([1], 2, [2, 2, 3], False),
18-
([1, 2], 3, [3, 3, 3], False),
19-
([1, 2], 4, [4, 4, 3], False),
20-
((1, 2, 4), 5, [5, 5, 3], False),
21-
((5, 6), 2, [1, 2, 3], False),
22-
([1], [2], [2, 2, 3], False),
23-
([1, 4], [5, 2], [5, 2, 3], False),
24-
# GH49404: overlap between to_replace and value
25-
([1, 2, 3], [2, 3, 4], [2, 3, 4], False),
26-
# GH50872, GH46884: replace with null
27-
(1, None, [None, 2, 3], False),
28-
(1, pd.NA, [None, 2, 3], False),
29-
# check_categorical sorts categories, which crashes on mixed dtypes
30-
(3, "4", [1, 2, "4"], False),
31-
([1, 2, "3"], "5", ["5", "5", 3], True),
15+
((5, 6), 2, [1, 2, 3]),
16+
((3, 2), 1, [1, 1, 1]),
3217
],
3318
)
34-
@pytest.mark.filterwarnings(
35-
"ignore:.*with CategoricalDtype is deprecated:FutureWarning"
36-
)
37-
def test_replace_categorical_series(to_replace, value, expected, flip_categories):
19+
def test_replace_categorical_series(to_replace, value, expected):
3820
# GH 31720
39-
4021
ser = pd.Series([1, 2, 3], dtype="category")
4122
result = ser.replace(to_replace, value)
42-
expected = pd.Series(expected, dtype="category")
43-
ser.replace(to_replace, value, inplace=True)
44-
45-
if flip_categories:
46-
expected = expected.cat.set_categories(expected.cat.categories[::-1])
47-
48-
tm.assert_series_equal(expected, result, check_category_order=False)
49-
tm.assert_series_equal(expected, ser, check_category_order=False)
23+
expected = pd.Series(Categorical(expected, categories=[1, 2, 3]))
24+
tm.assert_series_equal(result, expected)
5025

5126

5227
@pytest.mark.parametrize(
53-
"to_replace, value, result, expected_error_msg",
28+
"to_replace,value",
5429
[
55-
("b", "c", ["a", "c"], "Categorical.categories are different"),
56-
("c", "d", ["a", "b"], None),
57-
# https://github.com/pandas-dev/pandas/issues/33288
58-
("a", "a", ["a", "b"], None),
59-
("b", None, ["a", None], "Categorical.categories length are different"),
30+
# one-to-one
31+
(3, 5),
32+
# many-to-one
33+
((3, 2), 5),
6034
],
6135
)
62-
def test_replace_categorical(to_replace, value, result, expected_error_msg):
63-
# GH#26988
64-
cat = Categorical(["a", "b"])
65-
expected = Categorical(result)
66-
msg = (
67-
r"The behavior of Series\.replace \(and DataFrame.replace\) "
68-
"with CategoricalDtype"
69-
)
70-
warn = FutureWarning if expected_error_msg is not None else None
71-
with tm.assert_produces_warning(warn, match=msg):
72-
result = pd.Series(cat, copy=False).replace(to_replace, value)._values
36+
def test_replace_categorical_series_new_category_raises(to_replace, value):
37+
# GH 31720
38+
ser = pd.Series([1, 2, 3], dtype="category")
39+
with pytest.raises(
40+
TypeError, match="Cannot setitem on a Categorical with a new category"
41+
):
42+
ser.replace(to_replace, value)
7343

74-
tm.assert_categorical_equal(result, expected)
75-
if to_replace == "b": # the "c" test is supposed to be unchanged
76-
with pytest.raises(AssertionError, match=expected_error_msg):
77-
# ensure non-inplace call does not affect original
78-
tm.assert_categorical_equal(cat, expected)
7944

80-
ser = pd.Series(cat, copy=False)
81-
with tm.assert_produces_warning(warn, match=msg):
82-
ser.replace(to_replace, value, inplace=True)
83-
tm.assert_categorical_equal(cat, expected)
45+
def test_replace_maintain_ordering():
46+
# GH51016
47+
dtype = pd.CategoricalDtype([0, 1, 2], ordered=True)
48+
ser = pd.Series([0, 1, 2], dtype=dtype)
49+
result = ser.replace(0, 2)
50+
expected = pd.Series([2, 1, 2], dtype=dtype)
51+
tm.assert_series_equal(expected, result, check_category_order=True)
8452

8553

8654
def test_replace_categorical_ea_dtype():
8755
# GH49404
88-
cat = Categorical(pd.array(["a", "b"], dtype="string"))
89-
msg = (
90-
r"The behavior of Series\.replace \(and DataFrame.replace\) "
91-
"with CategoricalDtype"
56+
cat = Categorical(pd.array(["a", "b", "c"], dtype="string"))
57+
result = pd.Series(cat).replace(["a", "b"], ["c", "c"])._values
58+
expected = Categorical(
59+
pd.array(["c"] * 3, dtype="string"),
60+
categories=pd.array(["a", "b", "c"], dtype="string"),
9261
)
93-
with tm.assert_produces_warning(FutureWarning, match=msg):
94-
result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values
95-
expected = Categorical(pd.array(["c", pd.NA], dtype="string"))
9662
tm.assert_categorical_equal(result, expected)
9763

9864

99-
def test_replace_maintain_ordering():
100-
# GH51016
101-
dtype = pd.CategoricalDtype([0, 1, 2], ordered=True)
102-
ser = pd.Series([0, 1, 2], dtype=dtype)
103-
msg = (
104-
r"The behavior of Series\.replace \(and DataFrame.replace\) "
105-
"with CategoricalDtype"
106-
)
107-
with tm.assert_produces_warning(FutureWarning, match=msg):
108-
result = ser.replace(0, 2)
109-
expected_dtype = pd.CategoricalDtype([1, 2], ordered=True)
110-
expected = pd.Series([2, 1, 2], dtype=expected_dtype)
111-
tm.assert_series_equal(expected, result, check_category_order=True)
65+
def test_replace_categorical_ea_dtype_different_cats_raises():
66+
# GH49404
67+
cat = Categorical(pd.array(["a", "b"], dtype="string"))
68+
with pytest.raises(
69+
TypeError, match="Cannot setitem on a Categorical with a new category"
70+
):
71+
pd.Series(cat).replace(["a", "b"], ["c", pd.NA])

pandas/tests/copy_view/test_replace.py

Lines changed: 12 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -129,18 +129,14 @@ def test_replace_to_replace_wrong_dtype():
129129
def test_replace_list_categorical():
130130
df = DataFrame({"a": ["a", "b", "c"]}, dtype="category")
131131
arr = get_array(df, "a")
132-
msg = (
133-
r"The behavior of Series\.replace \(and DataFrame.replace\) "
134-
"with CategoricalDtype"
135-
)
136-
with tm.assert_produces_warning(FutureWarning, match=msg):
137-
df.replace(["c"], value="a", inplace=True)
132+
133+
df.replace(["c"], value="a", inplace=True)
138134
assert np.shares_memory(arr.codes, get_array(df, "a").codes)
139135
assert df._mgr._has_no_reference(0)
140136

141137
df_orig = df.copy()
142-
with tm.assert_produces_warning(FutureWarning, match=msg):
143-
df2 = df.replace(["b"], value="a")
138+
df.replace(["b"], value="a")
139+
df2 = df.apply(lambda x: x.cat.rename_categories({"b": "d"}))
144140
assert not np.shares_memory(arr.codes, get_array(df2, "a").codes)
145141

146142
tm.assert_frame_equal(df, df_orig)
@@ -150,13 +146,7 @@ def test_replace_list_inplace_refs_categorical():
150146
df = DataFrame({"a": ["a", "b", "c"]}, dtype="category")
151147
view = df[:]
152148
df_orig = df.copy()
153-
msg = (
154-
r"The behavior of Series\.replace \(and DataFrame.replace\) "
155-
"with CategoricalDtype"
156-
)
157-
with tm.assert_produces_warning(FutureWarning, match=msg):
158-
df.replace(["c"], value="a", inplace=True)
159-
assert not np.shares_memory(get_array(view, "a").codes, get_array(df, "a").codes)
149+
df.replace(["c"], value="a", inplace=True)
160150
tm.assert_frame_equal(df_orig, view)
161151

162152

@@ -195,56 +185,34 @@ def test_replace_inplace_reference_no_op(to_replace):
195185

196186

197187
@pytest.mark.parametrize("to_replace", [1, [1]])
198-
@pytest.mark.parametrize("val", [1, 1.5])
199-
def test_replace_categorical_inplace_reference(val, to_replace):
188+
def test_replace_categorical_inplace_reference(to_replace):
200189
df = DataFrame({"a": Categorical([1, 2, 3])})
201190
df_orig = df.copy()
202191
arr_a = get_array(df, "a")
203192
view = df[:]
204-
msg = (
205-
r"The behavior of Series\.replace \(and DataFrame.replace\) "
206-
"with CategoricalDtype"
207-
)
208-
warn = FutureWarning if val == 1.5 else None
209-
with tm.assert_produces_warning(warn, match=msg):
210-
df.replace(to_replace=to_replace, value=val, inplace=True)
211-
193+
df.replace(to_replace=to_replace, value=1, inplace=True)
212194
assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes)
213195
assert df._mgr._has_no_reference(0)
214196
assert view._mgr._has_no_reference(0)
215197
tm.assert_frame_equal(view, df_orig)
216198

217199

218-
@pytest.mark.parametrize("val", [1, 1.5])
219-
def test_replace_categorical_inplace(val):
200+
def test_replace_categorical_inplace():
220201
df = DataFrame({"a": Categorical([1, 2, 3])})
221202
arr_a = get_array(df, "a")
222-
msg = (
223-
r"The behavior of Series\.replace \(and DataFrame.replace\) "
224-
"with CategoricalDtype"
225-
)
226-
warn = FutureWarning if val == 1.5 else None
227-
with tm.assert_produces_warning(warn, match=msg):
228-
df.replace(to_replace=1, value=val, inplace=True)
203+
df.replace(to_replace=1, value=1, inplace=True)
229204

230205
assert np.shares_memory(get_array(df, "a").codes, arr_a.codes)
231206
assert df._mgr._has_no_reference(0)
232207

233-
expected = DataFrame({"a": Categorical([val, 2, 3])})
208+
expected = DataFrame({"a": Categorical([1, 2, 3])})
234209
tm.assert_frame_equal(df, expected)
235210

236211

237-
@pytest.mark.parametrize("val", [1, 1.5])
238-
def test_replace_categorical(val):
212+
def test_replace_categorical():
239213
df = DataFrame({"a": Categorical([1, 2, 3])})
240214
df_orig = df.copy()
241-
msg = (
242-
r"The behavior of Series\.replace \(and DataFrame.replace\) "
243-
"with CategoricalDtype"
244-
)
245-
warn = FutureWarning if val == 1.5 else None
246-
with tm.assert_produces_warning(warn, match=msg):
247-
df2 = df.replace(to_replace=1, value=val)
215+
df2 = df.replace(to_replace=1, value=1)
248216

249217
assert df._mgr._has_no_reference(0)
250218
assert df2._mgr._has_no_reference(0)

0 commit comments

Comments
 (0)