Skip to content

Commit 527c789

Browse files
authored
API/BUG: always try to operate inplace when setting with loc/iloc[foo, bar] (#39163)
1 parent 45c8090 commit 527c789

File tree

8 files changed

+134
-54
lines changed

8 files changed

+134
-54
lines changed

doc/source/whatsnew/v1.3.0.rst

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,46 @@ Preserve dtypes in :meth:`~pandas.DataFrame.combine_first`
181181
combined.dtypes
182182
183183
184+
Try operating inplace when setting values with ``loc`` and ``iloc``
185+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
186+
187+
When setting an entire column using ``loc`` or ``iloc``, pandas will try to
188+
insert the values into the existing data rather than create an entirely new array.
189+
190+
.. ipython:: python
191+
192+
df = pd.DataFrame(range(3), columns=["A"], dtype="float64")
193+
values = df.values
194+
new = np.array([5, 6, 7], dtype="int64")
195+
df.loc[[0, 1, 2], "A"] = new
196+
197+
In both the new and old behavior, the data in ``values`` is overwritten, but in
198+
the old behavior the dtype of ``df["A"]`` changed to ``int64``.
199+
200+
*pandas 1.2.x*
201+
202+
.. code-block:: ipython
203+
204+
In [1]: df.dtypes
205+
Out[1]:
206+
A int64
207+
dtype: object
208+
In [2]: np.shares_memory(df["A"].values, new)
209+
Out[2]: False
210+
In [3]: np.shares_memory(df["A"].values, values)
211+
Out[3]: False
212+
213+
In pandas 1.3.0, ``df`` continues to share data with ``values``.
214+
215+
*pandas 1.3.0*
216+
217+
.. ipython:: python
218+
219+
df.dtypes
220+
np.shares_memory(df["A"], new)
221+
np.shares_memory(df["A"], values)
222+
223+
184224
.. _whatsnew_130.notable_bug_fixes.setitem_with_bool_casting:
185225

186226
Consistent Casting With Setting Into Boolean Series

pandas/core/indexing.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1866,7 +1866,6 @@ def _setitem_single_column(self, loc: int, value, plane_indexer):
18661866
ser = value
18671867
elif is_array_like(value) and is_exact_shape_match(ser, value):
18681868
ser = value
1869-
18701869
else:
18711870
# set the item, possibly having a dtype change
18721871
ser = ser.copy()

pandas/core/internals/blocks.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@
9292
Categorical,
9393
DatetimeArray,
9494
ExtensionArray,
95+
FloatingArray,
96+
IntegerArray,
9597
PandasArray,
9698
)
9799
from pandas.core.base import PandasObject
@@ -994,6 +996,7 @@ def setitem(self, indexer, value):
994996
# length checking
995997
check_setitem_lengths(indexer, value, values)
996998
exact_match = is_exact_shape_match(values, arr_value)
999+
9971000
if is_empty_indexer(indexer, arr_value):
9981001
# GH#8669 empty indexers
9991002
pass
@@ -1007,27 +1010,21 @@ def setitem(self, indexer, value):
10071010
# GH25495 - If the current dtype is not categorical,
10081011
# we need to create a new categorical block
10091012
values[indexer] = value
1010-
if values.ndim == 2:
1011-
# TODO(EA2D): special case not needed with 2D EAs
1012-
if values.shape[-1] != 1:
1013-
# shouldn't get here (at least until 2D EAs)
1014-
raise NotImplementedError
1015-
values = values[:, 0]
1016-
return self.make_block(Categorical(values, dtype=arr_value.dtype))
10171013

10181014
elif exact_match and is_ea_value:
10191015
# GH#32395 if we're going to replace the values entirely, just
10201016
# substitute in the new array
1021-
return self.make_block(arr_value)
1017+
if not self.is_object and isinstance(value, (IntegerArray, FloatingArray)):
1018+
values[indexer] = value.to_numpy(value.dtype.numpy_dtype)
1019+
else:
1020+
values[indexer] = np.asarray(value)
10221021

10231022
# if we are an exact match (ex-broadcasting),
10241023
# then use the resultant dtype
10251024
elif exact_match:
10261025
# We are setting _all_ of the array's values, so can cast to new dtype
10271026
values[indexer] = value
10281027

1029-
values = values.astype(arr_value.dtype, copy=False)
1030-
10311028
elif is_ea_value:
10321029
# GH#38952
10331030
if values.ndim == 1:
@@ -1892,6 +1889,10 @@ class NumericBlock(Block):
18921889
is_numeric = True
18931890

18941891
def _can_hold_element(self, element: Any) -> bool:
1892+
element = extract_array(element, extract_numpy=True)
1893+
if isinstance(element, (IntegerArray, FloatingArray)):
1894+
if element._mask.any():
1895+
return False
18951896
return can_hold_element(self.dtype, element)
18961897

18971898
@property

pandas/tests/extension/base/setitem.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -339,13 +339,20 @@ def test_setitem_with_expansion_dataframe_column(self, data, full_indexer):
339339

340340
key = full_indexer(df)
341341
result.loc[key, "data"] = df["data"]
342+
342343
self.assert_frame_equal(result, expected)
343344

344345
def test_setitem_series(self, data, full_indexer):
345346
# https://github.com/pandas-dev/pandas/issues/32395
346-
ser = expected = pd.Series(data, name="data")
347+
ser = pd.Series(data, name="data")
347348
result = pd.Series(index=ser.index, dtype=object, name="data")
348349

350+
# because result has object dtype, the attempt to do setting inplace
351+
# is successful, and object dtype is retained
349352
key = full_indexer(ser)
350353
result.loc[key] = ser
354+
355+
expected = pd.Series(
356+
data.astype(object), index=ser.index, name="data", dtype=object
357+
)
351358
self.assert_series_equal(result, expected)

pandas/tests/extension/test_numpy.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,23 @@ def test_setitem_slice(self, data, box_in_series):
425425
def test_setitem_loc_iloc_slice(self, data):
426426
super().test_setitem_loc_iloc_slice(data)
427427

428+
def test_setitem_with_expansion_dataframe_column(self, data, full_indexer):
429+
# https://github.com/pandas-dev/pandas/issues/32395
430+
df = expected = pd.DataFrame({"data": pd.Series(data)})
431+
result = pd.DataFrame(index=df.index)
432+
433+
# because result has object dtype, the attempt to do setting inplace
434+
# is successful, and object dtype is retained
435+
key = full_indexer(df)
436+
result.loc[key, "data"] = df["data"]
437+
438+
# base class method has expected = df; PandasArray behaves oddly because
439+
# we patch _typ for these tests.
440+
if data.dtype.numpy_dtype != object:
441+
if not isinstance(key, slice) or key != slice(None):
442+
expected = pd.DataFrame({"data": data.to_numpy()})
443+
self.assert_frame_equal(result, expected)
444+
428445

429446
@skip_nested
430447
class TestParsing(BaseNumPyTests, base.BaseParsingTests):

pandas/tests/indexing/test_iloc.py

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -67,33 +67,31 @@ def test_iloc_setitem_fullcol_categorical(self, indexer, key):
6767
frame = DataFrame({0: range(3)}, dtype=object)
6868

6969
cat = Categorical(["alpha", "beta", "gamma"])
70-
expected = DataFrame({0: cat})
71-
# NB: pending GH#38896, the expected likely should become
72-
# expected= DataFrame({"A": cat.astype(object)})
73-
# and should remain a view on the original values
7470

7571
assert frame._mgr.blocks[0]._can_hold_element(cat)
7672

7773
df = frame.copy()
7874
orig_vals = df.values
7975
indexer(df)[key, 0] = cat
8076

81-
overwrite = not isinstance(key, slice)
77+
overwrite = isinstance(key, slice) and key == slice(None)
8278

83-
tm.assert_frame_equal(df, expected)
84-
85-
# TODO: this inconsistency is likely undesired GH#39986
8679
if overwrite:
87-
# check that we overwrote underlying
88-
tm.assert_numpy_array_equal(orig_vals, df.values)
80+
# TODO: GH#39986 this probably shouldn't behave differently
81+
expected = DataFrame({0: cat})
82+
assert not np.shares_memory(df.values, orig_vals)
83+
else:
84+
expected = DataFrame({0: cat}).astype(object)
85+
assert np.shares_memory(df.values, orig_vals)
8986

90-
# but we don't have a view on orig_vals
91-
orig_vals[0, 0] = 19
92-
assert df.iloc[0, 0] != 19
87+
tm.assert_frame_equal(df, expected)
9388

9489
# check we don't have a view on cat (may be undesired GH#39986)
9590
df.iloc[0, 0] = "gamma"
96-
assert cat[0] != "gamma"
91+
if overwrite:
92+
assert cat[0] != "gamma"
93+
else:
94+
assert cat[0] != "gamma"
9795

9896
@pytest.mark.parametrize("box", [pd_array, Series])
9997
def test_iloc_setitem_ea_inplace(self, frame_or_series, box):

pandas/tests/indexing/test_indexing.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -620,6 +620,7 @@ def test_float_index_non_scalar_assignment(self):
620620
expected = DataFrame({"a": [1, 1, 3], "b": [1, 1, 5]}, index=df.index)
621621
tm.assert_frame_equal(expected, df)
622622

623+
def test_loc_setitem_fullindex_views(self):
623624
df = DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}, index=[1.0, 2.0, 3.0])
624625
df2 = df.copy()
625626
df.loc[df.index] = df.loc[df.index]

pandas/tests/indexing/test_loc.py

Lines changed: 45 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -588,32 +588,19 @@ def test_loc_modify_datetime(self):
588588

589589
tm.assert_frame_equal(df, expected)
590590

591-
def test_loc_setitem_frame(self):
592-
df = DataFrame(np.random.randn(4, 4), index=list("abcd"), columns=list("ABCD"))
593-
594-
df.loc["a", "A"] = 1
595-
result = df.loc["a", "A"]
596-
assert result == 1
597-
598-
result = df.iloc[0, 0]
599-
assert result == 1
600-
601-
df.loc[:, "B":"D"] = 0
602-
expected = df.loc[:, "B":"D"]
603-
result = df.iloc[:, 1:]
604-
tm.assert_frame_equal(result, expected)
605-
606-
# GH 6254
607-
# setting issue
608-
df = DataFrame(index=[3, 5, 4], columns=["A"])
591+
def test_loc_setitem_frame_with_reindex(self):
592+
# GH#6254 setting issue
593+
df = DataFrame(index=[3, 5, 4], columns=["A"], dtype=float)
609594
df.loc[[4, 3, 5], "A"] = np.array([1, 2, 3], dtype="int64")
610-
expected = DataFrame({"A": Series([1, 2, 3], index=[4, 3, 5])}).reindex(
611-
index=[3, 5, 4]
612-
)
595+
596+
# setting integer values into a float dataframe with loc is inplace,
597+
# so we retain float dtype
598+
ser = Series([2, 3, 1], index=[3, 5, 4], dtype=float)
599+
expected = DataFrame({"A": ser})
613600
tm.assert_frame_equal(df, expected)
614601

615-
# GH 6252
616-
# setting with an empty frame
602+
def test_loc_setitem_empty_frame(self):
603+
# GH#6252 setting with an empty frame
617604
keys1 = ["@" + str(i) for i in range(5)]
618605
val1 = np.arange(5, dtype="int64")
619606

@@ -628,18 +615,39 @@ def test_loc_setitem_frame(self):
628615
df["B"] = np.nan
629616
df.loc[keys2, "B"] = val2
630617

631-
expected = DataFrame(
632-
{"A": Series(val1, index=keys1), "B": Series(val2, index=keys2)}
633-
).reindex(index=index)
618+
# Because df["A"] was initialized as float64, setting values into it
619+
# is inplace, so that dtype is retained
620+
sera = Series(val1, index=keys1, dtype=np.float64)
621+
serb = Series(val2, index=keys2)
622+
expected = DataFrame({"A": sera, "B": serb}).reindex(index=index)
634623
tm.assert_frame_equal(df, expected)
635624

625+
def test_loc_setitem_frame(self):
626+
df = DataFrame(np.random.randn(4, 4), index=list("abcd"), columns=list("ABCD"))
627+
628+
result = df.iloc[0, 0]
629+
630+
df.loc["a", "A"] = 1
631+
result = df.loc["a", "A"]
632+
assert result == 1
633+
634+
result = df.iloc[0, 0]
635+
assert result == 1
636+
637+
df.loc[:, "B":"D"] = 0
638+
expected = df.loc[:, "B":"D"]
639+
result = df.iloc[:, 1:]
640+
tm.assert_frame_equal(result, expected)
641+
642+
def test_loc_setitem_frame_nan_int_coercion_invalid(self):
636643
# GH 8669
637644
# invalid coercion of nan -> int
638645
df = DataFrame({"A": [1, 2, 3], "B": np.nan})
639646
df.loc[df.B > df.A, "B"] = df.A
640647
expected = DataFrame({"A": [1, 2, 3], "B": np.nan})
641648
tm.assert_frame_equal(df, expected)
642649

650+
def test_loc_setitem_frame_mixed_labels(self):
643651
# GH 6546
644652
# setting with mixed labels
645653
df = DataFrame({1: [1, 2], 2: [3, 4], "a": ["a", "b"]})
@@ -1063,8 +1071,15 @@ def test_loc_setitem_str_to_small_float_conversion_type(self):
10631071
expected = DataFrame(col_data, columns=["A"], dtype=object)
10641072
tm.assert_frame_equal(result, expected)
10651073

1066-
# change the dtype of the elements from object to float one by one
1074+
# assigning with loc/iloc attempts to set the values inplace, which
1075+
# in this case is successful
10671076
result.loc[result.index, "A"] = [float(x) for x in col_data]
1077+
expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object)
1078+
tm.assert_frame_equal(result, expected)
1079+
1080+
# assigning the entire column using __setitem__ swaps in the new array
1081+
# GH#???
1082+
result["A"] = [float(x) for x in col_data]
10681083
expected = DataFrame(col_data, columns=["A"], dtype=float)
10691084
tm.assert_frame_equal(result, expected)
10701085

@@ -1219,7 +1234,9 @@ def test_loc_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture):
12191234
tz = tz_naive_fixture
12201235
idx = date_range(start="2015-07-12", periods=3, freq="H", tz=tz)
12211236
expected = DataFrame(1.2, index=idx, columns=["var"])
1222-
result = DataFrame(index=idx, columns=["var"])
1237+
# if result started off with object dtype, then the .loc.__setitem__
1238+
# below would retain object dtype
1239+
result = DataFrame(index=idx, columns=["var"], dtype=np.float64)
12231240
result.loc[:, idxer] = expected
12241241
tm.assert_frame_equal(result, expected)
12251242

0 commit comments

Comments
 (0)