API/BUG: always try to operate inplace when setting with loc/iloc[foo, bar] (#39163)

jbrockmendel · web-flow · commit 527c7893b8f5 · 2021-03-04T19:09:34.000-05:00
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -181,6 +181,46 @@ Preserve dtypes in  :meth:`~pandas.DataFrame.combine_first`
    combined.dtypes
 
 
+Try operating inplace when setting values with ``loc`` and ``iloc``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+When setting an entire column using ``loc`` or ``iloc``, pandas will try to
+insert the values into the existing data rather than create an entirely new array.
+
+.. ipython:: python
+
+   df = pd.DataFrame(range(3), columns=["A"], dtype="float64")
+   values = df.values
+   new = np.array([5, 6, 7], dtype="int64")
+   df.loc[[0, 1, 2], "A"] = new
+
+In both the new and old behavior, the data in ``values`` is overwritten, but in
+the old behavior the dtype of ``df["A"]`` changed to ``int64``.
+
+*pandas 1.2.x*
+
+.. code-block:: ipython
+
+   In [1]: df.dtypes
+   Out[1]:
+   A    int64
+   dtype: object
+   In [2]: np.shares_memory(df["A"].values, new)
+   Out[2]: False
+   In [3]: np.shares_memory(df["A"].values, values)
+   Out[3]: False
+
+In pandas 1.3.0, ``df`` continues to share data with ``values``.
+
+*pandas 1.3.0*
+
+.. ipython:: python
+
+   df.dtypes
+   np.shares_memory(df["A"], new)
+   np.shares_memory(df["A"], values)
+
+
 .. _whatsnew_130.notable_bug_fixes.setitem_with_bool_casting:
 
 Consistent Casting With Setting Into Boolean Series
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -1866,7 +1866,6 @@ def _setitem_single_column(self, loc: int, value, plane_indexer):
             ser = value
         elif is_array_like(value) and is_exact_shape_match(ser, value):
             ser = value
-
         else:
             # set the item, possibly having a dtype change
             ser = ser.copy()
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -92,6 +92,8 @@
     Categorical,
     DatetimeArray,
     ExtensionArray,
+    FloatingArray,
+    IntegerArray,
     PandasArray,
 )
 from pandas.core.base import PandasObject
@@ -994,6 +996,7 @@ def setitem(self, indexer, value):
         # length checking
         check_setitem_lengths(indexer, value, values)
         exact_match = is_exact_shape_match(values, arr_value)
+
         if is_empty_indexer(indexer, arr_value):
             # GH#8669 empty indexers
             pass
@@ -1007,27 +1010,21 @@ def setitem(self, indexer, value):
             # GH25495 - If the current dtype is not categorical,
             # we need to create a new categorical block
             values[indexer] = value
-            if values.ndim == 2:
-                # TODO(EA2D): special case not needed with 2D EAs
-                if values.shape[-1] != 1:
-                    # shouldn't get here (at least until 2D EAs)
-                    raise NotImplementedError
-                values = values[:, 0]
-            return self.make_block(Categorical(values, dtype=arr_value.dtype))
 
         elif exact_match and is_ea_value:
             # GH#32395 if we're going to replace the values entirely, just
             #  substitute in the new array
-            return self.make_block(arr_value)
+            if not self.is_object and isinstance(value, (IntegerArray, FloatingArray)):
+                values[indexer] = value.to_numpy(value.dtype.numpy_dtype)
+            else:
+                values[indexer] = np.asarray(value)
 
         # if we are an exact match (ex-broadcasting),
         # then use the resultant dtype
         elif exact_match:
             # We are setting _all_ of the array's values, so can cast to new dtype
             values[indexer] = value
 
-            values = values.astype(arr_value.dtype, copy=False)
-
         elif is_ea_value:
             # GH#38952
             if values.ndim == 1:
@@ -1892,6 +1889,10 @@ class NumericBlock(Block):
     is_numeric = True
 
     def _can_hold_element(self, element: Any) -> bool:
+        element = extract_array(element, extract_numpy=True)
+        if isinstance(element, (IntegerArray, FloatingArray)):
+            if element._mask.any():
+                return False
         return can_hold_element(self.dtype, element)
 
     @property
diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py
@@ -339,13 +339,20 @@ def test_setitem_with_expansion_dataframe_column(self, data, full_indexer):
 
         key = full_indexer(df)
         result.loc[key, "data"] = df["data"]
+
         self.assert_frame_equal(result, expected)
 
     def test_setitem_series(self, data, full_indexer):
         # https://github.com/pandas-dev/pandas/issues/32395
-        ser = expected = pd.Series(data, name="data")
+        ser = pd.Series(data, name="data")
         result = pd.Series(index=ser.index, dtype=object, name="data")
 
+        # because result has object dtype, the attempt to do setting inplace
+        #  is successful, and object dtype is retained
         key = full_indexer(ser)
         result.loc[key] = ser
+
+        expected = pd.Series(
+            data.astype(object), index=ser.index, name="data", dtype=object
+        )
         self.assert_series_equal(result, expected)
diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py
@@ -425,6 +425,23 @@ def test_setitem_slice(self, data, box_in_series):
     def test_setitem_loc_iloc_slice(self, data):
         super().test_setitem_loc_iloc_slice(data)
 
+    def test_setitem_with_expansion_dataframe_column(self, data, full_indexer):
+        # https://github.com/pandas-dev/pandas/issues/32395
+        df = expected = pd.DataFrame({"data": pd.Series(data)})
+        result = pd.DataFrame(index=df.index)
+
+        # because result has object dtype, the attempt to do setting inplace
+        #  is successful, and object dtype is retained
+        key = full_indexer(df)
+        result.loc[key, "data"] = df["data"]
+
+        # base class method has expected = df; PandasArray behaves oddly because
+        #  we patch _typ for these tests.
+        if data.dtype.numpy_dtype != object:
+            if not isinstance(key, slice) or key != slice(None):
+                expected = pd.DataFrame({"data": data.to_numpy()})
+        self.assert_frame_equal(result, expected)
+
 
 @skip_nested
 class TestParsing(BaseNumPyTests, base.BaseParsingTests):
diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py
@@ -67,33 +67,31 @@ def test_iloc_setitem_fullcol_categorical(self, indexer, key):
         frame = DataFrame({0: range(3)}, dtype=object)
 
         cat = Categorical(["alpha", "beta", "gamma"])
-        expected = DataFrame({0: cat})
-        # NB: pending GH#38896, the expected likely should become
-        #  expected= DataFrame({"A": cat.astype(object)})
-        # and should remain a view on the original values
 
         assert frame._mgr.blocks[0]._can_hold_element(cat)
 
         df = frame.copy()
         orig_vals = df.values
         indexer(df)[key, 0] = cat
 
-        overwrite = not isinstance(key, slice)
+        overwrite = isinstance(key, slice) and key == slice(None)
 
-        tm.assert_frame_equal(df, expected)
-
-        # TODO: this inconsistency is likely undesired GH#39986
         if overwrite:
-            # check that we overwrote underlying
-            tm.assert_numpy_array_equal(orig_vals, df.values)
+            # TODO: GH#39986 this probably shouldn't behave differently
+            expected = DataFrame({0: cat})
+            assert not np.shares_memory(df.values, orig_vals)
+        else:
+            expected = DataFrame({0: cat}).astype(object)
+            assert np.shares_memory(df.values, orig_vals)
 
-        # but we don't have a view on orig_vals
-        orig_vals[0, 0] = 19
-        assert df.iloc[0, 0] != 19
+        tm.assert_frame_equal(df, expected)
 
         # check we dont have a view on cat (may be undesired GH#39986)
         df.iloc[0, 0] = "gamma"
-        assert cat[0] != "gamma"
+        if overwrite:
+            assert cat[0] != "gamma"
+        else:
+            assert cat[0] != "gamma"
 
     @pytest.mark.parametrize("box", [pd_array, Series])
     def test_iloc_setitem_ea_inplace(self, frame_or_series, box):
diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py
@@ -620,6 +620,7 @@ def test_float_index_non_scalar_assignment(self):
         expected = DataFrame({"a": [1, 1, 3], "b": [1, 1, 5]}, index=df.index)
         tm.assert_frame_equal(expected, df)
 
+    def test_loc_setitem_fullindex_views(self):
         df = DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}, index=[1.0, 2.0, 3.0])
         df2 = df.copy()
         df.loc[df.index] = df.loc[df.index]
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
@@ -588,32 +588,19 @@ def test_loc_modify_datetime(self):
 
         tm.assert_frame_equal(df, expected)
 
-    def test_loc_setitem_frame(self):
-        df = DataFrame(np.random.randn(4, 4), index=list("abcd"), columns=list("ABCD"))
-
-        df.loc["a", "A"] = 1
-        result = df.loc["a", "A"]
-        assert result == 1
-
-        result = df.iloc[0, 0]
-        assert result == 1
-
-        df.loc[:, "B":"D"] = 0
-        expected = df.loc[:, "B":"D"]
-        result = df.iloc[:, 1:]
-        tm.assert_frame_equal(result, expected)
-
-        # GH 6254
-        # setting issue
-        df = DataFrame(index=[3, 5, 4], columns=["A"])
+    def test_loc_setitem_frame_with_reindex(self):
+        # GH#6254 setting issue
+        df = DataFrame(index=[3, 5, 4], columns=["A"], dtype=float)
         df.loc[[4, 3, 5], "A"] = np.array([1, 2, 3], dtype="int64")
-        expected = DataFrame({"A": Series([1, 2, 3], index=[4, 3, 5])}).reindex(
-            index=[3, 5, 4]
-        )
+
+        # setting integer values into a float dataframe with loc is inplace,
+        #  so we retain float dtype
+        ser = Series([2, 3, 1], index=[3, 5, 4], dtype=float)
+        expected = DataFrame({"A": ser})
         tm.assert_frame_equal(df, expected)
 
-        # GH 6252
-        # setting with an empty frame
+    def test_loc_setitem_empty_frame(self):
+        # GH#6252 setting with an empty frame
         keys1 = ["@" + str(i) for i in range(5)]
         val1 = np.arange(5, dtype="int64")
 
@@ -628,18 +615,39 @@ def test_loc_setitem_frame(self):
         df["B"] = np.nan
         df.loc[keys2, "B"] = val2
 
-        expected = DataFrame(
-            {"A": Series(val1, index=keys1), "B": Series(val2, index=keys2)}
-        ).reindex(index=index)
+        # Because df["A"] was initialized as float64, setting values into it
+        #  is inplace, so that dtype is retained
+        sera = Series(val1, index=keys1, dtype=np.float64)
+        serb = Series(val2, index=keys2)
+        expected = DataFrame({"A": sera, "B": serb}).reindex(index=index)
         tm.assert_frame_equal(df, expected)
 
+    def test_loc_setitem_frame(self):
+        df = DataFrame(np.random.randn(4, 4), index=list("abcd"), columns=list("ABCD"))
+
+        result = df.iloc[0, 0]
+
+        df.loc["a", "A"] = 1
+        result = df.loc["a", "A"]
+        assert result == 1
+
+        result = df.iloc[0, 0]
+        assert result == 1
+
+        df.loc[:, "B":"D"] = 0
+        expected = df.loc[:, "B":"D"]
+        result = df.iloc[:, 1:]
+        tm.assert_frame_equal(result, expected)
+
+    def test_loc_setitem_frame_nan_int_coercion_invalid(self):
         # GH 8669
         # invalid coercion of nan -> int
         df = DataFrame({"A": [1, 2, 3], "B": np.nan})
         df.loc[df.B > df.A, "B"] = df.A
         expected = DataFrame({"A": [1, 2, 3], "B": np.nan})
         tm.assert_frame_equal(df, expected)
 
+    def test_loc_setitem_frame_mixed_labels(self):
         # GH 6546
         # setting with mixed labels
         df = DataFrame({1: [1, 2], 2: [3, 4], "a": ["a", "b"]})
@@ -1063,8 +1071,15 @@ def test_loc_setitem_str_to_small_float_conversion_type(self):
         expected = DataFrame(col_data, columns=["A"], dtype=object)
         tm.assert_frame_equal(result, expected)
 
-        # change the dtype of the elements from object to float one by one
+        # assigning with loc/iloc attempts to set the values inplace, which
+        #  in this case is successful
         result.loc[result.index, "A"] = [float(x) for x in col_data]
+        expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object)
+        tm.assert_frame_equal(result, expected)
+
+        # assigning the entire column using __setitem__ swaps in the new array
+        # GH#???
+        result["A"] = [float(x) for x in col_data]
         expected = DataFrame(col_data, columns=["A"], dtype=float)
         tm.assert_frame_equal(result, expected)
 
@@ -1219,7 +1234,9 @@ def test_loc_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture):
         tz = tz_naive_fixture
         idx = date_range(start="2015-07-12", periods=3, freq="H", tz=tz)
         expected = DataFrame(1.2, index=idx, columns=["var"])
-        result = DataFrame(index=idx, columns=["var"])
+        # if result started off with object dtype, then the .loc.__setitem__
+        #  below would retain object dtype
+        result = DataFrame(index=idx, columns=["var"], dtype=np.float64)
         result.loc[:, idxer] = expected
         tm.assert_frame_equal(result, expected)