|
| 1 | +""" |
| 2 | +Tests for values coercion in setitem-like operations on DataFrame. |
| 3 | +
|
| 4 | +For the most part, these should be multi-column DataFrames, otherwise |
| 5 | +we would share the tests with Series. |
| 6 | +""" |
| 7 | +import numpy as np |
| 8 | +import pytest |
| 9 | + |
| 10 | +import pandas as pd |
| 11 | +from pandas import ( |
| 12 | + DataFrame, |
| 13 | + MultiIndex, |
| 14 | + NaT, |
| 15 | + Series, |
| 16 | + Timestamp, |
| 17 | + date_range, |
| 18 | +) |
| 19 | +import pandas._testing as tm |
| 20 | + |
| 21 | + |
class TestDataFrameSetitemCoercion:
    """Dtype-preservation checks for setitem-like operations on DataFrame."""

    @pytest.mark.xfail(reason="Unnecessary cast.")
    @pytest.mark.parametrize("consolidate", [True, False])
    def test_loc_setitem_multiindex_columns(self, consolidate):
        """
        Check that ``loc``-assigning float32 values into several MultiIndex
        columns leaves every column as float32.

        GH#18415: setting values in a single column preserves dtype, while
        setting them in multiple columns did an unwanted cast.
        """
        base = DataFrame(np.zeros((6, 5), dtype=np.float32))
        # Note that the concatenated frame has 2 blocks; with
        # ``consolidate=True`` the same checks run on a consolidated frame.
        frame = pd.concat([base, base], axis=1, keys=[1, 2])
        if consolidate:
            frame = frame._consolidate()

        col_key = (1, slice(2, 3))
        # Each case pairs a row indexer with the row count of the array that
        # is assigned through it.
        cases = [
            (slice(2, 3), 2),
            (slice(0, 5), 6),
            (slice(None), 6),
        ]
        for row_key, n_rows in cases:
            frame.loc[row_key, col_key] = np.ones((n_rows, 2), dtype=np.float32)
            assert (frame.dtypes == np.float32).all()

        # TODO: I think this isn't about MultiIndex and could be done with iloc?
| 46 | + |
| 47 | + |
def test_37477():
    """
    Setting a float into an integer column gives the same frame whether it
    is done through ``at``, ``loc``, ``iat`` or ``iloc``.
    """
    # fixed by GH#45121
    orig = DataFrame({"A": [1, 2, 3], "B": [3, 4, 5]})
    expected = DataFrame({"A": [1, 2, 3], "B": [3, 1.2, 5]})

    # (accessor name, key) pairs: the label-based indexers first, then their
    # positional counterparts. Every case starts from a fresh copy so the
    # assignments cannot influence each other.
    cases = [
        ("at", (1, "B")),
        ("loc", (1, "B")),
        ("iat", (1, 1)),
        ("iloc", (1, 1)),
    ]
    for accessor, key in cases:
        df = orig.copy()
        getattr(df, accessor)[key] = 1.2
        tm.assert_frame_equal(df, expected)
| 68 | + |
| 69 | + |
| 70 | +def test_6942(indexer_al): |
| 71 | + # check that the .at __setitem__ after setting "Live" actually sets the data |
| 72 | + start = Timestamp("2014-04-01") |
| 73 | + t1 = Timestamp("2014-04-23 12:42:38.883082") |
| 74 | + t2 = Timestamp("2014-04-24 01:33:30.040039") |
| 75 | + |
| 76 | + dti = date_range(start, periods=1) |
| 77 | + orig = DataFrame(index=dti, columns=["timenow", "Live"]) |
| 78 | + |
| 79 | + df = orig.copy() |
| 80 | + indexer_al(df)[start, "timenow"] = t1 |
| 81 | + |
| 82 | + df["Live"] = True |
| 83 | + |
| 84 | + df.at[start, "timenow"] = t2 |
| 85 | + assert df.iloc[0, 0] == t2 |
| 86 | + |
| 87 | + |
| 88 | +def test_26395(indexer_al): |
| 89 | + # .at case fixed by GH#45121 (best guess) |
| 90 | + df = DataFrame(index=["A", "B", "C"]) |
| 91 | + df["D"] = 0 |
| 92 | + |
| 93 | + indexer_al(df)["C", "D"] = 2 |
| 94 | + expected = DataFrame({"D": [0, 0, 2]}, index=["A", "B", "C"], dtype=np.int64) |
| 95 | + tm.assert_frame_equal(df, expected) |
| 96 | + |
| 97 | + indexer_al(df)["C", "D"] = 44.5 |
| 98 | + expected = DataFrame({"D": [0, 0, 44.5]}, index=["A", "B", "C"], dtype=np.float64) |
| 99 | + tm.assert_frame_equal(df, expected) |
| 100 | + |
| 101 | + indexer_al(df)["C", "D"] = "hello" |
| 102 | + expected = DataFrame({"D": [0, 0, "hello"]}, index=["A", "B", "C"], dtype=object) |
| 103 | + tm.assert_frame_equal(df, expected) |
| 104 | + |
| 105 | + |
@pytest.mark.xfail(reason="unwanted upcast")
def test_15231():
    """
    Enlarging a frame via ``loc`` with a Series row should only change the
    dtype of columns the Series does not cover.
    """
    df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"])

    # A row that supplies both columns: everything is expected to stay int64.
    full_row = Series({"a": 5, "b": 6})
    df.loc[2] = full_row
    still_int64 = df.dtypes == np.int64
    assert still_int64.all()

    # A row that supplies only "a".
    partial_row = Series({"a": 7})
    df.loc[3] = partial_row

    # df["a"] doesn't have any NaNs, should not have been cast
    exp_dtypes = Series([np.int64, np.float64], dtype=object, index=["a", "b"])
    tm.assert_series_equal(df.dtypes, exp_dtypes)
| 117 | + |
| 118 | + |
@pytest.mark.xfail(reason="Unnecessarily upcasts to float64")
def test_iloc_setitem_unnecesssary_float_upcasting():
    """
    Writing a float32 column's own values back through ``iloc`` should leave
    the frame unchanged.
    """
    # GH#12255
    float_cols = {
        0: np.array([1, 3], dtype=np.float32),
        1: np.array([2, 4], dtype=np.float32),
    }
    df = DataFrame({**float_cols, 2: ["a", "b"]})
    snapshot = df.copy()

    # Column 0's values, reshaped to (2, 1) to match the one-column slice.
    column_as_2d = df[0].values.reshape(2, 1)
    df.iloc[:, 0:1] = column_as_2d

    tm.assert_frame_equal(df, snapshot)
| 135 | + |
| 136 | + |
@pytest.mark.xfail(reason="unwanted casting to dt64")
def test_12499():
    """
    Appending a ``[np.nan, NaT]`` row via ``loc`` should keep the "two"
    column as ``datetime64[ns, UTC]``.

    Both ``df.loc[1] = ...`` and ``df.loc[1, :] = ...`` are checked against
    the same expected frame.
    """
    # TODO: OP in GH#12499 used np.datetime64("NaT") instead of pd.NaT,
    # which has consequences for the expected df["two"] (though i think at
    # the time it might not have because of a separate bug). See if it makes
    # a difference which one we use here.
    ts = Timestamp("2016-03-01 03:13:22.98986", tz="UTC")

    orig = DataFrame([{"one": 0, "two": ts}])
    expected = DataFrame(
        {"one": [0, np.nan], "two": Series([ts, NaT], dtype="datetime64[ns, UTC]")}
    )

    # Enlarge with a bare row label.
    df = orig.copy()
    df.loc[1] = [np.nan, NaT]
    tm.assert_frame_equal(df, expected)

    # Same enlargement, spelled with an explicit full-column slice.
    df = orig.copy()
    df.loc[1, :] = [np.nan, NaT]
    tm.assert_frame_equal(df, expected)
| 159 | + |
| 160 | + |
@pytest.mark.xfail(reason="Too many columns cast to float64")
def test_20476():
    """
    Assigning a mixed int/float frame to the top-level key "A" of a
    MultiIndex-column frame should only make the column that receives
    floats float64.

    Checks the resulting dtypes first, then the full frame contents.
    """
    mi = MultiIndex.from_product([["A", "B"], ["a", "b", "c"]])
    df = DataFrame(-1, index=range(3), columns=mi)
    filler = DataFrame([[1, 2, 3.0]] * 3, index=range(3), columns=["a", "b", "c"])
    df["A"] = filler

    # Built with positional column keys, then relabelled with ``mi``: the
    # first three columns hold the filler values, the last three stay at -1.
    expected = DataFrame(
        {
            0: [1, 1, 1],
            1: [2, 2, 2],
            2: [3.0, 3.0, 3.0],
            3: [-1, -1, -1],
            4: [-1, -1, -1],
            5: [-1, -1, -1],
        }
    )
    expected.columns = mi
    exp_dtypes = Series(
        [np.dtype(np.int64)] * 2 + [np.dtype(np.float64)] + [np.dtype(np.int64)] * 3,
        index=mi,
    )
    tm.assert_series_equal(df.dtypes, exp_dtypes)
    # Also compare the values, so ``expected`` is actually used.
    tm.assert_frame_equal(df, expected)
0 commit comments