diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 760da36a30075..c32eda4928da7 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -652,7 +652,9 @@ class Rank: ] def setup(self, dtype): - self.df = DataFrame(np.random.randn(10000, 10), columns=range(10), dtype=dtype) + self.df = DataFrame( + np.random.randn(10000, 10).astype(dtype), columns=range(10), dtype=dtype + ) def time_rank(self, dtype): self.df.rank() diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index b36499c340fd9..0bca312c0bdce 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -700,6 +700,7 @@ Deprecations - Deprecated passing arguments as positional in :meth:`DataFrame.reset_index` (other than ``"level"``) and :meth:`Series.reset_index` (:issue:`41485`) - Deprecated construction of :class:`Series` or :class:`DataFrame` with ``DatetimeTZDtype`` data and ``datetime64[ns]`` dtype. Use ``Series(data).dt.tz_localize(None)`` instead (:issue:`41555`,:issue:`33401`) - Deprecated behavior of :class:`Series` construction with large-integer values and small-integer dtype silently overflowing; use ``Series(data).astype(dtype)`` instead (:issue:`41734`) +- Deprecated behavior of :class:`DataFrame` construction with floating data and integer dtype casting even when lossy; in a future version this will remain floating, matching :class:`Series` behavior (:issue:`41770`) - Deprecated inference of ``timedelta64[ns]``, ``datetime64[ns]``, or ``DatetimeTZDtype`` dtypes in :class:`Series` construction when data containing strings is passed and no ``dtype`` is passed (:issue:`33558`) - In a future version, constructing :class:`Series` or :class:`DataFrame` with ``datetime64[ns]`` data and ``DatetimeTZDtype`` will treat the data as wall-times instead of as UTC times (matching DatetimeIndex behavior). To treat the data as UTC times, use ``pd.Series(data).dt.tz_localize("UTC").dt.tz_convert(dtype.tz)`` or ``pd.Series(data.view("int64"), dtype=dtype)`` (:issue:`33401`) - Deprecated passing arguments as positional in :meth:`DataFrame.set_axis` and :meth:`Series.set_axis` (other than ``"labels"``) (:issue:`41485`) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index edaa53cd55042..c877d27fd2392 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -24,6 +24,7 @@ Dtype, DtypeObj, ) +from pandas.errors import IntCastingNaNError from pandas.core.dtypes.base import ( ExtensionDtype, @@ -511,7 +512,24 @@ def sanitize_array( # possibility of nan -> garbage try: subarr = _try_cast(data, dtype, copy, True) + except IntCastingNaNError: + subarr = np.array(data, copy=copy) except ValueError: + if not raise_cast_failure: + # i.e. called via DataFrame constructor + warnings.warn( + "In a future version, passing float-dtype values and an " + "integer dtype to DataFrame will retain floating dtype " + "if they cannot be cast losslessly (matching Series behavior). " + "To retain the old behavior, use DataFrame(data).astype(dtype)", + FutureWarning, + stacklevel=4, + ) + # GH#40110 until the deprecation is enforced, we _dont_ + # ignore the dtype for DataFrame, and _do_ cast even though + # it is lossy. + dtype = cast(np.dtype, dtype) + return np.array(data, dtype=dtype, copy=copy) subarr = np.array(data, copy=copy) else: # we will try to copy by-definition here diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 161572f3f1ac3..177b1ccd166cb 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -2088,7 +2088,13 @@ def maybe_cast_to_integer_array( if is_unsigned_integer_dtype(dtype) and (arr < 0).any(): raise OverflowError("Trying to coerce negative values to unsigned integers") - if is_float_dtype(arr.dtype) or is_object_dtype(arr.dtype): + if is_float_dtype(arr.dtype): + if not np.isfinite(arr).all(): + raise IntCastingNaNError( + "Cannot convert non-finite values (NA or inf) to integer" + ) + raise ValueError("Trying to coerce float values to integers") + if is_object_dtype(arr.dtype): raise ValueError("Trying to coerce float values to integers") if casted.dtype < arr.dtype: @@ -2102,6 +2108,17 @@ def maybe_cast_to_integer_array( ) return casted + if arr.dtype.kind in ["m", "M"]: + # test_constructor_maskedarray_nonfloat + warnings.warn( + f"Constructing Series or DataFrame from {arr.dtype} values and " + f"dtype={dtype} is deprecated and will raise in a future version. " + "Use values.view(dtype) instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + return casted + # No known cases that get here, but raising explicitly to cover our bases. raise ValueError(f"values cannot be losslessly cast to {dtype}") diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 270eddf2bd3a5..81bf3ca4ba07a 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -22,11 +22,9 @@ DtypeObj, Manager, ) -from pandas.errors import IntCastingNaNError from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, - construct_1d_ndarray_preserving_na, maybe_cast_to_datetime, maybe_convert_platform, maybe_infer_to_datetimelike, @@ -303,22 +301,12 @@ def ndarray_to_mgr( shape = values.shape flat = values.ravel() - if not is_integer_dtype(dtype): - # TODO: skipping integer_dtype is needed to keep the tests passing, - # not clear it is correct - # Note: we really only need _try_cast, but keeping to exposed funcs - values = sanitize_array( - flat, None, dtype=dtype, copy=copy, raise_cast_failure=True - ) - else: - try: - values = construct_1d_ndarray_preserving_na( - flat, dtype=dtype, copy=False - ) - except IntCastingNaNError: - # following Series, we ignore the dtype and retain floating - # values instead of casting nans to meaningless ints - pass + # GH#40110 see similar check inside sanitize_array + rcf = not (is_integer_dtype(dtype) and values.dtype.kind == "f") + + values = sanitize_array( + flat, None, dtype=dtype, copy=copy, raise_cast_failure=rcf + ) values = values.reshape(shape) diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 6e176310da6b4..dac3c0382df01 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -603,7 +603,7 @@ def test_sort_index_level_large_cardinality(self): # GH#2684 (int64) index = MultiIndex.from_arrays([np.arange(4000)] * 3) - df = DataFrame(np.random.randn(4000), index=index, dtype=np.int64) + df = DataFrame(np.random.randn(4000).astype("int64"), index=index) # it works! result = df.sort_index(level=0) @@ -611,7 +611,7 @@ def test_sort_index_level_large_cardinality(self): # GH#2684 (int32) index = MultiIndex.from_arrays([np.arange(4000)] * 3) - df = DataFrame(np.random.randn(4000), index=index, dtype=np.int32) + df = DataFrame(np.random.randn(4000).astype("int32"), index=index) # it works! result = df.sort_index(level=0) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 769b08373b890..5156d0371e9b7 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -714,7 +714,9 @@ def create_cols(name): np.random.randn(100, 5), dtype="float64", columns=create_cols("float") ) df_int = DataFrame( - np.random.randn(100, 5), dtype="int64", columns=create_cols("int") + np.random.randn(100, 5).astype("int64"), + dtype="int64", + columns=create_cols("int"), ) df_bool = DataFrame(True, index=df_float.index, columns=create_cols("bool")) df_object = DataFrame( @@ -765,7 +767,7 @@ def test_to_csv_dups_cols(self): tm.assert_frame_equal(result, df) df_float = DataFrame(np.random.randn(1000, 3), dtype="float64") - df_int = DataFrame(np.random.randn(1000, 3), dtype="int64") + df_int = DataFrame(np.random.randn(1000, 3)).astype("int64") df_bool = DataFrame(True, index=df_float.index, columns=range(3)) df_object = DataFrame("foo", index=df_float.index, columns=range(3)) df_dt = DataFrame(Timestamp("20010101"), index=df_float.index, columns=range(3)) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 784969c199c9f..6e0013c196760 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -10,6 +10,7 @@ import functools import itertools import re +import warnings import numpy as np import numpy.ma as ma @@ -999,7 +1000,17 @@ def test_constructor_maskedarray_nonfloat(self): assert isna(frame).values.all() # cast type - frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64) + msg = r"datetime64\[ns\] values and dtype=int64" + with tm.assert_produces_warning(FutureWarning, match=msg): + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + category=DeprecationWarning, + message="elementwise comparison failed", + ) + frame = DataFrame( + mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64 + ) assert frame.values.dtype == np.int64 # Check non-masked values @@ -2484,6 +2495,27 @@ def test_nested_list_columns(self): tm.assert_frame_equal(result, expected) +class TestDataFrameConstructorWithDtypeCoercion: + def test_floating_values_integer_dtype(self): + # GH#40110 make DataFrame behavior with arraylike floating data and + # inty dtype match Series behavior + + arr = np.random.randn(10, 5) + + msg = "if they cannot be cast losslessly" + with tm.assert_produces_warning(FutureWarning, match=msg): + DataFrame(arr, dtype="i8") + + with tm.assert_produces_warning(None): + # if they can be cast losslessly, no warning + DataFrame(arr.round(), dtype="i8") + + # with NaNs, we already have the correct behavior, so no warning + arr[0, 0] = np.nan + with tm.assert_produces_warning(None): + DataFrame(arr, dtype="i8") + + class TestDataFrameConstructorWithDatetimeTZ: @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) def test_construction_preserves_tzaware_dtypes(self, tz): diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index c9a39eb460cf4..d010426bee53e 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -294,7 +294,7 @@ def test_multi_dtype2(self): def test_dups_across_blocks(self, using_array_manager): # dups across blocks df_float = DataFrame(np.random.randn(10, 3), dtype="float64") - df_int = DataFrame(np.random.randn(10, 3), dtype="int64") + df_int = DataFrame(np.random.randn(10, 3).astype("int64")) df_bool = DataFrame(True, index=df_float.index, columns=df_float.columns) df_object = DataFrame("foo", index=df_float.index, columns=df_float.columns) df_dt = DataFrame( diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 6f4949267c00c..26f2ba577d184 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -134,7 +134,10 @@ def test_setitem_series_int8(self, val, exp_dtype, request): ) request.node.add_marker(mark) - exp = pd.Series([1, val, 3, 4], dtype=np.int8) + warn = None if exp_dtype is np.int8 else FutureWarning + msg = "Values are too large to be losslessly cast to int8" + with tm.assert_produces_warning(warn, match=msg): + exp = pd.Series([1, val, 3, 4], dtype=np.int8) self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) @pytest.mark.parametrize(