Skip to content

DEPR: DataFrame(floaty, dtype=inty) match Series #41770

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Jun 3, 2021
4 changes: 3 additions & 1 deletion asv_bench/benchmarks/frame_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -652,7 +652,9 @@ class Rank:
]

def setup(self, dtype):
self.df = DataFrame(np.random.randn(10000, 10), columns=range(10), dtype=dtype)
self.df = DataFrame(
np.random.randn(10000, 10).astype(dtype), columns=range(10), dtype=dtype
)

def time_rank(self, dtype):
self.df.rank()
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -700,6 +700,7 @@ Deprecations
- Deprecated passing arguments as positional in :meth:`DataFrame.reset_index` (other than ``"level"``) and :meth:`Series.reset_index` (:issue:`41485`)
- Deprecated construction of :class:`Series` or :class:`DataFrame` with ``DatetimeTZDtype`` data and ``datetime64[ns]`` dtype. Use ``Series(data).dt.tz_localize(None)`` instead (:issue:`41555`, :issue:`33401`)
- Deprecated behavior of :class:`Series` construction with large-integer values and small-integer dtype silently overflowing; use ``Series(data).astype(dtype)`` instead (:issue:`41734`)
- Deprecated behavior of :class:`DataFrame` construction with floating-point data and an integer dtype, which currently casts to the integer dtype even when the cast is lossy; in a future version the data will retain its floating dtype when it cannot be cast losslessly, matching :class:`Series` behavior (:issue:`41770`)
- Deprecated inference of ``timedelta64[ns]``, ``datetime64[ns]``, or ``DatetimeTZDtype`` dtypes in :class:`Series` construction when data containing strings is passed and no ``dtype`` is passed (:issue:`33558`)
- In a future version, constructing :class:`Series` or :class:`DataFrame` with ``datetime64[ns]`` data and ``DatetimeTZDtype`` will treat the data as wall-times instead of as UTC times (matching DatetimeIndex behavior). To treat the data as UTC times, use ``pd.Series(data).dt.tz_localize("UTC").dt.tz_convert(dtype.tz)`` or ``pd.Series(data.view("int64"), dtype=dtype)`` (:issue:`33401`)
- Deprecated passing arguments as positional in :meth:`DataFrame.set_axis` and :meth:`Series.set_axis` (other than ``"labels"``) (:issue:`41485`)
Expand Down
18 changes: 18 additions & 0 deletions pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
Dtype,
DtypeObj,
)
from pandas.errors import IntCastingNaNError

from pandas.core.dtypes.base import (
ExtensionDtype,
Expand Down Expand Up @@ -511,7 +512,24 @@ def sanitize_array(
# possibility of nan -> garbage
try:
subarr = _try_cast(data, dtype, copy, True)
except IntCastingNaNError:
subarr = np.array(data, copy=copy)
except ValueError:
if not raise_cast_failure:
# i.e. called via DataFrame constructor
warnings.warn(
"In a future version, passing float-dtype values and an "
"integer dtype to DataFrame will retain floating dtype "
"if they cannot be cast losslessly (matching Series behavior). "
"To retain the old behavior, use DataFrame(data).astype(dtype)",
FutureWarning,
stacklevel=4,
)
# GH#40110 until the deprecation is enforced, we _dont_
# ignore the dtype for DataFrame, and _do_ cast even though
# it is lossy.
dtype = cast(np.dtype, dtype)
return np.array(data, dtype=dtype, copy=copy)
subarr = np.array(data, copy=copy)
else:
# we will try to copy by-definition here
Expand Down
19 changes: 18 additions & 1 deletion pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -2088,7 +2088,13 @@ def maybe_cast_to_integer_array(
if is_unsigned_integer_dtype(dtype) and (arr < 0).any():
raise OverflowError("Trying to coerce negative values to unsigned integers")

if is_float_dtype(arr.dtype) or is_object_dtype(arr.dtype):
if is_float_dtype(arr.dtype):
if not np.isfinite(arr).all():
raise IntCastingNaNError(
"Cannot convert non-finite values (NA or inf) to integer"
)
raise ValueError("Trying to coerce float values to integers")
if is_object_dtype(arr.dtype):
raise ValueError("Trying to coerce float values to integers")

if casted.dtype < arr.dtype:
Expand All @@ -2102,6 +2108,17 @@ def maybe_cast_to_integer_array(
)
return casted

if arr.dtype.kind in ["m", "M"]:
# test_constructor_maskedarray_nonfloat
warnings.warn(
f"Constructing Series or DataFrame from {arr.dtype} values and "
f"dtype={dtype} is deprecated and will raise in a future version. "
"Use values.view(dtype) instead",
FutureWarning,
stacklevel=find_stack_level(),
)
return casted

# No known cases that get here, but raising explicitly to cover our bases.
raise ValueError(f"values cannot be losslessly cast to {dtype}")

Expand Down
24 changes: 6 additions & 18 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,9 @@
DtypeObj,
Manager,
)
from pandas.errors import IntCastingNaNError

from pandas.core.dtypes.cast import (
construct_1d_arraylike_from_scalar,
construct_1d_ndarray_preserving_na,
maybe_cast_to_datetime,
maybe_convert_platform,
maybe_infer_to_datetimelike,
Expand Down Expand Up @@ -303,22 +301,12 @@ def ndarray_to_mgr(
shape = values.shape
flat = values.ravel()

if not is_integer_dtype(dtype):
# TODO: skipping integer_dtype is needed to keep the tests passing,
# not clear it is correct
# Note: we really only need _try_cast, but keeping to exposed funcs
values = sanitize_array(
flat, None, dtype=dtype, copy=copy, raise_cast_failure=True
)
else:
try:
values = construct_1d_ndarray_preserving_na(
flat, dtype=dtype, copy=False
)
except IntCastingNaNError:
# following Series, we ignore the dtype and retain floating
# values instead of casting nans to meaningless ints
pass
# GH#40110 see similar check inside sanitize_array
rcf = not (is_integer_dtype(dtype) and values.dtype.kind == "f")

values = sanitize_array(
flat, None, dtype=dtype, copy=copy, raise_cast_failure=rcf
)

values = values.reshape(shape)

Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/frame/methods/test_sort_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -603,15 +603,15 @@ def test_sort_index_level_large_cardinality(self):

# GH#2684 (int64)
index = MultiIndex.from_arrays([np.arange(4000)] * 3)
df = DataFrame(np.random.randn(4000), index=index, dtype=np.int64)
df = DataFrame(np.random.randn(4000).astype("int64"), index=index)

# it works!
result = df.sort_index(level=0)
assert result.index._lexsort_depth == 3

# GH#2684 (int32)
index = MultiIndex.from_arrays([np.arange(4000)] * 3)
df = DataFrame(np.random.randn(4000), index=index, dtype=np.int32)
df = DataFrame(np.random.randn(4000).astype("int32"), index=index)

# it works!
result = df.sort_index(level=0)
Expand Down
6 changes: 4 additions & 2 deletions pandas/tests/frame/methods/test_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -714,7 +714,9 @@ def create_cols(name):
np.random.randn(100, 5), dtype="float64", columns=create_cols("float")
)
df_int = DataFrame(
np.random.randn(100, 5), dtype="int64", columns=create_cols("int")
np.random.randn(100, 5).astype("int64"),
dtype="int64",
columns=create_cols("int"),
)
df_bool = DataFrame(True, index=df_float.index, columns=create_cols("bool"))
df_object = DataFrame(
Expand Down Expand Up @@ -765,7 +767,7 @@ def test_to_csv_dups_cols(self):
tm.assert_frame_equal(result, df)

df_float = DataFrame(np.random.randn(1000, 3), dtype="float64")
df_int = DataFrame(np.random.randn(1000, 3), dtype="int64")
df_int = DataFrame(np.random.randn(1000, 3)).astype("int64")
df_bool = DataFrame(True, index=df_float.index, columns=range(3))
df_object = DataFrame("foo", index=df_float.index, columns=range(3))
df_dt = DataFrame(Timestamp("20010101"), index=df_float.index, columns=range(3))
Expand Down
34 changes: 33 additions & 1 deletion pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import functools
import itertools
import re
import warnings

import numpy as np
import numpy.ma as ma
Expand Down Expand Up @@ -999,7 +1000,17 @@ def test_constructor_maskedarray_nonfloat(self):
assert isna(frame).values.all()

# cast type
frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64)
msg = r"datetime64\[ns\] values and dtype=int64"
with tm.assert_produces_warning(FutureWarning, match=msg):
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
category=DeprecationWarning,
message="elementwise comparison failed",
)
frame = DataFrame(
mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64
)
assert frame.values.dtype == np.int64

# Check non-masked values
Expand Down Expand Up @@ -2484,6 +2495,27 @@ def test_nested_list_columns(self):
tm.assert_frame_equal(result, expected)


class TestDataFrameConstructorWithDtypeCoercion:
    """Constructor checks for floating input combined with an integer dtype."""

    def test_floating_values_integer_dtype(self):
        """Check when DataFrame(float_array, dtype="i8") emits a FutureWarning."""
        # GH#40110: DataFrame given array-like floating data and an integer
        #  dtype is being brought in line with what Series does.
        values = np.random.randn(10, 5)

        # Fractional values: the FutureWarning about lossy casting is expected.
        with tm.assert_produces_warning(
            FutureWarning, match="if they cannot be cast losslessly"
        ):
            DataFrame(values, dtype="i8")

        # Rounded values can be cast losslessly, so no warning is expected.
        with tm.assert_produces_warning(None):
            DataFrame(values.round(), dtype="i8")

        # With a NaN present the behavior is already the desired one, so the
        #  constructor is expected to stay silent here as well.
        values[0, 0] = np.nan
        with tm.assert_produces_warning(None):
            DataFrame(values, dtype="i8")


class TestDataFrameConstructorWithDatetimeTZ:
@pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"])
def test_construction_preserves_tzaware_dtypes(self, tz):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/test_nonunique_indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ def test_multi_dtype2(self):
def test_dups_across_blocks(self, using_array_manager):
# dups across blocks
df_float = DataFrame(np.random.randn(10, 3), dtype="float64")
df_int = DataFrame(np.random.randn(10, 3), dtype="int64")
df_int = DataFrame(np.random.randn(10, 3).astype("int64"))
df_bool = DataFrame(True, index=df_float.index, columns=df_float.columns)
df_object = DataFrame("foo", index=df_float.index, columns=df_float.columns)
df_dt = DataFrame(
Expand Down
5 changes: 4 additions & 1 deletion pandas/tests/indexing/test_coercion.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,10 @@ def test_setitem_series_int8(self, val, exp_dtype, request):
)
request.node.add_marker(mark)

exp = pd.Series([1, val, 3, 4], dtype=np.int8)
warn = None if exp_dtype is np.int8 else FutureWarning
msg = "Values are too large to be losslessly cast to int8"
with tm.assert_produces_warning(warn, match=msg):
exp = pd.Series([1, val, 3, 4], dtype=np.int8)
self._assert_setitem_series_conversion(obj, val, exp, exp_dtype)

@pytest.mark.parametrize(
Expand Down