Skip to content

ENH/API: preserve non-nano in to_datetime #50369

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Jan 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,7 @@ Other API changes
- Passing data with dtype of "timedelta64[s]", "timedelta64[ms]", or "timedelta64[us]" to :class:`TimedeltaIndex`, :class:`Series`, or :class:`DataFrame` constructors will now retain that dtype instead of casting to "timedelta64[ns]"; timedelta64 data with lower resolution will be cast to the lowest supported resolution "timedelta64[s]" (:issue:`49014`)
- Passing ``dtype`` of "timedelta64[s]", "timedelta64[ms]", or "timedelta64[us]" to :class:`TimedeltaIndex`, :class:`Series`, or :class:`DataFrame` constructors will now retain that dtype instead of casting to "timedelta64[ns]"; passing a dtype with lower resolution for :class:`Series` or :class:`DataFrame` will be cast to the lowest supported resolution "timedelta64[s]" (:issue:`49014`)
- Passing a ``np.datetime64`` object with non-nanosecond resolution to :class:`Timestamp` will retain the input resolution if it is "s", "ms", "us", or "ns"; otherwise it will be cast to the closest supported resolution (:issue:`49008`)
- Passing ``datetime64`` values with resolution other than nanosecond to :func:`to_datetime` will retain the input resolution if it is "s", "ms", "us", or "ns"; otherwise it will be cast to the closest supported resolution (:issue:`50369`)
- Passing a string in ISO-8601 format to :class:`Timestamp` will retain the resolution of the parsed input if it is "s", "ms", "us", or "ns"; otherwise it will be cast to the closest supported resolution (:issue:`49737`)
- The ``other`` argument in :meth:`DataFrame.mask` and :meth:`Series.mask` now defaults to ``no_default`` instead of ``np.nan`` consistent with :meth:`DataFrame.where` and :meth:`Series.where`. Entries will be filled with the corresponding NULL value (``np.nan`` for numpy dtypes, ``pd.NA`` for extension dtypes). (:issue:`49111`)
- Changed behavior of :meth:`Series.quantile` and :meth:`DataFrame.quantile` with :class:`SparseDtype` to retain sparse dtype (:issue:`49583`)
Expand Down
1 change: 1 addition & 0 deletions pandas/_libs/tslibs/np_datetime.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ cpdef cnp.ndarray astype_overflowsafe(
cnp.dtype dtype, # ndarray[datetime64[anyunit]]
bint copy=*,
bint round_ok=*,
bint is_coerce=*,
)
cdef int64_t get_conversion_factor(NPY_DATETIMEUNIT from_unit, NPY_DATETIMEUNIT to_unit) except? -1

Expand Down
1 change: 1 addition & 0 deletions pandas/_libs/tslibs/np_datetime.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ def astype_overflowsafe(
dtype: np.dtype,
copy: bool = ...,
round_ok: bool = ...,
is_coerce: bool = ...,
) -> np.ndarray: ...
def is_unitless(dtype: np.dtype) -> bool: ...
def compare_mismatched_resolutions(
Expand Down
9 changes: 6 additions & 3 deletions pandas/_libs/tslibs/np_datetime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,7 @@ cpdef ndarray astype_overflowsafe(
cnp.dtype dtype,
bint copy=True,
bint round_ok=True,
bint is_coerce=False,
):
"""
Convert an ndarray with datetime64[X] to datetime64[Y]
Expand Down Expand Up @@ -385,7 +386,9 @@ cpdef ndarray astype_overflowsafe(
try:
check_dts_bounds(&dts, to_unit)
except OutOfBoundsDatetime as err:
if is_td:
if is_coerce:
new_value = NPY_DATETIME_NAT
elif is_td:
from_abbrev = np.datetime_data(values.dtype)[0]
np_val = np.timedelta64(value, from_abbrev)
msg = (
Expand All @@ -395,8 +398,8 @@ cpdef ndarray astype_overflowsafe(
raise OutOfBoundsTimedelta(msg) from err
else:
raise

new_value = npy_datetimestruct_to_datetime(to_unit, &dts)
else:
new_value = npy_datetimestruct_to_datetime(to_unit, &dts)

# Analogous to: iresult[i] = new_value
(<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = new_value
Expand Down
9 changes: 7 additions & 2 deletions pandas/core/reshape/tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,7 @@ def _coerce_to_type(x):
if is_datetime64tz_dtype(x.dtype):
dtype = x.dtype
elif is_datetime64_dtype(x.dtype):
x = to_datetime(x)
x = to_datetime(x).astype("datetime64[ns]", copy=False)
dtype = np.dtype("datetime64[ns]")
elif is_timedelta64_dtype(x.dtype):
x = to_timedelta(x)
Expand Down Expand Up @@ -527,7 +527,12 @@ def _convert_bin_to_numeric_type(bins, dtype):
raise ValueError("bins must be of timedelta64 dtype")
elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
if bins_dtype in ["datetime", "datetime64"]:
bins = to_datetime(bins).view(np.int64)
bins = to_datetime(bins)
if is_datetime64_dtype(bins):
# As of 2.0, to_datetime may give non-nano, so we need to convert
# here until the rest of this file recognizes non-nano
bins = bins.astype("datetime64[ns]", copy=False)
Comment on lines +531 to +534
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any test that would hit the else branch? I think `is_datetime64_dtype(bins)` is now always True in the test suite.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No extant test, but I'm not sure it's impossible to reach.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps raise an AssertionError in the else block? Then it won't show up as uncovered.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not concerned about it showing as uncovered, and I don't want to break potentially-working code in the wild.

We will need to revisit this eventually to handle non-nano without the cast anyway.

bins = bins.view(np.int64)
else:
raise ValueError("bins must be of datetime64 dtype")

Expand Down
18 changes: 16 additions & 2 deletions pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@
Timedelta,
Timestamp,
astype_overflowsafe,
get_unit_from_dtype,
iNaT,
is_supported_unit,
nat_strings,
parsing,
timezones as libtimezones,
Expand All @@ -50,7 +52,6 @@
from pandas.core.dtypes.common import (
ensure_object,
is_datetime64_dtype,
is_datetime64_ns_dtype,
is_datetime64tz_dtype,
is_float,
is_integer,
Expand All @@ -68,6 +69,7 @@
from pandas.arrays import (
DatetimeArray,
IntegerArray,
PandasArray,
)
from pandas.core import algorithms
from pandas.core.algorithms import unique
Expand Down Expand Up @@ -384,6 +386,8 @@ def _convert_listlike_datetimes(
"""
if isinstance(arg, (list, tuple)):
arg = np.array(arg, dtype="O")
elif isinstance(arg, PandasArray):
arg = np.array(arg)

arg_dtype = getattr(arg, "dtype", None)
# these are shortcutable
Expand All @@ -395,7 +399,17 @@ def _convert_listlike_datetimes(
arg = arg.tz_convert(None).tz_localize("utc")
return arg

elif is_datetime64_ns_dtype(arg_dtype):
elif is_datetime64_dtype(arg_dtype):
arg_dtype = cast(np.dtype, arg_dtype)
if not is_supported_unit(get_unit_from_dtype(arg_dtype)):
# We go to closest supported reso, i.e. "s"
arg = astype_overflowsafe(
# TODO: looks like we incorrectly raise with errors=="ignore"
np.asarray(arg),
np.dtype("M8[s]"),
is_coerce=errors == "coerce",
)

if not isinstance(arg, (DatetimeArray, DatetimeIndex)):
return DatetimeIndex(arg, tz=tz, name=name)
elif utc:
Expand Down
40 changes: 19 additions & 21 deletions pandas/tests/tools/test_to_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -1027,23 +1027,28 @@ def test_to_datetime_dt64s_and_str(self, arg, format):
@pytest.mark.parametrize(
"dt", [np.datetime64("1000-01-01"), np.datetime64("5000-01-02")]
)
def test_to_datetime_dt64s_out_of_bounds(self, cache, dt):
msg = "^Out of bounds nanosecond timestamp: .*, at position 0$"
with pytest.raises(OutOfBoundsDatetime, match=msg):
to_datetime(dt, errors="raise")
@pytest.mark.parametrize("errors", ["raise", "ignore", "coerce"])
def test_to_datetime_dt64s_out_of_ns_bounds(self, cache, dt, errors):
# GH#50369 We cast to the nearest supported reso, i.e. "s"
ts = to_datetime(dt, errors=errors, cache=cache)
assert isinstance(ts, Timestamp)
assert ts.unit == "s"
assert ts.asm8 == dt

# TODO(2.0): The Timestamp and to_datetime behaviors should match;
# as of 2022-09-28, the Timestamp constructor has been updated
# to cast to M8[s] but to_datetime has not
ts = Timestamp(dt)
assert ts.unit == "s"
assert ts.asm8 == dt

def test_to_datetime_dt64d_out_of_bounds(self, cache):
dt64 = np.datetime64(np.iinfo(np.int64).max, "D")

msg = "Out of bounds nanosecond timestamp"
with pytest.raises(OutOfBoundsDatetime, match=msg):
Timestamp(np.datetime64(np.iinfo(np.int64).max, "D"))
Timestamp(dt64)
with pytest.raises(OutOfBoundsDatetime, match=msg):
to_datetime(dt64, errors="raise", cache=cache)

assert to_datetime(dt, errors="coerce", cache=cache) is NaT
assert to_datetime(dt64, errors="coerce", cache=cache) is NaT

@pytest.mark.parametrize("unit", ["s", "D"])
def test_to_datetime_array_of_dt64s(self, cache, unit):
Expand Down Expand Up @@ -2516,23 +2521,16 @@ def test_string_na_nat_conversion_with_name(self, cache):
assert dresult.name == "foo"

@pytest.mark.parametrize(
"dtype",
[
"datetime64[h]",
"datetime64[m]",
"datetime64[s]",
"datetime64[ms]",
"datetime64[us]",
"datetime64[ns]",
],
"unit",
["h", "m", "s", "ms", "us", "ns"],
)
def test_dti_constructor_numpy_timeunits(self, cache, dtype):
def test_dti_constructor_numpy_timeunits(self, cache, unit):
# GH 9114
dtype = np.dtype(f"M8[{unit}]")
base = to_datetime(["2000-01-01T00:00", "2000-01-02T00:00", "NaT"], cache=cache)

values = base.values.astype(dtype)

unit = dtype.split("[")[-1][:-1]
if unit in ["h", "m"]:
# we cast to closest supported unit
unit = "s"
Expand All @@ -2541,7 +2539,7 @@ def test_dti_constructor_numpy_timeunits(self, cache, dtype):
assert expected.dtype == exp_dtype

tm.assert_index_equal(DatetimeIndex(values), expected)
tm.assert_index_equal(to_datetime(values, cache=cache), base)
tm.assert_index_equal(to_datetime(values, cache=cache), expected)

def test_dayfirst(self, cache):
# GH 5917
Expand Down