Skip to content

Commit e28f171

Browse files
kmuehlbauer, spencerkclark, dcherian
authored
fix mean for datetime-like using the respective time resolution unit (#9977)
* fix mean for datetime-like by using the respective dtype time resolution unit, adapting tests * fix mypy * add PR to existing entry for non-nanosecond datetimes * Update xarray/core/duck_array_ops.py Co-authored-by: Spencer Clark <[email protected]> * cast to "int64" in calculation of datetime-like mean * Apply suggestions from code review * Apply suggestions from code review Co-authored-by: Spencer Clark <[email protected]> --------- Co-authored-by: Spencer Clark <[email protected]> Co-authored-by: Deepak Cherian <[email protected]>
1 parent e432479 commit e28f171

File tree

3 files changed

+51
-38
lines changed

3 files changed

+51
-38
lines changed

doc/whats-new.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ eventually be deprecated.
5050

5151
New Features
5252
~~~~~~~~~~~~
53-
- Relax nanosecond datetime restriction in CF time decoding (:issue:`7493`, :pull:`9618`).
53+
- Relax nanosecond datetime restriction in CF time decoding (:issue:`7493`, :pull:`9618`, :pull:`9977`).
5454
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_ and `Spencer Clark <https://github.com/spencerkclark>`_.
5555
- Enable the ``compute=False`` option in :py:meth:`DataTree.to_zarr`. (:pull:`9958`).
5656
By `Sam Levang <https://github.com/slevang>`_.

xarray/core/duck_array_ops.py

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -662,16 +662,10 @@ def _to_pytimedelta(array, unit="us"):
662662

663663

664664
def np_timedelta64_to_float(array, datetime_unit):
665-
"""Convert numpy.timedelta64 to float.
666-
667-
Notes
668-
-----
669-
The array is first converted to microseconds, which is less likely to
670-
cause overflow errors.
671-
"""
672-
array = array.astype("timedelta64[ns]").astype(np.float64)
673-
conversion_factor = np.timedelta64(1, "ns") / np.timedelta64(1, datetime_unit)
674-
return conversion_factor * array
665+
"""Convert numpy.timedelta64 to float, possibly at a loss of resolution."""
666+
unit, _ = np.datetime_data(array.dtype)
667+
conversion_factor = np.timedelta64(1, unit) / np.timedelta64(1, datetime_unit)
668+
return conversion_factor * array.astype(np.float64)
675669

676670

677671
def pd_timedelta_to_float(value, datetime_unit):
@@ -715,12 +709,15 @@ def mean(array, axis=None, skipna=None, **kwargs):
715709
if dtypes.is_datetime_like(array.dtype):
716710
offset = _datetime_nanmin(array)
717711

718-
# xarray always uses np.datetime64[ns] for np.datetime64 data
719-
dtype = "timedelta64[ns]"
712+
# From version 2025.01.2 xarray uses np.datetime64[unit], where unit
713+
# is one of "s", "ms", "us", "ns".
714+
# To not have to worry about the resolution, we just convert the output
715+
# to "timedelta64" (without unit) and let the dtype of offset take precedence.
716+
# This is fully backwards compatible with datetime64[ns].
720717
return (
721718
_mean(
722719
datetime_to_numeric(array, offset), axis=axis, skipna=skipna, **kwargs
723-
).astype(dtype)
720+
).astype("timedelta64")
724721
+ offset
725722
)
726723
elif _contains_cftime_datetimes(array):

xarray/tests/test_duck_array_ops.py

Lines changed: 40 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from numpy import array, nan
1010

1111
from xarray import DataArray, Dataset, cftime_range, concat
12+
from xarray.coding.times import _NS_PER_TIME_DELTA
1213
from xarray.core import dtypes, duck_array_ops
1314
from xarray.core.duck_array_ops import (
1415
array_notnull_equiv,
@@ -28,6 +29,7 @@
2829
where,
2930
)
3031
from xarray.core.extension_array import PandasExtensionArray
32+
from xarray.core.types import NPDatetimeUnitOptions, PDDatetimeUnitOptions
3133
from xarray.namedarray.pycompat import array_type
3234
from xarray.testing import assert_allclose, assert_equal, assert_identical
3335
from xarray.tests import (
@@ -411,10 +413,11 @@ def assert_dask_array(da, dask):
411413
@arm_xfail
412414
@pytest.mark.filterwarnings("ignore:All-NaN .* encountered:RuntimeWarning")
413415
@pytest.mark.parametrize("dask", [False, True] if has_dask else [False])
414-
def test_datetime_mean(dask: bool) -> None:
416+
def test_datetime_mean(dask: bool, time_unit: PDDatetimeUnitOptions) -> None:
415417
# Note: only testing numpy, as dask is broken upstream
418+
dtype = f"M8[{time_unit}]"
416419
da = DataArray(
417-
np.array(["2010-01-01", "NaT", "2010-01-03", "NaT", "NaT"], dtype="M8[ns]"),
420+
np.array(["2010-01-01", "NaT", "2010-01-03", "NaT", "NaT"], dtype=dtype),
418421
dims=["time"],
419422
)
420423
if dask:
@@ -846,11 +849,11 @@ def test_multiple_dims(dtype, dask, skipna, func):
846849

847850

848851
@pytest.mark.parametrize("dask", [True, False])
849-
def test_datetime_to_numeric_datetime64(dask):
852+
def test_datetime_to_numeric_datetime64(dask, time_unit: PDDatetimeUnitOptions):
850853
if dask and not has_dask:
851854
pytest.skip("requires dask")
852855

853-
times = pd.date_range("2000", periods=5, freq="7D").values
856+
times = pd.date_range("2000", periods=5, freq="7D").as_unit(time_unit).values
854857
if dask:
855858
import dask.array
856859

@@ -874,8 +877,8 @@ def test_datetime_to_numeric_datetime64(dask):
874877
result = duck_array_ops.datetime_to_numeric(
875878
times, datetime_unit="h", dtype=dtype
876879
)
877-
expected = 24 * np.arange(0, 35, 7).astype(dtype)
878-
np.testing.assert_array_equal(result, expected)
880+
expected2 = 24 * np.arange(0, 35, 7).astype(dtype)
881+
np.testing.assert_array_equal(result, expected2)
879882

880883

881884
@requires_cftime
@@ -923,15 +926,18 @@ def test_datetime_to_numeric_cftime(dask):
923926

924927

925928
@requires_cftime
926-
def test_datetime_to_numeric_potential_overflow():
929+
def test_datetime_to_numeric_potential_overflow(time_unit: PDDatetimeUnitOptions):
927930
import cftime
928931

929-
times = pd.date_range("2000", periods=5, freq="7D").values.astype("datetime64[us]")
932+
if time_unit == "ns":
933+
pytest.skip("out-of-bounds datetime64 overflow")
934+
dtype = f"M8[{time_unit}]"
935+
times = pd.date_range("2000", periods=5, freq="7D").values.astype(dtype)
930936
cftimes = cftime_range(
931937
"2000", periods=5, freq="7D", calendar="proleptic_gregorian"
932938
).values
933939

934-
offset = np.datetime64("0001-01-01")
940+
offset = np.datetime64("0001-01-01", time_unit)
935941
cfoffset = cftime.DatetimeProlepticGregorian(1, 1, 1)
936942

937943
result = duck_array_ops.datetime_to_numeric(
@@ -957,35 +963,45 @@ def test_py_timedelta_to_float():
957963
assert py_timedelta_to_float(dt.timedelta(days=1e6), "D") == 1e6
958964

959965

960-
@pytest.mark.parametrize(
961-
"td, expected",
962-
([np.timedelta64(1, "D"), 86400 * 1e9], [np.timedelta64(1, "ns"), 1.0]),
963-
)
964-
def test_np_timedelta64_to_float(td, expected):
965-
out = np_timedelta64_to_float(td, datetime_unit="ns")
966+
@pytest.mark.parametrize("np_dt_unit", ["D", "h", "m", "s", "ms", "us", "ns"])
967+
def test_np_timedelta64_to_float(
968+
np_dt_unit: NPDatetimeUnitOptions, time_unit: PDDatetimeUnitOptions
969+
):
970+
# tests any combination of source np.timedelta64 (NPDatetimeUnitOptions) with
971+
# np_timedelta_to_float with dedicated target unit (PDDatetimeUnitOptions)
972+
td = np.timedelta64(1, np_dt_unit)
973+
expected = _NS_PER_TIME_DELTA[np_dt_unit] / _NS_PER_TIME_DELTA[time_unit]
974+
975+
out = np_timedelta64_to_float(td, datetime_unit=time_unit)
966976
np.testing.assert_allclose(out, expected)
967977
assert isinstance(out, float)
968978

969-
out = np_timedelta64_to_float(np.atleast_1d(td), datetime_unit="ns")
979+
out = np_timedelta64_to_float(np.atleast_1d(td), datetime_unit=time_unit)
970980
np.testing.assert_allclose(out, expected)
971981

972982

973-
@pytest.mark.parametrize(
974-
"td, expected", ([pd.Timedelta(1, "D"), 86400 * 1e9], [pd.Timedelta(1, "ns"), 1.0])
975-
)
976-
def test_pd_timedelta_to_float(td, expected):
977-
out = pd_timedelta_to_float(td, datetime_unit="ns")
983+
@pytest.mark.parametrize("np_dt_unit", ["D", "h", "m", "s", "ms", "us", "ns"])
984+
def test_pd_timedelta_to_float(
985+
np_dt_unit: NPDatetimeUnitOptions, time_unit: PDDatetimeUnitOptions
986+
):
987+
# tests any combination of source pd.Timedelta (NPDatetimeUnitOptions) with
988+
# np_timedelta_to_float with dedicated target unit (PDDatetimeUnitOptions)
989+
td = pd.Timedelta(1, np_dt_unit)
990+
expected = _NS_PER_TIME_DELTA[np_dt_unit] / _NS_PER_TIME_DELTA[time_unit]
991+
992+
out = pd_timedelta_to_float(td, datetime_unit=time_unit)
978993
np.testing.assert_allclose(out, expected)
979994
assert isinstance(out, float)
980995

981996

982997
@pytest.mark.parametrize(
983998
"td", [dt.timedelta(days=1), np.timedelta64(1, "D"), pd.Timedelta(1, "D"), "1 day"]
984999
)
985-
def test_timedelta_to_numeric(td):
1000+
def test_timedelta_to_numeric(td, time_unit: PDDatetimeUnitOptions):
9861001
# Scalar input
987-
out = timedelta_to_numeric(td, "ns")
988-
np.testing.assert_allclose(out, 86400 * 1e9)
1002+
out = timedelta_to_numeric(td, time_unit)
1003+
expected = _NS_PER_TIME_DELTA["D"] / _NS_PER_TIME_DELTA[time_unit]
1004+
np.testing.assert_allclose(out, expected)
9891005
assert isinstance(out, float)
9901006

9911007

0 commit comments

Comments
 (0)