Skip to content

Commit 03c1014

Browse files
Use resolution-dependent default units for lazy time encoding (#10017)
When lazily encoding non-nanosecond times, the appropriate optimal integer encoding units are resolution-dependent. This PR updates our encoding pipeline accordingly. Note that due to our internal reliance on pandas for date string parsing, we are still not able to round trip times outside the range -9999-01-01 to 9999-12-31 with pandas / NumPy, but this at least should pick more natural default units than nanoseconds for chunked arrays of non-nanosecond precision times. This gives users another way of addressing #9154 (i.e. use non-nanosecond time arrays).
1 parent 160cced commit 03c1014

File tree

3 files changed

+30
-10
lines changed

3 files changed

+30
-10
lines changed

doc/whats-new.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,12 @@ Deprecations
3333

3434
Bug fixes
3535
~~~~~~~~~
36-41+
- Default to resolution-dependent optimal integer encoding units when saving
  chunked non-nanosecond :py:class:`numpy.datetime64` or
  :py:class:`numpy.timedelta64` arrays to disk. Previously units of
  "nanoseconds" were chosen by default, which are optimal for
  nanosecond-resolution times, but not for times with coarser resolution. By
  `Spencer Clark <https://github.com/spencerkclark>`_ (:pull:`10017`).
3642

3743

3844
Documentation

xarray/coding/times.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,12 @@ def _numpy_to_netcdf_timeunit(units: NPDatetimeUnitOptions) -> str:
153153
}[units]
154154

155155

156+
def _numpy_dtype_to_netcdf_timeunit(dtype: np.dtype) -> str:
157+
unit, _ = np.datetime_data(dtype)
158+
unit = cast(NPDatetimeUnitOptions, unit)
159+
return _numpy_to_netcdf_timeunit(unit)
160+
161+
156162
def _ensure_padded_year(ref_date: str) -> str:
157163
# Reference dates without a padded year (e.g. since 1-1-1 or since 2-3-4)
158164
# are ambiguous (is it YMD or DMY?). This can lead to some very odd
@@ -1143,7 +1149,8 @@ def _lazily_encode_cf_datetime(
11431149
units = "microseconds since 1970-01-01"
11441150
dtype = np.dtype("int64")
11451151
else:
1146-
units = "nanoseconds since 1970-01-01"
1152+
netcdf_unit = _numpy_dtype_to_netcdf_timeunit(dates.dtype)
1153+
units = f"{netcdf_unit} since 1970-01-01"
11471154
dtype = np.dtype("int64")
11481155

11491156
if units is None or dtype is None:
@@ -1249,7 +1256,7 @@ def _lazily_encode_cf_timedelta(
12491256
timedeltas: T_ChunkedArray, units: str | None = None, dtype: np.dtype | None = None
12501257
) -> tuple[T_ChunkedArray, str]:
12511258
if units is None and dtype is None:
1252-
units = "nanoseconds"
1259+
units = _numpy_dtype_to_netcdf_timeunit(timedeltas.dtype)
12531260
dtype = np.dtype("int64")
12541261

12551262
if units is None or dtype is None:

xarray/tests/test_coding_times.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1620,10 +1620,10 @@ def test_roundtrip_float_times(fill_value, times, units, encoded_values) -> None
16201620
_ENCODE_DATETIME64_VIA_DASK_TESTS.values(),
16211621
ids=_ENCODE_DATETIME64_VIA_DASK_TESTS.keys(),
16221622
)
1623-
def test_encode_cf_datetime_datetime64_via_dask(freq, units, dtype) -> None:
1623+
def test_encode_cf_datetime_datetime64_via_dask(freq, units, dtype, time_unit) -> None:
16241624
import dask.array
16251625

1626-
times_pd = pd.date_range(start="1700", freq=freq, periods=3)
1626+
times_pd = pd.date_range(start="1700", freq=freq, periods=3, unit=time_unit)
16271627
times = dask.array.from_array(times_pd, chunks=1)
16281628
encoded_times, encoding_units, encoding_calendar = encode_cf_datetime(
16291629
times, units, None, dtype
@@ -1636,13 +1636,17 @@ def test_encode_cf_datetime_datetime64_via_dask(freq, units, dtype) -> None:
16361636
assert encoding_units == units
16371637
assert encoded_times.dtype == dtype
16381638
else:
1639-
assert encoding_units == "nanoseconds since 1970-01-01"
1639+
expected_netcdf_time_unit = _numpy_to_netcdf_timeunit(time_unit)
1640+
assert encoding_units == f"{expected_netcdf_time_unit} since 1970-01-01"
16401641
assert encoded_times.dtype == np.dtype("int64")
16411642

16421643
assert encoding_calendar == "proleptic_gregorian"
16431644

1644-
decoded_times = decode_cf_datetime(encoded_times, encoding_units, encoding_calendar)
1645+
decoded_times = decode_cf_datetime(
1646+
encoded_times, encoding_units, encoding_calendar, time_unit=time_unit
1647+
)
16451648
np.testing.assert_equal(decoded_times, times)
1649+
assert decoded_times.dtype == times.dtype
16461650

16471651

16481652
@requires_dask
@@ -1749,11 +1753,11 @@ def test_encode_cf_datetime_casting_overflow_error(use_cftime, use_dask, dtype)
17491753
("units", "dtype"), [("days", np.dtype("int32")), (None, None)]
17501754
)
17511755
def test_encode_cf_timedelta_via_dask(
1752-
units: str | None, dtype: np.dtype | None
1756+
units: str | None, dtype: np.dtype | None, time_unit: PDDatetimeUnitOptions
17531757
) -> None:
17541758
import dask.array
17551759

1756-
times_pd = pd.timedelta_range(start="0D", freq="D", periods=3)
1760+
times_pd = pd.timedelta_range(start="0D", freq="D", periods=3, unit=time_unit) # type: ignore[call-arg]
17571761
times = dask.array.from_array(times_pd, chunks=1)
17581762
encoded_times, encoding_units = encode_cf_timedelta(times, units, dtype)
17591763

@@ -1764,11 +1768,14 @@ def test_encode_cf_timedelta_via_dask(
17641768
assert encoding_units == units
17651769
assert encoded_times.dtype == dtype
17661770
else:
1767-
assert encoding_units == "nanoseconds"
1771+
assert encoding_units == _numpy_to_netcdf_timeunit(time_unit)
17681772
assert encoded_times.dtype == np.dtype("int64")
17691773

1770-
decoded_times = decode_cf_timedelta(encoded_times, encoding_units)
1774+
decoded_times = decode_cf_timedelta(
1775+
encoded_times, encoding_units, time_unit=time_unit
1776+
)
17711777
np.testing.assert_equal(decoded_times, times)
1778+
assert decoded_times.dtype == times.dtype
17721779

17731780

17741781
@pytest.mark.parametrize("use_dask", [False, pytest.param(True, marks=requires_dask)])

0 commit comments

Comments
 (0)