Skip to content

Commit 7087ca4

Browse files
keewis, kmuehlbauer, dcherian
authored
numpy 2 compatibility in the netcdf4 and h5netcdf backends (#9136)
* don't remove `netcdf4` from the upstream-dev environment * also stop removing `h5py` and `hdf5` * hard-code the precision (I believe this was missed in #9081) * don't remove `h5py` either * use on-disk _FillValue as standard expects, use view instead of cast to prevent OverflowError. * whats-new * unpin `numpy` * rework UnsignedCoder * add test * Update xarray/coding/variables.py Co-authored-by: Justus Magin <[email protected]> --------- Co-authored-by: Kai Mühlbauer <[email protected]> Co-authored-by: Kai Mühlbauer <[email protected]> Co-authored-by: Deepak Cherian <[email protected]>
1 parent ff15a08 commit 7087ca4

File tree

7 files changed

+49
-15
lines changed

7 files changed

+49
-15
lines changed

ci/install-upstream-wheels.sh

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ $conda remove -y numba numbagg sparse
1313
# temporarily remove numexpr
1414
$conda remove -y numexpr
1515
# temporarily remove backends
16-
$conda remove -y cf_units hdf5 h5py netcdf4 pydap
16+
$conda remove -y cf_units pydap
1717
# forcibly remove packages to avoid artifacts
1818
$conda remove -y --force \
1919
numpy \
@@ -37,8 +37,7 @@ python -m pip install \
3737
numpy \
3838
scipy \
3939
matplotlib \
40-
pandas \
41-
h5py
40+
pandas
4241
# for some reason pandas depends on pyarrow already.
4342
# Remove once a `pyarrow` version compiled with `numpy>=2.0` is on `conda-forge`
4443
python -m pip install \

ci/requirements/all-but-dask.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ dependencies:
2222
- netcdf4
2323
- numba
2424
- numbagg
25-
- numpy<2
25+
- numpy
2626
- packaging
2727
- pandas
2828
- pint>=0.22

ci/requirements/environment-windows.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ dependencies:
2323
- netcdf4
2424
- numba
2525
- numbagg
26-
- numpy<2
26+
- numpy
2727
- packaging
2828
- pandas
2929
# - pint>=0.22

ci/requirements/environment.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ dependencies:
2626
- numba
2727
- numbagg
2828
- numexpr
29-
- numpy<2
29+
- numpy
3030
- opt_einsum
3131
- packaging
3232
- pandas

doc/whats-new.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ Bug fixes
4545
By `Pontus Lurcock <https://github.com/pont-us>`_.
4646
- Allow diffing objects with array attributes on variables (:issue:`9153`, :pull:`9169`).
4747
By `Justus Magin <https://github.com/keewis>`_.
48+
- ``numpy>=2`` compatibility in the ``netcdf4`` backend (:pull:`9136`).
49+
By `Justus Magin <https://github.com/keewis>`_ and `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
4850
- Promote floating-point numeric datetimes before decoding (:issue:`9179`, :pull:`9182`).
4951
By `Justus Magin <https://github.com/keewis>`_.
5052
- Address regression introduced in :pull:`9002` that prevented objects returned
@@ -67,7 +69,7 @@ Documentation
6769
- Adds a flow-chart diagram to help users navigate help resources (`Discussion #8990 <https://github.com/pydata/xarray/discussions/8990>`_).
6870
By `Jessica Scheick <https://github.com/jessicas11>`_.
6971
- Improvements to Zarr & chunking docs (:pull:`9139`, :pull:`9140`, :pull:`9132`)
70-
By `Maximilian Roos <https://github.com/max-sixty>`_
72+
By `Maximilian Roos <https://github.com/max-sixty>`_.
7173

7274

7375
Internal Changes

xarray/coding/variables.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -516,10 +516,13 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
516516
dims, data, attrs, encoding = unpack_for_encoding(variable)
517517

518518
pop_to(encoding, attrs, "_Unsigned")
519-
signed_dtype = np.dtype(f"i{data.dtype.itemsize}")
519+
# we need the on-disk type here
520+
# try to get it from encoding; fall back to an int with the same precision as data.dtype if it is not available
521+
signed_dtype = np.dtype(encoding.get("dtype", f"i{data.dtype.itemsize}"))
520522
if "_FillValue" in attrs:
521-
new_fill = signed_dtype.type(attrs["_FillValue"])
522-
attrs["_FillValue"] = new_fill
523+
new_fill = np.array(attrs["_FillValue"])
524+
# use view here to prevent OverflowError
525+
attrs["_FillValue"] = new_fill.view(signed_dtype).item()
523526
data = duck_array_ops.astype(duck_array_ops.around(data), signed_dtype)
524527

525528
return Variable(dims, data, attrs, encoding, fastpath=True)
@@ -535,10 +538,11 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
535538
if unsigned == "true":
536539
unsigned_dtype = np.dtype(f"u{data.dtype.itemsize}")
537540
transform = partial(np.asarray, dtype=unsigned_dtype)
538-
data = lazy_elemwise_func(data, transform, unsigned_dtype)
539541
if "_FillValue" in attrs:
540-
new_fill = unsigned_dtype.type(attrs["_FillValue"])
541-
attrs["_FillValue"] = new_fill
542+
new_fill = np.array(attrs["_FillValue"], dtype=data.dtype)
543+
# use view here to prevent OverflowError
544+
attrs["_FillValue"] = new_fill.view(unsigned_dtype).item()
545+
data = lazy_elemwise_func(data, transform, unsigned_dtype)
542546
elif data.dtype.kind == "u":
543547
if unsigned == "false":
544548
signed_dtype = np.dtype(f"i{data.dtype.itemsize}")

xarray/tests/test_backends.py

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ def create_encoded_masked_and_scaled_data(dtype: np.dtype) -> Dataset:
166166

167167
def create_unsigned_masked_scaled_data(dtype: np.dtype) -> Dataset:
168168
encoding = {
169-
"_FillValue": 255,
169+
"_FillValue": np.int8(-1),
170170
"_Unsigned": "true",
171171
"dtype": "i1",
172172
"add_offset": dtype.type(10),
@@ -925,6 +925,35 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None:
925925
assert decoded.variables[k].dtype == actual.variables[k].dtype
926926
assert_allclose(decoded, actual, decode_bytes=False)
927927

928+
@pytest.mark.parametrize("fillvalue", [np.int8(-1), np.uint8(255)])
929+
def test_roundtrip_unsigned(self, fillvalue):
930+
# regression/numpy2 test for the UnsignedCoder roundtrip (#9136)
931+
encoding = {
932+
"_FillValue": fillvalue,
933+
"_Unsigned": "true",
934+
"dtype": "i1",
935+
}
936+
x = np.array([0, 1, 127, 128, 254, np.nan], dtype=np.float32)
937+
decoded = Dataset({"x": ("t", x, {}, encoding)})
938+
939+
attributes = {
940+
"_FillValue": fillvalue,
941+
"_Unsigned": "true",
942+
}
943+
# Create signed data corresponding to [0, 1, 127, 128, 254, 255] unsigned
944+
sb = np.asarray([0, 1, 127, -128, -2, -1], dtype="i1")
945+
encoded = Dataset({"x": ("t", sb, attributes)})
946+
947+
with self.roundtrip(decoded) as actual:
948+
for k in decoded.variables:
949+
assert decoded.variables[k].dtype == actual.variables[k].dtype
950+
assert_allclose(decoded, actual, decode_bytes=False)
951+
952+
with self.roundtrip(decoded, open_kwargs=dict(decode_cf=False)) as actual:
953+
for k in encoded.variables:
954+
assert encoded.variables[k].dtype == actual.variables[k].dtype
955+
assert_allclose(encoded, actual, decode_bytes=False)
956+
928957
@staticmethod
929958
def _create_cf_dataset():
930959
original = Dataset(
@@ -4285,7 +4314,7 @@ def test_roundtrip_coordinates_with_space(self) -> None:
42854314
def test_roundtrip_numpy_datetime_data(self) -> None:
42864315
# Override method in DatasetIOBase - remove not applicable
42874316
# save_kwargs
4288-
times = pd.to_datetime(["2000-01-01", "2000-01-02", "NaT"])
4317+
times = pd.to_datetime(["2000-01-01", "2000-01-02", "NaT"], unit="ns")
42894318
expected = Dataset({"t": ("t", times), "t0": times[0]})
42904319
with self.roundtrip(expected) as actual:
42914320
assert_identical(expected, actual)

0 commit comments

Comments
 (0)