Skip to content

Commit 1c5adc9

Browse files
authored
Support overriding existing variables in to_zarr() without appending (#4029)
* Support overriding existing variables in to_zarr() without appending. This should be useful for cases where users want to update values in existing Zarr datasets. * Update docstring for to_zarr
1 parent 1b3c768 commit 1c5adc9

File tree

5 files changed

+79
-69
lines changed

5 files changed

+79
-69
lines changed

doc/whats-new.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,9 @@ New Features
4949
By `Todd Jennings <https://github.com/toddrjen>`_
5050
- More support for unit aware arrays with pint (:pull:`3643`)
5151
By `Justus Magin <https://github.com/keewis>`_.
52-
52+
- Support overriding existing variables in ``to_zarr()`` with ``mode='a'`` even
53+
without ``append_dim``, as long as dimension sizes do not change.
54+
By `Stephan Hoyer <https://github.com/shoyer>`_.
5355
- Allow plotting of boolean arrays. (:pull:`3766`)
5456
By `Marek Jacob <https://github.com/MeraX>`_
5557
- A ``days_in_month`` accessor for :py:class:`xarray.CFTimeIndex`, analogous to

xarray/backends/api.py

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1279,18 +1279,35 @@ def _validate_append_dim_and_encoding(
12791279
return
12801280
if append_dim:
12811281
if append_dim not in ds.dims:
1282-
raise ValueError(f"{append_dim} not a valid dimension in the Dataset")
1283-
for data_var in ds_to_append:
1284-
if data_var in ds:
1285-
if append_dim is None:
1282+
raise ValueError(
1283+
f"append_dim={append_dim!r} does not match any existing "
1284+
f"dataset dimensions {ds.dims}"
1285+
)
1286+
for var_name in ds_to_append:
1287+
if var_name in ds:
1288+
if ds_to_append[var_name].dims != ds[var_name].dims:
1289+
raise ValueError(
1290+
f"variable {var_name!r} already exists with different "
1291+
f"dimension names {ds[var_name].dims} != "
1292+
f"{ds_to_append[var_name].dims}, but changing variable "
1293+
"dimensions is not supported by to_zarr()."
1294+
)
1295+
existing_sizes = {
1296+
k: v for k, v in ds[var_name].sizes.items() if k != append_dim
1297+
}
1298+
new_sizes = {
1299+
k: v for k, v in ds_to_append[var_name].sizes.items() if k != append_dim
1300+
}
1301+
if existing_sizes != new_sizes:
12861302
raise ValueError(
1287-
"variable '{}' already exists, but append_dim "
1288-
"was not set".format(data_var)
1303+
f"variable {var_name!r} already exists with different "
1304+
f"dimension sizes: {existing_sizes} != {new_sizes}. "
1305+
"to_zarr() only supports changing dimension sizes when "
1306+
f"explicitly appending, but append_dim={append_dim!r}."
12891307
)
1290-
if data_var in encoding.keys():
1308+
if var_name in encoding.keys():
12911309
raise ValueError(
1292-
"variable '{}' already exists, but encoding was"
1293-
"provided".format(data_var)
1310+
f"variable {var_name!r} already exists, but encoding was provided"
12941311
)
12951312

12961313

xarray/backends/zarr.py

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -445,18 +445,23 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No
445445
fill_value = attrs.pop("_FillValue", None)
446446
if v.encoding == {"_FillValue": None} and fill_value is None:
447447
v.encoding = {}
448-
if name in self.ds:
448+
449+
if self.append_dim is not None and self.append_dim in dims:
450+
# resize existing variable
449451
zarr_array = self.ds[name]
450-
if self.append_dim in dims:
451-
# this is the DataArray that has append_dim as a
452-
# dimension
453-
append_axis = dims.index(self.append_dim)
454-
new_shape = list(zarr_array.shape)
455-
new_shape[append_axis] += v.shape[append_axis]
456-
new_region = [slice(None)] * len(new_shape)
457-
new_region[append_axis] = slice(zarr_array.shape[append_axis], None)
458-
zarr_array.resize(new_shape)
459-
writer.add(v.data, zarr_array, region=tuple(new_region))
452+
append_axis = dims.index(self.append_dim)
453+
454+
new_region = [slice(None)] * len(dims)
455+
new_region[append_axis] = slice(zarr_array.shape[append_axis], None)
456+
region = tuple(new_region)
457+
458+
new_shape = list(zarr_array.shape)
459+
new_shape[append_axis] += v.shape[append_axis]
460+
zarr_array.resize(new_shape)
461+
elif name in self.ds:
462+
# override existing variable
463+
zarr_array = self.ds[name]
464+
region = None
460465
else:
461466
# new variable
462467
encoding = extract_zarr_variable_encoding(
@@ -474,7 +479,9 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No
474479
name, shape=shape, dtype=dtype, fill_value=fill_value, **encoding
475480
)
476481
zarr_array.attrs.put(encoded_attrs)
477-
writer.add(v.data, zarr_array)
482+
region = None
483+
484+
writer.add(v.data, zarr_array, region=region)
478485

479486
def close(self):
480487
if self._consolidate_on_close:

xarray/core/dataset.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1579,7 +1579,7 @@ def to_zarr(
15791579
mode : {'w', 'w-', 'a', None}
15801580
Persistence mode: 'w' means create (overwrite if exists);
15811581
'w-' means create (fail if exists);
1582-
'a' means append (create if does not exist).
1582+
'a' means override existing variables (create if does not exist).
15831583
If ``append_dim`` is set, ``mode`` can be omitted as it is
15841584
internally set to ``'a'``. Otherwise, ``mode`` will default to
15851585
`w-` if not set.
@@ -1598,7 +1598,8 @@ def to_zarr(
15981598
If True, apply zarr's `consolidate_metadata` function to the store
15991599
after writing.
16001600
append_dim: hashable, optional
1601-
If set, the dimension on which the data will be appended.
1601+
If set, the dimension along which the data will be appended. All
1602+
other dimensions on overridden variables must remain the same size.
16021603
16031604
References
16041605
----------
@@ -1766,7 +1767,7 @@ def maybe_chunk(name, var, chunks):
17661767
return self._replace(variables)
17671768

17681769
def _validate_indexers(
1769-
self, indexers: Mapping[Hashable, Any], missing_dims: str = "raise",
1770+
self, indexers: Mapping[Hashable, Any], missing_dims: str = "raise"
17701771
) -> Iterator[Tuple[Hashable, Union[int, slice, np.ndarray, Variable]]]:
17711772
""" Here we make sure
17721773
+ indexer has valid keys
@@ -5933,7 +5934,7 @@ def polyfit(
59335934
"The number of data points must exceed order to scale the covariance matrix."
59345935
)
59355936
fac = residuals / (x.shape[0] - order)
5936-
covariance = xr.DataArray(Vbase, dims=("cov_i", "cov_j"),) * fac
5937+
covariance = xr.DataArray(Vbase, dims=("cov_i", "cov_j")) * fac
59375938
variables[name + "polyfit_covariance"] = covariance
59385939

59395940
return Dataset(data_vars=variables, attrs=self.attrs.copy())
@@ -6199,7 +6200,7 @@ def idxmin(
61996200
skipna=skipna,
62006201
fill_value=fill_value,
62016202
keep_attrs=keep_attrs,
6202-
),
6203+
)
62036204
)
62046205

62056206
def idxmax(
@@ -6297,7 +6298,7 @@ def idxmax(
62976298
skipna=skipna,
62986299
fill_value=fill_value,
62996300
keep_attrs=keep_attrs,
6300-
),
6301+
)
63016302
)
63026303

63036304

xarray/tests/test_backends.py

Lines changed: 24 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1526,12 +1526,6 @@ def roundtrip(
15261526
with self.open(store_target, **open_kwargs) as ds:
15271527
yield ds
15281528

1529-
@contextlib.contextmanager
1530-
def roundtrip_append(
1531-
self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False
1532-
):
1533-
pytest.skip("zarr backend does not support appending")
1534-
15351529
def test_roundtrip_consolidated(self):
15361530
pytest.importorskip("zarr", minversion="2.2.1.dev2")
15371531
expected = create_test_data()
@@ -1826,63 +1820,52 @@ def test_encoding_kwarg_fixed_width_string(self):
18261820
# not relevant for zarr, since we don't use EncodedStringCoder
18271821
pass
18281822

1829-
# TODO: someone who understand caching figure out whether chaching
1823+
# TODO: someone who understands caching figure out whether caching
18301824
# makes sense for Zarr backend
18311825
@pytest.mark.xfail(reason="Zarr caching not implemented")
18321826
def test_dataset_caching(self):
18331827
super().test_dataset_caching()
18341828

18351829
@pytest.mark.skipif(LooseVersion(dask_version) < "2.4", reason="dask GH5334")
18361830
def test_append_write(self):
1837-
ds, ds_to_append, _ = create_append_test_data()
1838-
with self.create_zarr_target() as store_target:
1839-
ds.to_zarr(store_target, mode="w")
1840-
ds_to_append.to_zarr(store_target, append_dim="time")
1841-
original = xr.concat([ds, ds_to_append], dim="time")
1842-
assert_identical(original, xr.open_zarr(store_target))
1843-
1844-
@pytest.mark.xfail(reason="Zarr stores can not be appended to")
1845-
def test_append_overwrite_values(self):
1846-
super().test_append_overwrite_values()
1831+
super().test_append_write()
18471832

18481833
def test_append_with_invalid_dim_raises(self):
1849-
18501834
ds, ds_to_append, _ = create_append_test_data()
1851-
1852-
# check failure when append_dim not valid
1853-
with pytest.raises(ValueError):
1854-
with self.create_zarr_target() as store_target:
1855-
ds.to_zarr(store_target, mode="w")
1835+
with self.create_zarr_target() as store_target:
1836+
ds.to_zarr(store_target, mode="w")
1837+
with pytest.raises(
1838+
ValueError, match="does not match any existing dataset dimensions"
1839+
):
18561840
ds_to_append.to_zarr(store_target, append_dim="notvalid")
18571841

1858-
def test_append_with_append_dim_not_set_raises(self):
1842+
def test_append_with_no_dims_raises(self):
1843+
with self.create_zarr_target() as store_target:
1844+
Dataset({"foo": ("x", [1])}).to_zarr(store_target, mode="w")
1845+
with pytest.raises(ValueError, match="different dimension names"):
1846+
Dataset({"foo": ("y", [2])}).to_zarr(store_target, mode="a")
18591847

1848+
def test_append_with_append_dim_not_set_raises(self):
18601849
ds, ds_to_append, _ = create_append_test_data()
1861-
1862-
# check failure when append_dim not set
1863-
with pytest.raises(ValueError):
1864-
with self.create_zarr_target() as store_target:
1865-
ds.to_zarr(store_target, mode="w")
1850+
with self.create_zarr_target() as store_target:
1851+
ds.to_zarr(store_target, mode="w")
1852+
with pytest.raises(ValueError, match="different dimension sizes"):
18661853
ds_to_append.to_zarr(store_target, mode="a")
18671854

18681855
def test_append_with_mode_not_a_raises(self):
1869-
18701856
ds, ds_to_append, _ = create_append_test_data()
1871-
1872-
# check failure when append_dim is set and mode != 'a'
1873-
with pytest.raises(ValueError):
1874-
with self.create_zarr_target() as store_target:
1875-
ds.to_zarr(store_target, mode="w")
1857+
with self.create_zarr_target() as store_target:
1858+
ds.to_zarr(store_target, mode="w")
1859+
with pytest.raises(
1860+
ValueError, match="append_dim was set along with mode='w'"
1861+
):
18761862
ds_to_append.to_zarr(store_target, mode="w", append_dim="time")
18771863

18781864
def test_append_with_existing_encoding_raises(self):
1879-
18801865
ds, ds_to_append, _ = create_append_test_data()
1881-
1882-
# check failure when providing encoding to existing variable
1883-
with pytest.raises(ValueError):
1884-
with self.create_zarr_target() as store_target:
1885-
ds.to_zarr(store_target, mode="w")
1866+
with self.create_zarr_target() as store_target:
1867+
ds.to_zarr(store_target, mode="w")
1868+
with pytest.raises(ValueError, match="but encoding was provided"):
18861869
ds_to_append.to_zarr(
18871870
store_target,
18881871
append_dim="time",

0 commit comments

Comments
 (0)