Skip to content

Commit e1f0f98

Browse files
authored
Better chunking error messages for zarr backend (#3983)
1 parent 0cd14a5 commit e1f0f98

File tree

3 files changed

+50
-24
lines changed

3 files changed

+50
-24
lines changed

doc/whats-new.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,8 @@ Documentation
105105

106106
Internal Changes
107107
~~~~~~~~~~~~~~~~
108+
- Raise more informative error messages for chunk size conflicts when writing to zarr files.
109+
By `Deepak Cherian <https://github.com/dcherian>`_.
108110
- Run the ``isort`` pre-commit hook only on python source files
109111
and update the ``flake8`` version. (:issue:`3750`, :pull:`3711`)
110112
By `Justus Magin <https://github.com/keewis>`_.

xarray/backends/zarr.py

Lines changed: 29 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def __getitem__(self, key):
6565
# could possibly have a work-around for 0d data here
6666

6767

68-
def _determine_zarr_chunks(enc_chunks, var_chunks, ndim):
68+
def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name):
6969
"""
7070
Given encoding chunks (possibly None) and variable chunks (possibly None)
7171
"""
@@ -88,15 +88,16 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim):
8888
if var_chunks and enc_chunks is None:
8989
if any(len(set(chunks[:-1])) > 1 for chunks in var_chunks):
9090
raise ValueError(
91-
"Zarr requires uniform chunk sizes except for final chunk."
92-
" Variable dask chunks %r are incompatible. Consider "
93-
"rechunking using `chunk()`." % (var_chunks,)
91+
"Zarr requires uniform chunk sizes except for final chunk. "
92+
f"Variable named {name!r} has incompatible dask chunks: {var_chunks!r}. "
93+
"Consider rechunking using `chunk()`."
9494
)
9595
if any((chunks[0] < chunks[-1]) for chunks in var_chunks):
9696
raise ValueError(
9797
"Final chunk of Zarr array must be the same size or smaller "
98-
"than the first. Variable Dask chunks %r are incompatible. "
99-
"Consider rechunking using `chunk()`." % var_chunks
98+
f"than the first. Variable named {name!r} has incompatible Dask chunks {var_chunks!r}. "
99+
"Consider either rechunking using `chunk()` or instead deleting "
100+
"or modifying `encoding['chunks']`."
100101
)
101102
# return the first chunk for each dimension
102103
return tuple(chunk[0] for chunk in var_chunks)
@@ -114,13 +115,15 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim):
114115

115116
if len(enc_chunks_tuple) != ndim:
116117
# throw away encoding chunks, start over
117-
return _determine_zarr_chunks(None, var_chunks, ndim)
118+
return _determine_zarr_chunks(None, var_chunks, ndim, name)
118119

119120
for x in enc_chunks_tuple:
120121
if not isinstance(x, int):
121122
raise TypeError(
122-
"zarr chunks must be an int or a tuple of ints. "
123-
"Instead found %r" % (enc_chunks_tuple,)
123+
"zarr chunk sizes specified in `encoding['chunks']` "
124+
"must be an int or a tuple of ints. "
125+
f"Instead found encoding['chunks']={enc_chunks_tuple!r} "
126+
f"for variable named {name!r}."
124127
)
125128

126129
# if there are chunks in encoding and the variable data is a numpy array,
@@ -142,19 +145,22 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim):
142145
for dchunk in dchunks[:-1]:
143146
if dchunk % zchunk:
144147
raise NotImplementedError(
145-
"Specified zarr chunks %r would overlap multiple dask "
146-
"chunks %r. This is not implemented in xarray yet. "
147-
" Consider rechunking the data using "
148-
"`chunk()` or specifying different chunks in encoding."
149-
% (enc_chunks_tuple, var_chunks)
148+
f"Specified zarr chunks encoding['chunks']={enc_chunks_tuple!r} for "
149+
f"variable named {name!r} would overlap multiple dask chunks {var_chunks!r}. "
150+
"This is not implemented in xarray yet. "
151+
"Consider either rechunking using `chunk()` or instead deleting "
152+
"or modifying `encoding['chunks']`."
150153
)
151154
if dchunks[-1] > zchunk:
152155
raise ValueError(
153156
"Final chunk of Zarr array must be the same size or "
154-
"smaller than the first. The specified Zarr chunk "
155-
"encoding is %r, but %r in variable Dask chunks %r is "
156-
"incompatible. Consider rechunking using `chunk()`."
157-
% (enc_chunks_tuple, dchunks, var_chunks)
157+
"smaller than the first. "
158+
f"Specified Zarr chunk encoding['chunks']={enc_chunks_tuple}, "
159+
f"for variable named {name!r} "
160+
f"but {dchunks} in the variable's Dask chunks {var_chunks} is "
161+
"incompatible with this encoding. "
162+
"Consider either rechunking using `chunk()` or instead deleting "
163+
"or modifying `encoding['chunks']`."
158164
)
159165
return enc_chunks_tuple
160166

@@ -177,7 +183,7 @@ def _get_zarr_dims_and_attrs(zarr_obj, dimension_key):
177183
return dimensions, attributes
178184

179185

180-
def extract_zarr_variable_encoding(variable, raise_on_invalid=False):
186+
def extract_zarr_variable_encoding(variable, raise_on_invalid=False, name=None):
181187
"""
182188
Extract zarr encoding dictionary from xarray Variable
183189
@@ -207,7 +213,7 @@ def extract_zarr_variable_encoding(variable, raise_on_invalid=False):
207213
del encoding[k]
208214

209215
chunks = _determine_zarr_chunks(
210-
encoding.get("chunks"), variable.chunks, variable.ndim
216+
encoding.get("chunks"), variable.chunks, variable.ndim, name
211217
)
212218
encoding["chunks"] = chunks
213219
return encoding
@@ -453,7 +459,9 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No
453459
writer.add(v.data, zarr_array, region=tuple(new_region))
454460
else:
455461
# new variable
456-
encoding = extract_zarr_variable_encoding(v, raise_on_invalid=check)
462+
encoding = extract_zarr_variable_encoding(
463+
v, raise_on_invalid=check, name=vn
464+
)
457465
encoded_attrs = {}
458466
# the magic for storing the hidden dimension data
459467
encoded_attrs[DIMENSION_KEY] = dims

xarray/tests/test_backends.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1685,11 +1685,27 @@ def test_chunk_encoding_with_dask(self):
16851685

16861686
# should fail if dask_chunks are irregular...
16871687
ds_chunk_irreg = ds.chunk({"x": (5, 4, 3)})
1688-
with pytest.raises(ValueError) as e_info:
1688+
with raises_regex(ValueError, "uniform chunk sizes."):
16891689
with self.roundtrip(ds_chunk_irreg) as actual:
16901690
pass
1691-
# make sure this error message is correct and not some other error
1692-
assert e_info.match("chunks")
1691+
1692+
# should fail if encoding["chunks"] clashes with dask_chunks
1693+
badenc = ds.chunk({"x": 4})
1694+
badenc.var1.encoding["chunks"] = (6,)
1695+
with raises_regex(NotImplementedError, "named 'var1' would overlap"):
1696+
with self.roundtrip(badenc) as actual:
1697+
pass
1698+
1699+
badenc.var1.encoding["chunks"] = (2,)
1700+
with raises_regex(ValueError, "Specified Zarr chunk encoding"):
1701+
with self.roundtrip(badenc) as actual:
1702+
pass
1703+
1704+
badenc = badenc.chunk({"x": (3, 3, 6)})
1705+
badenc.var1.encoding["chunks"] = (3,)
1706+
with raises_regex(ValueError, "incompatible with this encoding"):
1707+
with self.roundtrip(badenc) as actual:
1708+
pass
16931709

16941710
# ... except if the last chunk is smaller than the first
16951711
ds_chunk_irreg = ds.chunk({"x": (5, 5, 2)})

0 commit comments

Comments
 (0)