Skip to content
forked from pydata/xarray

Commit 1e8b218

Browse files
committed
Squashed commit dask-computes
commit 0711eb0 Author: dcherian <[email protected]> Date: Thu Oct 31 21:18:58 2019 -0600 bugfix. commit 4ee2963 Author: Deepak Cherian <[email protected]> Date: Thu Oct 31 11:27:05 2019 -0600 pep8 commit 6e4c11f Merge: 08f7f74 53c5199 Author: Deepak Cherian <[email protected]> Date: Thu Oct 31 11:25:12 2019 -0600 Merge branch 'master' into fix/dask-computes commit 08f7f74 Merge: 53c0f4e 278d2e6 Author: dcherian <[email protected]> Date: Tue Oct 29 09:36:58 2019 -0600 Merge remote-tracking branch 'upstream/master' into fix/dask-computes * upstream/master: upgrade black verison to 19.10b0 (pydata#3456) Remove outdated code related to compatibility with netcdftime (pydata#3450) commit 53c0f4e Author: dcherian <[email protected]> Date: Tue Oct 29 09:25:27 2019 -0600 Add identity check to lazy_array_equiv commit 5e742e4 Author: dcherian <[email protected]> Date: Tue Oct 29 09:22:15 2019 -0600 update whats new commit ee0d422 Merge: e99148e 74ca69a Author: dcherian <[email protected]> Date: Tue Oct 29 09:18:38 2019 -0600 Merge remote-tracking branch 'upstream/master' into fix/dask-computes * upstream/master: Remove deprecated behavior from dataset.drop docstring (pydata#3451) jupyterlab dark theme (pydata#3443) Drop groups associated with nans in group variable (pydata#3406) Allow ellipsis (...) in transpose (pydata#3421) Another groupby.reduce bugfix. (pydata#3403) add icomoon license (pydata#3448) commit e99148e Author: dcherian <[email protected]> Date: Tue Oct 29 09:17:58 2019 -0600 add concat test commit 4a66e7c Author: dcherian <[email protected]> Date: Mon Oct 28 10:19:32 2019 -0600 review suggestions. commit 8739ddd Author: dcherian <[email protected]> Date: Mon Oct 28 08:32:15 2019 -0600 better docstring commit e84cc97 Author: dcherian <[email protected]> Date: Sun Oct 27 20:22:13 2019 -0600 Optimize dask array equality checks. Dask arrays with the same graph have the same name. We can use this to quickly compare dask-backed variables without computing. Fixes pydata#3068 and pydata#3311
1 parent afe01ac commit 1e8b218

File tree

6 files changed

+217
-45
lines changed

6 files changed

+217
-45
lines changed

doc/whats-new.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,9 @@ Bug fixes
7070
but cloudpickle isn't (:issue:`3401`) by `Rhys Doyle <https://github.com/rdoyle45>`_
7171
- Fix grouping over variables with NaNs. (:issue:`2383`, :pull:`3406`).
7272
By `Deepak Cherian <https://github.com/dcherian>`_.
73+
- Use dask names to compare dask objects prior to comparing values after computation.
74+
(:issue:`3068`, :issue:`3311`, :issue:`3454`, :pull:`3453`).
75+
By `Deepak Cherian <https://github.com/dcherian>`_.
7376
- Sync with cftime by removing `dayofwk=-1` for cftime>=1.0.4.
7477
By `Anderson Banihirwe <https://github.com/andersy005>`_.
7578
- Fix :py:meth:`xarray.core.groupby.DataArrayGroupBy.reduce` and

xarray/core/concat.py

Lines changed: 37 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from . import dtypes, utils
44
from .alignment import align
5+
from .duck_array_ops import lazy_array_equiv
56
from .merge import _VALID_COMPAT, unique_variable
67
from .variable import IndexVariable, Variable, as_variable
78
from .variable import concat as concat_vars
@@ -189,26 +190,43 @@ def process_subset_opt(opt, subset):
189190
# all nonindexes that are not the same in each dataset
190191
for k in getattr(datasets[0], subset):
191192
if k not in concat_over:
192-
# Compare the variable of all datasets vs. the one
193-
# of the first dataset. Perform the minimum amount of
194-
# loads in order to avoid multiple loads from disk
195-
# while keeping the RAM footprint low.
196-
v_lhs = datasets[0].variables[k].load()
197-
# We'll need to know later on if variables are equal.
198-
computed = []
199-
for ds_rhs in datasets[1:]:
200-
v_rhs = ds_rhs.variables[k].compute()
201-
computed.append(v_rhs)
202-
if not getattr(v_lhs, compat)(v_rhs):
203-
concat_over.add(k)
204-
equals[k] = False
205-
# computed variables are not to be re-computed
206-
# again in the future
207-
for ds, v in zip(datasets[1:], computed):
208-
ds.variables[k].data = v.data
193+
equals[k] = None
194+
variables = [ds.variables[k] for ds in datasets]
195+
# first check without comparing values i.e. no computes
196+
for var in variables[1:]:
197+
equals[k] = getattr(variables[0], compat)(
198+
var, equiv=lazy_array_equiv
199+
)
200+
if equals[k] is not True:
201+
# exit early if we know these are not equal or that
202+
# equality cannot be determined i.e. one or all of
203+
# the variables wraps a numpy array
209204
break
210-
else:
211-
equals[k] = True
205+
206+
if equals[k] is False:
207+
concat_over.add(k)
208+
209+
elif equals[k] is None:
210+
# Compare the variable of all datasets vs. the one
211+
# of the first dataset. Perform the minimum amount of
212+
# loads in order to avoid multiple loads from disk
213+
# while keeping the RAM footprint low.
214+
v_lhs = datasets[0].variables[k].load()
215+
# We'll need to know later on if variables are equal.
216+
computed = []
217+
for ds_rhs in datasets[1:]:
218+
v_rhs = ds_rhs.variables[k].compute()
219+
computed.append(v_rhs)
220+
if not getattr(v_lhs, compat)(v_rhs):
221+
concat_over.add(k)
222+
equals[k] = False
223+
# computed variables are not to be re-computed
224+
# again in the future
225+
for ds, v in zip(datasets[1:], computed):
226+
ds.variables[k].data = v.data
227+
break
228+
else:
229+
equals[k] = True
212230

213231
elif opt == "all":
214232
concat_over.update(

xarray/core/duck_array_ops.py

Lines changed: 47 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -174,27 +174,57 @@ def as_shared_dtype(scalars_or_arrays):
174174
return [x.astype(out_type, copy=False) for x in arrays]
175175

176176

177-
def allclose_or_equiv(arr1, arr2, rtol=1e-5, atol=1e-8):
178-
"""Like np.allclose, but also allows values to be NaN in both arrays
177+
def lazy_array_equiv(arr1, arr2):
178+
"""Like array_equal, but doesn't actually compare values.
179+
Returns True when arr1, arr2 identical or their dask names are equal.
180+
Returns False when shapes are not equal.
181+
Returns None when equality cannot determined: one or both of arr1, arr2 are numpy arrays;
182+
or their dask names are not equal
179183
"""
184+
if arr1 is arr2:
185+
return True
180186
arr1 = asarray(arr1)
181187
arr2 = asarray(arr2)
182188
if arr1.shape != arr2.shape:
183189
return False
184-
return bool(isclose(arr1, arr2, rtol=rtol, atol=atol, equal_nan=True).all())
190+
if (
191+
dask_array
192+
and isinstance(arr1, dask_array.Array)
193+
and isinstance(arr2, dask_array.Array)
194+
):
195+
# GH3068
196+
if arr1.name == arr2.name:
197+
return True
198+
else:
199+
return None
200+
return None
201+
202+
203+
def allclose_or_equiv(arr1, arr2, rtol=1e-5, atol=1e-8):
204+
"""Like np.allclose, but also allows values to be NaN in both arrays
205+
"""
206+
arr1 = asarray(arr1)
207+
arr2 = asarray(arr2)
208+
lazy_equiv = lazy_array_equiv(arr1, arr2)
209+
if lazy_equiv is None:
210+
return bool(isclose(arr1, arr2, rtol=rtol, atol=atol, equal_nan=True).all())
211+
else:
212+
return lazy_equiv
185213

186214

187215
def array_equiv(arr1, arr2):
188216
"""Like np.array_equal, but also allows values to be NaN in both arrays
189217
"""
190218
arr1 = asarray(arr1)
191219
arr2 = asarray(arr2)
192-
if arr1.shape != arr2.shape:
193-
return False
194-
with warnings.catch_warnings():
195-
warnings.filterwarnings("ignore", "In the future, 'NAT == x'")
196-
flag_array = (arr1 == arr2) | (isnull(arr1) & isnull(arr2))
197-
return bool(flag_array.all())
220+
lazy_equiv = lazy_array_equiv(arr1, arr2)
221+
if lazy_equiv is None:
222+
with warnings.catch_warnings():
223+
warnings.filterwarnings("ignore", "In the future, 'NAT == x'")
224+
flag_array = (arr1 == arr2) | (isnull(arr1) & isnull(arr2))
225+
return bool(flag_array.all())
226+
else:
227+
return lazy_equiv
198228

199229

200230
def array_notnull_equiv(arr1, arr2):
@@ -203,12 +233,14 @@ def array_notnull_equiv(arr1, arr2):
203233
"""
204234
arr1 = asarray(arr1)
205235
arr2 = asarray(arr2)
206-
if arr1.shape != arr2.shape:
207-
return False
208-
with warnings.catch_warnings():
209-
warnings.filterwarnings("ignore", "In the future, 'NAT == x'")
210-
flag_array = (arr1 == arr2) | isnull(arr1) | isnull(arr2)
211-
return bool(flag_array.all())
236+
lazy_equiv = lazy_array_equiv(arr1, arr2)
237+
if lazy_equiv is None:
238+
with warnings.catch_warnings():
239+
warnings.filterwarnings("ignore", "In the future, 'NAT == x'")
240+
flag_array = (arr1 == arr2) | isnull(arr1) | isnull(arr2)
241+
return bool(flag_array.all())
242+
else:
243+
return lazy_equiv
212244

213245

214246
def count(data, axis=None):

xarray/core/merge.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
from . import dtypes, pdcompat
2121
from .alignment import deep_align
22+
from .duck_array_ops import lazy_array_equiv
2223
from .utils import Frozen, dict_equiv
2324
from .variable import Variable, as_variable, assert_unique_multiindex_level_names
2425

@@ -123,16 +124,24 @@ def unique_variable(
123124
combine_method = "fillna"
124125

125126
if equals is None:
126-
out = out.compute()
127+
# first check without comparing values i.e. no computes
127128
for var in variables[1:]:
128-
equals = getattr(out, compat)(var)
129-
if not equals:
129+
equals = getattr(out, compat)(var, equiv=lazy_array_equiv)
130+
if equals is not True:
130131
break
131132

133+
if equals is None:
134+
# now compare values with minimum number of computes
135+
out = out.compute()
136+
for var in variables[1:]:
137+
equals = getattr(out, compat)(var)
138+
if not equals:
139+
break
140+
132141
if not equals:
133142
raise MergeError(
134-
"conflicting values for variable {!r} on objects to be combined. "
135-
"You can skip this check by specifying compat='override'.".format(name)
143+
f"conflicting values for variable {name!r} on objects to be combined. "
144+
"You can skip this check by specifying compat='override'."
136145
)
137146

138147
if combine_method:

xarray/core/variable.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1236,7 +1236,9 @@ def transpose(self, *dims) -> "Variable":
12361236
dims = self.dims[::-1]
12371237
dims = tuple(infix_dims(dims, self.dims))
12381238
axes = self.get_axis_num(dims)
1239-
if len(dims) < 2: # no need to transpose if only one dimension
1239+
if len(dims) < 2 or dims == self.dims:
1240+
# no need to transpose if only one dimension
1241+
# or dims are in same order
12401242
return self.copy(deep=False)
12411243

12421244
data = as_indexable(self._data).transpose(axes)
@@ -1595,22 +1597,24 @@ def broadcast_equals(self, other, equiv=duck_array_ops.array_equiv):
15951597
return False
15961598
return self.equals(other, equiv=equiv)
15971599

1598-
def identical(self, other):
1600+
def identical(self, other, equiv=duck_array_ops.array_equiv):
15991601
"""Like equals, but also checks attributes.
16001602
"""
16011603
try:
1602-
return utils.dict_equiv(self.attrs, other.attrs) and self.equals(other)
1604+
return utils.dict_equiv(self.attrs, other.attrs) and self.equals(
1605+
other, equiv=equiv
1606+
)
16031607
except (TypeError, AttributeError):
16041608
return False
16051609

1606-
def no_conflicts(self, other):
1610+
def no_conflicts(self, other, equiv=duck_array_ops.array_notnull_equiv):
16071611
"""True if the intersection of two Variable's non-null data is
16081612
equal; otherwise false.
16091613
16101614
Variables can thus still be equal if there are locations where either,
16111615
or both, contain NaN values.
16121616
"""
1613-
return self.broadcast_equals(other, equiv=duck_array_ops.array_notnull_equiv)
1617+
return self.broadcast_equals(other, equiv=equiv)
16141618

16151619
def quantile(self, q, dim=None, interpolation="linear", keep_attrs=None):
16161620
"""Compute the qth quantile of the data along the specified dimension.

xarray/tests/test_dask.py

Lines changed: 107 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
raises_regex,
2525
requires_scipy_or_netCDF4,
2626
)
27+
from ..core.duck_array_ops import lazy_array_equiv
2728
from .test_backends import create_tmp_file
2829

2930
dask = pytest.importorskip("dask")
@@ -428,7 +429,53 @@ def test_concat_loads_variables(self):
428429
out.compute()
429430
assert kernel_call_count == 24
430431

431-
# Finally, test that riginals are unaltered
432+
# Finally, test that originals are unaltered
433+
assert ds1["d"].data is d1
434+
assert ds1["c"].data is c1
435+
assert ds2["d"].data is d2
436+
assert ds2["c"].data is c2
437+
assert ds3["d"].data is d3
438+
assert ds3["c"].data is c3
439+
440+
# now check that concat() is correctly using dask name equality to skip loads
441+
out = xr.concat(
442+
[ds1, ds1, ds1], dim="n", data_vars="different", coords="different"
443+
)
444+
assert kernel_call_count == 24
445+
# variables are not loaded in the output
446+
assert isinstance(out["d"].data, dask.array.Array)
447+
assert isinstance(out["c"].data, dask.array.Array)
448+
449+
out = xr.concat(
450+
[ds1, ds1, ds1], dim="n", data_vars=[], coords=[], compat="identical"
451+
)
452+
assert kernel_call_count == 24
453+
# variables are not loaded in the output
454+
assert isinstance(out["d"].data, dask.array.Array)
455+
assert isinstance(out["c"].data, dask.array.Array)
456+
457+
out = xr.concat(
458+
[ds1, ds2.compute(), ds3],
459+
dim="n",
460+
data_vars="all",
461+
coords="different",
462+
compat="identical",
463+
)
464+
# c1,c3 must be computed for comparison since c2 is numpy;
465+
# d2 is computed too
466+
assert kernel_call_count == 28
467+
468+
out = xr.concat(
469+
[ds1, ds2.compute(), ds3],
470+
dim="n",
471+
data_vars="all",
472+
coords="all",
473+
compat="identical",
474+
)
475+
# no extra computes
476+
assert kernel_call_count == 30
477+
478+
# Finally, test that originals are unaltered
432479
assert ds1["d"].data is d1
433480
assert ds1["c"].data is c1
434481
assert ds2["d"].data is d2
@@ -1142,6 +1189,19 @@ def test_make_meta(map_ds):
11421189
assert meta.data_vars[variable].shape == (0,) * meta.data_vars[variable].ndim
11431190

11441191

1192+
def test_identical_coords_no_computes():
1193+
lons2 = xr.DataArray(da.zeros((10, 10), chunks=2), dims=("y", "x"))
1194+
a = xr.DataArray(
1195+
da.zeros((10, 10), chunks=2), dims=("y", "x"), coords={"lons": lons2}
1196+
)
1197+
b = xr.DataArray(
1198+
da.zeros((10, 10), chunks=2), dims=("y", "x"), coords={"lons": lons2}
1199+
)
1200+
with raise_if_dask_computes():
1201+
c = a + b
1202+
assert_identical(c, a)
1203+
1204+
11451205
@pytest.mark.parametrize(
11461206
"obj", [make_da(), make_da().compute(), make_ds(), make_ds().compute()]
11471207
)
@@ -1229,3 +1289,49 @@ def test_normalize_token_with_backend(map_ds):
12291289
map_ds.to_netcdf(tmp_file)
12301290
read = xr.open_dataset(tmp_file)
12311291
assert not dask.base.tokenize(map_ds) == dask.base.tokenize(read)
1292+
1293+
1294+
@pytest.mark.parametrize(
1295+
"compat", ["broadcast_equals", "equals", "identical", "no_conflicts"]
1296+
)
1297+
def test_lazy_array_equiv_variables(compat):
1298+
var1 = xr.Variable(("y", "x"), da.zeros((10, 10), chunks=2))
1299+
var2 = xr.Variable(("y", "x"), da.zeros((10, 10), chunks=2))
1300+
var3 = xr.Variable(("y", "x"), da.zeros((20, 10), chunks=2))
1301+
1302+
with raise_if_dask_computes():
1303+
assert getattr(var1, compat)(var2, equiv=lazy_array_equiv)
1304+
# values are actually equal, but we don't know that till we compute, return None
1305+
with raise_if_dask_computes():
1306+
assert getattr(var1, compat)(var2 / 2, equiv=lazy_array_equiv) is None
1307+
1308+
# shapes are not equal, return False without computes
1309+
with raise_if_dask_computes():
1310+
assert getattr(var1, compat)(var3, equiv=lazy_array_equiv) is False
1311+
1312+
# if one or both arrays are numpy, return None
1313+
assert getattr(var1, compat)(var2.compute(), equiv=lazy_array_equiv) is None
1314+
assert (
1315+
getattr(var1.compute(), compat)(var2.compute(), equiv=lazy_array_equiv) is None
1316+
)
1317+
1318+
with raise_if_dask_computes():
1319+
assert getattr(var1, compat)(var2.transpose("y", "x"))
1320+
1321+
1322+
@pytest.mark.parametrize(
1323+
"compat", ["broadcast_equals", "equals", "identical", "no_conflicts"]
1324+
)
1325+
def test_lazy_array_equiv_merge(compat):
1326+
da1 = xr.DataArray(da.zeros((10, 10), chunks=2), dims=("y", "x"))
1327+
da2 = xr.DataArray(da.zeros((10, 10), chunks=2), dims=("y", "x"))
1328+
da3 = xr.DataArray(da.ones((20, 10), chunks=2), dims=("y", "x"))
1329+
1330+
with raise_if_dask_computes():
1331+
xr.merge([da1, da2], compat=compat)
1332+
# shapes are not equal; no computes necessary
1333+
with raise_if_dask_computes(max_computes=0):
1334+
with pytest.raises(ValueError):
1335+
xr.merge([da1, da3], compat=compat)
1336+
with raise_if_dask_computes(max_computes=2):
1337+
xr.merge([da1, da2 / 2], compat=compat)

0 commit comments

Comments
 (0)