
Commit 54e6fde

Merge branch 'master' into pint-support-dataarray
2 parents: 61c3eb9 + b3d3b44

12 files changed: +176 -25 lines

doc/related-projects.rst

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@ Geosciences
 - `PyGDX <https://pygdx.readthedocs.io/en/latest/>`_: Python 3 package for
   accessing data stored in GAMS Data eXchange (GDX) files. Also uses a custom
   subclass.
+- `pyinterp <https://pangeo-pyinterp.readthedocs.io/en/latest/>`_: Python 3 package for interpolating geo-referenced data used in the field of geosciences.
 - `pyXpcm <https://pyxpcm.readthedocs.io>`_: xarray-based Profile Classification Modelling (PCM), mostly for ocean data.
 - `Regionmask <https://regionmask.readthedocs.io/>`_: plotting and creation of masks of spatial regions
 - `rioxarray <https://corteva.github.io/rioxarray>`_: geospatial xarray extension powered by rasterio

doc/whats-new.rst

Lines changed: 9 additions & 0 deletions
@@ -25,6 +25,12 @@ Breaking changes
 
 New Features
 ~~~~~~~~~~~~
+- Implement :py:func:`median` and :py:func:`nanmedian` for dask arrays. This works by rechunking
+  to a single chunk along all reduction axes. (:issue:`2999`).
+  By `Deepak Cherian <https://github.com/dcherian>`_.
+- :py:func:`xarray.concat` now preserves attributes from the first Variable.
+  (:issue:`2575`, :issue:`2060`, :issue:`1614`)
+  By `Deepak Cherian <https://github.com/dcherian>`_.
 - :py:meth:`Dataset.quantile`, :py:meth:`DataArray.quantile` and ``GroupBy.quantile``
   now work with dask Variables.
   By `Deepak Cherian <https://github.com/dcherian>`_.
@@ -38,6 +44,9 @@ New Features
 
 Bug fixes
 ~~~~~~~~~
+- Fix :py:meth:`xarray.combine_by_coords` to allow for combining incomplete
+  hypercubes of Datasets (:issue:`3648`). By `Ian Bolliger
+  <https://github.com/bolliger32>`_.
 - Fix :py:meth:`xarray.combine_by_coords` when combining cftime coordinates
   which span long time intervals (:issue:`3535`). By `Spencer Clark
   <https://github.com/spencerkclark>`_.
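
For context, a minimal usage sketch of the new dask median support (assumes
dask is installed; the shape and chunk sizes here are arbitrary):

    import numpy as np
    import xarray as xr

    # A dask-backed DataArray: median now rechunks the reduced axis to a
    # single chunk and applies numpy's median blockwise.
    arr = xr.DataArray(np.random.rand(4, 6), dims=["x", "y"]).chunk({"x": 2, "y": 3})
    print(arr.median(dim="y").compute())

    # Reducing over every dimension at once still raises NotImplementedError
    # for dask arrays, to avoid rechunking the entire array into one chunk.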

xarray/core/combine.py

Lines changed: 29 additions & 7 deletions
@@ -115,11 +115,12 @@ def _infer_concat_order_from_coords(datasets):
     return combined_ids, concat_dims
 
 
-def _check_shape_tile_ids(combined_tile_ids):
+def _check_dimension_depth_tile_ids(combined_tile_ids):
+    """
+    Check all tuples are the same length, i.e. check that all lists are
+    nested to the same depth.
+    """
     tile_ids = combined_tile_ids.keys()
-
-    # Check all tuples are the same length
-    # i.e. check that all lists are nested to the same depth
     nesting_depths = [len(tile_id) for tile_id in tile_ids]
     if not nesting_depths:
         nesting_depths = [0]
@@ -128,8 +129,13 @@ def _check_shape_tile_ids(combined_tile_ids):
             "The supplied objects do not form a hypercube because"
             " sub-lists do not have consistent depths"
         )
+    # return these just to be reused in _check_shape_tile_ids
+    return tile_ids, nesting_depths
 
-    # Check all lists along one dimension are same length
+
+def _check_shape_tile_ids(combined_tile_ids):
+    """Check all lists along one dimension are same length."""
+    tile_ids, nesting_depths = _check_dimension_depth_tile_ids(combined_tile_ids)
     for dim in range(nesting_depths[0]):
         indices_along_dim = [tile_id[dim] for tile_id in tile_ids]
         occurrences = Counter(indices_along_dim)
@@ -536,7 +542,8 @@ def combine_by_coords(
     coords : {'minimal', 'different', 'all' or list of str}, optional
         As per the 'data_vars' kwarg, but for coordinate variables.
     fill_value : scalar, optional
-        Value to use for newly missing values
+        Value to use for newly missing values. If None, raises a ValueError if
+        the passed Datasets do not create a complete hypercube.
     join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
         String indicating how to combine differing indexes
         (excluding concat_dim) in objects
@@ -653,6 +660,15 @@ def combine_by_coords(
         temperature    (y, x) float64 1.654 10.63 7.015 2.543 ... 12.46 2.22 15.96
         precipitation  (y, x) float64 0.2136 0.9974 0.7603 ... 0.6125 0.4654 0.5953
 
+    >>> xr.combine_by_coords([x1, x2, x3])
+    <xarray.Dataset>
+    Dimensions:        (x: 6, y: 4)
+    Coordinates:
+      * x              (x) int64 10 20 30 40 50 60
+      * y              (y) int64 0 1 2 3
+    Data variables:
+        temperature    (y, x) float64 1.654 10.63 7.015 nan ... 12.46 2.22 15.96
+        precipitation  (y, x) float64 0.2136 0.9974 0.7603 ... 0.6125 0.4654 0.5953
     """
 
     # Group by data vars
@@ -667,7 +683,13 @@ def combine_by_coords(
             list(datasets_with_same_vars)
         )
 
-        _check_shape_tile_ids(combined_ids)
+        if fill_value is None:
+            # check that datasets form complete hypercube
+            _check_shape_tile_ids(combined_ids)
+        else:
+            # check only that all datasets have same dimension depth for these
+            # vars
+            _check_dimension_depth_tile_ids(combined_ids)
 
         # Concatenate along all of concat_dims one by one to create single ds
         concatenated = _combine_nd(
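
A usage sketch of the new ``fill_value`` semantics, mirroring the doctest
above and the test added further below (variable names are illustrative):

    import xarray as xr

    x1 = xr.Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [0], "x": [0]})
    x2 = xr.Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [1], "x": [0]})
    x3 = xr.Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [0], "x": [1]})

    # The default fill_value pads the missing corner of the hypercube with NaN.
    combined = xr.combine_by_coords([x1, x2, x3])

    # With fill_value=None the incomplete hypercube is rejected instead:
    # xr.combine_by_coords([x1, x2, x3], fill_value=None)  # raises ValueError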

xarray/core/concat.py

Lines changed: 4 additions & 2 deletions
@@ -93,12 +93,14 @@ def concat(
         those of the first object with that dimension. Indexes for the same
         dimension must have the same size in all objects.
 
-    indexers, mode, concat_over : deprecated
-
     Returns
     -------
     concatenated : type of objs
 
+    Notes
+    -----
+    Each concatenated Variable preserves corresponding ``attrs`` from the first element of ``objs``.
+
     See also
     --------
     merge
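
A short sketch of the documented attrs behavior (attribute values chosen
arbitrarily):

    import xarray as xr

    a = xr.DataArray([[1], [2]], dims=["x", "y"], attrs={"units": "m"})
    b = xr.DataArray([[3], [4]], dims=["x", "y"], attrs={"units": "km"})

    # attrs now come from the first element of objs, per the new Notes section.
    result = xr.concat([a, b], dim="y")
    assert result.attrs == {"units": "m"}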

xarray/core/dask_array_compat.py

Lines changed: 81 additions & 2 deletions
@@ -1,8 +1,14 @@
 from distutils.version import LooseVersion
+from typing import Iterable
 
-import dask.array as da
 import numpy as np
-from dask import __version__ as dask_version
+
+try:
+    import dask.array as da
+    from dask import __version__ as dask_version
+except ImportError:
+    dask_version = "0.0.0"
+    da = None
 
 if LooseVersion(dask_version) >= LooseVersion("2.0.0"):
     meta_from_array = da.utils.meta_from_array
@@ -89,3 +95,76 @@ def meta_from_array(x, ndim=None, dtype=None):
         meta = meta.astype(dtype)
 
     return meta
+
+
+if LooseVersion(dask_version) >= LooseVersion("2.8.1"):
+    median = da.median
+else:
+    # Copied from dask v2.8.1
+    # Used under the terms of Dask's license, see licenses/DASK_LICENSE.
+    def median(a, axis=None, keepdims=False):
+        """
+        This works by automatically chunking the reduced axes to a single chunk
+        and then calling ``numpy.median`` function across the remaining dimensions
+        """
+
+        if axis is None:
+            raise NotImplementedError(
+                "The da.median function only works along an axis. "
+                "The full algorithm is difficult to do in parallel"
+            )
+
+        if not isinstance(axis, Iterable):
+            axis = (axis,)
+
+        axis = [ax + a.ndim if ax < 0 else ax for ax in axis]
+
+        a = a.rechunk({ax: -1 if ax in axis else "auto" for ax in range(a.ndim)})
+
+        result = a.map_blocks(
+            np.median,
+            axis=axis,
+            keepdims=keepdims,
+            drop_axis=axis if not keepdims else None,
+            chunks=[1 if ax in axis else c for ax, c in enumerate(a.chunks)]
+            if keepdims
+            else None,
+        )
+
+        return result
+
+
+if LooseVersion(dask_version) > LooseVersion("2.9.0"):
+    nanmedian = da.nanmedian
+else:
+
+    def nanmedian(a, axis=None, keepdims=False):
+        """
+        This works by automatically chunking the reduced axes to a single chunk
+        and then calling ``numpy.nanmedian`` function across the remaining dimensions
+        """
+
+        if axis is None:
+            raise NotImplementedError(
+                "The da.nanmedian function only works along an axis. "
+                "The full algorithm is difficult to do in parallel"
+            )
+
+        if not isinstance(axis, Iterable):
+            axis = (axis,)
+
+        axis = [ax + a.ndim if ax < 0 else ax for ax in axis]
+
+        a = a.rechunk({ax: -1 if ax in axis else "auto" for ax in range(a.ndim)})
+
+        result = a.map_blocks(
+            np.nanmedian,
+            axis=axis,
+            keepdims=keepdims,
+            drop_axis=axis if not keepdims else None,
+            chunks=[1 if ax in axis else c for ax, c in enumerate(a.chunks)]
+            if keepdims
+            else None,
+        )
+
+        return result
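
A minimal sketch of calling the backport directly (assumes dask is installed;
on dask >= 2.8.1 this resolves to ``da.median`` itself):

    import dask.array as da
    from xarray.core.dask_array_compat import median

    x = da.random.random((6, 8), chunks=(3, 4))
    # Axis 1 is rechunked to a single chunk, then np.median runs blockwise.
    result = median(x, axis=1)
    print(result.compute().shape)  # (6,)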

xarray/core/duck_array_ops.py

Lines changed: 4 additions & 4 deletions
@@ -11,7 +11,7 @@
 import numpy as np
 import pandas as pd
 
-from . import dask_array_ops, dtypes, npcompat, nputils
+from . import dask_array_ops, dask_array_compat, dtypes, npcompat, nputils
 from .nputils import nanfirst, nanlast
 from .pycompat import dask_array_type
 
@@ -284,7 +284,7 @@ def _ignore_warnings_if(condition):
     yield
 
 
-def _create_nan_agg_method(name, coerce_strings=False):
+def _create_nan_agg_method(name, dask_module=dask_array, coerce_strings=False):
     from . import nanops
 
     def f(values, axis=None, skipna=None, **kwargs):
@@ -301,7 +301,7 @@ def f(values, axis=None, skipna=None, **kwargs):
            nanname = "nan" + name
            func = getattr(nanops, nanname)
        else:
-           func = _dask_or_eager_func(name)
+           func = _dask_or_eager_func(name, dask_module=dask_module)
 
        try:
            return func(values, axis=axis, **kwargs)
@@ -337,7 +337,7 @@ def f(values, axis=None, skipna=None, **kwargs):
 std.numeric_only = True
 var = _create_nan_agg_method("var")
 var.numeric_only = True
-median = _create_nan_agg_method("median")
+median = _create_nan_agg_method("median", dask_module=dask_array_compat)
 median.numeric_only = True
 prod = _create_nan_agg_method("prod")
 prod.numeric_only = True
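
For readers unfamiliar with this dispatch, a hypothetical standalone sketch of
the pattern (not xarray's actual ``_dask_or_eager_func``, whose signature is
assumed here):

    import numpy as np

    def dask_or_eager_sketch(name, dask_module, eager_module=np):
        # Dispatch to dask_module for dask arrays, eager_module otherwise.
        def f(values, **kwargs):
            module = dask_module if hasattr(values, "dask") else eager_module
            return getattr(module, name)(values, **kwargs)
        return f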

xarray/core/nanops.py

Lines changed: 11 additions & 1 deletion
@@ -6,8 +6,10 @@
 
 try:
     import dask.array as dask_array
+    from . import dask_array_compat
 except ImportError:
     dask_array = None
+    dask_array_compat = None  # type: ignore
 
 
 def _replace_nan(a, val):
@@ -141,7 +143,15 @@ def nanmean(a, axis=None, dtype=None, out=None):
 
 
 def nanmedian(a, axis=None, out=None):
-    return _dask_or_eager_func("nanmedian", eager_module=nputils)(a, axis=axis)
+    # The dask algorithm works by rechunking to one chunk along axis
+    # Make sure we trigger the dask error when passing all dimensions
+    # so that we don't rechunk the entire array to one chunk and
+    # possibly blow memory
+    if axis is not None and len(np.atleast_1d(axis)) == a.ndim:
+        axis = None
+    return _dask_or_eager_func(
+        "nanmedian", dask_module=dask_array_compat, eager_module=nputils
+    )(a, axis=axis)
 
 
 def _nanvar_object(value, axis=None, ddof=0, keepdims=False, **kwargs):
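
The user-facing effect of the guard above, assuming dask is installed (the
array below is arbitrary):

    import numpy as np
    import xarray as xr

    v = xr.DataArray(np.random.rand(4, 6), dims=["x", "y"]).chunk(2)

    v.median(dim="x")           # fine: rechunks only along "x"
    # v.median()                # NotImplementedError: only works along an axis
    # v.median(dim=("x", "y"))  # same error, via the axis -> None guard above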

xarray/core/variable.py

Lines changed: 3 additions & 2 deletions
@@ -1629,8 +1629,9 @@ def concat(cls, variables, dim="concat_dim", positions=None, shortcut=False):
         if not shortcut:
             for var in variables:
                 if var.dims != first_var.dims:
-                    raise ValueError("inconsistent dimensions")
-                utils.remove_incompatible_items(attrs, var.attrs)
+                    raise ValueError(
+                        f"Variable has dimensions {list(var.dims)} but first Variable has dimensions {list(first_var.dims)}"
+                    )
 
         return cls(dims, data, attrs, encoding)
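
A sketch of the improved error message (dimension names arbitrary):

    import numpy as np
    import xarray as xr

    v = xr.Variable(["x"], np.arange(3))
    w = xr.Variable(["c"], np.arange(3))

    try:
        xr.Variable.concat([v, w], "b")
    except ValueError as err:
        # Variable has dimensions ['c'] but first Variable has dimensions ['x']
        print(err)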

xarray/tests/test_combine.py

Lines changed: 16 additions & 0 deletions
@@ -711,6 +711,22 @@ def test_check_for_impossible_ordering(self):
         ):
             combine_by_coords([ds1, ds0])
 
+    def test_combine_by_coords_incomplete_hypercube(self):
+        # test that this succeeds with default fill_value
+        x1 = Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [0], "x": [0]})
+        x2 = Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [1], "x": [0]})
+        x3 = Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [0], "x": [1]})
+        actual = combine_by_coords([x1, x2, x3])
+        expected = Dataset(
+            {"a": (("y", "x"), [[1, 1], [1, np.nan]])},
+            coords={"y": [0, 1], "x": [0, 1]},
+        )
+        assert_identical(expected, actual)
+
+        # test that this fails if fill_value is None
+        with pytest.raises(ValueError):
+            combine_by_coords([x1, x2, x3], fill_value=None)
+
 
 @pytest.mark.filterwarnings(
     "ignore:In xarray version 0.15 `auto_combine` " "will be deprecated"

xarray/tests/test_concat.py

Lines changed: 13 additions & 0 deletions
@@ -462,3 +462,16 @@ def test_concat_join_kwarg(self):
         for join in expected:
             actual = concat([ds1, ds2], join=join, dim="x")
             assert_equal(actual, expected[join].to_array())
+
+
+@pytest.mark.parametrize("attr1", ({"a": {"meta": [10, 20, 30]}}, {"a": [1, 2, 3]}, {}))
+@pytest.mark.parametrize("attr2", ({"a": [1, 2, 3]}, {}))
+def test_concat_attrs_first_variable(attr1, attr2):
+
+    arrs = [
+        DataArray([[1], [2]], dims=["x", "y"], attrs=attr1),
+        DataArray([[3], [4]], dims=["x", "y"], attrs=attr2),
+    ]
+
+    concat_attrs = concat(arrs, "y").attrs
+    assert concat_attrs == attr1

xarray/tests/test_dask.py

Lines changed: 3 additions & 1 deletion
@@ -216,8 +216,10 @@ def test_reduce(self):
         self.assertLazyAndAllClose(u.argmin(dim="x"), actual)
         self.assertLazyAndAllClose((u > 1).any(), (v > 1).any())
         self.assertLazyAndAllClose((u < 1).all("x"), (v < 1).all("x"))
-        with raises_regex(NotImplementedError, "dask"):
+        with raises_regex(NotImplementedError, "only works along an axis"):
             v.median()
+        with raises_regex(NotImplementedError, "only works along an axis"):
+            v.median(v.dims)
         with raise_if_dask_computes():
             v.reduce(duck_array_ops.mean)
xarray/tests/test_variable.py

Lines changed: 2 additions & 6 deletions
@@ -432,7 +432,7 @@ def test_concat(self):
         assert_identical(
             Variable(["b", "a"], np.array([x, y])), Variable.concat((v, w), "b")
         )
-        with raises_regex(ValueError, "inconsistent dimensions"):
+        with raises_regex(ValueError, "Variable has dimensions"):
            Variable.concat([v, Variable(["c"], y)], "b")
         # test indexers
         actual = Variable.concat(
@@ -451,16 +451,12 @@ def test_concat(self):
            Variable.concat([v[:, 0], v[:, 1:]], "x")
 
     def test_concat_attrs(self):
-        # different or conflicting attributes should be removed
+        # always keep attrs from first variable
         v = self.cls("a", np.arange(5), {"foo": "bar"})
         w = self.cls("a", np.ones(5))
         expected = self.cls(
             "a", np.concatenate([np.arange(5), np.ones(5)])
         ).to_base_variable()
-        assert_identical(expected, Variable.concat([v, w], "a"))
-        w.attrs["foo"] = 2
-        assert_identical(expected, Variable.concat([v, w], "a"))
-        w.attrs["foo"] = "bar"
         expected.attrs["foo"] = "bar"
         assert_identical(expected, Variable.concat([v, w], "a"))
