diff --git a/doc/environment.yml b/doc/environment.yml
index b28c9047ba7..6422550b165 100644
--- a/doc/environment.yml
+++ b/doc/environment.yml
@@ -8,7 +8,7 @@ dependencies:
   - pandas=0.20.1
   - numpydoc=0.6.0
   - matplotlib=2.0.0
-  - seaborn=0.7.1
+  - seaborn=0.8
  - dask=0.12.0
   - ipython=5.1.0
   - sphinx=1.5
diff --git a/doc/examples/weather-data.rst b/doc/examples/weather-data.rst
index 87303dbfff1..06076c6c3b0 100644
--- a/doc/examples/weather-data.rst
+++ b/doc/examples/weather-data.rst
@@ -68,7 +68,7 @@ Monthly averaging

 .. ipython:: python

-    monthly_avg = ds.resample('1MS', dim='time', how='mean')
+    monthly_avg = ds.resample(time='1MS').mean()

     @savefig examples_tmin_tmax_plot_mean.png
     monthly_avg.sel(location='IA').to_dataframe().plot(style='s-')
diff --git a/doc/time-series.rst b/doc/time-series.rst
index 0e344c5d4b5..bdf8b1e7f81 100644
--- a/doc/time-series.rst
+++ b/doc/time-series.rst
@@ -15,6 +15,7 @@ core functionality.
     import numpy as np
     import pandas as pd
     import xarray as xr
+    np.random.seed(123456)

 Creating datetime64 data
@@ -95,8 +96,8 @@ given ``DataArray`` can be quickly computed using a special ``.dt`` accessor.

 .. ipython:: python

-    time = time = pd.date_range('2000-01-01', freq='6H', periods=365 * 4)
-    ds = xr.Dataset({'foo': ('time', np.arange(365 * 24)), 'time': time})
+    time = pd.date_range('2000-01-01', freq='6H', periods=365 * 4)
+    ds = xr.Dataset({'foo': ('time', np.arange(365 * 4)), 'time': time})
     ds.time.dt.hour
     ds.time.dt.dayofweek
@@ -128,6 +129,8 @@ the first letters of the corresponding months.

 You can use these shortcuts with both Datasets and DataArray coordinates.

+.. _resampling:
+
 Resampling and grouped operations
 ---------------------------------
@@ -150,17 +153,38 @@ For example, we can downsample our dataset from hourly to 6-hourly:

 .. ipython:: python

-    ds.resample('6H', dim='time', how='mean')
+    ds.resample(time='6H')
+
+This will create a specialized ``Resample`` object which saves information
+necessary for resampling. All of the reduction methods which work with
+``Dataset`` or ``DataArray`` objects can also be used for resampling:
+
+.. ipython:: python

-Resample also works for upsampling, in which case intervals without any
-values are marked by ``NaN``:
+    ds.resample(time='6H').mean()
+
+You can also supply an arbitrary reduction function to aggregate over each
+resampling group:

 .. ipython:: python

-    ds.resample('30Min', 'time')
+    ds.resample(time='6H').reduce(np.mean)
+
+For upsampling, xarray provides ``asfreq``, ``ffill`` (alias ``pad``),
+``bfill`` (alias ``backfill``), ``nearest``, and ``interpolate``.
+``interpolate`` wraps ``scipy.interpolate.interp1d`` and supports all of its
+interpolation schemes. All of these resampling operations work on both
+Dataset and DataArray objects with an arbitrary number of dimensions.
+
+.. note::
+
+    The ``resample`` API was updated in version 0.10.0 to reflect similar
+    updates in the pandas ``resample`` API and to be more groupby-like.
+    Older-style calls to ``resample`` will still be supported for a short
+    period:
+
+    .. ipython:: python
+
+        ds.resample('6H', dim='time', how='mean')

-Of course, all of these resampling and groupby operation work on both Dataset
-and DataArray objects with any number of additional dimensions.

 For more examples of using grouped operations on a time dimension, see
 :ref:`toy weather data`.
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index c69719cf32d..03b10d1d73c 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -27,11 +27,42 @@ Breaking changes
   (:issue:`727`).
   By `Joe Hamman `_.
+- A new resampling interface to match pandas' group-by-like API was added to
+  :py:meth:`~xarray.Dataset.resample` and :py:meth:`~xarray.DataArray.resample`
+  (:issue:`1272`). :ref:`Timeseries resampling <resampling>` is fully
+  supported for data with arbitrary dimensions, for both downsampling and
+  upsampling (including linear, quadratic, cubic, and spline interpolation).
+
+  Old syntax:
+
+  .. ipython::
+    :verbatim:
+
+    In [1]: ds.resample('24H', dim='time', how='max')
+    Out[1]:
+    <xarray.Dataset>
+    [...]
+
+  New syntax:
+
+  .. ipython::
+    :verbatim:
+
+    In [1]: ds.resample(time='24H').max()
+    Out[1]:
+    <xarray.Dataset>
+    [...]
+
+  Note that both versions are currently supported, but using the old syntax
+  will produce a warning encouraging users to adopt the new syntax.
+  By `Daniel Rothenberg `_.
+
 - ``repr`` and the Jupyter Notebook won't automatically compute dask variables.
   Datasets loaded with ``open_dataset`` won't automatically read coords from
   disk when calling ``repr`` (:issue:`1522`).
   By `Guido Imperiale `_.
+
 Backward Incompatible Changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/xarray/core/common.py b/xarray/core/common.py
index f108fdf1922..ce84a65eebb 100644
--- a/xarray/core/common.py
+++ b/xarray/core/common.py
@@ -3,6 +3,7 @@ from __future__ import print_function

 import numpy as np
 import pandas as pd
+import warnings

 from .pycompat import basestring, suppress, dask_array_type, OrderedDict
 from . import dtypes
@@ -506,48 +507,23 @@ def rolling(self, min_periods=None, center=False, **windows):
         return self._rolling_cls(self, min_periods=min_periods,
                                  center=center, **windows)

-    def resample(self, freq, dim, how='mean', skipna=None, closed=None,
-                 label=None, base=0, keep_attrs=False):
-        """Resample this object to a new temporal resolution.
+    def resample(self, freq=None, dim=None, how=None, skipna=None,
+                 closed=None, label=None, base=0, keep_attrs=False,
+                 **indexer):
+        """Returns a Resample object for performing resampling operations.

-        Handles both downsampling and upsampling. Upsampling with filling is
-        not yet supported; if any intervals contain no values in the original
-        object, they will be given the value ``NaN``.
+        Handles both downsampling and upsampling. If any intervals contain no
+        values from the original object, they will be given the value ``NaN``.

         Parameters
         ----------
-        freq : str
-            String in the '#offset' to specify the step-size along the
-            resampled dimension, where '#' is an (optional) integer multipler
-            (default 1) and 'offset' is any pandas date offset alias. Examples
-            of valid offsets include:
-
-            * 'AS': year start
-            * 'QS-DEC': quarterly, starting on December 1
-            * 'MS': month start
-            * 'D': day
-            * 'H': hour
-            * 'Min': minute
-
-            The full list of these offset aliases is documented in pandas [1]_.
-        dim : str
-            Name of the dimension to resample along (e.g., 'time').
-        how : str or func, optional
-            Used for downsampling. If a string, ``how`` must be a valid
-            aggregation operation supported by xarray. Otherwise, ``how`` must be
-            a function that can be called like ``how(values, axis)`` to reduce
-            ndarray values along the given axis. Valid choices that can be
-            provided as a string include all the usual Dataset/DataArray
-            aggregations (``all``, ``any``, ``argmax``, ``argmin``, ``max``,
-            ``mean``, ``median``, ``min``, ``prod``, ``sum``, ``std`` and
-            ``var``), as well as ``first`` and ``last``.
         skipna : bool, optional
             Whether to skip missing values when aggregating in downsampling.
         closed : 'left' or 'right', optional
             Side of each interval to treat as closed.
         label : 'left' or 'right', optional
             Side of each interval to use for labeling.
-        base : int, optionalt
+        base : int, optional
             For frequencies that evenly subdivide 1 day, the "origin" of the
             aggregated intervals. For example, for '24H' frequency, base could
             range from 0 through 23.
@@ -555,6 +531,9 @@ def resample(self, freq, dim, how='mean', skipna=None, closed=None,
             If True, the object's attributes (`attrs`) will be copied from
             the original object to the new one.  If False (default), the new
             object will be returned without attributes.
+        **indexer : {dim: freq}
+            Dictionary with a key indicating the dimension name to resample
+            over and a value corresponding to the resampling frequency.

         Returns
         -------
@@ -567,11 +546,58 @@ def resample(self, freq, dim, how='mean', skipna=None, closed=None,
         .. [1] http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
         """
         from .dataarray import DataArray
+        from .resample import RESAMPLE_DIM
+
+        if dim is not None:
+            if how is None:
+                how = 'mean'
+            return self._resample_immediately(freq, dim, how, skipna, closed,
+                                              label, base, keep_attrs)
+
+        if (how is not None) and indexer:
+            raise TypeError("If passing an 'indexer' then 'dim' "
+                            "and 'how' should not be used")
+
+        # More than one indexer is ambiguous, but we do in fact need one if
+        # "dim" was not provided, until the old API is fully deprecated
+        if len(indexer) != 1:
+            raise ValueError(
+                "Resampling is only supported along a single dimension."
+            )
+        dim, freq = indexer.popitem()
+        if isinstance(dim, basestring):
+            dim_name = dim
+            dim = self[dim]
+        else:
+            raise TypeError("Dimension name should be a string; "
+                            "was passed %r" % dim)
+        group = DataArray(dim, [(dim.dims, dim)], name=RESAMPLE_DIM)
+        time_grouper = pd.TimeGrouper(freq=freq, closed=closed,
+                                      label=label, base=base)
+        resampler = self._resample_cls(self, group=group, dim=dim_name,
+                                       grouper=time_grouper,
+                                       resample_dim=RESAMPLE_DIM)
+
+        return resampler
+
+    def _resample_immediately(self, freq, dim, how, skipna,
+                              closed, label, base, keep_attrs):
+        """Implement the original version of .resample() which immediately
+        executes the desired resampling operation. """
+        from .dataarray import DataArray
         RESAMPLE_DIM = '__resample_dim__'
+
+        warnings.warn("\n.resample() has been modified to defer "
+                      "calculations. Instead of passing 'dim' and "
+                      "how=\"{how}\", consider using "
+                      ".resample({dim}=\"{freq}\").{how}() ".format(
+                          dim=dim, freq=freq, how=how
+                      ), DeprecationWarning, stacklevel=3)
+
         if isinstance(dim, basestring):
             dim = self[dim]
-        group = DataArray(dim, [(RESAMPLE_DIM, dim)], name=RESAMPLE_DIM)
+        group = DataArray(dim, [(dim.dims, dim)], name=RESAMPLE_DIM)
         time_grouper = pd.TimeGrouper(freq=freq, how=how, closed=closed,
                                       label=label, base=base)
         gb = self._groupby_cls(self, group, grouper=time_grouper)
@@ -579,6 +605,8 @@ def resample(self, freq, dim, how='mean', skipna=None, closed=None,
             f = getattr(gb, how)
             if how in ['first', 'last']:
                 result = f(skipna=skipna, keep_attrs=keep_attrs)
+            elif how == 'count':
+                result = f(dim=dim.name, keep_attrs=keep_attrs)
             else:
                 result = f(dim=dim.name, skipna=skipna, keep_attrs=keep_attrs)
         else:
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index 64b0e87710a..17fc9ebd299 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -12,6 +12,7 @@
 from . import duck_array_ops
 from . import indexing
 from . import groupby
+from . import resample
 from . import rolling
 from . import ops
 from . import utils
@@ -34,7 +35,7 @@ def _infer_coords_and_dims(shape, coords, dims):
     """All the logic for creating a new DataArray"""

     if (coords is not None and not utils.is_dict_like(coords) and
-        len(coords) != len(shape)):
+            len(coords) != len(shape)):
         raise ValueError('coords is not dict-like, but it has %s items, '
                          'which does not match the %s dimensions of the '
                          'data' % (len(coords), len(shape)))
@@ -115,6 +116,7 @@ class _ThisArray(object):
     """An instance of this object is used as the key corresponding to the
     variable when converting arbitrary DataArray objects to datasets
     """
+
     def __repr__(self):
         return '<this-array>'
@@ -159,6 +161,8 @@ class DataArray(AbstractArray, BaseDataObject):
     """
     _groupby_cls = groupby.DataArrayGroupBy
     _rolling_cls = rolling.DataArrayRolling
+    _resample_cls = resample.DataArrayResample
+
     dt = property(DatetimeAccessor)

     def __init__(self, data, coords=None, dims=None, name=None,
@@ -1490,8 +1494,10 @@ def from_cdms2(cls, variable):

     def _all_compat(self, other, compat_str):
         """Helper function for equals and identical"""
+
         def compat(x, y):
             return getattr(x.variable, compat_str)(y.variable)
+
         return (utils.dict_equiv(self.coords, other.coords, compat=compat)
                 and compat(self, other))
@@ -1565,6 +1571,7 @@ def _unary_op(f):
         @functools.wraps(f)
         def func(self, *args, **kwargs):
             return self.__array_wrap__(f(self.variable.data, *args, **kwargs))
+
         return func

     @staticmethod
@@ -1574,7 +1581,8 @@ def func(self, other):
             if isinstance(other, (Dataset, groupby.GroupBy)):
                 return NotImplemented
             if hasattr(other, 'indexes'):
-                align_type = OPTIONS['arithmetic_join'] if join is None else join
+                align_type = (OPTIONS['arithmetic_join']
+                              if join is None else join)
                 self, other = align(self, other, join=align_type, copy=False)
             other_variable = getattr(other, 'variable', other)
             other_coords = getattr(other, 'coords', None)
@@ -1586,6 +1594,7 @@ def func(self, other):

             name = self._result_name(other)
             return self._replace(variable, coords, name)
+
         return func

     @staticmethod
@@ -1604,6 +1613,7 @@ def func(self, other):
             with self.coords._merge_inplace(other_coords):
                 f(self.variable, other_variable)
             return self
+
         return func

     def _copy_attrs_from(self, other):
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index f2c176c9d3e..5d975ffd281 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -14,6 +14,7 @@
 from . import ops
 from . import utils
 from . import groupby
+from . import resample
 from . import rolling
 from . import indexing
 from . import alignment
@@ -305,6 +306,7 @@ class Dataset(Mapping, ImplementsDatasetReduce, BaseDataObject,
     """
     _groupby_cls = groupby.DatasetGroupBy
     _rolling_cls = rolling.DatasetRolling
+    _resample_cls = resample.DatasetResample

     def __init__(self, data_vars=None, coords=None, attrs=None,
                  compat='broadcast_equals'):
diff --git a/xarray/core/resample.py b/xarray/core/resample.py
new file mode 100644
index 00000000000..0a10a69fa69
--- /dev/null
+++ b/xarray/core/resample.py
@@ -0,0 +1,353 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from . import ops
+from .groupby import DataArrayGroupBy, DatasetGroupBy
+from .pycompat import dask_array_type, OrderedDict
+
+RESAMPLE_DIM = '__resample_dim__'
+
+
+class Resample(object):
+    """An object that extends the `GroupBy` object with additional logic
+    for handling specialized re-sampling operations.
+
+    You should create a `Resample` object by using the `DataArray.resample` or
+    `Dataset.resample` methods.
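+    The dimension along which the resampling occurs is specified by the
+    keyword argument to ``.resample()`` (for example, ``time='24H'``).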
+
+    See Also
+    --------
+    DataArray.resample
+    Dataset.resample
+
+    """
+
+    def _upsample(self, method, *args, **kwargs):
+        """Dispatch function to call appropriate up-sampling methods on
+        data.
+
+        This method should not be called directly; instead, use one of the
+        wrapper functions supplied by `Resample`.
+
+        Parameters
+        ----------
+        method : str {'asfreq', 'pad', 'ffill', 'backfill', 'bfill', 'nearest',
+                 'interpolate'}
+            Method to use for up-sampling
+
+        See Also
+        --------
+        Resample.asfreq
+        Resample.pad
+        Resample.backfill
+        Resample.interpolate
+
+        """
+
+        upsampled_index = self._full_index
+
+        # Drop non-dimension coordinates along the resampled dimension
+        for k, v in self._obj.coords.items():
+            if k == self._dim:
+                continue
+            if self._dim in v.dims:
+                self._obj = self._obj.drop(k)
+
+        if method == 'asfreq':
+            return self.mean(self._dim)
+
+        elif method in ['pad', 'ffill', 'backfill', 'bfill', 'nearest']:
+            kwargs = kwargs.copy()
+            kwargs.update(**{self._dim: upsampled_index})
+            return self._obj.reindex(method=method, *args, **kwargs)
+
+        elif method == 'interpolate':
+            return self._interpolate(*args, **kwargs)
+
+        else:
+            raise ValueError('Specified method was "{}" but must be one of '
+                             '"asfreq", "pad", "ffill", "backfill", "bfill", '
+                             '"nearest", or "interpolate"'.format(method))
+
+    def asfreq(self):
+        """Return values of original object at the new up-sampling frequency;
+        essentially a re-index with new times set to NaN.
+        """
+        return self._upsample('asfreq')
+
+    def pad(self):
+        """Forward fill new values at up-sampled frequency.
+        """
+        return self._upsample('pad')
+    ffill = pad
+
+    def backfill(self):
+        """Backward fill new values at up-sampled frequency.
+        """
+        return self._upsample('backfill')
+    bfill = backfill
+
+    def nearest(self):
+        """Take new values from nearest original coordinate to up-sampled
+        frequency coordinates.
+        """
+        return self._upsample('nearest')
+
+    def interpolate(self, kind='linear'):
+        """Interpolate up-sampled data using the original data
+        as knots.
+
+        Parameters
+        ----------
+        kind : str {'linear', 'nearest', 'zero', 'slinear',
+               'quadratic', 'cubic'}
+            Interpolation scheme to use
+
+        See Also
+        --------
+        scipy.interpolate.interp1d
+
+        """
+        return self._interpolate(kind=kind)
+
+    def _interpolate(self, kind='linear'):
+        raise NotImplementedError
+
+
+class DataArrayResample(DataArrayGroupBy, Resample):
+    """DataArrayGroupBy object specialized to time resampling operations over a
+    specified dimension
+    """
+
+    def __init__(self, *args, **kwargs):
+
+        self._dim = kwargs.pop('dim', None)
+        self._resample_dim = kwargs.pop('resample_dim', None)
+
+        if self._dim == self._resample_dim:
+            raise ValueError("Proxy resampling dimension ('{}') "
+                             "cannot have the same name as actual dimension "
+                             "('{}')! ".format(self._resample_dim, self._dim))
+        super(DataArrayResample, self).__init__(*args, **kwargs)
+
+    def apply(self, func, shortcut=False, **kwargs):
+        """Apply a function over each array in the group and concatenate them
+        together into a new array.
+
+        `func` is called like `func(ar, *args, **kwargs)` for each array `ar`
+        in this group.
+
+        Apply uses heuristics (like `pandas.GroupBy.apply`) to figure out how
+        to stack together the array. The rule is:
+
+        1. If the dimension along which the group coordinate is defined is
+           still in the first grouped array after applying `func`, then stack
+           over this dimension.
+        2. Otherwise, stack over the new dimension given by name of this
+           grouping (the argument to the `groupby` function).
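+
+        (For resampling, the new stacked dimension is the temporary
+        ``RESAMPLE_DIM`` proxy, which is renamed back to the original
+        dimension before the combined result is returned.)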
+
+        Parameters
+        ----------
+        func : function
+            Callable to apply to each array.
+        shortcut : bool, optional
+            Whether or not to shortcut evaluation under the assumptions that:
+
+            (1) The action of `func` does not depend on any of the array
+                metadata (attributes or coordinates) but only on the data and
+                dimensions.
+            (2) The action of `func` creates arrays with homogeneous metadata,
+                that is, with the same dimensions and attributes.
+
+            If these conditions are satisfied `shortcut` provides significant
+            speedup. This should be the case for many common groupby operations
+            (e.g., applying numpy ufuncs).
+        **kwargs
+            Used to call `func(ar, **kwargs)` for each array `ar`.
+
+        Returns
+        -------
+        applied : DataArray or Dataset
+            The result of splitting, applying and combining this array.
+        """
+        combined = super(DataArrayResample, self).apply(
+            func, shortcut=shortcut, **kwargs)
+
+        # If the aggregation function didn't drop the original resampling
+        # dimension, then we need to do so before we can rename the proxy
+        # dimension we used.
+        if self._dim in combined:
+            combined = combined.drop(self._dim)
+
+        if self._resample_dim in combined.dims:
+            combined = combined.rename({self._resample_dim: self._dim})
+
+        return combined
+
+    def _interpolate(self, kind='linear'):
+        """Apply scipy.interpolate.interp1d along resampling dimension."""
+        from .dataarray import DataArray
+        from scipy.interpolate import interp1d
+
+        if isinstance(self._obj.data, dask_array_type):
+            raise TypeError(
+                "Up-sampling via interpolation was attempted on the "
+                "variable '{}', but it is a dask array; dask arrays are not "
+                "yet supported in resample.interpolate(). Load into "
+                "memory with Dataset.load() before resampling."
+                .format(self._obj.data.name)
+            )
+
+        x = self._obj[self._dim].astype('float')
+        y = self._obj.data
+
+        axis = self._obj.get_axis_num(self._dim)
+
+        f = interp1d(x, y, kind=kind, axis=axis, bounds_error=True,
+                     assume_sorted=True)
+        new_x = self._full_index.values.astype('float')
+
+        # construct new up-sampled DataArray
+        dummy = self._obj.copy()
+        dims = dummy.dims
+
+        # drop any existing non-dimension coordinates along the resampling
+        # dimension
+        coords = OrderedDict()
+        for k, v in dummy.coords.items():
+            # k is the resampling dimension
+            if k == self._dim:
+                coords[self._dim] = self._full_index
+            # else, check if resampling dim is in coordinate dimensions
+            elif self._dim not in v.dims:
+                coords[k] = v
+        return DataArray(f(new_x), coords, dims, name=dummy.name,
+                         attrs=dummy.attrs)
+
+
+ops.inject_reduce_methods(DataArrayResample)
+ops.inject_binary_ops(DataArrayResample)
+
+
+class DatasetResample(DatasetGroupBy, Resample):
+    """DatasetGroupBy object specialized to resampling a specified dimension
+    """
+
+    def __init__(self, *args, **kwargs):
+
+        self._dim = kwargs.pop('dim', None)
+        self._resample_dim = kwargs.pop('resample_dim', None)
+
+        if self._dim == self._resample_dim:
+            raise ValueError("Proxy resampling dimension ('{}') "
+                             "cannot have the same name as actual dimension "
+                             "('{}')! ".format(self._resample_dim, self._dim))
+        super(DatasetResample, self).__init__(*args, **kwargs)
+
+    def apply(self, func, **kwargs):
+        """Apply a function over each Dataset in the groups generated for
+        resampling and concatenate them together into a new Dataset.
+
+        `func` is called like `func(ds, *args, **kwargs)` for each dataset `ds`
+        in this group.
+
+        Apply uses heuristics (like `pandas.GroupBy.apply`) to figure out how
+        to stack together the datasets. The rule is:
+
+        1. If the dimension along which the group coordinate is defined is
+           still in the first grouped item after applying `func`, then stack
+           over this dimension.
+        2. Otherwise, stack over the new dimension given by name of this
+           grouping (the argument to the `groupby` function).
+
+        Parameters
+        ----------
+        func : function
+            Callable to apply to each sub-dataset.
+        **kwargs
+            Used to call `func(ds, **kwargs)` for each sub-dataset `ds`.
+
+        Returns
+        -------
+        applied : Dataset or DataArray
+            The result of splitting, applying and combining this dataset.
+        """
+        kwargs.pop('shortcut', None)  # ignore shortcut if set (for now)
+        applied = (func(ds, **kwargs) for ds in self._iter_grouped())
+        combined = self._combine(applied)
+
+        return combined.rename({self._resample_dim: self._dim})
+
+    def reduce(self, func, dim=None, keep_attrs=False, **kwargs):
+        """Reduce the items in this group by applying `func` along the
+        pre-defined resampling dimension.
+
+        Note that `dim` defaults to the resampling dimension here and is
+        ignored if passed by the user; this ensures compatibility with the
+        existing reduce interface.
+
+        Parameters
+        ----------
+        func : function
+            Function which can be called in the form
+            `func(x, axis=axis, **kwargs)` to return the result of collapsing
+            an np.ndarray over an integer valued axis.
+        keep_attrs : bool, optional
+            If True, the dataset's attributes (`attrs`) will be copied from
+            the original object to the new one.  If False (default), the new
+            object will be returned without attributes.
+        **kwargs : dict
+            Additional keyword arguments passed on to `func`.
+
+        Returns
+        -------
+        reduced : Array
+            Array with summarized data and the indicated dimension(s)
+            removed.
+        """
+        return super(DatasetResample, self).reduce(
+            func, self._dim, keep_attrs, **kwargs)
+
+    def _interpolate(self, kind='linear'):
+        """Apply scipy.interpolate.interp1d along resampling dimension."""
+        from .dataset import Dataset
+        from .variable import Variable
+        from scipy.interpolate import interp1d
+
+        old_times = self._obj[self._dim].astype(float)
+        new_times = self._full_index.values.astype(float)
+
+        data_vars = OrderedDict()
+        coords = OrderedDict()
+
+        # Apply the interpolation to each DataArray in our original Dataset
+        for name, variable in self._obj.variables.items():
+            if name in self._obj.coords:
+                if name == self._dim:
+                    coords[self._dim] = self._full_index
+                elif self._dim not in variable.dims:
+                    coords[name] = variable
+            else:
+                if isinstance(variable.data, dask_array_type):
+                    raise TypeError(
+                        "Up-sampling via interpolation was attempted on the "
+                        "variable '{}', but it is a dask array; dask arrays "
+                        "are not yet supported in resample.interpolate(). Load "
+                        "into memory with Dataset.load() before resampling."
+                        .format(name)
+                    )
+
+                axis = variable.get_axis_num(self._dim)
+
+                # We've previously checked for monotonicity along the
+                # re-sampling dimension (in __init__ via the GroupBy
+                # constructor), so we can avoid sorting the data again by
+                # passing 'assume_sorted=True'
+                f = interp1d(old_times, variable.data, kind=kind,
+                             axis=axis, bounds_error=True,
+                             assume_sorted=True)
+                interpolated = Variable(variable.dims, f(new_times))
+
+                data_vars[name] = interpolated
+
+        return Dataset(data_vars, coords)
+
+
+ops.inject_reduce_methods(DatasetResample)
+ops.inject_binary_ops(DatasetResample)
diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
index c42560c1182..e4ab2303fe1 100644
--- a/xarray/tests/test_dataarray.py
+++ b/xarray/tests/test_dataarray.py
@@ -19,7 +19,7 @@
 from xarray.tests import (
     TestCase, ReturnItem, source_ndarray, unittest, requires_dask,
     assert_identical, assert_equal, assert_allclose, assert_array_equal,
-    raises_regex)
+    raises_regex, requires_scipy)


 class TestDataArray(TestCase):
@@ -1836,100 +1836,269 @@ def test_resample(self):
         times = pd.date_range('2000-01-01', freq='6H', periods=10)
         array = DataArray(np.arange(10), [('time', times)])

-        actual = array.resample('6H', dim='time')
-        self.assertDataArrayIdentical(array, actual)
-
-        actual = array.resample('24H', dim='time')
-        expected = DataArray(array.to_series().resample('24H', how='mean'))
+        actual = array.resample(time='24H').mean()
+        expected = DataArray(array.to_series().resample('24H').mean())
         self.assertDataArrayIdentical(expected, actual)

-        actual = array.resample('24H', dim='time', how=np.mean)
+        actual = array.resample(time='24H').reduce(np.mean)
         self.assertDataArrayIdentical(expected, actual)

         with self.assertRaisesRegexp(ValueError, 'index must be monotonic'):
-            array[[2, 0, 1]].resample('1D', dim='time')
+            array[[2, 0, 1]].resample(time='1D')

     def test_resample_first(self):
         times = pd.date_range('2000-01-01', freq='6H', periods=10)
         array = DataArray(np.arange(10), [('time', times)])

-        actual = array.resample('1D', dim='time', how='first')
+        actual = array.resample(time='1D').first()
         expected = DataArray([0, 4, 8], [('time', times[::4])])
         self.assertDataArrayIdentical(expected, actual)

         # verify that labels don't use the first value
-        actual = array.resample('24H', dim='time', how='first')
-        expected = DataArray(array.to_series().resample('24H', how='first'))
+        actual = array.resample(time='24H').first()
+        expected = DataArray(array.to_series().resample('24H').first())
         self.assertDataArrayIdentical(expected, actual)

         # missing values
         array = array.astype(float)
         array[:2] = np.nan
-        actual = array.resample('1D', dim='time', how='first')
+        actual = array.resample(time='1D').first()
         expected = DataArray([2, 4, 8], [('time', times[::4])])
         self.assertDataArrayIdentical(expected, actual)

-        actual = array.resample('1D', dim='time', how='first', skipna=False)
+        actual = array.resample(time='1D').first(skipna=False)
         expected = DataArray([np.nan, 4, 8], [('time', times[::4])])
         self.assertDataArrayIdentical(expected, actual)

         # regression test for http://stackoverflow.com/questions/33158558/
         array = Dataset({'time': times})['time']
-        actual = array.resample('1D', dim='time', how='last')
+        actual = array.resample(time='1D').last()
         expected_times = pd.to_datetime(['2000-01-01T18', '2000-01-02T18',
                                          '2000-01-03T06'])
         expected = DataArray(expected_times, [('time', times[::4])],
                              name='time')
         self.assertDataArrayIdentical(expected, actual)

-    def test_resample_first_keep_attrs(self):
+    def test_resample_bad_resample_dim(self):
         times = pd.date_range('2000-01-01', freq='6H', periods=10)
-        array = DataArray(np.arange(10), [('time', times)])
-        array.attrs['meta'] = 'data'
+        array = DataArray(np.arange(10), [('__resample_dim__', times)])
+        with self.assertRaisesRegexp(ValueError, 'Proxy resampling dimension'):
+            array.resample(**{'__resample_dim__': '1D'}).first()
+
+    @requires_scipy
+    def test_resample_drop_nondim_coords(self):
+        xs = np.arange(6)
+        ys = np.arange(3)
+        times = pd.date_range('2000-01-01', freq='6H', periods=5)
+        data = np.tile(np.arange(5), (6, 3, 1))
+        xx, yy = np.meshgrid(xs*5, ys*2.5)
+        tt = np.arange(len(times), dtype=int)
+        array = DataArray(data,
+                          {'time': times, 'x': xs, 'y': ys},
+                          ('x', 'y', 'time'))
+        xcoord = DataArray(xx.T, {'x': xs, 'y': ys}, ('x', 'y'))
+        ycoord = DataArray(yy.T, {'x': xs, 'y': ys}, ('x', 'y'))
+        tcoord = DataArray(tt, {'time': times}, ('time', ))
+        ds = Dataset({'data': array, 'xc': xcoord,
+                      'yc': ycoord, 'tc': tcoord})
+        ds = ds.set_coords(['xc', 'yc', 'tc'])
+
+        # Select the data now, with the auxiliary coordinates in place
+        array = ds['data']
+
+        # Re-sample
+        actual = array.resample(time="12H").mean('time')
+        assert 'tc' not in actual.coords
+
+        # Up-sample - filling
+        actual = array.resample(time="1H").ffill()
+        assert 'tc' not in actual.coords
+
+        # Up-sample - interpolation
+        actual = array.resample(time="1H").interpolate('linear')
+        assert 'tc' not in actual.coords
+
+    def test_resample_old_vs_new_api(self):
+        times = pd.date_range('2000-01-01', freq='6H', periods=10)
+        array = DataArray(np.ones(10), [('time', times)])

-        resampled_array = array.resample('1D', dim='time', how='first',
-                                         keep_attrs=True)
-        actual = resampled_array.attrs
-        expected = array.attrs
-        self.assertEqual(expected, actual)
+        # Simple mean
+        with pytest.warns(DeprecationWarning):
+            old_mean = array.resample('1D', 'time', how='mean')
+        new_mean = array.resample(time='1D').mean()
+        self.assertDataArrayIdentical(old_mean, new_mean)
+
+        # Mean, while keeping attributes
+        attr_array = array.copy()
+        attr_array.attrs['meta'] = 'data'
+
+        with pytest.warns(DeprecationWarning):
+            old_mean = attr_array.resample('1D', dim='time', how='mean',
+                                           keep_attrs=True)
+        new_mean = attr_array.resample(time='1D').mean(keep_attrs=True)
+        self.assertEqual(old_mean.attrs, new_mean.attrs)
+        self.assertDatasetIdentical(old_mean, new_mean)
+
+        # Mean, with NaN to skip
+        nan_array = array.copy()
+        nan_array[1] = np.nan
+
+        with pytest.warns(DeprecationWarning):
+            old_mean = nan_array.resample('1D', 'time', how='mean',
+                                          skipna=False)
+        new_mean = nan_array.resample(time='1D').mean(skipna=False)
+        expected = DataArray([np.nan, 1, 1], [('time', times[::4])])
+        self.assertDataArrayIdentical(old_mean, expected)
+        self.assertDataArrayIdentical(new_mean, expected)
+
+        # Try other common resampling methods
+        resampler = array.resample(time='1D')
+        for method in ['mean', 'median', 'sum', 'first', 'last', 'count']:
+            # Discard attributes on the call using the new api to match
+            # convention from old api
+            new_api = getattr(resampler, method)(keep_attrs=False)
+            with pytest.warns(DeprecationWarning):
+                old_api = array.resample('1D', dim='time', how=method)
+            self.assertDatasetIdentical(new_api, old_api)
+        for method in [np.mean, np.sum, np.max, np.min]:
+            new_api = resampler.reduce(method)
+            with pytest.warns(DeprecationWarning):
+                old_api = array.resample('1D', dim='time', how=method)
+            self.assertDatasetIdentical(new_api, old_api)
+
+    def test_upsample(self):
+        times = pd.date_range('2000-01-01', freq='6H', periods=5)
+        array = DataArray(np.arange(5), [('time', times)])

-        resampled_array = array.resample('1D', dim='time', how='first',
-                                         keep_attrs=False)
-        assert resampled_array.attrs == {}
+        # Forward-fill
+        actual = array.resample(time='3H').ffill()
+        expected = DataArray(array.to_series().resample('3H').ffill())
+        self.assertDataArrayIdentical(expected, actual)

-    def test_resample_mean_keep_attrs(self):
-        times = pd.date_range('2000-01-01', freq='6H', periods=10)
-        array = DataArray(np.arange(10), [('time', times)])
-        array.attrs['meta'] = 'data'
+        # Backward-fill
+        actual = array.resample(time='3H').bfill()
+        expected = DataArray(array.to_series().resample('3H').bfill())
+        self.assertDataArrayIdentical(expected, actual)

-        resampled_array = array.resample('1D', dim='time', how='mean',
-                                         keep_attrs=True)
-        actual = resampled_array.attrs
-        expected = array.attrs
-        self.assertEqual(expected, actual)
+        # As frequency
+        actual = array.resample(time='3H').asfreq()
+        expected = DataArray(array.to_series().resample('3H').asfreq())
+        self.assertDataArrayIdentical(expected, actual)

-        resampled_array = array.resample('1D', dim='time', how='mean',
-                                         keep_attrs=False)
-        assert resampled_array.attrs == {}
+        # Pad
+        actual = array.resample(time='3H').pad()
+        expected = DataArray(array.to_series().resample('3H').pad())
+        self.assertDataArrayIdentical(expected, actual)

-    def test_resample_skipna(self):
-        times = pd.date_range('2000-01-01', freq='6H', periods=10)
-        array = DataArray(np.ones(10), [('time', times)])
-        array[1] = np.nan
+        # Nearest
+        rs = array.resample(time='3H')
+        actual = rs.nearest()
+        new_times = rs._full_index
+        expected = DataArray(
+            array.reindex(time=new_times, method='nearest')
+        )
+        self.assertDataArrayIdentical(expected, actual)
+
+    def test_upsample_nd(self):
+        # Same as before, but now we try on multi-dimensional DataArrays.
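+        # (The data below has shape (6, 3, 5); its time axis holds the
+        # values [0, 1, 2, 3, 4] at 6-hourly steps, so each fill method has
+        # an easily computed expected result at the 3-hourly frequency.)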
+        xs = np.arange(6)
+        ys = np.arange(3)
+        times = pd.date_range('2000-01-01', freq='6H', periods=5)
+        data = np.tile(np.arange(5), (6, 3, 1))
+        array = DataArray(data,
+                          {'time': times, 'x': xs, 'y': ys},
+                          ('x', 'y', 'time'))
+
+        # Forward-fill
+        actual = array.resample(time='3H').ffill()
+        expected_data = np.repeat(data, 2, axis=-1)
+        expected_times = times.to_series().resample('3H').asfreq().index
+        expected_data = expected_data[..., :len(expected_times)]
+        expected = DataArray(expected_data,
+                             {'time': expected_times, 'x': xs, 'y': ys},
+                             ('x', 'y', 'time'))
+        self.assertDataArrayIdentical(expected, actual)
+
+        # Backward-fill
+        actual = array.resample(time='3H').bfill()
+        expected_data = np.repeat(np.flipud(data.T).T, 2, axis=-1)
+        expected_times = times.to_series().resample('3H').asfreq().index
+        expected_data = expected_data[..., :len(expected_times)]
+        expected_data = np.flipud(expected_data.T).T
+        expected = DataArray(expected_data,
+                             {'time': expected_times, 'x': xs, 'y': ys},
+                             ('x', 'y', 'time'))
+        self.assertDataArrayIdentical(expected, actual)
+
+        # As frequency
+        actual = array.resample(time='3H').asfreq()
+        expected_data = np.repeat(data, 2, axis=-1).astype(float)[..., :-1]
+        expected_data[..., 1::2] = np.nan
+        expected_times = times.to_series().resample('3H').asfreq().index
+        expected = DataArray(expected_data,
+                             {'time': expected_times, 'x': xs, 'y': ys},
+                             ('x', 'y', 'time'))
+        self.assertDataArrayIdentical(expected, actual)
+
+        # Pad
+        actual = array.resample(time='3H').pad()
+        expected_data = np.repeat(data, 2, axis=-1)
+        expected_data[..., 1::2] = expected_data[..., ::2]
+        expected_data = expected_data[..., :-1]
+        expected_times = times.to_series().resample('3H').asfreq().index
+        expected = DataArray(expected_data,
+                             {'time': expected_times, 'x': xs, 'y': ys},
+                             ('x', 'y', 'time'))
+        self.assertDataArrayIdentical(expected, actual)
+
+    @requires_scipy
+    def test_upsample_interpolate(self):
+        from scipy.interpolate import interp1d
+        xs = np.arange(6)
+        ys = np.arange(3)
+        times = pd.date_range('2000-01-01', freq='6H', periods=5)
+
+        z = np.arange(5)**2
+        data = np.tile(z, (6, 3, 1))
+        array = DataArray(data,
+                          {'time': times, 'x': xs, 'y': ys},
+                          ('x', 'y', 'time'))
+
+        expected_times = times.to_series().resample('1H').asfreq().index
+        # Split the times into equal sub-intervals to simulate the 6 hour
+        # to 1 hour up-sampling
+        new_times_idx = np.linspace(0, len(times)-1, len(times)*5)
+        for kind in ['linear', 'nearest', 'zero', 'slinear', 'quadratic',
+                     'cubic']:
+            actual = array.resample(time='1H').interpolate(kind)
+            f = interp1d(np.arange(len(times)), data, kind=kind, axis=-1,
+                         bounds_error=True, assume_sorted=True)
+            expected_data = f(new_times_idx)
+            expected = DataArray(expected_data,
+                                 {'time': expected_times, 'x': xs, 'y': ys},
+                                 ('x', 'y', 'time'))
+            # Use AllClose because there are some small differences in how
+            # we upsample timeseries versus the integer indexing as I've
+            # done here due to floating point arithmetic
+            self.assertDataArrayAllClose(expected, actual, rtol=1e-16)

-        actual = array.resample('1D', dim='time', skipna=False)
-        expected = DataArray([np.nan, 1, 1], [('time', times[::4])])
-        self.assertDataArrayIdentical(expected, actual)
+    @requires_dask
+    def test_upsample_interpolate_dask(self):
+        import dask.array as da

-    def test_resample_upsampling(self):
-        times = pd.date_range('2000-01-01', freq='1D', periods=5)
-        array = DataArray(np.arange(5), [('time', times)])
+        times = pd.date_range('2000-01-01', freq='6H', periods=5)
+        xs = np.arange(6)
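+        # Coordinates for an (x, y, time) array of shape (6, 3, 5); the data
+        # are chunked along x and time, so interpolate() should refuse to
+        # operate and raise the TypeError asserted below.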
+        ys = np.arange(3)

-        expected_time = pd.date_range('2000-01-01', freq='12H', periods=9)
-        expected = array.reindex(time=expected_time)
-        for how in ['mean', 'median', 'sum', 'first', 'last', np.mean]:
-            actual = array.resample('12H', 'time', how=how)
-            self.assertDataArrayIdentical(expected, actual)
+        z = np.arange(5)**2
+        data = da.from_array(np.tile(z, (6, 3, 1)), (1, 3, 1))
+        array = DataArray(data,
+                          {'time': times, 'x': xs, 'y': ys},
+                          ('x', 'y', 'time'))
+
+        with self.assertRaisesRegexp(TypeError,
+                                     "dask arrays are not yet supported"):
+            array.resample(time='1H').interpolate('linear')

     def test_align(self):
         array = DataArray(np.random.random((6, 8)),
@@ -2638,7 +2807,7 @@ def da_dask(seed=123):
     da['time'] = times
     return da

-    
+
 def test_rolling_iter(da):

     rolling_obj = da.rolling(time=7)
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
index fd59e8ca184..85b18e77975 100644
--- a/xarray/tests/test_dataset.py
+++ b/xarray/tests/test_dataset.py
@@ -31,7 +31,7 @@
     requires_dask, source_ndarray)

 from xarray.tests import (assert_equal, assert_allclose,
-                          assert_array_equal)
+                          assert_array_equal, requires_scipy)


 def create_test_data(seed=None):
@@ -2192,16 +2192,21 @@ def test_resample_and_first(self):
                      'bar': ('time', np.random.randn(10), {'meta': 'data'}),
                      'time': times})

-        actual = ds.resample('1D', dim='time', how='first', keep_attrs=True)
+        actual = ds.resample(time='1D').first(keep_attrs=True)
         expected = ds.isel(time=[0, 4, 8])
         self.assertDatasetIdentical(expected, actual)

         # upsampling
         expected_time = pd.date_range('2000-01-01', freq='3H', periods=19)
         expected = ds.reindex(time=expected_time)
-        for how in ['mean', 'sum', 'first', 'last', np.mean]:
-            actual = ds.resample('3H', 'time', how=how)
-            self.assertDatasetEqual(expected, actual)
+        resampler = ds.resample(time='3H')
+        for how in ['mean', 'sum', 'first', 'last']:
+            method = getattr(resampler, how)
+            result = method()
+            self.assertDatasetEqual(expected, result)
+        for method in [np.mean]:
+            result = resampler.reduce(method)
+            self.assertDatasetEqual(expected, result)

     def test_resample_by_mean_with_keep_attrs(self):
         times = pd.date_range('2000-01-01', freq='6H', periods=10)
@@ -2210,7 +2215,7 @@ def test_resample_by_mean_with_keep_attrs(self):
                      'time': times})
         ds.attrs['dsmeta'] = 'dsdata'

-        resampled_ds = ds.resample('1D', dim='time', how='mean', keep_attrs=True)
+        resampled_ds = ds.resample(time='1D').mean(keep_attrs=True)

         actual = resampled_ds['bar'].attrs
         expected = ds['bar'].attrs
         self.assertEqual(expected, actual)
@@ -2226,7 +2231,7 @@ def test_resample_by_mean_discarding_attrs(self):
                      'time': times})
         ds.attrs['dsmeta'] = 'dsdata'

-        resampled_ds = ds.resample('1D', dim='time', how='mean', keep_attrs=False)
+        resampled_ds = ds.resample(time='1D').mean(keep_attrs=False)

         assert resampled_ds['bar'].attrs == {}
         assert resampled_ds.attrs == {}
@@ -2238,11 +2243,58 @@ def test_resample_by_last_discarding_attrs(self):
                      'time': times})
         ds.attrs['dsmeta'] = 'dsdata'

-        resampled_ds = ds.resample('1D', dim='time', how='last', keep_attrs=False)
+        resampled_ds = ds.resample(time='1D').last(keep_attrs=False)

         assert resampled_ds['bar'].attrs == {}
         assert resampled_ds.attrs == {}

+    @requires_scipy
+    def test_resample_drop_nondim_coords(self):
+        xs = np.arange(6)
+        ys = np.arange(3)
+        times = pd.date_range('2000-01-01', freq='6H', periods=5)
+        data = np.tile(np.arange(5), (6, 3, 1))
+        xx, yy = np.meshgrid(xs*5, ys*2.5)
+        tt = np.arange(len(times), dtype=int)
+        array = DataArray(data,
+                          {'time': times, 'x': xs, 'y': ys},
+                          ('x', 'y', 'time'))
+        xcoord = DataArray(xx.T, {'x': xs, 'y': ys}, ('x', 'y'))
+        ycoord = DataArray(yy.T, {'x': xs, 'y': ys}, ('x', 'y'))
+        tcoord = DataArray(tt, {'time': times}, ('time', ))
+        ds = Dataset({'data': array, 'xc': xcoord,
+                      'yc': ycoord, 'tc': tcoord})
+        ds = ds.set_coords(['xc', 'yc', 'tc'])
+
+        # Re-sample
+        actual = ds.resample(time="12H").mean('time')
+        assert 'tc' not in actual.coords
+
+        # Up-sample - filling
+        actual = ds.resample(time="1H").ffill()
+        assert 'tc' not in actual.coords
+
+        # Up-sample - interpolation
+        actual = ds.resample(time="1H").interpolate('linear')
+        assert 'tc' not in actual.coords
+
+    def test_resample_old_vs_new_api(self):
+        times = pd.date_range('2000-01-01', freq='6H', periods=10)
+        ds = Dataset({'foo': (['time', 'x', 'y'], np.random.randn(10, 5, 3)),
+                      'bar': ('time', np.random.randn(10), {'meta': 'data'}),
+                      'time': times})
+        ds.attrs['dsmeta'] = 'dsdata'
+
+        for method in ['mean', 'sum', 'count', 'first', 'last']:
+            resampler = ds.resample(time='1D')
+            # Discard attributes on the call using the new api to match
+            # convention from old api
+            new_api = getattr(resampler, method)(keep_attrs=False)
+            with pytest.warns(DeprecationWarning):
+                old_api = ds.resample('1D', dim='time', how=method)
+            self.assertDatasetIdentical(new_api, old_api)
+
     def test_to_array(self):
         ds = Dataset(OrderedDict([('a', 1), ('b', ('x', [1, 2, 3]))]),
                      coords={'c': 42}, attrs={'Conventions': 'None'})
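
As an end-to-end sketch of the deferred-resampling API introduced by this
patch (a usage illustration only; the variable names below are not part of
the diff)::

    import numpy as np
    import pandas as pd
    import xarray as xr

    times = pd.date_range('2000-01-01', freq='6H', periods=8)
    da = xr.DataArray(np.arange(8), [('time', times)])

    # Downsampling: .resample() now returns a Resample object and defers
    # the reduction until a method such as .mean() is called.
    daily = da.resample(time='1D').mean()

    # The same reduction with an arbitrary aggregation function.
    daily_via_reduce = da.resample(time='1D').reduce(np.mean)

    # Upsampling: fill onto, or interpolate to, the finer frequency
    # (interpolate requires scipy).
    hourly_ffill = da.resample(time='3H').ffill()
    hourly_interp = da.resample(time='3H').interpolate('linear')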