interpolate_na: Add max_gap support. (#3302)

dcherian · web-flow · commit ee9da17ef040 · 2019-11-15T14:53:16.000Z
* interpolate_na: Add maxgap support.

* Add docs.

* Add requires_bottleneck to test.

* Review comments.

* Update xarray/core/dataarray.py

Co-Authored-By: Maximilian Roos &lt;5635139+max-sixty@users.noreply.github.com&gt;

* Update xarray/core/dataset.py

Co-Authored-By: Maximilian Roos &lt;5635139+max-sixty@users.noreply.github.com&gt;

* maxgap → max_gap

* update whats-new

* update computation.rst

* Better support uniformly spaced coordinates. Split legnths, interp test

* Raise error for max_gap and irregularly spaced coordinates + test

* rework.

* Use pandas checks for index duplication and monotonicity.

* Progress + add datetime.

* nicer error message

* A few fstrings.

* finish up timedelta max_gap.

* fix whats-new

* small fixes.

* fix dan's test.

* remove redundant test.

* nicer error message.

* Add xfailed cftime tests

* better error checking and tests.

* typing.

* update docstrings

* scipy intersphinx

* fix tests

* add bottleneck testing decorator.
diff --git a/doc/computation.rst b/doc/computation.rst
@@ -95,6 +95,9 @@ for filling missing values via 1D interpolation.
 Note that xarray slightly diverges from the pandas ``interpolate`` syntax by
 providing the ``use_coordinate`` keyword which facilitates a clear specification
 of which values to use as the index in the interpolation.
+xarray also provides the ``max_gap`` keyword argument to limit the interpolation to
+data gaps of length ``max_gap`` or smaller. See :py:meth:`~xarray.DataArray.interpolate_na`
+for more.
 
 Aggregation
 ===========
diff --git a/doc/conf.py b/doc/conf.py
@@ -340,9 +340,10 @@
 # Example configuration for intersphinx: refer to the Python standard library.
 intersphinx_mapping = {
     "python": ("https://docs.python.org/3/", None),
-    "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None),
-    "iris": ("http://scitools.org.uk/iris/docs/latest/", None),
-    "numpy": ("https://docs.scipy.org/doc/numpy/", None),
-    "numba": ("https://numba.pydata.org/numba-doc/latest/", None),
-    "matplotlib": ("https://matplotlib.org/", None),
+    "pandas": ("https://pandas.pydata.org/pandas-docs/stable", None),
+    "iris": ("https://scitools.org.uk/iris/docs/latest", None),
+    "numpy": ("https://docs.scipy.org/doc/numpy", None),
+    "scipy": ("https://docs.scipy.org/doc/scipy/reference", None),
+    "numba": ("https://numba.pydata.org/numba-doc/latest", None),
+    "matplotlib": ("https://matplotlib.org", None),
 }
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -38,6 +38,10 @@ Breaking changes
 
 New Features
 ~~~~~~~~~~~~
+
+- Added the ``max_gap`` kwarg to :py:meth:`~xarray.DataArray.interpolate_na` and
+  :py:meth:`~xarray.Dataset.interpolate_na`. This controls the maximum size of the data
+  gap that will be filled by interpolation. By `Deepak Cherian <https://github.com/dcherian>`_.
 - :py:meth:`Dataset.drop_sel` & :py:meth:`DataArray.drop_sel` have been added for dropping labels.
   :py:meth:`Dataset.drop_vars` & :py:meth:`DataArray.drop_vars` have been added for 
   dropping variables (including coordinates). The existing ``drop`` methods remain as a backward compatible 
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
@@ -2018,44 +2018,69 @@ def fillna(self, value: Any) -> "DataArray":
 
     def interpolate_na(
         self,
-        dim=None,
+        dim: Hashable = None,
         method: str = "linear",
         limit: int = None,
         use_coordinate: Union[bool, str] = True,
+        max_gap: Union[int, float, str, pd.Timedelta, np.timedelta64] = None,
         **kwargs: Any,
     ) -> "DataArray":
-        """Interpolate values according to different methods.
+        """Fill in NaNs by interpolating according to different methods.
 
         Parameters
         ----------
         dim : str
             Specifies the dimension along which to interpolate.
-        method : {'linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic',
-                  'polynomial', 'barycentric', 'krog', 'pchip',
-                  'spline', 'akima'}, optional
+        method : str, optional
             String indicating which method to use for interpolation:
 
             - 'linear': linear interpolation (Default). Additional keyword
-              arguments are passed to ``numpy.interp``
-            - 'nearest', 'zero', 'slinear', 'quadratic', 'cubic',
-              'polynomial': are passed to ``scipy.interpolate.interp1d``. If
-              method=='polynomial', the ``order`` keyword argument must also be
+              arguments are passed to :py:func:`numpy.interp`
+            - 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'polynomial':
+              are passed to :py:func:`scipy.interpolate.interp1d`. If
+              ``method='polynomial'``, the ``order`` keyword argument must also be
               provided.
-            - 'barycentric', 'krog', 'pchip', 'spline', and `akima`: use their
-              respective``scipy.interpolate`` classes.
-        use_coordinate : boolean or str, default True
+            - 'barycentric', 'krog', 'pchip', 'spline', 'akima': use their
+              respective :py:class:`scipy.interpolate` classes.
+        use_coordinate : bool, str, default True
             Specifies which index to use as the x values in the interpolation
             formulated as `y = f(x)`. If False, values are treated as if
-            eqaully-spaced along `dim`. If True, the IndexVariable `dim` is
-            used. If use_coordinate is a string, it specifies the name of a
+            eqaully-spaced along ``dim``. If True, the IndexVariable `dim` is
+            used. If ``use_coordinate`` is a string, it specifies the name of a
             coordinate variariable to use as the index.
         limit : int, default None
             Maximum number of consecutive NaNs to fill. Must be greater than 0
-            or None for no limit.
+            or None for no limit. This filling is done regardless of the size of
+            the gap in the data. To only interpolate over gaps less than a given length,
+            see ``max_gap``.
+        max_gap: int, float, str, pandas.Timedelta, numpy.timedelta64, default None.
+            Maximum size of gap, a continuous sequence of NaNs, that will be filled.
+            Use None for no limit. When interpolating along a datetime64 dimension
+            and ``use_coordinate=True``, ``max_gap`` can be one of the following:
+
+            - a string that is valid input for pandas.to_timedelta
+            - a :py:class:`numpy.timedelta64` object
+            - a :py:class:`pandas.Timedelta` object
+            Otherwise, ``max_gap`` must be an int or a float. Use of ``max_gap`` with unlabeled
+            dimensions has not been implemented yet. Gap length is defined as the difference
+            between coordinate values at the first data point after a gap and the last value
+            before a gap. For gaps at the beginning (end), gap length is defined as the difference
+            between coordinate values at the first (last) valid data point and the first (last) NaN.
+            For example, consider::
+
+                <xarray.DataArray (x: 9)>
+                array([nan, nan, nan,  1., nan, nan,  4., nan, nan])
+                Coordinates:
+                  * x        (x) int64 0 1 2 3 4 5 6 7 8
+
+            The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively
+        kwargs : dict, optional
+            parameters passed verbatim to the underlying interpolation function
 
         Returns
         -------
-        DataArray
+        interpolated: DataArray
+            Filled in DataArray.
 
         See also
         --------
@@ -2070,6 +2095,7 @@ def interpolate_na(
             method=method,
             limit=limit,
             use_coordinate=use_coordinate,
+            max_gap=max_gap,
             **kwargs,
         )
 
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
@@ -3908,42 +3908,65 @@ def interpolate_na(
         method: str = "linear",
         limit: int = None,
         use_coordinate: Union[bool, Hashable] = True,
+        max_gap: Union[int, float, str, pd.Timedelta, np.timedelta64] = None,
         **kwargs: Any,
     ) -> "Dataset":
-        """Interpolate values according to different methods.
+        """Fill in NaNs by interpolating according to different methods.
 
         Parameters
         ----------
-        dim : Hashable
+        dim : str
             Specifies the dimension along which to interpolate.
-        method : {'linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic',
-                  'polynomial', 'barycentric', 'krog', 'pchip',
-                  'spline'}, optional
+        method : str, optional
             String indicating which method to use for interpolation:
 
             - 'linear': linear interpolation (Default). Additional keyword
-              arguments are passed to ``numpy.interp``
-            - 'nearest', 'zero', 'slinear', 'quadratic', 'cubic',
-              'polynomial': are passed to ``scipy.interpolate.interp1d``. If
-              method=='polynomial', the ``order`` keyword argument must also be
+              arguments are passed to :py:func:`numpy.interp`
+            - 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'polynomial':
+              are passed to :py:func:`scipy.interpolate.interp1d`. If
+              ``method='polynomial'``, the ``order`` keyword argument must also be
               provided.
-            - 'barycentric', 'krog', 'pchip', 'spline': use their respective
-              ``scipy.interpolate`` classes.
-        use_coordinate : boolean or str, default True
+            - 'barycentric', 'krog', 'pchip', 'spline', 'akima': use their
+              respective :py:class:`scipy.interpolate` classes.
+        use_coordinate : bool, str, default True
             Specifies which index to use as the x values in the interpolation
             formulated as `y = f(x)`. If False, values are treated as if
-            eqaully-spaced along `dim`. If True, the IndexVariable `dim` is
-            used. If use_coordinate is a string, it specifies the name of a
+            eqaully-spaced along ``dim``. If True, the IndexVariable `dim` is
+            used. If ``use_coordinate`` is a string, it specifies the name of a
             coordinate variariable to use as the index.
         limit : int, default None
             Maximum number of consecutive NaNs to fill. Must be greater than 0
-            or None for no limit.
-        kwargs : any
-            parameters passed verbatim to the underlying interplation function
+            or None for no limit. This filling is done regardless of the size of
+            the gap in the data. To only interpolate over gaps less than a given length,
+            see ``max_gap``.
+        max_gap: int, float, str, pandas.Timedelta, numpy.timedelta64, default None.
+            Maximum size of gap, a continuous sequence of NaNs, that will be filled.
+            Use None for no limit. When interpolating along a datetime64 dimension
+            and ``use_coordinate=True``, ``max_gap`` can be one of the following:
+
+            - a string that is valid input for pandas.to_timedelta
+            - a :py:class:`numpy.timedelta64` object
+            - a :py:class:`pandas.Timedelta` object
+            Otherwise, ``max_gap`` must be an int or a float. Use of ``max_gap`` with unlabeled
+            dimensions has not been implemented yet. Gap length is defined as the difference
+            between coordinate values at the first data point after a gap and the last value
+            before a gap. For gaps at the beginning (end), gap length is defined as the difference
+            between coordinate values at the first (last) valid data point and the first (last) NaN.
+            For example, consider::
+
+                <xarray.DataArray (x: 9)>
+                array([nan, nan, nan,  1., nan, nan,  4., nan, nan])
+                Coordinates:
+                  * x        (x) int64 0 1 2 3 4 5 6 7 8
+
+            The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively
+        kwargs : dict, optional
+            parameters passed verbatim to the underlying interpolation function
 
         Returns
         -------
-        Dataset
+        interpolated: Dataset
+            Filled in Dataset.
 
         See also
         --------
@@ -3959,6 +3982,7 @@ def interpolate_na(
             method=method,
             limit=limit,
             use_coordinate=use_coordinate,
+            max_gap=max_gap,
             **kwargs,
         )
         return new
diff --git a/xarray/core/missing.py b/xarray/core/missing.py
@@ -1,18 +1,46 @@
 import warnings
 from functools import partial
-from typing import Any, Callable, Dict, Sequence
+from numbers import Number
+from typing import Any, Callable, Dict, Hashable, Sequence, Union
 
 import numpy as np
 import pandas as pd
 
 from . import utils
-from .common import _contains_datetime_like_objects
+from .common import _contains_datetime_like_objects, ones_like
 from .computation import apply_ufunc
 from .duck_array_ops import dask_array_type
 from .utils import OrderedSet, is_scalar
 from .variable import Variable, broadcast_variables
 
 
+def _get_nan_block_lengths(obj, dim: Hashable, index: Variable):
+    """
+    Return an object where each NaN element in 'obj' is replaced by the
+    length of the gap the element is in.
+    """
+
+    # make variable so that we get broadcasting for free
+    index = Variable([dim], index)
+
+    # algorithm from https://github.com/pydata/xarray/pull/3302#discussion_r324707072
+    arange = ones_like(obj) * index
+    valid = obj.notnull()
+    valid_arange = arange.where(valid)
+    cumulative_nans = valid_arange.ffill(dim=dim).fillna(index[0])
+
+    nan_block_lengths = (
+        cumulative_nans.diff(dim=dim, label="upper")
+        .reindex({dim: obj[dim]})
+        .where(valid)
+        .bfill(dim=dim)
+        .where(~valid, 0)
+        .fillna(index[-1] - valid_arange.max())
+    )
+
+    return nan_block_lengths
+
+
 class BaseInterpolator:
     """Generic interpolator class for normalizing interpolation methods
     """
@@ -178,7 +206,7 @@ def _apply_over_vars_with_dim(func, self, dim=None, **kwargs):
     return ds
 
 
-def get_clean_interp_index(arr, dim, use_coordinate=True):
+def get_clean_interp_index(arr, dim: Hashable, use_coordinate: Union[str, bool] = True):
     """get index to use for x values in interpolation.
 
     If use_coordinate is True, the coordinate that shares the name of the
@@ -195,23 +223,33 @@ def get_clean_interp_index(arr, dim, use_coordinate=True):
             index = arr.coords[use_coordinate]
             if index.ndim != 1:
                 raise ValueError(
-                    "Coordinates used for interpolation must be 1D, "
-                    "%s is %dD." % (use_coordinate, index.ndim)
+                    f"Coordinates used for interpolation must be 1D, "
+                    f"{use_coordinate} is {index.ndim}D."
                 )
+            index = index.to_index()
+
+        # TODO: index.name is None for multiindexes
+        # set name for nice error messages below
+        if isinstance(index, pd.MultiIndex):
+            index.name = dim
+
+        if not index.is_monotonic:
+            raise ValueError(f"Index {index.name!r} must be monotonically increasing")
+
+        if not index.is_unique:
+            raise ValueError(f"Index {index.name!r} has duplicate values")
 
         # raise if index cannot be cast to a float (e.g. MultiIndex)
         try:
             index = index.values.astype(np.float64)
         except (TypeError, ValueError):
             # pandas raises a TypeError
-            # xarray/nuppy raise a ValueError
+            # xarray/numpy raise a ValueError
             raise TypeError(
-                "Index must be castable to float64 to support"
-                "interpolation, got: %s" % type(index)
+                f"Index {index.name!r} must be castable to float64 to support "
+                f"interpolation, got {type(index).__name__}."
             )
-        # check index sorting now so we can skip it later
-        if not (np.diff(index) > 0).all():
-            raise ValueError("Index must be monotonicly increasing")
+
     else:
         axis = arr.get_axis_num(dim)
         index = np.arange(arr.shape[axis], dtype=np.float64)
@@ -220,7 +258,13 @@ def get_clean_interp_index(arr, dim, use_coordinate=True):
 
 
 def interp_na(
-    self, dim=None, use_coordinate=True, method="linear", limit=None, **kwargs
+    self,
+    dim: Hashable = None,
+    use_coordinate: Union[bool, str] = True,
+    method: str = "linear",
+    limit: int = None,
+    max_gap: Union[int, float, str, pd.Timedelta, np.timedelta64] = None,
+    **kwargs,
 ):
     """Interpolate values according to different methods.
     """
@@ -230,6 +274,40 @@ def interp_na(
     if limit is not None:
         valids = _get_valid_fill_mask(self, dim, limit)
 
+    if max_gap is not None:
+        max_type = type(max_gap).__name__
+        if not is_scalar(max_gap):
+            raise ValueError("max_gap must be a scalar.")
+
+        if (
+            dim in self.indexes
+            and isinstance(self.indexes[dim], pd.DatetimeIndex)
+            and use_coordinate
+        ):
+            if not isinstance(max_gap, (np.timedelta64, pd.Timedelta, str)):
+                raise TypeError(
+                    f"Underlying index is DatetimeIndex. Expected max_gap of type str, pandas.Timedelta or numpy.timedelta64 but received {max_type}"
+                )
+
+            if isinstance(max_gap, str):
+                try:
+                    max_gap = pd.to_timedelta(max_gap)
+                except ValueError:
+                    raise ValueError(
+                        f"Could not convert {max_gap!r} to timedelta64 using pandas.to_timedelta"
+                    )
+
+            if isinstance(max_gap, pd.Timedelta):
+                max_gap = np.timedelta64(max_gap.value, "ns")
+
+            max_gap = np.timedelta64(max_gap, "ns").astype(np.float64)
+
+        if not use_coordinate:
+            if not isinstance(max_gap, (Number, np.number)):
+                raise TypeError(
+                    f"Expected integer or floating point max_gap since use_coordinate=False. Received {max_type}."
+                )
+
     # method
     index = get_clean_interp_index(self, dim, use_coordinate=use_coordinate)
     interp_class, kwargs = _get_interpolator(method, **kwargs)
@@ -253,6 +331,14 @@ def interp_na(
     if limit is not None:
         arr = arr.where(valids)
 
+    if max_gap is not None:
+        if dim not in self.coords:
+            raise NotImplementedError(
+                "max_gap not implemented for unlabeled coordinates yet."
+            )
+        nan_block_lengths = _get_nan_block_lengths(self, dim, index)
+        arr = arr.where(nan_block_lengths <= max_gap)
+
     return arr
 
 
diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py