From 16464be6f4a7f037d9b984506f4840a3f6c079ef Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Thu, 25 Oct 2018 01:37:24 -0600 Subject: [PATCH 1/2] ENH: Implement IntervalIndex.is_overlapping --- doc/source/api.rst | 1 + doc/source/whatsnew/v0.24.0.rst | 1 + pandas/_libs/intervaltree.pxi.in | 23 ++++++- pandas/core/arrays/interval.py | 2 + pandas/core/indexes/interval.py | 60 ++++++++++++++++++ .../tests/indexes/interval/test_interval.py | 61 +++++++++++++++++++ .../indexes/interval/test_interval_tree.py | 35 +++++++++++ 7 files changed, 182 insertions(+), 1 deletion(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 81bb420c47a99..bf86204fe0fef 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1671,6 +1671,7 @@ IntervalIndex Components IntervalIndex.length IntervalIndex.values IntervalIndex.is_non_overlapping_monotonic + IntervalIndex.is_overlapping IntervalIndex.get_loc IntervalIndex.get_indexer IntervalIndex.set_closed diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 6fd0a224b81b2..8a368f5c3c009 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -294,6 +294,7 @@ Other Enhancements - :meth:`MultiIndex.to_flat_index` has been added to flatten multiple levels into a single-level :class:`Index` object. - :meth:`DataFrame.to_stata` and :class:` pandas.io.stata.StataWriter117` can write mixed sting columns to Stata strl format (:issue:`23633`) - :meth:`DataFrame.between_time` and :meth:`DataFrame.at_time` have gained the an ``axis`` parameter (:issue: `8839`) +- :class:`IntervalIndex` has gained the :attr:`~IntervalIndex.is_overlapping` attribute to indicate if the ``IntervalIndex`` contains any overlapping intervals (:issue:`23309`) .. 
_whatsnew_0240.api_breaking: diff --git a/pandas/_libs/intervaltree.pxi.in b/pandas/_libs/intervaltree.pxi.in index 7be3bdbc1048a..a76989231f206 100644 --- a/pandas/_libs/intervaltree.pxi.in +++ b/pandas/_libs/intervaltree.pxi.in @@ -26,7 +26,7 @@ cdef class IntervalTree(IntervalMixin): cdef: readonly object left, right, root, dtype readonly str closed - object _left_sorter, _right_sorter + object _is_overlapping, _left_sorter, _right_sorter def __init__(self, left, right, closed='right', leaf_size=100): """ @@ -81,6 +81,27 @@ cdef class IntervalTree(IntervalMixin): self._right_sorter = np.argsort(self.right) return self._right_sorter + @property + def is_overlapping(self): + """ + Determine if the IntervalTree contains overlapping intervals. + """ + if self._is_overlapping is not None: + return self._is_overlapping + + # <= when both sides closed since endpoints can overlap + op = le if self.closed == 'both' else lt + + self._is_overlapping = False + for previous, current in zip(self.left_sorter, self.left_sorter[1:]): + # overlap if start of current interval < end of previous interval + # (previous in terms of sorted order by left/start side) + if op(self.left[current], self.right[previous]): + self._is_overlapping = True + break + + return self._is_overlapping + def get_loc(self, scalar_t key): """Return all positions corresponding to intervals that overlap with the given scalar key diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 70be850481d85..b055bc3f2eb52 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -74,6 +74,7 @@ length values is_non_overlapping_monotonic +%(extra_attributes)s\ Methods ------- @@ -107,6 +108,7 @@ summary="Pandas array for interval data that are closed on the same side.", versionadded="0.24.0", name='', + extra_attributes='', extra_methods='', examples=textwrap.dedent("""\ Examples diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 
1ebcf213ab0eb..f3e84d5b6c963 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -104,6 +104,7 @@ def _new_IntervalIndex(cls, d): summary="Immutable index of intervals that are closed on the same side.", name=_index_doc_kwargs['name'], versionadded="0.20.0", + extra_attributes="is_overlapping\n", extra_methods="contains\n", examples=textwrap.dedent("""\ Examples @@ -464,6 +465,61 @@ def is_unique(self): def is_non_overlapping_monotonic(self): return self._data.is_non_overlapping_monotonic + @property + def is_overlapping(self): + """ + Return True if the IntervalIndex has overlapping intervals, else False. + + Two intervals overlap if they share a common point, including closed + endpoints. Intervals that only have an open endpoint in common do not + overlap. + + .. versionadded:: 0.24.0 + + Returns + ------- + bool + Boolean indicating if the IntervalIndex has overlapping intervals. + + Examples + -------- + >>> index = pd.IntervalIndex.from_tuples([(0, 2), (1, 3), (4, 5)]) + >>> index + IntervalIndex([(0, 2], (1, 3], (4, 5]], + closed='right', + dtype='interval[int64]') + >>> index.is_overlapping + True + + Intervals that share closed endpoints overlap: + + >>> index = pd.interval_range(0, 3, closed='both') + >>> index + IntervalIndex([[0, 1], [1, 2], [2, 3]], + closed='both', + dtype='interval[int64]') + >>> index.is_overlapping + True + + Intervals that only have an open endpoint in common do not overlap: + + >>> index = pd.interval_range(0, 3, closed='left') + >>> index + IntervalIndex([[0, 1), [1, 2), [2, 3)], + closed='left', + dtype='interval[int64]') + >>> index.is_overlapping + False + + See Also + -------- + Interval.overlaps : Check whether two Interval objects overlap. + IntervalIndex.overlaps : Check an IntervalIndex elementwise for + overlaps. 
+ """ + # GH 23309 + return self._engine.is_overlapping + @Appender(_index_shared_docs['_convert_scalar_indexer']) def _convert_scalar_indexer(self, key, kind=None): if kind == 'iloc': @@ -570,6 +626,10 @@ def _maybe_convert_i8(self, key): else: # DatetimeIndex/TimedeltaIndex key_dtype, key_i8 = key.dtype, Index(key.asi8) + if key.hasnans: + # convert NaT from its i8 value to np.nan so it's not viewed + # as a valid value, maybe causing errors (e.g. is_overlapping) + key_i8 = key_i8.where(~key._isnan) # ensure consistency with IntervalIndex subtype subtype = self.dtype.subtype diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index da3b3253ecbd1..c4dac6948cd7a 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -654,6 +654,23 @@ def test_maybe_convert_i8(self, breaks): expected = Index(breaks.asi8) tm.assert_index_equal(result, expected) + @pytest.mark.parametrize('breaks', [ + date_range('2018-01-01', periods=5), + timedelta_range('0 days', periods=5)]) + def test_maybe_convert_i8_nat(self, breaks): + # GH 20636 + index = IntervalIndex.from_breaks(breaks) + + to_convert = breaks._constructor([pd.NaT] * 3) + expected = pd.Float64Index([np.nan] * 3) + result = index._maybe_convert_i8(to_convert) + tm.assert_index_equal(result, expected) + + to_convert = to_convert.insert(0, breaks[0]) + expected = expected.insert(0, float(breaks[0].value)) + result = index._maybe_convert_i8(to_convert) + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize('breaks', [ np.arange(5, dtype='int64'), np.arange(5, dtype='float64')], ids=lambda x: str(x.dtype)) @@ -1082,6 +1099,50 @@ def test_is_non_overlapping_monotonic(self, closed): idx = IntervalIndex.from_breaks(range(4), closed=closed) assert idx.is_non_overlapping_monotonic is True + @pytest.mark.parametrize('start, shift, na_value', [ + (0, 1, np.nan), + (Timestamp('2018-01-01'), Timedelta('1 
day'), pd.NaT), + (Timedelta('0 days'), Timedelta('1 day'), pd.NaT)]) + def test_is_overlapping(self, start, shift, na_value, closed): + # GH 23309 + # see test_interval_tree.py for extensive tests; interface tests here + + # non-overlapping + tuples = [(start + n * shift, start + (n + 1) * shift) + for n in (0, 2, 4)] + index = IntervalIndex.from_tuples(tuples, closed=closed) + assert index.is_overlapping is False + + # non-overlapping with NA + tuples = [(na_value, na_value)] + tuples + [(na_value, na_value)] + index = IntervalIndex.from_tuples(tuples, closed=closed) + assert index.is_overlapping is False + + # overlapping + tuples = [(start + n * shift, start + (n + 2) * shift) + for n in range(3)] + index = IntervalIndex.from_tuples(tuples, closed=closed) + assert index.is_overlapping is True + + # overlapping with NA + tuples = [(na_value, na_value)] + tuples + [(na_value, na_value)] + index = IntervalIndex.from_tuples(tuples, closed=closed) + assert index.is_overlapping is True + + # common endpoints + tuples = [(start + n * shift, start + (n + 1) * shift) + for n in range(3)] + index = IntervalIndex.from_tuples(tuples, closed=closed) + result = index.is_overlapping + expected = closed == 'both' + assert result is expected + + # common endpoints with NA + tuples = [(na_value, na_value)] + tuples + [(na_value, na_value)] + index = IntervalIndex.from_tuples(tuples, closed=closed) + result = index.is_overlapping + assert result is expected + @pytest.mark.parametrize('tuples', [ lzip(range(10), range(1, 11)), lzip(date_range('20170101', periods=10), diff --git a/pandas/tests/indexes/interval/test_interval_tree.py b/pandas/tests/indexes/interval/test_interval_tree.py index 686cdaccd3883..90255835d9147 100644 --- a/pandas/tests/indexes/interval/test_interval_tree.py +++ b/pandas/tests/indexes/interval/test_interval_tree.py @@ -1,5 +1,7 @@ from __future__ import division +from itertools import permutations + import numpy as np import pytest @@ -135,3 +137,36 @@ def 
test_get_indexer_closed(self, closed, leaf_size): expected = found if tree.closed_right else not_found tm.assert_numpy_array_equal(expected, tree.get_indexer(x + 0.5)) + + @pytest.mark.parametrize('left, right, expected', [ + (np.array([0, 1, 4]), np.array([2, 3, 5]), True), + (np.array([0, 1, 2]), np.array([5, 4, 3]), True), + (np.array([0, 1, np.nan]), np.array([5, 4, np.nan]), True), + (np.array([0, 2, 4]), np.array([1, 3, 5]), False), + (np.array([0, 2, np.nan]), np.array([1, 3, np.nan]), False)]) + @pytest.mark.parametrize('order', map(list, permutations(range(3)))) + def test_is_overlapping(self, closed, order, left, right, expected): + # GH 23309 + tree = IntervalTree(left[order], right[order], closed=closed) + result = tree.is_overlapping + assert result is expected + + @pytest.mark.parametrize('order', map(list, permutations(range(3)))) + def test_is_overlapping_endpoints(self, closed, order): + """shared endpoints are marked as overlapping""" + # GH 23309 + left, right = np.arange(3), np.arange(1, 4) + tree = IntervalTree(left[order], right[order], closed=closed) + result = tree.is_overlapping + expected = closed == 'both' + assert result is expected + + @pytest.mark.parametrize('left, right', [ + (np.array([], dtype='int64'), np.array([], dtype='int64')), + (np.array([0], dtype='int64'), np.array([1], dtype='int64')), + (np.array([np.nan]), np.array([np.nan])), + (np.array([np.nan] * 3), np.array([np.nan] * 3))]) + def test_is_overlapping_trivial(self, closed, left, right): + # GH 23309 + tree = IntervalTree(left, right, closed=closed) + assert tree.is_overlapping is False From dd6349286c2832fc54b33fb928051045f7d79973 Mon Sep 17 00:00:00 2001 From: jschendel Date: Mon, 26 Nov 2018 20:07:44 -0700 Subject: [PATCH 2/2] switch to array impl --- pandas/_libs/intervaltree.pxi.in | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/intervaltree.pxi.in b/pandas/_libs/intervaltree.pxi.in index a76989231f206..fb6f30c030f11 
100644 --- a/pandas/_libs/intervaltree.pxi.in +++ b/pandas/_libs/intervaltree.pxi.in @@ -85,6 +85,7 @@ cdef class IntervalTree(IntervalMixin): def is_overlapping(self): """ Determine if the IntervalTree contains overlapping intervals. + Cached as self._is_overlapping. """ if self._is_overlapping is not None: return self._is_overlapping @@ -92,13 +93,11 @@ cdef class IntervalTree(IntervalMixin): # <= when both sides closed since endpoints can overlap op = le if self.closed == 'both' else lt - self._is_overlapping = False - for previous, current in zip(self.left_sorter, self.left_sorter[1:]): - # overlap if start of current interval < end of previous interval - # (previous in terms of sorted order by left/start side) - if op(self.left[current], self.right[previous]): - self._is_overlapping = True - break + # overlap if start of current interval < end of previous interval + # (current and previous in terms of sorted order by left/start side) + current = self.left[self.left_sorter[1:]] + previous = self.right[self.left_sorter[:-1]] + self._is_overlapping = bool(op(current, previous).any()) return self._is_overlapping