From 79452104276aa182a16ffbbf7f054355983b152a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 28 Jun 2021 07:25:14 -0700 Subject: [PATCH 1/3] Backport PR #42268: PERF: IntervalIndex.intersection --- pandas/core/indexes/base.py | 10 +++++++--- pandas/core/indexes/interval.py | 14 ++++++++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 1de28cb26e7dc..1dcc3337990a3 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3099,9 +3099,13 @@ def _intersection(self, other: Index, sort=False): """ intersection specialized to the case with matching dtypes. """ - # TODO(EA): setops-refactor, clean all this up - - if self.is_monotonic and other.is_monotonic: + if ( + self.is_monotonic + and other.is_monotonic + and not is_interval_dtype(self.dtype) + ): + # For IntervalIndex _inner_indexer is not more performant than get_indexer, + # so don't take this fastpath try: result = self._inner_indexer(other)[0] except TypeError: diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index f82b44abaeaa1..26f13280c118f 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -37,6 +37,7 @@ from pandas.util._exceptions import rewrite_exception from pandas.core.dtypes.cast import ( + construct_1d_object_array_from_listlike, find_common_type, infer_dtype_from_scalar, maybe_box_datetimelike, @@ -801,6 +802,19 @@ def _is_all_dates(self) -> bool: """ return False + def _get_join_target(self) -> np.ndarray: + # constructing tuples is much faster than constructing Intervals + tups = list(zip(self.left, self.right)) + target = construct_1d_object_array_from_listlike(tups) + return target + + def _from_join_target(self, result): + left, right = list(zip(*result)) + arr = type(self._data).from_arrays( + left, right, dtype=self.dtype, closed=self.closed + ) + return type(self)._simple_new(arr, name=self.name) + # TODO: arithmetic operations From cfc31ce49b88dbadfb44f6d4e7940be9f292fa85 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 29 Jun 2021 05:31:39 -0700 Subject: [PATCH 2/3] Backport PR #42293: PERF/REGR: IntervalIndex.intersection, PeriodIndex.get_loc --- pandas/core/indexes/interval.py | 45 +++++++++++++++++++++++++++++++++ pandas/core/indexes/period.py | 23 ++++++++++++++--- 2 files changed, 64 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 26f13280c118f..845d21e18d287 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -61,6 +61,7 @@ from pandas.core.dtypes.dtypes import IntervalDtype from pandas.core.dtypes.missing import is_valid_na_for_dtype +from pandas.core.algorithms import unique from pandas.core.arrays.interval import ( IntervalArray, _interval_shared_docs, @@ -792,6 +793,50 @@ def _format_data(self, name=None) -> str: # name argument is unused here; just for compat with base / categorical return self._data._format_data() + "," + self._format_space() + # -------------------------------------------------------------------- + # Set Operations + + def _intersection(self, other, sort): + """ + intersection specialized to the case with matching dtypes. + """ + # For IntervalIndex we also know other.closed == self.closed + if self.left.is_unique and self.right.is_unique: + taken = self._intersection_unique(other) + elif other.left.is_unique and other.right.is_unique and self.isna().sum() <= 1: + # Swap other/self if other is unique and self does not have + # multiple NaNs + taken = other._intersection_unique(self) + else: + return super()._intersection(other, sort) + + if sort is None: + taken = taken.sort_values() + + return taken + + def _intersection_unique(self, other: IntervalIndex) -> IntervalIndex: + """ + Used when the IntervalIndex does not have any common endpoint, + no matter left or right. + Return the intersection with another IntervalIndex. + Parameters + ---------- + other : IntervalIndex + Returns + ------- + IntervalIndex + """ + # Note: this is much more performant than super()._intersection(other) + lindexer = self.left.get_indexer(other.left) + rindexer = self.right.get_indexer(other.right) + + match = (lindexer == rindexer) & (lindexer != -1) + indexer = lindexer.take(match.nonzero()[0]) + indexer = unique(indexer) + + return self.take(indexer) + # -------------------------------------------------------------------- @property diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index ae8126846d03a..70d401a233328 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -447,10 +447,25 @@ def get_loc(self, key, method=None, tolerance=None): else: key = asdt - elif is_integer(key): - # Period constructor will cast to string, which we dont want - raise KeyError(key) - elif isinstance(key, Period) and key.freq != self.freq: + elif isinstance(key, Period): + sfreq = self.freq + kfreq = key.freq + if not ( + sfreq.n == kfreq.n + and sfreq._period_dtype_code == kfreq._period_dtype_code + ): + # GH#42247 For the subset of DateOffsets that can be Period freqs, + # checking these two attributes is sufficient to check equality, + # and much more performant than `self.freq == key.freq` + raise KeyError(key) + elif isinstance(key, datetime): + try: + key = Period(key, freq=self.freq) + except ValueError as err: + # we cannot construct the Period + raise KeyError(orig_key) from err + else: + # in particular integer, which Period constructor would cast to string raise KeyError(key) try: From 0651dbc751dec85ddbeafc9aa8261f6af4932fff Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 29 Jun 2021 11:52:00 -0700 Subject: [PATCH 3/3] catch NAs --- pandas/core/indexes/period.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 70d401a233328..7917e0eff0227 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -15,6 +15,7 @@ ) from pandas._libs.tslibs import ( BaseOffset, + NaT, Period, Resolution, Tick, @@ -37,6 +38,7 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import PeriodDtype +from pandas.core.dtypes.missing import is_valid_na_for_dtype from pandas.core.arrays.period import ( PeriodArray, @@ -413,7 +415,10 @@ def get_loc(self, key, method=None, tolerance=None): if not is_scalar(key): raise InvalidIndexError(key) - if isinstance(key, str): + if is_valid_na_for_dtype(key, self.dtype): + key = NaT + + elif isinstance(key, str): try: loc = self._get_string_slice(key)