From 77891cd589ef35fc7de44e9617725906dfc3b902 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 17 Nov 2020 20:41:07 -0800 Subject: [PATCH 1/3] ENH: make closed part of IntervalDtype --- pandas/core/arrays/_arrow_utils.py | 2 +- pandas/core/arrays/interval.py | 10 ++++--- pandas/core/dtypes/cast.py | 2 +- pandas/core/dtypes/dtypes.py | 28 +++++++++++++++---- pandas/tests/arithmetic/test_interval.py | 2 +- pandas/tests/arrays/interval/test_interval.py | 2 +- pandas/tests/dtypes/cast/test_infer_dtype.py | 2 +- pandas/tests/dtypes/test_dtypes.py | 2 +- pandas/tests/frame/test_constructors.py | 6 ++-- .../indexes/interval/test_constructors.py | 2 +- .../tests/indexes/interval/test_interval.py | 8 +++--- pandas/tests/series/test_constructors.py | 6 ++-- pandas/tests/util/test_assert_frame_equal.py | 2 +- pandas/tests/util/test_assert_series_equal.py | 2 +- 14 files changed, 47 insertions(+), 29 deletions(-) diff --git a/pandas/core/arrays/_arrow_utils.py b/pandas/core/arrays/_arrow_utils.py index c89f5554d0715..6f2e17be0c845 100644 --- a/pandas/core/arrays/_arrow_utils.py +++ b/pandas/core/arrays/_arrow_utils.py @@ -127,7 +127,7 @@ def __hash__(self): def to_pandas_dtype(self): import pandas as pd - return pd.IntervalDtype(self.subtype.to_pandas_dtype()) + return pd.IntervalDtype(self.subtype.to_pandas_dtype(), self.closed) # register the type with a dummy instance _interval_type = ArrowIntervalType(pyarrow.int64(), "left") diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index d007bb112c86c..8d94fd6760db1 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -222,10 +222,12 @@ def __new__( def _simple_new(cls, data, closed="right"): result = IntervalMixin.__new__(cls) + dtype = IntervalDtype(data.dtype, closed=closed) + result._dtype = dtype + result._combined = data result._left = data[:, 0] result._right = data[:, 1] - result._closed = closed return result @classmethod @@ -480,7 +482,7 @@ def _validate(self): 
@property def dtype(self): - return IntervalDtype(self.left.dtype) + return self._dtype @property def nbytes(self) -> int: @@ -1117,7 +1119,7 @@ def closed(self): Whether the intervals are closed on the left-side, right-side, both or neither. """ - return self._closed + return self.dtype.closed _interval_shared_docs["set_closed"] = textwrap.dedent( """ @@ -1212,7 +1214,7 @@ def __array__(self, dtype=None) -> np.ndarray: left = self._left right = self._right mask = self.isna() - closed = self._closed + closed = self.closed result = np.empty(len(left), dtype=object) for i in range(len(left)): diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 9758eae60c262..ab72d30f0b379 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -811,7 +811,7 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, dtype = PeriodDtype(freq=val.freq) elif lib.is_interval(val): subtype = infer_dtype_from_scalar(val.left, pandas_dtype=True)[0] - dtype = IntervalDtype(subtype=subtype) + dtype = IntervalDtype(subtype=subtype, closed=val.closed) return dtype, val diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 01b34187997cb..82feec44d871c 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -1008,19 +1008,23 @@ class IntervalDtype(PandasExtensionDtype): base = np.dtype("O") num = 103 _metadata = ("subtype",) - _match = re.compile(r"(I|i)nterval\[(?P<subtype>.+)\]") + _match = re.compile( + r"(I|i)nterval\[(?P<subtype>[^,]+)(, (?P<closed>(right|left|both|neither)))?\]" + ) _cache: Dict[str_type, PandasExtensionDtype] = {} - def __new__(cls, subtype=None): + def __new__(cls, subtype=None, closed=None): from pandas.core.dtypes.common import is_string_dtype, pandas_dtype if isinstance(subtype, IntervalDtype): return subtype + # TODO: what if closed is also passed?
elif subtype is None: # we are called as an empty constructor # generally for pickle compat u = object.__new__(cls) u._subtype = None + u._closed = closed return u elif isinstance(subtype, str) and subtype.lower() == "interval": subtype = None @@ -1028,7 +1032,9 @@ def __new__(cls, subtype=None): if isinstance(subtype, str): m = cls._match.search(subtype) if m is not None: - subtype = m.group("subtype") + gd = m.groupdict() + subtype = gd["subtype"] + closed = gd.get("closed", closed) try: subtype = pandas_dtype(subtype) @@ -1043,14 +1049,20 @@ def __new__(cls, subtype=None): ) raise TypeError(msg) + key = str(subtype) + str(closed) try: - return cls._cache[str(subtype)] + return cls._cache[key] except KeyError: u = object.__new__(cls) u._subtype = subtype - cls._cache[str(subtype)] = u + u._closed = closed + cls._cache[key] = u return u + @property + def closed(self): + return self._closed + @property def subtype(self): """ @@ -1100,7 +1112,7 @@ def type(self): def __str__(self) -> str_type: if self.subtype is None: return "interval" - return f"interval[{self.subtype}]" + return f"interval[{self.subtype}, {self.closed}]" def __hash__(self) -> int: # make myself hashable @@ -1114,6 +1126,8 @@ def __eq__(self, other: Any) -> bool: elif self.subtype is None or other.subtype is None: # None should match any subtype return True + elif self.closed != other.closed: + return False else: from pandas.core.dtypes.common import is_dtype_equal @@ -1124,6 +1138,8 @@ def __setstate__(self, state): # PandasExtensionDtype superclass and uses the public properties to # pickle -> need to set the settable private ones here (see GH26067) self._subtype = state["subtype"] + # backward-compat older pickles won't have "closed" key + self._closed = state.pop("closed", None) @classmethod def is_dtype(cls, dtype: object) -> bool: diff --git a/pandas/tests/arithmetic/test_interval.py b/pandas/tests/arithmetic/test_interval.py index 6dc3b3b13dd0c..46db9100b8b93 100644 --- 
a/pandas/tests/arithmetic/test_interval.py +++ b/pandas/tests/arithmetic/test_interval.py @@ -133,7 +133,7 @@ def test_compare_scalar_na(self, op, array, nulls_fixture, request): result = op(array, nulls_fixture) expected = self.elementwise_comparison(op, array, nulls_fixture) - if nulls_fixture is pd.NA and array.dtype != pd.IntervalDtype("int64"): + if nulls_fixture is pd.NA and array.dtype.subtype != "int64": mark = pytest.mark.xfail( reason="broken for non-integer IntervalArray; see GH 31882" ) diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index e5ccb51ce36f5..af291ca98a91a 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -131,7 +131,7 @@ def test_repr(): expected = ( "\n" "[(0, 1], (1, 2]]\n" - "Length: 2, closed: right, dtype: interval[int64]" + "Length: 2, closed: right, dtype: interval[int64, right]" ) assert result == expected diff --git a/pandas/tests/dtypes/cast/test_infer_dtype.py b/pandas/tests/dtypes/cast/test_infer_dtype.py index 157adacbdfdf7..3954d74b74650 100644 --- a/pandas/tests/dtypes/cast/test_infer_dtype.py +++ b/pandas/tests/dtypes/cast/test_infer_dtype.py @@ -129,7 +129,7 @@ def test_infer_from_interval(left, right, subtype, closed, pandas_dtype): # GH 30337 interval = Interval(left, right, closed) result_dtype, result_value = infer_dtype_from_scalar(interval, pandas_dtype) - expected_dtype = f"interval[{subtype}]" if pandas_dtype else np.object_ + expected_dtype = f"interval[{subtype}, {closed}]" if pandas_dtype else np.object_ assert result_dtype == expected_dtype assert result_value == interval diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index a419cb0dded79..76e98ea0e65b8 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -678,7 +678,7 @@ def test_equality_generic(self, subtype): def test_name_repr(self, subtype): # GH 18980 dtype = 
IntervalDtype(subtype) - expected = f"interval[{subtype}]" + expected = f"interval[{subtype}, None]" assert str(dtype) == expected assert dtype.name == "interval" diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index f53378d86d7c6..bb2904a67cc1f 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -722,8 +722,8 @@ def test_constructor_period_dict(self): [ (pd.Period("2012-01", freq="M"), "period[M]"), (pd.Period("2012-02-01", freq="D"), "period[D]"), - (Interval(left=0, right=5), IntervalDtype("int64")), - (Interval(left=0.1, right=0.5), IntervalDtype("float64")), + (Interval(left=0, right=5), IntervalDtype("int64", "right")), + (Interval(left=0.1, right=0.5), IntervalDtype("float64", "right")), ], ) def test_constructor_period_dict_scalar(self, data, dtype): @@ -739,7 +739,7 @@ def test_constructor_period_dict_scalar(self, data, dtype): "data,dtype", [ (Period("2020-01"), PeriodDtype("M")), - (Interval(left=0, right=5), IntervalDtype("int64")), + (Interval(left=0, right=5), IntervalDtype("int64", "right")), ( Timestamp("2011-01-01", tz="US/Eastern"), DatetimeTZDtype(tz="US/Eastern"), diff --git a/pandas/tests/indexes/interval/test_constructors.py b/pandas/tests/indexes/interval/test_constructors.py index c0ca0b415ba8e..3f9104811c745 100644 --- a/pandas/tests/indexes/interval/test_constructors.py +++ b/pandas/tests/indexes/interval/test_constructors.py @@ -75,7 +75,7 @@ def test_constructor_dtype(self, constructor, breaks, subtype): expected = constructor(**expected_kwargs) result_kwargs = self.get_kwargs_from_breaks(breaks) - iv_dtype = IntervalDtype(subtype) + iv_dtype = IntervalDtype(subtype, "right") for dtype in (iv_dtype, str(iv_dtype)): result = constructor(dtype=dtype, **result_kwargs) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 
ff871ee45daed..5bff2e63365a8 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -582,7 +582,7 @@ def test_comparison(self): msg = "|".join( [ "not supported between instances of 'int' and '.*.Interval'", - r"Invalid comparison between dtype=interval\[int64\] and ", + r"Invalid comparison between dtype=interval\[int64, right\] and ", ] ) with pytest.raises(TypeError, match=msg): @@ -691,13 +691,13 @@ def test_append(self, closed): ) tm.assert_index_equal(result, expected) - msg = "Intervals must all be closed on the same side" for other_closed in {"left", "right", "both", "neither"} - {closed}: index_other_closed = IntervalIndex.from_arrays( [0, 1], [1, 2], closed=other_closed ) - with pytest.raises(ValueError, match=msg): - index1.append(index_other_closed) + result = index1.append(index_other_closed) + expected = index1.astype(object).append(index_other_closed.astype(object)) + tm.assert_index_equal(result, expected) def test_is_non_overlapping_monotonic(self, closed): # Should be True in all cases diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 1d75ed25ad2e9..8c972216a9cb0 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -997,7 +997,7 @@ def test_construction_interval(self, interval_constructor): # construction from interval & array of intervals intervals = interval_constructor.from_breaks(np.arange(3), closed="right") result = Series(intervals) - assert result.dtype == "interval[int64]" + assert result.dtype == "interval[int64, right]" tm.assert_index_equal(Index(result.values), Index(intervals)) @pytest.mark.parametrize( @@ -1008,7 +1008,7 @@ def test_constructor_infer_interval(self, data_constructor): data = [Interval(0, 1), Interval(0, 2), None] result = Series(data_constructor(data)) expected = Series(IntervalArray(data)) - assert result.dtype == "interval[float64]" + assert 
result.dtype == "interval[float64, right]" tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -1111,7 +1111,7 @@ def test_constructor_dict_order(self): "data,dtype", [ (Period("2020-01"), PeriodDtype("M")), - (Interval(left=0, right=5), IntervalDtype("int64")), + (Interval(left=0, right=5), IntervalDtype("int64", "right")), ( Timestamp("2011-01-01", tz="US/Eastern"), DatetimeTZDtype(tz="US/Eastern"), diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index d5161ce37494b..684b2d747c6e1 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -254,7 +254,7 @@ def test_assert_frame_equal_interval_dtype_mismatch(): "Attributes of DataFrame\\.iloc\\[:, 0\\] " '\\(column name="a"\\) are different\n\n' 'Attribute "dtype" are different\n' - "\\[left\\]: interval\\[int64\\]\n" + "\\[left\\]: interval\\[int64, right\\]\n" "\\[right\\]: object" ) diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index 0f56fb0b93642..99359a568d7d0 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -253,7 +253,7 @@ def test_assert_series_equal_interval_dtype_mismatch(): msg = """Attributes of Series are different Attribute "dtype" are different -\\[left\\]: interval\\[int64\\] +\\[left\\]: interval\\[int64, right\\] \\[right\\]: object""" tm.assert_series_equal(left, right, check_dtype=False) From 9e3d1ba3eedccf5e8d322e5e21bfae702eca192d Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 21 Nov 2020 09:58:44 -0800 Subject: [PATCH 2/3] TST: raise on mismatched closed --- pandas/core/dtypes/dtypes.py | 8 ++++++-- pandas/tests/dtypes/test_dtypes.py | 8 ++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index c08d323c1f7d7..46bf041a096f0 100644 --- a/pandas/core/dtypes/dtypes.py 
+++ b/pandas/core/dtypes/dtypes.py @@ -1013,12 +1013,16 @@ class IntervalDtype(PandasExtensionDtype): ) _cache: Dict[str_type, PandasExtensionDtype] = {} - def __new__(cls, subtype=None, closed=None): + def __new__(cls, subtype=None, closed: Optional[str_type] = None): from pandas.core.dtypes.common import is_string_dtype, pandas_dtype if isinstance(subtype, IntervalDtype): + if closed is not None and closed != subtype.closed: + raise ValueError( + "dtype.closed and 'closed' do not match. " + "Try IntervalDtype(dtype.subtype, closed) instead." + ) return subtype - # TODO: what if closed is also passed? elif subtype is None: # we are called as an empty constructor # generally for pickle compat diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 76e98ea0e65b8..dfa1eb3ccdf46 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -578,6 +578,14 @@ def test_construction_errors(self, subtype): with pytest.raises(TypeError, match=msg): IntervalDtype(subtype) + def test_closed_must_match(self): + # GH#37933 + dtype = IntervalDtype(np.float64, "left") + + msg = "dtype.closed and 'closed' do not match" + with pytest.raises(ValueError, match=msg): + IntervalDtype(dtype, closed="both") + def test_construction_from_string(self, dtype): result = IntervalDtype("interval[int64]") assert is_dtype_equal(dtype, result) From fb5f9c35b6b8d41653367a38478ef622792b54f3 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 21 Nov 2020 12:10:47 -0800 Subject: [PATCH 3/3] typo fixup --- pandas/core/reshape/tile.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 11ceab91f1cfa..969b416669023 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -135,7 +135,7 @@ def cut( >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3) ... # doctest: +ELLIPSIS [(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ... 
- Categories (3, interval[float64m right]): [(0.994, 3.0] < (3.0, 5.0] ... + Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] ... >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True) ... # doctest: +ELLIPSIS @@ -176,7 +176,7 @@ def cut( d (7.333, 10.0] e (7.333, 10.0] dtype: category - Categories (3, interval[float64]): [(1.992, 4.667] < (4.667, ... + Categories (3, interval[float64, right]): [(1.992, 4.667] < (4.667, ... Passing a Series as an input returns a Series with mapping value. It is used to map numerically to intervals based on bins. @@ -214,7 +214,7 @@ def cut( >>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)]) >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins) [NaN, (0.0, 1.0], NaN, (2.0, 3.0], (4.0, 5.0]] - Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]] + Categories (3, interval[int64, right]): [(0, 1] < (2, 3] < (4, 5]] """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0