From aac172cefd468f3896ea9019b92fdc00efef6ef4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 29 Jan 2019 11:59:22 -0600 Subject: [PATCH 01/12] [WIP]: API: Change default for Index.union sort Closes https://github.com/pandas-dev/pandas/issues/24959 --- pandas/core/indexes/base.py | 37 +++++++++++++++++++++---- pandas/tests/indexes/test_base.py | 46 +++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 6 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 767da81c5c43a..41430249055f9 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2245,18 +2245,34 @@ def _get_reconciled_name_object(self, other): return self._shallow_copy(name=name) return self - def union(self, other, sort=True): + def union(self, other, sort=None): """ Form the union of two Index objects. Parameters ---------- other : Index or array-like - sort : bool, default True - Sort the resulting index if possible + sort : bool or None, default None + Whether to sort the resulting Index. + + * None : Sort the result, except when + + 1. `self` and `other` are equal. + 2. `self` or `other` has length 0. + 3. Some values in `self` or `other` cannot be compared. + A RuntimeWarning is issued in this case. + + * True : sort the result. A TypeError is raised when the + values cannot be compared. + * False : do not sort the result. .. versionadded:: 0.24.0 + .. versionchanged:: 0.24.0 + + Changed the default `sort` to None, matching the + behavior of pandas 0.23.4 and earlier. 
+ Returns ------- union : Index @@ -2273,10 +2289,16 @@ def union(self, other, sort=True): other = ensure_index(other) if len(other) == 0 or self.equals(other): - return self._get_reconciled_name_object(other) + result = self._get_reconciled_name_object(other) + if sort: + result = result.sort_values() + return result if len(self) == 0: - return other._get_reconciled_name_object(self) + result = other._get_reconciled_name_object(self) + if sort: + result = result.sort_values() + return result # TODO: is_dtype_union_equal is a hack around # 1. buggy set ops with duplicates (GH #13432) @@ -2319,13 +2341,16 @@ def union(self, other, sort=True): else: result = lvals - if sort: + if sort is None: try: result = sorting.safe_sort(result) except TypeError as e: warnings.warn("{}, sort order is undefined for " "incomparable objects".format(e), RuntimeWarning, stacklevel=3) + elif sort: + # raise if not sortable. + result = sorting.safe_sort(result) # for subclasses return self._wrap_setop_result(other, result) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index f3e9d835c7391..c38e956dafcac 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -794,6 +794,52 @@ def test_union(self, sort): tm.assert_index_equal(union, everything.sort_values()) assert tm.equalContents(union, everything) + def test_union_sort_other_equal(self): + a = pd.Index([1, 0, 2]) + # default, sort=None + result = a.union(a) + tm.assert_index_equal(result, a) + + # sort=True + result = a.union(a, sort=True) + expected = pd.Index([0, 1, 2]) + tm.assert_index_equal(result, expected) + + # sort=False + result = a.union(a, sort=False) + tm.assert_index_equal(result, a) + + def test_union_sort_other_empty(self): + a = pd.Index([1, 0, 2]) + # default, sort=None + tm.assert_index_equal(a.union(a[:0]), a) + tm.assert_index_equal(a[:0].union(a), a) + + # sort=True + expected = pd.Index([0, 1, 2]) + tm.assert_index_equal(a.union(a[:0], 
sort=True), expected) + tm.assert_index_equal(a[:0].union(a, sort=True), expected) + + # sort=False + tm.assert_index_equal(a.union(a[:0], sort=False), a) + tm.assert_index_equal(a[:0].union(a, sort=False), a) + + def test_union_sort_other_incomparable(self): + a = pd.Index([1, pd.Timestamp('2000')]) + # default, sort=None + with tm.assert_produces_warning(RuntimeWarning): + result = a.union(a[:1]) + + tm.assert_index_equal(result, a) + + # sort=True + with pytest.raises(TypeError, match='.*'): + a.union(a[:1], sort=True) + + # sort=False + result = a.union(a[:1], sort=False) + tm.assert_index_equal(result, a) + @pytest.mark.parametrize("klass", [ np.array, Series, list]) @pytest.mark.parametrize("sort", [True, False]) From d4bcc55a161c56f485b95fdbcfcff88344df47d4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 29 Jan 2019 13:27:49 -0600 Subject: [PATCH 02/12] update test --- pandas/core/indexes/base.py | 2 +- pandas/tests/indexes/test_base.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 41430249055f9..dac32b887a596 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2268,7 +2268,7 @@ def union(self, other, sort=None): .. versionadded:: 0.24.0 - .. versionchanged:: 0.24.0 + .. versionchanged:: 0.24.1 Changed the default `sort` to None, matching the behavior of pandas 0.23.4 and earlier. 
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index c38e956dafcac..972d0bad45426 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -856,19 +856,20 @@ def test_union_from_iterables(self, klass, sort): tm.assert_index_equal(result, everything.sort_values()) assert tm.equalContents(result, everything) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, True, False]) def test_union_identity(self, sort): # TODO: replace with fixturesult first = self.strIndex[5:20] union = first.union(first, sort=sort) - assert union is first + # i.e. identity is not preserved when sort is True + assert (union is first) is (not sort) union = first.union([], sort=sort) - assert union is first + assert (union is first) is (not sort) union = Index([]).union(first, sort=sort) - assert union is first + assert (union is first) is (not sort) @pytest.mark.parametrize("first_list", [list('ba'), list()]) @pytest.mark.parametrize("second_list", [list('ab'), list()]) From 45c827cdf59fef7f4cdc229be4a880372a6d0f4b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 29 Jan 2019 20:38:52 -0600 Subject: [PATCH 03/12] fixups --- pandas/_libs/lib.pyx | 5 ++- pandas/core/indexes/multi.py | 26 ++++++++++-- pandas/tests/indexes/multi/test_set_ops.py | 39 ++++++++++++++++++ pandas/tests/indexes/test_base.py | 48 +++++++++------------- 4 files changed, 85 insertions(+), 33 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 4745916eb0ce2..f3ff3f7678e1a 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -233,11 +233,14 @@ def fast_unique_multiple(list arrays, sort: bool=True): if val not in table: table[val] = stub uniques.append(val) - if sort: + if sort is None: try: uniques.sort() except Exception: + # TODO: RuntimeWarning? 
pass + elif sort: + uniques.sort() return uniques diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index e4d01a40bd181..bdbb6c812c5c6 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2879,18 +2879,34 @@ def equal_levels(self, other): return False return True - def union(self, other, sort=True): + def union(self, other, sort=None): """ Form the union of two MultiIndex objects Parameters ---------- other : MultiIndex or array / Index of tuples - sort : bool, default True - Sort the resulting MultiIndex if possible + sort : bool or None, default None + Whether to sort the resulting Index. + + * None : Sort the result, except when + + 1. `self` and `other` are equal. + 2. `self` has length 0. + 3. Some values in `self` or `other` cannot be compared. + A RuntimeWarning is issued in this case. + + * True : sort the result. A TypeError is raised when the + values cannot be compared. + * False : do not sort the result. .. versionadded:: 0.24.0 + .. versionchanged:: 0.24.1 + + Changed the default `sort` to None, matching the + behavior of pandas 0.23.4 and earlier. + Returns ------- Index @@ -2901,8 +2917,12 @@ def union(self, other, sort=True): other, result_names = self._convert_can_do_setop(other) if len(other) == 0 or self.equals(other): + if sort: + return self.sort_values() return self + # TODO: Index.union returns other when `len(self)` is 0. 
+ uniq_tuples = lib.fast_unique_multiple([self._ndarray_values, other._ndarray_values], sort=sort) diff --git a/pandas/tests/indexes/multi/test_set_ops.py b/pandas/tests/indexes/multi/test_set_ops.py index 208d6cf1c639f..09321000eb80d 100644 --- a/pandas/tests/indexes/multi/test_set_ops.py +++ b/pandas/tests/indexes/multi/test_set_ops.py @@ -249,3 +249,42 @@ def test_intersection(idx, sort): # tuples = _index.values # result = _index & tuples # assert result.equals(tuples) + + +@pytest.mark.parametrize('slice_', [slice(None), slice(0)]) +def test_union_sort_other_empty(slice_): + idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']]) + # Two cases: + # 1. idx is other + # 2. other is empty + + # default, sort=None + other = idx[slice_] + tm.assert_index_equal(idx.union(other), idx) + # MultiIndex does not special case empty.union(idx) + # tm.assert_index_equal(other.union(idx), idx) + + # sort=False + tm.assert_index_equal(idx.union(other, sort=False), idx) + + # sort=True + result = idx.union(other, sort=True) + expected = pd.MultiIndex.from_product([[0, 1], ['a', 'b']]) + tm.assert_index_equal(result, expected) + + +def test_union_sort_other_incomparable(): + idx = pd.MultiIndex.from_product([[1, pd.Timestamp('2000')], ['a', 'b']]) + + # default, sort=None + # with tm.assert_produces_warning(RuntimeWarning): + result = idx.union(idx[:1]) + tm.assert_index_equal(result, idx) + + # sort=True + with pytest.raises(TypeError, match='Cannot compare'): + idx.union(idx[:1], sort=True) + + # sort=False + result = idx.union(idx[:1], sort=False) + tm.assert_index_equal(result, idx) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 972d0bad45426..c2fdab98977ba 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -794,51 +794,41 @@ def test_union(self, sort): tm.assert_index_equal(union, everything.sort_values()) assert tm.equalContents(union, everything) - def test_union_sort_other_equal(self): - a = 
pd.Index([1, 0, 2]) - # default, sort=None - result = a.union(a) - tm.assert_index_equal(result, a) + @pytest.mark.parametrize('slice_', [slice(None), slice(0)]) + def test_union_sort_other_special(self, slice_): + # Two cases: + # 1. idx is other + # 2. other is empty - # sort=True - result = a.union(a, sort=True) - expected = pd.Index([0, 1, 2]) - tm.assert_index_equal(result, expected) + idx = pd.Index([1, 0, 2]) + # default, sort=None + other = idx[slice_] + tm.assert_index_equal(idx.union(other), idx) + tm.assert_index_equal(other.union(idx), idx) # sort=False - result = a.union(a, sort=False) - tm.assert_index_equal(result, a) - - def test_union_sort_other_empty(self): - a = pd.Index([1, 0, 2]) - # default, sort=None - tm.assert_index_equal(a.union(a[:0]), a) - tm.assert_index_equal(a[:0].union(a), a) + tm.assert_index_equal(idx.union(other, sort=False), idx) # sort=True + result = idx.union(other, sort=True) expected = pd.Index([0, 1, 2]) - tm.assert_index_equal(a.union(a[:0], sort=True), expected) - tm.assert_index_equal(a[:0].union(a, sort=True), expected) - - # sort=False - tm.assert_index_equal(a.union(a[:0], sort=False), a) - tm.assert_index_equal(a[:0].union(a, sort=False), a) + tm.assert_index_equal(result, expected) def test_union_sort_other_incomparable(self): - a = pd.Index([1, pd.Timestamp('2000')]) + idx = pd.Index([1, pd.Timestamp('2000')]) # default, sort=None with tm.assert_produces_warning(RuntimeWarning): - result = a.union(a[:1]) + result = idx.union(idx[:1]) - tm.assert_index_equal(result, a) + tm.assert_index_equal(result, idx) # sort=True with pytest.raises(TypeError, match='.*'): - a.union(a[:1], sort=True) + idx.union(idx[:1], sort=True) # sort=False - result = a.union(a[:1], sort=False) - tm.assert_index_equal(result, a) + result = idx.union(idx[:1], sort=False) + tm.assert_index_equal(result, idx) @pytest.mark.parametrize("klass", [ np.array, Series, list]) From 8716f977433cfc62e6c279deadb5d285806c135c Mon Sep 17 00:00:00 2001 From: 
Tom Augspurger Date: Tue, 29 Jan 2019 20:44:44 -0600 Subject: [PATCH 04/12] multi --- pandas/tests/indexes/multi/test_set_ops.py | 14 ++++++-------- pandas/tests/indexes/test_base.py | 5 ++--- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/pandas/tests/indexes/multi/test_set_ops.py b/pandas/tests/indexes/multi/test_set_ops.py index 09321000eb80d..c0c11ccab2a26 100644 --- a/pandas/tests/indexes/multi/test_set_ops.py +++ b/pandas/tests/indexes/multi/test_set_ops.py @@ -253,10 +253,8 @@ def test_intersection(idx, sort): @pytest.mark.parametrize('slice_', [slice(None), slice(0)]) def test_union_sort_other_empty(slice_): + # https://github.com/pandas-dev/pandas/issues/24959 idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']]) - # Two cases: - # 1. idx is other - # 2. other is empty # default, sort=None other = idx[slice_] @@ -274,17 +272,17 @@ def test_union_sort_other_empty(slice_): def test_union_sort_other_incomparable(): + # https://github.com/pandas-dev/pandas/issues/24959 idx = pd.MultiIndex.from_product([[1, pd.Timestamp('2000')], ['a', 'b']]) # default, sort=None - # with tm.assert_produces_warning(RuntimeWarning): result = idx.union(idx[:1]) tm.assert_index_equal(result, idx) - # sort=True - with pytest.raises(TypeError, match='Cannot compare'): - idx.union(idx[:1], sort=True) - # sort=False result = idx.union(idx[:1], sort=False) tm.assert_index_equal(result, idx) + + # sort=True + with pytest.raises(TypeError, match='Cannot compare'): + idx.union(idx[:1], sort=True) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 4eaad5b31de7d..d6a9b3b769965 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -801,9 +801,7 @@ def test_union(self, sort): @pytest.mark.parametrize('slice_', [slice(None), slice(0)]) def test_union_sort_other_special(self, slice_): - # Two cases: - # 1. idx is other - # 2. 
other is empty + # https://github.com/pandas-dev/pandas/issues/24959 idx = pd.Index([1, 0, 2]) # default, sort=None @@ -820,6 +818,7 @@ def test_union_sort_other_special(self, slice_): tm.assert_index_equal(result, expected) def test_union_sort_other_incomparable(self): + # https://github.com/pandas-dev/pandas/issues/24959 idx = pd.Index([1, pd.Timestamp('2000')]) # default, sort=None with tm.assert_produces_warning(RuntimeWarning): From f7056d5e2d9f66eab73c2ff595712cb46b66e465 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 29 Jan 2019 20:45:59 -0600 Subject: [PATCH 05/12] doc typo --- pandas/core/indexes/multi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 6f832bba5ec67..cab15e727fa6e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2937,7 +2937,7 @@ def intersection(self, other, sort=False): Parameters ---------- other : MultiIndex or array / Index of tuples - sort : bool, default True + sort : bool, default False Sort the resulting MultiIndex if possible .. versionadded:: 0.24.0 From e82cbb184373080d641c62ecd9c4aced245d8cf6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 29 Jan 2019 21:03:56 -0600 Subject: [PATCH 06/12] intersection --- pandas/core/indexes/base.py | 13 ++++++++++--- pandas/core/indexes/multi.py | 2 ++ pandas/tests/indexes/multi/test_set_ops.py | 22 +++++++++++++++++++--- pandas/tests/indexes/test_base.py | 11 ++++++++++- 4 files changed, 41 insertions(+), 7 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e0d5d37087055..b2dc46dab620c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2367,8 +2367,12 @@ def intersection(self, other, sort=False): Parameters ---------- other : Index or array-like - sort : bool, default False - Sort the resulting index if possible + sort : bool or None, default False + Whether to sort the resulting index. 
+ + * False : do not sort the result. + * True : sort the result. A TypeError is raised when the + values cannot be compared. .. versionadded:: 0.24.0 @@ -2392,7 +2396,10 @@ def intersection(self, other, sort=False): other = ensure_index(other) if self.equals(other): - return self._get_reconciled_name_object(other) + result = self._get_reconciled_name_object(other) + if sort: + result = result.sort_values() + return result if not is_dtype_equal(self.dtype, other.dtype): this = self.astype('O') diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index cab15e727fa6e..2115135552df2 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2954,6 +2954,8 @@ def intersection(self, other, sort=False): other, result_names = self._convert_can_do_setop(other) if self.equals(other): + if sort: + return self.sort_values() return self self_tuples = self._ndarray_values diff --git a/pandas/tests/indexes/multi/test_set_ops.py b/pandas/tests/indexes/multi/test_set_ops.py index c0c11ccab2a26..eee649de61714 100644 --- a/pandas/tests/indexes/multi/test_set_ops.py +++ b/pandas/tests/indexes/multi/test_set_ops.py @@ -203,10 +203,16 @@ def test_union(idx, sort): # corner case, pass self or empty thing: the_union = idx.union(idx, sort=sort) - assert the_union is idx + if sort: + tm.assert_index_equal(the_union, idx.sort_values()) + else: + assert the_union is idx the_union = idx.union(idx[:0], sort=sort) - assert the_union is idx + if sort: + tm.assert_index_equal(the_union, idx.sort_values()) + else: + assert the_union is idx # won't work in python 3 # tuples = _index.values @@ -238,7 +244,10 @@ def test_intersection(idx, sort): # corner case, pass self the_int = idx.intersection(idx, sort=sort) - assert the_int is idx + if sort: + tm.assert_index_equal(the_int, idx.sort_values()) + else: + assert the_int is idx # empty intersection: disjoint empty = idx[:2].intersection(idx[2:], sort=sort) @@ -251,6 +260,13 @@ def test_intersection(idx, 
sort): # assert result.equals(tuples) +def test_intersect_equal_sort(): + idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']]) + sorted_ = pd.MultiIndex.from_product([[0, 1], ['a', 'b']]) + tm.assert_index_equal(idx.intersection(idx, sort=False), idx) + tm.assert_index_equal(idx.intersection(idx, sort=True), sorted_) + + @pytest.mark.parametrize('slice_', [slice(None), slice(0)]) def test_union_sort_other_empty(slice_): # https://github.com/pandas-dev/pandas/issues/24959 diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index d6a9b3b769965..cff515604179f 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -695,7 +695,10 @@ def test_intersection(self, sort): # Corner cases inter = first.intersection(first, sort=sort) - assert inter is first + if sort: + tm.assert_index_equal(inter, first.sort_values()) + else: + assert inter is first @pytest.mark.parametrize("index2,keeps_name", [ (Index([3, 4, 5, 6, 7], name="index"), True), # preserve same name @@ -770,6 +773,12 @@ def test_intersect_nosort(self): expected = pd.Index(['b', 'a']) tm.assert_index_equal(result, expected) + def test_intersect_equal_sort(self): + idx = pd.Index(['c', 'a', 'b']) + sorted_ = pd.Index(['a', 'b', 'c']) + tm.assert_index_equal(idx.intersection(idx, sort=False), idx) + tm.assert_index_equal(idx.intersection(idx, sort=True), sorted_) + @pytest.mark.parametrize("sort", [True, False]) def test_chained_union(self, sort): # Chained unions handles names correctly From 2a2de250f02f8fb67960d03cead0faf1127666b2 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 29 Jan 2019 21:13:48 -0600 Subject: [PATCH 07/12] whatsnew --- doc/source/whatsnew/v0.24.1.rst | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index 828c35c10e958..19d15b533c61b 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ 
b/doc/source/whatsnew/v0.24.1.rst @@ -15,10 +15,34 @@ Whats New in 0.24.1 (February XX, 2019) These are the changes in pandas 0.24.1. See :ref:`release` for a full changelog including other versions of pandas. +.. _whatsnew_0241.api: + +API Changes +~~~~~~~~~~~ + +Changing the ``sort`` parameter for :meth:`Index.Union` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The default ``sort`` value for :meth:`Index.union` has changed from ``True`` to ``None``. +The default *behavior* remains the same: The result is sorted, unless + +1. ``self`` and ``other`` are identical +2. ``self`` or ``other`` is empty +3. ``self`` or ``other`` contain values that can not be compared (a ``RuntimeWarning`` is raised). + +This allows ``sort=True`` to now mean "always sort" A ``TypeError`` is raised if the values cannot be compared. + +Changed the behavior of :meth:`Index.intersection` with ``sort=True`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When ``sort=True`` is provided to :meth:`Index.intersection`, the values are always sorted. In 0.24.0, +the values would not be sorted when ``self`` and ``other`` were identical. Pass ``sort=False`` to not +sort the values. + .. _whatsnew_0241.regressions: Fixed Regressions -^^^^^^^^^^^^^^^^^ +~~~~~~~~~~~~~~~~~ - Bug in :meth:`DataFrame.itertuples` with ``records`` orient raising an ``AttributeError`` when the ``DataFrame`` contained more than 255 columns (:issue:`24939`) - Bug in :meth:`DataFrame.itertuples` orient converting integer column names to strings prepended with an underscore (:issue:`24940`) @@ -27,7 +51,7 @@ Fixed Regressions .. _whatsnew_0241.enhancements: Enhancements -^^^^^^^^^^^^ +~~~~~~~~~~~~ .. 
_whatsnew_0241.bug_fixes: From 5c3da746dd6232ebe1a41bfc8a7620d48c43bcc7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 30 Jan 2019 07:50:39 -0600 Subject: [PATCH 08/12] update whatsnew --- doc/source/whatsnew/v0.24.1.rst | 53 +++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index 19d15b533c61b..9e09d02d8d2f8 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -20,7 +20,7 @@ including other versions of pandas. API Changes ~~~~~~~~~~~ -Changing the ``sort`` parameter for :meth:`Index.Union` +Changing the ``sort`` parameter for :meth:`Index.union` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The default ``sort`` value for :meth:`Index.union` has changed from ``True`` to ``None``. @@ -30,14 +30,61 @@ The default *behavior* remains the same: The result is sorted, unless 2. ``self`` or ``other`` is empty 3. ``self`` or ``other`` contain values that can not be compared (a ``RuntimeWarning`` is raised). -This allows ``sort=True`` to now mean "always sort" A ``TypeError`` is raised if the values cannot be compared. +This allows ``sort=True`` to now mean "always sort". A ``TypeError`` is raised if the values cannot be compared. + +**Behavior in 0.24.0** + +.. ipython:: python + + In [1]: idx = pd.Index(['b', 'a']) + + In [2]: idx.union(idx) # sort=True was the default. + Out[2]: Index(['b', 'a'], dtype='object') + + In [3]: idx.union(idx, sort=True) # result is still not sorted. + Out[3]: Index(['b', 'a'], dtype='object') + +**New Behavior** + +.. ipython:: python + + idx = pd.Index(['b', 'a']) + idx.union(idx) # sort=None is the default. Don't sort identical operands. 
+ + idx.union(idx, sort=True) Changed the behavior of :meth:`Index.intersection` with ``sort=True`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ When ``sort=True`` is provided to :meth:`Index.intersection`, the values are always sorted. In 0.24.0, the values would not be sorted when ``self`` and ``other`` were identical. Pass ``sort=False`` to not -sort the values. +sort the values. This matches the behavior of pandas 0.23.4 and earlier. + +**Behavior in 0.23.4** + +.. ipython:: python + + In [2]: idx = pd.Index(['b', 'a']) + + In [3]: idx.intersection(idx) # sort was not a keyword. + Out[3]: Index(['b', 'a'], dtype='object') + +**Behavior in 0.24.0** + +.. ipython:: python + + In [5]: idx.intersection(idx) # sort=True by default. Don't sort identical. + Out[5]: Index(['b', 'a'], dtype='object') + + In [6]: idx.intersection(idx, sort=True) + Out[6]: Index(['b', 'a'], dtype='object') + +**New Behavior** + +.. ipython:: python + + idx.intersection(idx) # sort=False by default + idx.intersection(idx, sort=True) .. _whatsnew_0241.regressions: From 52a2f2f567ae6d2e1c38fab937ad00fa09a9d541 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 31 Jan 2019 07:54:15 -0600 Subject: [PATCH 09/12] symdiff --- pandas/core/indexes/base.py | 22 ++++++++++++++++++---- pandas/tests/indexes/test_base.py | 17 +++++++++++++++++ 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index b2dc46dab620c..dc7f1da69bd02 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2500,7 +2500,7 @@ def difference(self, other, sort=True): return this._shallow_copy(the_diff, name=result_name, freq=None) - def symmetric_difference(self, other, result_name=None, sort=True): + def symmetric_difference(self, other, result_name=None, sort=None): """ Compute the symmetric difference of two Index objects. 
@@ -2508,11 +2508,23 @@ def symmetric_difference(self, other, result_name=None, sort=True): ---------- other : Index or array-like result_name : str - sort : bool, default True - Sort the resulting index if possible + sort : bool or None, default None. + Whether to sort the resulting index. By default, the + values are attempted to be sorted, but any TypeError from + incomparable elements is caught by pandas. + + * None : Attempt to sort the result, but catch any TypeErrors + from comparing incomparable elements. + * False : Do not sort the result. + * True : Sort the result, raising a TypeError if any elements + cannot be compared. .. versionadded:: 0.24.0 + .. versionchanged:: 0.24.1 + + Added the `None` option. + Returns ------- symmetric_difference : Index @@ -2556,11 +2568,13 @@ def symmetric_difference(self, other, result_name=None, sort=True): right_diff = other.values.take(right_indexer) the_diff = _concat._concat_compat([left_diff, right_diff]) - if sort: + if sort is None: try: the_diff = sorting.safe_sort(the_diff) except TypeError: pass + elif sort: + the_diff = sorting.safe_sort(the_diff) attribs = self._get_attributes_dict() attribs['name'] = result_name diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index cff515604179f..995a52335ec87 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1099,6 +1099,23 @@ def test_symmetric_difference(self, sort): assert tm.equalContents(result, expected) assert result.name is None + def test_symmetric_difference_incomparable(self): + a = pd.Index([3, pd.Timestamp('2000'), 1]) + b = pd.Index([2, pd.Timestamp('1999'), 1]) + + # sort=None, the default + result = a.symmetric_difference(b) + expected = pd.Index([3, pd.Timestamp('2000'), 2, pd.Timestamp('1999')]) + tm.assert_index_equal(result, expected) + + # sort=False + result = a.symmetric_difference(b, sort=False) + tm.assert_index_equal(result, expected) + + # sort=True, raises + with 
pytest.raises(TypeError, match='Cannot compare'): + a.symmetric_difference(b, sort=True) + @pytest.mark.parametrize("sort", [True, False]) def test_symmetric_difference_mi(self, sort): index1 = MultiIndex.from_tuples(self.tuples) From bb848f19a68ccb3a3a74f366a26dd152d9f4ad12 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 31 Jan 2019 07:56:19 -0600 Subject: [PATCH 10/12] whatsnew --- doc/source/whatsnew/v0.24.1.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index 4045417c6f5e9..14482c93fa67e 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -23,7 +23,7 @@ API Changes Changing the ``sort`` parameter for :meth:`Index.union` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The default ``sort`` value for :meth:`Index.union` has changed from ``True`` to ``None``. +The default ``sort`` value for :meth:`Index.union` has changed from ``True`` to ``None`` (:issue:`24959`). The default *behavior* remains the same: The result is sorted, unless 1. ``self`` and ``other`` are identical @@ -53,6 +53,9 @@ This allows ``sort=True`` to now mean "always sort". A ``TypeError`` is raised i idx.union(idx, sort=True) +The same change applies to :meth:`Index.symmetric_difference`, which would previously not +sort the result when ``sort=True`` but the values could not be compared. 
+ Changed the behavior of :meth:`Index.intersection` with ``sort=True`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From b15dc7e8de3af51fc3daa190b9daa53aa4841db5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 31 Jan 2019 07:57:27 -0600 Subject: [PATCH 11/12] doc --- pandas/core/indexes/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index dc7f1da69bd02..0fa6b168580b4 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2523,7 +2523,8 @@ def symmetric_difference(self, other, result_name=None, sort=None): .. versionchanged:: 0.24.1 - Added the `None` option. + Added the `None` option, which matches the behavior of + pandas 0.23.4 and earlier. Returns ------- From 27b5b16d95bdb211a489fa32e54f85eed4bbead5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 31 Jan 2019 09:05:42 -0600 Subject: [PATCH 12/12] index multi --- doc/source/whatsnew/v0.24.1.rst | 4 +-- pandas/core/indexes/base.py | 25 ++++++++++++---- pandas/core/indexes/multi.py | 4 ++- pandas/tests/indexes/multi/test_set_ops.py | 35 +++++++++++++++++++++- pandas/tests/indexes/test_base.py | 15 +++++++--- 5 files changed, 70 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index 14482c93fa67e..948350df140eb 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -53,8 +53,8 @@ This allows ``sort=True`` to now mean "always sort". A ``TypeError`` is raised i idx.union(idx, sort=True) -The same change applies to :meth:`Index.symmetric_difference`, which would previously not -sort the result when ``sort=True`` but the values could not be compared. +The same change applies to :meth:`Index.difference` and :meth:`Index.symmetric_difference`, which +would previously not sort the result when ``sort=True`` but the values could not be compared. 
Changed the behavior of :meth:`Index.intersection` with ``sort=True`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 0fa6b168580b4..12880ed93cc2a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2447,7 +2447,7 @@ def intersection(self, other, sort=False): return taken - def difference(self, other, sort=True): + def difference(self, other, sort=None): """ Return a new Index with elements from the index that are not in `other`. @@ -2457,11 +2457,24 @@ def difference(self, other, sort=True): Parameters ---------- other : Index or array-like - sort : bool, default True - Sort the resulting index if possible + sort : bool or None, default None + Whether to sort the resulting index. By default, the + values are attempted to be sorted, but any TypeError from + incomparable elements is caught by pandas. + + * None : Attempt to sort the result, but catch any TypeErrors + from comparing incomparable elements. + * False : Do not sort the result. + * True : Sort the result, raising a TypeError if any elements + cannot be compared. .. versionadded:: 0.24.0 + .. versionchanged:: 0.24.1 + + Added the `None` option, which matches the behavior of + pandas 0.23.4 and earlier. + Returns ------- difference : Index @@ -2492,11 +2505,13 @@ def difference(self, other, sort=True): label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) the_diff = this.values.take(label_diff) - if sort: + if sort is None: try: the_diff = sorting.safe_sort(the_diff) except TypeError: pass + elif sort: + the_diff = sorting.safe_sort(the_diff) return this._shallow_copy(the_diff, name=result_name, freq=None) @@ -2508,7 +2523,7 @@ def symmetric_difference(self, other, result_name=None, sort=None): ---------- other : Index or array-like result_name : str - sort : bool or None, default None. + sort : bool or None, default None Whether to sort the resulting index. 
By default, the values are attempted to be sorted, but any TypeError from incomparable elements is caught by pandas. diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 2115135552df2..32a5a09359019 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2973,7 +2973,7 @@ def intersection(self, other, sort=False): return MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0, names=result_names) - def difference(self, other, sort=True): + def difference(self, other, sort=None): """ Compute set difference of two MultiIndex objects @@ -2993,6 +2993,8 @@ def difference(self, other, sort=True): other, result_names = self._convert_can_do_setop(other) if len(other) == 0: + if sort: + return self.sort_values() return self if self.equals(other): diff --git a/pandas/tests/indexes/multi/test_set_ops.py b/pandas/tests/indexes/multi/test_set_ops.py index eee649de61714..6a42e29aa8f5c 100644 --- a/pandas/tests/indexes/multi/test_set_ops.py +++ b/pandas/tests/indexes/multi/test_set_ops.py @@ -174,7 +174,10 @@ def test_difference(idx, sort): # name from empty array result = first.difference([], sort=sort) - assert first.equals(result) + if sort: + assert first.sort_values().equals(result) + else: + assert first.equals(result) assert first.names == result.names # name from non-empty array @@ -189,6 +192,36 @@ def test_difference(idx, sort): first.difference([1, 2, 3, 4, 5], sort=sort) +def test_difference_sort_special(): + idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']]) + # sort=None, the default + result = idx.difference([]) + tm.assert_index_equal(result, idx) + + result = idx.difference([], sort=True) + expected = pd.MultiIndex.from_product([[0, 1], ['a', 'b']]) + tm.assert_index_equal(result, expected) + + +def test_difference_sort_incomparable(): + idx = pd.MultiIndex.from_product([[1, pd.Timestamp('2000'), 2], + ['a', 'b']]) + + other = pd.MultiIndex.from_product([[3, pd.Timestamp('2000'), 4], + ['c', 'd']]) + # 
sort=None, the default + # result = idx.difference(other) + # tm.assert_index_equal(result, idx) + + # sort=False + result = idx.difference(other) + tm.assert_index_equal(result, idx) + + # sort=True, raises + with pytest.raises(TypeError): + idx.difference(other, sort=True) + + @pytest.mark.parametrize("sort", [True, False]) def test_union(idx, sort): piece1 = idx[:5][::-1] diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 995a52335ec87..4e8555cbe1aab 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -3,6 +3,7 @@ from collections import defaultdict from datetime import datetime, timedelta import math +import operator import sys import numpy as np @@ -1099,22 +1100,28 @@ def test_symmetric_difference(self, sort): assert tm.equalContents(result, expected) assert result.name is None - def test_symmetric_difference_incomparable(self): + @pytest.mark.parametrize('opname', ['difference', 'symmetric_difference']) + def test_difference_incomparable(self, opname): a = pd.Index([3, pd.Timestamp('2000'), 1]) b = pd.Index([2, pd.Timestamp('1999'), 1]) + op = operator.methodcaller(opname, b) # sort=None, the default - result = a.symmetric_difference(b) + result = op(a) expected = pd.Index([3, pd.Timestamp('2000'), 2, pd.Timestamp('1999')]) + if opname == 'difference': + expected = expected[:2] tm.assert_index_equal(result, expected) # sort=False - result = a.symmetric_difference(b, sort=False) + op = operator.methodcaller(opname, b, sort=False) + result = op(a) tm.assert_index_equal(result, expected) # sort=True, raises + op = operator.methodcaller(opname, b, sort=True) with pytest.raises(TypeError, match='Cannot compare'): - a.symmetric_difference(b, sort=True) + op(a) @pytest.mark.parametrize("sort", [True, False]) def test_symmetric_difference_mi(self, sort):