From b8f27e2d7a415b4414f4e58aa7748c258ea4c559 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 12 Sep 2020 01:04:17 +0200 Subject: [PATCH 01/41] Fix issues in Index.union with duplicate index values --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/indexes/base.py | 64 +++++++++++++++---------------- pandas/tests/indexes/test_base.py | 25 ++++++++++++ 3 files changed, 57 insertions(+), 33 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 2aac2596c18cb..5407883b40472 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -272,6 +272,7 @@ Indexing - Bug in :meth:`PeriodIndex.get_loc` incorrectly raising ``ValueError`` on non-datelike strings instead of ``KeyError``, causing similar errors in :meth:`Series.__geitem__`, :meth:`Series.__contains__`, and :meth:`Series.loc.__getitem__` (:issue:`34240`) - Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order. (:issue:`35584`) +- Bug in :meth:`Index.union` dropped duplicate Index values when Index was not monotonic or ``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`) - Missing diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8014b16d07b01..a8c780c50390b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2622,41 +2622,39 @@ def _union(self, other, sort): lvals = self._values rvals = other._values - if sort is None and self.is_monotonic and other.is_monotonic: - try: - result = self._outer_indexer(lvals, rvals)[0] - except TypeError: - # incomparable objects - result = list(lvals) - - # worth making this faster? a very unusual case - value_set = set(lvals) - result.extend([x for x in rvals if x not in value_set]) - result = Index(result)._values # do type inference here - else: - # find indexes of things in "other" that are not in "self" - if self.is_unique: - indexer = self.get_indexer(other) - indexer = (indexer == -1).nonzero()[0] - else: - indexer = algos.unique1d(self.get_indexer_non_unique(other)[1]) - - if len(indexer) > 0: - other_diff = algos.take_nd(rvals, indexer, allow_fill=False) - result = concat_compat((lvals, other_diff)) - + # if sort is None and self.is_monotonic and other.is_monotonic: + try: + if sort is None and self.is_monotonic and other.is_monotonic: + result = self._outer_indexer(np.sort(lvals), np.sort(rvals))[0] else: - result = lvals - - if sort is None: - try: - result = algos.safe_sort(result) - except TypeError as err: - warnings.warn( - f"{err}, sort order is undefined for incomparable objects", - RuntimeWarning, - stacklevel=3, + new_index_sorted = self._outer_indexer(np.sort(lvals), np.sort(rvals)) + l_reindexer = self.unique().reindex(new_index_sorted[0])[1] + if l_reindexer is None: + result = lvals + else: + l_result = self.unique().take(np.sort(l_reindexer[l_reindexer != -1])) + r_reindexer = other.unique().reindex(new_index_sorted[0])[1] + r_result = other.unique().take( + np.sort(r_reindexer[new_index_sorted[1] == -1]) ) + result = np.append(l_result, r_result) + except TypeError: + # incomparable objects + result = list(lvals) + + # worth making this faster? a very unusual case + value_set = set(lvals) + result.extend([x for x in rvals if x not in value_set]) + result = Index(result)._values # do type inference here + if sort is None and (not self.is_monotonic or not other.is_monotonic): + try: + result = algos.safe_sort(result) + except TypeError as err: + warnings.warn( + f"{err}, sort order is undefined for incomparable objects", + RuntimeWarning, + stacklevel=3, + ) # for subclasses return self._wrap_setop_result(other, result) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 7720db9d98ebf..ac0295dd0fb70 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -2613,3 +2613,28 @@ def construct(dtype): no_matches = np.array([-1] * 6, dtype=np.intp) tm.assert_numpy_array_equal(result[0], no_matches) tm.assert_numpy_array_equal(result[1], no_matches) + + +def test_union_with_duplicate_index(): + # GH: 36289 + idx1 = Index([1, 0, 0]) + idx2 = Index([0, 1]) + expected = pd.Index([0, 0, 1]) + + result = idx1.union(idx2) + tm.assert_index_equal(result, expected) + + result = idx2.union(idx1) + tm.assert_index_equal(result, expected) + + +def test_union_duplicate_index_subsets_of_each_other(): + # GH: 31326 + a = pd.Index([1, 2, 2, 3]) + b = pd.Index([3, 3, 4]) + expected = Index([1, 2, 2, 3, 3, 4]) + + result = a.union(b) + tm.assert_index_equal(result, expected) + result = a.union(b, sort=False) + tm.assert_index_equal(result, expected) From 070abad8d2103e30eb789069bb6ebafa03a19541 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 12 Sep 2020 01:11:18 +0200 Subject: [PATCH 02/41] Run black pandas --- pandas/core/indexes/base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a8c780c50390b..3ab86e5f74c5d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2627,12 +2627,15 @@ def _union(self, other, sort): if sort is None and self.is_monotonic and other.is_monotonic: result = self._outer_indexer(np.sort(lvals), np.sort(rvals))[0] else: + # We calculate the sorted union and resort it afterwards new_index_sorted = self._outer_indexer(np.sort(lvals), np.sort(rvals)) l_reindexer = self.unique().reindex(new_index_sorted[0])[1] if l_reindexer is None: result = lvals else: - l_result = self.unique().take(np.sort(l_reindexer[l_reindexer != -1])) + l_result = self.unique().take( + np.sort(l_reindexer[l_reindexer != -1]) + ) r_reindexer = other.unique().reindex(new_index_sorted[0])[1] r_result = other.unique().take( np.sort(r_reindexer[new_index_sorted[1] == -1]) From 7a905562771ae7bfba35a5433fe8e869f72ed8e2 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 14 Sep 2020 20:18:53 +0200 Subject: [PATCH 03/41] Parametrize tests --- pandas/tests/indexes/test_base.py | 22 ----------------- pandas/tests/indexes/test_setops.py | 37 ++++++++++++++++++++++++++++- 2 files changed, 36 insertions(+), 23 deletions(-) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index ac0295dd0fb70..aead898efad40 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -2615,26 +2615,4 @@ def construct(dtype): tm.assert_numpy_array_equal(result[1], no_matches) -def test_union_with_duplicate_index(): - # GH: 36289 - idx1 = Index([1, 0, 0]) - idx2 = Index([0, 1]) - expected = pd.Index([0, 0, 1]) - result = idx1.union(idx2) - tm.assert_index_equal(result, expected) - - result = idx2.union(idx1) - tm.assert_index_equal(result, expected) - - -def test_union_duplicate_index_subsets_of_each_other(): - # GH: 31326 - a = pd.Index([1, 2, 2, 3]) - b = pd.Index([3, 3, 4]) - expected = Index([1, 2, 2, 3, 3, 4]) - - result = a.union(b) - tm.assert_index_equal(result, expected) - result = a.union(b, sort=False) - tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 1a40fe550be61..8a0962bcdbcfa 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -8,7 +8,7 @@ from pandas.core.dtypes.common import is_dtype_equal import pandas as pd -from pandas import Float64Index, Int64Index, RangeIndex, UInt64Index +from pandas import CategoricalIndex, DatetimeIndex, Float64Index, Int64Index, RangeIndex, UInt64Index, TimedeltaIndex import pandas._testing as tm from pandas.api.types import pandas_dtype @@ -95,3 +95,38 @@ def test_union_dtypes(left, right, expected): b = pd.Index([], dtype=right) result = (a | b).dtype assert result == expected + + +def test_union_duplicate_index_subsets_of_each_other(): + # GH: 31326 + a = pd.Index([1, 2, 2, 3]) + b = pd.Index([3, 3, 4]) + expected = pd.Index([1, 2, 2, 3, 3, 4]) + + result = a.union(b) + tm.assert_index_equal(result, expected) + result = a.union(b, sort=False) + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize( + "func", + [ + (Int64Index), + (Float64Index), + (DatetimeIndex), + (CategoricalIndex), + (TimedeltaIndex), + ] +) +def test_union_with_duplicate_index(func): + # GH: 36289 + idx1 = func([1, 0, 0]) + idx2 = func([0, 1]) + expected = func([0, 0, 1]) + + result = idx1.union(idx2) + tm.assert_index_equal(result, expected) + + result = idx2.union(idx1) + tm.assert_index_equal(result, expected) From dbcd0a7d051cc2698d110e4295ae08bfa6401d0d Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 14 Sep 2020 20:32:27 +0200 Subject: [PATCH 04/41] Fix small bug and Pep8 issues --- pandas/core/indexes/base.py | 2 +- pandas/tests/indexes/test_base.py | 3 --- pandas/tests/indexes/test_setops.py | 10 +++++++++- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3ab86e5f74c5d..239e0431ade00 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2625,7 +2625,7 @@ def _union(self, other, sort): # if sort is None and self.is_monotonic and other.is_monotonic: try: if sort is None and self.is_monotonic and other.is_monotonic: - result = self._outer_indexer(np.sort(lvals), np.sort(rvals))[0] + result = self._outer_indexer(lvals, rvals)[0] else: # We calculate the sorted union and resort it afterwards new_index_sorted = self._outer_indexer(np.sort(lvals), np.sort(rvals)) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index aead898efad40..7720db9d98ebf 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -2613,6 +2613,3 @@ def construct(dtype): no_matches = np.array([-1] * 6, dtype=np.intp) tm.assert_numpy_array_equal(result[0], no_matches) tm.assert_numpy_array_equal(result[1], no_matches) - - - diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 8a0962bcdbcfa..a6e16d2b77c30 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -8,7 +8,15 @@ from pandas.core.dtypes.common import is_dtype_equal import pandas as pd -from pandas import CategoricalIndex, DatetimeIndex, Float64Index, Int64Index, RangeIndex, UInt64Index, TimedeltaIndex +from pandas import ( + CategoricalIndex, + DatetimeIndex, + Float64Index, + Int64Index, + RangeIndex, + UInt64Index, + TimedeltaIndex +) import pandas._testing as tm from pandas.api.types import pandas_dtype From 7a9aec77b382cdf753563c3e4bb708e532a270f0 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 14 Sep 2020 22:17:25 +0200 Subject: [PATCH 05/41] Change import order --- pandas/tests/indexes/test_setops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index a6e16d2b77c30..bb049a605859d 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -14,8 +14,8 @@ Float64Index, Int64Index, RangeIndex, + TimedeltaIndex, UInt64Index, - TimedeltaIndex ) import pandas._testing as tm from pandas.api.types import pandas_dtype From 01d55c624771d69b1b355a18996f70e773f1dff7 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 14 Sep 2020 22:57:01 +0200 Subject: [PATCH 06/41] Fix black bug --- pandas/tests/indexes/test_setops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index bb049a605859d..d7af2358dfe4c 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -125,7 +125,7 @@ def test_union_duplicate_index_subsets_of_each_other(): (DatetimeIndex), (CategoricalIndex), (TimedeltaIndex), - ] + ], ) def test_union_with_duplicate_index(func): # GH: 36289 From 84630784e77e3191a254cccdf9c8a61b229338d8 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 15 Sep 2020 10:11:03 +0200 Subject: [PATCH 07/41] Implement reviews --- pandas/core/indexes/base.py | 1 - pandas/tests/indexes/test_setops.py | 53 ++++++++++++++++++++--------- 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 239e0431ade00..7c5ff8efcc646 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2622,7 +2622,6 @@ def _union(self, other, sort): lvals = self._values rvals = other._values - # if sort is None and self.is_monotonic and other.is_monotonic: try: if sort is None and self.is_monotonic and other.is_monotonic: result = self._outer_indexer(lvals, rvals)[0] diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index d7af2358dfe4c..9e81f9d9f041e 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -15,7 +15,7 @@ Int64Index, RangeIndex, TimedeltaIndex, - UInt64Index, + UInt64Index, Index, ) import pandas._testing as tm from pandas.api.types import pandas_dtype @@ -105,12 +105,24 @@ def test_union_dtypes(left, right, expected): assert result == expected -def test_union_duplicate_index_subsets_of_each_other(): +@pytest.mark.parametrize( + "cls", + [ + Int64Index, + Float64Index, + DatetimeIndex, + CategoricalIndex, + TimedeltaIndex, + lambda x: Index(x, dtype=object) + ], +) +def test_union_duplicate_index_subsets_of_each_other(cls): # GH: 31326 - a = pd.Index([1, 2, 2, 3]) - b = pd.Index([3, 3, 4]) - expected = pd.Index([1, 2, 2, 3, 3, 4]) - + a = cls([1, 2, 2, 3]) + b = cls([3, 3, 4]) + expected = cls([1, 2, 2, 3, 3, 4]) + if cls is CategoricalIndex: + expected = Index([1, 2, 2, 3, 3, 4], dtype="object") result = a.union(b) tm.assert_index_equal(result, expected) result = a.union(b, sort=False) @@ -118,23 +130,32 @@ def test_union_duplicate_index_subsets_of_each_other(): @pytest.mark.parametrize( - "func", + "cls", [ - (Int64Index), - (Float64Index), - (DatetimeIndex), - (CategoricalIndex), - (TimedeltaIndex), + Int64Index, + Float64Index, + DatetimeIndex, + CategoricalIndex, + TimedeltaIndex, + lambda x: Index(x, dtype=object) ], ) -def test_union_with_duplicate_index(func): +def test_union_with_duplicate_index(cls): # GH: 36289 - idx1 = func([1, 0, 0]) - idx2 = func([0, 1]) - expected = func([0, 0, 1]) + idx1 = cls([1, 0, 0]) + idx2 = cls([0, 1]) + expected = cls([0, 0, 1]) result = idx1.union(idx2) tm.assert_index_equal(result, expected) result = idx2.union(idx1) tm.assert_index_equal(result, expected) + + +def test_union_duplicate_index_different_dtypes(): + idx1 = Index([1, 2, 2, 3]) + idx2 = Index(["1", "0", "0"]) + expected = Index([1, 2, 2, 3, "1", "0", "0"]) + result = idx1.union(idx2, sort=False) + tm.assert_index_equal(result, expected) From cddd7f974dde08fac7ed89e7ee773c7e014ead86 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 15 Sep 2020 10:15:10 +0200 Subject: [PATCH 08/41] Move code away from try except --- pandas/core/indexes/base.py | 26 ++++++++++++++------------ pandas/tests/indexes/test_setops.py | 7 ++++--- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 7c5ff8efcc646..9369affa2dcf2 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2627,27 +2627,29 @@ def _union(self, other, sort): result = self._outer_indexer(lvals, rvals)[0] else: # We calculate the sorted union and resort it afterwards - new_index_sorted = self._outer_indexer(np.sort(lvals), np.sort(rvals)) - l_reindexer = self.unique().reindex(new_index_sorted[0])[1] + result = self._outer_indexer(np.sort(lvals), np.sort(rvals)) + except TypeError: + # incomparable objects + result = list(lvals) + + # worth making this faster? a very unusual case + value_set = set(lvals) + result.extend([x for x in rvals if x not in value_set]) + result = Index(result)._values # do type inference here + else: + if sort is False or not self.is_monotonic or not other.is_monotonic: + l_reindexer = self.unique().reindex(result[0])[1] if l_reindexer is None: result = lvals else: l_result = self.unique().take( np.sort(l_reindexer[l_reindexer != -1]) ) - r_reindexer = other.unique().reindex(new_index_sorted[0])[1] + r_reindexer = other.unique().reindex(result[0])[1] r_result = other.unique().take( - np.sort(r_reindexer[new_index_sorted[1] == -1]) + np.sort(r_reindexer[result[1] == -1]) ) result = np.append(l_result, r_result) - except TypeError: - # incomparable objects - result = list(lvals) - - # worth making this faster? a very unusual case - value_set = set(lvals) - result.extend([x for x in rvals if x not in value_set]) - result = Index(result)._values # do type inference here if sort is None and (not self.is_monotonic or not other.is_monotonic): try: result = algos.safe_sort(result) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 9e81f9d9f041e..a7e8d09c15e0e 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -15,7 +15,8 @@ Int64Index, RangeIndex, TimedeltaIndex, - UInt64Index, Index, + UInt64Index, + Index, ) import pandas._testing as tm from pandas.api.types import pandas_dtype @@ -113,7 +114,7 @@ def test_union_dtypes(left, right, expected): DatetimeIndex, CategoricalIndex, TimedeltaIndex, - lambda x: Index(x, dtype=object) + lambda x: Index(x, dtype=object), ], ) def test_union_duplicate_index_subsets_of_each_other(cls): @@ -137,7 +138,7 @@ def test_union_duplicate_index_subsets_of_each_other(cls): DatetimeIndex, CategoricalIndex, TimedeltaIndex, - lambda x: Index(x, dtype=object) + lambda x: Index(x, dtype=object), ], ) def test_union_with_duplicate_index(cls): From 3c6b07905a9cd07f05b8da93c36afbd0638717dc Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 15 Sep 2020 13:20:41 +0200 Subject: [PATCH 09/41] Catch r_reindexer None --- pandas/core/indexes/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9369affa2dcf2..9bcbf24232bd7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2639,13 +2639,15 @@ def _union(self, other, sort): else: if sort is False or not self.is_monotonic or not other.is_monotonic: l_reindexer = self.unique().reindex(result[0])[1] + r_reindexer = other.unique().reindex(result[0])[1] if l_reindexer is None: result = lvals + elif r_reindexer is None: + result = rvals else: l_result = self.unique().take( np.sort(l_reindexer[l_reindexer != -1]) ) - r_reindexer = other.unique().reindex(result[0])[1] r_result = other.unique().take( np.sort(r_reindexer[result[1] == -1]) ) From 90056b754a077bea3ab199cb4307384a3c5629bd Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 15 Sep 2020 15:44:19 +0200 Subject: [PATCH 10/41] Resort imports... --- pandas/tests/indexes/test_setops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index a7e8d09c15e0e..6ced036c82035 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -12,11 +12,11 @@ CategoricalIndex, DatetimeIndex, Float64Index, + Index, Int64Index, RangeIndex, TimedeltaIndex, UInt64Index, - Index, ) import pandas._testing as tm from pandas.api.types import pandas_dtype From 46b7c6c5a96c6c3443d6bc2202f63813b2f0acd4 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 3 Oct 2020 04:02:05 +0200 Subject: [PATCH 11/41] Simplify resorting --- pandas/core/indexes/base.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9bcbf24232bd7..994988f81ddee 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2638,20 +2638,13 @@ def _union(self, other, sort): result = Index(result)._values # do type inference here else: if sort is False or not self.is_monotonic or not other.is_monotonic: - l_reindexer = self.unique().reindex(result[0])[1] - r_reindexer = other.unique().reindex(result[0])[1] - if l_reindexer is None: - result = lvals - elif r_reindexer is None: - result = rvals - else: - l_result = self.unique().take( - np.sort(l_reindexer[l_reindexer != -1]) - ) - r_result = other.unique().take( - np.sort(r_reindexer[result[1] == -1]) - ) - result = np.append(l_result, r_result) + values = [] + dc = dict(zip(*np.unique(result[0], return_counts=True))) + unique_array = algos.unique(np.append(lvals, rvals)) + # Create indexer to resort result + for i, value in enumerate(unique_array): + values += [i] * dc[value] + result = unique_array.take(values) if sort is None and (not self.is_monotonic or not other.is_monotonic): try: result = algos.safe_sort(result) From 9f55905548194e391713efe949e43edacb19eeb6 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 3 Oct 2020 04:03:20 +0200 Subject: [PATCH 12/41] Rename variable --- pandas/core/indexes/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 994988f81ddee..fa423903e2269 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2638,13 +2638,13 @@ def _union(self, other, sort): result = Index(result)._values # do type inference here else: if sort is False or not self.is_monotonic or not other.is_monotonic: - values = [] + indexer = [] dc = dict(zip(*np.unique(result[0], return_counts=True))) unique_array = algos.unique(np.append(lvals, rvals)) # Create indexer to resort result for i, value in enumerate(unique_array): - values += [i] * dc[value] - result = unique_array.take(values) + indexer += [i] * dc[value] + result = unique_array.take(indexer) if sort is None and (not self.is_monotonic or not other.is_monotonic): try: result = algos.safe_sort(result) From 97fe91f74cdc9faec073a22366efd9f649a5bcaa Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 3 Oct 2020 04:10:55 +0200 Subject: [PATCH 13/41] Rename variable --- pandas/core/indexes/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4962ceab5c4c9..289dffb5c786b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2675,11 +2675,11 @@ def _union(self, other, sort): else: if sort is False or not self.is_monotonic or not other.is_monotonic: indexer = [] - dc = dict(zip(*np.unique(result[0], return_counts=True))) + counts = dict(zip(*np.unique(result[0], return_counts=True))) unique_array = algos.unique(np.append(lvals, rvals)) # Create indexer to resort result for i, value in enumerate(unique_array): - indexer += [i] * dc[value] + indexer += [i] * counts[value] result = unique_array.take(indexer) if sort is None and (not self.is_monotonic or not other.is_monotonic): try: From e724ade38c88e4da50c269723a8544cc79d6582b Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 25 Oct 2020 20:12:04 +0100 Subject: [PATCH 14/41] Move resorting to algos --- pandas/core/algorithms.py | 25 +++++++++++++++++++++++++ pandas/core/indexes/base.py | 16 ++++------------ 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index d2005d46bbbf1..1474a5555d1e7 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2115,3 +2115,28 @@ def sort_mixed(values): np.putmask(new_codes, mask, na_sentinel) return ordered, ensure_platform_int(new_codes) + + +def resort_union_after_inputs(union_values, lvals, rvals) -> np.ndarray: + """ + Elements from union_values are resorted after the order of + lvals and then rvals, if element is not in lvals. All occurrences + are placed at the spot of the first occurrence of this element. + + Parameters + ---------- + union_values: np.array which is a sorted union of lvals and rvals + lvals: np.ndarray of the left values which is ordered in front. + rvals: np.ndarray of the right values ordered after lvals. + + Returns + ------- + np.ndarray containing the resorted values from union_values + """ + indexer = [] + counts = dict(zip(*np.unique(union_values, return_counts=True))) + unique_array = unique(np.append(lvals, rvals)) + # Create indexer to resort result + for i, value in enumerate(unique_array): + indexer += [i] * counts[value] + return unique_array.take(indexer) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 289dffb5c786b..bbf4f7ad9fd50 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2659,11 +2659,8 @@ def _union(self, other, sort): rvals = other._values try: - if sort is None and self.is_monotonic and other.is_monotonic: - result = self._outer_indexer(lvals, rvals)[0] - else: - # We calculate the sorted union and resort it afterwards - result = self._outer_indexer(np.sort(lvals), np.sort(rvals)) + # If one of both is not monotonic, we resort it afterwards + result = self._outer_indexer(np.sort(lvals), np.sort(rvals))[0] except TypeError: # incomparable objects result = list(lvals) @@ -2674,13 +2671,8 @@ def _union(self, other, sort): result = Index(result)._values # do type inference here else: if sort is False or not self.is_monotonic or not other.is_monotonic: - indexer = [] - counts = dict(zip(*np.unique(result[0], return_counts=True))) - unique_array = algos.unique(np.append(lvals, rvals)) - # Create indexer to resort result - for i, value in enumerate(unique_array): - indexer += [i] * counts[value] - result = unique_array.take(indexer) + result = algos.resort_union_after_inputs(result, lvals, rvals) + if sort is None and (not self.is_monotonic or not other.is_monotonic): try: result = algos.safe_sort(result) From c36ccff772d857d54dcf144eb3cdae501877f653 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 25 Oct 2020 20:29:42 +0100 Subject: [PATCH 15/41] Add test and improve doc --- pandas/core/algorithms.py | 4 ++-- pandas/tests/test_algos.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 71bb85f013a7c..d1358ba7dd6f9 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2134,8 +2134,8 @@ def sort_mixed(values): def resort_union_after_inputs(union_values, lvals, rvals) -> np.ndarray: """ - Elements from union_values are resorted after the order of - lvals and then rvals, if element is not in lvals. All occurrences + Elements from union_values are resorted after the ranking of the first occurrence in + lvals and then rvals, if element is not in lvals. All occurrences of this element are placed at the spot of the first occurrence of this element. Parameters diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index ee8e2385fe698..c8d86ce4baa37 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2312,3 +2312,13 @@ def test_diff_ea_axis(self): msg = "cannot diff DatetimeArray on axis=1" with pytest.raises(ValueError, match=msg): algos.diff(dta, 1, axis=1) + + +def test_resort_union_after_inputs(): + # GH: 36299 + union_values = np.array([1, 1, 2, 3, 3, 4]) + lvals = np.array([[3, 1, 4]]) + rvals = np.array([[2, 3, 1]]) + result = algos.resort_union_after_inputs(union_values, lvals, rvals) + expected = np.array([3, 3, 1, 1, 4, 2]) + tm.assert_numpy_array_equal(result, expected) From 10627782a5f4de8c47b781c0fa75ee0e10b01298 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 25 Oct 2020 22:21:14 +0100 Subject: [PATCH 16/41] Fix pattern issues --- pandas/tests/indexes/test_setops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 6ced036c82035..cf61dee2c78d0 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -100,8 +100,8 @@ def test_compatible_inconsistent_pairs(idx_fact1, idx_fact2): def test_union_dtypes(left, right, expected): left = pandas_dtype(left) right = pandas_dtype(right) - a = pd.Index([], dtype=left) - b = pd.Index([], dtype=right) + a = Index([], dtype=left) + b = Index([], dtype=right) result = (a | b).dtype assert result == expected From cf0641849b4df1bcb9ac0f258ed10ffccb676e31 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 25 Oct 2020 22:24:47 +0100 Subject: [PATCH 17/41] Add check --- pandas/core/indexes/base.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6e81e7108e71f..5a210d0ecfcc7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2648,7 +2648,12 @@ def _union(self, other, sort): result.extend([x for x in rvals if x not in value_set]) result = Index(result)._values # do type inference here else: - if sort is False or not self.is_monotonic or not other.is_monotonic: + if ( + sort is False + or not self.is_monotonic + or not other.is_monotonic + or is_categorical_dtype(self) + ): result = algos.resort_union_after_inputs(result, lvals, rvals) if sort is None and (not self.is_monotonic or not other.is_monotonic): From 6f0440811615f5f8680e20f666006f7f3daac3cb Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 22 Nov 2020 01:21:21 +0100 Subject: [PATCH 18/41] Rename function --- pandas/core/algorithms.py | 2 +- pandas/core/indexes/base.py | 2 +- pandas/tests/test_algos.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index d1358ba7dd6f9..96acf42a590b9 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2132,7 +2132,7 @@ def sort_mixed(values): return ordered, ensure_platform_int(new_codes) -def resort_union_after_inputs(union_values, lvals, rvals) -> np.ndarray: +def re_sort_union_after_inputs(union_values, lvals, rvals) -> np.ndarray: """ Elements from union_values are resorted after the ranking of the first occurrence in lvals and then rvals, if element is not in lvals. All occurrences of this element diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5a210d0ecfcc7..02728a19ca18b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2654,7 +2654,7 @@ def _union(self, other, sort): or not other.is_monotonic or is_categorical_dtype(self) ): - result = algos.resort_union_after_inputs(result, lvals, rvals) + result = algos.re_sort_union_after_inputs(result, lvals, rvals) if sort is None and (not self.is_monotonic or not other.is_monotonic): try: diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index c8d86ce4baa37..4b1f2a62af85d 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2319,6 +2319,6 @@ def test_resort_union_after_inputs(): union_values = np.array([1, 1, 2, 3, 3, 4]) lvals = np.array([[3, 1, 4]]) rvals = np.array([[2, 3, 1]]) - result = algos.resort_union_after_inputs(union_values, lvals, rvals) + result = algos.re_sort_union_after_inputs(union_values, lvals, rvals) expected = np.array([3, 3, 1, 1, 4, 2]) tm.assert_numpy_array_equal(result, expected) From 167d695f1e42822e1f5334bec7d56fa63b6e0536 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 22 Nov 2020 01:21:43 +0100 Subject: [PATCH 19/41] Adjust whatsnew --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 4e5e04409cddf..2f8850fcd7c76 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -433,7 +433,7 @@ Indexing - Bug in :meth:`DataFrame.__getitem__` and :meth:`DataFrame.loc.__getitem__` with :class:`IntervalIndex` columns and a numeric indexer (:issue:`26490`) - Bug in :meth:`Series.loc.__getitem__` with a non-unique :class:`MultiIndex` and an empty-list indexer (:issue:`13691`) - Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`MultiIndex` with a level named "0" (:issue:`37194`) -- Bug in :meth:`Index.union` dropped duplicate Index values when Index was not monotonic or ``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`) +- Bug in :meth:`Index.union` dropping duplicate Index values when Index was not monotonic or ``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`) Missing ^^^^^^^ From 0b3b54801d7a63f4944b40808570bd343839a1e2 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 22 Nov 2020 01:31:20 +0100 Subject: [PATCH 20/41] Change gh reference --- pandas/tests/indexes/test_setops.py | 5 +++-- pandas/tests/test_algos.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index cf61dee2c78d0..dc7b9b24b533e 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -118,7 +118,7 @@ def test_union_dtypes(left, right, expected): ], ) def test_union_duplicate_index_subsets_of_each_other(cls): - # GH: 31326 + # GH#31326 a = cls([1, 2, 2, 3]) b = cls([3, 3, 4]) expected = cls([1, 2, 2, 3, 3, 4]) @@ -142,7 +142,7 @@ def test_union_duplicate_index_subsets_of_each_other(cls): ], ) def test_union_with_duplicate_index(cls): - # GH: 36289 + # GH#36289 idx1 = cls([1, 0, 0]) idx2 = cls([0, 1]) expected = cls([0, 0, 1]) @@ -155,6 +155,7 @@ def test_union_with_duplicate_index(cls): def test_union_duplicate_index_different_dtypes(): + # GH#36289 idx1 = Index([1, 2, 2, 3]) idx2 = Index(["1", "0", "0"]) expected = Index([1, 2, 2, 3, "1", "0", "0"]) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 4b1f2a62af85d..3809c6a2c3f7a 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2315,7 +2315,7 @@ def test_diff_ea_axis(self): def test_resort_union_after_inputs(): - # GH: 36299 + # GH#36299 union_values = np.array([1, 1, 2, 3, 3, 4]) lvals = np.array([[3, 1, 4]]) rvals = np.array([[2, 3, 1]]) From 59dbdf6f0df7b580a7848a575ce7acee6695e1ae Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 29 Nov 2020 18:41:01 +0100 Subject: [PATCH 21/41] Remove pd --- pandas/tests/indexes/test_setops.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index d5d72de88405d..c352f6c9cff58 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -124,10 +124,10 @@ def test_dunder_inplace_setops_deprecated(index): @pytest.mark.parametrize("values", [[1, 2, 2, 3], [3, 3]]) def test_intersection_duplicates(values): # GH#31326 - a = pd.Index(values) - b = pd.Index([3, 3]) + a = Index(values) + b = Index([3, 3]) result = a.intersection(b) - expected = pd.Index([3]) + expected = Index([3]) tm.assert_index_equal(result, expected) From 51131be8e6a9d49b1152bf6b87bbc204246bb9ff Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 30 Nov 2020 01:25:54 +0100 Subject: [PATCH 22/41] Adress review comments --- pandas/core/algorithms.py | 16 ++++++++++------ pandas/core/indexes/base.py | 2 +- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 5e60d22b95dec..c70b9fd66a714 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2197,22 +2197,26 @@ def make_duplicates_of_left_unique_in_right( def re_sort_union_after_inputs(union_values, lvals, rvals) -> np.ndarray: """ - Elements from union_values are resorted after the ranking of the first occurrence in - lvals and then rvals, if element is not in lvals. All occurrences of this element + Elements from union_values are re-sorted after the ranking of the first occurrence + in lvals and then rvals, if element is not in lvals. All occurrences of this element are placed at the spot of the first occurrence of this element. Parameters ---------- - union_values: np.array which is a sorted union of lvals and rvals - lvals: np.ndarray of the left values which is ordered in front. - rvals: np.ndarray of the right values ordered after lvals. + union_values: np.array + sorted union of lvals and rvals + lvals: np.ndarray + left values which is ordered in front. + rvals: np.ndarray + right values ordered after lvals. Returns ------- np.ndarray containing the resorted values from union_values """ indexer = [] - counts = dict(zip(*np.unique(union_values, return_counts=True))) + # counts = dict(zip(*np.unique(union_values, return_counts=True))) + counts = value_counts(union_values) unique_array = unique(np.append(lvals, rvals)) # Create indexer to resort result for i, value in enumerate(unique_array): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 558bbe70983fa..81a05bd3091b1 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2729,7 +2729,7 @@ def _union(self, other, sort): rvals = other._values try: - # If one of both is not monotonic, we resort it afterwards + # If one or both is not monotonic, we resort it afterwards result = self._outer_indexer(np.sort(lvals), np.sort(rvals))[0] except TypeError: # incomparable objects From a1887c72f8995c0e88fb01064d31f1946ab1ca44 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 21 Dec 2020 21:03:46 +0100 Subject: [PATCH 23/41] Move whatsnew --- doc/source/whatsnew/v1.2.0.rst | 1 - doc/source/whatsnew/v1.3.0.rst | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e9ebe6c35c01b..bbdcd183f65e1 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -693,7 +693,6 @@ Indexing - Bug in :meth:`Series.at` returning :class:`Series` with one element instead of scalar when index is a :class:`MultiIndex` with one level (:issue:`38053`) - Bug in :meth:`DataFrame.loc` returning and assigning elements in wrong order when indexer is differently ordered than the :class:`MultiIndex` to filter (:issue:`31330`, :issue:`34603`) - Bug in :meth:`DataFrame.loc` and :meth:`DataFrame.__getitem__` raising ``KeyError`` when columns were :class:`MultiIndex` with only one level (:issue:`29749`) -- Bug in :meth:`Index.union` dropping duplicate Index values when Index was not monotonic or ``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`) - Bug in :meth:`Series.__getitem__` and :meth:`DataFrame.__getitem__` raising blank ``KeyError`` without missing keys for :class:`IntervalIndex` (:issue:`27365`) - Bug in setting a new label on a :class:`DataFrame` or :class:`Series` with a :class:`CategoricalIndex` incorrectly raising ``TypeError`` when the new label is not among the index's categories (:issue:`38098`) - Bug in :meth:`Series.loc` and :meth:`Series.iloc` raising ``ValueError`` when inserting a listlike ``np.array``, ``list`` or ``tuple`` in an ``object`` Series of equal length (:issue:`37748`, :issue:`37486`) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 7671962018144..2b2196f3aef29 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -215,6 +215,8 @@ Interval Indexing ^^^^^^^^ + +- Bug in :meth:`Index.union` dropping duplicate Index values when Index was not monotonic or ``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`) - Bug in :meth:`CategoricalIndex.get_indexer` failing to raise ``InvalidIndexError`` when non-unique (:issue:`38372`) - Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`) - Bug in :meth:`DataFrame.iloc.__setitem__` and :meth:`DataFrame.loc.__setitem__` with mixed dtypes when setting with a dictionary value (:issue:`38335`) From fe41a6fd3d33e0ae5de672a1ebdfdbf3682b1ff1 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 21 Dec 2020 21:04:34 +0100 Subject: [PATCH 24/41] Fix gh reference --- pandas/tests/test_algos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 72a02cdeebd3b..513f520d0f329 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2424,7 +2424,7 @@ def test_make_duplicates_of_left_unique_in_right(left_values): def test_resort_union_after_inputs(): - # GH#36299 + # GH#36289 union_values = np.array([1, 1, 2, 3, 3, 4]) lvals = np.array([[3, 1, 4]]) rvals = np.array([[2, 3, 1]]) From b63a7320281316bc392ed60a9c05651020253b86 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 22 Dec 2020 21:05:38 +0100 Subject: [PATCH 25/41] Remove comment and fix test --- pandas/core/algorithms.py | 1 - pandas/tests/indexes/test_setops.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 5a08436d6eb4d..f0721c1c66c69 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2242,7 +2242,6 @@ def re_sort_union_after_inputs(union_values, lvals, rvals) -> np.ndarray: np.ndarray containing the resorted values from union_values """ indexer = [] - # counts = dict(zip(*np.unique(union_values, return_counts=True))) counts = value_counts(union_values) unique_array = unique(np.append(lvals, rvals)) # Create indexer to resort result diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 065365fa6b6be..c78c19910deb1 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -466,7 +466,7 @@ def test_union_duplicate_index_subsets_of_each_other(cls): b = cls([3, 3, 4]) expected = cls([1, 2, 2, 3, 3, 4]) if cls is CategoricalIndex: - expected = Index([1, 2, 2, 3, 3, 4], dtype="object") + expected = Index([1, 2, 2, 3, 3, 4]) result = a.union(b) tm.assert_index_equal(result, expected) result = a.union(b, sort=False) From fa49dfee665776ecaafa70414f6d2914f270f161 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 22 Dec 2020 21:18:10 +0100 Subject: [PATCH 26/41] Add test for concat issue --- pandas/tests/reshape/concat/test_index.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index 3fc886893b55a..c7ee39c8e0e33 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -259,3 +259,17 @@ def test_concat_multiindex_dfs_with_deepcopy(self): tm.assert_frame_equal(result_copy, expected) result_no_copy = pd.concat(example_dict, names=["testname"]) tm.assert_frame_equal(result_no_copy, expected) + + @pytest.mark.parametrize( + "idx, values", [([1, 0, 0], ["y", "z", "x"]), ([0, 0, 1], ["x", "y", "z"])] + ) + def test_concat_duplicates_in_index(self, idx, values): + # GH#31308 + right = DataFrame({"right": ["x", "y", "z"]}, index=idx) + left = DataFrame({"left": ["a", "b"]}) + result = concat([left, right], axis=1) + + expected = DataFrame( + {"left": ["a", "a", "b"], "right": values}, index=sorted(idx) + ) + tm.assert_frame_equal(result, expected) From 2a6f6edaf3c6e38f2b8cdf63b494b76f58bb157b Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 22 Dec 2020 21:24:14 +0100 Subject: [PATCH 27/41] Add whatsnew --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 2b2196f3aef29..52da369f31c85 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -268,7 +268,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ -- +- Bug in :func:`concat` raising ``ValueError`` when index of at least one of both inputs was non-unique and non-monotonic (:issue.`31308`) - Sparse From d80949cf2b6f58f5085a7b4df8c8ec744776bde1 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 4 Jan 2021 13:33:12 +0100 Subject: [PATCH 28/41] Remove concat test --- pandas/tests/reshape/concat/test_index.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index c7ee39c8e0e33..3fc886893b55a 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -259,17 +259,3 @@ def test_concat_multiindex_dfs_with_deepcopy(self): tm.assert_frame_equal(result_copy, expected) result_no_copy = pd.concat(example_dict, names=["testname"]) tm.assert_frame_equal(result_no_copy, expected) - - @pytest.mark.parametrize( - "idx, values", [([1, 0, 0], ["y", "z", "x"]), ([0, 0, 1], ["x", "y", "z"])] - ) - def test_concat_duplicates_in_index(self, idx, values): - # GH#31308 - right = DataFrame({"right": ["x", "y", "z"]}, index=idx) - left = DataFrame({"left": ["a", "b"]}) - result = concat([left, right], axis=1) - - expected = DataFrame( - {"left": ["a", "a", "b"], "right": values}, index=sorted(idx) - ) - tm.assert_frame_equal(result, expected) From b57f00f4fe547dd98b90ad9f4d3e38d5b09c0b3f Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 5 Jan 2021 21:26:37 +0100 Subject: [PATCH 29/41] Add dropna --- pandas/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 091c5be353beb..c671c09e2038f 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2225,7 +2225,7 @@ def re_sort_union_after_inputs(union_values, lvals, rvals) -> np.ndarray: np.ndarray containing the resorted values from union_values """ indexer = [] - counts = value_counts(union_values) + counts = value_counts(union_values, dropna=False) unique_array = unique(np.append(lvals, rvals)) # Create indexer to resort result for i, value in enumerate(unique_array): From aa4533a76d4321af8ca560dbfdcb87373259f929 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 5 Jan 2021 22:17:29 +0100 Subject: [PATCH 30/41] Fix join func --- pandas/_libs/join.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 1b79d68c13570..fc93b130d61ba 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -537,7 +537,7 @@ def outer_join_indexer(ndarray[join_t] left, ndarray[join_t] right): lval = left[i] rval = right[j] - if lval == rval: + if lval == rval or np.isnan(lval) and np.isnan(rval): count += 1 if i < nleft - 1: if j < nright - 1 and right[j + 1] == rval: @@ -601,7 +601,7 @@ def outer_join_indexer(ndarray[join_t] left, ndarray[join_t] right): lval = left[i] rval = right[j] - if lval == rval: + if lval == rval or np.isnan(lval) and np.isnan(rval): lindexer[count] = i rindexer[count] = j result[count] = lval From 8fd90a36c5ae9dc838698c0252cc74df256533a2 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 5 Jan 2021 23:26:47 +0100 Subject: [PATCH 31/41] Fix bug --- pandas/_libs/join.pyx | 4 ++-- pandas/core/algorithms.py | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index fc93b130d61ba..1b79d68c13570 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -537,7 +537,7 @@ def outer_join_indexer(ndarray[join_t] left, ndarray[join_t] right): lval = left[i] rval = right[j] - if lval == rval or np.isnan(lval) and np.isnan(rval): + if lval == rval: count += 1 if i < nleft - 1: if j < nright - 1 and right[j + 1] == rval: @@ -601,7 +601,7 @@ def outer_join_indexer(ndarray[join_t] left, ndarray[join_t] right): lval = left[i] rval = right[j] - if lval == rval or np.isnan(lval) and np.isnan(rval): + if lval == rval: lindexer[count] = i rindexer[count] = j result[count] = lval diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c671c09e2038f..dd71c343cd91f 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2225,9 +2225,10 @@ def re_sort_union_after_inputs(union_values, lvals, rvals) -> np.ndarray: np.ndarray containing the resorted values from union_values """ indexer = [] - counts = value_counts(union_values, dropna=False) + x = value_counts(lvals, dropna=False) + y = value_counts(rvals, dropna=False) unique_array = unique(np.append(lvals, rvals)) # Create indexer to resort result for i, value in enumerate(unique_array): - indexer += [i] * counts[value] + indexer += [i] * max(x[value] if value in x.index else 0, y[value] if value in y.index else 0) return unique_array.take(indexer) From 091942a658b1de978529ccfbcfd42af7984160c9 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 5 Jan 2021 23:39:21 +0100 Subject: [PATCH 32/41] Fix bug --- pandas/core/algorithms.py | 3 +- pandas/core/indexes/base.py | 61 +++++++++++++++++++++---------------- 2 files changed, 37 insertions(+), 27 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index dd71c343cd91f..f55ef2f6753a0 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2227,8 +2227,9 @@ def re_sort_union_after_inputs(union_values, lvals, rvals) -> np.ndarray: indexer = [] x = value_counts(lvals, dropna=False) y = value_counts(rvals, dropna=False) + z = x._align_series(y, fill_value=0) unique_array = unique(np.append(lvals, rvals)) # Create indexer to resort result for i, value in enumerate(unique_array): - indexer += [i] * max(x[value] if value in x.index else 0, y[value] if value in y.index else 0) + indexer += [i] * int(max(z[0][value], z[1][value])) return unique_array.take(indexer) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 36e242ce7456b..99484b6e204f4 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2815,35 +2815,44 @@ def _union(self, other, sort): lvals = self._values rvals = other._values - try: - # If one or both is not monotonic, we resort it afterwards - result = self._outer_indexer(np.sort(lvals), np.sort(rvals))[0] - except TypeError: - # incomparable objects - result = list(lvals) + if sort is None and self.is_monotonic and other.is_monotonic and (self.is_unique or other.is_unique): + try: + result = self._outer_indexer(lvals, rvals)[0] + except TypeError: + # incomparable objects + result = list(lvals) - # worth making this faster? a very unusual case - value_set = set(lvals) - result.extend([x for x in rvals if x not in value_set]) - result = Index(result)._values # do type inference here + # worth making this faster? a very unusual case + value_set = set(lvals) + result.extend([x for x in rvals if x not in value_set]) + result = Index(result)._values # do type inference here else: - if ( - sort is False - or not self.is_monotonic - or not other.is_monotonic - or is_categorical_dtype(self) - ): - result = algos.re_sort_union_after_inputs(result, lvals, rvals) + # find indexes of things in "other" that are not in "self" + if other.is_unique: + if self.is_unique: + indexer = self.get_indexer(other) + missing = (indexer == -1).nonzero()[0] + else: + missing = algos.unique1d(self.get_indexer_non_unique(other)[1]) - if sort is None and (not self.is_monotonic or not other.is_monotonic): - try: - result = algos.safe_sort(result) - except TypeError as err: - warnings.warn( - f"{err}, sort order is undefined for incomparable objects", - RuntimeWarning, - stacklevel=3, - ) + if len(missing) > 0: + other_diff = algos.take_nd(rvals, missing, allow_fill=False) + result = concat_compat((lvals, other_diff)) + else: + result = lvals + else: + result = algos.re_sort_union_after_inputs(lvals, lvals, rvals) + + + if sort is None and (not self.is_monotonic or not other.is_monotonic): + try: + result = algos.safe_sort(result) + except TypeError as err: + warnings.warn( + f"{err}, sort order is undefined for incomparable objects", + RuntimeWarning, + stacklevel=3, + ) return result From 446eb507b8f8c9dbcaff4f86243cecfeefff1f02 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 6 Jan 2021 01:03:06 +0100 Subject: [PATCH 33/41] Refactor code and add tests --- pandas/core/algorithms.py | 20 ++++++++------------ pandas/core/indexes/base.py | 6 +++--- pandas/tests/indexes/test_setops.py | 18 ++++++++++++++++++ pandas/tests/test_algos.py | 9 ++++----- 4 files changed, 33 insertions(+), 20 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index f55ef2f6753a0..694156f24b689 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2205,16 +2205,13 @@ def _sort_tuples(values: np.ndarray[tuple]): return values[indexer] -def re_sort_union_after_inputs(union_values, lvals, rvals) -> np.ndarray: +def union_with_duplicates(lvals, rvals) -> np.ndarray: """ - Elements from union_values are re-sorted after the ranking of the first occurrence - in lvals and then rvals, if element is not in lvals. All occurrences of this element - are placed at the spot of the first occurrence of this element. + Extracts the union from lvals and rvals with respect to duplicates and nans in + both arrays. Parameters ---------- - union_values: np.array - sorted union of lvals and rvals lvals: np.ndarray left values which is ordered in front. rvals: np.ndarray @@ -2222,14 +2219,13 @@ def re_sort_union_after_inputs(union_values, lvals, rvals) -> np.ndarray: Returns ------- - np.ndarray containing the resorted values from union_values + np.ndarray containing the unsorted union of noth arrays """ indexer = [] - x = value_counts(lvals, dropna=False) - y = value_counts(rvals, dropna=False) - z = x._align_series(y, fill_value=0) + l_count = value_counts(lvals, dropna=False) + r_count = value_counts(rvals, dropna=False) + l_count, r_count = l_count._align_series(r_count, fill_value=0) unique_array = unique(np.append(lvals, rvals)) - # Create indexer to resort result for i, value in enumerate(unique_array): - indexer += [i] * int(max(z[0][value], z[1][value])) + indexer += [i] * int(max(l_count[value], r_count[value])) return unique_array.take(indexer) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 99484b6e204f4..59096406656fc 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2815,7 +2815,7 @@ def _union(self, other, sort): lvals = self._values rvals = other._values - if sort is None and self.is_monotonic and other.is_monotonic and (self.is_unique or other.is_unique): + if sort is None and self.is_monotonic and other.is_monotonic and not (self.has_duplicates and other.has_duplicates): try: result = self._outer_indexer(lvals, rvals)[0] except TypeError: @@ -2827,8 +2827,8 @@ def _union(self, other, sort): result.extend([x for x in rvals if x not in value_set]) result = Index(result)._values # do type inference here else: - # find indexes of things in "other" that are not in "self" if other.is_unique: + # find indexes of things in "other" that are not in "self" if self.is_unique: indexer = self.get_indexer(other) missing = (indexer == -1).nonzero()[0] @@ -2841,7 +2841,7 @@ def _union(self, other, sort): else: result = lvals else: - result = algos.re_sort_union_after_inputs(lvals, lvals, rvals) + result = algos.union_with_duplicates(lvals, rvals) if sort is None and (not self.is_monotonic or not other.is_monotonic): diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index f7abe85e0eba1..4fbf69445c4c0 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -542,3 +542,21 @@ def test_union_duplicate_index_different_dtypes(): expected = Index([1, 2, 2, 3, "1", "0", "0"]) result = idx1.union(idx2, sort=False) tm.assert_index_equal(result, expected) + + +def test_union_same_value_duplicated_in_both(): + # GH#36289 + idx1 = Index([0, 0, 1]) + idx2 = Index([0, 0, 1, 2]) + result = idx1.union(idx2) + expected = Index([0, 0, 1, 2]) + tm.assert_index_equal(result, expected) + + +def test_union_nan_in_both(): + # GH#36289 + idx1 = Index([np.nan, 1, 2, 2]) + idx2 = Index([np.nan, 1, 1, 2]) + result = idx1.union(idx2, sort=False) + expected = Index([np.nan, 1.0, 1.0, 2.0, 2.0]) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 85df40fdb66f2..9feb471bc8b31 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2411,11 +2411,10 @@ def test_diff_ea_axis(self): algos.diff(dta, 1, axis=1) -def test_resort_union_after_inputs(): +def test_union_with_duplicates(): # GH#36289 - union_values = np.array([1, 1, 2, 3, 3, 4]) - lvals = np.array([[3, 1, 4]]) - rvals = np.array([[2, 3, 1]]) - result = algos.re_sort_union_after_inputs(union_values, lvals, rvals) + lvals = np.array([3, 1, 3, 4]) + rvals = np.array([2, 3, 1, 1]) + result = algos.union_with_duplicates(lvals, rvals) expected = np.array([3, 3, 1, 1, 4, 2]) tm.assert_numpy_array_equal(result, expected) From 6b8fa64f56c0232abc773e971bbc170cf0f94662 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 6 Jan 2021 01:10:36 +0100 Subject: [PATCH 34/41] Run Black --- pandas/core/algorithms.py | 2 +- pandas/core/indexes/base.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 694156f24b689..da16cfa7c98ae 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2219,7 +2219,7 @@ def union_with_duplicates(lvals, rvals) -> np.ndarray: Returns ------- - np.ndarray containing the unsorted union of noth arrays + np.ndarray containing the unsorted union of both arrays """ indexer = [] l_count = value_counts(lvals, dropna=False) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 59096406656fc..e4b2eb0a69b78 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2815,7 +2815,12 @@ def _union(self, other, sort): lvals = self._values rvals = other._values - if sort is None and self.is_monotonic and other.is_monotonic and not (self.has_duplicates and other.has_duplicates): + if ( + sort is None + and self.is_monotonic + and other.is_monotonic + and not (self.has_duplicates and other.has_duplicates) + ): try: result = self._outer_indexer(lvals, rvals)[0] except TypeError: @@ -2843,7 +2848,6 @@ def _union(self, other, sort): else: result = algos.union_with_duplicates(lvals, rvals) - if sort is None and (not self.is_monotonic or not other.is_monotonic): try: result = algos.safe_sort(result) From 6cabad894c30bdfe594c394fed3a1ab540ad1841 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 9 Jan 2021 23:44:20 +0100 Subject: [PATCH 35/41] Adress review --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/tests/indexes/test_setops.py | 26 +++++++++++++------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 0131351379e60..f24da08df58aa 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -242,7 +242,7 @@ Interval Indexing ^^^^^^^^ -- Bug in :meth:`Index.union` dropping duplicate Index values when Index was not monotonic or ``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`) +- Bug in :meth:`Index.union` dropping duplicate ``Index`` values when ``Index`` was not monotonic or ``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`) - Bug in :meth:`CategoricalIndex.get_indexer` failing to raise ``InvalidIndexError`` when non-unique (:issue:`38372`) - Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`) - Bug in :meth:`DataFrame.loc`, :meth:`Series.loc`, :meth:`DataFrame.__getitem__` and :meth:`Series.__getitem__` returning incorrect elements for non-monotonic :class:`DatetimeIndex` for string slices (:issue:`33146`) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 4fbf69445c4c0..c0edc1790f2d3 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -524,39 +524,39 @@ def test_union_duplicate_index_subsets_of_each_other(cls): ) def test_union_with_duplicate_index(cls): # GH#36289 - idx1 = cls([1, 0, 0]) - idx2 = cls([0, 1]) + a = cls([1, 0, 0]) + b = cls([0, 1]) expected = cls([0, 0, 1]) - result = idx1.union(idx2) + result = a.union(b) tm.assert_index_equal(result, expected) - result = idx2.union(idx1) + result = a.union(b) tm.assert_index_equal(result, expected) def test_union_duplicate_index_different_dtypes(): # GH#36289 - idx1 = Index([1, 2, 2, 3]) - idx2 = Index(["1", "0", "0"]) + a = Index([1, 2, 2, 3]) + b = Index(["1", "0", "0"]) expected = Index([1, 2, 2, 3, "1", "0", "0"]) - result = idx1.union(idx2, sort=False) + result = a.union(b, sort=False) tm.assert_index_equal(result, expected) def test_union_same_value_duplicated_in_both(): # GH#36289 - idx1 = Index([0, 0, 1]) - idx2 = Index([0, 0, 1, 2]) - result = idx1.union(idx2) + a = Index([0, 0, 1]) + b = Index([0, 0, 1, 2]) + result = a.union(b) expected = Index([0, 0, 1, 2]) tm.assert_index_equal(result, expected) def test_union_nan_in_both(): # GH#36289 - idx1 = Index([np.nan, 1, 2, 2]) - idx2 = Index([np.nan, 1, 1, 2]) - result = idx1.union(idx2, sort=False) + a = Index([np.nan, 1, 2, 2]) + b = Index([np.nan, 1, 1, 2]) + result = a.union(b, sort=False) expected = Index([np.nan, 1.0, 1.0, 2.0, 2.0]) tm.assert_index_equal(result, expected) From 25885d4e2e654ae157f3a40b5e73216d32501cd2 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 22 Jan 2021 18:53:44 +0100 Subject: [PATCH 36/41] Merge master and adjust condition --- pandas/core/indexes/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 182f00ed08dfc..8cc7773866a09 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2896,7 +2896,8 @@ def _union(self, other, sort): else: result = algos.union_with_duplicates(lvals, rvals) - result = _maybe_try_sort(result, sort) + if not self.is_monotonic or not other.is_monotonic: + result = _maybe_try_sort(result, sort) return result From ccaa0c188da4adca9aaf07d14232b699ed9977df Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 13 Feb 2021 21:33:05 +0100 Subject: [PATCH 37/41] Remove unused import --- pandas/tests/indexes/test_setops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 2db9b27adba5a..835ae20e06ed1 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -7,7 +7,6 @@ from pandas.core.dtypes.common import is_dtype_equal -import pandas as pd from pandas import ( CategoricalIndex, DatetimeIndex, From f4ee4667be06266320f5746fedfa820e849c8135 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 21 Feb 2021 21:13:17 +0100 Subject: [PATCH 38/41] Reformat code --- pandas/core/indexes/base.py | 38 +++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e76388797cc07..0e4b54e9bcc37 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2869,7 +2869,7 @@ def _union(self, other, sort): and not (self.has_duplicates and other.has_duplicates) ): try: - result = self._outer_indexer(lvals, rvals)[0] + return self._outer_indexer(lvals, rvals)[0] except (TypeError, IncompatibleFrequency): # incomparable objects result = list(lvals) @@ -2877,26 +2877,28 @@ def _union(self, other, sort): # worth making this faster? a very unusual case value_set = set(lvals) result.extend([x for x in rvals if x not in value_set]) - result = Index(result)._values # do type inference here + return Index(result)._values # do type inference here + + elif not other.is_unique and not self.is_unique: + result = algos.union_with_duplicates(lvals, rvals) + return _maybe_try_sort(result, sort) + + # Either other or self is not unique + # find indexes of things in "other" that are not in "self" + if self.is_unique: + indexer = self.get_indexer(other) + missing = (indexer == -1).nonzero()[0] else: - if other.is_unique: - # find indexes of things in "other" that are not in "self" - if self.is_unique: - indexer = self.get_indexer(other) - missing = (indexer == -1).nonzero()[0] - else: - missing = algos.unique1d(self.get_indexer_non_unique(other)[1]) + missing = algos.unique1d(self.get_indexer_non_unique(other)[1]) - if len(missing) > 0: - other_diff = algos.take_nd(rvals, missing, allow_fill=False) - result = concat_compat((lvals, other_diff)) - else: - result = lvals - else: - result = algos.union_with_duplicates(lvals, rvals) + if len(missing) > 0: + other_diff = algos.take_nd(rvals, missing, allow_fill=False) + result = concat_compat((lvals, other_diff)) + else: + result = lvals - if not self.is_monotonic or not other.is_monotonic: - result = _maybe_try_sort(result, sort) + if not self.is_monotonic or not other.is_monotonic: + result = _maybe_try_sort(result, sort) return result From e92ab7afb7e12a29471cc50111e7072f2b7caf36 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 22 Feb 2021 21:20:24 +0100 Subject: [PATCH 39/41] Add comments and refactor code --- pandas/core/algorithms.py | 8 ++++---- pandas/core/indexes/base.py | 8 +++++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 254f1c2576453..5b3fdd8667b09 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2279,16 +2279,16 @@ def _sort_tuples(values: np.ndarray): return values[indexer] -def union_with_duplicates(lvals, rvals) -> np.ndarray: +def union_with_duplicates(lvals: np.ndarray, rvals: np.ndarray) -> np.ndarray: """ Extracts the union from lvals and rvals with respect to duplicates and nans in both arrays. Parameters ---------- - lvals: np.ndarray + lvals: left values which is ordered in front. - rvals: np.ndarray + rvals: right values ordered after lvals. Returns @@ -2298,7 +2298,7 @@ def union_with_duplicates(lvals, rvals) -> np.ndarray: indexer = [] l_count = value_counts(lvals, dropna=False) r_count = value_counts(rvals, dropna=False) - l_count, r_count = l_count._align_series(r_count, fill_value=0) + l_count, r_count = l_count.align(r_count, fill_value=0) unique_array = unique(np.append(lvals, rvals)) for i, value in enumerate(unique_array): indexer += [i] * int(max(l_count[value], r_count[value])) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 0db96b019e910..4ff5190991cb0 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2941,18 +2941,20 @@ def _union(self, other, sort): and other.is_monotonic and not (self.has_duplicates and other.has_duplicates) ): + # Both are unique and monotonic, so can use outer join try: return self._outer_indexer(lvals, rvals)[0] except (TypeError, IncompatibleFrequency): # incomparable objects - result = list(lvals) + value_list = list(lvals) # worth making this faster? a very unusual case value_set = set(lvals) - result.extend([x for x in rvals if x not in value_set]) - return Index(result)._values # do type inference here + value_list.extend([x for x in rvals if x not in value_set]) + return Index(value_list)._values # do type inference here elif not other.is_unique and not self.is_unique: + # self and other both have duplicates result = algos.union_with_duplicates(lvals, rvals) return _maybe_try_sort(result, sort) From 7ffa07af0a756c094e5410810c79ded6a74e3be1 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 2 Mar 2021 21:33:31 +0100 Subject: [PATCH 40/41] Adress review --- pandas/core/algorithms.py | 6 ++++-- pandas/tests/indexes/period/test_setops.py | 22 ++++++++++++++++++++++ pandas/tests/indexes/test_setops.py | 13 ++++++++----- 3 files changed, 34 insertions(+), 7 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 5b3fdd8667b09..af3809f30a26d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2286,9 +2286,9 @@ def union_with_duplicates(lvals: np.ndarray, rvals: np.ndarray) -> np.ndarray: Parameters ---------- - lvals: + lvals: np.ndarray left values which is ordered in front. - rvals: + rvals: np.ndarray right values ordered after lvals. Returns @@ -2300,6 +2300,8 @@ def union_with_duplicates(lvals: np.ndarray, rvals: np.ndarray) -> np.ndarray: r_count = value_counts(rvals, dropna=False) l_count, r_count = l_count.align(r_count, fill_value=0) unique_array = unique(np.append(lvals, rvals)) + if is_extension_array_dtype(lvals) or is_extension_array_dtype(rvals): + unique_array = array(unique_array) for i, value in enumerate(unique_array): indexer += [i] * int(max(l_count[value], r_count[value])) return unique_array.take(indexer) diff --git a/pandas/tests/indexes/period/test_setops.py b/pandas/tests/indexes/period/test_setops.py index fcd2c7d3422e1..dd62d99c4708a 100644 --- a/pandas/tests/indexes/period/test_setops.py +++ b/pandas/tests/indexes/period/test_setops.py @@ -348,3 +348,25 @@ def test_intersection_equal_duplicates(self): idx_dup = idx.append(idx) result = idx_dup.intersection(idx_dup) tm.assert_index_equal(result, idx) + + def test_union_duplicates(self): + # GH#36289 + idx = pd.period_range("2011-01-01", periods=2) + idx_dup = idx.append(idx) + + idx2 = pd.period_range("2011-01-02", periods=2) + idx2_dup = idx2.append(idx2) + result = idx_dup.union(idx2_dup) + + expected = PeriodIndex( + [ + "2011-01-01", + "2011-01-01", + "2011-01-02", + "2011-01-02", + "2011-01-03", + "2011-01-03", + ], + freq="D", + ) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index d05bd5dfee9a7..5cff23943b57d 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -508,8 +508,10 @@ def check_intersection_commutative(left, right): Float64Index, DatetimeIndex, CategoricalIndex, + lambda x: CategoricalIndex(x, categories=set(x)), TimedeltaIndex, lambda x: Index(x, dtype=object), + UInt64Index, ], ) def test_union_duplicate_index_subsets_of_each_other(cls): @@ -517,7 +519,7 @@ def test_union_duplicate_index_subsets_of_each_other(cls): a = cls([1, 2, 2, 3]) b = cls([3, 3, 4]) expected = cls([1, 2, 2, 3, 3, 4]) - if cls is CategoricalIndex: + if isinstance(a, CategoricalIndex): expected = Index([1, 2, 2, 3, 3, 4]) result = a.union(b) tm.assert_index_equal(result, expected) @@ -536,7 +538,7 @@ def test_union_duplicate_index_subsets_of_each_other(cls): lambda x: Index(x, dtype=object), ], ) -def test_union_with_duplicate_index(cls): +def test_union_with_duplicate_index_and_non_monotonic(cls): # GH#36289 a = cls([1, 0, 0]) b = cls([0, 1]) @@ -567,12 +569,13 @@ def test_union_same_value_duplicated_in_both(): tm.assert_index_equal(result, expected) -def test_union_nan_in_both(): +@pytest.mark.parametrize("dup", [1, np.nan]) +def test_union_nan_in_both(dup): # GH#36289 a = Index([np.nan, 1, 2, 2]) - b = Index([np.nan, 1, 1, 2]) + b = Index([np.nan, dup, 1, 2]) result = a.union(b, sort=False) - expected = Index([np.nan, 1.0, 1.0, 2.0, 2.0]) + expected = Index([np.nan, dup, 1.0, 2.0, 2.0]) tm.assert_index_equal(result, expected) From 0af939d8a964a8039fc9f94444be14c18fbed1e4 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 2 Mar 2021 21:43:04 +0100 Subject: [PATCH 41/41] Fix merge introduced missing object --- pandas/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4bac9b6f6fa84..04eef635dc79b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1890,7 +1890,7 @@ def union_with_duplicates(lvals: np.ndarray, rvals: np.ndarray) -> np.ndarray: l_count, r_count = l_count.align(r_count, fill_value=0) unique_array = unique(np.append(lvals, rvals)) if is_extension_array_dtype(lvals) or is_extension_array_dtype(rvals): - unique_array = array(unique_array) + unique_array = pd_array(unique_array) for i, value in enumerate(unique_array): indexer += [i] * int(max(l_count[value], r_count[value])) return unique_array.take(indexer)