From 95474f861c08c0b3d9af441bb05907a2142932e3 Mon Sep 17 00:00:00 2001 From: Jan Rudolph Date: Tue, 10 Oct 2017 17:26:09 +0200 Subject: [PATCH 1/3] BUG: merging with a boolean/int categorical column pandas-dev/pandas#17187 --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/internals.py | 2 +- pandas/tests/reshape/test_merge.py | 43 ++++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index f04410ef63531..1ca2a81967949 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -1010,6 +1010,7 @@ Categorical - Bug in :func:`Series.isin` when called with a categorical (:issue:`16639`) - Bug in the categorical constructor with empty values and categories causing the ``.categories`` to be an empty ``Float64Index`` rather than an empty ``Index`` with object dtype (:issue:`17248`) - Bug in categorical operations with :ref:`Series.cat ` not preserving the original Series' name (:issue:`17509`) +- Bug in :func:`DataFrame.merge` failing for categorical columns with boolean/int data types (:issue:`17187`) PyPy ^^^^ diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 689f5521e1ccb..f6773db8074b2 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -5494,7 +5494,7 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): # preserve these for validation in _concat_compat return self.block.values - if self.block.is_bool: + if self.block.is_bool and not self.block.is_categorical: # External code requested filling/upcasting, bool values must # be upcasted to object to avoid being upcasted to numeric. values = self.block.astype(np.object_).values diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index ed99814afd20a..243517c73a7ed 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -1546,6 +1546,49 @@ def test_dtype_on_categorical_dates(self): result_inner = pd.merge(df, df2, how='inner', on=['date']) assert_frame_equal(result_inner, expected_inner) + def test_merging_with_boolean_cateorical_column(self): + df1 = pd.DataFrame({'id': [1, 2, 3, 4], + 'cat': [False, True, True, False]}) + df1['cat'] = df1['cat'].astype('category', + categories=[True, False], ordered=True) + df2 = pd.DataFrame({'id': [2, 4], 'num': [1, 9]}) + result = df1.merge(df2) + expected = pd.DataFrame({'id': [2, 4], 'cat': [True, False], + 'num': [1, 9]}) + expected['cat'] = expected['cat'].astype('category', + categories=[True, False], + ordered=True) + assert_frame_equal(expected, result) + + def test_merging_with_integer_cateorical_column(self): + df1 = pd.DataFrame({'id': [1, 2, 3, 4], + 'cat': [2, 1, 1, 2]}) + df1['cat'] = df1['cat'].astype('category', + categories=[1, 2], ordered=True) + df2 = pd.DataFrame({'id': [2, 4], 'num': [1, 9]}) + result = df1.merge(df2) + expected = pd.DataFrame({'id': [2, 4], 'cat': [1, 2], + 'num': [1, 9]}) + expected['cat'] = expected['cat'].astype('category', + categories=[1, 2], + ordered=True) + assert_frame_equal(expected, result) + + def test_merging_with_string_cateorical_column(self): + df1 = pd.DataFrame({'id': [1, 2, 3, 4], + 'cat': ['False', 'True', 'True', 'False']}) + df1['cat'] = df1['cat'].astype('category', + categories=['True', 'False'], + ordered=True) + df2 = pd.DataFrame({'id': [2, 4], 'num': [1, 9]}) + result = df1.merge(df2) + expected = pd.DataFrame({'id': [2, 4], 'cat': ['True', 'False'], + 'num': [1, 9]}) + expected['cat'] = expected['cat'].astype('category', + categories=['True', 'False'], + ordered=True) + assert_frame_equal(expected, result) + @pytest.fixture def left_df(): From 58071ffa0e613776315f88779ce6ec374726750e Mon Sep 17 00:00:00 2001 From: Jan Rudolph Date: Tue, 10 Oct 2017 20:31:12 +0200 Subject: [PATCH 2/3] add gh issue number to tests --- pandas/tests/reshape/test_merge.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index 243517c73a7ed..5c357df610dca 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -1547,6 +1547,8 @@ def test_dtype_on_categorical_dates(self): assert_frame_equal(result_inner, expected_inner) def test_merging_with_boolean_cateorical_column(self): + # GH 17187 + # merging with a boolean/int categorical column df1 = pd.DataFrame({'id': [1, 2, 3, 4], 'cat': [False, True, True, False]}) df1['cat'] = df1['cat'].astype('category', @@ -1561,6 +1563,8 @@ def test_merging_with_boolean_cateorical_column(self): assert_frame_equal(expected, result) def test_merging_with_integer_cateorical_column(self): + # GH 17187 + # merging with a boolean/int categorical column df1 = pd.DataFrame({'id': [1, 2, 3, 4], 'cat': [2, 1, 1, 2]}) df1['cat'] = df1['cat'].astype('category', @@ -1575,6 +1579,8 @@ def test_merging_with_integer_cateorical_column(self): assert_frame_equal(expected, result) def test_merging_with_string_cateorical_column(self): + # GH 17187 + # merging with a boolean/int categorical column df1 = pd.DataFrame({'id': [1, 2, 3, 4], 'cat': ['False', 'True', 'True', 'False']}) df1['cat'] = df1['cat'].astype('category', From 6f5b637313bcb26e5aec20a43908d16d5752750e Mon Sep 17 00:00:00 2001 From: Jan Rudolph Date: Fri, 13 Oct 2017 15:00:37 +0200 Subject: [PATCH 3/3] parametrize test --- pandas/tests/reshape/test_merge.py | 51 ++++++++---------------------- 1 file changed, 13 insertions(+), 38 deletions(-) diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index 5c357df610dca..81956c0bd5b28 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -1546,52 +1546,27 @@ def test_dtype_on_categorical_dates(self): result_inner = pd.merge(df, df2, how='inner', on=['date']) assert_frame_equal(result_inner, expected_inner) - def test_merging_with_boolean_cateorical_column(self): + @pytest.mark.parametrize('category_column,categories,expected_categories', + [([False, True, True, False], [True, False], + [True, False]), + ([2, 1, 1, 2], [1, 2], [1, 2]), + (['False', 'True', 'True', 'False'], + ['True', 'False'], ['True', 'False'])]) + def test_merging_with_bool_or_int_cateorical_column(self, category_column, + categories, + expected_categories): # GH 17187 # merging with a boolean/int categorical column df1 = pd.DataFrame({'id': [1, 2, 3, 4], - 'cat': [False, True, True, False]}) + 'cat': category_column}) df1['cat'] = df1['cat'].astype('category', - categories=[True, False], ordered=True) + categories=categories, ordered=True) df2 = pd.DataFrame({'id': [2, 4], 'num': [1, 9]}) result = df1.merge(df2) - expected = pd.DataFrame({'id': [2, 4], 'cat': [True, False], + expected = pd.DataFrame({'id': [2, 4], 'cat': expected_categories, 'num': [1, 9]}) expected['cat'] = expected['cat'].astype('category', - categories=[True, False], - ordered=True) - assert_frame_equal(expected, result) - - def test_merging_with_integer_cateorical_column(self): - # GH 17187 - # merging with a boolean/int categorical column - df1 = pd.DataFrame({'id': [1, 2, 3, 4], - 'cat': [2, 1, 1, 2]}) - df1['cat'] = df1['cat'].astype('category', - categories=[1, 2], ordered=True) - df2 = pd.DataFrame({'id': [2, 4], 'num': [1, 9]}) - result = df1.merge(df2) - expected = pd.DataFrame({'id': [2, 4], 'cat': [1, 2], - 'num': [1, 9]}) - expected['cat'] = expected['cat'].astype('category', - categories=[1, 2], - ordered=True) - assert_frame_equal(expected, result) - - def test_merging_with_string_cateorical_column(self): - # GH 17187 - # merging with a boolean/int categorical column - df1 = pd.DataFrame({'id': [1, 2, 3, 4], - 'cat': ['False', 'True', 'True', 'False']}) - df1['cat'] = df1['cat'].astype('category', - categories=['True', 'False'], - ordered=True) - df2 = pd.DataFrame({'id': [2, 4], 'num': [1, 9]}) - result = df1.merge(df2) - expected = pd.DataFrame({'id': [2, 4], 'cat': ['True', 'False'], - 'num': [1, 9]}) - expected['cat'] = expected['cat'].astype('category', - categories=['True', 'False'], + categories=categories, ordered=True) assert_frame_equal(expected, result)