From 5819628adf8db36a1c63cb58d36bdeab830a966f Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 28 Nov 2020 16:08:51 +0000 Subject: [PATCH 1/2] ENH: Categorical.unique can keep same dtype --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/arrays/categorical.py | 29 +++++++++++++++---- .../arrays/categorical/test_analytics.py | 21 +++++++++++++- 3 files changed, 45 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 6f046d3a9379d..3c30d7a9b8c23 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -254,6 +254,7 @@ Other enhancements - :func:`read_csv` supports memory-mapping for compressed files (:issue:`37621`) - Improve error reporting for :meth:`DataFrame.merge` when invalid merge column definitions were given (:issue:`16228`) - Improve numerical stability for :meth:`.Rolling.skew`, :meth:`.Rolling.kurt`, :meth:`Expanding.skew` and :meth:`Expanding.kurt` through implementation of Kahan summation (:issue:`6929`) +- :meth:`Categorical.unique` has a new parameter ``remove_unused_categories``, which if set to ``False``, keeps the dtype of the original categorical (:issue:`xxxxx`) - Improved error reporting for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`) - Implement method ``cross`` for :meth:`DataFrame.merge` and :meth:`DataFrame.join` (:issue:`5401`) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 62e508c491740..487a12853f61a 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2035,16 +2035,24 @@ def mode(self, dropna=True): # ------------------------------------------------------------------ # ExtensionArray Interface - def unique(self): + def unique(self, remove_unused_categories: bool = True) -> "Categorical": """ Return the ``Categorical`` which ``categories`` and ``codes`` are - unique. Unused categories are NOT returned. + unique. 
By default, unused categories are NOT returned. - unordered category: values and categories are sorted by appearance order. - ordered category: values are sorted by appearance order, categories keeps existing order. + Parameters + ---------- + remove_unused_categories : bool, default True + If True, unused categories are not returned. + If False, the input dtype is returned unchanged. + + .. versionadded:: 1.2.0 + Returns ------- unique values : ``Categorical`` @@ -2075,13 +2083,24 @@ def unique(self): ... ).unique() ['b', 'a', 'c'] Categories (3, object): ['a' < 'b' < 'c'] + + By default, unused categories are removed, but this can be changed: + + >>> cat = pd.Categorical(list("baab"), categories=list("abc"), ordered=True) + >>> cat.unique() + ['b', 'a'] + Categories (2, object): ['a' < 'b'] + >>> cat.unique(remove_unused_categories=False) + ['b', 'a'] + Categories (3, object): ['a' < 'b' < 'c'] """ # unlike np.unique, unique1d does not sort unique_codes = unique1d(self.codes) - cat = self.copy() - # keep nan in codes - cat._codes = unique_codes + cat = self._constructor(unique_codes, dtype=self.dtype, fastpath=True) + + if not remove_unused_categories: + return cat # exclude nan from indexer for categories take_codes = unique_codes[unique_codes != -1] diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 7bd7d29ec9703..606a1fd0bde22 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -6,7 +6,7 @@ from pandas.compat import PYPY -from pandas import Categorical, Index, NaT, Series, date_range +from pandas import Categorical, CategoricalDtype, Index, NaT, Series, date_range import pandas._testing as tm from pandas.api.types import is_scalar @@ -242,6 +242,25 @@ def test_unique_ordered(self): exp_cat = Categorical(["b", np.nan, "a"], categories=["a", "b"], ordered=True) tm.assert_categorical_equal(res, exp_cat) + 
@pytest.mark.parametrize("values, expected", [ + [list("abc"), list("abc")], + [list("bac"), list("bac")], + [list("ab"), list("ab")], + [list("bc"), list("bc")], + [list("aabbcc"), list("abc")], + [list("aabb"), list("ab")], + [[np.nan, "a", "b"], [np.nan, "a", "b"]], + [["a", "b", np.nan], ["a", "b", np.nan]], + [["a", "b", "a", "b", np.nan], ["a", "b", np.nan]], + ]) + def test_unique_keep_unused_categories(self, values, expected, ordered): + # GHxxxxx + dtype = CategoricalDtype(list("abc"), ordered=ordered) + result = Categorical(values, dtype=dtype).unique(remove_unused_categories=False) + expected = Categorical(expected, dtype=dtype) + + tm.assert_categorical_equal(result, expected) + def test_unique_index_series(self): c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1]) # Categorical.unique sorts categories by appearance order From 29e6f62e2149d1b1110d40c0013f33ccac26608c Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 28 Nov 2020 16:30:35 +0000 Subject: [PATCH 2/2] small fixes --- doc/source/whatsnew/v1.2.0.rst | 2 +- .../arrays/categorical/test_analytics.py | 27 ++++++++++--------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 3c30d7a9b8c23..61f8b7c6bcc6a 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -254,7 +254,7 @@ Other enhancements - :func:`read_csv` supports memory-mapping for compressed files (:issue:`37621`) - Improve error reporting for :meth:`DataFrame.merge` when invalid merge column definitions were given (:issue:`16228`) - Improve numerical stability for :meth:`.Rolling.skew`, :meth:`.Rolling.kurt`, :meth:`Expanding.skew` and :meth:`Expanding.kurt` through implementation of Kahan summation (:issue:`6929`) -- :meth:`Categorical.unique` has a new parameter ``remove_unused_categories``, which if set to ``False``, keeps the dtype of the original categorical (:issue:`xxxxx`) +- :meth:`Categorical.unique` has a new parameter 
``remove_unused_categories``, which, if set to ``False``, keeps the dtype of the original categorical (:issue:`38135`) - Improved error reporting for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`) - Implement method ``cross`` for :meth:`DataFrame.merge` and :meth:`DataFrame.join` (:issue:`5401`) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 606a1fd0bde22..91559d92dcd92 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -242,19 +242,22 @@ def test_unique_ordered(self): exp_cat = Categorical(["b", np.nan, "a"], categories=["a", "b"], ordered=True) tm.assert_categorical_equal(res, exp_cat) - @pytest.mark.parametrize("values, expected", [ - [list("abc"), list("abc")], - [list("bac"), list("bac")], - [list("ab"), list("ab")], - [list("bc"), list("bc")], - [list("aabbcc"), list("abc")], - [list("aabb"), list("ab")], - [[np.nan, "a", "b"], [np.nan, "a", "b"]], - [["a", "b", np.nan], ["a", "b", np.nan]], - [["a", "b", "a", "b", np.nan], ["a", "b", np.nan]], - ]) + @pytest.mark.parametrize( + "values, expected", + [ + [list("abc"), list("abc")], + [list("bac"), list("bac")], + [list("ab"), list("ab")], + [list("bc"), list("bc")], + [list("aabbcc"), list("abc")], + [list("aabb"), list("ab")], + [[np.nan, "a", "b"], [np.nan, "a", "b"]], + [["a", "b", np.nan], ["a", "b", np.nan]], + [["a", "b", "a", "b", np.nan], ["a", "b", np.nan]], + ], + ) def test_unique_keep_unused_categories(self, values, expected, ordered): - # GHxxxxx + # GH38135 dtype = CategoricalDtype(list("abc"), ordered=ordered) result = Categorical(values, dtype=dtype).unique(remove_unused_categories=False) expected = Categorical(expected, dtype=dtype)