From 5819628adf8db36a1c63cb58d36bdeab830a966f Mon Sep 17 00:00:00 2001
From: tp <contribute@tensortable.com>
Date: Sat, 28 Nov 2020 16:08:51 +0000
Subject: [PATCH 1/2] ENH: Categorical.unique can keep same dtype

---
 doc/source/whatsnew/v1.2.0.rst                |  1 +
 pandas/core/arrays/categorical.py             | 29 +++++++++++++++----
 .../arrays/categorical/test_analytics.py      | 21 +++++++++++++-
 3 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 6f046d3a9379d..3c30d7a9b8c23 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -254,6 +254,7 @@ Other enhancements
 - :func:`read_csv` supports memory-mapping for compressed files (:issue:`37621`)
 - Improve error reporting for :meth:`DataFrame.merge` when invalid merge column definitions were given (:issue:`16228`)
 - Improve numerical stability for :meth:`.Rolling.skew`, :meth:`.Rolling.kurt`, :meth:`Expanding.skew` and :meth:`Expanding.kurt` through implementation of Kahan summation (:issue:`6929`)
+- :meth:`Categorical.unique` has a new parameter ``remove_unused_categories``, which if set to ``False``, keeps the dtype of the original categorical (:issue:`xxxxx`)
 - Improved error reporting for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`)
 - Implement method ``cross`` for :meth:`DataFrame.merge` and :meth:`DataFrame.join` (:issue:`5401`)
 
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 62e508c491740..487a12853f61a 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -2035,16 +2035,24 @@ def mode(self, dropna=True):
     # ------------------------------------------------------------------
     # ExtensionArray Interface
 
-    def unique(self):
+    def unique(self, remove_unused_categories: bool = True) -> "Categorical":
         """
         Return the ``Categorical`` which ``categories`` and ``codes`` are
-        unique. Unused categories are NOT returned.
+        unique. By default, unused categories are NOT returned.
 
         - unordered category: values and categories are sorted by appearance
           order.
         - ordered category: values are sorted by appearance order, categories
           keeps existing order.
 
+        Parameters
+        ----------
+        remove_unused_categories : bool, default True
+            If True, unused categories are not returned.
+            If False, the input dtype is returned unchanged.
+
+            .. versionadded:: 1.2.0
+
         Returns
         -------
         unique values : ``Categorical``
@@ -2075,13 +2083,24 @@ def unique(self):
         ... ).unique()
         ['b', 'a', 'c']
         Categories (3, object): ['a' < 'b' < 'c']
+
+        By default, unused categories are removed, but this can be changed:
+
+        >>> cat = pd.Categorical(list("baab"), categories=list("abc"), ordered=True)
+        >>> cat.unique()
+        ['b', 'a']
+        Categories (2, object): ['a' < 'b']
+        >>> cat.unique(remove_unused_categories=False)
+        ['b', 'a']
+        Categories (3, object): ['a' < 'b' < 'c']
         """
         # unlike np.unique, unique1d does not sort
         unique_codes = unique1d(self.codes)
-        cat = self.copy()
 
-        # keep nan in codes
-        cat._codes = unique_codes
+        cat = self._constructor(unique_codes, dtype=self.dtype, fastpath=True)
+
+        if not remove_unused_categories:
+            return cat
 
         # exclude nan from indexer for categories
         take_codes = unique_codes[unique_codes != -1]
diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py
index 7bd7d29ec9703..606a1fd0bde22 100644
--- a/pandas/tests/arrays/categorical/test_analytics.py
+++ b/pandas/tests/arrays/categorical/test_analytics.py
@@ -6,7 +6,7 @@
 
 from pandas.compat import PYPY
 
-from pandas import Categorical, Index, NaT, Series, date_range
+from pandas import Categorical, CategoricalDtype, Index, NaT, Series, date_range
 import pandas._testing as tm
 from pandas.api.types import is_scalar
 
@@ -242,6 +242,25 @@ def test_unique_ordered(self):
         exp_cat = Categorical(["b", np.nan, "a"], categories=["a", "b"], ordered=True)
         tm.assert_categorical_equal(res, exp_cat)
 
+    @pytest.mark.parametrize("values, expected", [
+        [list("abc"), list("abc")],
+        [list("bac"), list("bac")],
+        [list("ab"), list("ab")],
+        [list("bc"), list("bc")],
+        [list("aabbcc"), list("abc")],
+        [list("aabb"), list("ab")],
+        [[np.nan, "a", "b"], [np.nan, "a", "b"]],
+        [["a", "b", np.nan], ["a", "b", np.nan]],
+        [["a", "b", "a", "b", np.nan], ["a", "b", np.nan]],
+    ])
+    def test_unique_keep_unused_categories(self, values, expected, ordered):
+        # GHxxxxx
+        dtype = CategoricalDtype(list("abc"), ordered=ordered)
+        result = Categorical(values, dtype=dtype).unique(remove_unused_categories=False)
+        expected = Categorical(expected, dtype=dtype)
+
+        tm.assert_categorical_equal(result, expected)
+
     def test_unique_index_series(self):
         c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1])
         # Categorical.unique sorts categories by appearance order

From 29e6f62e2149d1b1110d40c0013f33ccac26608c Mon Sep 17 00:00:00 2001
From: tp <contribute@tensortable.com>
Date: Sat, 28 Nov 2020 16:30:35 +0000
Subject: [PATCH 2/2] CLN: fill in issue number (GH38135) and reformat parametrized test

---
 doc/source/whatsnew/v1.2.0.rst                |  2 +-
 .../arrays/categorical/test_analytics.py      | 27 ++++++++++---------
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 3c30d7a9b8c23..61f8b7c6bcc6a 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -254,7 +254,7 @@ Other enhancements
 - :func:`read_csv` supports memory-mapping for compressed files (:issue:`37621`)
 - Improve error reporting for :meth:`DataFrame.merge` when invalid merge column definitions were given (:issue:`16228`)
 - Improve numerical stability for :meth:`.Rolling.skew`, :meth:`.Rolling.kurt`, :meth:`Expanding.skew` and :meth:`Expanding.kurt` through implementation of Kahan summation (:issue:`6929`)
-- :meth:`Categorical.unique` has a new parameter ``remove_unused_categories``, which if set to ``False``, keeps the dtype of the original categorical (:issue:`xxxxx`)
+- :meth:`Categorical.unique` has a new parameter ``remove_unused_categories`` which, if set to ``False``, keeps the dtype of the original categorical (:issue:`38135`)
 - Improved error reporting for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`)
 - Implement method ``cross`` for :meth:`DataFrame.merge` and :meth:`DataFrame.join` (:issue:`5401`)
 
diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py
index 606a1fd0bde22..91559d92dcd92 100644
--- a/pandas/tests/arrays/categorical/test_analytics.py
+++ b/pandas/tests/arrays/categorical/test_analytics.py
@@ -242,19 +242,22 @@ def test_unique_ordered(self):
         exp_cat = Categorical(["b", np.nan, "a"], categories=["a", "b"], ordered=True)
         tm.assert_categorical_equal(res, exp_cat)
 
-    @pytest.mark.parametrize("values, expected", [
-        [list("abc"), list("abc")],
-        [list("bac"), list("bac")],
-        [list("ab"), list("ab")],
-        [list("bc"), list("bc")],
-        [list("aabbcc"), list("abc")],
-        [list("aabb"), list("ab")],
-        [[np.nan, "a", "b"], [np.nan, "a", "b"]],
-        [["a", "b", np.nan], ["a", "b", np.nan]],
-        [["a", "b", "a", "b", np.nan], ["a", "b", np.nan]],
-    ])
+    @pytest.mark.parametrize(
+        "values, expected",
+        [
+            [list("abc"), list("abc")],
+            [list("bac"), list("bac")],
+            [list("ab"), list("ab")],
+            [list("bc"), list("bc")],
+            [list("aabbcc"), list("abc")],
+            [list("aabb"), list("ab")],
+            [[np.nan, "a", "b"], [np.nan, "a", "b"]],
+            [["a", "b", np.nan], ["a", "b", np.nan]],
+            [["a", "b", "a", "b", np.nan], ["a", "b", np.nan]],
+        ],
+    )
     def test_unique_keep_unused_categories(self, values, expected, ordered):
-        # GHxxxxx
+        # GH38135
         dtype = CategoricalDtype(list("abc"), ordered=ordered)
         result = Categorical(values, dtype=dtype).unique(remove_unused_categories=False)
         expected = Categorical(expected, dtype=dtype)