diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
index 9c5c36528d31b..29d6d972a7a55 100644
--- a/doc/source/whatsnew/v0.19.0.txt
+++ b/doc/source/whatsnew/v0.19.0.txt
@@ -457,7 +457,7 @@ API changes
 - ``Timestamp``, ``Period``, ``DatetimeIndex``, ``PeriodIndex`` and ``.dt`` accessor have gained a ``.is_leap_year`` property to check whether the date belongs to a leap year. (:issue:`13727`)
 - ``pd.read_hdf`` will now raise a ``ValueError`` instead of ``KeyError``, if a mode other than ``r``, ``r+`` and ``a`` is supplied. (:issue:`13623`)
 - ``DataFrame.values`` will now return ``float64`` with a ``DataFrame`` of mixed ``int64`` and ``uint64`` dtypes, conforming to ``np.find_common_type`` (:issue:`10364`, :issue:`13917`)
-
+- ``Series.unique()`` with datetime and timezone now returns an array of ``Timestamp`` with timezone (:issue:`13565`)
@@ -904,6 +904,35 @@ New Behavior:

   idx1.difference(idx2)
   idx1.symmetric_difference(idx2)

+.. _whatsnew_0190.api.unique_index:
+
+``Index.unique`` consistently returns ``Index``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``Index.unique()`` now returns unique values as an
+``Index`` of the appropriate ``dtype``. (:issue:`13395`)
+
+Previously, most ``Index`` classes returned ``np.ndarray``, while ``DatetimeIndex``,
+``TimedeltaIndex`` and ``PeriodIndex`` returned ``Index``, to keep metadata like the timezone.
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+   In [1]: pd.Index([1, 2, 3]).unique()
+   Out[1]: array([1, 2, 3])
+
+   In [2]: pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], tz='Asia/Tokyo').unique()
+   Out[2]: DatetimeIndex(['2011-01-01 00:00:00+09:00', '2011-01-02 00:00:00+09:00',
+                          '2011-01-03 00:00:00+09:00'],
+                         dtype='datetime64[ns, Asia/Tokyo]', freq=None)
+
+New Behavior:
+
+.. ipython:: python
+
+   pd.Index([1, 2, 3]).unique()
+   pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], tz='Asia/Tokyo').unique()
+
 .. _whatsnew_0190.api.autogenerated_chunksize_index:

 ``read_csv`` will progressively enumerate chunks
@@ -1181,6 +1210,7 @@ Bug Fixes
 - Bug in ``pd.read_csv``, ``pd.read_table``, ``pd.read_fwf``, ``pd.read_stata`` and ``pd.read_sas`` where files were opened by parsers but not closed if both ``chunksize`` and ``iterator`` were ``None``. (:issue:`13940`)
 - Bug in ``StataReader``, ``StataWriter``, ``XportReader`` and ``SAS7BDATReader`` where a file was not properly closed when an error was raised. (:issue:`13940`)
+
 - Bug in ``pd.pivot_table()`` where ``margins_name`` is ignored when ``aggfunc`` is a list (:issue:`13354`)
 - Bug in ``pd.Series.str.zfill``, ``center``, ``ljust``, ``rjust``, and ``pad`` when passing non-integers, did not raise ``TypeError`` (:issue:`13598`)
 - Bug in checking for any null objects in a ``TimedeltaIndex``, which always returned ``True`` (:issue:`13603`)
@@ -1248,7 +1278,6 @@ Bug Fixes
 - Bug in ``agg()`` function on groupby dataframe changes dtype of ``datetime64[ns]`` column to ``float64`` (:issue:`12821`)
 - Bug in using a NumPy ufunc with ``PeriodIndex`` to add or subtract an integer, which raised ``IncompatibleFrequency``.
   Note that using standard operators like ``+`` or ``-`` is recommended, because they use a more efficient path (:issue:`13980`)
-
 - Bug in operations on ``NaT`` returning ``float`` instead of ``datetime64[ns]`` (:issue:`12941`)
 - Bug in ``Series`` flexible arithmetic methods (like ``.add()``) raises ``ValueError`` when ``axis=None`` (:issue:`13894`)
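A quick usage sketch of the behavior the whatsnew entries above describe, assuming pandas 0.19.0 with this patch applied; the commented output mirrors the "New Behavior" blocks and is illustrative, not part of the patch:

.. code-block:: python

   import pandas as pd

   # Index.unique now returns an Index of the matching dtype,
   # not a bare np.ndarray (:issue:`13395`)
   pd.Index([1, 2, 3]).unique()
   # Int64Index([1, 2, 3], dtype='int64')

   # timezone metadata survives
   pd.DatetimeIndex(['2011-01-01', '2011-01-02'], tz='Asia/Tokyo').unique()
   # DatetimeIndex(['2011-01-01 00:00:00+09:00', '2011-01-02 00:00:00+09:00'],
   #               dtype='datetime64[ns, Asia/Tokyo]', freq=None)

   # Series.unique with tz-aware data returns an object array of
   # Timestamp objects that carry the timezone (:issue:`13565`)
   s = pd.Series(pd.DatetimeIndex(['2011-01-01', '2011-01-01'],
                                  tz='Asia/Tokyo'))
   s.unique()
   # array([Timestamp('2011-01-01 00:00:00+0900', tz='Asia/Tokyo')],
   #       dtype=object)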
diff --git a/pandas/core/base.py b/pandas/core/base.py
index 0f9eb14be40db..b9a70292498e4 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -7,8 +7,7 @@
 from pandas.types.missing import isnull
 from pandas.types.generic import ABCDataFrame, ABCSeries, ABCIndexClass
-from pandas.types.common import (is_object_dtype,
-                                 is_list_like, is_scalar)
+from pandas.types.common import is_object_dtype, is_list_like, is_scalar
 from pandas.core import common as com
 import pandas.core.nanops as nanops
@@ -21,7 +20,7 @@
 _shared_docs = dict()
 _indexops_doc_kwargs = dict(klass='IndexOpsMixin', inplace='',
-                            duplicated='IndexOpsMixin')
+                            unique='IndexOpsMixin', duplicated='IndexOpsMixin')


 class StringMixin(object):
@@ -952,21 +951,27 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
                          normalize=normalize, bins=bins, dropna=dropna)
         return result

-    def unique(self):
+    _shared_docs['unique'] = (
         """
-        Return array of unique values in the object. Significantly faster than
-        numpy.unique. Includes NA values.
+        Return %(unique)s of unique values in the object.
+        Significantly faster than numpy.unique. Includes NA values.
+        The order of the original is preserved.

         Returns
         -------
-        uniques : ndarray
-        """
-        from pandas.core.nanops import unique1d
-        values = self.values
-        if hasattr(values, 'unique'):
-            return values.unique()
+        uniques : %(unique)s
+        """)

-        return unique1d(values)
+    @Appender(_shared_docs['unique'] % _indexops_doc_kwargs)
+    def unique(self):
+        values = self._values
+
+        if hasattr(values, 'unique'):
+            result = values.unique()
+        else:
+            from pandas.core.nanops import unique1d
+            result = unique1d(values)
+        return result

     def nunique(self, dropna=True):
         """
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 7979a230eed84..01d6f6f078d17 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -18,6 +18,7 @@
                                 is_float_dtype,
                                 is_extension_type, is_datetimetz,
                                 is_datetimelike,
+                                is_datetime64tz_dtype,
                                 is_timedelta64_dtype,
                                 is_list_like,
                                 is_hashable,
@@ -77,7 +78,7 @@
     axes='index', klass='Series', axes_single_arg="{0, 'index'}",
     inplace="""inplace : boolean, default False
         If True, performs operation inplace and returns None.""",
-    duplicated='Series',
+    unique='np.ndarray', duplicated='Series',
     optional_by='')
@@ -1231,6 +1232,15 @@ def mode(self):
         # TODO: Add option for bins like value_counts()
         return algos.mode(self)

+    @Appender(base._shared_docs['unique'] % _shared_doc_kwargs)
+    def unique(self):
+        result = super(Series, self).unique()
+        if is_datetime64tz_dtype(self.dtype):
+            # to return an array of Timestamp with tz
+            # TODO: it must return DatetimeArray with tz in pandas 2.0
+            return result.asobject.values
+        return result
+
     @deprecate_kwarg('take_last', 'keep', mapping={True: 'last',
                                                    False: 'first'})
     @Appender(base._shared_docs['drop_duplicates'] % _shared_doc_kwargs)
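The ``_shared_docs`` / ``Appender`` changes above let ``IndexOpsMixin``, ``Series`` and ``Index`` share one docstring template, with the ``%(unique)s`` placeholder filled per class from the ``*_doc_kwargs`` dicts. A minimal self-contained sketch of that mechanism (``append_doc`` and ``FakeIndex`` are illustrative stand-ins, not pandas internals):

.. code-block:: python

   _shared_docs = {}
   _shared_docs['unique'] = """
   Return %(unique)s of unique values in the object.
   """

   def append_doc(text):
       # attach the formatted template to the decorated function's __doc__
       def decorator(func):
           func.__doc__ = (func.__doc__ or '') + text
           return func
       return decorator

   class FakeIndex(object):
       @append_doc(_shared_docs['unique'] % dict(unique='Index'))
       def unique(self):
           pass

   print(FakeIndex.unique.__doc__)  # mentions "Index", not "ndarray"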
diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
index 8c1378c07a1d2..15cd2064624d9 100644
--- a/pandas/indexes/base.py
+++ b/pandas/indexes/base.py
@@ -60,7 +60,8 @@
 _unsortable_types = frozenset(('mixed', 'mixed-integer'))

-_index_doc_kwargs = dict(klass='Index', inplace='', duplicated='np.array')
+_index_doc_kwargs = dict(klass='Index', inplace='',
+                         unique='Index', duplicated='np.ndarray')
 _index_shared_docs = dict()
@@ -3217,6 +3218,11 @@ def drop(self, labels, errors='raise'):
         indexer = indexer[~mask]
         return self.delete(indexer)

+    @Appender(base._shared_docs['unique'] % _index_doc_kwargs)
+    def unique(self):
+        result = super(Index, self).unique()
+        return self._shallow_copy(result)
+
     @deprecate_kwarg('take_last', 'keep', mapping={True: 'last',
                                                    False: 'first'})
     @Appender(base._shared_docs['drop_duplicates'] % _index_doc_kwargs)
diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py
index 23c534624930f..251886ebdd974 100644
--- a/pandas/indexes/category.py
+++ b/pandas/indexes/category.py
@@ -283,6 +283,14 @@ def _engine(self):
     def is_unique(self):
         return not self.duplicated().any()

+    @Appender(base._shared_docs['unique'] % ibase._index_doc_kwargs)
+    def unique(self):
+        result = base.IndexOpsMixin.unique(self)
+        # CategoricalIndex._shallow_copy keeps the original categories
+        # and ordered if not otherwise specified
+        return self._shallow_copy(result, categories=result.categories,
+                                  ordered=result.ordered)
+
     @deprecate_kwarg('take_last', 'keep', mapping={True: 'last',
                                                    False: 'first'})
     @Appender(base._shared_docs['duplicated'] % ibase._index_doc_kwargs)
diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py
index 901b57dcc7bfe..b0e50491b8e9d 100644
--- a/pandas/tests/indexes/test_category.py
+++ b/pandas/tests/indexes/test_category.py
@@ -395,6 +395,7 @@ def test_duplicates(self):

         expected = CategoricalIndex([0], name='foo')
         self.assert_index_equal(idx.drop_duplicates(), expected)
+        self.assert_index_equal(idx.unique(), expected)

     def test_get_indexer(self):
diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py
index eedbd108510f7..c72cab32d198b 100644
--- a/pandas/tests/indexes/test_multi.py
+++ b/pandas/tests/indexes/test_multi.py
@@ -1927,6 +1927,38 @@ def test_get_unique_index(self):
             self.assertTrue(result.unique)
             self.assert_index_equal(result, expected)

+    def test_unique(self):
+        mi = pd.MultiIndex.from_arrays([[1, 2, 1, 2], [1, 1, 1, 2]])
+
+        res = mi.unique()
+        exp = pd.MultiIndex.from_arrays([[1, 2, 2], [1, 1, 2]])
+        tm.assert_index_equal(res, exp)
+
+        mi = pd.MultiIndex.from_arrays([list('aaaa'), list('abab')])
+        res = mi.unique()
+        exp = pd.MultiIndex.from_arrays([list('aa'), list('ab')])
+        tm.assert_index_equal(res, exp)
+
+        mi = pd.MultiIndex.from_arrays([list('aaaa'), list('aaaa')])
+        res = mi.unique()
+        exp = pd.MultiIndex.from_arrays([['a'], ['a']])
+        tm.assert_index_equal(res, exp)
+
+    def test_unique_datetimelike(self):
+        idx1 = pd.DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-01',
+                                 '2015-01-01', 'NaT', 'NaT'])
+        idx2 = pd.DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-02',
+                                 '2015-01-02', 'NaT', '2015-01-01'],
+                                tz='Asia/Tokyo')
+        result = pd.MultiIndex.from_arrays([idx1, idx2]).unique()
+
+        eidx1 = pd.DatetimeIndex(['2015-01-01', '2015-01-01', 'NaT', 'NaT'])
+        eidx2 = pd.DatetimeIndex(['2015-01-01', '2015-01-02',
+                                  'NaT', '2015-01-01'],
+                                 tz='Asia/Tokyo')
+        exp = pd.MultiIndex.from_arrays([eidx1, eidx2])
+        tm.assert_index_equal(result, exp)
+
     def test_tolist(self):
         result = self.index.tolist()
         exp = list(self.index.values)
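The ``CategoricalIndex.unique`` override above keeps the result categorical rather than letting it decay to an array of codes. A short sketch of the ordering rules that the ``test_categorical.py`` changes later in this diff assert, assuming pandas 0.19.0 with this patch applied:

.. code-block:: python

   import pandas as pd
   from pandas import Categorical, Index

   # unordered: the result's categories follow appearance order
   c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1])
   c.unique()         # values [3, 1, 2], categories [3, 1, 2]
   Index(c).unique()  # CategoricalIndex([3, 1, 2], categories=[3, 1, 2], ...)

   # ordered: the original category order is preserved
   c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1], ordered=True)
   c.unique()         # values [3, 1, 2], categories stay [3, 2, 1]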
diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py
index 66216758ca091..83b1cd141a61b 100644
--- a/pandas/tests/test_base.py
+++ b/pandas/tests/test_base.py
@@ -17,6 +17,7 @@
 from pandas.compat.numpy import np_array_datetime64_compat
 from pandas.core.base import (FrozenList, FrozenNDArray, PandasDelegate,
                               NoNewAttributesMixin)
+from pandas.types.common import is_datetime64_dtype
 from pandas.tseries.base import DatetimeIndexOpsMixin
@@ -452,43 +453,29 @@ def test_value_counts_unique_nunique(self):
             o = orig.copy()
             klass = type(o)
-            values = o.values
-
-            # create repeated values, 'n'th element is repeated by n+1 times
-            if isinstance(o, PeriodIndex):
-                # freq must be specified because repeat makes freq ambiguous
-
-                # resets name from Index
-                expected_index = pd.Index(o[::-1])
-                expected_index.name = None
-
-                # attach name to klass
-                o = o.repeat(range(1, len(o) + 1))
-                o.name = 'a'
+            values = o._values

-            elif isinstance(o, DatetimeIndex):
+            if isinstance(values, Index):
+                # reset name so it does not affect later checks
+                values.name = None

-                # resets name from Index
-                expected_index = pd.Index(o[::-1])
-                expected_index.name = None
-
-                # attach name to klass
-                o = o.repeat(range(1, len(o) + 1))
-                o.name = 'a'
-
-            # don't test boolean
-            elif isinstance(o, Index) and o.is_boolean():
+            # create repeated values, 'n'th element is repeated by n+1 times
+            # skip boolean, because it only has 2 values at most
+            if isinstance(o, Index) and o.is_boolean():
                 continue
             elif isinstance(o, Index):
-                expected_index = pd.Index(values[::-1])
+                expected_index = pd.Index(o[::-1])
                 expected_index.name = None
                 o = o.repeat(range(1, len(o) + 1))
                 o.name = 'a'
             else:
                 expected_index = pd.Index(values[::-1])
                 idx = o.index.repeat(range(1, len(o) + 1))
-                o = klass(np.repeat(values, range(1, len(o) + 1)),
-                          index=idx, name='a')
+                rep = np.repeat(values, range(1, len(o) + 1))
+                o = klass(rep, index=idx, name='a')
+
+            # check values has the same dtype as the original
+            self.assertEqual(o.dtype, orig.dtype)

             expected_s = Series(range(10, 0, -1), index=expected_index,
                                 dtype='int64', name='a')
@@ -499,15 +486,23 @@ def test_value_counts_unique_nunique(self):
             self.assertEqual(result.name, 'a')

             result = o.unique()
-            if isinstance(o, (DatetimeIndex, PeriodIndex)):
+            if isinstance(o, Index):
                 self.assertTrue(isinstance(result, o.__class__))
-                self.assertEqual(result.freq, o.freq)
                 self.assert_index_equal(result, orig)
+            elif is_datetimetz(o):
+                # datetimetz Series returns an array of Timestamp
+                self.assertEqual(result[0], orig[0])
+                for r in result:
+                    self.assertIsInstance(r, pd.Timestamp)
+                tm.assert_numpy_array_equal(result,
+                                            orig._values.asobject.values)
             else:
-                self.assert_numpy_array_equal(result, values)
+                tm.assert_numpy_array_equal(result, orig.values)

             self.assertEqual(o.nunique(), len(np.unique(o.values)))

+    def test_value_counts_unique_nunique_null(self):
+
         for null_obj in [np.nan, None]:
             for o in self.objs:
                 klass = type(o)
@@ -525,12 +520,14 @@ def test_value_counts_unique_nunique(self):
                     else:
                         o = o.copy()
                         o[0:2] = pd.tslib.iNaT
-                        values = o.values
-
-                elif o.values.dtype == 'datetime64[ns]' or isinstance(
-                        o, PeriodIndex):
+                        values = o._values
+
+                elif is_datetime64_dtype(o) or isinstance(o, PeriodIndex):
                     values[0:2] = pd.tslib.iNaT
                 else:
                     values[0:2] = null_obj
+                # check values has the same dtype as the original
+                self.assertEqual(values.dtype, o.dtype)

                 # create repeated values, 'n'th element is repeated by n+1
                 # times
@@ -570,13 +567,17 @@ def test_value_counts_unique_nunique(self):
                 self.assertTrue(result_s.index.name is None)
                 self.assertEqual(result_s.name, 'a')

-                # numpy_array_equal cannot compare arrays includes nan
                 result = o.unique()
-                self.assert_numpy_array_equal(result[1:], values[2:])
-
-                if isinstance(o, (DatetimeIndex, PeriodIndex)):
-                    self.assertTrue(result.asi8[0] == pd.tslib.iNaT)
+                if isinstance(o, Index):
+                    tm.assert_index_equal(result,
+                                          Index(values[1:], name='a'))
+                elif is_datetimetz(o):
+                    # unable to compare NaT / nan
+                    tm.assert_numpy_array_equal(result[1:],
+                                                values[2:].asobject.values)
+                    self.assertIs(result[0], pd.NaT)
                 else:
+                    tm.assert_numpy_array_equal(result[1:], values[2:])
                     self.assertTrue(pd.isnull(result[0]))

                 self.assertEqual(o.nunique(), 8)
@@ -590,8 +591,13 @@ def test_value_counts_inferred(self):
         expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
         tm.assert_series_equal(s.value_counts(), expected)

-        exp = np.unique(np.array(s_values, dtype=np.object_))
-        self.assert_numpy_array_equal(s.unique(), exp)
+        if isinstance(s, Index):
+            exp = Index(np.unique(np.array(s_values, dtype=np.object_)))
+            tm.assert_index_equal(s.unique(), exp)
+        else:
+            exp = np.unique(np.array(s_values, dtype=np.object_))
+            tm.assert_numpy_array_equal(s.unique(), exp)
+
         self.assertEqual(s.nunique(), 4)
         # don't sort, have to sort after the fact as not sorting is
         # platform-dep
@@ -627,8 +633,12 @@ def test_value_counts_bins(self):
         exp1n = Series({0.998: 1.0})
         tm.assert_series_equal(res1n, exp1n)

-        self.assert_numpy_array_equal(s1.unique(),
-                                      np.array([1, 2, 3], dtype=np.int64))
+        if isinstance(s1, Index):
+            tm.assert_index_equal(s1.unique(), Index([1, 2, 3]))
+        else:
+            exp = np.array([1, 2, 3], dtype=np.int64)
+            tm.assert_numpy_array_equal(s1.unique(), exp)
+
         self.assertEqual(s1.nunique(), 3)

         res4 = s1.value_counts(bins=4)
@@ -652,8 +662,12 @@ def test_value_counts_bins(self):
         expected = Series([4, 3, 2], index=['b', 'a', 'd'])
         tm.assert_series_equal(s.value_counts(), expected)

-        exp = np.array(['a', 'b', np.nan, 'd'], dtype=np.object_)
-        self.assert_numpy_array_equal(s.unique(), exp)
+        if isinstance(s, Index):
+            exp = Index(['a', 'b', np.nan, 'd'])
+            tm.assert_index_equal(s.unique(), exp)
+        else:
+            exp = np.array(['a', 'b', np.nan, 'd'], dtype=object)
+            tm.assert_numpy_array_equal(s.unique(), exp)
         self.assertEqual(s.nunique(), 3)

         s = klass({})
@@ -661,8 +675,13 @@ def test_value_counts_bins(self):
         tm.assert_series_equal(s.value_counts(), expected,
                                check_index_type=False)
         # returned dtype differs depending on original
-        self.assert_numpy_array_equal(s.unique(), np.array([]),
-                                      check_dtype=False)
+        if isinstance(s, Index):
+            self.assert_index_equal(s.unique(), Index([]),
+                                    exact=False)
+        else:
+            self.assert_numpy_array_equal(s.unique(), np.array([]),
+                                          check_dtype=False)
+
         self.assertEqual(s.nunique(), 0)

     def test_value_counts_datetime64(self):
@@ -691,10 +710,10 @@ def test_value_counts_datetime64(self):
                              '2009-01-01 00:00:00Z',
                              '2008-09-09 00:00:00Z'],
                             dtype='datetime64[ns]')
-        if isinstance(s, DatetimeIndex):
-            self.assert_index_equal(s.unique(), DatetimeIndex(expected))
+        if isinstance(s, Index):
+            tm.assert_index_equal(s.unique(), DatetimeIndex(expected))
         else:
-            self.assert_numpy_array_equal(s.unique(), expected)
+            tm.assert_numpy_array_equal(s.unique(), expected)

         self.assertEqual(s.nunique(), 3)

@@ -714,12 +733,12 @@ def test_value_counts_datetime64(self):
         self.assertEqual(unique.dtype, 'datetime64[ns]')

         # numpy_array_equal cannot compare pd.NaT
-        if isinstance(s, DatetimeIndex):
-            self.assert_index_equal(unique[:3], DatetimeIndex(expected))
+        if isinstance(s, Index):
+            exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT])
+            tm.assert_index_equal(unique, exp_idx)
         else:
-            self.assert_numpy_array_equal(unique[:3], expected)
-            self.assertTrue(unique[3] is pd.NaT or
-                            unique[3].astype('int64') == pd.tslib.iNaT)
+            tm.assert_numpy_array_equal(unique[:3], expected)
+            self.assertTrue(pd.isnull(unique[3]))

         self.assertEqual(s.nunique(), 3)
         self.assertEqual(s.nunique(dropna=False), 4)
@@ -733,10 +752,10 @@ def test_value_counts_datetime64(self):
         tm.assert_series_equal(result, expected_s)

         expected = TimedeltaIndex(['1 days'], name='dt')
-        if isinstance(td, TimedeltaIndex):
-            self.assert_index_equal(td.unique(), expected)
+        if isinstance(td, Index):
+            tm.assert_index_equal(td.unique(), expected)
         else:
-            self.assert_numpy_array_equal(td.unique(), expected.values)
+            tm.assert_numpy_array_equal(td.unique(), expected.values)

         td2 = timedelta(1) + (df.dt - df.dt)
         td2 = klass(td2, name='dt')
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index d703ee7c1d1c2..781c9b786328d 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -1302,6 +1302,30 @@ def test_unique_ordered(self):
                           ordered=True)
         tm.assert_categorical_equal(res, exp_cat)

+    def test_unique_index_series(self):
+        c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1])
+        # Categorical.unique sorts categories by appearance order
+        # if ordered=False
+        exp = Categorical([3, 1, 2], categories=[3, 1, 2])
+        tm.assert_categorical_equal(c.unique(), exp)
+
+        tm.assert_index_equal(Index(c).unique(), Index(exp))
+        tm.assert_categorical_equal(pd.Series(c).unique(), exp)
+
+        c = Categorical([1, 1, 2, 2], categories=[3, 2, 1])
+        exp = Categorical([1, 2], categories=[1, 2])
+        tm.assert_categorical_equal(c.unique(), exp)
+        tm.assert_index_equal(Index(c).unique(), Index(exp))
+        tm.assert_categorical_equal(pd.Series(c).unique(), exp)
+
+        c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1], ordered=True)
+        # Categorical.unique keeps categories order if ordered=True
+        exp = Categorical([3, 1, 2], categories=[3, 2, 1], ordered=True)
+        tm.assert_categorical_equal(c.unique(), exp)
+
+        tm.assert_index_equal(Index(c).unique(), Index(exp))
+        tm.assert_categorical_equal(pd.Series(c).unique(), exp)
+
     def test_mode(self):
         s = Categorical([1, 1, 2, 4, 5, 5, 5], categories=[5, 4, 3, 2, 1],
                         ordered=True)
diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py
index e64a0d2ebaf5e..c08bb53238e5c 100644
--- a/pandas/tseries/base.py
+++ b/pandas/tseries/base.py
@@ -743,19 +743,6 @@ def shift(self, n, freq=None):
             attribs['end'] = end
         return type(self)(**attribs)

-    def unique(self):
-        """
-        Index.unique with handling for DatetimeIndex/PeriodIndex metadata
-
-        Returns
-        -------
-        result : DatetimeIndex or PeriodIndex
-        """
-        from pandas.core.index import Int64Index
-        result = Int64Index.unique(self)
-        return self._simple_new(result, name=self.name, freq=self.freq,
-                                tz=getattr(self, 'tz', None))
-
     def repeat(self, repeats, *args, **kwargs):
         """
         Analogous to ndarray.repeat
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index 94de8cb034024..d39569ea0b826 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -1534,7 +1534,7 @@ def makeStringIndex(k=10, name=None):


 def makeUnicodeIndex(k=10, name=None):
-    return Index(randu_array(nchars=10, size=k))
+    return Index(randu_array(nchars=10, size=k), name=name)


 def makeCategoricalIndex(k=10, n=3, name=None):
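With the ``tseries/base.py`` override deleted above, metadata preservation now flows through the shared implementation instead of a datetime-specific special case. A condensed view of the resulting dispatch (paraphrased, not the literal call chain), plus a check expected to hold on pandas 0.19.0 with this patch applied:

.. code-block:: python

   # IndexOpsMixin.unique : operates on self._values and defers to
   #                        values.unique() when available (Categorical,
   #                        tz-aware data), else falls back to unique1d
   # Index.unique         : wraps that result in self._shallow_copy(), so a
   #                        DatetimeIndex keeps its tz and a PeriodIndex
   #                        stays a PeriodIndex without a hand-written override
   # Series.unique        : still returns np.ndarray; tz-aware data comes
   #                        back as an object array of Timestamps
   import pandas as pd

   idx = pd.PeriodIndex(['2011-01', '2011-01', '2011-02'], freq='M')
   print(type(idx.unique()))  # <class 'pandas.tseries.period.PeriodIndex'>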