From 158d925c277faff3cf8849c2fd2cce74d512a815 Mon Sep 17 00:00:00 2001
From: "H. Vetinari"
Date: Sat, 2 Feb 2019 00:42:59 +0100
Subject: [PATCH 01/15] Code & whatsnew

---
 doc/source/whatsnew/v0.25.0.rst   | 21 ++++++++++++++++++
 pandas/core/algorithms.py         | 37 +++++++++++++++++++++++++++++--
 pandas/core/arrays/categorical.py | 27 ++++++++++++++++++----
 3 files changed, 79 insertions(+), 6 deletions(-)

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index 09626be713c4f..43a07cd57b4dd 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -14,6 +14,27 @@
 These are the changes in pandas 0.25.0. See :ref:`release` for a full changelog
 including other versions of pandas.
 
+Enhancements
+~~~~~~~~~~~~
+
+.. _whatsnew_0250.enhancements.unique:
+
+Changes to the ``unique`` method
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The method :meth:`pandas.unique` now supports the keyword ``return_inverse``. If passed,
+the output becomes a tuple whose second component is an ndarray that contains, for each
+element of the input, its position in the returned array of unique values.
+
+.. ipython:: python
+
+    idx = pd.Index([1, 0, 0, 1])
+    uniques, inverse = pd.unique(idx, return_inverse=True)
+    uniques
+    inverse
+    reconstruct = pd.Index(uniques[inverse])
+    reconstruct.equals(idx)
+
 .. _whatsnew_0250.enhancements.other:
 
 Other Enhancements
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index b473a7aef929e..4e2a3e512a929 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -275,7 +275,7 @@ def match(to_match, values, na_sentinel=-1):
     return result
 
 
-def unique(values):
+def unique(values, return_inverse=False):
     """
     Hash table-based unique. Uniques are returned in order of
     appearance. This does NOT sort.
@@ -285,6 +285,13 @@ def unique(values):
     Parameters
     ----------
     values : 1d array-like
+    return_inverse : boolean, default False
+        Whether to return the inverse of the unique values. If True, the
+        output will be a tuple of two np.ndarray. The second component
+        contains the mapping between the indices of the elements in
+        `values` and their locations in the unique values.
+
+        .. versionadded:: 0.25.0
 
     Returns
     -------
@@ -348,19 +355,45 @@ def unique(values):
     >>> pd.unique([('a', 'b'), ('b', 'a'), ('a', 'c'), ('b', 'a')])
     array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
     """
+    from pandas import Index
 
     values = _ensure_arraylike(values)
 
     if is_extension_array_dtype(values):
         # Dispatch to extension dtype's unique.
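+        # A minimal sketch of the intended round-trip (values mirror the
+        # whatsnew example; illustrative only, not exercised in this branch):
+        #   uniques, inverse = unique([1, 0, 0, 1], return_inverse=True)
+        #   # uniques -> array([1, 0]); inverse -> array([0, 1, 1, 0])
+        #   # so uniques[inverse] reconstructs the original ordering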
+        if return_inverse:
+            # as long as return_inverse is not part of the EA.unique contract,
+            # test if this works
+            try:
+                # make sure that we're not calling from an Index/Series
+                # container, as these do not support return_inverse yet
+                ea_val = getattr(values, 'array', values)
+                result, inverse = ea_val.unique(return_inverse=return_inverse)
+
+                if is_categorical_dtype(values) and isinstance(values, Index):
+                    # pd.unique(CategoricalIndex) returns Index not Categorical
+                    result = Index(result)
+                return result, inverse
+            except TypeError:
+                msg = ('The Extension Array class for type {dtype} does not '
+                       'yet support the unique-method with '
+                       '"return_inverse=True".'.format(dtype=type(values)))
+                raise NotImplementedError(msg)
         return values.unique()
 
     original = values
     htable, _, values, dtype, ndtype = _get_hashtable_algo(values)
 
     table = htable(len(values))
-    uniques = table.unique(values)
+    if return_inverse:
+        uniques, inverse = table.unique(values, return_inverse=True)
+    else:
+        uniques = table.unique(values)
+
     uniques = _reconstruct_data(uniques, dtype, original)
+
+    if return_inverse:
+        return uniques, inverse
 
     return uniques
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 35b662eaae9a5..cbb0098fe815e 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -2258,7 +2258,7 @@ def mode(self, dropna=True):
         codes = sorted(htable.mode_int64(ensure_int64(codes), dropna))
         return self._constructor(values=codes, dtype=self.dtype, fastpath=True)
 
-    def unique(self):
+    def unique(self, return_inverse=False):
         """
         Return the ``Categorical`` which ``categories`` and ``codes`` are
         unique. Unused categories are NOT returned.
@@ -2268,9 +2268,21 @@ def unique(self):
         - ordered category: values are sorted by appearance order, categories
           keeps existing order.
 
+        Parameters
+        ----------
+        return_inverse : boolean, default False
+            Whether to return the inverse of the unique values. If True, the
+            output will be a tuple of two np.ndarray. The second component
+            contains the mapping between the indices of the elements in the
+            calling Categorical and their locations in the unique values.
+
+            .. versionadded:: 0.25.0
+
         Returns
         -------
-        unique values : ``Categorical``
+        uniques : ``Categorical``
+        inverse : np.ndarray (if `return_inverse=True`)
+            The indices that map from `uniques` back to the calling
+            ``Categorical``.
 
         Examples
         --------
@@ -2302,7 +2314,10 @@ def unique(self):
         """
 
         # unlike np.unique, unique1d does not sort
-        unique_codes = unique1d(self.codes)
+        if return_inverse:
+            unique_codes, inverse = unique1d(self.codes, return_inverse=True)
+        else:
+            unique_codes = unique1d(self.codes, return_inverse=False)
         cat = self.copy()
 
         # keep nan in codes
@@ -2312,7 +2327,11 @@ def unique(self):
         take_codes = unique_codes[unique_codes != -1]
         if self.ordered:
             take_codes = np.sort(take_codes)
-        return cat.set_categories(cat.categories.take(take_codes))
+        result = cat.set_categories(cat.categories.take(take_codes))
+
+        if return_inverse:
+            return result, inverse
+        return result
 
     def _values_for_factorize(self):
         codes = self.codes.astype('int64')

From 49e42f1ca039b6484a87c99e8822c48061909a4d Mon Sep 17 00:00:00 2001
From: "H. 
Vetinari" Date: Sat, 2 Feb 2019 00:48:36 +0100 Subject: [PATCH 02/15] Extend and parametrize unique-test to all numpy dtypes --- pandas/tests/test_algos.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 3d28b17750540..4f2b3355521ec 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -324,14 +324,9 @@ def test_parametrized_factorize_na_value(self, data, na_value): class TestUnique(object): - def test_ints(self): - arr = np.random.randint(0, 100, size=50) - - result = algos.unique(arr) - assert isinstance(result, np.ndarray) - - def test_objects(self): - arr = np.random.randint(0, 100, size=50).astype('O') + def test_unique_all_dtypes(self, any_numpy_dtype): + dtype = any_numpy_dtype + arr = np.random.randint(0, 100, size=50).astype(dtype) result = algos.unique(arr) assert isinstance(result, np.ndarray) From 9285e8ce59e48090f26c08dff5f76cb80a38db32 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sat, 2 Feb 2019 00:49:23 +0100 Subject: [PATCH 03/15] Also test inverse in test_unique_all_dtypes --- pandas/tests/test_algos.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 4f2b3355521ec..0b82c77d1c80b 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -331,6 +331,16 @@ def test_unique_all_dtypes(self, any_numpy_dtype): result = algos.unique(arr) assert isinstance(result, np.ndarray) + # reuse result as expected outcome of return_inverse case + expected_uniques = result.copy() + + result_uniques, result_inverse = algos.unique(arr, return_inverse=True) + tm.assert_numpy_array_equal(result_uniques, expected_uniques) + + # reconstruction can only work if inverse is correct + reconstr = result_uniques[result_inverse] + tm.assert_numpy_array_equal(reconstr, arr, check_dtype=False) + def test_object_refcount_bug(self): lst = ['A', 'B', 'C', 'D', 'E'] for i in range(1000): From 45eff6715dce278071a13e69f931d7b0739beee1 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sat, 2 Feb 2019 00:51:18 +0100 Subject: [PATCH 04/15] Parametrize test_timedelta64_dtype_array_returned --- pandas/tests/test_algos.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 0b82c77d1c80b..b3c942c6e3783 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -384,24 +384,16 @@ def test_datetime64_dtype_array_returned(self): tm.assert_numpy_array_equal(result, expected) assert result.dtype == expected.dtype - def test_timedelta64_dtype_array_returned(self): + @pytest.mark.parametrize('box', [Index, Series, np.array]) + def test_timedelta64_dtype_array_returned(self, box): # GH 9431 expected = np.array([31200, 45678, 10000], dtype='m8[ns]') td_index = pd.to_timedelta([31200, 45678, 31200, 10000, 45678]) - result = algos.unique(td_index) - tm.assert_numpy_array_equal(result, expected) - assert result.dtype == expected.dtype + obj = box(td_index) - s = Series(td_index) - result = algos.unique(s) + result = algos.unique(obj) tm.assert_numpy_array_equal(result, expected) - assert result.dtype == expected.dtype - - arr = s.values - result = algos.unique(arr) - tm.assert_numpy_array_equal(result, expected) - assert result.dtype == expected.dtype def test_uint64_overflow(self): s = Series([1, 2, 2**63, 2**63], dtype=np.uint64) From de22c62cbe7acff1851441b25d7dd77992fd05cd Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Sat, 2 Feb 2019 00:52:39 +0100 Subject: [PATCH 05/15] Add inverse to test_timedelta64_dtype_array_returned --- pandas/tests/test_algos.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index b3c942c6e3783..296aa65a67461 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -28,6 +28,20 @@ from pandas.util.testing import assert_almost_equal +def assert_series_or_index_or_array_or_categorical_equal(left, right): + if isinstance(left, Series): + tm.assert_series_equal(left, right) + elif isinstance(left, Index): + tm.assert_index_equal(left, right) + elif isinstance(left, np.ndarray): + tm.assert_numpy_array_equal(left, right) + elif isinstance(left, Categorical): + tm.assert_categorical_equal(left, right) + else: + # will fail + assert isinstance(left, (Series, Index, np.ndarray, Categorical)) + + class TestMatch(object): def test_ints(self): @@ -395,6 +409,16 @@ def test_timedelta64_dtype_array_returned(self, box): result = algos.unique(obj) tm.assert_numpy_array_equal(result, expected) + # reuse result as expected outcome of return_inverse case + expected_uniques = result.copy() + + result_uniques, result_inverse = algos.unique(obj, return_inverse=True) + tm.assert_numpy_array_equal(result_uniques, expected_uniques) + + # reconstruction can only work if inverse is correct + reconstr = box(result_uniques[result_inverse]) + assert_series_or_index_or_array_or_categorical_equal(reconstr, obj) + def test_uint64_overflow(self): s = Series([1, 2, 2**63, 2**63], dtype=np.uint64) exp = np.array([1, 2, 2**63], dtype=np.uint64) From 2a4f4a7f82d98b93ec4aae761c2f2591409ed089 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sat, 2 Feb 2019 00:56:21 +0100 Subject: [PATCH 06/15] Add inverse to test_nan_in_object_array --- pandas/tests/test_algos.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 296aa65a67461..b37e8f004b1b5 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -430,6 +430,11 @@ def test_nan_in_object_array(self): expected = np.array(['a', np.nan, 'c'], dtype=object) tm.assert_numpy_array_equal(result, expected) + result_uniques, result_inverse = pd.unique(duplicated_items, + return_inverse=True) + expected_inverse = np.array([0, 1, 2, 2], dtype='int64') + tm.assert_numpy_array_equal(result_inverse, expected_inverse) + def test_categorical(self): # we are expecting to return in the order From 39a2e6418549b7dbfa362a4f0e62bf2c0abe3e8c Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Sat, 2 Feb 2019 01:19:42 +0100 Subject: [PATCH 07/15] Parametrize test_categorical --- pandas/tests/test_algos.py | 59 ++++++++++++-------------------------- 1 file changed, 19 insertions(+), 40 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index b37e8f004b1b5..129616c81b73e 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -435,49 +435,28 @@ def test_nan_in_object_array(self): expected_inverse = np.array([0, 1, 2, 2], dtype='int64') tm.assert_numpy_array_equal(result_inverse, expected_inverse) - def test_categorical(self): - - # we are expecting to return in the order - # of appearance - expected = Categorical(list('bac'), categories=list('bac')) - - # we are expecting to return in the order - # of the categories - expected_o = Categorical( - list('bac'), categories=list('abc'), ordered=True) + @pytest.mark.parametrize('ordered', [True, False]) + @pytest.mark.parametrize('box', [lambda x: x, Series, Index], + ids=['Categorical', 'Series', 'Index']) + @pytest.mark.parametrize('method', [lambda x, **kwargs: x.unique(**kwargs), + pd.unique], + ids=['classmethod', 'toplevel']) + def test_categorical(self, method, box, ordered): + + categories = list('abc') if ordered else list('bac') + expected = Categorical(list('bac'), categories=categories, + ordered=ordered) + + # Index.unique always returns Index + # pd.unique(Index) stays Index (only) for Categorical + expected = box(expected) if box == Index else expected # GH 15939 - c = Categorical(list('baabc')) - result = c.unique() - tm.assert_categorical_equal(result, expected) - - result = algos.unique(c) - tm.assert_categorical_equal(result, expected) - - c = Categorical(list('baabc'), ordered=True) - result = c.unique() - tm.assert_categorical_equal(result, expected_o) - - result = algos.unique(c) - tm.assert_categorical_equal(result, expected_o) + c = box(Categorical(list('baabc'), categories=categories, + ordered=ordered)) + result = method(c) - # Series of categorical dtype - s = Series(Categorical(list('baabc')), name='foo') - result = s.unique() - tm.assert_categorical_equal(result, expected) - - result = pd.unique(s) - tm.assert_categorical_equal(result, expected) - - # CI -> return CI - ci = CategoricalIndex(Categorical(list('baabc'), - categories=list('bac'))) - expected = CategoricalIndex(expected) - result = ci.unique() - tm.assert_index_equal(result, expected) - - result = pd.unique(ci) - tm.assert_index_equal(result, expected) + assert_series_or_index_or_array_or_categorical_equal(result, expected) def test_datetime64tz_aware(self): # GH 15939 From bfc310fbbe67818ee7a546499e87912a7700b211 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Sat, 2 Feb 2019 01:21:56 +0100 Subject: [PATCH 08/15] Add inverse to test_categorical --- pandas/tests/test_algos.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 129616c81b73e..c0e08b52f0de3 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -458,6 +458,20 @@ def test_categorical(self, method, box, ordered): assert_series_or_index_or_array_or_categorical_equal(result, expected) + if method == pd.unique: + # [Series/Index].unique do not yet support return_inverse=True + + # reuse result as expected outcome of return_inverse case + expected_uniques = result.copy() + result_uniques, result_inverse = method(c, return_inverse=True) + + assert_series_or_index_or_array_or_categorical_equal( + result_uniques, expected_uniques) + + # reconstruction can only work if inverse is correct + reconstr = box(result_uniques[result_inverse]) + assert_series_or_index_or_array_or_categorical_equal(reconstr, c) + def test_datetime64tz_aware(self): # GH 15939 From af7d8f3b33d87f0d8b96060278fa7a9b48df46a3 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sat, 2 Feb 2019 01:23:22 +0100 Subject: [PATCH 09/15] Parametrize test_datetime64tz_aware --- pandas/tests/test_algos.py | 60 ++++++++++++-------------------------- 1 file changed, 18 insertions(+), 42 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index c0e08b52f0de3..ae3cc055ddbc9 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -472,36 +472,26 @@ def test_categorical(self, method, box, ordered): reconstr = box(result_uniques[result_inverse]) assert_series_or_index_or_array_or_categorical_equal(reconstr, c) - def test_datetime64tz_aware(self): + @pytest.mark.parametrize('box', [Series, Index]) + @pytest.mark.parametrize('method', [lambda x, **kwargs: x.unique(**kwargs), + pd.unique], + ids=['classmethod', 'toplevel']) + def test_datetime64tz_aware(self, method, box): # GH 15939 - result = Series( - Index([Timestamp('20160101', tz='US/Eastern'), - Timestamp('20160101', tz='US/Eastern')])).unique() - expected = DatetimeArray._from_sequence(np.array([ - Timestamp('2016-01-01 00:00:00-0500', tz="US/Eastern") - ])) - tm.assert_extension_array_equal(result, expected) - - result = Index([Timestamp('20160101', tz='US/Eastern'), - Timestamp('20160101', tz='US/Eastern')]).unique() - expected = DatetimeIndex(['2016-01-01 00:00:00'], - dtype='datetime64[ns, US/Eastern]', freq=None) - tm.assert_index_equal(result, expected) - - result = pd.unique( - Series(Index([Timestamp('20160101', tz='US/Eastern'), - Timestamp('20160101', tz='US/Eastern')]))) - expected = DatetimeArray._from_sequence(np.array([ - Timestamp('2016-01-01', tz="US/Eastern"), - ])) - tm.assert_extension_array_equal(result, expected) - - result = pd.unique(Index([Timestamp('20160101', tz='US/Eastern'), - Timestamp('20160101', tz='US/Eastern')])) - expected = DatetimeIndex(['2016-01-01 00:00:00'], - dtype='datetime64[ns, US/Eastern]', freq=None) - tm.assert_index_equal(result, expected) + ts = Timestamp('20160101', tz='US/Eastern') + obj = box([ts, ts]) + + if box == Series: + expected = DatetimeArray._from_sequence(np.array([ + Timestamp('2016-01-01 00:00:00-0500', tz="US/Eastern") + ])) + else: # Index + expected = DatetimeIndex(['2016-01-01 00:00:00'], + dtype='datetime64[ns, US/Eastern]', + freq=None) + result = method(obj) + assert_series_or_index_or_array_or_categorical_equal(result, expected) def 
test_order_of_appearance(self): # 9346 @@ -515,20 +505,6 @@ def test_order_of_appearance(self): tm.assert_numpy_array_equal(result, np.array([2, 1], dtype='int64')) - result = pd.unique(Series([Timestamp('20160101'), - Timestamp('20160101')])) - expected = np.array(['2016-01-01T00:00:00.000000000'], - dtype='datetime64[ns]') - tm.assert_numpy_array_equal(result, expected) - - result = pd.unique(Index( - [Timestamp('20160101', tz='US/Eastern'), - Timestamp('20160101', tz='US/Eastern')])) - expected = DatetimeIndex(['2016-01-01 00:00:00'], - dtype='datetime64[ns, US/Eastern]', - freq=None) - tm.assert_index_equal(result, expected) - result = pd.unique(list('aabc')) expected = np.array(['a', 'b', 'c'], dtype=object) tm.assert_numpy_array_equal(result, expected) From c089f1fb6fb5036cb2e93c375fff6a1f66ebaf88 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sat, 2 Feb 2019 01:23:55 +0100 Subject: [PATCH 10/15] Add inverse to test_datetime64tz_aware --- pandas/tests/test_algos.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index ae3cc055ddbc9..559c40f709e86 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -493,6 +493,20 @@ def test_datetime64tz_aware(self, method, box): result = method(obj) assert_series_or_index_or_array_or_categorical_equal(result, expected) + if method == pd.unique: + # [Series/Index].unique do not yet support return_inverse=True + + # reuse result as expected outcome of return_inverse case + expected_uniques = result.copy() + result_uniques, result_inverse = method(obj, return_inverse=True) + + assert_series_or_index_or_array_or_categorical_equal( + result_uniques, expected_uniques) + + # reconstruction can only work if inverse is correct + reconstr = box(result_uniques[result_inverse]) + assert_series_or_index_or_array_or_categorical_equal(reconstr, obj) + def test_order_of_appearance(self): # 9346 # light testing of guarantee of order of appearance From 78d4758f20618632ee693937735b1fe7abbe8937 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sat, 2 Feb 2019 01:24:18 +0100 Subject: [PATCH 11/15] Remove test case that is covered elsewhere --- pandas/tests/test_algos.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 559c40f709e86..3fbba9236992d 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -523,10 +523,6 @@ def test_order_of_appearance(self): expected = np.array(['a', 'b', 'c'], dtype=object) tm.assert_numpy_array_equal(result, expected) - result = pd.unique(Series(Categorical(list('aabc')))) - expected = Categorical(list('abc')) - tm.assert_categorical_equal(result, expected) - @pytest.mark.parametrize("arg ,expected", [ (('1', '1', '2'), np.array(['1', '2'], dtype=object)), (('foo',), np.array(['foo'], dtype=object)) From 006d7ad8403e7e856846280e5de05b6380d2397b Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Sat, 2 Feb 2019 01:50:55 +0100 Subject: [PATCH 12/15] Fix DatetimeArray-case in test_datetime64tz_aware; create TODO --- pandas/tests/test_algos.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 3fbba9236992d..971ea26cbe03d 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -37,9 +37,12 @@ def assert_series_or_index_or_array_or_categorical_equal(left, right): tm.assert_numpy_array_equal(left, right) elif isinstance(left, Categorical): tm.assert_categorical_equal(left, right) + elif isinstance(left, DatetimeArray): + tm.assert_extension_array_equal(left, right) else: # will fail - assert isinstance(left, (Series, Index, np.ndarray, Categorical)) + assert isinstance(left, (Series, Index, np.ndarray, + Categorical, DatetimeArray)) class TestMatch(object): @@ -493,19 +496,19 @@ def test_datetime64tz_aware(self, method, box): result = method(obj) assert_series_or_index_or_array_or_categorical_equal(result, expected) - if method == pd.unique: - # [Series/Index].unique do not yet support return_inverse=True - - # reuse result as expected outcome of return_inverse case - expected_uniques = result.copy() - result_uniques, result_inverse = method(obj, return_inverse=True) - - assert_series_or_index_or_array_or_categorical_equal( - result_uniques, expected_uniques) - - # reconstruction can only work if inverse is correct - reconstr = box(result_uniques[result_inverse]) - assert_series_or_index_or_array_or_categorical_equal(reconstr, obj) + # TODO: add support for return_inverse to DatetimeArray/DatetimeIndex, + # as well as [[Series/Index].unique + + # # reuse result as expected outcome of return_inverse case + # expected_uniques = result.copy() + # result_uniques, result_inverse = method(obj, return_inverse=True) + # + # assert_series_or_index_or_array_or_categorical_equal( + # result_uniques, expected_uniques) + # + # # reconstruction can only work if inverse is correct + # reconstr = box(result_uniques[result_inverse]) + # assert_series_or_index_or_array_or_categorical_equal(reconstr, obj) def test_order_of_appearance(self): # 9346 From b310b487ceb54413ad291ab5ef6d95a469d81d72 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Fri, 11 Oct 2019 13:51:27 +0200 Subject: [PATCH 13/15] blackify conflict files --- pandas/core/algorithms.py | 595 +++++++++-------- pandas/tests/test_algos.py | 1280 +++++++++++++++++++++++------------- 2 files changed, 1146 insertions(+), 729 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4e2a3e512a929..a1bd66b58d642 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -14,16 +14,37 @@ from pandas.util._decorators import Appender, Substitution, deprecate_kwarg from pandas.core.dtypes.cast import ( - construct_1d_object_array_from_listlike, maybe_promote) + construct_1d_object_array_from_listlike, + maybe_promote, +) from pandas.core.dtypes.common import ( - ensure_float64, ensure_int64, ensure_object, ensure_platform_int, - ensure_uint64, is_array_like, is_bool_dtype, is_categorical_dtype, - is_complex_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype, - is_datetimelike, is_extension_array_dtype, is_float_dtype, - is_integer_dtype, is_interval_dtype, is_list_like, is_numeric_dtype, - is_object_dtype, is_period_dtype, is_scalar, is_signed_integer_dtype, - is_sparse, is_timedelta64_dtype, is_unsigned_integer_dtype, - needs_i8_conversion) + ensure_float64, + ensure_int64, + ensure_object, + ensure_platform_int, + ensure_uint64, + is_array_like, + is_bool_dtype, + is_categorical_dtype, + is_complex_dtype, + is_datetime64_any_dtype, + is_datetime64tz_dtype, + is_datetimelike, + is_extension_array_dtype, + is_float_dtype, + is_integer_dtype, + is_interval_dtype, + is_list_like, + is_numeric_dtype, + is_object_dtype, + is_period_dtype, + is_scalar, + is_signed_integer_dtype, + is_sparse, + is_timedelta64_dtype, + is_unsigned_integer_dtype, + needs_i8_conversion, +) from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna, na_value_for_dtype @@ -63,20 +84,19 @@ def _ensure_data(values, dtype=None): # we check some simple dtypes first try: if is_object_dtype(dtype): - return ensure_object(np.asarray(values)), 'object', 'object' + return ensure_object(np.asarray(values)), "object", "object" if is_bool_dtype(values) or is_bool_dtype(dtype): # we are actually coercing to uint64 # until our algos support uint8 directly (see TODO) - return np.asarray(values).astype('uint64'), 'bool', 'uint64' + return np.asarray(values).astype("uint64"), "bool", "uint64" elif is_signed_integer_dtype(values) or is_signed_integer_dtype(dtype): - return ensure_int64(values), 'int64', 'int64' - elif (is_unsigned_integer_dtype(values) or - is_unsigned_integer_dtype(dtype)): - return ensure_uint64(values), 'uint64', 'uint64' + return ensure_int64(values), "int64", "int64" + elif is_unsigned_integer_dtype(values) or is_unsigned_integer_dtype(dtype): + return ensure_uint64(values), "uint64", "uint64" elif is_float_dtype(values) or is_float_dtype(dtype): - return ensure_float64(values), 'float64', 'float64' + return ensure_float64(values), "float64", "float64" elif is_object_dtype(values) and dtype is None: - return ensure_object(np.asarray(values)), 'object', 'object' + return ensure_object(np.asarray(values)), "object", "object" elif is_complex_dtype(values) or is_complex_dtype(dtype): # ignore the fact that we are casting to float @@ -84,49 +104,55 @@ def _ensure_data(values, dtype=None): with catch_warnings(): simplefilter("ignore", np.ComplexWarning) values = ensure_float64(values) - return values, 'float64', 'float64' + return values, "float64", "float64" except (TypeError, 
ValueError, OverflowError): # if we are trying to coerce to a dtype # and it is incompat this will fall thru to here - return ensure_object(values), 'object', 'object' + return ensure_object(values), "object", "object" # datetimelike - if (needs_i8_conversion(values) or - is_period_dtype(dtype) or - is_datetime64_any_dtype(dtype) or - is_timedelta64_dtype(dtype)): + if ( + needs_i8_conversion(values) + or is_period_dtype(dtype) + or is_datetime64_any_dtype(dtype) + or is_timedelta64_dtype(dtype) + ): if is_period_dtype(values) or is_period_dtype(dtype): from pandas import PeriodIndex + values = PeriodIndex(values) dtype = values.dtype elif is_timedelta64_dtype(values) or is_timedelta64_dtype(dtype): from pandas import TimedeltaIndex + values = TimedeltaIndex(values) dtype = values.dtype else: # Datetime from pandas import DatetimeIndex + values = DatetimeIndex(values) dtype = values.dtype - return values.asi8, dtype, 'int64' + return values.asi8, dtype, "int64" - elif (is_categorical_dtype(values) and - (is_categorical_dtype(dtype) or dtype is None)): - values = getattr(values, 'values', values) + elif is_categorical_dtype(values) and ( + is_categorical_dtype(dtype) or dtype is None + ): + values = getattr(values, "values", values) values = values.codes - dtype = 'category' + dtype = "category" # we are actually coercing to int64 # until our algos support int* directly (not all do) values = ensure_int64(values) - return values, dtype, 'int64' + return values, dtype, "int64" # we have failed, return object values = np.asarray(values, dtype=np.object) - return ensure_object(values), 'object', 'object' + return ensure_object(values), "object", "object" def _reconstruct_data(values, dtype, original): @@ -144,6 +170,7 @@ def _reconstruct_data(values, dtype, original): Index for extension types, otherwise ndarray casted to dtype """ from pandas import Index + if is_extension_array_dtype(dtype): values = dtype.construct_array_type()._from_sequence(values) elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype): @@ -166,7 +193,7 @@ def _ensure_arraylike(values): """ if not is_array_like(values): inferred = lib.infer_dtype(values, skipna=False) - if inferred in ['mixed', 'string', 'unicode']: + if inferred in ["mixed", "string", "unicode"]: if isinstance(values, tuple): values = list(values) values = construct_1d_object_array_from_listlike(values) @@ -176,11 +203,11 @@ def _ensure_arraylike(values): _hashtables = { - 'float64': (htable.Float64HashTable, htable.Float64Vector), - 'uint64': (htable.UInt64HashTable, htable.UInt64Vector), - 'int64': (htable.Int64HashTable, htable.Int64Vector), - 'string': (htable.StringHashTable, htable.ObjectVector), - 'object': (htable.PyObjectHashTable, htable.ObjectVector) + "float64": (htable.Float64HashTable, htable.Float64Vector), + "uint64": (htable.UInt64HashTable, htable.UInt64Vector), + "int64": (htable.Int64HashTable, htable.Int64Vector), + "string": (htable.StringHashTable, htable.ObjectVector), + "object": (htable.PyObjectHashTable, htable.ObjectVector), } @@ -200,15 +227,15 @@ def _get_hashtable_algo(values): """ values, dtype, ndtype = _ensure_data(values) - if ndtype == 'object': + if ndtype == "object": # it's cheaper to use a String Hash Table than Object; we infer # including nulls because that is the only difference between # StringHashTable and ObjectHashtable - if lib.infer_dtype(values, skipna=False) in ['string']: - ndtype = 'string' + if lib.infer_dtype(values, skipna=False) in ["string"]: + ndtype = "string" else: - ndtype = 'object' + 
ndtype = "object" htable, table = _hashtables[ndtype] return (htable, table, values, dtype, ndtype) @@ -220,15 +247,15 @@ def _get_data_algo(values, func_map): values = values._values_for_rank() values, dtype, ndtype = _ensure_data(values) - if ndtype == 'object': + if ndtype == "object": # it's cheaper to use a String Hash Table than Object; we infer # including nulls because that is the only difference between # StringHashTable and ObjectHashtable - if lib.infer_dtype(values, skipna=False) in ['string']: - ndtype = 'string' + if lib.infer_dtype(values, skipna=False) in ["string"]: + ndtype = "string" - f = func_map.get(ndtype, func_map['object']) + f = func_map.get(ndtype, func_map["object"]) return f, values @@ -237,6 +264,7 @@ def _get_data_algo(values, func_map): # top-level algos # # --------------- # + def match(to_match, values, na_sentinel=-1): """ Compute locations of to_match into values @@ -269,6 +297,7 @@ def match(to_match, values, na_sentinel=-1): # replace but return a numpy array # use a Series because it handles dtype conversions properly from pandas import Series + result = Series(result.ravel()).replace(-1, na_sentinel) result = result.values.reshape(result.shape) @@ -367,7 +396,7 @@ def unique(values, return_inverse=False): try: # make sure that we're not calling from an Index/Series # container, as these do not support return_inverse yet - ea_val = getattr(values, 'array', values) + ea_val = getattr(values, "array", values) result, inverse = ea_val.unique(return_inverse=return_inverse) if is_categorical_dtype(values) and isinstance(values, Index): @@ -375,9 +404,11 @@ def unique(values, return_inverse=False): result = Index(result) return result, inverse except TypeError: - msg = ('The Extension Array class for type {dtype} does not ' - 'yet support the unique-method with ' - '"return_inverse=True".'.format(dtype=type(values))) + msg = ( + "The Extension Array class for type {dtype} does not " + "yet support the unique-method with " + '"return_inverse=True".'.format(dtype=type(values)) + ) raise NotImplementedError(msg) return values.unique() @@ -415,13 +446,19 @@ def isin(comps, values): """ if not is_list_like(comps): - raise TypeError("only list-like objects are allowed to be passed" - " to isin(), you passed a [{comps_type}]" - .format(comps_type=type(comps).__name__)) + raise TypeError( + "only list-like objects are allowed to be passed" + " to isin(), you passed a [{comps_type}]".format( + comps_type=type(comps).__name__ + ) + ) if not is_list_like(values): - raise TypeError("only list-like objects are allowed to be passed" - " to isin(), you passed a [{values_type}]" - .format(values_type=type(values).__name__)) + raise TypeError( + "only list-like objects are allowed to be passed" + " to isin(), you passed a [{values_type}]".format( + values_type=type(values).__name__ + ) + ) if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)): values = construct_1d_object_array_from_listlike(list(values)) @@ -445,8 +482,8 @@ def isin(comps, values): f = lambda x, y: np.in1d(x, y) elif is_integer_dtype(comps): try: - values = values.astype('int64', copy=False) - comps = comps.astype('int64', copy=False) + values = values.astype("int64", copy=False) + comps = comps.astype("int64", copy=False) f = lambda x, y: htable.ismember_int64(x, y) except (TypeError, ValueError, OverflowError): values = values.astype(object) @@ -454,8 +491,8 @@ def isin(comps, values): elif is_float_dtype(comps): try: - values = values.astype('float64', copy=False) - comps = 
comps.astype('float64', copy=False) + values = values.astype("float64", copy=False) + comps = comps.astype("float64", copy=False) f = lambda x, y: htable.ismember_float64(x, y) except (TypeError, ValueError): values = values.astype(object) @@ -464,8 +501,7 @@ def isin(comps, values): return f(comps, values) -def _factorize_array(values, na_sentinel=-1, size_hint=None, - na_value=None): +def _factorize_array(values, na_sentinel=-1, size_hint=None, na_value=None): """Factorize an array-like to labels and uniques. This doesn't do any coercion of types or unboxing before factorization. @@ -489,14 +525,17 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None, (hash_klass, _), values = _get_data_algo(values, _hashtables) table = hash_klass(size_hint or len(values)) - uniques, labels = table.factorize(values, na_sentinel=na_sentinel, - na_value=na_value) + uniques, labels = table.factorize( + values, na_sentinel=na_sentinel, na_value=na_value + ) labels = ensure_platform_int(labels) return labels, uniques -_shared_docs['factorize'] = """ +_shared_docs[ + "factorize" +] = """ Encode the object as an enumerated type or categorical variable. This method is useful for obtaining a numeric representation of an @@ -590,29 +629,37 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None, @Substitution( - values=dedent("""\ + values=dedent( + """\ values : sequence A 1-D sequence. Sequences that aren't pandas objects are coerced to ndarrays before factorization. - """), - order=dedent("""\ + """ + ), + order=dedent( + """\ order .. deprecated:: 0.23.0 This parameter has no effect and is deprecated. - """), - sort=dedent("""\ + """ + ), + sort=dedent( + """\ sort : bool, default False Sort `uniques` and shuffle `labels` to maintain the relationship. - """), - size_hint=dedent("""\ + """ + ), + size_hint=dedent( + """\ size_hint : int, optional Hint to the hashtable sizer. - """), + """ + ), ) -@Appender(_shared_docs['factorize']) -@deprecate_kwarg(old_arg_name='order', new_arg_name=None) +@Appender(_shared_docs["factorize"]) +@deprecate_kwarg(old_arg_name="order", new_arg_name=None) def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): # Implementation notes: This method is responsible for 3 things # 1.) coercing data to array-like (ndarray, Index, extension array) @@ -627,26 +674,28 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): original = values if is_extension_array_dtype(values): - values = getattr(values, '_values', values) + values = getattr(values, "_values", values) labels, uniques = values.factorize(na_sentinel=na_sentinel) dtype = original.dtype else: values, dtype, _ = _ensure_data(values) - if (is_datetime64_any_dtype(original) or - is_timedelta64_dtype(original) or - is_period_dtype(original)): + if ( + is_datetime64_any_dtype(original) + or is_timedelta64_dtype(original) + or is_period_dtype(original) + ): na_value = na_value_for_dtype(original.dtype) else: na_value = None - labels, uniques = _factorize_array(values, - na_sentinel=na_sentinel, - size_hint=size_hint, - na_value=na_value) + labels, uniques = _factorize_array( + values, na_sentinel=na_sentinel, size_hint=size_hint, na_value=na_value + ) if sort and len(uniques) > 0: from pandas.core.sorting import safe_sort + try: order = uniques.argsort() order2 = order.argsort() @@ -654,9 +703,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): uniques = uniques.take(order) except TypeError: # Mixed types, where uniques.argsort fails. 
- uniques, labels = safe_sort(uniques, labels, - na_sentinel=na_sentinel, - assume_unique=True) + uniques, labels = safe_sort( + uniques, labels, na_sentinel=na_sentinel, assume_unique=True + ) uniques = _reconstruct_data(uniques, dtype, original) @@ -665,13 +714,15 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): uniques = original._shallow_copy(uniques, name=None) elif isinstance(original, ABCSeries): from pandas import Index + uniques = Index(uniques) return labels, uniques -def value_counts(values, sort=True, ascending=False, normalize=False, - bins=None, dropna=True): +def value_counts( + values, sort=True, ascending=False, normalize=False, bins=None, dropna=True +): """ Compute a histogram of the counts of non-null values. @@ -696,11 +747,13 @@ def value_counts(values, sort=True, ascending=False, normalize=False, """ from pandas.core.series import Series, Index - name = getattr(values, 'name', None) + + name = getattr(values, "name", None) if bins is not None: try: from pandas.core.reshape.tile import cut + values = Series(values) ii = cut(values, bins, include_lowest=True) except TypeError: @@ -709,7 +762,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False, # count, remove nulls (from the index), and but the bins result = ii.value_counts(dropna=dropna) result = result[result.index.notna()] - result.index = result.index.astype('interval') + result.index = result.index.astype("interval") result = result.sort_index() # if we are dropna and we have NO values @@ -787,7 +840,7 @@ def _value_counts_arraylike(values, dropna): return keys, counts -def duplicated(values, keep='first'): +def duplicated(values, keep="first"): """ Return boolean ndarray denoting duplicate values. @@ -859,8 +912,7 @@ def mode(values, dropna=True): return Series(result) -def rank(values, axis=0, method='average', na_option='keep', - ascending=True, pct=False): +def rank(values, axis=0, method="average", na_option="keep", ascending=True, pct=False): """ Rank the values along a given axis. 
@@ -886,12 +938,23 @@ def rank(values, axis=0, method='average', na_option='keep', """ if values.ndim == 1: f, values = _get_data_algo(values, _rank1d_functions) - ranks = f(values, ties_method=method, ascending=ascending, - na_option=na_option, pct=pct) + ranks = f( + values, + ties_method=method, + ascending=ascending, + na_option=na_option, + pct=pct, + ) elif values.ndim == 2: f, values = _get_data_algo(values, _rank2d_functions) - ranks = f(values, axis=axis, ties_method=method, - ascending=ascending, na_option=na_option, pct=pct) + ranks = f( + values, + axis=axis, + ties_method=method, + ascending=ascending, + na_option=na_option, + pct=pct, + ) else: raise TypeError("Array with ndim > 2 are not supported.") @@ -962,10 +1025,12 @@ def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None): elif not mask2.any(): to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any() else: - to_raise = (((np.iinfo(np.int64).max - - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() or - ((np.iinfo(np.int64).min - - b2[mask2] > arr[mask2]) & not_nan[mask2]).any()) + to_raise = ( + ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() + or ( + (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2] + ).any() + ) if to_raise: raise OverflowError("Overflow in int64 addition") @@ -973,21 +1038,21 @@ def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None): _rank1d_functions = { - 'float64': algos.rank_1d_float64, - 'int64': algos.rank_1d_int64, - 'uint64': algos.rank_1d_uint64, - 'object': algos.rank_1d_object + "float64": algos.rank_1d_float64, + "int64": algos.rank_1d_int64, + "uint64": algos.rank_1d_uint64, + "object": algos.rank_1d_object, } _rank2d_functions = { - 'float64': algos.rank_2d_float64, - 'int64': algos.rank_2d_int64, - 'uint64': algos.rank_2d_uint64, - 'object': algos.rank_2d_object + "float64": algos.rank_2d_float64, + "int64": algos.rank_2d_int64, + "uint64": algos.rank_2d_uint64, + "object": algos.rank_2d_object, } -def quantile(x, q, interpolation_method='fraction'): +def quantile(x, q, interpolation_method="fraction"): """ Compute sample quantile or quantiles of the input array. For example, q=0.5 computes the median. 
@@ -1047,16 +1112,17 @@ def _get_score(at): if idx % 1 == 0: score = values[int(idx)] else: - if interpolation_method == 'fraction': - score = _interpolate(values[int(idx)], values[int(idx) + 1], - idx % 1) - elif interpolation_method == 'lower': + if interpolation_method == "fraction": + score = _interpolate(values[int(idx)], values[int(idx) + 1], idx % 1) + elif interpolation_method == "lower": score = values[np.floor(idx)] - elif interpolation_method == 'higher': + elif interpolation_method == "higher": score = values[np.ceil(idx)] else: - raise ValueError("interpolation_method can only be 'fraction' " - ", 'lower' or 'higher'") + raise ValueError( + "interpolation_method can only be 'fraction' " + ", 'lower' or 'higher'" + ) return score @@ -1071,21 +1137,21 @@ def _get_score(at): # select n # # --------------- # -class SelectN(object): +class SelectN(object): def __init__(self, obj, n, keep): self.obj = obj self.n = n self.keep = keep - if self.keep not in ('first', 'last', 'all'): + if self.keep not in ("first", "last", "all"): raise ValueError('keep must be either "first", "last" or "all"') def nlargest(self): - return self.compute('nlargest') + return self.compute("nlargest") def nsmallest(self): - return self.compute('nsmallest') + return self.compute("nsmallest") @staticmethod def is_valid_dtype_n_method(dtype): @@ -1093,8 +1159,9 @@ def is_valid_dtype_n_method(dtype): Helper function to determine if dtype is valid for nsmallest/nlargest methods """ - return ((is_numeric_dtype(dtype) and not is_complex_dtype(dtype)) or - needs_i8_conversion(dtype)) + return ( + is_numeric_dtype(dtype) and not is_complex_dtype(dtype) + ) or needs_i8_conversion(dtype) class SelectNSeries(SelectN): @@ -1117,9 +1184,10 @@ def compute(self, method): n = self.n dtype = self.obj.dtype if not self.is_valid_dtype_n_method(dtype): - raise TypeError("Cannot use method '{method}' with " - "dtype {dtype}".format(method=method, - dtype=dtype)) + raise TypeError( + "Cannot use method '{method}' with " + "dtype {dtype}".format(method=method, dtype=dtype) + ) if n <= 0: return self.obj[[]] @@ -1129,20 +1197,20 @@ def compute(self, method): # slow method if n >= len(self.obj): - reverse_it = (self.keep == 'last' or method == 'nlargest') - ascending = method == 'nsmallest' + reverse_it = self.keep == "last" or method == "nlargest" + ascending = method == "nsmallest" slc = np.s_[::-1] if reverse_it else np.s_[:] return dropped[slc].sort_values(ascending=ascending).head(n) # fast method arr, pandas_dtype, _ = _ensure_data(dropped.values) - if method == 'nlargest': + if method == "nlargest": arr = -arr if is_integer_dtype(pandas_dtype): # GH 21426: ensure reverse ordering at boundaries arr -= 1 - if self.keep == 'last': + if self.keep == "last": arr = arr[::-1] narr = len(arr) @@ -1150,12 +1218,12 @@ def compute(self, method): kth_val = algos.kth_smallest(arr.copy(), n - 1) ns, = np.nonzero(arr <= kth_val) - inds = ns[arr[ns].argsort(kind='mergesort')] + inds = ns[arr[ns].argsort(kind="mergesort")] - if self.keep != 'all': + if self.keep != "all": inds = inds[:n] - if self.keep == 'last': + if self.keep == "last": # reverse indices inds = narr - 1 - inds @@ -1188,6 +1256,7 @@ def __init__(self, obj, n, keep, columns): def compute(self, method): from pandas import Int64Index + n = self.n frame = self.obj columns = self.columns @@ -1195,16 +1264,18 @@ def compute(self, method): for column in columns: dtype = frame[column].dtype if not self.is_valid_dtype_n_method(dtype): - raise TypeError(( - "Column {column!r} has 
dtype {dtype}, cannot use method " - "{method!r} with this dtype" - ).format(column=column, dtype=dtype, method=method)) + raise TypeError( + ( + "Column {column!r} has dtype {dtype}, cannot use method " + "{method!r} with this dtype" + ).format(column=column, dtype=dtype, method=method) + ) def get_indexer(current_indexer, other_indexer): """Helper function to concat `current_indexer` and `other_indexer` depending on `method` """ - if method == 'nsmallest': + if method == "nsmallest": return current_indexer.append(other_indexer) else: return other_indexer.append(current_indexer) @@ -1226,8 +1297,8 @@ def get_indexer(current_indexer, other_indexer): series = cur_frame[column] is_last_column = len(columns) - 1 == i values = getattr(series, method)( - cur_n, - keep=self.keep if is_last_column else 'all') + cur_n, keep=self.keep if is_last_column else "all" + ) if is_last_column or len(values) <= cur_n: indexer = get_indexer(indexer, values.index) @@ -1260,12 +1331,9 @@ def get_indexer(current_indexer, other_indexer): if len(columns) == 1: return frame - ascending = method == 'nsmallest' + ascending = method == "nsmallest" - return frame.sort_values( - columns, - ascending=ascending, - kind='mergesort') + return frame.sort_values(columns, ascending=ascending, kind="mergesort") # ------- ## ---- # @@ -1334,110 +1402,103 @@ def _take_nd_object(arr, indexer, out, axis, fill_value, mask_info): _take_1d_dict = { - ('int8', 'int8'): algos.take_1d_int8_int8, - ('int8', 'int32'): algos.take_1d_int8_int32, - ('int8', 'int64'): algos.take_1d_int8_int64, - ('int8', 'float64'): algos.take_1d_int8_float64, - ('int16', 'int16'): algos.take_1d_int16_int16, - ('int16', 'int32'): algos.take_1d_int16_int32, - ('int16', 'int64'): algos.take_1d_int16_int64, - ('int16', 'float64'): algos.take_1d_int16_float64, - ('int32', 'int32'): algos.take_1d_int32_int32, - ('int32', 'int64'): algos.take_1d_int32_int64, - ('int32', 'float64'): algos.take_1d_int32_float64, - ('int64', 'int64'): algos.take_1d_int64_int64, - ('int64', 'float64'): algos.take_1d_int64_float64, - ('float32', 'float32'): algos.take_1d_float32_float32, - ('float32', 'float64'): algos.take_1d_float32_float64, - ('float64', 'float64'): algos.take_1d_float64_float64, - ('object', 'object'): algos.take_1d_object_object, - ('bool', 'bool'): _view_wrapper(algos.take_1d_bool_bool, np.uint8, - np.uint8), - ('bool', 'object'): _view_wrapper(algos.take_1d_bool_object, np.uint8, - None), - ('datetime64[ns]', 'datetime64[ns]'): _view_wrapper( - algos.take_1d_int64_int64, np.int64, np.int64, np.int64) + ("int8", "int8"): algos.take_1d_int8_int8, + ("int8", "int32"): algos.take_1d_int8_int32, + ("int8", "int64"): algos.take_1d_int8_int64, + ("int8", "float64"): algos.take_1d_int8_float64, + ("int16", "int16"): algos.take_1d_int16_int16, + ("int16", "int32"): algos.take_1d_int16_int32, + ("int16", "int64"): algos.take_1d_int16_int64, + ("int16", "float64"): algos.take_1d_int16_float64, + ("int32", "int32"): algos.take_1d_int32_int32, + ("int32", "int64"): algos.take_1d_int32_int64, + ("int32", "float64"): algos.take_1d_int32_float64, + ("int64", "int64"): algos.take_1d_int64_int64, + ("int64", "float64"): algos.take_1d_int64_float64, + ("float32", "float32"): algos.take_1d_float32_float32, + ("float32", "float64"): algos.take_1d_float32_float64, + ("float64", "float64"): algos.take_1d_float64_float64, + ("object", "object"): algos.take_1d_object_object, + ("bool", "bool"): _view_wrapper(algos.take_1d_bool_bool, np.uint8, np.uint8), + ("bool", "object"): 
_view_wrapper(algos.take_1d_bool_object, np.uint8, None), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + algos.take_1d_int64_int64, np.int64, np.int64, np.int64 + ), } _take_2d_axis0_dict = { - ('int8', 'int8'): algos.take_2d_axis0_int8_int8, - ('int8', 'int32'): algos.take_2d_axis0_int8_int32, - ('int8', 'int64'): algos.take_2d_axis0_int8_int64, - ('int8', 'float64'): algos.take_2d_axis0_int8_float64, - ('int16', 'int16'): algos.take_2d_axis0_int16_int16, - ('int16', 'int32'): algos.take_2d_axis0_int16_int32, - ('int16', 'int64'): algos.take_2d_axis0_int16_int64, - ('int16', 'float64'): algos.take_2d_axis0_int16_float64, - ('int32', 'int32'): algos.take_2d_axis0_int32_int32, - ('int32', 'int64'): algos.take_2d_axis0_int32_int64, - ('int32', 'float64'): algos.take_2d_axis0_int32_float64, - ('int64', 'int64'): algos.take_2d_axis0_int64_int64, - ('int64', 'float64'): algos.take_2d_axis0_int64_float64, - ('float32', 'float32'): algos.take_2d_axis0_float32_float32, - ('float32', 'float64'): algos.take_2d_axis0_float32_float64, - ('float64', 'float64'): algos.take_2d_axis0_float64_float64, - ('object', 'object'): algos.take_2d_axis0_object_object, - ('bool', 'bool'): _view_wrapper(algos.take_2d_axis0_bool_bool, np.uint8, - np.uint8), - ('bool', 'object'): _view_wrapper(algos.take_2d_axis0_bool_object, - np.uint8, None), - ('datetime64[ns]', 'datetime64[ns]'): - _view_wrapper(algos.take_2d_axis0_int64_int64, np.int64, np.int64, - fill_wrap=np.int64) + ("int8", "int8"): algos.take_2d_axis0_int8_int8, + ("int8", "int32"): algos.take_2d_axis0_int8_int32, + ("int8", "int64"): algos.take_2d_axis0_int8_int64, + ("int8", "float64"): algos.take_2d_axis0_int8_float64, + ("int16", "int16"): algos.take_2d_axis0_int16_int16, + ("int16", "int32"): algos.take_2d_axis0_int16_int32, + ("int16", "int64"): algos.take_2d_axis0_int16_int64, + ("int16", "float64"): algos.take_2d_axis0_int16_float64, + ("int32", "int32"): algos.take_2d_axis0_int32_int32, + ("int32", "int64"): algos.take_2d_axis0_int32_int64, + ("int32", "float64"): algos.take_2d_axis0_int32_float64, + ("int64", "int64"): algos.take_2d_axis0_int64_int64, + ("int64", "float64"): algos.take_2d_axis0_int64_float64, + ("float32", "float32"): algos.take_2d_axis0_float32_float32, + ("float32", "float64"): algos.take_2d_axis0_float32_float64, + ("float64", "float64"): algos.take_2d_axis0_float64_float64, + ("object", "object"): algos.take_2d_axis0_object_object, + ("bool", "bool"): _view_wrapper(algos.take_2d_axis0_bool_bool, np.uint8, np.uint8), + ("bool", "object"): _view_wrapper(algos.take_2d_axis0_bool_object, np.uint8, None), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + algos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64 + ), } _take_2d_axis1_dict = { - ('int8', 'int8'): algos.take_2d_axis1_int8_int8, - ('int8', 'int32'): algos.take_2d_axis1_int8_int32, - ('int8', 'int64'): algos.take_2d_axis1_int8_int64, - ('int8', 'float64'): algos.take_2d_axis1_int8_float64, - ('int16', 'int16'): algos.take_2d_axis1_int16_int16, - ('int16', 'int32'): algos.take_2d_axis1_int16_int32, - ('int16', 'int64'): algos.take_2d_axis1_int16_int64, - ('int16', 'float64'): algos.take_2d_axis1_int16_float64, - ('int32', 'int32'): algos.take_2d_axis1_int32_int32, - ('int32', 'int64'): algos.take_2d_axis1_int32_int64, - ('int32', 'float64'): algos.take_2d_axis1_int32_float64, - ('int64', 'int64'): algos.take_2d_axis1_int64_int64, - ('int64', 'float64'): algos.take_2d_axis1_int64_float64, - ('float32', 'float32'): 
algos.take_2d_axis1_float32_float32, - ('float32', 'float64'): algos.take_2d_axis1_float32_float64, - ('float64', 'float64'): algos.take_2d_axis1_float64_float64, - ('object', 'object'): algos.take_2d_axis1_object_object, - ('bool', 'bool'): _view_wrapper(algos.take_2d_axis1_bool_bool, np.uint8, - np.uint8), - ('bool', 'object'): _view_wrapper(algos.take_2d_axis1_bool_object, - np.uint8, None), - ('datetime64[ns]', 'datetime64[ns]'): - _view_wrapper(algos.take_2d_axis1_int64_int64, np.int64, np.int64, - fill_wrap=np.int64) + ("int8", "int8"): algos.take_2d_axis1_int8_int8, + ("int8", "int32"): algos.take_2d_axis1_int8_int32, + ("int8", "int64"): algos.take_2d_axis1_int8_int64, + ("int8", "float64"): algos.take_2d_axis1_int8_float64, + ("int16", "int16"): algos.take_2d_axis1_int16_int16, + ("int16", "int32"): algos.take_2d_axis1_int16_int32, + ("int16", "int64"): algos.take_2d_axis1_int16_int64, + ("int16", "float64"): algos.take_2d_axis1_int16_float64, + ("int32", "int32"): algos.take_2d_axis1_int32_int32, + ("int32", "int64"): algos.take_2d_axis1_int32_int64, + ("int32", "float64"): algos.take_2d_axis1_int32_float64, + ("int64", "int64"): algos.take_2d_axis1_int64_int64, + ("int64", "float64"): algos.take_2d_axis1_int64_float64, + ("float32", "float32"): algos.take_2d_axis1_float32_float32, + ("float32", "float64"): algos.take_2d_axis1_float32_float64, + ("float64", "float64"): algos.take_2d_axis1_float64_float64, + ("object", "object"): algos.take_2d_axis1_object_object, + ("bool", "bool"): _view_wrapper(algos.take_2d_axis1_bool_bool, np.uint8, np.uint8), + ("bool", "object"): _view_wrapper(algos.take_2d_axis1_bool_object, np.uint8, None), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + algos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64 + ), } _take_2d_multi_dict = { - ('int8', 'int8'): algos.take_2d_multi_int8_int8, - ('int8', 'int32'): algos.take_2d_multi_int8_int32, - ('int8', 'int64'): algos.take_2d_multi_int8_int64, - ('int8', 'float64'): algos.take_2d_multi_int8_float64, - ('int16', 'int16'): algos.take_2d_multi_int16_int16, - ('int16', 'int32'): algos.take_2d_multi_int16_int32, - ('int16', 'int64'): algos.take_2d_multi_int16_int64, - ('int16', 'float64'): algos.take_2d_multi_int16_float64, - ('int32', 'int32'): algos.take_2d_multi_int32_int32, - ('int32', 'int64'): algos.take_2d_multi_int32_int64, - ('int32', 'float64'): algos.take_2d_multi_int32_float64, - ('int64', 'int64'): algos.take_2d_multi_int64_int64, - ('int64', 'float64'): algos.take_2d_multi_int64_float64, - ('float32', 'float32'): algos.take_2d_multi_float32_float32, - ('float32', 'float64'): algos.take_2d_multi_float32_float64, - ('float64', 'float64'): algos.take_2d_multi_float64_float64, - ('object', 'object'): algos.take_2d_multi_object_object, - ('bool', 'bool'): _view_wrapper(algos.take_2d_multi_bool_bool, np.uint8, - np.uint8), - ('bool', 'object'): _view_wrapper(algos.take_2d_multi_bool_object, - np.uint8, None), - ('datetime64[ns]', 'datetime64[ns]'): - _view_wrapper(algos.take_2d_multi_int64_int64, np.int64, np.int64, - fill_wrap=np.int64) + ("int8", "int8"): algos.take_2d_multi_int8_int8, + ("int8", "int32"): algos.take_2d_multi_int8_int32, + ("int8", "int64"): algos.take_2d_multi_int8_int64, + ("int8", "float64"): algos.take_2d_multi_int8_float64, + ("int16", "int16"): algos.take_2d_multi_int16_int16, + ("int16", "int32"): algos.take_2d_multi_int16_int32, + ("int16", "int64"): algos.take_2d_multi_int16_int64, + ("int16", "float64"): algos.take_2d_multi_int16_float64, + 
("int32", "int32"): algos.take_2d_multi_int32_int32, + ("int32", "int64"): algos.take_2d_multi_int32_int64, + ("int32", "float64"): algos.take_2d_multi_int32_float64, + ("int64", "int64"): algos.take_2d_multi_int64_int64, + ("int64", "float64"): algos.take_2d_multi_int64_float64, + ("float32", "float32"): algos.take_2d_multi_float32_float32, + ("float32", "float64"): algos.take_2d_multi_float32_float64, + ("float64", "float64"): algos.take_2d_multi_float64_float64, + ("object", "object"): algos.take_2d_multi_object_object, + ("bool", "bool"): _view_wrapper(algos.take_2d_multi_bool_bool, np.uint8, np.uint8), + ("bool", "object"): _view_wrapper(algos.take_2d_multi_bool_object, np.uint8, None), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + algos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64 + ), } @@ -1468,8 +1529,9 @@ def _get_take_nd_function(ndim, arr_dtype, out_dtype, axis=0, mask_info=None): def func(arr, indexer, out, fill_value=np.nan): indexer = ensure_int64(indexer) - _take_nd_object(arr, indexer, out, axis=axis, fill_value=fill_value, - mask_info=mask_info) + _take_nd_object( + arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info + ) return func @@ -1560,16 +1622,18 @@ def take(arr, indices, axis=0, allow_fill=False, fill_value=None): if allow_fill: # Pandas style, -1 means NA validate_indices(indices, len(arr)) - result = take_1d(arr, indices, axis=axis, allow_fill=True, - fill_value=fill_value) + result = take_1d( + arr, indices, axis=axis, allow_fill=True, fill_value=fill_value + ) else: # NumPy style result = arr.take(indices, axis=axis) return result -def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, - allow_fill=True): +def take_nd( + arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, allow_fill=True +): """ Specialized Cython take which sets NaN values in one pass @@ -1644,7 +1708,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, mask_info = mask, needs_masking if needs_masking: if out is not None and out.dtype != dtype: - raise TypeError('Incompatible type for fill_value') + raise TypeError("Incompatible type for fill_value") else: # if not, then depromote, set fill_value to dummy # (it won't be used but we don't want the cython code @@ -1673,12 +1737,13 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, # for dataframes initialized directly from 2-d ndarrays # (s.t. 
df.values is c-contiguous and df._data.blocks[0] is its # f-contiguous transpose) - out = np.empty(out_shape, dtype=dtype, order='F') + out = np.empty(out_shape, dtype=dtype, order="F") else: out = np.empty(out_shape, dtype=dtype) - func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis, - mask_info=mask_info) + func = _get_take_nd_function( + arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info + ) func(arr, indexer, out, fill_value) if flip_order: @@ -1689,8 +1754,9 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, take_1d = take_nd -def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None, - allow_fill=True): +def take_2d_multi( + arr, indexer, out=None, fill_value=np.nan, mask_info=None, allow_fill=True +): """ Specialized Cython take which sets NaN values in one pass """ @@ -1729,7 +1795,7 @@ def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None, mask_info = (row_mask, col_mask), (row_needs, col_needs) if row_needs or col_needs: if out is not None and out.dtype != dtype: - raise TypeError('Incompatible type for fill_value') + raise TypeError("Incompatible type for fill_value") else: # if not, then depromote, set fill_value to dummy # (it won't be used but we don't want the cython code @@ -1750,8 +1816,9 @@ def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None, if func is None: def func(arr, indexer, out, fill_value=np.nan): - _take_2d_multi_object(arr, indexer, out, fill_value=fill_value, - mask_info=mask_info) + _take_2d_multi_object( + arr, indexer, out, fill_value=fill_value, mask_info=mask_info + ) func(arr, indexer, out=out, fill_value=fill_value) return out @@ -1762,12 +1829,12 @@ def func(arr, indexer, out, fill_value=np.nan): # ---- # _diff_special = { - 'float64': algos.diff_2d_float64, - 'float32': algos.diff_2d_float32, - 'int64': algos.diff_2d_int64, - 'int32': algos.diff_2d_int32, - 'int16': algos.diff_2d_int16, - 'int8': algos.diff_2d_int8, + "float64": algos.diff_2d_float64, + "float32": algos.diff_2d_float32, + "int64": algos.diff_2d_int64, + "int32": algos.diff_2d_int32, + "int16": algos.diff_2d_int16, + "int8": algos.diff_2d_int8, } @@ -1797,7 +1864,7 @@ def diff(arr, n, axis=0): is_timedelta = False if needs_i8_conversion(arr): dtype = np.float64 - arr = arr.view('i8') + arr = arr.view("i8") na = iNaT is_timedelta = True @@ -1847,7 +1914,11 @@ def diff(arr, n, axis=0): if is_timedelta: from pandas import TimedeltaIndex - out_arr = TimedeltaIndex(out_arr.ravel().astype('int64')).asi8.reshape( - out_arr.shape).astype('timedelta64[ns]') + + out_arr = ( + TimedeltaIndex(out_arr.ravel().astype("int64")) + .asi8.reshape(out_arr.shape) + .astype("timedelta64[ns]") + ) return out_arr diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 971ea26cbe03d..6fe35cb87b020 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -9,8 +9,7 @@ from numpy.random import RandomState import pytest -from pandas._libs import ( - algos as libalgos, groupby as libgroupby, hashtable as ht) +from pandas._libs import algos as libalgos, groupby as libgroupby, hashtable as ht from pandas.compat import lrange, range from pandas.compat.numpy import np_array_datetime64_compat import pandas.util._test_decorators as td @@ -19,8 +18,15 @@ import pandas as pd from pandas import ( - Categorical, CategoricalIndex, DatetimeIndex, Index, IntervalIndex, Series, - Timestamp, compat) + Categorical, + CategoricalIndex, + DatetimeIndex, + Index, + 
IntervalIndex, + Series, + Timestamp, + compat, +) import pandas.core.algorithms as algos from pandas.core.arrays import DatetimeArray import pandas.core.common as com @@ -41,12 +47,10 @@ def assert_series_or_index_or_array_or_categorical_equal(left, right): tm.assert_extension_array_equal(left, right) else: # will fail - assert isinstance(left, (Series, Index, np.ndarray, - Categorical, DatetimeArray)) + assert isinstance(left, (Series, Index, np.ndarray, Categorical, DatetimeArray)) class TestMatch(object): - def test_ints(self): values = np.array([0, 2, 1]) to_match = np.array([0, 1, 2, 2, 0, 1, 3, 0]) @@ -69,8 +73,8 @@ def test_ints(self): tm.assert_series_equal(result, expected) def test_strings(self): - values = ['foo', 'bar', 'baz'] - to_match = ['bar', 'foo', 'qux', 'foo', 'bar', 'baz', 'qux'] + values = ["foo", "bar", "baz"] + to_match = ["bar", "foo", "qux", "foo", "bar", "baz", "qux"] result = algos.match(to_match, values) expected = np.array([1, 0, -1, 0, 1, 2, -1], dtype=np.int64) @@ -82,19 +86,17 @@ def test_strings(self): class TestFactorize(object): - def test_basic(self): - labels, uniques = algos.factorize(['a', 'b', 'b', 'a', 'a', 'c', 'c', - 'c']) - tm.assert_numpy_array_equal( - uniques, np.array(['a', 'b', 'c'], dtype=object)) + labels, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"]) + tm.assert_numpy_array_equal(uniques, np.array(["a", "b", "c"], dtype=object)) - labels, uniques = algos.factorize(['a', 'b', 'b', 'a', - 'a', 'c', 'c', 'c'], sort=True) + labels, uniques = algos.factorize( + ["a", "b", "b", "a", "a", "c", "c", "c"], sort=True + ) exp = np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.intp) tm.assert_numpy_array_equal(labels, exp) - exp = np.array(['a', 'b', 'c'], dtype=object) + exp = np.array(["a", "b", "c"], dtype=object) tm.assert_numpy_array_equal(uniques, exp) labels, uniques = algos.factorize(list(reversed(range(5)))) @@ -110,41 +112,40 @@ def test_basic(self): exp = np.array([0, 1, 2, 3, 4], dtype=np.int64) tm.assert_numpy_array_equal(uniques, exp) - labels, uniques = algos.factorize(list(reversed(np.arange(5.)))) + labels, uniques = algos.factorize(list(reversed(np.arange(5.0)))) exp = np.array([0, 1, 2, 3, 4], dtype=np.intp) tm.assert_numpy_array_equal(labels, exp) - exp = np.array([4., 3., 2., 1., 0.], dtype=np.float64) + exp = np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=np.float64) tm.assert_numpy_array_equal(uniques, exp) - labels, uniques = algos.factorize(list(reversed(np.arange(5.))), - sort=True) + labels, uniques = algos.factorize(list(reversed(np.arange(5.0))), sort=True) exp = np.array([4, 3, 2, 1, 0], dtype=np.intp) tm.assert_numpy_array_equal(labels, exp) - exp = np.array([0., 1., 2., 3., 4.], dtype=np.float64) + exp = np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=np.float64) tm.assert_numpy_array_equal(uniques, exp) def test_mixed(self): # doc example reshaping.rst - x = Series(['A', 'A', np.nan, 'B', 3.14, np.inf]) + x = Series(["A", "A", np.nan, "B", 3.14, np.inf]) labels, uniques = algos.factorize(x) exp = np.array([0, 0, -1, 1, 2, 3], dtype=np.intp) tm.assert_numpy_array_equal(labels, exp) - exp = Index(['A', 'B', 3.14, np.inf]) + exp = Index(["A", "B", 3.14, np.inf]) tm.assert_index_equal(uniques, exp) labels, uniques = algos.factorize(x, sort=True) exp = np.array([2, 2, -1, 3, 0, 1], dtype=np.intp) tm.assert_numpy_array_equal(labels, exp) - exp = Index([3.14, np.inf, 'A', 'B']) + exp = Index([3.14, np.inf, "A", "B"]) tm.assert_index_equal(uniques, exp) def test_datelike(self): # M8 - v1 = Timestamp('20130101 
09:00:00.00004') - v2 = Timestamp('20130101') + v1 = Timestamp("20130101 09:00:00.00004") + v2 = Timestamp("20130101") x = Series([v1, v1, v1, v2, v2, v1]) labels, uniques = algos.factorize(x) @@ -160,8 +161,8 @@ def test_datelike(self): tm.assert_index_equal(uniques, exp) # period - v1 = pd.Period('201302', freq='M') - v2 = pd.Period('201303', freq='M') + v1 = pd.Period("201302", freq="M") + v2 = pd.Period("201303", freq="M") x = Series([v1, v1, v1, v2, v2, v1]) # periods are not 'sorted' as they are converted back into an index @@ -176,8 +177,8 @@ def test_datelike(self): tm.assert_index_equal(uniques, pd.PeriodIndex([v1, v2])) # GH 5986 - v1 = pd.to_timedelta('1 day 1 min') - v2 = pd.to_timedelta('1 day') + v1 = pd.to_timedelta("1 day 1 min") + v2 = pd.to_timedelta("1 day") x = Series([v1, v2, v1, v1, v2, v2, v1]) labels, uniques = algos.factorize(x) exp = np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.intp) @@ -193,52 +194,48 @@ def test_factorize_nan(self): # nan should map to na_sentinel, not reverse_indexer[na_sentinel] # rizer.factorize should not raise an exception if na_sentinel indexes # outside of reverse_indexer - key = np.array([1, 2, 1, np.nan], dtype='O') + key = np.array([1, 2, 1, np.nan], dtype="O") rizer = ht.Factorizer(len(key)) for na_sentinel in (-1, 20): ids = rizer.factorize(key, sort=True, na_sentinel=na_sentinel) - expected = np.array([0, 1, 0, na_sentinel], dtype='int32') + expected = np.array([0, 1, 0, na_sentinel], dtype="int32") assert len(set(key)) == len(set(expected)) - tm.assert_numpy_array_equal(pd.isna(key), - expected == na_sentinel) + tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel) # nan still maps to na_sentinel when sort=False - key = np.array([0, np.nan, 1], dtype='O') + key = np.array([0, np.nan, 1], dtype="O") na_sentinel = -1 # TODO(wesm): unused? 
ids = rizer.factorize(key, sort=False, na_sentinel=na_sentinel) # noqa - expected = np.array([2, -1, 0], dtype='int32') + expected = np.array([2, -1, 0], dtype="int32") assert len(set(key)) == len(set(expected)) tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel) - @pytest.mark.parametrize("data,expected_label,expected_level", [ - ( - [(1, 1), (1, 2), (0, 0), (1, 2), 'nonsense'], - [0, 1, 2, 1, 3], - [(1, 1), (1, 2), (0, 0), 'nonsense'] - ), - ( - [(1, 1), (1, 2), (0, 0), (1, 2), (1, 2, 3)], - [0, 1, 2, 1, 3], - [(1, 1), (1, 2), (0, 0), (1, 2, 3)] - ), - ( - [(1, 1), (1, 2), (0, 0), (1, 2)], - [0, 1, 2, 1], - [(1, 1), (1, 2), (0, 0)] - ) - ]) + @pytest.mark.parametrize( + "data,expected_label,expected_level", + [ + ( + [(1, 1), (1, 2), (0, 0), (1, 2), "nonsense"], + [0, 1, 2, 1, 3], + [(1, 1), (1, 2), (0, 0), "nonsense"], + ), + ( + [(1, 1), (1, 2), (0, 0), (1, 2), (1, 2, 3)], + [0, 1, 2, 1, 3], + [(1, 1), (1, 2), (0, 0), (1, 2, 3)], + ), + ([(1, 1), (1, 2), (0, 0), (1, 2)], [0, 1, 2, 1], [(1, 1), (1, 2), (0, 0)]), + ], + ) def test_factorize_tuple_list(self, data, expected_label, expected_level): # GH9454 result = pd.factorize(data) - tm.assert_numpy_array_equal(result[0], - np.array(expected_label, dtype=np.intp)) + tm.assert_numpy_array_equal(result[0], np.array(expected_label, dtype=np.intp)) - expected_level_array = com.asarray_tuplesafe(expected_level, - dtype=object) + expected_level_array = com.asarray_tuplesafe(expected_level, dtype=object) tm.assert_numpy_array_equal(result[1], expected_level_array) def test_complex_sorting(self): @@ -258,42 +255,40 @@ def test_float64_factorize(self, writable): tm.assert_numpy_array_equal(uniques, exp_uniques) def test_uint64_factorize(self, writable): - data = np.array([2**64 - 1, 1, 2**64 - 1], dtype=np.uint64) + data = np.array([2 ** 64 - 1, 1, 2 ** 64 - 1], dtype=np.uint64) data.setflags(write=writable) exp_labels = np.array([0, 1, 0], dtype=np.intp) - exp_uniques = np.array([2**64 - 1, 1], dtype=np.uint64) + exp_uniques = np.array([2 ** 64 - 1, 1], dtype=np.uint64) labels, uniques = algos.factorize(data) tm.assert_numpy_array_equal(labels, exp_labels) tm.assert_numpy_array_equal(uniques, exp_uniques) def test_int64_factorize(self, writable): - data = np.array([2**63 - 1, -2**63, 2**63 - 1], dtype=np.int64) + data = np.array([2 ** 63 - 1, -2 ** 63, 2 ** 63 - 1], dtype=np.int64) data.setflags(write=writable) exp_labels = np.array([0, 1, 0], dtype=np.intp) - exp_uniques = np.array([2**63 - 1, -2**63], dtype=np.int64) + exp_uniques = np.array([2 ** 63 - 1, -2 ** 63], dtype=np.int64) labels, uniques = algos.factorize(data) tm.assert_numpy_array_equal(labels, exp_labels) tm.assert_numpy_array_equal(uniques, exp_uniques) def test_string_factorize(self, writable): - data = np.array(['a', 'c', 'a', 'b', 'c'], - dtype=object) + data = np.array(["a", "c", "a", "b", "c"], dtype=object) data.setflags(write=writable) exp_labels = np.array([0, 1, 0, 2, 1], dtype=np.intp) - exp_uniques = np.array(['a', 'c', 'b'], dtype=object) + exp_uniques = np.array(["a", "c", "b"], dtype=object) labels, uniques = algos.factorize(data) tm.assert_numpy_array_equal(labels, exp_labels) tm.assert_numpy_array_equal(uniques, exp_uniques) def test_object_factorize(self, writable): - data = np.array(['a', 'c', None, np.nan, 'a', 'b', pd.NaT, 'c'], - dtype=object) + data = np.array(["a", "c", None, np.nan, "a", "b", pd.NaT, "c"], dtype=object) data.setflags(write=writable) exp_labels = np.array([0, 1, -1, -1, 0, 2, -1, 1], dtype=np.intp) - exp_uniques = 
np.array(['a', 'c', 'b'], dtype=object) + exp_uniques = np.array(["a", "c", "b"], dtype=object) labels, uniques = algos.factorize(data) tm.assert_numpy_array_equal(labels, exp_labels) @@ -302,17 +297,20 @@ def test_object_factorize(self, writable): def test_deprecate_order(self): # gh 19727 - check warning is raised for deprecated keyword, order. # Test not valid once order keyword is removed. - data = np.array([2**63, 1, 2**63], dtype=np.uint64) + data = np.array([2 ** 63, 1, 2 ** 63], dtype=np.uint64) with tm.assert_produces_warning(expected_warning=FutureWarning): algos.factorize(data, order=True) with tm.assert_produces_warning(False): algos.factorize(data) - @pytest.mark.parametrize('data', [ - np.array([0, 1, 0], dtype='u8'), - np.array([-2**63, 1, -2**63], dtype='i8'), - np.array(['__nan__', 'foo', '__nan__'], dtype='object'), - ]) + @pytest.mark.parametrize( + "data", + [ + np.array([0, 1, 0], dtype="u8"), + np.array([-2 ** 63, 1, -2 ** 63], dtype="i8"), + np.array(["__nan__", "foo", "__nan__"], dtype="object"), + ], + ) def test_parametrized_factorize_na_value_default(self, data): # arrays that include the NA default for that type, but isn't used. l, u = algos.factorize(data) @@ -321,16 +319,18 @@ def test_parametrized_factorize_na_value_default(self, data): tm.assert_numpy_array_equal(l, expected_labels) tm.assert_numpy_array_equal(u, expected_uniques) - @pytest.mark.parametrize('data, na_value', [ - (np.array([0, 1, 0, 2], dtype='u8'), 0), - (np.array([1, 0, 1, 2], dtype='u8'), 1), - (np.array([-2**63, 1, -2**63, 0], dtype='i8'), -2**63), - (np.array([1, -2**63, 1, 0], dtype='i8'), 1), - (np.array(['a', '', 'a', 'b'], dtype=object), 'a'), - (np.array([(), ('a', 1), (), ('a', 2)], dtype=object), ()), - (np.array([('a', 1), (), ('a', 1), ('a', 2)], dtype=object), - ('a', 1)), - ]) + @pytest.mark.parametrize( + "data, na_value", + [ + (np.array([0, 1, 0, 2], dtype="u8"), 0), + (np.array([1, 0, 1, 2], dtype="u8"), 1), + (np.array([-2 ** 63, 1, -2 ** 63, 0], dtype="i8"), -2 ** 63), + (np.array([1, -2 ** 63, 1, 0], dtype="i8"), 1), + (np.array(["a", "", "a", "b"], dtype=object), "a"), + (np.array([(), ("a", 1), (), ("a", 2)], dtype=object), ()), + (np.array([("a", 1), (), ("a", 1), ("a", 2)], dtype=object), ("a", 1)), + ], + ) def test_parametrized_factorize_na_value(self, data, na_value): l, u = algos._factorize_array(data, na_value=na_value) expected_uniques = data[[1, 3]] @@ -340,7 +340,6 @@ def test_parametrized_factorize_na_value(self, data, na_value): class TestUnique(object): - def test_unique_all_dtypes(self, any_numpy_dtype): dtype = any_numpy_dtype arr = np.random.randint(0, 100, size=50).astype(dtype) @@ -359,14 +358,15 @@ def test_unique_all_dtypes(self, any_numpy_dtype): tm.assert_numpy_array_equal(reconstr, arr, check_dtype=False) def test_object_refcount_bug(self): - lst = ['A', 'B', 'C', 'D', 'E'] + lst = ["A", "B", "C", "D", "E"] for i in range(1000): len(algos.unique(lst)) def test_on_index_object(self): - mindex = pd.MultiIndex.from_arrays([np.arange(5).repeat(5), np.tile( - np.arange(5), 5)]) + mindex = pd.MultiIndex.from_arrays( + [np.arange(5).repeat(5), np.tile(np.arange(5), 5)] + ) expected = mindex.values expected.sort() @@ -380,13 +380,20 @@ def test_on_index_object(self): def test_datetime64_dtype_array_returned(self): # GH 9431 expected = np_array_datetime64_compat( - ['2015-01-03T00:00:00.000000000+0000', - '2015-01-01T00:00:00.000000000+0000'], - dtype='M8[ns]') + [ + "2015-01-03T00:00:00.000000000+0000", + "2015-01-01T00:00:00.000000000+0000", + ], + 
dtype="M8[ns]", + ) - dt_index = pd.to_datetime(['2015-01-03T00:00:00.000000000', - '2015-01-01T00:00:00.000000000', - '2015-01-01T00:00:00.000000000']) + dt_index = pd.to_datetime( + [ + "2015-01-03T00:00:00.000000000", + "2015-01-01T00:00:00.000000000", + "2015-01-01T00:00:00.000000000", + ] + ) result = algos.unique(dt_index) tm.assert_numpy_array_equal(result, expected) assert result.dtype == expected.dtype @@ -401,10 +408,10 @@ def test_datetime64_dtype_array_returned(self): tm.assert_numpy_array_equal(result, expected) assert result.dtype == expected.dtype - @pytest.mark.parametrize('box', [Index, Series, np.array]) + @pytest.mark.parametrize("box", [Index, Series, np.array]) def test_timedelta64_dtype_array_returned(self, box): # GH 9431 - expected = np.array([31200, 45678, 10000], dtype='m8[ns]') + expected = np.array([31200, 45678, 10000], dtype="m8[ns]") td_index = pd.to_timedelta([31200, 45678, 31200, 10000, 45678]) obj = box(td_index) @@ -423,40 +430,42 @@ def test_timedelta64_dtype_array_returned(self, box): assert_series_or_index_or_array_or_categorical_equal(reconstr, obj) def test_uint64_overflow(self): - s = Series([1, 2, 2**63, 2**63], dtype=np.uint64) - exp = np.array([1, 2, 2**63], dtype=np.uint64) + s = Series([1, 2, 2 ** 63, 2 ** 63], dtype=np.uint64) + exp = np.array([1, 2, 2 ** 63], dtype=np.uint64) tm.assert_numpy_array_equal(algos.unique(s), exp) def test_nan_in_object_array(self): - duplicated_items = ['a', np.nan, 'c', 'c'] + duplicated_items = ["a", np.nan, "c", "c"] result = pd.unique(duplicated_items) - expected = np.array(['a', np.nan, 'c'], dtype=object) + expected = np.array(["a", np.nan, "c"], dtype=object) tm.assert_numpy_array_equal(result, expected) - result_uniques, result_inverse = pd.unique(duplicated_items, - return_inverse=True) - expected_inverse = np.array([0, 1, 2, 2], dtype='int64') + result_uniques, result_inverse = pd.unique( + duplicated_items, return_inverse=True + ) + expected_inverse = np.array([0, 1, 2, 2], dtype="int64") tm.assert_numpy_array_equal(result_inverse, expected_inverse) - @pytest.mark.parametrize('ordered', [True, False]) - @pytest.mark.parametrize('box', [lambda x: x, Series, Index], - ids=['Categorical', 'Series', 'Index']) - @pytest.mark.parametrize('method', [lambda x, **kwargs: x.unique(**kwargs), - pd.unique], - ids=['classmethod', 'toplevel']) + @pytest.mark.parametrize("ordered", [True, False]) + @pytest.mark.parametrize( + "box", [lambda x: x, Series, Index], ids=["Categorical", "Series", "Index"] + ) + @pytest.mark.parametrize( + "method", + [lambda x, **kwargs: x.unique(**kwargs), pd.unique], + ids=["classmethod", "toplevel"], + ) def test_categorical(self, method, box, ordered): - categories = list('abc') if ordered else list('bac') - expected = Categorical(list('bac'), categories=categories, - ordered=ordered) + categories = list("abc") if ordered else list("bac") + expected = Categorical(list("bac"), categories=categories, ordered=ordered) # Index.unique always returns Index # pd.unique(Index) stays Index (only) for Categorical expected = box(expected) if box == Index else expected # GH 15939 - c = box(Categorical(list('baabc'), categories=categories, - ordered=ordered)) + c = box(Categorical(list("baabc"), categories=categories, ordered=ordered)) result = method(c) assert_series_or_index_or_array_or_categorical_equal(result, expected) @@ -469,30 +478,33 @@ def test_categorical(self, method, box, ordered): result_uniques, result_inverse = method(c, return_inverse=True) 
assert_series_or_index_or_array_or_categorical_equal( - result_uniques, expected_uniques) + result_uniques, expected_uniques + ) # reconstruction can only work if inverse is correct reconstr = box(result_uniques[result_inverse]) assert_series_or_index_or_array_or_categorical_equal(reconstr, c) - @pytest.mark.parametrize('box', [Series, Index]) - @pytest.mark.parametrize('method', [lambda x, **kwargs: x.unique(**kwargs), - pd.unique], - ids=['classmethod', 'toplevel']) + @pytest.mark.parametrize("box", [Series, Index]) + @pytest.mark.parametrize( + "method", + [lambda x, **kwargs: x.unique(**kwargs), pd.unique], + ids=["classmethod", "toplevel"], + ) def test_datetime64tz_aware(self, method, box): # GH 15939 - ts = Timestamp('20160101', tz='US/Eastern') + ts = Timestamp("20160101", tz="US/Eastern") obj = box([ts, ts]) if box == Series: - expected = DatetimeArray._from_sequence(np.array([ - Timestamp('2016-01-01 00:00:00-0500', tz="US/Eastern") - ])) + expected = DatetimeArray._from_sequence( + np.array([Timestamp("2016-01-01 00:00:00-0500", tz="US/Eastern")]) + ) else: # Index - expected = DatetimeIndex(['2016-01-01 00:00:00'], - dtype='datetime64[ns, US/Eastern]', - freq=None) + expected = DatetimeIndex( + ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None + ) result = method(obj) assert_series_or_index_or_array_or_categorical_equal(result, expected) @@ -515,21 +527,22 @@ def test_order_of_appearance(self): # light testing of guarantee of order of appearance # these also are the doc-examples result = pd.unique(Series([2, 1, 3, 3])) - tm.assert_numpy_array_equal(result, - np.array([2, 1, 3], dtype='int64')) + tm.assert_numpy_array_equal(result, np.array([2, 1, 3], dtype="int64")) result = pd.unique(Series([2] + [1] * 5)) - tm.assert_numpy_array_equal(result, - np.array([2, 1], dtype='int64')) + tm.assert_numpy_array_equal(result, np.array([2, 1], dtype="int64")) - result = pd.unique(list('aabc')) - expected = np.array(['a', 'b', 'c'], dtype=object) + result = pd.unique(list("aabc")) + expected = np.array(["a", "b", "c"], dtype=object) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize("arg ,expected", [ - (('1', '1', '2'), np.array(['1', '2'], dtype=object)), - (('foo',), np.array(['foo'], dtype=object)) - ]) + @pytest.mark.parametrize( + "arg ,expected", + [ + (("1", "1", "2"), np.array(["1", "2"], dtype=object)), + (("foo",), np.array(["foo"], dtype=object)), + ], + ) def test_tuple_with_strings(self, arg, expected): # see GH 17108 result = pd.unique(arg) @@ -537,9 +550,9 @@ def test_tuple_with_strings(self, arg, expected): def test_obj_none_preservation(self): # GH 20866 - arr = np.array(['foo', None], dtype=object) + arr = np.array(["foo", None], dtype=object) result = pd.unique(arr) - expected = np.array(['foo', None], dtype=object) + expected = np.array(["foo", None], dtype=object) tm.assert_numpy_array_equal(result, expected, strict_nan=True) @@ -553,8 +566,8 @@ def test_signed_zero(self): def test_different_nans(self): # GH 21866 # create different nans from bit-patterns: - NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0] - NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0] + NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0] + NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0] assert NAN1 != NAN1 assert NAN2 != NAN2 a = np.array([NAN1, NAN2]) # NAN1 and NAN2 are equivalent @@ -565,8 +578,8 @@ def test_different_nans(self): def test_first_nan_kept(self): # GH 22295 # 
create different nans from bit-patterns: - bits_for_nan1 = 0xfff8000000000001 - bits_for_nan2 = 0x7ff8000000000001 + bits_for_nan1 = 0xFFF8000000000001 + bits_for_nan2 = 0x7FF8000000000001 NAN1 = struct.unpack("d", struct.pack("=Q", bits_for_nan1))[0] NAN2 = struct.unpack("d", struct.pack("=Q", bits_for_nan2))[0] assert NAN1 != NAN1 @@ -576,17 +589,14 @@ def test_first_nan_kept(self): result = pd.unique(a) assert result.size == 1 # use bit patterns to identify which nan was kept: - result_nan_bits = struct.unpack("=Q", - struct.pack("d", result[0]))[0] + result_nan_bits = struct.unpack("=Q", struct.pack("d", result[0]))[0] assert result_nan_bits == bits_for_nan1 - def test_do_not_mangle_na_values(self, unique_nulls_fixture, - unique_nulls_fixture2): + def test_do_not_mangle_na_values(self, unique_nulls_fixture, unique_nulls_fixture2): # GH 22295 if unique_nulls_fixture is unique_nulls_fixture2: return # skip it, values not unique - a = np.array([unique_nulls_fixture, - unique_nulls_fixture2], dtype=np.object) + a = np.array([unique_nulls_fixture, unique_nulls_fixture2], dtype=np.object) result = pd.unique(a) assert result.size == 2 assert a[0] is unique_nulls_fixture @@ -594,7 +604,6 @@ def test_do_not_mangle_na_values(self, unique_nulls_fixture, class TestIsin(object): - def test_invalid(self): pytest.raises(TypeError, lambda: algos.isin(1, 1)) @@ -623,25 +632,25 @@ def test_basic(self): expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) - result = algos.isin(['a', 'b'], ['a']) + result = algos.isin(["a", "b"], ["a"]) expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) - result = algos.isin(Series(['a', 'b']), Series(['a'])) + result = algos.isin(Series(["a", "b"]), Series(["a"])) expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) - result = algos.isin(Series(['a', 'b']), {'a'}) + result = algos.isin(Series(["a", "b"]), {"a"}) expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) - result = algos.isin(['a', 'b'], [1]) + result = algos.isin(["a", "b"], [1]) expected = np.array([False, False]) tm.assert_numpy_array_equal(result, expected) def test_i8(self): - arr = pd.date_range('20130101', periods=3).values + arr = pd.date_range("20130101", periods=3).values result = algos.isin(arr, [arr[0]]) expected = np.array([True, False, False]) tm.assert_numpy_array_equal(result, expected) @@ -654,7 +663,7 @@ def test_i8(self): expected = np.array([True, True, False]) tm.assert_numpy_array_equal(result, expected) - arr = pd.timedelta_range('1 day', periods=3).values + arr = pd.timedelta_range("1 day", periods=3).values result = algos.isin(arr, [arr[0]]) expected = np.array([True, False, False]) tm.assert_numpy_array_equal(result, expected) @@ -669,7 +678,7 @@ def test_i8(self): def test_large(self): - s = pd.date_range('20000101', periods=2000000, freq='s').values + s = pd.date_range("20000101", periods=2000000, freq="s").values result = algos.isin(s, s[0:2]) expected = np.zeros(len(s), dtype=bool) expected[0] = True @@ -679,7 +688,7 @@ def test_large(self): def test_categorical_from_codes(self): # GH 16639 vals = np.array([0, 1, 2, 0]) - cats = ['a', 'b', 'c'] + cats = ["a", "b", "c"] Sd = Series(Categorical(1).from_codes(vals, cats)) St = Series(Categorical(1).from_codes(np.array([0, 1]), cats)) expected = np.array([True, True, False, True]) @@ -721,8 +730,8 @@ def test_different_nans(self): # GH 22160 # all nans are handled as equivalent - comps = [float('nan')] - values = 
[float('nan')] + comps = [float("nan")] + values = [float("nan")] assert comps[0] is not values[0] # different nan-objects # as list of python-objects: @@ -730,20 +739,22 @@ def test_different_nans(self): tm.assert_numpy_array_equal(np.array([True]), result) # as object-array: - result = algos.isin(np.asarray(comps, dtype=np.object), - np.asarray(values, dtype=np.object)) + result = algos.isin( + np.asarray(comps, dtype=np.object), np.asarray(values, dtype=np.object) + ) tm.assert_numpy_array_equal(np.array([True]), result) # as float64-array: - result = algos.isin(np.asarray(comps, dtype=np.float64), - np.asarray(values, dtype=np.float64)) + result = algos.isin( + np.asarray(comps, dtype=np.float64), np.asarray(values, dtype=np.float64) + ) tm.assert_numpy_array_equal(np.array([True]), result) def test_no_cast(self): # GH 22160 # ensure 42 is not casted to a string - comps = ['ss', 42] - values = ['42'] + comps = ["ss", 42] + values = ["42"] expected = np.array([False, False]) result = algos.isin(comps, values) tm.assert_numpy_array_equal(expected, result) @@ -759,8 +770,8 @@ def test_empty(self, empty): def test_different_nan_objects(self): # GH 22119 - comps = np.array(['nan', np.nan * 1j, float('nan')], dtype=np.object) - vals = np.array([float('nan')], dtype=np.object) + comps = np.array(["nan", np.nan * 1j, float("nan")], dtype=np.object) + vals = np.array([float("nan")], dtype=np.object) expected = np.array([False, False, True]) result = algos.isin(comps, vals) tm.assert_numpy_array_equal(expected, result) @@ -770,8 +781,8 @@ def test_different_nans_as_float64(self): # create different nans from bit-patterns, # these nans will land in different buckets in the hash-table # if no special care is taken - NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0] - NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0] + NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0] + NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0] assert NAN1 != NAN1 assert NAN2 != NAN2 @@ -789,7 +800,6 @@ def test_different_nans_as_float64(self): class TestValueCounts(object): - def test_value_counts(self): np.random.seed(1234) from pandas.core.reshape.tile import cut @@ -807,32 +817,30 @@ def test_value_counts(self): def test_value_counts_bins(self): s = [1, 2, 3, 4] result = algos.value_counts(s, bins=1) - expected = Series([4], - index=IntervalIndex.from_tuples([(0.996, 4.0)])) + expected = Series([4], index=IntervalIndex.from_tuples([(0.996, 4.0)])) tm.assert_series_equal(result, expected) result = algos.value_counts(s, bins=2, sort=False) - expected = Series([2, 2], - index=IntervalIndex.from_tuples([(0.996, 2.5), - (2.5, 4.0)])) + expected = Series( + [2, 2], index=IntervalIndex.from_tuples([(0.996, 2.5), (2.5, 4.0)]) + ) tm.assert_series_equal(result, expected) def test_value_counts_dtypes(self): - result = algos.value_counts([1, 1.]) + result = algos.value_counts([1, 1.0]) assert len(result) == 1 - result = algos.value_counts([1, 1.], bins=1) + result = algos.value_counts([1, 1.0], bins=1) assert len(result) == 1 - result = algos.value_counts(Series([1, 1., '1'])) # object + result = algos.value_counts(Series([1, 1.0, "1"])) # object assert len(result) == 2 - pytest.raises(TypeError, lambda s: algos.value_counts(s, bins=1), - ['1', 1]) + pytest.raises(TypeError, lambda s: algos.value_counts(s, bins=1), ["1", 1]) def test_value_counts_nat(self): - td = Series([np.timedelta64(10000), pd.NaT], dtype='timedelta64[ns]') - dt = pd.to_datetime(['NaT', 
'2014-01-01']) + td = Series([np.timedelta64(10000), pd.NaT], dtype="timedelta64[ns]") + dt = pd.to_datetime(["NaT", "2014-01-01"]) for s in [td, dt]: vc = algos.value_counts(s) @@ -840,32 +848,40 @@ def test_value_counts_nat(self): assert len(vc) == 1 assert len(vc_with_na) == 2 - exp_dt = Series({Timestamp('2014-01-01 00:00:00'): 1}) + exp_dt = Series({Timestamp("2014-01-01 00:00:00"): 1}) tm.assert_series_equal(algos.value_counts(dt), exp_dt) # TODO same for (timedelta) def test_value_counts_datetime_outofbounds(self): # GH 13663 - s = Series([datetime(3000, 1, 1), datetime(5000, 1, 1), - datetime(5000, 1, 1), datetime(6000, 1, 1), - datetime(3000, 1, 1), datetime(3000, 1, 1)]) + s = Series( + [ + datetime(3000, 1, 1), + datetime(5000, 1, 1), + datetime(5000, 1, 1), + datetime(6000, 1, 1), + datetime(3000, 1, 1), + datetime(3000, 1, 1), + ] + ) res = s.value_counts() - exp_index = Index([datetime(3000, 1, 1), datetime(5000, 1, 1), - datetime(6000, 1, 1)], dtype=object) + exp_index = Index( + [datetime(3000, 1, 1), datetime(5000, 1, 1), datetime(6000, 1, 1)], + dtype=object, + ) exp = Series([3, 2, 1], index=exp_index) tm.assert_series_equal(res, exp) # GH 12424 - res = pd.to_datetime(Series(['2362-01-01', np.nan]), - errors='ignore') - exp = Series(['2362-01-01', np.nan], dtype=object) + res = pd.to_datetime(Series(["2362-01-01", np.nan]), errors="ignore") + exp = Series(["2362-01-01", np.nan], dtype=object) tm.assert_series_equal(res, exp) def test_categorical(self): - s = Series(Categorical(list('aaabbc'))) + s = Series(Categorical(list("aaabbc"))) result = s.value_counts() - expected = Series([3, 2, 1], index=CategoricalIndex(['a', 'b', 'c'])) + expected = Series([3, 2, 1], index=CategoricalIndex(["a", "b", "c"])) tm.assert_series_equal(result, expected, check_index_type=True) @@ -876,39 +892,51 @@ def test_categorical(self): tm.assert_series_equal(result, expected, check_index_type=True) def test_categorical_nans(self): - s = Series(Categorical(list('aaaaabbbcc'))) # 4,3,2,1 (nan) + s = Series(Categorical(list("aaaaabbbcc"))) # 4,3,2,1 (nan) s.iloc[1] = np.nan result = s.value_counts() - expected = Series([4, 3, 2], index=CategoricalIndex( - ['a', 'b', 'c'], categories=['a', 'b', 'c'])) + expected = Series( + [4, 3, 2], + index=CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c"]), + ) tm.assert_series_equal(result, expected, check_index_type=True) result = s.value_counts(dropna=False) - expected = Series([ - 4, 3, 2, 1 - ], index=CategoricalIndex(['a', 'b', 'c', np.nan])) + expected = Series([4, 3, 2, 1], index=CategoricalIndex(["a", "b", "c", np.nan])) tm.assert_series_equal(result, expected, check_index_type=True) # out of order - s = Series(Categorical( - list('aaaaabbbcc'), ordered=True, categories=['b', 'a', 'c'])) + s = Series( + Categorical(list("aaaaabbbcc"), ordered=True, categories=["b", "a", "c"]) + ) s.iloc[1] = np.nan result = s.value_counts() - expected = Series([4, 3, 2], index=CategoricalIndex( - ['a', 'b', 'c'], categories=['b', 'a', 'c'], ordered=True)) + expected = Series( + [4, 3, 2], + index=CategoricalIndex( + ["a", "b", "c"], categories=["b", "a", "c"], ordered=True + ), + ) tm.assert_series_equal(result, expected, check_index_type=True) result = s.value_counts(dropna=False) - expected = Series([4, 3, 2, 1], index=CategoricalIndex( - ['a', 'b', 'c', np.nan], categories=['b', 'a', 'c'], ordered=True)) + expected = Series( + [4, 3, 2, 1], + index=CategoricalIndex( + ["a", "b", "c", np.nan], categories=["b", "a", "c"], ordered=True + ), + ) 
tm.assert_series_equal(result, expected, check_index_type=True) def test_categorical_zeroes(self): # keep the `d` category with 0 - s = Series(Categorical( - list('bbbaac'), categories=list('abcd'), ordered=True)) + s = Series(Categorical(list("bbbaac"), categories=list("abcd"), ordered=True)) result = s.value_counts() - expected = Series([3, 2, 1, 0], index=Categorical( - ['b', 'a', 'c', 'd'], categories=list('abcd'), ordered=True)) + expected = Series( + [3, 2, 1, 0], + index=Categorical( + ["b", "a", "c", "d"], categories=list("abcd"), ordered=True + ), + ) tm.assert_series_equal(result, expected, check_index_type=True) def test_dropna(self): @@ -916,59 +944,66 @@ def test_dropna(self): tm.assert_series_equal( Series([True, True, False]).value_counts(dropna=True), - Series([2, 1], index=[True, False])) + Series([2, 1], index=[True, False]), + ) tm.assert_series_equal( Series([True, True, False]).value_counts(dropna=False), - Series([2, 1], index=[True, False])) + Series([2, 1], index=[True, False]), + ) tm.assert_series_equal( Series([True, True, False, None]).value_counts(dropna=True), - Series([2, 1], index=[True, False])) + Series([2, 1], index=[True, False]), + ) tm.assert_series_equal( Series([True, True, False, None]).value_counts(dropna=False), - Series([2, 1, 1], index=[True, False, np.nan])) + Series([2, 1, 1], index=[True, False, np.nan]), + ) tm.assert_series_equal( - Series([10.3, 5., 5.]).value_counts(dropna=True), - Series([2, 1], index=[5., 10.3])) + Series([10.3, 5.0, 5.0]).value_counts(dropna=True), + Series([2, 1], index=[5.0, 10.3]), + ) tm.assert_series_equal( - Series([10.3, 5., 5.]).value_counts(dropna=False), - Series([2, 1], index=[5., 10.3])) + Series([10.3, 5.0, 5.0]).value_counts(dropna=False), + Series([2, 1], index=[5.0, 10.3]), + ) tm.assert_series_equal( - Series([10.3, 5., 5., None]).value_counts(dropna=True), - Series([2, 1], index=[5., 10.3])) + Series([10.3, 5.0, 5.0, None]).value_counts(dropna=True), + Series([2, 1], index=[5.0, 10.3]), + ) # 32-bit linux has a different ordering if not compat.is_platform_32bit(): - result = Series([10.3, 5., 5., None]).value_counts(dropna=False) - expected = Series([2, 1, 1], index=[5., 10.3, np.nan]) + result = Series([10.3, 5.0, 5.0, None]).value_counts(dropna=False) + expected = Series([2, 1, 1], index=[5.0, 10.3, np.nan]) tm.assert_series_equal(result, expected) def test_value_counts_normalized(self): # GH12558 s = Series([1, 2, np.nan, np.nan, np.nan]) - dtypes = (np.float64, np.object, 'M8[ns]') + dtypes = (np.float64, np.object, "M8[ns]") for t in dtypes: s_typed = s.astype(t) result = s_typed.value_counts(normalize=True, dropna=False) - expected = Series([0.6, 0.2, 0.2], - index=Series([np.nan, 2.0, 1.0], dtype=t)) + expected = Series( + [0.6, 0.2, 0.2], index=Series([np.nan, 2.0, 1.0], dtype=t) + ) tm.assert_series_equal(result, expected) result = s_typed.value_counts(normalize=True, dropna=True) - expected = Series([0.5, 0.5], - index=Series([2.0, 1.0], dtype=t)) + expected = Series([0.5, 0.5], index=Series([2.0, 1.0], dtype=t)) tm.assert_series_equal(result, expected) def test_value_counts_uint64(self): - arr = np.array([2**63], dtype=np.uint64) - expected = Series([1], index=[2**63]) + arr = np.array([2 ** 63], dtype=np.uint64) + expected = Series([1], index=[2 ** 63]) result = algos.value_counts(arr) tm.assert_series_equal(result, expected) - arr = np.array([-1, 2**63], dtype=object) - expected = Series([1, 1], index=[-1, 2**63]) + arr = np.array([-1, 2 ** 63], dtype=object) + expected = Series([1, 1], 
index=[-1, 2 ** 63]) result = algos.value_counts(arr) # 32-bit linux has a different ordering @@ -977,7 +1012,6 @@ def test_value_counts_uint64(self): class TestDuplicated(object): - def test_duplicated_with_nas(self): keys = np.array([0, 1, np.nan, 0, 2, np.nan], dtype=object) @@ -985,11 +1019,11 @@ def test_duplicated_with_nas(self): expected = np.array([False, False, False, True, False, True]) tm.assert_numpy_array_equal(result, expected) - result = algos.duplicated(keys, keep='first') + result = algos.duplicated(keys, keep="first") expected = np.array([False, False, False, True, False, True]) tm.assert_numpy_array_equal(result, expected) - result = algos.duplicated(keys, keep='last') + result = algos.duplicated(keys, keep="last") expected = np.array([True, False, True, False, False, False]) tm.assert_numpy_array_equal(result, expected) @@ -998,8 +1032,9 @@ def test_duplicated_with_nas(self): tm.assert_numpy_array_equal(result, expected) keys = np.empty(8, dtype=object) - for i, t in enumerate(zip([0, 0, np.nan, np.nan] * 2, - [0, np.nan, 0, np.nan] * 2)): + for i, t in enumerate( + zip([0, 0, np.nan, np.nan] * 2, [0, np.nan, 0, np.nan] * 2) + ): keys[i] = t result = algos.duplicated(keys) @@ -1008,7 +1043,7 @@ def test_duplicated_with_nas(self): expected = np.array(falses + trues) tm.assert_numpy_array_equal(result, expected) - result = algos.duplicated(keys, keep='last') + result = algos.duplicated(keys, keep="last") expected = np.array(trues + falses) tm.assert_numpy_array_equal(result, expected) @@ -1016,51 +1051,66 @@ def test_duplicated_with_nas(self): expected = np.array(trues + trues) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('case', [ - np.array([1, 2, 1, 5, 3, - 2, 4, 1, 5, 6]), - np.array([1.1, 2.2, 1.1, np.nan, 3.3, - 2.2, 4.4, 1.1, np.nan, 6.6]), - np.array([1 + 1j, 2 + 2j, 1 + 1j, 5 + 5j, 3 + 3j, - 2 + 2j, 4 + 4j, 1 + 1j, 5 + 5j, 6 + 6j]), - np.array(['a', 'b', 'a', 'e', 'c', - 'b', 'd', 'a', 'e', 'f'], dtype=object), - np.array([1, 2**63, 1, 3**5, 10, 2**63, 39, 1, 3**5, 7], - dtype=np.uint64), - ]) + @pytest.mark.parametrize( + "case", + [ + np.array([1, 2, 1, 5, 3, 2, 4, 1, 5, 6]), + np.array([1.1, 2.2, 1.1, np.nan, 3.3, 2.2, 4.4, 1.1, np.nan, 6.6]), + np.array( + [ + 1 + 1j, + 2 + 2j, + 1 + 1j, + 5 + 5j, + 3 + 3j, + 2 + 2j, + 4 + 4j, + 1 + 1j, + 5 + 5j, + 6 + 6j, + ] + ), + np.array(["a", "b", "a", "e", "c", "b", "d", "a", "e", "f"], dtype=object), + np.array( + [1, 2 ** 63, 1, 3 ** 5, 10, 2 ** 63, 39, 1, 3 ** 5, 7], dtype=np.uint64 + ), + ], + ) def test_numeric_object_likes(self, case): - exp_first = np.array([False, False, True, False, False, - True, False, True, True, False]) - exp_last = np.array([True, True, True, True, False, - False, False, False, False, False]) + exp_first = np.array( + [False, False, True, False, False, True, False, True, True, False] + ) + exp_last = np.array( + [True, True, True, True, False, False, False, False, False, False] + ) exp_false = exp_first | exp_last - res_first = algos.duplicated(case, keep='first') + res_first = algos.duplicated(case, keep="first") tm.assert_numpy_array_equal(res_first, exp_first) - res_last = algos.duplicated(case, keep='last') + res_last = algos.duplicated(case, keep="last") tm.assert_numpy_array_equal(res_last, exp_last) res_false = algos.duplicated(case, keep=False) tm.assert_numpy_array_equal(res_false, exp_false) # index - for idx in [Index(case), Index(case, dtype='category')]: - res_first = idx.duplicated(keep='first') + for idx in [Index(case), Index(case, 
dtype="category")]: + res_first = idx.duplicated(keep="first") tm.assert_numpy_array_equal(res_first, exp_first) - res_last = idx.duplicated(keep='last') + res_last = idx.duplicated(keep="last") tm.assert_numpy_array_equal(res_last, exp_last) res_false = idx.duplicated(keep=False) tm.assert_numpy_array_equal(res_false, exp_false) # series - for s in [Series(case), Series(case, dtype='category')]: - res_first = s.duplicated(keep='first') + for s in [Series(case), Series(case, dtype="category")]: + res_first = s.duplicated(keep="first") tm.assert_series_equal(res_first, Series(exp_first)) - res_last = s.duplicated(keep='last') + res_last = s.duplicated(keep="last") tm.assert_series_equal(res_last, Series(exp_last)) res_false = s.duplicated(keep=False) @@ -1068,52 +1118,82 @@ def test_numeric_object_likes(self, case): def test_datetime_likes(self): - dt = ['2011-01-01', '2011-01-02', '2011-01-01', 'NaT', '2011-01-03', - '2011-01-02', '2011-01-04', '2011-01-01', 'NaT', '2011-01-06'] - td = ['1 days', '2 days', '1 days', 'NaT', '3 days', - '2 days', '4 days', '1 days', 'NaT', '6 days'] - - cases = [np.array([Timestamp(d) for d in dt]), - np.array([Timestamp(d, tz='US/Eastern') for d in dt]), - np.array([pd.Period(d, freq='D') for d in dt]), - np.array([np.datetime64(d) for d in dt]), - np.array([pd.Timedelta(d) for d in td])] - - exp_first = np.array([False, False, True, False, False, - True, False, True, True, False]) - exp_last = np.array([True, True, True, True, False, - False, False, False, False, False]) + dt = [ + "2011-01-01", + "2011-01-02", + "2011-01-01", + "NaT", + "2011-01-03", + "2011-01-02", + "2011-01-04", + "2011-01-01", + "NaT", + "2011-01-06", + ] + td = [ + "1 days", + "2 days", + "1 days", + "NaT", + "3 days", + "2 days", + "4 days", + "1 days", + "NaT", + "6 days", + ] + + cases = [ + np.array([Timestamp(d) for d in dt]), + np.array([Timestamp(d, tz="US/Eastern") for d in dt]), + np.array([pd.Period(d, freq="D") for d in dt]), + np.array([np.datetime64(d) for d in dt]), + np.array([pd.Timedelta(d) for d in td]), + ] + + exp_first = np.array( + [False, False, True, False, False, True, False, True, True, False] + ) + exp_last = np.array( + [True, True, True, True, False, False, False, False, False, False] + ) exp_false = exp_first | exp_last for case in cases: - res_first = algos.duplicated(case, keep='first') + res_first = algos.duplicated(case, keep="first") tm.assert_numpy_array_equal(res_first, exp_first) - res_last = algos.duplicated(case, keep='last') + res_last = algos.duplicated(case, keep="last") tm.assert_numpy_array_equal(res_last, exp_last) res_false = algos.duplicated(case, keep=False) tm.assert_numpy_array_equal(res_false, exp_false) # index - for idx in [Index(case), Index(case, dtype='category'), - Index(case, dtype=object)]: - res_first = idx.duplicated(keep='first') + for idx in [ + Index(case), + Index(case, dtype="category"), + Index(case, dtype=object), + ]: + res_first = idx.duplicated(keep="first") tm.assert_numpy_array_equal(res_first, exp_first) - res_last = idx.duplicated(keep='last') + res_last = idx.duplicated(keep="last") tm.assert_numpy_array_equal(res_last, exp_last) res_false = idx.duplicated(keep=False) tm.assert_numpy_array_equal(res_false, exp_false) # series - for s in [Series(case), Series(case, dtype='category'), - Series(case, dtype=object)]: - res_first = s.duplicated(keep='first') + for s in [ + Series(case), + Series(case, dtype="category"), + Series(case, dtype=object), + ]: + res_first = s.duplicated(keep="first") 
tm.assert_series_equal(res_first, Series(exp_first)) - res_last = s.duplicated(keep='last') + res_last = s.duplicated(keep="last") tm.assert_series_equal(res_last, Series(exp_last)) res_false = s.duplicated(keep=False) @@ -1123,17 +1203,24 @@ def test_unique_index(self): cases = [Index([1, 2, 3]), pd.RangeIndex(0, 3)] for case in cases: assert case.is_unique is True - tm.assert_numpy_array_equal(case.duplicated(), - np.array([False, False, False])) - - @pytest.mark.parametrize('arr, unique', [ - ([(0, 0), (0, 1), (1, 0), (1, 1), (0, 0), (0, 1), (1, 0), (1, 1)], - [(0, 0), (0, 1), (1, 0), (1, 1)]), - ([('b', 'c'), ('a', 'b'), ('a', 'b'), ('b', 'c')], - [('b', 'c'), ('a', 'b')]), - ([('a', 1), ('b', 2), ('a', 3), ('a', 1)], - [('a', 1), ('b', 2), ('a', 3)]), - ]) + tm.assert_numpy_array_equal( + case.duplicated(), np.array([False, False, False]) + ) + + @pytest.mark.parametrize( + "arr, unique", + [ + ( + [(0, 0), (0, 1), (1, 0), (1, 1), (0, 0), (0, 1), (1, 0), (1, 1)], + [(0, 0), (0, 1), (1, 0), (1, 1)], + ), + ( + [("b", "c"), ("a", "b"), ("a", "b"), ("b", "c")], + [("b", "c"), ("a", "b")], + ), + ([("a", 1), ("b", 2), ("a", 3), ("a", 1)], [("a", 1), ("b", 2), ("a", 3)]), + ], + ) def test_unique_tuples(self, arr, unique): # https://github.com/pandas-dev/pandas/issues/16519 expected = np.empty(len(unique), dtype=object) @@ -1144,18 +1231,17 @@ def test_unique_tuples(self, arr, unique): class GroupVarTestMixin(object): - def test_group_var_generic_1d(self): prng = RandomState(1234) out = (np.nan * np.ones((5, 1))).astype(self.dtype) - counts = np.zeros(5, dtype='int64') + counts = np.zeros(5, dtype="int64") values = 10 * prng.rand(15, 1).astype(self.dtype) - labels = np.tile(np.arange(5), (3, )).astype('int64') + labels = np.tile(np.arange(5), (3,)).astype("int64") - expected_out = (np.squeeze(values) - .reshape((5, 3), order='F') - .std(axis=1, ddof=1) ** 2)[:, np.newaxis] + expected_out = ( + np.squeeze(values).reshape((5, 3), order="F").std(axis=1, ddof=1) ** 2 + )[:, np.newaxis] expected_counts = counts + 3 self.algo(out, counts, values, labels) @@ -1166,9 +1252,9 @@ def test_group_var_generic_1d_flat_labels(self): prng = RandomState(1234) out = (np.nan * np.ones((1, 1))).astype(self.dtype) - counts = np.zeros(1, dtype='int64') + counts = np.zeros(1, dtype="int64") values = 10 * prng.rand(5, 1).astype(self.dtype) - labels = np.zeros(5, dtype='int64') + labels = np.zeros(5, dtype="int64") expected_out = np.array([[values.std(ddof=1) ** 2]]) expected_counts = counts + 5 @@ -1182,9 +1268,9 @@ def test_group_var_generic_2d_all_finite(self): prng = RandomState(1234) out = (np.nan * np.ones((5, 2))).astype(self.dtype) - counts = np.zeros(5, dtype='int64') + counts = np.zeros(5, dtype="int64") values = 10 * prng.rand(10, 2).astype(self.dtype) - labels = np.tile(np.arange(5), (2, )).astype('int64') + labels = np.tile(np.arange(5), (2,)).astype("int64") expected_out = np.std(values.reshape(2, 5, 2), ddof=1, axis=0) ** 2 expected_counts = counts + 2 @@ -1197,15 +1283,17 @@ def test_group_var_generic_2d_some_nan(self): prng = RandomState(1234) out = (np.nan * np.ones((5, 2))).astype(self.dtype) - counts = np.zeros(5, dtype='int64') + counts = np.zeros(5, dtype="int64") values = 10 * prng.rand(10, 2).astype(self.dtype) values[:, 1] = np.nan - labels = np.tile(np.arange(5), (2, )).astype('int64') - - expected_out = np.vstack([values[:, 0] - .reshape(5, 2, order='F') - .std(ddof=1, axis=1) ** 2, - np.nan * np.ones(5)]).T.astype(self.dtype) + labels = np.tile(np.arange(5), 
(2,)).astype("int64") + + expected_out = np.vstack( + [ + values[:, 0].reshape(5, 2, order="F").std(ddof=1, axis=1) ** 2, + np.nan * np.ones(5), + ] + ).T.astype(self.dtype) expected_counts = counts + 2 self.algo(out, counts, values, labels) @@ -1216,9 +1304,9 @@ def test_group_var_constant(self): # Regression test from GH 10448. out = np.array([[np.nan]], dtype=self.dtype) - counts = np.array([0], dtype='int64') + counts = np.array([0], dtype="int64") values = 0.832845131556193 * np.ones((3, 1), dtype=self.dtype) - labels = np.zeros(3, dtype='int64') + labels = np.zeros(3, dtype="int64") self.algo(out, counts, values, labels) @@ -1239,10 +1327,10 @@ def test_group_var_large_inputs(self): prng = RandomState(1234) out = np.array([[np.nan]], dtype=self.dtype) - counts = np.array([0], dtype='int64') + counts = np.array([0], dtype="int64") values = (prng.rand(10 ** 6) + 10 ** 12).astype(self.dtype) values.shape = (10 ** 6, 1) - labels = np.zeros(10 ** 6, dtype='int64') + labels = np.zeros(10 ** 6, dtype="int64") self.algo(out, counts, values, labels) @@ -1259,15 +1347,13 @@ class TestGroupVarFloat32(GroupVarTestMixin): class TestHashTable(object): - def test_lookup_nan(self, writable): xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3]) # GH 21688 ensure we can deal with readonly memory views xs.setflags(write=writable) m = ht.Float64HashTable() m.map_locations(xs) - tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), - dtype=np.int64)) + tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.int64)) def test_add_signed_zeros(self): # GH 21866 inconsistent hash-function for float64 @@ -1283,8 +1369,8 @@ def test_add_signed_zeros(self): def test_add_different_nans(self): # GH 21866 inconsistent hash-function for float64 # create different nans from bit-patterns: - NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0] - NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0] + NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0] + NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0] assert NAN1 != NAN1 assert NAN2 != NAN2 # default hash function would lead to different hash-buckets @@ -1295,28 +1381,32 @@ def test_add_different_nans(self): assert len(m) == 1 # NAN1 and NAN2 are equivalent def test_lookup_overflow(self, writable): - xs = np.array([1, 2, 2**63], dtype=np.uint64) + xs = np.array([1, 2, 2 ** 63], dtype=np.uint64) # GH 21688 ensure we can deal with readonly memory views xs.setflags(write=writable) m = ht.UInt64HashTable() m.map_locations(xs) - tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), - dtype=np.int64)) + tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.int64)) def test_get_unique(self): - s = Series([1, 2, 2**63, 2**63], dtype=np.uint64) - exp = np.array([1, 2, 2**63], dtype=np.uint64) + s = Series([1, 2, 2 ** 63, 2 ** 63], dtype=np.uint64) + exp = np.array([1, 2, 2 ** 63], dtype=np.uint64) tm.assert_numpy_array_equal(s.unique(), exp) - @pytest.mark.parametrize('nvals', [0, 10]) # resizing to 0 is special case - @pytest.mark.parametrize('htable, uniques, dtype, safely_resizes', [ - (ht.PyObjectHashTable, ht.ObjectVector, 'object', False), - (ht.StringHashTable, ht.ObjectVector, 'object', True), - (ht.Float64HashTable, ht.Float64Vector, 'float64', False), - (ht.Int64HashTable, ht.Int64Vector, 'int64', False), - (ht.UInt64HashTable, ht.UInt64Vector, 'uint64', False)]) - def test_vector_resize(self, writable, htable, uniques, dtype, - safely_resizes, nvals): + 
@pytest.mark.parametrize("nvals", [0, 10]) # resizing to 0 is special case + @pytest.mark.parametrize( + "htable, uniques, dtype, safely_resizes", + [ + (ht.PyObjectHashTable, ht.ObjectVector, "object", False), + (ht.StringHashTable, ht.ObjectVector, "object", True), + (ht.Float64HashTable, ht.Float64Vector, "float64", False), + (ht.Int64HashTable, ht.Int64Vector, "int64", False), + (ht.UInt64HashTable, ht.UInt64Vector, "uint64", False), + ], + ) + def test_vector_resize( + self, writable, htable, uniques, dtype, safely_resizes, nvals + ): # Test for memory errors after internal vector # reallocations (GH 7157) vals = np.array(np.random.randn(1000), dtype=dtype) @@ -1341,21 +1431,25 @@ def test_vector_resize(self, writable, htable, uniques, dtype, if safely_resizes: htable.get_labels(vals, uniques, 0, -1) else: - with pytest.raises(ValueError, match='external reference.*'): + with pytest.raises(ValueError, match="external reference.*"): htable.get_labels(vals, uniques, 0, -1) - uniques.to_array() # should not raise here + uniques.to_array() # should not raise here assert tmp.shape == oldshape - @pytest.mark.parametrize('htable, tm_dtype', [ - (ht.PyObjectHashTable, 'String'), - (ht.StringHashTable, 'String'), - (ht.Float64HashTable, 'Float'), - (ht.Int64HashTable, 'Int'), - (ht.UInt64HashTable, 'UInt')]) + @pytest.mark.parametrize( + "htable, tm_dtype", + [ + (ht.PyObjectHashTable, "String"), + (ht.StringHashTable, "String"), + (ht.Float64HashTable, "Float"), + (ht.Int64HashTable, "Int"), + (ht.UInt64HashTable, "UInt"), + ], + ) def test_hashtable_unique(self, htable, tm_dtype, writable): # output of maker has guaranteed unique elements - maker = getattr(tm, 'make' + tm_dtype + 'Index') + maker = getattr(tm, "make" + tm_dtype + "Index") s = Series(maker(1000)) if htable == ht.Float64HashTable: # add NaN for float column @@ -1370,27 +1464,32 @@ def test_hashtable_unique(self, htable, tm_dtype, writable): # drop_duplicates has own cython code (hash_table_func_helper.pxi) # and is tested separately; keeps first occurrence like ht.unique() - expected_unique = s_duplicated.drop_duplicates(keep='first').values + expected_unique = s_duplicated.drop_duplicates(keep="first").values result_unique = htable().unique(s_duplicated.values) tm.assert_numpy_array_equal(result_unique, expected_unique) # test return_inverse=True # reconstruction can only succeed if the inverse is correct - result_unique, result_inverse = htable().unique(s_duplicated.values, - return_inverse=True) + result_unique, result_inverse = htable().unique( + s_duplicated.values, return_inverse=True + ) tm.assert_numpy_array_equal(result_unique, expected_unique) reconstr = result_unique[result_inverse] tm.assert_numpy_array_equal(reconstr, s_duplicated.values) - @pytest.mark.parametrize('htable, tm_dtype', [ - (ht.PyObjectHashTable, 'String'), - (ht.StringHashTable, 'String'), - (ht.Float64HashTable, 'Float'), - (ht.Int64HashTable, 'Int'), - (ht.UInt64HashTable, 'UInt')]) + @pytest.mark.parametrize( + "htable, tm_dtype", + [ + (ht.PyObjectHashTable, "String"), + (ht.StringHashTable, "String"), + (ht.Float64HashTable, "Float"), + (ht.Int64HashTable, "Int"), + (ht.UInt64HashTable, "UInt"), + ], + ) def test_hashtable_factorize(self, htable, tm_dtype, writable): # output of maker has guaranteed unique elements - maker = getattr(tm, 'make' + tm_dtype + 'Index') + maker = getattr(tm, "make" + tm_dtype + "Index") s = Series(maker(1000)) if htable == ht.Float64HashTable: # add NaN for float column @@ -1418,42 +1517,46 @@ def 
test_hashtable_factorize(self, htable, tm_dtype, writable): expected_reconstruct = s_duplicated.dropna().values tm.assert_numpy_array_equal(result_reconstruct, expected_reconstruct) - @pytest.mark.parametrize('hashtable', [ - ht.PyObjectHashTable, ht.StringHashTable, - ht.Float64HashTable, ht.Int64HashTable, ht.UInt64HashTable]) + @pytest.mark.parametrize( + "hashtable", + [ + ht.PyObjectHashTable, + ht.StringHashTable, + ht.Float64HashTable, + ht.Int64HashTable, + ht.UInt64HashTable, + ], + ) def test_hashtable_large_sizehint(self, hashtable): # GH 22729 size_hint = np.iinfo(np.uint32).max + 1 - tbl = hashtable(size_hint=size_hint) # noqa + tbl = hashtable(size_hint=size_hint) # noqa def test_quantile(): s = Series(np.random.randn(100)) - result = algos.quantile(s, [0, .25, .5, .75, 1.]) - expected = algos.quantile(s.values, [0, .25, .5, .75, 1.]) + result = algos.quantile(s, [0, 0.25, 0.5, 0.75, 1.0]) + expected = algos.quantile(s.values, [0, 0.25, 0.5, 0.75, 1.0]) tm.assert_almost_equal(result, expected) def test_unique_label_indices(): - a = np.random.randint(1, 1 << 10, 1 << 15).astype('i8') + a = np.random.randint(1, 1 << 10, 1 << 15).astype("i8") left = ht.unique_label_indices(a) right = np.unique(a, return_index=True)[1] - tm.assert_numpy_array_equal(left, right, - check_dtype=False) + tm.assert_numpy_array_equal(left, right, check_dtype=False) a[np.random.choice(len(a), 10)] = -1 left = ht.unique_label_indices(a) right = np.unique(a, return_index=True)[1][1:] - tm.assert_numpy_array_equal(left, right, - check_dtype=False) + tm.assert_numpy_array_equal(left, right, check_dtype=False) class TestRank(object): - @td.skip_if_no_scipy def test_scipy_compat(self): from scipy.stats import rankdata @@ -1467,13 +1570,13 @@ def _check(arr): exp[mask] = nan assert_almost_equal(result, exp) - _check(np.array([nan, nan, 5., 5., 5., nan, 1, 2, 3, nan])) - _check(np.array([4., nan, 5., 5., 5., nan, 1, 2, 4., nan])) + _check(np.array([nan, nan, 5.0, 5.0, 5.0, nan, 1, 2, 3, nan])) + _check(np.array([4.0, nan, 5.0, 5.0, 5.0, nan, 1, 2, 4.0, nan])) def test_basic(self): exp = np.array([1, 2], dtype=np.float64) - for dtype in np.typecodes['AllInteger']: + for dtype in np.typecodes["AllInteger"]: s = Series([1, 100], dtype=dtype) tm.assert_numpy_array_equal(algos.rank(s), exp) @@ -1481,7 +1584,7 @@ def test_uint64_overflow(self): exp = np.array([1, 2], dtype=np.float64) for dtype in [np.float64, np.uint64]: - s = Series([1, 2**63], dtype=dtype) + s = Series([1, 2 ** 63], dtype=dtype) tm.assert_numpy_array_equal(algos.rank(s), exp) def test_too_many_ndims(self): @@ -1492,10 +1595,11 @@ def test_too_many_ndims(self): algos.rank(arr) @pytest.mark.single - @pytest.mark.parametrize('values', [ - np.arange(2**24 + 1), - np.arange(2**25 + 2).reshape(2**24 + 1, 2)], - ids=['1d', '2d']) + @pytest.mark.parametrize( + "values", + [np.arange(2 ** 24 + 1), np.arange(2 ** 25 + 2).reshape(2 ** 24 + 1, 2)], + ids=["1d", "2d"], + ) def test_pct_max_many_rows(self, values): # GH 18271 result = algos.rank(values, pct=True).max() @@ -1504,8 +1608,8 @@ def test_pct_max_many_rows(self, values): def test_pad_backfill_object_segfault(): - old = np.array([], dtype='O') - new = np.array([datetime(2010, 12, 31)], dtype='O') + old = np.array([], dtype="O") + new = np.array([datetime(2010, 12, 31)], dtype="O") result = libalgos.pad["object"](old, new) expected = np.array([-1], dtype=np.int64) @@ -1525,13 +1629,12 @@ def test_pad_backfill_object_segfault(): def test_arrmap(): - values = np.array(['foo', 'foo', 'bar', 'bar', 'baz', 
-                       'qux'], dtype='O')
-    result = libalgos.arrmap_object(values, lambda x: x in ['foo', 'bar'])
-    assert (result.dtype == np.bool_)
+    values = np.array(["foo", "foo", "bar", "bar", "baz", "qux"], dtype="O")
+    result = libalgos.arrmap_object(values, lambda x: x in ["foo", "bar"])
+    assert result.dtype == np.bool_
 
 
 class TestTseriesUtil(object):
-
     def test_combineFunc(self):
         pass
 
@@ -1553,8 +1656,7 @@ def test_backfill(self):
 
         filler = libalgos.backfill["int64_t"](old.values, new.values)
 
-        expect_filler = np.array([0, 0, 1, 1, 1, 1,
-                                  2, 2, 2, 2, 2, -1], dtype=np.int64)
+        expect_filler = np.array([0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, -1], dtype=np.int64)
         tm.assert_numpy_array_equal(filler, expect_filler)
 
         # corner case
@@ -1571,8 +1673,7 @@ def test_pad(self):
 
         filler = libalgos.pad["int64_t"](old.values, new.values)
 
-        expect_filler = np.array([-1, 0, 0, 0, 0, 1,
-                                  1, 1, 1, 1, 2, 2], dtype=np.int64)
+        expect_filler = np.array([-1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2], dtype=np.int64)
         tm.assert_numpy_array_equal(filler, expect_filler)
 
         # corner case
@@ -1585,32 +1686,267 @@ def test_pad(self):
 
 def test_is_lexsorted():
     failure = [
-        np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-                  3, 3,
-                  3, 3,
-                  3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2,
-                  2, 2, 2, 2, 2, 2, 2,
-                  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-                  1, 1, 1, 1, 1, 1, 1,
-                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-                  1, 1, 1, 1, 1, 1, 1,
-                  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                  0, 0, 0, 0, 0, 0, 0,
-                  0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='int64'),
-        np.array([30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
-                  15, 14,
-                  13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28,
-                  27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13,
-                  12, 11,
-                  10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, 27, 26, 25,
-                  24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10,
-                  9, 8,
-                  7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, 27, 26, 25, 24, 23, 22,
-                  21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
-                  6, 5,
-                  4, 3, 2, 1, 0], dtype='int64')]
-
-    assert (not libalgos.is_lexsorted(failure))
+        np.array(
+            [
+                3,
+                3,
+                3,
+                3,
+                3,
+                3,
+                3,
+                3,
+                3,
+                3,
+                3,
+                3,
+                3,
+                3,
+                3,
+                3,
+                3,
+                3,
+                3,
+                3,
+                3,
+                3,
+                3,
+                3,
+                3,
+                3,
+                3,
+                3,
+                3,
+                3,
+                3,
+                2,
+                2,
+                2,
+                2,
+                2,
+                2,
+                2,
+                2,
+                2,
+                2,
+                2,
+                2,
+                2,
+                2,
+                2,
+                2,
+                2,
+                2,
+                2,
+                2,
+                2,
+                2,
+                2,
+                2,
+                2,
+                2,
+                2,
+                2,
+                2,
+                2,
+                2,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+            ],
+            dtype="int64",
+        ),
+        np.array(
+            [
+                30,
+                29,
+                28,
+                27,
+                26,
+                25,
+                24,
+                23,
+                22,
+                21,
+                20,
+                19,
+                18,
+                17,
+                16,
+                15,
+                14,
+                13,
+                12,
+                11,
+                10,
+                9,
+                8,
+                7,
+                6,
+                5,
+                4,
+                3,
+                2,
+                1,
+                0,
+                30,
+                29,
+                28,
+                27,
+                26,
+                25,
+                24,
+                23,
+                22,
+                21,
+                20,
+                19,
+                18,
+                17,
+                16,
+                15,
+                14,
+                13,
+                12,
+                11,
+                10,
+                9,
+                8,
+                7,
+                6,
+                5,
+                4,
+                3,
+                2,
+                1,
+                0,
+                30,
+                29,
+                28,
+                27,
+                26,
+                25,
+                24,
+                23,
+                22,
+                21,
+                20,
+                19,
+                18,
+                17,
+                16,
+                15,
+                14,
+                13,
+                12,
+                11,
+                10,
+                9,
+                8,
+                7,
+                6,
+                5,
+                4,
+                3,
+                2,
+                1,
+                0,
+                30,
+                29,
+                28,
+                27,
+                26,
+                25,
+                24,
+                23,
+                22,
+                21,
+                20,
+                19,
+                18,
+                17,
+                16,
+                15,
+                14,
+                13,
+                12,
+                11,
+                10,
+                9,
+                8,
+                7,
+                6,
+                5,
+                4,
+                3,
+                2,
+                1,
+                0,
+            ],
+            dtype="int64",
+        ),
+    ]
+
+    assert not libalgos.is_lexsorted(failure)
 
 
 def test_groupsort_indexer():
@@ -1622,7 +1958,7 @@ def test_groupsort_indexer():
     # need to use a stable sort
     # np.argsort returns int, groupsort_indexer
     # always returns int64
-    expected = np.argsort(a, kind='mergesort')
+    expected = np.argsort(a, kind="mergesort")
     expected = expected.astype(np.int64)
 
     tm.assert_numpy_array_equal(result, expected)
@@ -1694,7 +2030,7 @@ def test_ensure_platform_int():
     arr = np.arange(100, dtype=np.intp)
 
     result = libalgos.ensure_platform_int(arr)
-    assert (result is arr)
+    assert result is arr
 
 
 def test_int64_add_overflow():
@@ -1714,34 +2050,42 @@ def test_int64_add_overflow():
     with pytest.raises(OverflowError, match=msg):
         algos.checked_add_with_arr(np.array([m, n]), np.array([n, n]))
     with pytest.raises(OverflowError, match=msg):
-        algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
-                                   arr_mask=np.array([False, True]))
+        algos.checked_add_with_arr(
+            np.array([m, m]), np.array([m, m]), arr_mask=np.array([False, True])
+        )
     with pytest.raises(OverflowError, match=msg):
-        algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
-                                   b_mask=np.array([False, True]))
+        algos.checked_add_with_arr(
+            np.array([m, m]), np.array([m, m]), b_mask=np.array([False, True])
+        )
    with pytest.raises(OverflowError, match=msg):
-        algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
-                                   arr_mask=np.array([False, True]),
-                                   b_mask=np.array([False, True]))
+        algos.checked_add_with_arr(
+            np.array([m, m]),
+            np.array([m, m]),
+            arr_mask=np.array([False, True]),
+            b_mask=np.array([False, True]),
+        )
     with pytest.raises(OverflowError, match=msg):
         with tm.assert_produces_warning(RuntimeWarning):
-            algos.checked_add_with_arr(np.array([m, m]),
-                                       np.array([np.nan, m]))
+            algos.checked_add_with_arr(np.array([m, m]), np.array([np.nan, m]))
 
     # Check that the nan boolean arrays override whether or not
     # the addition overflows. We don't check the result but just
    # the fact that an OverflowError is not raised.
-    algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
-                               arr_mask=np.array([True, True]))
-    algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
-                               b_mask=np.array([True, True]))
-    algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
-                               arr_mask=np.array([True, False]),
-                               b_mask=np.array([False, True]))
+    algos.checked_add_with_arr(
+        np.array([m, m]), np.array([m, m]), arr_mask=np.array([True, True])
+    )
+    algos.checked_add_with_arr(
+        np.array([m, m]), np.array([m, m]), b_mask=np.array([True, True])
+    )
+    algos.checked_add_with_arr(
+        np.array([m, m]),
+        np.array([m, m]),
+        arr_mask=np.array([True, False]),
+        b_mask=np.array([False, True]),
+    )
 
 
 class TestMode(object):
-
     def test_no_mode(self):
         exp = Series([], dtype=np.float64)
         tm.assert_series_equal(algos.mode([]), exp)
@@ -1754,7 +2098,7 @@ def test_mode_single(self):
         exp_multi = [1]
         data_multi = [1, 1]
 
-        for dt in np.typecodes['AllInteger'] + np.typecodes['Float']:
+        for dt in np.typecodes["AllInteger"] + np.typecodes["Float"]:
             s = Series(data_single, dtype=dt)
             exp = Series(exp_single, dtype=dt)
             tm.assert_series_equal(algos.mode(s), exp)
@@ -1766,8 +2110,8 @@ def test_mode_single(self):
         exp = Series([1], dtype=np.int)
         tm.assert_series_equal(algos.mode([1]), exp)
 
-        exp = Series(['a', 'b', 'c'], dtype=np.object)
-        tm.assert_series_equal(algos.mode(['a', 'b', 'c']), exp)
+        exp = Series(["a", "b", "c"], dtype=np.object)
+        tm.assert_series_equal(algos.mode(["a", "b", "c"]), exp)
 
     def test_number_mode(self):
         exp_single = [1]
@@ -1776,7 +2120,7 @@ def test_number_mode(self):
         exp_multi = [1, 3]
         data_multi = [1] * 5 + [2] * 3 + [3] * 5
 
-        for dt in np.typecodes['AllInteger'] + np.typecodes['Float']:
+        for dt in np.typecodes["AllInteger"] + np.typecodes["Float"]:
             s = Series(data_single, dtype=dt)
             exp = Series(exp_single, dtype=dt)
             tm.assert_series_equal(algos.mode(s), exp)
@@ -1786,15 +2130,15 @@ def test_number_mode(self):
         tm.assert_series_equal(algos.mode(s), exp)
 
     def test_strobj_mode(self):
-        exp = ['b']
-        data = ['a'] * 2 + ['b'] * 3
+        exp = ["b"]
+        data = ["a"] * 2 + ["b"] * 3
 
-        s = Series(data, dtype='c')
-        exp = Series(exp, dtype='c')
+        s = Series(data, dtype="c")
+        exp = Series(exp, dtype="c")
         tm.assert_series_equal(algos.mode(s), exp)
 
-        exp = ['bar']
-        data = ['foo'] * 2 + ['bar'] * 3
+        exp = ["bar"]
+        data = ["foo"] * 2 + ["bar"] * 3
 
         for dt in [str, object]:
             s = Series(data, dtype=dt)
@@ -1802,41 +2146,41 @@ def test_strobj_mode(self):
             tm.assert_series_equal(algos.mode(s), exp)
 
     def test_datelike_mode(self):
-        exp = Series(['1900-05-03', '2011-01-03',
-                      '2013-01-02'], dtype="M8[ns]")
-        s = Series(['2011-01-03', '2013-01-02',
-                    '1900-05-03'], dtype='M8[ns]')
+        exp = Series(["1900-05-03", "2011-01-03", "2013-01-02"], dtype="M8[ns]")
+        s = Series(["2011-01-03", "2013-01-02", "1900-05-03"], dtype="M8[ns]")
         tm.assert_series_equal(algos.mode(s), exp)
 
-        exp = Series(['2011-01-03', '2013-01-02'], dtype='M8[ns]')
-        s = Series(['2011-01-03', '2013-01-02', '1900-05-03',
-                    '2011-01-03', '2013-01-02'], dtype='M8[ns]')
+        exp = Series(["2011-01-03", "2013-01-02"], dtype="M8[ns]")
+        s = Series(
+            ["2011-01-03", "2013-01-02", "1900-05-03", "2011-01-03", "2013-01-02"],
+            dtype="M8[ns]",
+        )
         tm.assert_series_equal(algos.mode(s), exp)
 
     def test_timedelta_mode(self):
-        exp = Series(['-1 days', '0 days', '1 days'],
-                     dtype='timedelta64[ns]')
-        s = Series(['1 days', '-1 days', '0 days'],
-                   dtype='timedelta64[ns]')
+        exp = Series(["-1 days", "0 days", "1 days"], dtype="timedelta64[ns]")
+        s = Series(["1 days", "-1 days", "0 days"], dtype="timedelta64[ns]")
dtype="timedelta64[ns]") tm.assert_series_equal(algos.mode(s), exp) - exp = Series(['2 min', '1 day'], dtype='timedelta64[ns]') - s = Series(['1 day', '1 day', '-1 day', '-1 day 2 min', - '2 min', '2 min'], dtype='timedelta64[ns]') + exp = Series(["2 min", "1 day"], dtype="timedelta64[ns]") + s = Series( + ["1 day", "1 day", "-1 day", "-1 day 2 min", "2 min", "2 min"], + dtype="timedelta64[ns]", + ) tm.assert_series_equal(algos.mode(s), exp) def test_mixed_dtype(self): - exp = Series(['foo']) - s = Series([1, 'foo', 'foo']) + exp = Series(["foo"]) + s = Series([1, "foo", "foo"]) tm.assert_series_equal(algos.mode(s), exp) def test_uint64_overflow(self): - exp = Series([2**63], dtype=np.uint64) - s = Series([1, 2**63, 2**63], dtype=np.uint64) + exp = Series([2 ** 63], dtype=np.uint64) + s = Series([1, 2 ** 63, 2 ** 63], dtype=np.uint64) tm.assert_series_equal(algos.mode(s), exp) - exp = Series([1, 2**63], dtype=np.uint64) - s = Series([1, 2**63], dtype=np.uint64) + exp = Series([1, 2 ** 63], dtype=np.uint64) + s = Series([1, 2 ** 63], dtype=np.uint64) tm.assert_series_equal(algos.mode(s), exp) def test_categorical(self): @@ -1845,8 +2189,8 @@ def test_categorical(self): tm.assert_categorical_equal(algos.mode(c), exp) tm.assert_categorical_equal(c.mode(), exp) - c = Categorical([1, 'a', 'a']) - exp = Categorical(['a'], categories=[1, 'a']) + c = Categorical([1, "a", "a"]) + exp = Categorical(["a"], categories=[1, "a"]) tm.assert_categorical_equal(algos.mode(c), exp) tm.assert_categorical_equal(c.mode(), exp) @@ -1860,15 +2204,17 @@ def test_index(self): exp = Series([1, 2, 3], dtype=np.int64) tm.assert_series_equal(algos.mode(idx), exp) - idx = Index([1, 'a', 'a']) - exp = Series(['a'], dtype=object) + idx = Index([1, "a", "a"]) + exp = Series(["a"], dtype=object) tm.assert_series_equal(algos.mode(idx), exp) idx = Index([1, 1, 2, 3, 3]) exp = Series([1, 3], dtype=np.int64) tm.assert_series_equal(algos.mode(idx), exp) - exp = Series(['2 min', '1 day'], dtype='timedelta64[ns]') - idx = Index(['1 day', '1 day', '-1 day', '-1 day 2 min', - '2 min', '2 min'], dtype='timedelta64[ns]') + exp = Series(["2 min", "1 day"], dtype="timedelta64[ns]") + idx = Index( + ["1 day", "1 day", "-1 day", "-1 day 2 min", "2 min", "2 min"], + dtype="timedelta64[ns]", + ) tm.assert_series_equal(algos.mode(idx), exp) From 786159fd56759428c939fbb7891ce4d6a25da8c0 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 11 Oct 2019 14:10:46 +0200 Subject: [PATCH 14/15] lint --- pandas/tests/test_algos.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index d6c29bbf8454f..1d76b99a7170d 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -371,6 +371,7 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): else: tm.assert_extension_array_equal(uniques, expected_uniques) + class TestUnique: def test_unique_all_dtypes(self, any_numpy_dtype): dtype = any_numpy_dtype From dfee500d4ded093ee08fe609cac768faae563633 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Fri, 11 Oct 2019 16:18:34 +0200 Subject: [PATCH 15/15] fix oversight from merge --- pandas/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 8053b3e6c3219..8392f9c4cc77d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -428,7 +428,7 @@ def unique(values, return_inverse=False): else: uniques = table.unique(values) - uniques = _reconstruct_data(uniques, dtype, original) + uniques = _reconstruct_data(uniques, original.dtype, original) if return_inverse: return uniques, inverse