diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index fa565aa802faf..f4c77263bf21b 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -2,6 +2,7 @@
 import numpy as np
 
 from pandas.errors import AbstractMethodError
+from pandas.compat.numpy import function as nv
 
 _not_implemented_message = "{} does not implement {}."
 
@@ -236,6 +237,54 @@ def isna(self):
         """
         raise AbstractMethodError(self)
 
+    def _values_for_argsort(self):
+        # type: () -> ndarray
+        """Return the values that ``argsort`` should sort.
+
+        Returns
+        -------
+        ndarray
+            Values whose relative ordering should match the ordering of
+            the elements of this array.
+
+        See Also
+        --------
+        ExtensionArray.argsort
+        """
+        # Consumed by `ExtensionArray.argsort`.
+        return np.array(self)
+
+    def argsort(self, ascending=True, kind='quicksort', *args, **kwargs):
+        """
+        Compute the positions that would sort this array.
+
+        Parameters
+        ----------
+        ascending : bool, default True
+            Ascending order when True; the result is reversed to give
+            descending order when False.
+        kind : {'quicksort', 'mergesort', 'heapsort'}, optional
+            Sorting algorithm.
+        *args, **kwargs:
+            ``*args`` is only handed to the numpy-compat validator;
+            ``**kwargs`` is forwarded to :func:`numpy.argsort`.
+
+        Returns
+        -------
+        index_array : ndarray
+            Positions that sort ``self``.
+
+        See Also
+        --------
+        numpy.argsort : Sorting implementation used internally.
+        """
+        # Implementor note: subclasses can customize sorting at two levels.
+        # 1. _values_for_argsort : choose the values handed to np.argsort
+        # 2. argsort : take total control over sorting.
+        ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs)
+        indexer = np.argsort(self._values_for_argsort(), kind=kind, **kwargs)
+        return indexer if ascending else indexer[::-1]
+
     def fillna(self, value=None, method=None, limit=None):
         """
         Fill NA/NaN values using the specified method.
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index e7d414f9de544..13384dd56a9c1 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1378,17 +1378,24 @@ def check_for_ordered(self, op):
                             "you can use .as_ordered() to change the "
                             "Categorical to an ordered one\n".format(op=op))
 
-    def argsort(self, ascending=True, kind='quicksort', *args, **kwargs):
-        """
-        Returns the indices that would sort the Categorical instance if
-        'sort_values' was called. This function is implemented to provide
-        compatibility with numpy ndarray objects.
+    def _values_for_argsort(self):
+        return self._codes.copy()
 
-        While an ordering is applied to the category values, arg-sorting
-        in this context refers more to organizing and grouping together
-        based on matching category values. Thus, this function can be
-        called on an unordered Categorical instance unlike the functions
-        'Categorical.min' and 'Categorical.max'.
+    def argsort(self, *args, **kwargs):
+        # TODO(PY2): use correct signature
+        # We have to do *args, **kwargs to avoid a py2-only signature
+        # issue since np.argsort differs from argsort.
+        """Return the indices that would sort the Categorical.
+
+        Parameters
+        ----------
+        ascending : bool, default True
+            Whether the indices should result in an ascending
+            or descending sort.
+        kind : {'quicksort', 'mergesort', 'heapsort'}, optional
+            Sorting algorithm.
+        *args, **kwargs:
+            passed through to :func:`numpy.argsort`.
 
         Returns
         -------
@@ -1397,12 +1404,28 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs):
         See also
         --------
         numpy.ndarray.argsort
+
+        Notes
+        -----
+        While an ordering is applied to the category values, arg-sorting
+        in this context refers more to organizing and grouping together
+        based on matching category values. Thus, this function can be
+        called on an unordered Categorical instance unlike the functions
+        'Categorical.min' and 'Categorical.max'.
+
+        Examples
+        --------
+        >>> pd.Categorical(['b', 'b', 'a', 'c']).argsort()
+        array([2, 0, 1, 3])
+
+        >>> cat = pd.Categorical(['b', 'b', 'a', 'c'],
+        ...                      categories=['c', 'b', 'a'],
+        ...                      ordered=True)
+        >>> cat.argsort()
+        array([3, 0, 1, 2])
         """
-        ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs)
-        result = np.argsort(self._codes.copy(), kind=kind, **kwargs)
-        if not ascending:
-            result = result[::-1]
-        return result
+        # Keep the implementation here just for the docstring.
+        return super(Categorical, self).argsort(*args, **kwargs)
 
     def sort_values(self, inplace=False, ascending=True, na_position='last'):
         """ Sorts the Categorical by category value returning a new
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
index 7ce80e25d8cf6..4d467d62d0a56 100644
--- a/pandas/tests/extension/base/methods.py
+++ b/pandas/tests/extension/base/methods.py
@@ -32,6 +32,46 @@ def test_apply_simple_series(self, data):
         result = pd.Series(data).apply(id)
         assert isinstance(result, pd.Series)
 
+    def test_argsort(self, data_for_sorting):
+        result = pd.Series(data_for_sorting).argsort()
+        expected = pd.Series(np.array([2, 0, 1], dtype=np.int64))
+        self.assert_series_equal(result, expected)
+
+    def test_argsort_missing(self, data_missing_for_sorting):
+        result = pd.Series(data_missing_for_sorting).argsort()
+        expected = pd.Series(np.array([1, -1, 0], dtype=np.int64))
+        self.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize('ascending', [True, False])
+    def test_sort_values(self, data_for_sorting, ascending):
+        ser = pd.Series(data_for_sorting)
+        result = ser.sort_values(ascending=ascending)
+        expected = ser.iloc[[2, 0, 1]]
+        if not ascending:
+            expected = expected[::-1]
+
+        self.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize('ascending', [True, False])
+    def test_sort_values_missing(self, data_missing_for_sorting, ascending):
+        ser = pd.Series(data_missing_for_sorting)
+        result = ser.sort_values(ascending=ascending)
+        if ascending:
+            expected = ser.iloc[[2, 0, 1]]
+        else:
+            expected = ser.iloc[[0, 2, 1]]
+        self.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize('ascending', [True, False])
+    def test_sort_values_frame(self, data_for_sorting, ascending):
+        df = pd.DataFrame({"A": [1, 2, 1],
+                           "B": data_for_sorting})
+        result = df.sort_values(['A', 'B'])
+        expected = pd.DataFrame({"A": [1, 1, 2],
+                                 'B': data_for_sorting.take([2, 0, 1])},
+                                index=[2, 0, 1])
+        self.assert_frame_equal(result, expected)
+
     @pytest.mark.parametrize('box', [pd.Series, lambda x: x])
     @pytest.mark.parametrize('method', [lambda x: x.unique(), pd.unique])
     def test_unique(self, data, box, method):
diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py
index b6dd181c1d8f3..b602d9ee78e2a 100644
--- a/pandas/tests/extension/category/test_categorical.py
+++ b/pandas/tests/extension/category/test_categorical.py
@@ -29,6 +29,18 @@ def data_missing():
     return Categorical([np.nan, 'A'])
 
 
+@pytest.fixture
+def data_for_sorting():
+    return Categorical(['A', 'B', 'C'], categories=['C', 'A', 'B'],
+                       ordered=True)
+
+
+@pytest.fixture
+def data_missing_for_sorting():
+    return Categorical(['A', None, 'B'], categories=['B', 'A'],
+                       ordered=True)
+
+
 @pytest.fixture
 def na_value():
     return np.nan
diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py
index 21ed8894e8ebb..04dfb408fc378 100644
--- a/pandas/tests/extension/conftest.py
+++ b/pandas/tests/extension/conftest.py
@@ -30,6 +30,26 @@ def all_data(request, data, data_missing):
         return data_missing
 
 
+@pytest.fixture
+def data_for_sorting():
+    """Length-3 array with a known sort order.
+
+    This should be three items [B, C, A] with
+    A < B < C
+    """
+    raise NotImplementedError
+
+
+@pytest.fixture
+def data_missing_for_sorting():
+    """Length-3 array with a known sort order.
+
+    This should be three items [B, NA, A] with
+    A < B and NA missing.
+    """
+    raise NotImplementedError
+
+
 @pytest.fixture
 def na_cmp():
     """Binary operator for comparing NA values.
diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py
index 4c6ef9b4d38c8..7d959ea4fcd84 100644
--- a/pandas/tests/extension/decimal/test_decimal.py
+++ b/pandas/tests/extension/decimal/test_decimal.py
@@ -25,6 +25,20 @@ def data_missing():
     return DecimalArray([decimal.Decimal('NaN'), decimal.Decimal(1)])
 
 
+@pytest.fixture
+def data_for_sorting():
+    return DecimalArray([decimal.Decimal('1'),
+                         decimal.Decimal('2'),
+                         decimal.Decimal('0')])
+
+
+@pytest.fixture
+def data_missing_for_sorting():
+    return DecimalArray([decimal.Decimal('1'),
+                         decimal.Decimal('NaN'),
+                         decimal.Decimal('0')])
+
+
 @pytest.fixture
 def na_cmp():
     return lambda x, y: x.is_nan() and y.is_nan()
@@ -48,11 +62,17 @@ def assert_series_equal(self, left, right, *args, **kwargs):
                                       *args, **kwargs)
 
     def assert_frame_equal(self, left, right, *args, **kwargs):
-        self.assert_series_equal(left.dtypes, right.dtypes)
-        for col in left.columns:
+        # TODO(EA): select_dtypes
+        decimals = left.dtypes[left.dtypes == 'decimal'].index
+
+        for col in decimals:
             self.assert_series_equal(left[col], right[col],
                                      *args, **kwargs)
 
+        left = left.drop(columns=decimals)
+        right = right.drop(columns=decimals)
+        tm.assert_frame_equal(left, right, *args, **kwargs)
+
 
 class TestDtype(BaseDecimal, base.BaseDtypeTests):
     pass
diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py
index 322944129146a..ee0951812b8f0 100644
--- a/pandas/tests/extension/json/array.py
+++ b/pandas/tests/extension/json/array.py
@@ -44,7 +44,11 @@ def __getitem__(self, item):
             return self._constructor_from_sequence([
                 x for x, m in zip(self, item) if m
             ])
+        elif isinstance(item, collections.Iterable):
+            # fancy indexing
+            return type(self)([self.data[i] for i in item])
         else:
+            # slice
             return type(self)(self.data[item])
 
     def __setitem__(self, key, value):
@@ -104,6 +108,13 @@ def _concat_same_type(cls, to_concat):
         data = list(itertools.chain.from_iterable([x.data for x in to_concat]))
         return cls(data)
 
+    def _values_for_argsort(self):
+        # Disable NumPy's shape inference by including an empty tuple...
+        # If all the elements of self are the same size P, NumPy will
+        # cast them to an (N, P) array, instead of an (N,) array of tuples.
+        frozen = [()] + [tuple(x.items()) for x in self]
+        return np.array(frozen, dtype=object)[1:]
+
 
 def make_data():
     # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer
diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py
index 16d5e4415a79f..aec561ece8573 100644
--- a/pandas/tests/extension/json/test_json.py
+++ b/pandas/tests/extension/json/test_json.py
@@ -29,6 +29,16 @@ def data_missing():
     return JSONArray([{}, {'a': 10}])
 
 
+@pytest.fixture
+def data_for_sorting():
+    return JSONArray([{'b': 1}, {'c': 4}, {'a': 2, 'c': 3}])
+
+
+@pytest.fixture
+def data_missing_for_sorting():
+    return JSONArray([{'b': 1}, {}, {'a': 4}])
+
+
 @pytest.fixture
 def na_value():
     return {}
@@ -70,10 +80,41 @@ def test_fillna_frame(self):
 
 
 class TestMethods(base.BaseMethodsTests):
-    @pytest.mark.skip(reason="Unhashable")
+    unhashable = pytest.mark.skip(reason="Unhashable")
+    # CPython dicts only preserve insertion order from 3.6 onwards.
+    unstable = pytest.mark.skipif(sys.version_info < (3, 6),
+                                  reason="Dictionary order unstable")
+
+    @unhashable
     def test_value_counts(self, all_data, dropna):
         pass
 
+    @unhashable
+    def test_sort_values_frame(self):
+        # TODO (EA.factorize): see if _values_for_factorize allows this.
+        pass
+
+    @unstable
+    def test_argsort(self, data_for_sorting):
+        super(TestMethods, self).test_argsort(data_for_sorting)
+
+    @unstable
+    def test_argsort_missing(self, data_missing_for_sorting):
+        super(TestMethods, self).test_argsort_missing(
+            data_missing_for_sorting)
+
+    @unstable
+    @pytest.mark.parametrize('ascending', [True, False])
+    def test_sort_values(self, data_for_sorting, ascending):
+        super(TestMethods, self).test_sort_values(
+            data_for_sorting, ascending)
+
+    @unstable
+    @pytest.mark.parametrize('ascending', [True, False])
+    def test_sort_values_missing(self, data_missing_for_sorting, ascending):
+        super(TestMethods, self).test_sort_values_missing(
+            data_missing_for_sorting, ascending)
+
 
 class TestCasting(base.BaseCastingTests):
     pass