Skip to content

PERF: RangeIndex.value_counts/searchsorted/to_numpy #58376

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Apr 29, 2024
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,9 @@ Performance improvements
- Performance improvement in :meth:`RangeIndex.argmin` and :meth:`RangeIndex.argmax` (:issue:`57823`)
- Performance improvement in :meth:`RangeIndex.insert` returning a :class:`RangeIndex` instead of an :class:`Index` when the :class:`RangeIndex` is empty. (:issue:`57833`)
- Performance improvement in :meth:`RangeIndex.round` returning a :class:`RangeIndex` instead of an :class:`Index` when possible. (:issue:`57824`)
- Performance improvement in :meth:`RangeIndex.searchsorted` (:issue:`58376`)
- Performance improvement in :meth:`RangeIndex.to_numpy` when specifying an ``na_value`` (:issue:`58376`)
- Performance improvement in :meth:`RangeIndex.value_counts` (:issue:`58376`)
- Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of an :class:`Index` when possible. (:issue:`57651`, :issue:`57752`)
- Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of an :class:`Index` when possible. (:issue:`57647`, :issue:`57752`)
- Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of an :class:`Index` when possible. (:issue:`57445`, :issue:`57752`)
Expand Down
4 changes: 1 addition & 3 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,7 +547,6 @@ def array(self) -> ExtensionArray:
"""
raise AbstractMethodError(self)

@final
def to_numpy(
self,
dtype: npt.DTypeLike | None = None,
Expand Down Expand Up @@ -659,7 +658,7 @@ def to_numpy(
)

values = self._values
if fillna:
if fillna and self.hasnans:
if not can_hold_element(values, na_value):
# if we can't hold the na_value asarray either makes a copy or we
# error before modifying values. The asarray later on thus won't make
Expand Down Expand Up @@ -899,7 +898,6 @@ def _map_values(self, mapper, na_action=None):

return algorithms.map_array(arr, mapper, na_action=na_action)

@final
def value_counts(
self,
normalize: bool = False,
Expand Down
65 changes: 65 additions & 0 deletions pandas/core/indexes/range.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,13 @@
Dtype,
JoinHow,
NaPosition,
NumpySorter,
Self,
npt,
)

from pandas import Series

_empty_range = range(0)
_dtype_int64 = np.dtype(np.int64)

Expand Down Expand Up @@ -1359,3 +1363,64 @@ def take( # type: ignore[override]
taken += self.start

return self._shallow_copy(taken, name=self.name)

def value_counts(
    self,
    normalize: bool = False,
    sort: bool = True,
    ascending: bool = False,
    bins=None,
    dropna: bool = True,
) -> Series:
    """
    Return a Series of per-element counts without hashing the values.

    When ``bins`` is None every element is assigned a count of one
    (presumably because the values of a range are distinct), so the
    result is built directly from an array of ones.

    Parameters
    ----------
    normalize : bool, default False
        If True, return ``1 / len(self)`` for every element and name the
        result ``"proportion"``; otherwise the result is named ``"count"``.
    sort : bool, default True
        Only forwarded to the parent implementation when ``bins`` is given.
    ascending : bool, default False
        Only forwarded to the parent implementation when ``bins`` is given.
    bins : optional
        When not None, the whole call is delegated to the parent class.
    dropna : bool, default True
        Only forwarded to the parent implementation when ``bins`` is given.

    Returns
    -------
    Series
        Indexed by a copy of this index.
    """
    from pandas import Series

    if bins is None:
        size = len(self)
        counts = np.ones(size, dtype=np.int64)
        if normalize:
            # True division turns the integer ones into float proportions.
            return Series(counts / size, index=self.copy(), name="proportion")
        return Series(counts, index=self.copy(), name="count")

    # Binning needs the generic machinery; hand everything to the parent.
    return super().value_counts(
        normalize=normalize,
        sort=sort,
        ascending=ascending,
        bins=bins,
        dropna=dropna,
    )

def searchsorted(  # type: ignore[override]
    self,
    value,
    side: Literal["left", "right"] = "left",
    sorter: NumpySorter | None = None,
) -> npt.NDArray[np.intp] | np.intp:
    """
    Compute insertion positions arithmetically from start/step.

    The generic (parent) implementation is used instead when ``side`` is
    not ``"left"``/``"right"``, when a ``sorter`` is supplied, when
    ``value`` is not of a signed/unsigned integer dtype, or when it is an
    8-byte unsigned dtype that cannot be represented losslessly as int64.

    Parameters
    ----------
    value : scalar or array-like
        Value(s) to locate.
    side : {"left", "right"}, default "left"
        Which insertion point to return for values equal to an element.
    sorter : array-like, optional
        If given, the call is delegated to the parent implementation.

    Returns
    -------
    np.intp or np.ndarray of np.intp
        A scalar when ``value`` is scalar, otherwise an array.
    """
    if side not in {"left", "right"} or sorter is not None:
        return super().searchsorted(value=value, side=side, sorter=sorter)

    was_scalar = False
    if is_scalar(value):
        was_scalar = True
        array_value = np.array([value])
    else:
        array_value = np.asarray(value)
    if array_value.dtype.kind not in "iu":
        return super().searchsorted(value=value, side=side, sorter=sorter)
    if array_value.dtype.kind == "u" and array_value.dtype.itemsize >= 8:
        # uint64 may hold values beyond int64; leave those to the parent.
        return super().searchsorted(value=value, side=side, sorter=sorter)

    # Do the arithmetic in int64. In the caller's narrow or unsigned dtype,
    # ``array_value - start`` can wrap around (e.g. uint8 0 - 5 == 251) and
    # yield wrong positions, or raise for a negative ``start``.
    array_value = array_value.astype(np.int64, copy=False)

    if flip := (self.step < 0):
        # Work on the ascending mirror image of a descending range; the
        # opposite side is computed here and flipped back below.
        rng = self._range[::-1]
        start = rng.start
        step = rng.step
        shift = side == "right"
    else:
        start = self.start
        step = self.step
        shift = side == "left"
    result = (array_value - start - int(shift)) // step + 1
    if flip:
        result = len(self) - result
    # Values outside the range land at the ends: clamp into [0, len(self)].
    result = np.maximum(np.minimum(result, len(self)), 0)
    if was_scalar:
        return np.intp(result.item())
    return result.astype(np.intp, copy=False)
33 changes: 33 additions & 0 deletions pandas/tests/indexes/ranges/test_range.py
Original file line number Diff line number Diff line change
Expand Up @@ -874,3 +874,36 @@ def test_getitem_integers_return_index():
result = RangeIndex(0, 10, 2, name="foo")[[0, 1, -1]]
expected = Index([0, 2, 8], dtype="int64", name="foo")
tm.assert_index_equal(result, expected)


@pytest.mark.parametrize("normalize", [True, False])
@pytest.mark.parametrize(
    "rng",
    [
        range(3),
        range(0),
        range(0, 3, 2),
        range(3, -3, -2),
    ],
)
def test_value_counts(sort, dropna, ascending, normalize, rng):
    # RangeIndex.value_counts must agree with an Index built from the
    # same values, for every combination of options.
    options = {
        "normalize": normalize,
        "sort": sort,
        "ascending": ascending,
        "dropna": dropna,
    }
    expected = Index(list(rng), name="A").value_counts(**options)
    result = RangeIndex(rng, name="A").value_counts(**options)
    # The index type of the two results is deliberately not compared.
    tm.assert_series_equal(result, expected, check_index_type=False)


@pytest.mark.parametrize("side", ["left", "right"])
@pytest.mark.parametrize("value", [0, -5, 5, -3, np.array([-5, -3, 0, 5])])
def test_searchsorted(side, value):
    # RangeIndex.searchsorted must agree with an Index holding the same
    # values, for scalar and array inputs on both sides.
    range_index = RangeIndex(-3, 3, 2)
    materialized = Index(list(range_index))
    expected = materialized.searchsorted(value=value, side=side)
    result = range_index.searchsorted(value=value, side=side)
    if not isinstance(value, int):
        tm.assert_numpy_array_equal(result, expected)
        return
    # Scalar input gives a scalar position; plain equality is enough.
    assert result == expected