POC/REF: remove SeriesBinGrouper, SeriesGrouper #32083

Closed
247 changes: 0 additions & 247 deletions pandas/_libs/reduction.pyx
@@ -147,253 +147,6 @@ cdef class Reducer:
return result


cdef class _BaseGrouper:
cdef _check_dummy(self, dummy):
# both values and index must be an ndarray!

values = dummy.values
# GH 23683: datetimetz types are equivalent to datetime types here
if (dummy.dtype != self.arr.dtype
and values.dtype != self.arr.dtype):
raise ValueError('Dummy array must be same dtype')
if util.is_array(values) and not values.flags.contiguous:
# e.g. Categorical has no `flags` attribute
values = values.copy()
index = dummy.index.values
if not index.flags.contiguous:
index = index.copy()

return values, index

cdef inline _update_cached_objs(self, object cached_typ, object cached_ityp,
Slider islider, Slider vslider):
if cached_typ is None:
cached_ityp = self.ityp(islider.buf)
cached_typ = self.typ(vslider.buf, index=cached_ityp, name=self.name)
else:
# See the comment in indexes/base.py about _index_data.
# We need this for EA-backed indexes that have a reference
# to a 1-d ndarray like datetime / timedelta / period.
object.__setattr__(cached_ityp, '_index_data', islider.buf)
cached_ityp._engine.clear_mapping()
object.__setattr__(cached_typ._data._block, 'values', vslider.buf)
object.__setattr__(cached_typ, '_index', cached_ityp)
object.__setattr__(cached_typ, 'name', self.name)

return cached_typ, cached_ityp

cdef inline object _apply_to_group(self,
object cached_typ, object cached_ityp,
Slider islider, Slider vslider,
Py_ssize_t group_size, bint initialized):
"""
Call self.f on our new group, then update to the next group.
"""
cached_ityp._engine.clear_mapping()
res = self.f(cached_typ)
res = _extract_result(res)
if not initialized:
# On the first pass, we check the output shape to see
# if this looks like a reduction.
initialized = 1
_check_result_array(res, len(self.dummy_arr))

islider.advance(group_size)
vslider.advance(group_size)

return res, initialized


cdef class SeriesBinGrouper(_BaseGrouper):
"""
Performs grouping operation according to bin edges, rather than labels
"""
cdef:
Py_ssize_t nresults, ngroups

cdef public:
ndarray arr, index, dummy_arr, dummy_index
object values, f, bins, typ, ityp, name

def __init__(self, object series, object f, object bins, object dummy):

assert dummy is not None # always obj[:0]
assert len(bins) > 0 # otherwise we get IndexError in get_result

self.bins = bins
self.f = f

values = series.values
if util.is_array(values) and not values.flags.c_contiguous:
# e.g. Categorical has no `flags` attribute
values = values.copy('C')
self.arr = values
self.typ = series._constructor
self.ityp = series.index._constructor
self.index = series.index.values
self.name = series.name

self.dummy_arr, self.dummy_index = self._check_dummy(dummy)

# kludge for #1688
if len(bins) > 0 and bins[-1] == len(series):
self.ngroups = len(bins)
else:
self.ngroups = len(bins) + 1

def get_result(self):
cdef:
ndarray arr, result
ndarray[int64_t] counts
Py_ssize_t i, n, group_size
object res
bint initialized = 0
Slider vslider, islider
object cached_typ = None, cached_ityp = None

counts = np.zeros(self.ngroups, dtype=np.int64)

if self.ngroups > 0:
counts[0] = self.bins[0]
for i in range(1, self.ngroups):
if i == self.ngroups - 1:
counts[i] = len(self.arr) - self.bins[i - 1]
else:
counts[i] = self.bins[i] - self.bins[i - 1]

group_size = 0
n = len(self.arr)

vslider = Slider(self.arr, self.dummy_arr)
islider = Slider(self.index, self.dummy_index)

result = np.empty(self.ngroups, dtype='O')

try:
for i in range(self.ngroups):
group_size = counts[i]

islider.set_length(group_size)
vslider.set_length(group_size)

cached_typ, cached_ityp = self._update_cached_objs(
cached_typ, cached_ityp, islider, vslider)

res, initialized = self._apply_to_group(cached_typ, cached_ityp,
islider, vslider,
group_size, initialized)

result[i] = res

finally:
# so we don't free the wrong memory
islider.reset()
vslider.reset()

result = maybe_convert_objects(result)
return result, counts


cdef class SeriesGrouper(_BaseGrouper):
"""
Performs generic grouping operation while avoiding ndarray construction
overhead
"""
cdef:
Py_ssize_t nresults, ngroups

cdef public:
ndarray arr, index, dummy_arr, dummy_index
object f, labels, values, typ, ityp, name

def __init__(self, object series, object f, object labels,
Py_ssize_t ngroups, object dummy):

# in practice we always pass obj.iloc[:0] or equivalent
assert dummy is not None

if len(series) == 0:
# get_result would never assign `result`
raise ValueError("SeriesGrouper requires non-empty `series`")

self.labels = labels
self.f = f

values = series.values
if util.is_array(values) and not values.flags.c_contiguous:
# e.g. Categorical has no `flags` attribute
values = values.copy('C')
self.arr = values
self.typ = series._constructor
self.ityp = series.index._constructor
self.index = series.index.values
self.name = series.name

self.dummy_arr, self.dummy_index = self._check_dummy(dummy)
self.ngroups = ngroups

def get_result(self):
cdef:
# Define result to avoid UnboundLocalError
ndarray arr, result = None
ndarray[int64_t] labels, counts
Py_ssize_t i, n, group_size, lab
object res
bint initialized = 0
Slider vslider, islider
object cached_typ = None, cached_ityp = None

labels = self.labels
counts = np.zeros(self.ngroups, dtype=np.int64)
group_size = 0
n = len(self.arr)

vslider = Slider(self.arr, self.dummy_arr)
islider = Slider(self.index, self.dummy_index)

result = np.empty(self.ngroups, dtype='O')

try:
for i in range(n):
group_size += 1

lab = labels[i]

if i == n - 1 or lab != labels[i + 1]:
if lab == -1:
islider.advance(group_size)
vslider.advance(group_size)
group_size = 0
continue

islider.set_length(group_size)
vslider.set_length(group_size)

cached_typ, cached_ityp = self._update_cached_objs(
cached_typ, cached_ityp, islider, vslider)

res, initialized = self._apply_to_group(cached_typ, cached_ityp,
islider, vslider,
group_size, initialized)

result[lab] = res
counts[lab] = group_size
group_size = 0

finally:
# so we don't free the wrong memory
islider.reset()
vslider.reset()

# We check for empty series in the constructor, so should always
# have result initialized by this point.
assert initialized, "`result` has not been initialized."

result = maybe_convert_objects(result)

return result, counts


cdef inline _extract_result(object res, bint squeeze=True):
    """ extract the result object, it might be a 0-dim ndarray,
    a len-1 ndarray, or a scalar """
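For orientation, the removed SeriesGrouper.get_result loop walks labels that are assumed pre-sorted, so each group occupies one contiguous run; label -1 marks rows to skip, and the Slider pair reuses one buffer per run instead of allocating new objects. A rough pure-Python sketch of that control flow (naive_series_grouper is an illustrative name, not the Cython API, and it slices instead of sliding a buffer):

import numpy as np
import pandas as pd


def naive_series_grouper(obj: pd.Series, func, labels: np.ndarray, ngroups: int):
    # Labels are assumed sorted, so each group is one contiguous run.
    result = np.empty(ngroups, dtype=object)
    counts = np.zeros(ngroups, dtype=np.int64)
    start = 0
    n = len(obj)
    for i in range(n):
        lab = labels[i]
        if i == n - 1 or lab != labels[i + 1]:  # end of a run
            if lab != -1:  # -1 rows are skipped, mirroring the Cython loop
                result[lab] = func(obj.iloc[start : i + 1])
                counts[lab] = i + 1 - start
            start = i + 1
    return result, counts

With labels [-1, -1, -1, 0, 0, 0, 1, 1, 1, 1] and func=np.mean this reproduces the expectations of the removed test_series_grouper below.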
11 changes: 2 additions & 9 deletions pandas/core/groupby/groupby.py
@@ -923,17 +923,10 @@ def _python_agg_general(self, func, *args, **kwargs):

        try:
            # if this function is invalid for this dtype, we will ignore it.
-           func(obj[:0])
+           result, counts = self.grouper.agg_series(obj, f)
        except TypeError:
            continue
-       except AssertionError:
-           raise
-       except Exception:
-           # Our function depends on having a non-empty argument
-           # See test_groupby_agg_err_catching
-           pass
-
-       result, counts = self.grouper.agg_series(obj, f)
+
        assert result is not None
        key = base.OutputKey(label=name, position=idx)
        output[key] = self._try_cast(result, obj, numeric_only=True)
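The groupby.py change above drops the empty-slice probe: rather than calling func(obj[:0]) to detect dtype-incompatible functions up front, the caller now invokes agg_series directly and interprets a TypeError as "this function is invalid for this dtype". A condensed, hypothetical rendering of the resulting loop (column iteration and output handling simplified away from the real _python_agg_general):

def python_agg_general(columns, grouper, f):
    # columns: iterable of (name, Series); grouper: anything with agg_series.
    output = {}
    for name, obj in columns:
        try:
            # agg_series itself raises TypeError for invalid dtypes now,
            # so no separate func(obj[:0]) probe is needed.
            result, _counts = grouper.agg_series(obj, f)
        except TypeError:
            continue  # skip columns the function cannot handle
        assert result is not None
        output[name] = result
    return output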
50 changes: 1 addition & 49 deletions pandas/core/groupby/ops.py
@@ -621,50 +621,8 @@ def agg_series(self, obj: Series, func):
# Caller is responsible for checking ngroups != 0
assert self.ngroups != 0

if len(obj) == 0:
# SeriesGrouper would raise if we were to call _aggregate_series_fast
return self._aggregate_series_pure_python(obj, func)

elif is_extension_array_dtype(obj.dtype):
# _aggregate_series_fast would raise TypeError when
# calling libreduction.Slider
# In the datetime64tz case it would incorrectly cast to tz-naive
# TODO: can we get a performant workaround for EAs backed by ndarray?
return self._aggregate_series_pure_python(obj, func)

elif obj.index._has_complex_internals:
# Pre-empt TypeError in _aggregate_series_fast
return self._aggregate_series_pure_python(obj, func)

try:
return self._aggregate_series_fast(obj, func)
except ValueError as err:
if "Function does not reduce" in str(err):
# raised in libreduction
pass
else:
raise
return self._aggregate_series_pure_python(obj, func)

def _aggregate_series_fast(self, obj: Series, func):
# At this point we have already checked that
# - obj.index is not a MultiIndex
# - obj is backed by an ndarray, not ExtensionArray
# - len(obj) > 0
# - ngroups != 0
func = self._is_builtin_func(func)

group_index, _, ngroups = self.group_info

# avoids object / Series creation overhead
dummy = obj.iloc[:0]
indexer = get_group_index_sorter(group_index, ngroups)
obj = obj.take(indexer)
group_index = algorithms.take_nd(group_index, indexer, allow_fill=False)
grouper = libreduction.SeriesGrouper(obj, func, group_index, ngroups, dummy)
result, counts = grouper.get_result()
return result, counts

def _aggregate_series_pure_python(self, obj: Series, func):

group_index, _, ngroups = self.group_info
@@ -856,13 +814,7 @@ def agg_series(self, obj: Series, func):
assert self.ngroups != 0
assert len(self.bins) > 0 # otherwise we'd get IndexError in get_result

-       if is_extension_array_dtype(obj.dtype):
-           # pre-empt SeriesBinGrouper from raising TypeError
-           return self._aggregate_series_pure_python(obj, func)
-
-       dummy = obj[:0]
-       grouper = libreduction.SeriesBinGrouper(obj, func, self.bins, dummy)
-       return grouper.get_result()
+       return self._aggregate_series_pure_python(obj, func)


def _is_indexed_like(obj, axes) -> bool:
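With both fast paths gone, BaseGrouper.agg_series and BinGrouper.agg_series funnel every case into _aggregate_series_pure_python. Its body lies mostly outside this hunk, but the slow path amounts to applying func to each group of the Series one at a time; a minimal standalone approximation (a sketch of the idea, not the pandas internals):

import numpy as np
import pandas as pd


def aggregate_series_pure_python(obj: pd.Series, func, group_index: np.ndarray, ngroups: int):
    # Apply func group by group; no sliding buffers, just boolean masks.
    result = np.empty(ngroups, dtype=object)
    counts = np.zeros(ngroups, dtype=np.int64)
    for lab in range(ngroups):
        mask = group_index == lab
        counts[lab] = mask.sum()
        result[lab] = func(obj[mask])
    return result, counts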
42 changes: 0 additions & 42 deletions pandas/tests/groupby/test_bin_groupby.py
@@ -9,48 +9,6 @@
import pandas._testing as tm


def test_series_grouper():
obj = Series(np.random.randn(10))
dummy = obj.iloc[:0]

labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64)

grouper = libreduction.SeriesGrouper(obj, np.mean, labels, 2, dummy)
result, counts = grouper.get_result()

expected = np.array([obj[3:6].mean(), obj[6:].mean()])
tm.assert_almost_equal(result, expected)

exp_counts = np.array([3, 4], dtype=np.int64)
tm.assert_almost_equal(counts, exp_counts)


def test_series_grouper_requires_nonempty_raises():
# GH#29500
obj = Series(np.random.randn(10))
dummy = obj.iloc[:0]
labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64)

with pytest.raises(ValueError, match="SeriesGrouper requires non-empty `series`"):
libreduction.SeriesGrouper(dummy, np.mean, labels, 2, dummy)


def test_series_bin_grouper():
obj = Series(np.random.randn(10))
dummy = obj[:0]

bins = np.array([3, 6])

grouper = libreduction.SeriesBinGrouper(obj, np.mean, bins, dummy)
result, counts = grouper.get_result()

expected = np.array([obj[:3].mean(), obj[3:6].mean(), obj[6:].mean()])
tm.assert_almost_equal(result, expected)

exp_counts = np.array([3, 3, 4], dtype=np.int64)
tm.assert_almost_equal(counts, exp_counts)


@pytest.mark.parametrize(
"binner,closed,expected",
[
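The deleted tests drove the Cython classes directly; the same observable behavior survives through the public groupby API. An illustrative equivalent of test_series_grouper (not a test added by this PR; note that groupby drops NaN keys, so the internal "ignore" label -1 is mapped to NaN first):

import numpy as np
import pandas as pd
import pandas._testing as tm

obj = pd.Series(np.random.randn(10))
labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64)

keys = np.where(labels == -1, np.nan, labels)  # -1 -> NaN so groupby skips it
result = obj.groupby(keys).mean()

expected = np.array([obj[3:6].mean(), obj[6:].mean()])
tm.assert_almost_equal(result.values, expected)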