Skip to content

ENH: add bounds-checking preamble to groupby_X cython code #3031

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
6 commits merged into from Mar 17, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/v0.11.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,7 @@ Bug Fixes

- Fixed slow printing of large DataFrames, due to inefficient dtype
reporting (GH2807_)
- Fixed a segfault when using a function as grouper in groupby (GH3035_)
- Fix pretty-printing of infinite data structures (closes GH2978_)
- str.contains ignored na argument (GH2806_)

Expand All @@ -325,6 +326,7 @@ on GitHub for a complete list.
.. _GH2810: https://github.com/pydata/pandas/issues/2810
.. _GH2837: https://github.com/pydata/pandas/issues/2837
.. _GH2898: https://github.com/pydata/pandas/issues/2898
.. _GH3035: https://github.com/pydata/pandas/issues/3035
.. _GH2978: https://github.com/pydata/pandas/issues/2978
.. _GH2739: https://github.com/pydata/pandas/issues/2739
.. _GH2710: https://github.com/pydata/pandas/issues/2710
Expand Down
23 changes: 18 additions & 5 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ def _groupby_function(name, alias, npfunc, numeric_only=True,
def f(self):
try:
return self._cython_agg_general(alias, numeric_only=numeric_only)
except AssertionError as e:
raise SpecificationError(str(e))
except Exception:
result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
if _convert:
Expand Down Expand Up @@ -348,7 +350,7 @@ def mean(self):
"""
try:
return self._cython_agg_general('mean')
except DataError:
except GroupByError:
raise
except Exception: # pragma: no cover
f = lambda x: x.mean(axis=self.axis)
Expand All @@ -362,7 +364,7 @@ def median(self):
"""
try:
return self._cython_agg_general('median')
except DataError:
except GroupByError:
raise
except Exception: # pragma: no cover
f = lambda x: x.median(axis=self.axis)
Expand Down Expand Up @@ -462,7 +464,10 @@ def _cython_agg_general(self, how, numeric_only=True):
if numeric_only and not is_numeric:
continue

result, names = self.grouper.aggregate(obj.values, how)
try:
result, names = self.grouper.aggregate(obj.values, how)
except AssertionError as e:
raise GroupByError(str(e))
output[name] = result

if len(output) == 0:
Expand Down Expand Up @@ -1200,6 +1205,13 @@ def __init__(self, index, grouper=None, name=None, level=None,
# no level passed
if not isinstance(self.grouper, np.ndarray):
self.grouper = self.index.map(self.grouper)
if not (hasattr(self.grouper,"__len__") and \
len(self.grouper) == len(self.index)):
errmsg = "Grouper result violates len(labels) == len(data)\n"
errmsg += "result: %s" % com.pprint_thing(self.grouper)
self.grouper = None # Try for sanity
raise AssertionError(errmsg)


def __repr__(self):
return 'Grouping(%s)' % self.name
Expand Down Expand Up @@ -1718,9 +1730,10 @@ def _aggregate_multiple_funcs(self, arg):
grouper=self.grouper)
results.append(colg.aggregate(arg))
keys.append(col)
except (TypeError, DataError):
except (TypeError, DataError) :
pass

except SpecificationError:
raise
result = concat(results, keys=keys, axis=1)

return result
Expand Down
27 changes: 27 additions & 0 deletions pandas/src/generate_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -593,6 +593,9 @@ def groupby_%(name)s(ndarray[%(c_type)s] index, ndarray labels):

length = len(index)

if not length == len(labels):
raise AssertionError("len(index) != len(labels)")

for i in range(length):
key = util.get_value_1d(labels, i)

Expand Down Expand Up @@ -625,6 +628,9 @@ def group_last_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
ndarray[%(dest_type2)s, ndim=2] resx
ndarray[int64_t, ndim=2] nobs

if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")

nobs = np.zeros((<object> out).shape, dtype=np.int64)
resx = np.empty_like(out)

Expand Down Expand Up @@ -760,6 +766,9 @@ def group_nth_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
ndarray[%(dest_type2)s, ndim=2] resx
ndarray[int64_t, ndim=2] nobs

if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")

nobs = np.zeros((<object> out).shape, dtype=np.int64)
resx = np.empty_like(out)

Expand Down Expand Up @@ -802,6 +811,9 @@ def group_add_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
%(dest_type2)s val, count
ndarray[%(dest_type2)s, ndim=2] sumx, nobs

if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")

nobs = np.zeros_like(out)
sumx = np.zeros_like(out)

Expand Down Expand Up @@ -915,6 +927,9 @@ def group_prod_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
%(dest_type2)s val, count
ndarray[%(dest_type2)s, ndim=2] prodx, nobs

if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")

nobs = np.zeros_like(out)
prodx = np.ones_like(out)

Expand Down Expand Up @@ -1025,6 +1040,9 @@ def group_var_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
%(dest_type2)s val, ct
ndarray[%(dest_type2)s, ndim=2] nobs, sumx, sumxx

if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")

nobs = np.zeros_like(out)
sumx = np.zeros_like(out)
sumxx = np.zeros_like(out)
Expand Down Expand Up @@ -1220,6 +1238,9 @@ def group_max_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
%(dest_type2)s val, count
ndarray[%(dest_type2)s, ndim=2] maxx, nobs

if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")

nobs = np.zeros_like(out)

maxx = np.empty_like(out)
Expand Down Expand Up @@ -1342,6 +1363,9 @@ def group_min_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
%(dest_type2)s val, count
ndarray[%(dest_type2)s, ndim=2] minx, nobs

if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")

nobs = np.zeros_like(out)

minx = np.empty_like(out)
Expand Down Expand Up @@ -1399,6 +1423,9 @@ def group_mean_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
%(dest_type2)s val, count
ndarray[%(dest_type2)s, ndim=2] sumx, nobs

if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")

nobs = np.zeros_like(out)
sumx = np.zeros_like(out)

Expand Down
Loading