pandas-dev · Mar 17, 2013 · Mar 12, 2013 · Mar 12, 2013 · Mar 13, 2013 · Mar 13, 2013
diff --git a/doc/source/v0.11.0.txt b/doc/source/v0.11.0.txt
@@ -314,6 +314,7 @@ Bug Fixes
 
   - Fixed slow printing of large Dataframes, due to inefficient dtype
     reporting (GH2807_)
+  - Fixed a segfault when using a function as grouper in groupby (GH3035_)
   - Fix pretty-printing of infinite data structures (closes GH2978_)
   - str.contains ignored na argument (GH2806_)
 
@@ -325,6 +326,7 @@ on GitHub for a complete list.
 .. _GH2810: https://github.com/pydata/pandas/issues/2810
 .. _GH2837: https://github.com/pydata/pandas/issues/2837
 .. _GH2898: https://github.com/pydata/pandas/issues/2898
+.. _GH3035: https://github.com/pydata/pandas/issues/3035
 .. _GH2978: https://github.com/pydata/pandas/issues/2978
 .. _GH2739: https://github.com/pydata/pandas/issues/2739
 .. _GH2710: https://github.com/pydata/pandas/issues/2710

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -57,6 +57,8 @@ def _groupby_function(name, alias, npfunc, numeric_only=True,
     def f(self):
         try:
             return self._cython_agg_general(alias, numeric_only=numeric_only)
+        except AssertionError as e:
+            raise SpecificationError(str(e))
         except Exception:
             result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
             if _convert:
@@ -348,7 +350,7 @@ def mean(self):
         """
         try:
             return self._cython_agg_general('mean')
-        except DataError:
+        except GroupByError:
             raise
         except Exception:  # pragma: no cover
             f = lambda x: x.mean(axis=self.axis)
@@ -362,7 +364,7 @@ def median(self):
         """
         try:
             return self._cython_agg_general('median')
-        except DataError:
+        except GroupByError:
             raise
         except Exception:  # pragma: no cover
             f = lambda x: x.median(axis=self.axis)
@@ -462,7 +464,10 @@ def _cython_agg_general(self, how, numeric_only=True):
             if numeric_only and not is_numeric:
                 continue
 
-            result, names = self.grouper.aggregate(obj.values, how)
+            try:
+                result, names = self.grouper.aggregate(obj.values, how)
+            except AssertionError as e:
+                raise GroupByError(str(e))
             output[name] = result
 
         if len(output) == 0:
@@ -1200,6 +1205,13 @@ def __init__(self, index, grouper=None, name=None, level=None,
             # no level passed
             if not isinstance(self.grouper, np.ndarray):
                 self.grouper = self.index.map(self.grouper)
+                if not (hasattr(self.grouper,"__len__") and \
+                   len(self.grouper) == len(self.index)):
+                    errmsg = "Grouper result violates len(labels) == len(data)\n"
+                    errmsg += "result: %s" % com.pprint_thing(self.grouper)
+                    self.grouper = None # discard the wrong-length map result before raising
+                    raise AssertionError(errmsg)
+
 
     def __repr__(self):
         return 'Grouping(%s)' % self.name
@@ -1718,9 +1730,10 @@ def _aggregate_multiple_funcs(self, arg):
                                      grouper=self.grouper)
                 results.append(colg.aggregate(arg))
                 keys.append(col)
-            except (TypeError, DataError):
+            except (TypeError, DataError) :
                 pass
-
+            except SpecificationError:
+                raise
         result = concat(results, keys=keys, axis=1)
 
         return result

diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py
@@ -593,6 +593,9 @@ def groupby_%(name)s(ndarray[%(c_type)s] index, ndarray labels):
 
     length = len(index)
 
+    if not length == len(labels):
+       raise AssertionError("len(index) != len(labels)")
+
     for i in range(length):
         key = util.get_value_1d(labels, i)
 
@@ -625,6 +628,9 @@ def group_last_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
         ndarray[%(dest_type2)s, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
 
+    if not len(values) == len(labels):
+       raise AssertionError("len(index) != len(labels)")
+
     nobs = np.zeros((<object> out).shape, dtype=np.int64)
     resx = np.empty_like(out)
 
@@ -760,6 +766,9 @@ def group_nth_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
         ndarray[%(dest_type2)s, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
 
+    if not len(values) == len(labels):
+       raise AssertionError("len(index) != len(labels)")
+
     nobs = np.zeros((<object> out).shape, dtype=np.int64)
     resx = np.empty_like(out)
 
@@ -802,6 +811,9 @@ def group_add_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
         %(dest_type2)s val, count
         ndarray[%(dest_type2)s, ndim=2] sumx, nobs
 
+    if not len(values) == len(labels):
+       raise AssertionError("len(index) != len(labels)")
+
     nobs = np.zeros_like(out)
     sumx = np.zeros_like(out)
 
@@ -915,6 +927,9 @@ def group_prod_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
         %(dest_type2)s val, count
         ndarray[%(dest_type2)s, ndim=2] prodx, nobs
 
+    if not len(values) == len(labels):
+       raise AssertionError("len(index) != len(labels)")
+
     nobs = np.zeros_like(out)
     prodx = np.ones_like(out)
 
@@ -1025,6 +1040,9 @@ def group_var_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
         %(dest_type2)s val, ct
         ndarray[%(dest_type2)s, ndim=2] nobs, sumx, sumxx
 
+    if not len(values) == len(labels):
+       raise AssertionError("len(index) != len(labels)")
+
     nobs = np.zeros_like(out)
     sumx = np.zeros_like(out)
     sumxx = np.zeros_like(out)
@@ -1220,6 +1238,9 @@ def group_max_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
         %(dest_type2)s val, count
         ndarray[%(dest_type2)s, ndim=2] maxx, nobs
 
+    if not len(values) == len(labels):
+       raise AssertionError("len(index) != len(labels)")
+
     nobs = np.zeros_like(out)
 
     maxx = np.empty_like(out)
@@ -1342,6 +1363,9 @@ def group_min_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
         %(dest_type2)s val, count
         ndarray[%(dest_type2)s, ndim=2] minx, nobs
 
+    if not len(values) == len(labels):
+       raise AssertionError("len(index) != len(labels)")
+
     nobs = np.zeros_like(out)
 
     minx = np.empty_like(out)
@@ -1399,6 +1423,9 @@ def group_mean_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
         %(dest_type2)s val, count
         ndarray[%(dest_type2)s, ndim=2] sumx, nobs
 
+    if not len(values) == len(labels):
+       raise AssertionError("len(index) != len(labels)")
+
     nobs = np.zeros_like(out)
     sumx = np.zeros_like(out)