diff --git a/doc/source/v0.11.0.txt b/doc/source/v0.11.0.txt
index 60ec7de5c4d8e..fdf25a376f68f 100644
--- a/doc/source/v0.11.0.txt
+++ b/doc/source/v0.11.0.txt
@@ -314,6 +314,7 @@ Bug Fixes
 
   - Fixed slow printing of large Dataframes, due to inefficient dtype
     reporting (GH2807_)
+  - Fixed a segfault when using a function as grouper in groupby (GH3035_)
   - Fix pretty-printing of infinite data structures (closes GH2978_)
   - str.contains ignored na argument (GH2806_)
 
@@ -325,6 +326,7 @@ on GitHub for a complete list.
 .. _GH2810: https://github.com/pydata/pandas/issues/2810
 .. _GH2837: https://github.com/pydata/pandas/issues/2837
 .. _GH2898: https://github.com/pydata/pandas/issues/2898
+.. _GH3035: https://github.com/pydata/pandas/issues/3035
 .. _GH2978: https://github.com/pydata/pandas/issues/2978
 .. _GH2739: https://github.com/pydata/pandas/issues/2739
 .. _GH2710: https://github.com/pydata/pandas/issues/2710
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 3f12f773db96a..7e20ec95fd763 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -57,6 +57,8 @@ def _groupby_function(name, alias, npfunc, numeric_only=True,
     def f(self):
         try:
             return self._cython_agg_general(alias, numeric_only=numeric_only)
+        except AssertionError as e:
+            raise SpecificationError(str(e))
         except Exception:
             result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
             if _convert:
@@ -348,7 +350,7 @@ def mean(self):
         """
         try:
             return self._cython_agg_general('mean')
-        except DataError:
+        except GroupByError:
             raise
         except Exception:  # pragma: no cover
             f = lambda x: x.mean(axis=self.axis)
@@ -362,7 +364,7 @@ def median(self):
         """
         try:
             return self._cython_agg_general('median')
-        except DataError:
+        except GroupByError:
             raise
         except Exception:  # pragma: no cover
             f = lambda x: x.median(axis=self.axis)
@@ -462,7 +464,10 @@ def _cython_agg_general(self, how, numeric_only=True):
             if numeric_only and not is_numeric:
                 continue
 
-            result, names = self.grouper.aggregate(obj.values, how)
+            try:
+                result, names = self.grouper.aggregate(obj.values, how)
+            except AssertionError as e:
+                raise GroupByError(str(e))
             output[name] = result
 
         if len(output) == 0:
@@ -1200,6 +1205,13 @@ def __init__(self, index, grouper=None, name=None, level=None,
             # no level passed
             if not isinstance(self.grouper, np.ndarray):
                 self.grouper = self.index.map(self.grouper)
+                if not (hasattr(self.grouper,"__len__") and \
+                        len(self.grouper) == len(self.index)):
+                    errmsg = "Grouper result violates len(labels) == len(data)\n"
+                    errmsg += "result: %s" % com.pprint_thing(self.grouper)
+                    self.grouper = None  # Try for sanity
+                    raise AssertionError(errmsg)
+
 
     def __repr__(self):
         return 'Grouping(%s)' % self.name
@@ -1718,9 +1730,10 @@ def _aggregate_multiple_funcs(self, arg):
                                      grouper=self.grouper)
                 results.append(colg.aggregate(arg))
                 keys.append(col)
-            except (TypeError, DataError):
+            except (TypeError, DataError) :
                 pass
-
+            except SpecificationError:
+                raise
         result = concat(results, keys=keys, axis=1)
 
         return result
diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py
index c94ed8730f32a..fa9e21e16f57f 100644
--- a/pandas/src/generate_code.py
+++ b/pandas/src/generate_code.py
@@ -593,6 +593,9 @@ def groupby_%(name)s(ndarray[%(c_type)s] index, ndarray labels):
 
     length = len(index)
 
+    if not length == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
     for i in range(length):
         key = util.get_value_1d(labels, i)
 
@@ -625,6 +628,9 @@ def
group_last_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, ndarray[%(dest_type2)s, ndim=2] resx ndarray[int64_t, ndim=2] nobs + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + nobs = np.zeros(( out).shape, dtype=np.int64) resx = np.empty_like(out) @@ -760,6 +766,9 @@ def group_nth_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, ndarray[%(dest_type2)s, ndim=2] resx ndarray[int64_t, ndim=2] nobs + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + nobs = np.zeros(( out).shape, dtype=np.int64) resx = np.empty_like(out) @@ -802,6 +811,9 @@ def group_add_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, %(dest_type2)s val, count ndarray[%(dest_type2)s, ndim=2] sumx, nobs + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + nobs = np.zeros_like(out) sumx = np.zeros_like(out) @@ -915,6 +927,9 @@ def group_prod_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, %(dest_type2)s val, count ndarray[%(dest_type2)s, ndim=2] prodx, nobs + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + nobs = np.zeros_like(out) prodx = np.ones_like(out) @@ -1025,6 +1040,9 @@ def group_var_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, %(dest_type2)s val, ct ndarray[%(dest_type2)s, ndim=2] nobs, sumx, sumxx + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + nobs = np.zeros_like(out) sumx = np.zeros_like(out) sumxx = np.zeros_like(out) @@ -1220,6 +1238,9 @@ def group_max_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, %(dest_type2)s val, count ndarray[%(dest_type2)s, ndim=2] maxx, nobs + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + nobs = np.zeros_like(out) maxx = np.empty_like(out) @@ -1342,6 +1363,9 @@ def group_min_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, %(dest_type2)s val, count ndarray[%(dest_type2)s, ndim=2] minx, nobs + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + nobs = np.zeros_like(out) minx = np.empty_like(out) @@ -1399,6 +1423,9 @@ def group_mean_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, %(dest_type2)s val, count ndarray[%(dest_type2)s, ndim=2] sumx, nobs + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + nobs = np.zeros_like(out) sumx = np.zeros_like(out) diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx index ce83e08782ea2..11a610375830b 100644 --- a/pandas/src/generated.pyx +++ b/pandas/src/generated.pyx @@ -1967,6 +1967,9 @@ def groupby_float64(ndarray[float64_t] index, ndarray labels): length = len(index) + if not length == len(labels): + raise AssertionError("len(index) != len(labels)") + for i in range(length): key = util.get_value_1d(labels, i) @@ -1992,6 +1995,9 @@ def groupby_float32(ndarray[float32_t] index, ndarray labels): length = len(index) + if not length == len(labels): + raise AssertionError("len(index) != len(labels)") + for i in range(length): key = util.get_value_1d(labels, i) @@ -2017,6 +2023,9 @@ def groupby_object(ndarray[object] index, ndarray labels): length = len(index) + if not length == len(labels): + raise AssertionError("len(index) != len(labels)") + for i in range(length): key = util.get_value_1d(labels, i) @@ -2042,6 +2051,9 @@ def groupby_int32(ndarray[int32_t] index, ndarray labels): length = len(index) + if not length == len(labels): + raise AssertionError("len(index) != len(labels)") + for i in range(length): key = util.get_value_1d(labels, i) @@ 
-2067,6 +2079,9 @@ def groupby_int64(ndarray[int64_t] index, ndarray labels): length = len(index) + if not length == len(labels): + raise AssertionError("len(index) != len(labels)") + for i in range(length): key = util.get_value_1d(labels, i) @@ -2092,6 +2107,9 @@ def groupby_bool(ndarray[uint8_t] index, ndarray labels): length = len(index) + if not length == len(labels): + raise AssertionError("len(index) != len(labels)") + for i in range(length): key = util.get_value_1d(labels, i) @@ -3334,7 +3352,7 @@ def take_2d_axis1_bool_bool(ndarray[uint8_t, ndim=2] values, n = len(values) k = len(indexer) - + fv = fill_value IF True: @@ -3374,7 +3392,7 @@ def take_2d_axis1_bool_object(ndarray[uint8_t, ndim=2] values, n = len(values) k = len(indexer) - + fv = fill_value IF False: @@ -3414,7 +3432,7 @@ def take_2d_axis1_int8_int8(ndarray[int8_t, ndim=2] values, n = len(values) k = len(indexer) - + fv = fill_value IF True: @@ -3454,7 +3472,7 @@ def take_2d_axis1_int8_int32(ndarray[int8_t, ndim=2] values, n = len(values) k = len(indexer) - + fv = fill_value IF False: @@ -3494,7 +3512,7 @@ def take_2d_axis1_int8_int64(ndarray[int8_t, ndim=2] values, n = len(values) k = len(indexer) - + fv = fill_value IF False: @@ -3534,7 +3552,7 @@ def take_2d_axis1_int8_float64(ndarray[int8_t, ndim=2] values, n = len(values) k = len(indexer) - + fv = fill_value IF False: @@ -3574,7 +3592,7 @@ def take_2d_axis1_int16_int16(ndarray[int16_t, ndim=2] values, n = len(values) k = len(indexer) - + fv = fill_value IF True: @@ -3614,7 +3632,7 @@ def take_2d_axis1_int16_int32(ndarray[int16_t, ndim=2] values, n = len(values) k = len(indexer) - + fv = fill_value IF False: @@ -3654,7 +3672,7 @@ def take_2d_axis1_int16_int64(ndarray[int16_t, ndim=2] values, n = len(values) k = len(indexer) - + fv = fill_value IF False: @@ -3694,7 +3712,7 @@ def take_2d_axis1_int16_float64(ndarray[int16_t, ndim=2] values, n = len(values) k = len(indexer) - + fv = fill_value IF False: @@ -3734,7 +3752,7 @@ def take_2d_axis1_int32_int32(ndarray[int32_t, ndim=2] values, n = len(values) k = len(indexer) - + fv = fill_value IF True: @@ -3774,7 +3792,7 @@ def take_2d_axis1_int32_int64(ndarray[int32_t, ndim=2] values, n = len(values) k = len(indexer) - + fv = fill_value IF False: @@ -3814,7 +3832,7 @@ def take_2d_axis1_int32_float64(ndarray[int32_t, ndim=2] values, n = len(values) k = len(indexer) - + fv = fill_value IF False: @@ -3854,7 +3872,7 @@ def take_2d_axis1_int64_int64(ndarray[int64_t, ndim=2] values, n = len(values) k = len(indexer) - + fv = fill_value IF True: @@ -3894,7 +3912,7 @@ def take_2d_axis1_int64_float64(ndarray[int64_t, ndim=2] values, n = len(values) k = len(indexer) - + fv = fill_value IF False: @@ -3934,7 +3952,7 @@ def take_2d_axis1_float32_float32(ndarray[float32_t, ndim=2] values, n = len(values) k = len(indexer) - + fv = fill_value IF True: @@ -3974,7 +3992,7 @@ def take_2d_axis1_float32_float64(ndarray[float32_t, ndim=2] values, n = len(values) k = len(indexer) - + fv = fill_value IF False: @@ -4014,7 +4032,7 @@ def take_2d_axis1_float64_float64(ndarray[float64_t, ndim=2] values, n = len(values) k = len(indexer) - + fv = fill_value IF True: @@ -4054,7 +4072,7 @@ def take_2d_axis1_object_object(ndarray[object, ndim=2] values, n = len(values) k = len(indexer) - + fv = fill_value IF False: @@ -4890,6 +4908,9 @@ def group_last_float64(ndarray[float64_t, ndim=2] out, ndarray[float64_t, ndim=2] resx ndarray[int64_t, ndim=2] nobs + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + nobs = 
np.zeros(( out).shape, dtype=np.int64) resx = np.empty_like(out) @@ -4930,6 +4951,9 @@ def group_last_float32(ndarray[float32_t, ndim=2] out, ndarray[float32_t, ndim=2] resx ndarray[int64_t, ndim=2] nobs + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + nobs = np.zeros(( out).shape, dtype=np.int64) resx = np.empty_like(out) @@ -5060,6 +5084,9 @@ def group_nth_float64(ndarray[float64_t, ndim=2] out, ndarray[float64_t, ndim=2] resx ndarray[int64_t, ndim=2] nobs + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + nobs = np.zeros(( out).shape, dtype=np.int64) resx = np.empty_like(out) @@ -5101,6 +5128,9 @@ def group_nth_float32(ndarray[float32_t, ndim=2] out, ndarray[float32_t, ndim=2] resx ndarray[int64_t, ndim=2] nobs + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + nobs = np.zeros(( out).shape, dtype=np.int64) resx = np.empty_like(out) @@ -5233,6 +5263,9 @@ def group_add_float64(ndarray[float64_t, ndim=2] out, float64_t val, count ndarray[float64_t, ndim=2] sumx, nobs + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + nobs = np.zeros_like(out) sumx = np.zeros_like(out) @@ -5286,6 +5319,9 @@ def group_add_float32(ndarray[float32_t, ndim=2] out, float32_t val, count ndarray[float32_t, ndim=2] sumx, nobs + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + nobs = np.zeros_like(out) sumx = np.zeros_like(out) @@ -5453,6 +5489,9 @@ def group_prod_float64(ndarray[float64_t, ndim=2] out, float64_t val, count ndarray[float64_t, ndim=2] prodx, nobs + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + nobs = np.zeros_like(out) prodx = np.ones_like(out) @@ -5506,6 +5545,9 @@ def group_prod_float32(ndarray[float32_t, ndim=2] out, float32_t val, count ndarray[float32_t, ndim=2] prodx, nobs + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + nobs = np.zeros_like(out) prodx = np.ones_like(out) @@ -5670,6 +5712,9 @@ def group_var_float64(ndarray[float64_t, ndim=2] out, float64_t val, ct ndarray[float64_t, ndim=2] nobs, sumx, sumxx + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + nobs = np.zeros_like(out) sumx = np.zeros_like(out) sumxx = np.zeros_like(out) @@ -5728,6 +5773,9 @@ def group_var_float32(ndarray[float32_t, ndim=2] out, float32_t val, ct ndarray[float32_t, ndim=2] nobs, sumx, sumxx + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + nobs = np.zeros_like(out) sumx = np.zeros_like(out) sumxx = np.zeros_like(out) @@ -5910,6 +5958,9 @@ def group_mean_float64(ndarray[float64_t, ndim=2] out, float64_t val, count ndarray[float64_t, ndim=2] sumx, nobs + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + nobs = np.zeros_like(out) sumx = np.zeros_like(out) @@ -5959,6 +6010,9 @@ def group_mean_float32(ndarray[float32_t, ndim=2] out, float32_t val, count ndarray[float32_t, ndim=2] sumx, nobs + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + nobs = np.zeros_like(out) sumx = np.zeros_like(out) @@ -6119,6 +6173,9 @@ def group_min_float64(ndarray[float64_t, ndim=2] out, float64_t val, count ndarray[float64_t, ndim=2] minx, nobs + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + nobs = np.zeros_like(out) minx = np.empty_like(out) @@ -6176,6 
+6233,9 @@ def group_min_float32(ndarray[float32_t, ndim=2] out,
         float32_t val, count
         ndarray[float32_t, ndim=2] minx, nobs
 
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
     nobs = np.zeros_like(out)
     minx = np.empty_like(out)
@@ -6357,6 +6417,9 @@ def group_max_float64(ndarray[float64_t, ndim=2] out,
         float64_t val, count
         ndarray[float64_t, ndim=2] maxx, nobs
 
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
     nobs = np.zeros_like(out)
     maxx = np.empty_like(out)
@@ -6414,6 +6477,9 @@ def group_max_float32(ndarray[float32_t, ndim=2] out,
         float32_t val, count
         ndarray[float32_t, ndim=2] maxx, nobs
 
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
     nobs = np.zeros_like(out)
     maxx = np.empty_like(out)
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index 4b1770dd4f5df..0e5130ea34674 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -218,6 +218,30 @@ def test_groupby_dict_mapping(self):
         assert_series_equal(result, result2)
         assert_series_equal(result, expected2)
 
+    def test_groupby_bounds_check(self):
+        import pandas as pd
+        # groupby_X is code-generated, so if one variant does the length
+        # check, the rest probably do too
+        a = np.array([1, 2], dtype='object')
+        b = np.array([1, 2, 3], dtype='object')
+        self.assertRaises(AssertionError, pd.algos.groupby_object, a, b)
+
+    def test_groupby_grouper_f_sanity_checked(self):
+        import pandas as pd
+        dates = pd.date_range('01-Jan-2013', periods=12, freq='MS')
+        ts = pd.TimeSeries(np.random.randn(12), index=dates)
+
+        # GH3035
+        # index.map is used to apply the grouper to the index;
+        # if it fails on the elements, map tries it on the entire index as
+        # a sequence. That can yield invalid results that cause trouble
+        # down the line.
+        # The surprise comes from using key[0:6] rather than str(key)[0:6]
+        # when the elements are Timestamps:
+        # the result is Index[0:6], very confusing.
+
+        self.assertRaises(AssertionError, ts.groupby, lambda key: key[0:6])
+
     def test_groupby_nonobject_dtype(self):
         key = self.mframe.index.labels[0]
         grouped = self.mframe.groupby(key)
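
For illustration only (not part of the patch): a minimal sketch, assuming a pandas build that includes these changes, of the GH3035 scenario the new test pins down. A function grouper is applied to the index via index.map; when the mapped result does not match the index in length, the Grouping constructor now raises instead of letting the Cython group_* kernels read past the end of the labels array and segfault.

    import numpy as np
    import pandas as pd

    dates = pd.date_range('01-Jan-2013', periods=12, freq='MS')
    ts = pd.Series(np.random.randn(12), index=dates)

    # key[0:6] fails element-wise on Timestamps, so index.map falls back to
    # slicing the whole index; the result has the wrong length, and the
    # sanity check added in Grouping.__init__ raises AssertionError
    # instead of the old segfault.
    try:
        ts.groupby(lambda key: key[0:6]).mean()
    except AssertionError as e:
        print("bad grouper rejected:", e)

    # A grouper that returns one label per index element still works.
    print(ts.groupby(lambda key: key.year).mean())

Inside GroupBy itself, the patch translates AssertionErrors coming out of the Cython aggregation path into GroupByError/SpecificationError, so they propagate to the caller rather than being swallowed by the generic `except Exception` fallbacks.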