raise SpecificationError if we have an invalid aggregator

jreback · jreback · commit 750556b4e4d5 · 2016-02-02T07:49:16.000-06:00
diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -417,48 +417,127 @@ def _aggregate(self, arg, *args, **kwargs):
         """
 
         is_aggregator = lambda x: isinstance(x, (list, tuple, dict))
+        is_nested_renamer = False
 
         _level = kwargs.pop('_level', None)
         if isinstance(arg, compat.string_types):
             return getattr(self, arg)(*args, **kwargs), None
 
         result = compat.OrderedDict()
         if isinstance(arg, dict):
+
+            # aggregate based on the passed dict
             if self.axis != 0:  # pragma: no cover
                 raise ValueError('Can only pass dict with axis=0')
 
             obj = self._selected_obj
 
-            if any(is_aggregator(x) for x in arg.values()):
+            # if we have a dict of any non-scalars
+            # e.g. {'A' : ['mean']}, normalize all to
+            # be list-likes
+            if any(is_aggregator(x) for x in compat.itervalues(arg)):
                 new_arg = compat.OrderedDict()
                 for k, v in compat.iteritems(arg):
                     if not isinstance(v, (tuple, list, dict)):
                         new_arg[k] = [v]
                     else:
                         new_arg[k] = v
+
+                    # the keys must be in the columns
+                    # for ndim=2, or renamers for ndim=1
+
+                    # ok
+                    # {'A': { 'ra': 'mean' }}
+                    # {'A': { 'ra': ['mean'] }}
+                    # {'ra': ['mean']}
+
+                    # not ok
+                    # {'ra' : { 'A' : 'mean' }}
+                    if isinstance(v, dict):
+                        is_nested_renamer = True
+
+                        if k not in obj.columns:
+                            raise SpecificationError('cannot perform renaming '
+                                                     'for {0} with a nested '
+                                                     'dictionary'.format(k))
+
                 arg = new_arg
 
-            keys = []
-            if self._selection is not None:
-                subset = obj
+            from pandas.tools.merge import concat
+
+            def _agg_1dim(name, how, subset=None):
+                """
+                aggregate a 1-dim with how
+                """
+                colg = self._gotitem(name, ndim=1, subset=subset)
+                if colg.ndim != 1:
+                    raise SpecificationError("nested dictionary is ambiguous "
+                                             "in aggregation")
+                return colg.aggregate(how, _level=(_level or 0) + 1)
+
+            def _agg_2dim(name, how):
+                """
+                aggregate a 2-dim with how
+                """
+                colg = self._gotitem(self._selection, ndim=2,
+                                     subset=obj)
+                return colg.aggregate(how, _level=None)
+
+            # set the final keys
+            keys = list(compat.iterkeys(arg))
+
+            # nested renamer
+            if is_nested_renamer:
+                results = [_agg_1dim(k, v) for k, v in compat.iteritems(arg)]
+
+                if all(isinstance(r, dict) for r in results):
+
+                    for r in results:
+                        result.update(r)
+                    keys = list(compat.iterkeys(result))
+
+                else:
 
-                ndim = 1 if len(self._selection_list) == 1 else 2
-                for fname, agg_how in compat.iteritems(arg):
-                    colg = self._gotitem(self._selection, ndim=ndim,
-                                         subset=subset)
-                    result[fname] = colg.aggregate(agg_how, _level=None)
-                    keys.append(fname)
+                    result = results
+                    if self._selection is not None:
+                        keys = None
+
+            # some selection on the object
+            elif self._selection is not None:
+
+                sl = set(self._selection_list)
+
+                # we are a Series like object,
+                # but may have multiple aggregations
+                if len(sl) == 1:
+
+                    for fname, agg_how in compat.iteritems(arg):
+                        result[fname] = _agg_1dim(self._selection,
+                                                  agg_how)
+
+                # we are selecting the same set as we are aggregating
+                elif not len(sl - set(compat.iterkeys(arg))):
+
+                    for fname, agg_how in compat.iteritems(arg):
+                        result[fname] = _agg_1dim(fname, agg_how)
+
+                # we are a DataFrame, with possibly multiple aggregations
+                else:
+
+                    for fname, agg_how in compat.iteritems(arg):
+                        result[fname] = _agg_2dim(fname, agg_how)
+
+            # no selection
             else:
+
                 for col, agg_how in compat.iteritems(arg):
-                    colg = self._gotitem(col, ndim=1)
-                    if colg.ndim != 1:
-                        raise ValueError("nested dictionary is ambiguous"
-                                         "in aggregation")
-                    result[col] = colg.aggregate(agg_how, _level=_level)
-                    keys.append(col)
+                    result[col] = _agg_1dim(col, agg_how)
 
-            if isinstance(list(result.values())[0], com.ABCDataFrame):
-                from pandas.tools.merge import concat
+            # combine results
+            if isinstance(result, list):
+                result = concat(result, keys=keys, axis=1)
+            elif isinstance(list(compat.itervalues(result))[0],
+                            com.ABCDataFrame):
                 result = concat([result[k] for k in keys], keys=keys, axis=1)
             else:
                 from pandas import DataFrame
@@ -518,11 +597,7 @@ def _aggregate_multiple_funcs(self, arg, _level):
                 except SpecificationError:
                     raise
 
-        if _level:
-            keys = None
-        result = concat(results, keys=keys, axis=1)
-
-        return result
+        return concat(results, keys=keys, axis=1)
 
     def _is_cython_func(self, arg):
         """ if we define an internal function for this argument, return it """
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -1156,8 +1156,8 @@ def nth(self, n, dropna=None):
 
             return result
 
-        if (isinstance(self._selected_obj, DataFrame)
-                and dropna not in ['any', 'all']):
+        if isinstance(self._selected_obj, DataFrame) and \
+           dropna not in ['any', 'all']:
             # Note: when agg-ing picker doesn't raise this, just returns NaN
             raise ValueError("For a DataFrame groupby, dropna must be "
                              "either None, 'any' or 'all', "
@@ -2512,12 +2512,12 @@ def aggregate(self, func_or_funcs, *args, **kwargs):
         -------
         Series or DataFrame
         """
-        kwargs.pop('_level', None)
+        _level = kwargs.pop('_level', None)
         if isinstance(func_or_funcs, compat.string_types):
             return getattr(self, func_or_funcs)(*args, **kwargs)
 
         if hasattr(func_or_funcs, '__iter__'):
-            ret = self._aggregate_multiple_funcs(func_or_funcs)
+            ret = self._aggregate_multiple_funcs(func_or_funcs, _level)
         else:
             cyfunc = self._is_cython_func(func_or_funcs)
             if cyfunc and not args and not kwargs:
@@ -2541,7 +2541,7 @@ def aggregate(self, func_or_funcs, *args, **kwargs):
 
     agg = aggregate
 
-    def _aggregate_multiple_funcs(self, arg):
+    def _aggregate_multiple_funcs(self, arg, _level):
         if isinstance(arg, dict):
             columns = list(arg.keys())
             arg = list(arg.items())
@@ -2562,6 +2562,14 @@ def _aggregate_multiple_funcs(self, arg):
                     columns.append(com._get_callable_name(f))
             arg = lzip(columns, arg)
 
+        # for a ndim=1, disallow a nested dict for an aggregator as
+        # this is a mis-specification of the aggregations, via a
+        # SpecificationError
+        # e.g. g['A'].agg({'A': ..., 'B': ...})
+        if self.name in columns and len(columns) > 1:
+            raise SpecificationError('invalid aggregation names specified '
+                                     'for selected objects')
+
         results = {}
         for name, func in arg:
             obj = self
@@ -2577,6 +2585,13 @@ def _aggregate_multiple_funcs(self, arg):
                 obj._selection = name
             results[name] = obj.aggregate(func)
 
+        if isinstance(list(compat.itervalues(results))[0],
+                      com.ABCDataFrame):
+
+            # let higher level handle
+            if _level:
+                return results
+            return list(compat.itervalues(results))[0]
         return DataFrame(results, columns=columns)
 
     def _wrap_output(self, output, index, names=None):
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -1512,6 +1512,32 @@ def test_aggregate_api_consistency(self):
                                                     ['D', 'C']])
         assert_frame_equal(result, expected, check_like=True)
 
+    def test_agg_nested_dicts(self):
+
+        # API change for disallowing these types of nested dicts
+        df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
+                              'foo', 'bar', 'foo', 'foo'],
+                        'B': ['one', 'one', 'two', 'two',
+                              'two', 'two', 'one', 'two'],
+                        'C': np.random.randn(8) + 1.0,
+                        'D': np.arange(8)})
+
+        g = df.groupby(['A', 'B'])
+
+        def f():
+            g.aggregate({'r1': {'C': ['mean', 'sum']},
+                         'r2': {'D': ['mean', 'sum']}})
+
+        self.assertRaises(SpecificationError, f)
+
+        result = g.agg({'C': {'ra': ['mean', 'std']},
+                        'D': {'rb': ['mean', 'std']}})
+        expected = pd.concat([g['C'].mean(), g['C'].std(), g['D'].mean(),
+                              g['D'].std()], axis=1)
+        expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), (
+            'ra', 'std'), ('rb', 'mean'), ('rb', 'std')])
+        assert_frame_equal(result, expected, check_like=True)
+
     def test_multi_iter(self):
         s = Series(np.arange(6))
         k1 = np.array(['a', 'a', 'a', 'b', 'b', 'b'])
diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py
@@ -18,6 +18,7 @@
 import pandas.core.datetools as datetools
 import pandas.stats.moments as mom
 import pandas.core.window as rwindow
+from pandas.core.base import SpecificationError
 import pandas.util.testing as tm
 from pandas.compat import range, zip, PY3
 
@@ -197,12 +198,18 @@ def f():
             r.aggregate({'r1': {'A': ['mean', 'sum']},
                          'r2': {'B': ['mean', 'sum']}})
 
-        self.assertRaises(ValueError, f)
+        self.assertRaises(SpecificationError, f)
 
-        result = r.agg({'A': {'ra': ['mean', 'std']},
-                        'B': {'rb': ['mean', 'std']}})
         expected = pd.concat([r['A'].mean(), r['A'].std(), r['B'].mean(),
                               r['B'].std()], axis=1)
+        expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), (
+            'ra', 'std'), ('rb', 'mean'), ('rb', 'std')])
+        result = r[['A', 'B']].agg({'A': {'ra': ['mean', 'std']},
+                                    'B': {'rb': ['mean', 'std']}})
+        assert_frame_equal(result, expected, check_like=True)
+
+        result = r.agg({'A': {'ra': ['mean', 'std']},
+                        'B': {'rb': ['mean', 'std']}})
         expected.columns = pd.MultiIndex.from_tuples([('A', 'ra', 'mean'), (
             'A', 'ra', 'std'), ('B', 'rb', 'mean'), ('B', 'rb', 'std')])
         assert_frame_equal(result, expected, check_like=True)
diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py
@@ -385,9 +385,9 @@ def _downsample(self, how, **kwargs):
         if isinstance(loffset, compat.string_types):
             loffset = to_offset(self.loffset)
 
-        if isinstance(loffset, (DateOffset, timedelta)):
-            if (isinstance(result.index, DatetimeIndex)
-                    and len(result.index) > 0):
+        if isinstance(loffset, (DateOffset, timedelta)) and \
+           isinstance(result.index, DatetimeIndex) and \
+           len(result.index) > 0:
                 result.index = result.index + loffset
 
         return self._wrap_result(result)
diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py