Skip to content

Commit 750556b

Browse files
committed
raise SpecificationError if we have an invalid aggregator
1 parent c54ea69 commit 750556b

File tree

6 files changed

+225
-42
lines changed

6 files changed

+225
-42
lines changed

pandas/core/base.py

Lines changed: 98 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -417,48 +417,127 @@ def _aggregate(self, arg, *args, **kwargs):
417417
"""
418418

419419
is_aggregator = lambda x: isinstance(x, (list, tuple, dict))
420+
is_nested_renamer = False
420421

421422
_level = kwargs.pop('_level', None)
422423
if isinstance(arg, compat.string_types):
423424
return getattr(self, arg)(*args, **kwargs), None
424425

425426
result = compat.OrderedDict()
426427
if isinstance(arg, dict):
428+
429+
# aggregate based on the passed dict
427430
if self.axis != 0: # pragma: no cover
428431
raise ValueError('Can only pass dict with axis=0')
429432

430433
obj = self._selected_obj
431434

432-
if any(is_aggregator(x) for x in arg.values()):
435+
# if we have a dict of any non-scalars
436+
# eg. {'A' : ['mean']}, normalize all to
437+
# be list-likes
438+
if any(is_aggregator(x) for x in compat.itervalues(arg)):
433439
new_arg = compat.OrderedDict()
434440
for k, v in compat.iteritems(arg):
435441
if not isinstance(v, (tuple, list, dict)):
436442
new_arg[k] = [v]
437443
else:
438444
new_arg[k] = v
445+
446+
# the keys must be in the columns
447+
# for ndim=2, or renamers for ndim=1
448+
449+
# ok
450+
# {'A': { 'ra': 'mean' }}
451+
# {'A': { 'ra': ['mean'] }}
452+
# {'ra': ['mean']}
453+
454+
# not ok
455+
# {'ra' : { 'A' : 'mean' }}
456+
if isinstance(v, dict):
457+
is_nested_renamer = True
458+
459+
if k not in obj.columns:
460+
raise SpecificationError('cannot perform renaming '
461+
'for {0} with a nested '
462+
'dictionary'.format(k))
463+
439464
arg = new_arg
440465

441-
keys = []
442-
if self._selection is not None:
443-
subset = obj
466+
from pandas.tools.merge import concat
467+
468+
def _agg_1dim(name, how, subset=None):
469+
"""
470+
aggregate a 1-dim with how
471+
"""
472+
colg = self._gotitem(name, ndim=1, subset=subset)
473+
if colg.ndim != 1:
474+
raise SpecificationError("nested dictionary is ambiguous "
475+
"in aggregation")
476+
return colg.aggregate(how, _level=(_level or 0) + 1)
477+
478+
def _agg_2dim(name, how):
479+
"""
480+
aggregate a 2-dim with how
481+
"""
482+
colg = self._gotitem(self._selection, ndim=2,
483+
subset=obj)
484+
return colg.aggregate(how, _level=None)
485+
486+
# set the final keys
487+
keys = list(compat.iterkeys(arg))
488+
489+
# nested renamer
490+
if is_nested_renamer:
491+
results = [_agg_1dim(k, v) for k, v in compat.iteritems(arg)]
492+
493+
if all(isinstance(r, dict) for r in results):
494+
495+
for r in results:
496+
result.update(r)
497+
keys = list(compat.iterkeys(result))
498+
499+
else:
444500

445-
ndim = 1 if len(self._selection_list) == 1 else 2
446-
for fname, agg_how in compat.iteritems(arg):
447-
colg = self._gotitem(self._selection, ndim=ndim,
448-
subset=subset)
449-
result[fname] = colg.aggregate(agg_how, _level=None)
450-
keys.append(fname)
501+
result = results
502+
if self._selection is not None:
503+
keys = None
504+
505+
# some selection on the object
506+
elif self._selection is not None:
507+
508+
sl = set(self._selection_list)
509+
510+
# we are a Series like object,
511+
# but may have multiple aggregations
512+
if len(sl) == 1:
513+
514+
for fname, agg_how in compat.iteritems(arg):
515+
result[fname] = _agg_1dim(self._selection,
516+
agg_how)
517+
518+
# we are selecting the same set as we are aggregating
519+
elif not len(sl - set(compat.iterkeys(arg))):
520+
521+
for fname, agg_how in compat.iteritems(arg):
522+
result[fname] = _agg_1dim(fname, agg_how)
523+
524+
# we are a DataFrame, with possibly multiple aggregations
525+
else:
526+
527+
for fname, agg_how in compat.iteritems(arg):
528+
result[fname] = _agg_2dim(fname, agg_how)
529+
530+
# no selection
451531
else:
532+
452533
for col, agg_how in compat.iteritems(arg):
453-
colg = self._gotitem(col, ndim=1)
454-
if colg.ndim != 1:
455-
raise ValueError("nested dictionary is ambiguous"
456-
"in aggregation")
457-
result[col] = colg.aggregate(agg_how, _level=_level)
458-
keys.append(col)
534+
result[col] = _agg_1dim(col, agg_how)
459535

460-
if isinstance(list(result.values())[0], com.ABCDataFrame):
461-
from pandas.tools.merge import concat
536+
# combine results
537+
if isinstance(result, list):
538+
result = concat(result, keys=keys, axis=1)
539+
elif isinstance(list(compat.itervalues(result))[0],
540+
com.ABCDataFrame):
462541
result = concat([result[k] for k in keys], keys=keys, axis=1)
463542
else:
464543
from pandas import DataFrame
@@ -518,11 +597,7 @@ def _aggregate_multiple_funcs(self, arg, _level):
518597
except SpecificationError:
519598
raise
520599

521-
if _level:
522-
keys = None
523-
result = concat(results, keys=keys, axis=1)
524-
525-
return result
600+
return concat(results, keys=keys, axis=1)
526601

527602
def _is_cython_func(self, arg):
528603
""" if we define an internal function for this argument, return it """

pandas/core/groupby.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1156,8 +1156,8 @@ def nth(self, n, dropna=None):
11561156

11571157
return result
11581158

1159-
if (isinstance(self._selected_obj, DataFrame)
1160-
and dropna not in ['any', 'all']):
1159+
if isinstance(self._selected_obj, DataFrame) and \
1160+
dropna not in ['any', 'all']:
11611161
# Note: when agg-ing picker doesn't raise this, just returns NaN
11621162
raise ValueError("For a DataFrame groupby, dropna must be "
11631163
"either None, 'any' or 'all', "
@@ -2512,12 +2512,12 @@ def aggregate(self, func_or_funcs, *args, **kwargs):
25122512
-------
25132513
Series or DataFrame
25142514
"""
2515-
kwargs.pop('_level', None)
2515+
_level = kwargs.pop('_level', None)
25162516
if isinstance(func_or_funcs, compat.string_types):
25172517
return getattr(self, func_or_funcs)(*args, **kwargs)
25182518

25192519
if hasattr(func_or_funcs, '__iter__'):
2520-
ret = self._aggregate_multiple_funcs(func_or_funcs)
2520+
ret = self._aggregate_multiple_funcs(func_or_funcs, _level)
25212521
else:
25222522
cyfunc = self._is_cython_func(func_or_funcs)
25232523
if cyfunc and not args and not kwargs:
@@ -2541,7 +2541,7 @@ def aggregate(self, func_or_funcs, *args, **kwargs):
25412541

25422542
agg = aggregate
25432543

2544-
def _aggregate_multiple_funcs(self, arg):
2544+
def _aggregate_multiple_funcs(self, arg, _level):
25452545
if isinstance(arg, dict):
25462546
columns = list(arg.keys())
25472547
arg = list(arg.items())
@@ -2562,6 +2562,14 @@ def _aggregate_multiple_funcs(self, arg):
25622562
columns.append(com._get_callable_name(f))
25632563
arg = lzip(columns, arg)
25642564

2565+
# for a ndim=1, disallow a nested dict for an aggregator as
2566+
# this is a mis-specification of the aggregations, via a
2567+
# specification error
2568+
# e.g. g['A'].agg({'A': ..., 'B': ...})
2569+
if self.name in columns and len(columns) > 1:
2570+
raise SpecificationError('invalid aggregation names specified '
2571+
'for selected objects')
2572+
25652573
results = {}
25662574
for name, func in arg:
25672575
obj = self
@@ -2577,6 +2585,13 @@ def _aggregate_multiple_funcs(self, arg):
25772585
obj._selection = name
25782586
results[name] = obj.aggregate(func)
25792587

2588+
if isinstance(list(compat.itervalues(results))[0],
2589+
com.ABCDataFrame):
2590+
2591+
# let higher level handle
2592+
if _level:
2593+
return results
2594+
return list(compat.itervalues(results))[0]
25802595
return DataFrame(results, columns=columns)
25812596

25822597
def _wrap_output(self, output, index, names=None):

pandas/tests/test_groupby.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1512,6 +1512,32 @@ def test_aggregate_api_consistency(self):
15121512
['D', 'C']])
15131513
assert_frame_equal(result, expected, check_like=True)
15141514

1515+
def test_agg_nested_dicts(self):
1516+
1517+
# API change for disallowing these types of nested dicts
1518+
df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
1519+
'foo', 'bar', 'foo', 'foo'],
1520+
'B': ['one', 'one', 'two', 'two',
1521+
'two', 'two', 'one', 'two'],
1522+
'C': np.random.randn(8) + 1.0,
1523+
'D': np.arange(8)})
1524+
1525+
g = df.groupby(['A', 'B'])
1526+
1527+
def f():
1528+
g.aggregate({'r1': {'C': ['mean', 'sum']},
1529+
'r2': {'D': ['mean', 'sum']}})
1530+
1531+
self.assertRaises(SpecificationError, f)
1532+
1533+
result = g.agg({'C': {'ra': ['mean', 'std']},
1534+
'D': {'rb': ['mean', 'std']}})
1535+
expected = pd.concat([g['C'].mean(), g['C'].std(), g['D'].mean(),
1536+
g['D'].std()], axis=1)
1537+
expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), (
1538+
'ra', 'std'), ('rb', 'mean'), ('rb', 'std')])
1539+
assert_frame_equal(result, expected, check_like=True)
1540+
15151541
def test_multi_iter(self):
15161542
s = Series(np.arange(6))
15171543
k1 = np.array(['a', 'a', 'a', 'b', 'b', 'b'])

pandas/tests/test_window.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import pandas.core.datetools as datetools
1919
import pandas.stats.moments as mom
2020
import pandas.core.window as rwindow
21+
from pandas.core.base import SpecificationError
2122
import pandas.util.testing as tm
2223
from pandas.compat import range, zip, PY3
2324

@@ -197,12 +198,18 @@ def f():
197198
r.aggregate({'r1': {'A': ['mean', 'sum']},
198199
'r2': {'B': ['mean', 'sum']}})
199200

200-
self.assertRaises(ValueError, f)
201+
self.assertRaises(SpecificationError, f)
201202

202-
result = r.agg({'A': {'ra': ['mean', 'std']},
203-
'B': {'rb': ['mean', 'std']}})
204203
expected = pd.concat([r['A'].mean(), r['A'].std(), r['B'].mean(),
205204
r['B'].std()], axis=1)
205+
expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), (
206+
'ra', 'std'), ('rb', 'mean'), ('rb', 'std')])
207+
result = r[['A', 'B']].agg({'A': {'ra': ['mean', 'std']},
208+
'B': {'rb': ['mean', 'std']}})
209+
assert_frame_equal(result, expected, check_like=True)
210+
211+
result = r.agg({'A': {'ra': ['mean', 'std']},
212+
'B': {'rb': ['mean', 'std']}})
206213
expected.columns = pd.MultiIndex.from_tuples([('A', 'ra', 'mean'), (
207214
'A', 'ra', 'std'), ('B', 'rb', 'mean'), ('B', 'rb', 'std')])
208215
assert_frame_equal(result, expected, check_like=True)

pandas/tseries/resample.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -385,9 +385,9 @@ def _downsample(self, how, **kwargs):
385385
if isinstance(loffset, compat.string_types):
386386
loffset = to_offset(self.loffset)
387387

388-
if isinstance(loffset, (DateOffset, timedelta)):
389-
if (isinstance(result.index, DatetimeIndex)
390-
and len(result.index) > 0):
388+
if isinstance(loffset, (DateOffset, timedelta)) and \
389+
isinstance(result.index, DatetimeIndex) and \
390+
len(result.index) > 0:
391391
result.index = result.index + loffset
392392

393393
return self._wrap_result(result)

0 commit comments

Comments
 (0)