Skip to content

COMPAT: Emit warning when groupby by a tuple #18731

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Dec 18, 2017
2 changes: 1 addition & 1 deletion doc/source/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1091,7 +1091,7 @@ You can also select multiple rows from each group by specifying multiple nth val
business_dates = pd.date_range(start='4/1/2014', end='6/30/2014', freq='B')
df = pd.DataFrame(1, index=business_dates, columns=['a', 'b'])
# get the first, 4th, and last date index for each month
df.groupby((df.index.year, df.index.month)).nth([0, 3, -1])
df.groupby([df.index.year, df.index.month]).nth([0, 3, -1])

Enumerate group items
~~~~~~~~~~~~~~~~~~~~~
Expand Down
4 changes: 4 additions & 0 deletions doc/source/whatsnew/v0.22.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,10 @@ Deprecations
- ``Series.from_array`` and ``SparseSeries.from_array`` are deprecated. Use the normal constructor ``Series(..)`` and ``SparseSeries(..)`` instead (:issue:`18213`).
- ``DataFrame.as_matrix`` is deprecated. Use ``DataFrame.values`` instead (:issue:`18458`).
- ``Series.asobject``, ``DatetimeIndex.asobject``, ``PeriodIndex.asobject`` and ``TimeDeltaIndex.asobject`` have been deprecated. Use ``.astype(object)`` instead (:issue:`18572`)
- Grouping by a tuple of keys now emits a ``FutureWarning`` and is deprecated.
In the future, a tuple passed to ``'by'`` will always refer to a single key
that is the actual tuple, instead of treating the tuple as multiple keys. To
retain the previous behavior, use a list instead of a tuple (:issue:`18314`)
- ``Series.valid`` is deprecated. Use :meth:`Series.dropna` instead (:issue:`18800`).

.. _whatsnew_0220.prior_deprecations:
Expand Down
23 changes: 22 additions & 1 deletion pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
is_bool_dtype,
is_scalar,
is_list_like,
is_hashable,
needs_i8_conversion,
_ensure_float64,
_ensure_platform_int,
Expand Down Expand Up @@ -2850,7 +2851,27 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
elif isinstance(key, BaseGrouper):
return key, [], obj

# Everything which is not a list is a key (including tuples):
# In the future, a tuple key will always mean an actual key,
# not an iterable of keys. In the meantime, we attempt to provide
# a warning. We can assume that the user wanted a list of keys when
# the key is not in the index. We just have to be careful with
# unhashble elements of `key`. Any unhashable elements implies that
# they wanted a list of keys.
# https://github.com/pandas-dev/pandas/issues/18314
is_tuple = isinstance(key, tuple)
all_hashable = is_tuple and is_hashable(key)

if is_tuple:
if ((all_hashable and key not in obj and set(key).issubset(obj))
or not all_hashable):
# column names ('a', 'b') -> ['a', 'b']
# arrays like (a, b) -> [a, b]
msg = ("Interpreting tuple 'by' as a list of keys, rather than "
"a single key. Use 'by=[...]' instead of 'by=(...)'. In "
"the future, a tuple will always mean a single key.")
warnings.warn(msg, FutureWarning, stacklevel=5)
key = list(key)

if not isinstance(key, list):
keys = [key]
match_axis_length = False
Expand Down
32 changes: 32 additions & 0 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2727,6 +2727,38 @@ def test_empty_dataframe_groupby(self):

assert_frame_equal(result, expected)

def test_tuple_warns(self):
# https://github.com/pandas-dev/pandas/issues/18314
df = pd.DataFrame({('a', 'b'): [1, 1, 2, 2], 'a': [1, 1, 1, 2],
'b': [1, 2, 2, 2], 'c': [1, 1, 1, 1]})
with tm.assert_produces_warning(FutureWarning) as w:
df[['a', 'b', 'c']].groupby(('a', 'b')).c.mean()

assert "Interpreting tuple 'by' as a list" in str(w[0].message)

with tm.assert_produces_warning(None):
df.groupby(('a', 'b')).c.mean()

def test_tuple_warns_unhashable(self):
# https://github.com/pandas-dev/pandas/issues/18314
business_dates = date_range(start='4/1/2014', end='6/30/2014',
freq='B')
df = DataFrame(1, index=business_dates, columns=['a', 'b'])

with tm.assert_produces_warning(FutureWarning) as w:
df.groupby((df.index.year, df.index.month)).nth([0, 3, -1])

assert "Interpreting tuple 'by' as a list" in str(w[0].message)

@pytest.mark.xfail(reason="GH-18798")
def test_tuple_correct_keyerror(self):
# https://github.com/pandas-dev/pandas/issues/18798
df = pd.DataFrame(1, index=range(3),
columns=pd.MultiIndex.from_product([[1, 2],
[3, 4]]))
with tm.assert_raises_regex(KeyError, "(7, 8)"):
df.groupby((7, 8)).mean()


def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
tups = lmap(tuple, df[keys].values)
Expand Down