Skip to content

BUG: Fix not to reindex on non-Categorical groups (GH9049) #9177

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 10, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.16.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -267,3 +267,4 @@ Bug Fixes
- ``SparseSeries`` and ``SparsePanel`` now accept zero argument constructors (same as their non-sparse counterparts) (:issue:`9272`).

- Bug in ``read_csv`` with buffer overflows with certain malformed input files (:issue:`9205`)
- Bug in ``groupby`` on ``MultiIndex`` levels where level combinations not present in the data were added to the result (:issue:`9049`, :issue:`9344`)
37 changes: 11 additions & 26 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1862,7 +1862,6 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
self.grouper = grouper.values

# pre-computed
self._was_factor = False
self._should_compress = True

# we have a single grouper which may be a myriad of things, some of which are
Expand All @@ -1887,8 +1886,6 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
level_values = index.levels[level].take(inds)
self.grouper = level_values.map(self.grouper)
else:
self._was_factor = True

# all levels may not be observed
labels, uniques = algos.factorize(inds, sort=True)

Expand All @@ -1913,17 +1910,10 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,

# a passed Categorical
elif isinstance(self.grouper, Categorical):

factor = self.grouper
self._was_factor = True

# Is there any way to avoid this?
self.grouper = np.asarray(factor)

self._labels = factor.codes
self._group_index = factor.categories
self._labels = self.grouper.codes
self._group_index = self.grouper.categories
if self.name is None:
self.name = factor.name
self.name = self.grouper.name

# a passed Grouper like
elif isinstance(self.grouper, Grouper):
Expand All @@ -1936,8 +1926,8 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
self.name = grouper.name

# no level passed
if not isinstance(self.grouper, (Series, Index, np.ndarray)):
if getattr(self.grouper,'ndim', 1) != 1:
if not isinstance(self.grouper, (Series, Index, Categorical, np.ndarray)):
if getattr(self.grouper, 'ndim', 1) != 1:
t = self.name or str(type(self.grouper))
raise ValueError("Grouper for '%s' not 1-dimensional" % t)
self.grouper = self.index.map(self.grouper)
Expand Down Expand Up @@ -1988,21 +1978,15 @@ def group_index(self):
return self._group_index

def _make_labels(self):
if self._was_factor: # pragma: no cover
raise Exception('Should not call this method grouping by level')
else:
if self._labels is None or self._group_index is None:
labels, uniques = algos.factorize(self.grouper, sort=self.sort)
uniques = Index(uniques, name=self.name)
self._labels = labels
self._group_index = uniques

_groups = None

@property
@cache_readonly
def groups(self):
if self._groups is None:
self._groups = self.index.groupby(self.grouper)
return self._groups
return self.index.groupby(self.grouper)

def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
"""
Expand Down Expand Up @@ -3238,10 +3222,11 @@ def _reindex_output(self, result):
return result
elif len(groupings) == 1:
return result
elif not any([ping._was_factor for ping in groupings]):
elif not any([isinstance(ping.grouper, Categorical)
for ping in groupings]):
return result

levels_list = [ ping._group_index for ping in groupings ]
levels_list = [ ping.group_index for ping in groupings ]
index = MultiIndex.from_product(levels_list, names=self.grouper.names)
d = { self.obj._get_axis_name(self.axis) : index, 'copy' : False }
return result.reindex(**d).sortlevel(axis=self.axis)
Expand Down
53 changes: 53 additions & 0 deletions pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -3297,6 +3297,34 @@ def test_groupby_categorical(self):
expected.index.names = ['myfactor', None]
assert_frame_equal(desc_result, expected)

def test_groupby_datetime_categorical(self):
# GH9049: ensure backward compatibility
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test ensures backward compatibility, so I added a comment saying so.

levels = pd.date_range('2014-01-01', periods=4)
codes = np.random.randint(0, 4, size=100)

cats = Categorical.from_codes(codes, levels, name='myfactor')

data = DataFrame(np.random.randn(100, 4))

result = data.groupby(cats).mean()

expected = data.groupby(np.asarray(cats)).mean()
expected = expected.reindex(levels)
expected.index.name = 'myfactor'

assert_frame_equal(result, expected)
self.assertEqual(result.index.name, cats.name)

grouped = data.groupby(cats)
desc_result = grouped.describe()

idx = cats.codes.argsort()
ord_labels = np.asarray(cats).take(idx)
ord_data = data.take(idx)
expected = ord_data.groupby(ord_labels, sort=False).describe()
expected.index.names = ['myfactor', None]
assert_frame_equal(desc_result, expected)

def test_groupby_groups_datetimeindex(self):
# #1430
from pandas.tseries.api import DatetimeIndex
Expand Down Expand Up @@ -3484,6 +3512,31 @@ def test_groupby_categorical_unequal_len(self):
# len(bins) != len(series) here
self.assertRaises(ValueError,lambda : series.groupby(bins).mean())

def test_groupby_multiindex_missing_pair(self):
    # GH9049
    # The pair ('b', 'd') never occurs in the data; the expected index below
    # lists only the three observed pairs.
    frame = DataFrame({'group1': ['a','a','a','b'],
                       'group2': ['c','c','d','c'],
                       'value': [1,1,1,5]}).set_index(['group1', 'group2'])

    summed = frame.groupby(level=['group1','group2'], sort=True).agg('sum')

    observed_pairs = [('a','c'), ('a','d'), ('b','c')]
    expected_index = MultiIndex.from_tuples(observed_pairs,
                                            names=['group1', 'group2'])
    expected = DataFrame([[2], [1], [5]],
                         index=expected_index, columns=['value'])

    tm.assert_frame_equal(summed, expected)

def test_groupby_levels_and_columns(self):
    # GH9344, GH9049
    # Grouping by index levels and grouping by the same keys after moving
    # them into columns are asserted to give equal frames.
    keys = ['x', 'y']
    pairs = [(1, 1), (1, 2), (3, 4), (5, 6)]
    values = np.arange(12).reshape(-1, 3)
    frame = pd.DataFrame(values,
                         index=pd.MultiIndex.from_tuples(pairs, names=keys))

    via_levels = frame.groupby(level=keys).mean()
    via_columns = frame.reset_index().groupby(keys).mean()

    tm.assert_frame_equal(via_levels, via_columns)

def test_gb_apply_list_of_unequal_len_arrays(self):

# GH1738
Expand Down
12 changes: 12 additions & 0 deletions vb_suite/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,18 @@ def f(g):

groupby_sum_booleans = Benchmark("df.groupby('ii').sum()", setup)


#----------------------------------------------------------------------
# multi-indexed group sum #9049

# Level "A" repeats each value twice while level "B" is unique per row, so
# only a small fraction of the possible (A, B) pairs is observed in the data.
# list(...) is required so the "* 2" repetition also works where range() does
# not return a list (Python 3), where "range(N) * 2" raises TypeError.
setup = common_setup + """
N = 50
df = DataFrame({'A': list(range(N)) * 2, 'B': range(N*2), 'C': 1}).set_index(["A", "B"])
"""

groupby_sum_multiindex = Benchmark("df.groupby(level=[0, 1]).sum()", setup)


#----------------------------------------------------------------------
# Transform testing

Expand Down