-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: Add DataFrameGroupBy.value_counts #44267
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 134 commits
963b7e1
1f710e0
3531383
d7f733b
eb067ec
a6a07d1
6a22a57
9492ee4
b9885fd
e896879
6de9653
5b49322
651b20b
26353ee
9f44a6d
0e065b3
b821fca
19d7257
71ee5f4
1dd2db0
1c18d7d
f25e861
faac0f0
3934042
4904c31
0f615da
c2db74f
ba793bb
221b76a
424d7a6
5216929
50d4c59
9b2869f
3de6132
a9c2b83
0ad5ffb
6905bcd
0281539
eb9600f
f529714
0ae5218
15e3167
dfa82cb
6e2b06e
2dc5972
925d3ec
82730f1
d7b3149
8d8d9b0
c12d831
57d3fb8
c431953
4d10e47
e4582ef
c948274
2a58c42
e1596b1
df76279
04ebe65
f179fbb
98355d5
0be0150
25edd1e
97ab9c1
6ac9356
45b99af
0cbb3e2
fada9a9
8e3f359
ca15937
f055323
417958d
86a0df6
7d29bd4
32f4b6f
c13eef0
2c2eb0a
7638086
57b564b
aa3cb98
5e5d7e7
09cee2f
5838066
8f81bd2
92cb494
95ccdb4
085e8c9
9fcfbfe
bb5f82a
377cee0
c824f3e
92c718b
14d8172
e26cba1
928a9d7
ad0f5b4
e827cd3
2c2b967
51a3a3e
2ee133e
ec2a2d4
9d330d1
8e4f3ed
b2c61de
392986d
3b2ac58
34e6529
91e1ff3
06aaaeb
e062823
a8b0fc5
493e3aa
548c45b
6c19ce2
6141f85
050f070
d669af3
6c0d7f8
de68836
c81adb6
08fd6ab
dc67009
d023579
71d9780
b93f47c
db31257
124b1e9
d613261
a776a3d
4ef5ea0
0f0891f
5c1d021
fe58245
11ad6ea
857e5be
5b9d85a
e226547
c8f1731
8f89580
ac38571
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,6 +17,7 @@ | |
Iterable, | ||
Mapping, | ||
NamedTuple, | ||
Sequence, | ||
TypeVar, | ||
Union, | ||
cast, | ||
|
@@ -76,6 +77,7 @@ | |
_transform_template, | ||
warn_dropping_nuisance_columns_deprecated, | ||
) | ||
from pandas.core.groupby.grouper import get_grouper | ||
from pandas.core.indexes.api import ( | ||
Index, | ||
MultiIndex, | ||
|
@@ -1569,6 +1571,193 @@ def func(df): | |
|
||
boxplot = boxplot_frame_groupby | ||
|
||
def value_counts( | ||
self, | ||
subset: Sequence[Hashable] | None = None, | ||
normalize: bool = False, | ||
sort: bool = True, | ||
ascending: bool = False, | ||
dropna: bool = True, | ||
) -> DataFrame | Series: | ||
""" | ||
Return a Series or DataFrame containing counts of unique rows. | ||
|
||
.. versionadded:: 1.4.0 | ||
|
||
Parameters | ||
---------- | ||
subset : list-like, optional | ||
Columns to use when counting unique combinations. | ||
normalize : bool, default False | ||
Return proportions rather than frequencies. | ||
sort : bool, default True | ||
Sort by frequencies. | ||
ascending : bool, default False | ||
Sort in ascending order. | ||
dropna : bool, default True | ||
Don’t include counts of rows that contain NA values. | ||
|
||
Returns | ||
------- | ||
Series or DataFrame | ||
Series if the groupby as_index is True, otherwise DataFrame. | ||
|
||
See Also | ||
-------- | ||
Series.value_counts: Equivalent method on Series. | ||
DataFrame.value_counts: Equivalent method on DataFrame. | ||
SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy. | ||
|
||
Notes | ||
----- | ||
- If the groupby as_index is True then the returned Series will have a | ||
MultiIndex with one level per input column. | ||
- If the groupby as_index is False then the returned DataFrame will have an | ||
additional column with the value_counts. The column is labelled 'count' or | ||
'proportion', depending on the ``normalize`` parameter. | ||
|
||
By default, rows that contain any NA values are omitted from | ||
the result. | ||
|
||
By default, the result will be in descending order so that the | ||
first element of each group is the most frequently-occurring row. | ||
|
||
Examples | ||
-------- | ||
>>> df = pd.DataFrame({ | ||
... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], | ||
... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], | ||
... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] | ||
... }) | ||
|
||
>>> df | ||
gender education country | ||
0 male low US | ||
1 male medium FR | ||
2 female high US | ||
3 male low FR | ||
4 female high FR | ||
5 male low FR | ||
|
||
>>> df.groupby('gender').value_counts() | ||
gender education country | ||
female high FR 1 | ||
US 1 | ||
male low FR 2 | ||
US 1 | ||
medium FR 1 | ||
dtype: int64 | ||
|
||
>>> df.groupby('gender').value_counts(ascending=True) | ||
gender education country | ||
female high FR 1 | ||
US 1 | ||
male low US 1 | ||
medium FR 1 | ||
low FR 2 | ||
dtype: int64 | ||
|
||
>>> df.groupby('gender').value_counts(normalize=True) | ||
gender education country | ||
female high FR 0.50 | ||
US 0.50 | ||
male low FR 0.50 | ||
US 0.25 | ||
medium FR 0.25 | ||
dtype: float64 | ||
|
||
>>> df.groupby('gender', as_index=False).value_counts() | ||
gender education country count | ||
0 female high FR 1 | ||
1 female high US 1 | ||
2 male low FR 2 | ||
3 male low US 1 | ||
4 male medium FR 1 | ||
|
||
>>> df.groupby('gender', as_index=False).value_counts(normalize=True) | ||
gender education country proportion | ||
0 female high FR 0.50 | ||
1 female high US 0.50 | ||
2 male low FR 0.50 | ||
3 male low US 0.25 | ||
4 male medium FR 0.25 | ||
""" | ||
if self.axis == 1: | ||
raise NotImplementedError( | ||
"DataFrameGroupBy.value_counts only handles axis=0" | ||
) | ||
|
||
with self._group_selection_context(): | ||
df = self.obj | ||
|
||
in_axis_names = { | ||
grouping.name for grouping in self.grouper.groupings if grouping.in_axis | ||
} | ||
if isinstance(self._selected_obj, Series): | ||
name = self._selected_obj.name | ||
keys = [] if name in in_axis_names else [self._selected_obj] | ||
else: | ||
keys = [ | ||
# Can't use .values because the column label needs to be preserved | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you not just do column selection? e.g. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Column selection would fail if there are duplicate column labels since these are groupers and must be 1-dimensional (otherwise grouper construction will raise) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok is there a test with duplicates? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. test_column_name_clashes There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have to use positional, to avoid problems with duplicate column labels, and Series, to preserve the label in the grouper. |
||
self._selected_obj.iloc[:, idx] | ||
for idx, name in enumerate(self._selected_obj.columns) | ||
if name not in in_axis_names | ||
] | ||
|
||
if subset is not None: | ||
clashing = set(subset) & set(in_axis_names) | ||
if clashing: | ||
raise ValueError( | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
f"Keys {clashing} in subset cannot be in " | ||
"the groupby column keys" | ||
) | ||
|
||
groupings = list(self.grouper.groupings) | ||
for key in keys: | ||
grouper, _, _ = get_grouper( | ||
df, | ||
key=key, | ||
axis=self.axis, | ||
sort=self.sort, | ||
dropna=dropna, | ||
rhshadrach marked this conversation as resolved.
Show resolved
Hide resolved
|
||
) | ||
groupings += list(grouper.groupings) | ||
|
||
# Take the size of the overall columns | ||
gb = df.groupby( | ||
groupings, | ||
sort=self.sort, | ||
observed=self.observed, | ||
dropna=self.dropna, | ||
rhshadrach marked this conversation as resolved.
Show resolved
Hide resolved
|
||
) | ||
result = cast(Series, gb.size()) | ||
|
||
if normalize: | ||
# Normalize the results by dividing by the original group sizes. | ||
# We are guaranteed to have the first N levels be the | ||
# user-requested grouping. | ||
levels = list(range(len(self.grouper.groupings), result.index.nlevels)) | ||
indexed_group_size = result.groupby( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you not use gb here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure what's meant by gb (self?), but the idea here is to use the result which is typically smaller than the obj in the groupby. By transforming, the index of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. gb is the groupby for size above There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, when grouping by There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Each groupby has different options. These ones have the default |
||
result.index.droplevel(levels), | ||
sort=self.sort, | ||
observed=self.observed, | ||
dropna=self.dropna, | ||
).transform("sum") | ||
|
||
result /= indexed_group_size | ||
|
||
if sort: | ||
# Sort the values and then resort by the main grouping | ||
index_level = range(len(self.grouper.groupings)) | ||
result = result.sort_values(ascending=ascending).sort_index( | ||
level=index_level, sort_remaining=False | ||
) | ||
|
||
if not self.as_index: | ||
# Convert to frame | ||
result = result.reset_index(name="proportion" if normalize else "count") | ||
rhshadrach marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return result.__finalize__(self.obj, method="value_counts") | ||
|
||
|
||
def _wrap_transform_general_frame( | ||
obj: DataFrame, group: DataFrame, res: DataFrame | Series | ||
|
Uh oh!
There was an error while loading. Please reload this page.