Skip to content

ENH: Implement DataFrame.value_counts #31247

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 44 commits into from
Feb 26, 2020
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
0830e36
Add value_counts tests
Jan 23, 2020
d946e93
Update docs
Jan 23, 2020
7d9306d
Start implementing value_counts
Jan 23, 2020
2e58db4
Set MultiIndex name
Jan 23, 2020
25d7f2f
Format
Jan 23, 2020
aa96c98
Sort imports
Jan 23, 2020
aef75ae
Remove typing for now
Jan 23, 2020
acb81cc
Simplify test a little
Jan 23, 2020
786de34
Remove single col example for now
Jan 23, 2020
7eba59a
Update error for bins
Jan 23, 2020
60554e9
Update pandas/core/frame.py
dsaxton Jan 24, 2020
4c4e858
Update pandas/core/frame.py
dsaxton Jan 24, 2020
07f0e76
Import Label type
Jan 24, 2020
d055b5c
Merge remote-tracking branch 'upstream/master' into df-val-counts
Jan 24, 2020
957a8ec
Make Sequence optional
Jan 24, 2020
4fee5e0
Fix docstring
Jan 24, 2020
b8f4126
Clean docstring
Jan 26, 2020
310c688
Update to comments
Jan 26, 2020
a266021
Add to Series See Also
Jan 26, 2020
d738bf7
Update tests and add back tolist
Jan 26, 2020
2618220
Don't import pytest
Jan 26, 2020
98e7e5b
Merge branch 'master' into df-val-counts
Jan 27, 2020
1ab2aeb
Merge branch 'master' into df-val-counts
Jan 27, 2020
a97347f
Merge branch 'master' into df-val-counts
Jan 31, 2020
e12117e
Merge branch 'master' into df-val-counts
Feb 6, 2020
9e75083
Merge branch 'master' into df-val-counts
Feb 8, 2020
0d46697
Add to basics.rst
Feb 9, 2020
81991a1
Move to Notes
Feb 9, 2020
d618677
Merge branch 'master' into df-val-counts
Feb 9, 2020
85bc213
Rename to avoid doc error
Feb 9, 2020
425ef73
Merge branch 'master' into df-val-counts
Feb 13, 2020
b03978c
Merge branch 'master' into df-val-counts
Feb 14, 2020
de043d9
Merge branch 'master' into df-val-counts
Feb 15, 2020
2f0f46d
Merge branch 'master' into df-val-counts
Feb 15, 2020
5544716
Merge branch 'master' into df-val-counts
Feb 16, 2020
12898ad
Merge branch 'master' into df-val-counts
Feb 17, 2020
d743ac2
Merge remote-tracking branch 'upstream/master' into df-val-counts
dsaxton Feb 20, 2020
f7c3abe
Merge remote-tracking branch 'upstream/master' into df-val-counts
dsaxton Feb 21, 2020
c297143
Merge remote-tracking branch 'upstream/master' into df-val-counts
dsaxton Feb 23, 2020
47683ad
Add See Also
dsaxton Feb 23, 2020
e60de83
Move tests
dsaxton Feb 23, 2020
3903a4d
versionadded
dsaxton Feb 23, 2020
9ee6e0e
Merge remote-tracking branch 'upstream/master' into df-val-counts
dsaxton Feb 23, 2020
de40484
Merge remote-tracking branch 'upstream/master' into df-val-counts
dsaxton Feb 25, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ Computations / descriptive stats
DataFrame.std
DataFrame.var
DataFrame.nunique
DataFrame.value_counts

Reindexing / selection / label manipulation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ Other API changes

- :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last``
will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`)
- Added :meth:`DataFrame.value_counts` (:issue:`5377`)
- :meth:`Groupby.groups` now returns an abbreviated representation when called on large dataframes (:issue:`1135`)

Backwards incompatible API changes
Expand Down
1 change: 1 addition & 0 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1199,6 +1199,7 @@ def value_counts(
--------
Series.count: Number of non-NA elements in a Series.
DataFrame.count: Number of non-NA elements in a DataFrame.
DataFrame.value_counts: Equivalent method on DataFrames.

Examples
--------
Expand Down
98 changes: 97 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@
from pandas.core.indexes import base as ibase
from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.multi import maybe_droplevels
from pandas.core.indexes.multi import MultiIndex, maybe_droplevels
from pandas.core.indexes.period import PeriodIndex
from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable
from pandas.core.internals import BlockManager
Expand Down Expand Up @@ -5075,6 +5075,102 @@ def sort_index(
else:
return self._constructor(new_data).__finalize__(self)

def value_counts(
    self,
    subset: Optional[Sequence[Label]] = None,
    normalize: bool = False,
    sort: bool = True,
    ascending: bool = False,
):
    """
    Return a Series containing counts of unique rows in the DataFrame.

    .. versionadded:: 1.1.0

    Parameters
    ----------
    subset : list-like, optional
        Columns to use when counting unique combinations. When omitted,
        all columns are used. A single column label that is not list-like
        is treated as a one-element list.
    normalize : bool, default False
        Return proportions rather than frequencies.
    sort : bool, default True
        Sort by frequencies.
    ascending : bool, default False
        Sort in ascending order. Only has an effect when `sort` is True.

    Returns
    -------
    Series

    See Also
    --------
    Series.value_counts: Equivalent method on Series.

    Notes
    -----
    The returned Series will have a MultiIndex with one level per input
    column. By default, rows that contain any NA values are omitted from
    the result. By default, the resulting Series will be in descending
    order so that the first element is the most frequently-occurring row.

    Examples
    --------
    >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6],
    ...                    'num_wings': [2, 0, 0, 0]},
    ...                   index=['falcon', 'dog', 'cat', 'ant'])
    >>> df
            num_legs  num_wings
    falcon         2          2
    dog            4          0
    cat            4          0
    ant            6          0

    >>> df.value_counts()
    num_legs  num_wings
    4         0            2
    6         0            1
    2         2            1
    dtype: int64

    >>> df.value_counts(sort=False)
    num_legs  num_wings
    2         2            1
    4         0            2
    6         0            1
    dtype: int64

    >>> df.value_counts(ascending=True)
    num_legs  num_wings
    2         2            1
    6         0            1
    4         0            2
    dtype: int64

    >>> df.value_counts(normalize=True)
    num_legs  num_wings
    4         0            0.50
    6         0            0.25
    2         2            0.25
    dtype: float64
    """
    # Imported locally so this method does not depend on what the module
    # header happens to import.
    from pandas.core.dtypes.common import is_list_like

    if subset is None:
        subset = self.columns.tolist()
    elif not is_list_like(subset):
        # A bare label has no meaningful ``len`` (an int raises, a string
        # reports its character count), so normalise it to a list before the
        # single-column check below.
        subset = [subset]

    counts = self.groupby(subset).size()

    if sort:
        counts = counts.sort_values(ascending=ascending)
    if normalize:
        counts /= counts.sum()

    # Force MultiIndex for single column: grouping by one key yields a flat
    # Index, but the documented result always carries a MultiIndex.
    if len(subset) == 1:
        counts.index = MultiIndex.from_arrays(
            [counts.index], names=[counts.index.name]
        )

    return counts

def nlargest(self, n, columns, keep="first") -> "DataFrame":
"""
Return the first `n` rows ordered by `columns` in descending order.
Expand Down
102 changes: 102 additions & 0 deletions pandas/tests/frame/test_value_counts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import numpy as np

import pandas as pd
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

move to the methods/ subdir

import pandas._testing as tm


def test_data_frame_value_counts_unsorted():
    """Check ``value_counts(sort=False)`` on the two-column animal frame.

    The expected counts are ``[1, 2, 1]`` for the rows ``(2, 2)``, ``(4, 0)``
    and ``(6, 0)``, in that order.
    """
    frame = pd.DataFrame(
        {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
        index=["falcon", "dog", "cat", "ant"],
    )
    expected_index = pd.MultiIndex.from_arrays(
        [(2, 4, 6), (2, 0, 0)], names=["num_legs", "num_wings"]
    )
    expected = pd.Series(data=[1, 2, 1], index=expected_index)

    result = frame.value_counts(sort=False)

    tm.assert_series_equal(result, expected)


def test_data_frame_value_counts_ascending():
    """Check ``value_counts(ascending=True)`` on the two-column animal frame.

    The expected counts are ``[1, 1, 2]`` for the rows ``(2, 2)``, ``(6, 0)``
    and ``(4, 0)``, in that order.
    """
    frame = pd.DataFrame(
        {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
        index=["falcon", "dog", "cat", "ant"],
    )
    expected_index = pd.MultiIndex.from_arrays(
        [(2, 6, 4), (2, 0, 0)], names=["num_legs", "num_wings"]
    )
    expected = pd.Series(data=[1, 1, 2], index=expected_index)

    result = frame.value_counts(ascending=True)

    tm.assert_series_equal(result, expected)


def test_data_frame_value_counts_default():
    """Check ``value_counts()`` with default arguments on the animal frame.

    The expected counts are ``[2, 1, 1]`` for the rows ``(4, 0)``, ``(6, 0)``
    and ``(2, 2)``, in that order.
    """
    frame = pd.DataFrame(
        {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
        index=["falcon", "dog", "cat", "ant"],
    )
    expected_index = pd.MultiIndex.from_arrays(
        [(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"]
    )
    expected = pd.Series(data=[2, 1, 1], index=expected_index)

    result = frame.value_counts()

    tm.assert_series_equal(result, expected)


def test_data_frame_value_counts_normalize():
    """Check ``value_counts(normalize=True)`` on the two-column animal frame.

    The expected proportions are ``[0.5, 0.25, 0.25]`` for the rows
    ``(4, 0)``, ``(6, 0)`` and ``(2, 2)``, in that order.
    """
    frame = pd.DataFrame(
        {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
        index=["falcon", "dog", "cat", "ant"],
    )
    expected_index = pd.MultiIndex.from_arrays(
        [(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"]
    )
    expected = pd.Series(data=[0.5, 0.25, 0.25], index=expected_index)

    result = frame.value_counts(normalize=True)

    tm.assert_series_equal(result, expected)


def test_data_frame_value_counts_single_col_default():
    """Check ``value_counts()`` on a frame with the single column ``num_legs``.

    The expected result is indexed by a one-level MultiIndex named
    ``num_legs`` with values ``[4, 6, 2]`` and counts ``[2, 1, 1]``.
    """
    frame = pd.DataFrame({"num_legs": [2, 4, 4, 6]})
    expected_index = pd.MultiIndex.from_arrays([[4, 6, 2]], names=["num_legs"])
    expected = pd.Series(data=[2, 1, 1], index=expected_index)

    result = frame.value_counts()

    tm.assert_series_equal(result, expected)


def test_data_frame_value_counts_empty():
    """Check ``value_counts()`` on a DataFrame constructed with no arguments.

    The expected result is an empty Series of dtype ``int64``.
    """
    expected = pd.Series([], dtype=np.int64)

    empty_frame = pd.DataFrame()
    result = empty_frame.value_counts()

    tm.assert_series_equal(result, expected)


def test_data_frame_value_counts_empty_normalize():
    """Check ``value_counts(normalize=True)`` on a DataFrame with no arguments.

    The expected result is an empty Series of dtype ``float64``.
    """
    expected = pd.Series([], dtype=np.float64)

    empty_frame = pd.DataFrame()
    result = empty_frame.value_counts(normalize=True)

    tm.assert_series_equal(result, expected)