Skip to content

WIP: ENH: pivot/groupby index with nan #12607

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -830,7 +830,7 @@ cdef class PyObjectHashTable(HashTable):
val = values[i]
hash(val)

if check_null and val != val or val is None:
if check_null and (val != val or val is None):
labels[i] = na_sentinel
continue

Expand Down
10 changes: 7 additions & 3 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,8 @@ def sort_mixed(values):
return ordered, _ensure_platform_int(new_labels)


def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None,
dropna=True):
"""
Encode input values as an enumerated type or categorical variable

Expand All @@ -534,6 +535,10 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
na_sentinel : int, default -1
Value to mark "not found"
size_hint : hint to the hashtable sizer
dropna : boolean, default True
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a `.. versionadded::` directive for the new `dropna` parameter.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done!

Drop NaN values

.. versionadded:: 0.20.0

Returns
-------
Expand All @@ -552,9 +557,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):

table = hash_klass(size_hint or len(values))
uniques = vec_klass()
check_nulls = not is_integer_dtype(original)
check_nulls = (not is_integer_dtype(original)) and dropna
labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls)

labels = _ensure_platform_int(labels)
uniques = uniques.to_array()

Expand Down
8 changes: 6 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -4132,7 +4132,7 @@ def clip_lower(self, threshold, axis=None):
return self.where(subset, threshold, axis=axis)

def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
group_keys=True, squeeze=False, **kwargs):
group_keys=True, squeeze=False, dropna=True, **kwargs):
"""
Group series using mapper (dict or key function, apply given function
to group, return result as series) or by a series of columns.
Expand Down Expand Up @@ -4164,6 +4164,10 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
squeeze : boolean, default False
reduce the dimensionality of the return type if possible,
otherwise return a consistent type
dropna : boolean, default True
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a `.. versionadded::` tag for the new `dropna` parameter.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

drop NaN in the grouping values

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mark this as `.. versionadded:: 0.20.0`.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

.. versionadded:: 0.20.0

Examples
--------
Expand All @@ -4188,7 +4192,7 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
axis = self._get_axis_number(axis)
return groupby(self, by=by, axis=axis, level=level, as_index=as_index,
sort=sort, group_keys=group_keys, squeeze=squeeze,
**kwargs)
dropna=dropna, **kwargs)

def asfreq(self, freq, method=None, how=None, normalize=False,
fill_value=None):
Expand Down
33 changes: 24 additions & 9 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,8 +359,8 @@ class _GroupBy(PandasObject, SelectionMixin):

def __init__(self, obj, keys=None, axis=0, level=None,
grouper=None, exclusions=None, selection=None, as_index=True,
sort=True, group_keys=True, squeeze=False, **kwargs):

sort=True, group_keys=True, squeeze=False, dropna=True,
**kwargs):
self._selection = selection

if isinstance(obj, NDFrame):
Expand All @@ -380,13 +380,15 @@ def __init__(self, obj, keys=None, axis=0, level=None,
self.group_keys = group_keys
self.squeeze = squeeze
self.mutated = kwargs.pop('mutated', False)
self.dropna = dropna

if grouper is None:
grouper, exclusions, obj = _get_grouper(obj, keys,
axis=axis,
level=level,
sort=sort,
mutated=self.mutated)
mutated=self.mutated,
dropna=dropna)

self.obj = obj
self.axis = obj._get_axis_number(axis)
Expand Down Expand Up @@ -968,6 +970,10 @@ class GroupBy(_GroupBy):
List of columns to exclude
name : string
Most users should ignore this
dropna : boolean, default True
drop NaN in the grouping values
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here: add a `.. versionadded::` directive for `dropna`.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done


.. versionadded:: 0.20.0

Notes
-----
Expand Down Expand Up @@ -2324,6 +2330,10 @@ class Grouping(object):
level :
in_axis : if the Grouping is a column in self.obj and hence among
Groupby.exclusions list
dropna : boolean, default True
drop NaN in the grouping values

.. versionadded:: 0.20.0

Returns
-------
Expand All @@ -2337,7 +2347,7 @@ class Grouping(object):
"""

def __init__(self, index, grouper=None, obj=None, name=None, level=None,
sort=True, in_axis=False):
sort=True, in_axis=False, dropna=True):

self.name = name
self.level = level
Expand All @@ -2346,6 +2356,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
self.sort = sort
self.obj = obj
self.in_axis = in_axis
self.dropna = dropna

# right place for this?
if isinstance(grouper, (Series, Index)) and name is None:
Expand Down Expand Up @@ -2396,7 +2407,6 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,

# a passed Grouper like
elif isinstance(self.grouper, Grouper):

# get the new grouper
grouper = self.grouper._get_binner_for_grouping(self.obj)
self.obj = self.grouper.obj
Expand Down Expand Up @@ -2433,6 +2443,11 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
from pandas import to_timedelta
self.grouper = to_timedelta(self.grouper)

# convert None to NaN if we are going to keep them
if not dropna:
if not isinstance(self.grouper, Index):
self.grouper[np.equal(self.grouper, None)] = np.NaN

def __repr__(self):
return 'Grouping({0})'.format(self.name)

Expand Down Expand Up @@ -2466,7 +2481,7 @@ def group_index(self):
def _make_labels(self):
if self._labels is None or self._group_index is None:
labels, uniques = algorithms.factorize(
self.grouper, sort=self.sort)
self.grouper, sort=self.sort, dropna=self.dropna)
uniques = Index(uniques, name=self.name)
self._labels = labels
self._group_index = uniques
Expand All @@ -2478,7 +2493,7 @@ def groups(self):


def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
mutated=False):
mutated=False, dropna=True):
"""
create and return a BaseGrouper, which is an internal
mapping of how to create the grouper indexers.
Expand Down Expand Up @@ -2631,7 +2646,8 @@ def is_in_obj(gpr):
name=name,
level=level,
sort=sort,
in_axis=in_axis) \
in_axis=in_axis,
dropna=dropna) \
if not isinstance(gpr, Grouping) else gpr

groupings.append(ping)
Expand Down Expand Up @@ -3386,7 +3402,6 @@ def _post_process_cython_aggregate(self, obj):
return obj

def aggregate(self, arg, *args, **kwargs):

_level = kwargs.pop('_level', None)
result, how = self._aggregate(arg, _level=_level, *args, **kwargs)
if how is None:
Expand Down
Loading