Skip to content

Commit 9c2d1a6

Browse files
committed
BUG: Fix issue with incorrect groupby handling of NaT #10590
In groupby, timestamps are converted to integer values, so NaT becomes tslib.iNaT, which is -9223372036854775808. The aggregation is then performed on this value, which produces an incorrect result. The solution proposed here is to replace that value with np.nan when the data is a datetime or timedelta.
1 parent 207efc2 commit 9c2d1a6

File tree

3 files changed

+22
-4
lines changed

3 files changed

+22
-4
lines changed

doc/source/whatsnew/v0.17.0.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -865,4 +865,5 @@ Bug Fixes
865865
- Bug in ``to_json`` which was causing segmentation fault when serializing 0-rank ndarray (:issue:`9576`)
866866
- Bug in plotting functions may raise ``IndexError`` when plotted on ``GridSpec`` (:issue:`10819`)
867867
- Bug in plot result may show unnecessary minor ticklabels (:issue:`10657`)
868-
- Bug when constructing ``DataFrame`` where passing a dictionary with only scalar values and specifying columns did not raise an error (:issue:`10856`)
868+
- Bug in ``groupby`` causing incorrect aggregation results on a ``DataFrame`` containing ``NaT`` (e.g. ``first``, ``last``, ``min``) (:issue:`10590`)
869+
- Bug when constructing ``DataFrame`` where passing a dictionary with only scalar values and specifying columns did not raise an error (:issue:`10856`)

pandas/core/groupby.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -1532,6 +1532,7 @@ def aggregate(self, values, how, axis=0):
15321532

15331533
if is_datetime_or_timedelta_dtype(values.dtype):
15341534
values = values.view('int64')
1535+
values[values == tslib.iNaT] = np.nan
15351536
# GH 7754
15361537
is_numeric = True
15371538
elif is_bool_dtype(values.dtype):
@@ -2761,9 +2762,7 @@ def _cython_agg_blocks(self, how, numeric_only=True):
27612762

27622763
for block in data.blocks:
27632764

2764-
values = block._try_operate(block.values)
2765-
2766-
result, _ = self.grouper.aggregate(values, how, axis=agg_axis)
2765+
result, _ = self.grouper.aggregate(block.values, how, axis=agg_axis)
27672766

27682767
# see if we can cast the block back to the original dtype
27692768
result = block._try_coerce_and_cast_result(result)

pandas/tests/test_groupby.py

+18
Original file line numberDiff line numberDiff line change
@@ -5413,6 +5413,24 @@ def test_func(x):
54135413
expected = DataFrame()
54145414
tm.assert_frame_equal(result, expected)
54155415

5416+
def test_first_last_max_min_on_time_data(self):
5417+
# GH 10295
5418+
# Verify that NaT is not in the result of max, min, first and last on
5419+
# DataFrame with datetime or timedelta values.
5420+
from datetime import timedelta as td
5421+
df_test=DataFrame({'dt':[nan,'2015-07-24 10:10','2015-07-25 11:11','2015-07-23 12:12',nan],
5422+
'td':[nan,td(days=1),td(days=2),td(days=3),nan]})
5423+
df_test.dt=pd.to_datetime(df_test.dt)
5424+
df_test['group']='A'
5425+
df_ref=df_test[df_test.dt.notnull()]
5426+
5427+
grouped_test=df_test.groupby('group')
5428+
grouped_ref=df_ref.groupby('group')
5429+
5430+
assert_frame_equal(grouped_ref.max(),grouped_test.max())
5431+
assert_frame_equal(grouped_ref.min(),grouped_test.min())
5432+
assert_frame_equal(grouped_ref.first(),grouped_test.first())
5433+
assert_frame_equal(grouped_ref.last(),grouped_test.last())
54165434

54175435
def assert_fp_equal(a, b):
54185436
assert (np.abs(a - b) < 1e-12).all()

0 commit comments

Comments
 (0)