Skip to content

Commit 7990797

Browse files
authored
Merge branch 'pandas-dev:master' into groupby-mean-datetimelike
2 parents f4ea054 + 5f36af3 commit 7990797

24 files changed

+202
-365
lines changed

doc/source/whatsnew/v1.4.0.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,7 @@ I/O
395395
- Column headers are dropped when constructing a :class:`DataFrame` from a sqlalchemy ``Row`` object (:issue:`40682`)
396396
- Bug in unpickling a :class:`Index` with object dtype incorrectly inferring numeric dtypes (:issue:`43188`)
397397
- Bug in :func:`read_csv` where reading multi-header input with unequal lengths incorrectly raising uncontrolled ``IndexError`` (:issue:`43102`)
398+
- Bug in :func:`read_csv`, changed exception class when expecting a file path name or file-like object from ``OSError`` to ``TypeError`` (:issue:`43366`)
398399

399400
Period
400401
^^^^^^
@@ -420,6 +421,7 @@ Groupby/resample/rolling
420421
- Bug in :meth:`Series.rolling` when the :class:`Series` ``dtype`` was ``Int64`` (:issue:`43016`)
421422
- Bug in :meth:`DataFrame.rolling.corr` when the :class:`DataFrame` columns was a :class:`MultiIndex` (:issue:`21157`)
422423
- Bug in :meth:`DataFrame.groupby.rolling` when specifying ``on`` and calling ``__getitem__`` would subsequently return incorrect results (:issue:`43355`)
424+
- Bug in :meth:`GroupBy.apply` with time-based :class:`Grouper` objects incorrectly raising ``ValueError`` in corner cases where the grouping vector contains a ``NaT`` (:issue:`43500`)
423425

424426
Reshaping
425427
^^^^^^^^^

pandas/_libs/groupby.pyi

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def group_shift_indexer(
3232
periods: int,
3333
) -> None: ...
3434
def group_fillna_indexer(
35-
out: np.ndarray, # ndarray[int64_t]
35+
out: np.ndarray, # ndarray[intp_t]
3636
labels: np.ndarray, # ndarray[int64_t]
3737
mask: np.ndarray, # ndarray[uint8_t]
3838
direction: Literal["ffill", "bfill"],

pandas/_libs/groupby.pyx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -321,15 +321,15 @@ def group_shift_indexer(int64_t[::1] out, const intp_t[::1] labels,
321321

322322
@cython.wraparound(False)
323323
@cython.boundscheck(False)
324-
def group_fillna_indexer(ndarray[int64_t] out, ndarray[intp_t] labels,
324+
def group_fillna_indexer(ndarray[intp_t] out, ndarray[intp_t] labels,
325325
ndarray[uint8_t] mask, str direction,
326326
int64_t limit, bint dropna) -> None:
327327
"""
328328
Indexes how to fill values forwards or backwards within a group.
329329

330330
Parameters
331331
----------
332-
out : np.ndarray[np.int64]
332+
out : np.ndarray[np.intp]
333333
Values into which this method will write its results.
334334
labels : np.ndarray[np.intp]
335335
Array containing unique label for each group, with its ordering

pandas/_libs/parsers.pyx

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -606,10 +606,6 @@ cdef class TextReader:
606606
cdef:
607607
void *ptr
608608

609-
if not hasattr(source, "read"):
610-
raise IOError(f'Expected file path name or file-like object, '
611-
f'got {type(source)} type')
612-
613609
ptr = new_rd_source(source)
614610
self.parser.source = ptr
615611
self.parser.cb_io = &buffer_rd_bytes

pandas/_libs/reduction.pyx

Lines changed: 0 additions & 197 deletions
Original file line numberDiff line numberDiff line change
@@ -34,166 +34,6 @@ cpdef check_result_array(object obj, object dtype):
3434
raise ValueError("Must produce aggregated value")
3535

3636

37-
cdef class _BaseGrouper:
38-
cdef _check_dummy(self, object dummy):
39-
# both values and index must be an ndarray!
40-
41-
values = dummy.values
42-
# GH 23683: datetimetz types are equivalent to datetime types here
43-
if (dummy.dtype != self.arr.dtype
44-
and values.dtype != self.arr.dtype):
45-
raise ValueError('Dummy array must be same dtype')
46-
if is_array(values) and not values.flags.contiguous:
47-
# e.g. Categorical has no `flags` attribute
48-
values = values.copy()
49-
index = dummy.index.values
50-
if not index.flags.contiguous:
51-
index = index.copy()
52-
53-
return values, index
54-
55-
cdef _init_dummy_series_and_index(self, Slider islider, Slider vslider):
56-
"""
57-
Create Series and Index objects that we will alter in-place while iterating.
58-
"""
59-
cached_index = self.ityp(islider.buf, dtype=self.idtype)
60-
cached_series = self.typ(
61-
vslider.buf, dtype=vslider.buf.dtype, index=cached_index, name=self.name
62-
)
63-
return cached_index, cached_series
64-
65-
cdef inline _update_cached_objs(self, object cached_series, object cached_index,
66-
Slider islider, Slider vslider):
67-
cached_index._engine.clear_mapping()
68-
cached_index._cache.clear() # e.g. inferred_freq must go
69-
cached_series._mgr.set_values(vslider.buf)
70-
71-
cdef inline object _apply_to_group(self,
72-
object cached_series, object cached_index,
73-
bint initialized):
74-
"""
75-
Call self.f on our new group, then update to the next group.
76-
"""
77-
cdef:
78-
object res
79-
80-
# NB: we assume that _update_cached_objs has already cleared
81-
# the cache and engine mapping
82-
res = self.f(cached_series)
83-
res = extract_result(res)
84-
if not initialized:
85-
# On the first pass, we check the output shape to see
86-
# if this looks like a reduction.
87-
initialized = True
88-
check_result_array(res, cached_series.dtype)
89-
90-
return res, initialized
91-
92-
93-
cdef class SeriesGrouper(_BaseGrouper):
94-
"""
95-
Performs generic grouping operation while avoiding ndarray construction
96-
overhead
97-
"""
98-
cdef:
99-
Py_ssize_t nresults, ngroups
100-
101-
cdef public:
102-
ndarray arr, index, dummy_arr, dummy_index
103-
object f, labels, values, typ, ityp, name, idtype
104-
105-
def __init__(self, object series, object f, ndarray[intp_t] labels,
106-
Py_ssize_t ngroups):
107-
108-
if len(series) == 0:
109-
# get_result would never assign `result`
110-
raise ValueError("SeriesGrouper requires non-empty `series`")
111-
112-
self.labels = labels
113-
self.f = f
114-
115-
values = series.values
116-
if is_array(values) and not values.flags.c_contiguous:
117-
# e.g. Categorical has no `flags` attribute
118-
values = values.copy('C')
119-
self.arr = values
120-
self.typ = series._constructor
121-
self.ityp = series.index._constructor
122-
self.idtype = series.index.dtype
123-
self.index = series.index.values
124-
self.name = series.name
125-
126-
dummy = series.iloc[:0]
127-
self.dummy_arr, self.dummy_index = self._check_dummy(dummy)
128-
self.ngroups = ngroups
129-
130-
def get_result(self):
131-
cdef:
132-
# Define result to avoid UnboundLocalError
133-
ndarray arr, result = None
134-
ndarray[intp_t] labels
135-
ndarray[int64_t] counts
136-
Py_ssize_t i, n, group_size, lab, start, end
137-
object res
138-
bint initialized = 0
139-
Slider vslider, islider
140-
object cached_series = None, cached_index = None
141-
142-
labels = self.labels
143-
counts = np.zeros(self.ngroups, dtype=np.int64)
144-
group_size = 0
145-
n = len(self.arr)
146-
147-
vslider = Slider(self.arr, self.dummy_arr)
148-
islider = Slider(self.index, self.dummy_index)
149-
150-
result = np.empty(self.ngroups, dtype='O')
151-
152-
cached_index, cached_series = self._init_dummy_series_and_index(
153-
islider, vslider
154-
)
155-
156-
start = 0
157-
try:
158-
for i in range(n):
159-
group_size += 1
160-
161-
lab = labels[i]
162-
163-
if i == n - 1 or lab != labels[i + 1]:
164-
if lab == -1:
165-
start += group_size
166-
group_size = 0
167-
continue
168-
169-
end = start + group_size
170-
islider.move(start, end)
171-
vslider.move(start, end)
172-
173-
self._update_cached_objs(
174-
cached_series, cached_index, islider, vslider)
175-
176-
res, initialized = self._apply_to_group(cached_series, cached_index,
177-
initialized)
178-
179-
start += group_size
180-
181-
result[lab] = res
182-
counts[lab] = group_size
183-
group_size = 0
184-
185-
finally:
186-
# so we don't free the wrong memory
187-
islider.reset()
188-
vslider.reset()
189-
190-
# We check for empty series in the constructor, so should always
191-
# have result initialized by this point.
192-
assert initialized, "`result` has not been initialized."
193-
194-
return result, counts
195-
196-
19737
cpdef inline extract_result(object res):
19838
""" extract the result object, it might be a 0-dim ndarray
19939
or a len-1 0-dim, or a scalar """
@@ -208,40 +48,3 @@ cpdef inline extract_result(object res):
20848
# see test_resampler_grouper.py::test_apply
20949
res = res[0]
21050
return res
211-
212-
213-
cdef class Slider:
214-
"""
215-
Only handles contiguous data for now
216-
"""
217-
cdef:
218-
ndarray values, buf
219-
Py_ssize_t stride
220-
char *orig_data
221-
222-
def __init__(self, ndarray values, ndarray buf):
223-
assert values.ndim == 1
224-
assert values.dtype == buf.dtype
225-
226-
if not values.flags.contiguous:
227-
values = values.copy()
228-
229-
self.values = values
230-
self.buf = buf
231-
232-
self.stride = values.strides[0]
233-
self.orig_data = self.buf.data
234-
235-
self.buf.data = self.values.data
236-
self.buf.strides[0] = self.stride
237-
238-
cdef move(self, int start, int end):
239-
"""
240-
For slicing
241-
"""
242-
self.buf.data = self.values.data + self.stride * start
243-
self.buf.shape[0] = end - start
244-
245-
cdef reset(self):
246-
self.buf.data = self.orig_data
247-
self.buf.shape[0] = 0

pandas/_testing/_io.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def _get_default_network_errors():
7070
# Lazy import for http.client because it imports many things from the stdlib
7171
import http.client
7272

73-
return (IOError, http.client.HTTPException, TimeoutError)
73+
return (OSError, http.client.HTTPException, TimeoutError)
7474

7575

7676
def optional_args(decorator):
@@ -135,7 +135,7 @@ def network(
135135
If True, checks connectivity before running the test case.
136136
error_classes : tuple or Exception
137137
error classes to ignore. If not in ``error_classes``, raises the error.
138-
defaults to IOError. Be careful about changing the error classes here.
138+
defaults to OSError. Be careful about changing the error classes here.
139139
skip_errnos : iterable of int
140140
Any exception that has .errno or .reason.erno set to one
141141
of these values will be skipped with an appropriate
@@ -165,19 +165,20 @@ def network(
165165
... def test_network():
166166
... with pd.io.common.urlopen("rabbit://bonanza.com"):
167167
... pass
168+
>>> test_network()
168169
Traceback
169170
...
170-
URLError: <urlopen error unknown url type: rabit>
171+
URLError: <urlopen error unknown url type: rabbit>
171172
172173
You can specify alternative URLs::
173174
174175
>>> @ts.network("https://www.yahoo.com")
175176
... def test_something_with_yahoo():
176-
... raise IOError("Failure Message")
177+
... raise OSError("Failure Message")
177178
>>> test_something_with_yahoo()
178179
Traceback (most recent call last):
179180
...
180-
IOError: Failure Message
181+
OSError: Failure Message
181182
182183
If you set check_before_test, it will check the url first and not run the
183184
test on failure::
@@ -241,7 +242,7 @@ def wrapper(*args, **kwargs):
241242

242243
def can_connect(url, error_classes=None):
243244
"""
244-
Try to connect to the given url. True if succeeds, False if IOError
245+
Try to connect to the given url. True if succeeds, False if OSError
245246
raised
246247
247248
Parameters
@@ -252,7 +253,7 @@ def can_connect(url, error_classes=None):
252253
Returns
253254
-------
254255
connectable : bool
255-
Return True if no IOError (unable to connect) or URLError (bad url) was
256+
Return True if no OSError (unable to connect) or URLError (bad url) was
256257
raised
257258
"""
258259
if error_classes is None:

pandas/core/groupby/generic.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -228,9 +228,10 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
228228
if maybe_use_numba(engine):
229229
with group_selection_context(self):
230230
data = self._selected_obj
231-
result, index = self._aggregate_with_numba(
231+
result = self._aggregate_with_numba(
232232
data.to_frame(), func, *args, engine_kwargs=engine_kwargs, **kwargs
233233
)
234+
index = self._group_keys_index
234235
return self.obj._constructor(result.ravel(), index=index, name=data.name)
235236

236237
relabeling = func is None
@@ -400,7 +401,7 @@ def _wrap_applied_output(
400401

401402
if isinstance(values[0], dict):
402403
# GH #823 #24880
403-
index = self._group_keys_index
404+
index = self.grouper.result_index
404405
res_df = self.obj._constructor_expanddim(values, index=index)
405406
res_df = self._reindex_output(res_df)
406407
# if self.observed is False,
@@ -413,7 +414,7 @@ def _wrap_applied_output(
413414
else:
414415
# GH #6265 #24880
415416
result = self.obj._constructor(
416-
data=values, index=self._group_keys_index, name=self.obj.name
417+
data=values, index=self.grouper.result_index, name=self.obj.name
417418
)
418419
return self._reindex_output(result)
419420

@@ -924,9 +925,10 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
924925
if maybe_use_numba(engine):
925926
with group_selection_context(self):
926927
data = self._selected_obj
927-
result, index = self._aggregate_with_numba(
928+
result = self._aggregate_with_numba(
928929
data, func, *args, engine_kwargs=engine_kwargs, **kwargs
929930
)
931+
index = self._group_keys_index
930932
return self.obj._constructor(result, index=index, columns=data.columns)
931933

932934
relabeling, func, columns, order = reconstruct_func(func, **kwargs)

0 commit comments

Comments
 (0)