-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG refactor datetime parsing and fix 8 bugs #50242
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
3490468
794f9e4
030bcb2
38871c7
762dea8
3da6ceb
392d239
e1fe7c7
bb3cd4d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
from numpy cimport int64_t | ||
|
||
|
||
cdef bint parse_today_now(str val, int64_t* iresult, bint utc) |
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -34,12 +34,14 @@ from pandas._libs.tslibs.nattype cimport ( | |||||||||||||||||||||
c_nat_strings as nat_strings, | ||||||||||||||||||||||
) | ||||||||||||||||||||||
from pandas._libs.tslibs.np_datetime cimport ( | ||||||||||||||||||||||
NPY_DATETIMEUNIT, | ||||||||||||||||||||||
NPY_FR_ns, | ||||||||||||||||||||||
check_dts_bounds, | ||||||||||||||||||||||
npy_datetimestruct, | ||||||||||||||||||||||
npy_datetimestruct_to_datetime, | ||||||||||||||||||||||
pydate_to_dt64, | ||||||||||||||||||||||
pydatetime_to_dt64, | ||||||||||||||||||||||
string_to_dts, | ||||||||||||||||||||||
) | ||||||||||||||||||||||
from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime | ||||||||||||||||||||||
from pandas._libs.tslibs.timestamps cimport _Timestamp | ||||||||||||||||||||||
|
@@ -48,9 +50,50 @@ from pandas._libs.util cimport ( | |||||||||||||||||||||
is_float_object, | ||||||||||||||||||||||
is_integer_object, | ||||||||||||||||||||||
) | ||||||||||||||||||||||
from pandas._libs.tslibs.timestamps import Timestamp | ||||||||||||||||||||||
|
||||||||||||||||||||||
cnp.import_array() | ||||||||||||||||||||||
|
||||||||||||||||||||||
cdef bint format_is_iso(f: str): | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
Does format match the iso8601 set that can be handled by the C parser? | ||||||||||||||||||||||
Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different | ||||||||||||||||||||||
but must be consistent. Leading 0s in dates and times are optional. | ||||||||||||||||||||||
""" | ||||||||||||||||||||||
excluded_formats = ["%Y%m"] | ||||||||||||||||||||||
|
||||||||||||||||||||||
for date_sep in [" ", "/", "\\", "-", ".", ""]: | ||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Instead of a loop, can you express this as a regular expression? Seems like it would help the performance that way as well. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah, never mind, I see this is the way it is currently written - something to consider for another PR though. My guess is it can only help |
||||||||||||||||||||||
for time_sep in [" ", "T"]: | ||||||||||||||||||||||
for micro_or_tz in ["", "%z", ".%f", ".%f%z"]: | ||||||||||||||||||||||
iso_fmt = f"%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}" | ||||||||||||||||||||||
if iso_fmt.startswith(f) and f not in excluded_formats: | ||||||||||||||||||||||
return True | ||||||||||||||||||||||
return False | ||||||||||||||||||||||
|
||||||||||||||||||||||
|
||||||||||||||||||||||
def _test_format_is_iso(f: str) -> bool:
    """
    Python-callable wrapper around the cdef-level ``format_is_iso``.

    Only used in testing.
    """
    cdef bint is_iso = format_is_iso(f)
    return is_iso
|
||||||||||||||||||||||
|
||||||||||||||||||||||
cdef bint parse_today_now(str val, int64_t* iresult, bint utc):
    # Handle the special strings "now" and "today": store the current time
    # in iresult[0] and return True. For any other string, iresult is left
    # untouched and False is returned.
    #
    # We delay this check for as long as possible
    # because it catches relatively rare cases

    # Multiply by 1000 to convert to nanos, since these methods naturally have
    # microsecond resolution
    if val == "today":
        iresult[0] = Timestamp.today().value * 1000
        return True

    if val != "now":
        return False

    if utc:
        current = Timestamp.utcnow()
    else:
        # GH#18705 make sure to_datetime("now") matches Timestamp("now")
        # Note using Timestamp.now() is faster than Timestamp("now")
        current = Timestamp.now()
    iresult[0] = current.value * 1000
    return True
|
||||||||||||||||||||||
cdef dict _parse_code_table = {"y": 0, | ||||||||||||||||||||||
"Y": 1, | ||||||||||||||||||||||
|
@@ -111,6 +154,9 @@ def array_strptime( | |||||||||||||||||||||
bint found_naive = False | ||||||||||||||||||||||
bint found_tz = False | ||||||||||||||||||||||
tzinfo tz_out = None | ||||||||||||||||||||||
bint iso_format = fmt is not None and format_is_iso(fmt) | ||||||||||||||||||||||
NPY_DATETIMEUNIT out_bestunit | ||||||||||||||||||||||
int out_local = 0, out_tzoffset = 0 | ||||||||||||||||||||||
|
||||||||||||||||||||||
assert is_raise or is_ignore or is_coerce | ||||||||||||||||||||||
|
||||||||||||||||||||||
|
@@ -232,6 +278,34 @@ def array_strptime( | |||||||||||||||||||||
else: | ||||||||||||||||||||||
val = str(val) | ||||||||||||||||||||||
|
||||||||||||||||||||||
if iso_format: | ||||||||||||||||||||||
string_to_dts_failed = string_to_dts( | ||||||||||||||||||||||
val, &dts, &out_bestunit, &out_local, | ||||||||||||||||||||||
&out_tzoffset, False, fmt, exact | ||||||||||||||||||||||
) | ||||||||||||||||||||||
if not string_to_dts_failed: | ||||||||||||||||||||||
# No error reported by string_to_dts, pick back up | ||||||||||||||||||||||
# where we left off | ||||||||||||||||||||||
value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) | ||||||||||||||||||||||
if out_local == 1: | ||||||||||||||||||||||
# Store the out_tzoffset in seconds | ||||||||||||||||||||||
# since we store the total_seconds of | ||||||||||||||||||||||
# dateutil.tz.tzoffset objects | ||||||||||||||||||||||
tz = timezone(timedelta(minutes=out_tzoffset)) | ||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In the analogous block in tslib we then adjust value using `tz_localize_to_utc`. Do we need to do that here? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It happens a few levels up, here: pandas/pandas/core/tools/datetimes.py Lines 335 to 344 in ef0eaa4
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Makes sense, thanks. Would it be viable to use the same pattern so we can share more code? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That would indeed be good, I'll see what I can do |
||||||||||||||||||||||
result_timezone[i] = tz | ||||||||||||||||||||||
out_local = 0 | ||||||||||||||||||||||
out_tzoffset = 0 | ||||||||||||||||||||||
iresult[i] = value | ||||||||||||||||||||||
check_dts_bounds(&dts) | ||||||||||||||||||||||
continue | ||||||||||||||||||||||
|
||||||||||||||||||||||
if parse_today_now(val, &iresult[i], utc): | ||||||||||||||||||||||
continue | ||||||||||||||||||||||
|
||||||||||||||||||||||
# Some ISO formats can't be parsed by string_to_dts | ||||||||||||||||||||||
# For example, 6-digit YYYYMD. So, if there's an error, | ||||||||||||||||||||||
# try the string-matching code below. | ||||||||||||||||||||||
|
||||||||||||||||||||||
# exact matching | ||||||||||||||||||||||
if exact: | ||||||||||||||||||||||
found = format_regex.match(val) | ||||||||||||||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The error messaging here is a bit confusing to me - looks like `string_to_dts` is already labeled `?except -1`. Is there a reason why Cython doesn't propagate an error before your check of `if string_to_dts_failed`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's because here `want_exc` is `False`:
pandas/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c
Lines 665 to 671 in a37b78d
The only place where it's True is
pandas/pandas/_libs/tslib.pyx
Lines 145 to 162 in 2f54a47
which is only a testing function. So, perhaps the `?except -1` can just be removed, and the testing function removed (I think it would be better to test `to_datetime` directly).
I'd keep that to a separate PR anyway, but thanks for catching this!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Cool thanks for review. Yea I'd be OK with your suggestion in a separate PR. Always good to clean this up - not sure we've handled consistently in the past