pandas-dev · WillAyd · Dec 29, 2022 · Dec 18, 2022 · Dec 20, 2022 · Dec 20, 2022
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -780,6 +780,7 @@ Performance improvements
 - Performance improvement in :func:`merge` when not merging on the index - the new index will now be :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49478`)
 - Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when using any non-object dtypes (:issue:`46470`)
 - Performance improvement in :func:`read_html` when there are multiple tables (:issue:`49929`)
+- Performance improvement in :func:`to_datetime` when using ``'%Y%m%d'`` format (:issue:`17410`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_200.bug_fixes:
@@ -809,6 +810,13 @@ Datetimelike
 - Bug in :meth:`Timestamp.round` when the ``freq`` argument has zero-duration (e.g. "0ns") returning incorrect results instead of raising (:issue:`49737`)
 - Bug in :func:`to_datetime` was not raising ``ValueError`` when invalid format was passed and ``errors`` was ``'ignore'`` or ``'coerce'`` (:issue:`50266`)
 - Bug in :class:`DateOffset` was throwing ``TypeError`` when constructing with milliseconds and another super-daily argument (:issue:`49897`)
+- Bug in :func:`to_datetime` was not raising ``ValueError`` when parsing a string containing a decimal date with format ``'%Y%m%d'`` (:issue:`50051`)
+- Bug in :func:`to_datetime` was not converting ``None`` to ``NaT`` when parsing mixed-offset date strings with ISO8601 format (:issue:`50071`)
+- Bug in :func:`to_datetime` was not returning the input when parsing an out-of-bounds date string with ``errors='ignore'`` and ``format='%Y%m%d'`` (:issue:`14487`)
+- Bug in :func:`to_datetime` was converting timezone-naive ``datetime.datetime`` to timezone-aware when parsing with timezone-aware strings, ISO8601 format, and ``utc=False`` (:issue:`50254`)
+- Bug in :func:`to_datetime` was throwing ``ValueError`` when parsing dates with ISO8601 format where some values were not zero-padded (:issue:`21422`)
+- Bug in :func:`to_datetime` was giving incorrect results when using ``format='%Y%m%d'`` and ``errors='ignore'`` (:issue:`26493`)
+- Bug in :func:`to_datetime` was failing to parse date strings ``'today'`` and ``'now'`` if ``format`` was not ISO8601 (:issue:`50359`)
 -
 
 Timedelta

diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi
@@ -23,9 +23,6 @@ def array_to_datetime(
     dayfirst: bool = ...,
     yearfirst: bool = ...,
     utc: bool = ...,
-    require_iso8601: bool = ...,
-    format: str | None = ...,
-    exact: bool = ...,
 ) -> tuple[np.ndarray, tzinfo | None]: ...
 
 # returned ndarray may be object dtype or datetime64[ns]

diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx
@@ -38,6 +38,7 @@ from pandas._libs.tslibs.np_datetime cimport (
     pydatetime_to_dt64,
     string_to_dts,
 )
+from pandas._libs.tslibs.strptime cimport parse_today_now
 from pandas._libs.util cimport (
     is_datetime64_object,
     is_float_object,
@@ -443,9 +444,6 @@ cpdef array_to_datetime(
     bint dayfirst=False,
     bint yearfirst=False,
     bint utc=False,
-    bint require_iso8601=False,
-    format: str | None=None,
-    bint exact=True,
 ):
     """
     Converts a 1D array of date-like values to a numpy array of either:
@@ -472,8 +470,6 @@ cpdef array_to_datetime(
          yearfirst parsing behavior when encountering datetime strings
     utc : bool, default False
          indicator whether the dates should be UTC
-    require_iso8601 : bool, default False
-         indicator whether the datetime string should be iso8601
 
     Returns
     -------
@@ -544,16 +540,6 @@ cpdef array_to_datetime(
                     iresult[i] = get_datetime64_nanos(val, NPY_FR_ns)
 
                 elif is_integer_object(val) or is_float_object(val):
-                    if require_iso8601:
-                        if is_coerce:
-                            iresult[i] = NPY_NAT
-                            continue
-                        elif is_raise:
-                            raise ValueError(
-                                f"time data \"{val}\" doesn't "
-                                f"match format \"{format}\", at position {i}"
-                            )
-                        return values, tz_out
                     # these must be ns unit by-definition
                     seen_integer = True
 
@@ -584,25 +570,13 @@ cpdef array_to_datetime(
 
                     string_to_dts_failed = string_to_dts(
 parse_error: 
     if (want_exc) { 
         PyErr_Format(PyExc_ValueError, 
                     "Error parsing datetime string \"%s\" at position %d", str, 
                     (int)(substr - str)); 
     } 
     return -1; 
     object[::1] res_flat = result.ravel()     # should NOT be a copy 
     cnp.flatiter it = cnp.PyArray_IterNew(values) 
 if na_rep is None: 
     na_rep = "NaT" 
 if tz is None: 
     # if we don't have a format nor tz, then choose 
     # a format based on precision 
     basic_format = format is None 
     if basic_format: 
         reso_obj = get_resolution(values, tz=tz, reso=reso) 
         show_ns = reso_obj == Resolution.RESO_NS 
         show_us = reso_obj == Resolution.RESO_US 
         show_ms = reso_obj == Resolution.RESO_MS 
     elif format == "%Y-%m-%d %H:%M:%S": 
         # Same format as default, but with hardcoded precision (s) 
 parse_error: 
     if (want_exc) { 
         PyErr_Format(PyExc_ValueError, 
                     "Error parsing datetime string \"%s\" at position %d", str, 
                     (int)(substr - str)); 
     } 
     return -1; 
     object[::1] res_flat = result.ravel()     # should NOT be a copy 
     cnp.flatiter it = cnp.PyArray_IterNew(values) 
  
 if na_rep is None: 
     na_rep = "NaT" 
  
 if tz is None: 
     # if we don't have a format nor tz, then choose 
     # a format based on precision 
     basic_format = format is None 
     if basic_format: 
         reso_obj = get_resolution(values, tz=tz, reso=reso) 
         show_ns = reso_obj == Resolution.RESO_NS 
         show_us = reso_obj == Resolution.RESO_US 
         show_ms = reso_obj == Resolution.RESO_MS 
  
     elif format == "%Y-%m-%d %H:%M:%S": 
         # Same format as default, but with hardcoded precision (s) 
                         val, &dts, &out_bestunit, &out_local,
-                        &out_tzoffset, False, format, exact
+                        &out_tzoffset, False, None, False
                     )
                     if string_to_dts_failed:
                         # An error at this point is a _parsing_ error
                         # specifically _not_ OutOfBoundsDatetime
-                        if _parse_today_now(val, &iresult[i], utc):
+                        if parse_today_now(val, &iresult[i], utc):
                             continue
-                        elif require_iso8601:
-                            # if requiring iso8601 strings, skip trying
-                            # other formats
-                            if is_coerce:
-                                iresult[i] = NPY_NAT
-                                continue
-                            elif is_raise:
-                                raise ValueError(
-                                    f"time data \"{val}\" doesn't "
-                                    f"match format \"{format}\", at position {i}"
-                                )
-                            return values, tz_out
 
                         try:
                             py_dt = parse_datetime_string(val,
@@ -665,18 +639,6 @@ cpdef array_to_datetime(
                 if is_coerce:
                     iresult[i] = NPY_NAT
                     continue
-                elif require_iso8601 and isinstance(val, str):
-                    # GH#19382 for just-barely-OutOfBounds falling back to
-                    # dateutil parser will return incorrect result because
-                    # it will ignore nanoseconds
-                    if is_raise:
-
-                        # Still raise OutOfBoundsDatetime,
-                        # as error message is informative.
-                        raise
-
-                    assert is_ignore
-                    return values, tz_out
                 raise
 
     except OutOfBoundsDatetime:
@@ -835,26 +797,6 @@ cdef _array_to_datetime_object(
     return oresult, None
 
 
-cdef bint _parse_today_now(str val, int64_t* iresult, bint utc):
-    # We delay this check for as long as possible
-    # because it catches relatively rare cases
-
-    # Multiply by 1000 to convert to nanos, since these methods naturally have
-    #  microsecond resolution
-    if val == "now":
-        if utc:
-            iresult[0] = Timestamp.utcnow().value * 1000
-        else:
-            # GH#18705 make sure to_datetime("now") matches Timestamp("now")
-            # Note using Timestamp.now() is faster than Timestamp("now")
-            iresult[0] = Timestamp.now().value * 1000
-        return True
-    elif val == "today":
-        iresult[0] = Timestamp.today().value * 1000
-        return True
-    return False
-
-
 def array_to_datetime_with_tz(ndarray values, tzinfo tz):
     """
     Vectorized analogue to pd.Timestamp(value, tz=tz)

diff --git a/pandas/_libs/tslibs/parsing.pyi b/pandas/_libs/tslibs/parsing.pyi
@@ -40,7 +40,6 @@ def try_parse_datetime_components(
     minutes: npt.NDArray[np.object_],  # object[:]
     seconds: npt.NDArray[np.object_],  # object[:]
 ) -> npt.NDArray[np.object_]: ...
-def format_is_iso(f: str) -> bool: ...
 def guess_datetime_format(
     dt_str,
     dayfirst: bool | None = ...,

diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx
@@ -818,26 +818,6 @@ class _timelex:
 _DATEUTIL_LEXER_SPLIT = _timelex.split
 
 
-def format_is_iso(f: str) -> bint:
-    """
-    Does format match the iso8601 set that can be handled by the C parser?
-    Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different
-    but must be consistent.  Leading 0s in dates and times are optional.
-    """
-    iso_template = "%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}".format
-    excluded_formats = ["%Y%m%d", "%Y%m", "%Y"]
-
-    for date_sep in [" ", "/", "\\", "-", ".", ""]:
-        for time_sep in [" ", "T"]:
-            for micro_or_tz in ["", "%z", ".%f", ".%f%z"]:
-                if (iso_template(date_sep=date_sep,
-                                 time_sep=time_sep,
-                                 micro_or_tz=micro_or_tz,
-                                 ).startswith(f) and f not in excluded_formats):
-                    return True
-    return False
-
-
 def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None:
     """
     Guess the datetime format of a given datetime string.

diff --git a/pandas/_libs/tslibs/strptime.pxd b/pandas/_libs/tslibs/strptime.pxd
@@ -0,0 +1,4 @@
+from numpy cimport int64_t
+
+
+cdef bint parse_today_now(str val, int64_t* iresult, bint utc)
diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx
@@ -34,12 +34,14 @@ from pandas._libs.tslibs.nattype cimport (
     c_nat_strings as nat_strings,
 )
 from pandas._libs.tslibs.np_datetime cimport (
+    NPY_DATETIMEUNIT,
     NPY_FR_ns,
     check_dts_bounds,
     npy_datetimestruct,
     npy_datetimestruct_to_datetime,
     pydate_to_dt64,
     pydatetime_to_dt64,
+    string_to_dts,
 )
 from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime
 from pandas._libs.tslibs.timestamps cimport _Timestamp
@@ -48,9 +50,50 @@ from pandas._libs.util cimport (
     is_float_object,
     is_integer_object,
 )
+from pandas._libs.tslibs.timestamps import Timestamp
 
 cnp.import_array()
 
+cdef bint format_is_iso(f: str):
+    """
+    Does format match the iso8601 set that can be handled by the C parser?
+    Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different
+    but must be consistent.  Leading 0s in dates and times are optional.
+    """
+    excluded_formats = ["%Y%m"]
+
+    for date_sep in [" ", "/", "\\", "-", ".", ""]:
+        for time_sep in [" ", "T"]:
+            for micro_or_tz in ["", "%z", ".%f", ".%f%z"]:
+                iso_fmt = f"%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}"
+                if iso_fmt.startswith(f) and f not in excluded_formats:
+                    return True
+    return False
+
+
+def _test_format_is_iso(f: str) -> bool:
+    """Only used in testing."""
+    return format_is_iso(f)
+
+
+cdef bint parse_today_now(str val, int64_t* iresult, bint utc):
+    # We delay this check for as long as possible
+    # because it catches relatively rare cases
+
+    # Multiply by 1000 to convert to nanos, since these methods naturally have
+    #  microsecond resolution
+    if val == "now":
+        if utc:
+            iresult[0] = Timestamp.utcnow().value * 1000
+        else:
+            # GH#18705 make sure to_datetime("now") matches Timestamp("now")
+            # Note using Timestamp.now() is faster than Timestamp("now")
+            iresult[0] = Timestamp.now().value * 1000
+        return True
+    elif val == "today":
+        iresult[0] = Timestamp.today().value * 1000
+        return True
+    return False
 
 cdef dict _parse_code_table = {"y": 0,
                                "Y": 1,
@@ -111,6 +154,9 @@ def array_strptime(
         bint found_naive = False
         bint found_tz = False
         tzinfo tz_out = None
+        bint iso_format = fmt is not None and format_is_iso(fmt)
+        NPY_DATETIMEUNIT out_bestunit
+        int out_local = 0, out_tzoffset = 0
 
     assert is_raise or is_ignore or is_coerce
 
@@ -232,6 +278,34 @@ def array_strptime(
             else:
                 val = str(val)
 
+            if iso_format:
+                string_to_dts_failed = string_to_dts(
+                    val, &dts, &out_bestunit, &out_local,
+                    &out_tzoffset, False, fmt, exact
+                )
+                if not string_to_dts_failed:
+                    # No error reported by string_to_dts, pick back up
+                    # where we left off
+                    value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts)
+                    if out_local == 1:
+                        # out_tzoffset is interpreted as minutes:
+                        # build a fixed-offset tzinfo from it and
+                        # record it for this element
+                        tz = timezone(timedelta(minutes=out_tzoffset))
 tz_results = np.empty(len(result), dtype=object) 
 for zone in unique(timezones): 
     mask = timezones == zone 
     dta = DatetimeArray(result[mask]).tz_localize(zone) 
     if utc: 
         if dta.tzinfo is None: 
             dta = dta.tz_localize("utc") 
         else: 
             dta = dta.tz_convert("utc") 
     tz_results[mask] = dta 
 tz_results = np.empty(len(result), dtype=object) 
 for zone in unique(timezones): 
     mask = timezones == zone 
     dta = DatetimeArray(result[mask]).tz_localize(zone) 
     if utc: 
         if dta.tzinfo is None: 
             dta = dta.tz_localize("utc") 
         else: 
             dta = dta.tz_convert("utc") 
     tz_results[mask] = dta 
+                        result_timezone[i] = tz
+                        out_local = 0
+                        out_tzoffset = 0
+                    iresult[i] = value
+                    check_dts_bounds(&dts)
+                    continue
+
+            if parse_today_now(val, &iresult[i], utc):
+                continue
+
+            # Some ISO formats can't be parsed by string_to_dts
+            # (for example, 6-digit YYYYMD). If it reported an error,
+            # fall through to the string-matching code below.
+
             # exact matching
             if exact:
                 found = format_regex.match(val)

diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
@@ -2118,10 +2118,7 @@ def objects_to_datetime64ns(
     yearfirst,
     utc: bool = False,
     errors: DateTimeErrorChoices = "raise",
-    require_iso8601: bool = False,
     allow_object: bool = False,
-    format: str | None = None,
-    exact: bool = True,
 ):
     """
     Convert data to array of timestamps.
@@ -2134,7 +2131,6 @@ def objects_to_datetime64ns(
     utc : bool, default False
         Whether to convert/localize timestamps to UTC.
     errors : {'raise', 'ignore', 'coerce'}
-    require_iso8601 : bool, default False
     allow_object : bool
         Whether to return an object-dtype ndarray instead of raising if the
         data contains more than one timezone.
@@ -2165,9 +2161,6 @@ def objects_to_datetime64ns(
             utc=utc,
             dayfirst=dayfirst,
             yearfirst=yearfirst,
-            require_iso8601=require_iso8601,
-            format=format,
-            exact=exact,
         )
         result = result.reshape(data.shape, order=order)
     except OverflowError as err:
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,4 @@
		from numpy cimport int64_t


		cdef bint parse_today_now(str val, int64_t* iresult, bint utc)