From 83f242c5d1acad6e7af4bb6df459b7535a1853e5 Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+DAzVise@users.noreply.github.com> Date: Sat, 20 Apr 2024 14:11:18 +0300 Subject: [PATCH] Fix wrong dateformat guess if first element had May as month --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/tools/datetimes.py | 11 +++++++++++ pandas/tests/tools/test_to_datetime.py | 14 ++++++++++++++ 3 files changed, 26 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index f291820bc5266..f505c1e75cfae 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -356,6 +356,7 @@ Datetimelike - Bug in :class:`Timestamp` constructor failing to raise when ``tz=None`` is explicitly specified in conjunction with timezone-aware ``tzinfo`` or data (:issue:`48688`) - Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`) - Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56382`) +- Bug in :func:`to_datetime` could guess the date format incorrectly if the first element had May as month (:issue:`58328`) - Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`) Timedelta diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index df7a6cdb1ea52..77791eec0bf96 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -128,6 +128,17 @@ def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str if (first_non_null := tslib.first_non_null(arr)) != -1: if type(first_non_nan_element := arr[first_non_null]) is str: # noqa: E721 # GH#32264 np.str_ object + if "may" in first_non_nan_element.lower(): + # GH 58328 + first_non_nan_element = next( + ( + x + for x in arr[first_non_null + 1 :] + if isinstance(x, str) and "may" not in x.lower() + ), + first_non_nan_element, + ) + guessed_format = guess_datetime_format( first_non_nan_element, dayfirst=dayfirst ) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index b59dd194cac27..1a3c94d33e254 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2649,6 +2649,20 @@ def test_guess_datetime_format_for_array_all_nans(self): ) assert format_for_string_of_nans is None + @pytest.mark.parametrize( + "test_list, expected_format", + [ + (["1 May 2023", "1 Apr 2023", "1 Sep 2023"], "%d %b %Y"), + (["1 May 2023", "1 April 2023", "1 September 2023"], "%d %B %Y"), + (["1 May 2023", "2 May 2023", "3 May 2023"], "%d %B %Y"), + ], + ) + def test_guess_datetime_format_for_array_first_element_month_may( + self, test_list, expected_format + ): # GH 58328 + test_array = np.array(test_list, dtype=object) + assert tools._guess_datetime_format_for_array(test_array) == expected_format + class TestToDatetimeInferFormat: @pytest.mark.parametrize(