From 1062e5086ce07b6db542c4d45a78e882a5bda06b Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 22 Feb 2022 18:10:34 +0100 Subject: [PATCH 01/65] Added the default format "%Y-%m-%d %H:%M:%S" as basic format (same as format=None), as suggested by @auderson. --- pandas/_libs/tslib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 5c852e85efdc0..f25a63de49b18 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -133,7 +133,7 @@ def format_array_from_datetime( # if we don't have a format nor tz, then choose # a format based on precision - basic_format = format is None and tz is None + basic_format = (format is None or format == "%Y-%m-%d %H:%M:%S") and tz is None if basic_format: consider_values = values[values != NPY_NAT] show_ns = (consider_values % 1000).any() From 8d0779855c48897bcf1fada0a5885ef196c85e75 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 24 Feb 2022 18:17:02 +0100 Subject: [PATCH 02/65] Created two utils `convert_dtformat` and `get_datetime_fmt_dct` so as to be able to replace strftime faster. The result can be demonstrated using `fast_strftime`. 
--- pandas/core/tools/datetimes.py | 191 +++++++++++++++++++++++++ pandas/tests/io/formats/test_format.py | 73 ++++++++++ 2 files changed, 264 insertions(+) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 33ed64c7ae364..d7a36a6efdeea 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -1269,3 +1269,194 @@ def to_time(arg, format=None, infer_time_format=False, errors="raise"): from pandas.core.tools.times import to_time return to_time(arg, format, infer_time_format, errors) + + +class UnsupportedDatetimeDirective(ValueError): + """The strftime format contains a directive that is not supported in this context.""" + + +def convert_dtformat( + strftime_fmt: str, + new_style_fmt: bool = False, +) -> str: + """Convert a strftime formatting string into a string formatting template string. + + Parameters + ---------- + strftime_fmt : str + The strftime format string specification, e.g. `"%Y-%m-%d %H:%M:%S"`. Note + that not all directives are eligible to successful usage of string formatting. + Unsupported directives will lead to an `UnsupportedDatetimeDirective` being raised. + new_style_fmt : bool, default: False + Whether the output string should be new-style + e.g. "{year}-{month:02d}-{day:02d} {hour:02d}:{min:02d}:{sec:02d}" + or old-style e.g. "%(year)s-%(month)02d-%(day)02d %(hour)02d:%(min)02d:%(sec)02d" + + Returns + ------- + fmt_out : str + A string that may be used to format a `datetime` variable. The style of this string + is either old-style or new-style depending on `new_style_formatting`. + For old-style, it may be used as `fmt_out % get_datetime_fmt_dct(dt)`. + For new-style, it may be used as `fmt_out.format(**get_datetime_fmt_dct(dt))` + + Raises + ------ + UnsupportedDatetimeDirective + Raised when the received `strftime_fmt` format contains a directive for which the output + can not currently be created using string formatting. 
+ + See Also + -------- + - `strftime format codes reference `_ + - `Stackoverflow post `_ explaining + how old-style formatting is faster than new-style formatting, itself faster than + `datetime.strftime`. + """ + non_supported = ( + # All of these below are names and therefore are not in the numpy or datetime attr representation + "%a", # Weekday as locale’s abbreviated name. + "%A", # Weekday as locale’s full name. + "%w", # Weekday as a decimal number, where 0 is Sunday and 6 is Saturday. + "%b", # Month as locale’s abbreviated name. + "%B", # Month as locale’s full name. + # TODO All of those below can probably be derived easily from the numbers on the instance + "%y", # Year without century as a zero-padded decimal number. >> year % 100 + "%I", # Hour (12-hour clock) as a zero-padded decimal number. >> hour % 12 + "%p", # Locale’s equivalent of either AM or PM. >> "pm" if (hour // 12) else "am" + # TODO Below Time offset and timezone information ... but may be hard + "%z", # UTC offset in the form ±HHMM[SS[.ffffff]] (empty string if the object is naive). + "%Z", # Time zone name (empty string if the object is naive). + # We do not want to enter into these below, we do not want to re-create the datetime implementation + "%j", # Day of the year as a zero-padded decimal number. + "%U", # Week number of the year (Sunday as the first day of the week) as a zero-padded decimal number. All days in a new year preceding the first Sunday are considered to be in week 0. + "%W", # Week number of the year (Monday as the first day of the week) as a zero-padded decimal number. All days in a new year preceding the first Monday are considered to be in week 0. + "%c", # Locale’s appropriate date and time representation. + "%x", # Locale’s appropriate date representation. + "%X", # Locale’s appropriate time representation. 
+ ) + + # Raise if unsupported directive found in `strftime_fmt` + for key in non_supported: + if key in strftime_fmt: + raise UnsupportedDatetimeDirective(f"Unsupported datetime formatting directive: {key!r}") + + # Mapping between strftime and string formatting, according to both styles + if new_style_fmt: + supported = { + "%d": "{day:02d}", # Day of the month as a zero-padded decimal number. + "%m": "{month:02d}", # Month as a zero-padded decimal number. + "%Y": "{year}", # Year with century as a decimal number. + "%H": "{hour:02d}", # Hour (24-hour clock) as a zero-padded decimal number. + "%M": "{min:02d}", # Minute as a zero-padded decimal number. + "%S": "{sec:02d}", # Second as a zero-padded decimal number. + "%f": "{us:06d}", # Microsecond as a decimal number, zero-padded to 6 digits. + } + else: + supported = { + "%d": "%(day)02d", # Day of the month as a zero-padded decimal number. + "%m": "%(month)02d", # Month as a zero-padded decimal number. + "%Y": "%(year)s", # Year with century as a decimal number. + "%H": "%(hour)02d", # Hour (24-hour clock) as a zero-padded decimal number. + "%M": "%(min)02d", # Minute as a zero-padded decimal number. + "%S": "%(sec)02d", # Second as a zero-padded decimal number. + "%f": "%(us)06d", # Microsecond as a decimal number, zero-padded to 6 digits. + } + + # Create the output by replacing all directives + for key, replacement in supported.items(): + strftime_fmt = strftime_fmt.replace(key, replacement) + + return strftime_fmt + + +def get_datetime_fmt_dct(dt: datetime): + """Create the dict required to use the `convert_dtformat` output to format `dt`. + + `convert_dtformat` returns a formatting string (new style or old style). + + Applyingthis formatting string to a datetime object requires to create a dictionary + containing the various fields used in the formatting string. This is what `get_datetime_fmt_dct` does. 
+ + ```python + # new-style string formatting + new_style_format = convert_dtformat(strftime_fmt, new_style_fmt=True) + new_style_res = new_style_format.format(**fmt_dct) + + # old-style string formatting + old_style_format = convert_dtformat(strftime_fmt) + old_style_res = old_style_format % fmt_dct + ``` + + TODO raise an error if not tz naive and not utc + See datetime.datetime.tzinfo. + + Parameters + ---------- + dt : datetime + A `datetime` instance for which we wish to apply a formatting string + created using `convert_dtformat`. + + Returns + ------- + fmt_dct : dict + A dictionary created from `dt` and containing all the keys required by the + formatting operation. + + See Also + -------- + `datetime attributes `_ + """ + return dict( + year=dt.year, month=dt.month, day=dt.day, hour=dt.hour, + min=dt.minute, sec=dt.second, us=dt.microsecond + ) + + +def fast_strftime( + dt: datetime, + fmt: str, + new_style_fmt: bool = False, +) -> str: + """A faster version of `datetime.strftime` using python string formatting. + + Returns a string representing the date and time, controlled by an explicit format string. + For a complete list of formatting directives, see + `strftime() and strptime() Behavior `_. + + Parameters + ---------- + dt : datetime + The `datetime` instance to convert. + fmt : str + The format string + new_style_fmt : bool, default: False + A boolean indicating if old- or new-style python string formatting should be used. + By default the old-style formatting is enabled, as it is faster as of python <= 3.9. + + Returns + ------- + out_str : str + A string representing the datetime using the format specified. + + Raises + ------ + UnsupportedDatetimeDirective + Raised when the received `strftime_fmt` format contains a directive for which the output + can not currently be created using string formatting. 
+ + See Also + -------- + - `strftime format codes reference `_ + - `Stackoverflow post `_ explaining + how old-style formatting is faster than new-style formatting, itself faster than + `datetime.strftime`. + """ + # common dict used for formatting + fmt_dct = get_datetime_fmt_dct(dt) + + # get the formatting template + fmt_str = convert_dtformat(fmt, new_style_fmt=new_style_fmt) + + # execute + return fmt_str.format(**fmt_dct) if new_style_fmt else (fmt_str % fmt_dct) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index adcaeba5cfd8d..290aa06cf8d03 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -1,6 +1,7 @@ """ Test output formatting for Series/DataFrame, including to_string & reprs """ +from timeit import Timer from datetime import datetime from io import StringIO @@ -22,6 +23,11 @@ IS64, is_platform_windows, ) +from pandas.core.tools.datetimes import ( + convert_dtformat, + get_datetime_fmt_dct, + fast_strftime +) import pandas.util._test_decorators as td import pandas as pd @@ -3078,6 +3084,73 @@ def test_zero(self): assert result[0].strip() == "'0 days'" +class TestDatetimeFastFormatter: + @pytest.mark.parametrize("strftime_format", ( + "%Y-%m-%d %H:%M:%S", + "%Y %Y", + "%Y-%m-%dT%H:%M:%S.fZ", + )) + @pytest.mark.parametrize("new_style", (False, True)) + def test_fast_strftime_basic(self, strftime_format, new_style): + """Test that formatting standard `datetime` objects with our utils works as good as strftime.""" + + # create a datetime instance + dt = datetime.now() + + # strftime + strftime_res = dt.strftime(strftime_format) + + # common dict used for formatting + fmt_dct = get_datetime_fmt_dct(dt) + + # get the formatting string + style_format = convert_dtformat(strftime_format, new_style_fmt=new_style) + + # apply it and check the output + if new_style: + res = style_format.format(**fmt_dct) + else: + res = style_format % fmt_dct + assert res == 
strftime_res + + # fast_strftime is a shortcut function for the above + res2 = fast_strftime(dt, strftime_format, new_style_fmt=new_style) + assert res2 == res + + @pytest.mark.parametrize("strftime_format", ( + "%Y-%m-%d %H:%M:%S", + "%Y %Y", + "%Y-%m-%dT%H:%M:%S.fZ", + )) + def test_fast_strftime_perf(self, strftime_format): + """Test that formatting standard `datetime` objects with our utils is faster than strftime.""" + + # create a datetime instance + dt = datetime.now() + + # pre-compute the formats + new_style_format = convert_dtformat(strftime_format, new_style_fmt=True) + old_style_format = convert_dtformat(strftime_format) + + # Perf comparison: confirm the results in https://stackoverflow.com/a/43495629/7262247 + glob = globals() + glob.update(locals()) + strftime_best = min( + Timer("dt.strftime(strftime_format)", globals=glob).repeat(7, 1000) + ) + # Out[3]: 0.0062 + new_style_best = min( + Timer("new_style_format.format(**get_datetime_fmt_dct(dt))", globals=glob).repeat(7, 1000) + ) + # Out[4]: 0.0018 if get_datetime_fmt_dct is pre-evaluated, 0.0036 otherwise + old_style_best = min( + Timer("old_style_format % get_datetime_fmt_dct(dt)", globals=glob).repeat(7, 1000) + ) + # Out[5]: 0.0012 if get_datetime_fmt_dct is pre-evaluated, 0.0030 otherwise + assert new_style_best < strftime_best # much better + assert old_style_best < new_style_best # even better ! 
+ + class TestDatetime64Formatter: def test_mixed(self): x = Series([datetime(2013, 1, 1), datetime(2013, 1, 1, 12), NaT]) From 4b2380e06f1b5a37204a33d11851358078a40f42 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 24 Feb 2022 18:19:23 +0100 Subject: [PATCH 03/65] Fixed two pep8 --- pandas/tests/io/formats/test_format.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 290aa06cf8d03..d0cf70b341073 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3086,9 +3086,9 @@ def test_zero(self): class TestDatetimeFastFormatter: @pytest.mark.parametrize("strftime_format", ( - "%Y-%m-%d %H:%M:%S", - "%Y %Y", - "%Y-%m-%dT%H:%M:%S.fZ", + "%Y-%m-%d %H:%M:%S", + "%Y %Y", + "%Y-%m-%dT%H:%M:%S.fZ", )) @pytest.mark.parametrize("new_style", (False, True)) def test_fast_strftime_basic(self, strftime_format, new_style): @@ -3118,9 +3118,9 @@ def test_fast_strftime_basic(self, strftime_format, new_style): assert res2 == res @pytest.mark.parametrize("strftime_format", ( - "%Y-%m-%d %H:%M:%S", - "%Y %Y", - "%Y-%m-%dT%H:%M:%S.fZ", + "%Y-%m-%d %H:%M:%S", + "%Y %Y", + "%Y-%m-%dT%H:%M:%S.fZ", )) def test_fast_strftime_perf(self, strftime_format): """Test that formatting standard `datetime` objects with our utils is faster than strftime.""" From 6a3708e58fc8f9d304e2b4ed05dc09a911d0d65d Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 28 Feb 2022 15:05:48 +0100 Subject: [PATCH 04/65] Added new `strftime` module in tslibs, containing `convert_dtformat` and `UnsupportedDatetimeDirective`. Modified `format_array_from_datetime` to use it when `fast_strftime=True` (new arg). 
Removed `get_datetime_fmt_dct` --- pandas/_libs/tslib.pyi | 1 + pandas/_libs/tslib.pyx | 41 ++++++- pandas/_libs/tslibs/__init__.py | 6 + pandas/_libs/tslibs/strftime.py | 99 ++++++++++++++++ pandas/_libs/tslibs/timestamps.pyx | 2 +- pandas/core/tools/datetimes.py | 151 ++----------------------- pandas/tests/io/formats/test_format.py | 13 ++- 7 files changed, 161 insertions(+), 152 deletions(-) create mode 100644 pandas/_libs/tslibs/strftime.py diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index 4b02235ac9925..97d6311c62d5b 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -9,6 +9,7 @@ def format_array_from_datetime( tz: tzinfo | None = ..., format: str | None = ..., na_rep: object = ..., + fast_strftime: bool = ..., ) -> npt.NDArray[np.object_]: ... def array_with_unit_to_datetime( values: np.ndarray, diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index f25a63de49b18..0e35da532e02f 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -62,6 +62,11 @@ from pandas._libs.tslibs.timestamps cimport _Timestamp from pandas._libs.tslibs.timestamps import Timestamp +from pandas._libs.tslibs.strftime import ( + convert_dtformat, + UnsupportedDatetimeDirective, +) + # Note: this is the only non-tslibs intra-pandas dependency here from pandas._libs.missing cimport checknull_with_nat_and_na @@ -101,7 +106,8 @@ def format_array_from_datetime( ndarray[int64_t] values, tzinfo tz=None, str format=None, - object na_rep=None + object na_rep=None, + fast_strftime=True, ) -> np.ndarray: """ return a np object array of the string formatted values @@ -114,6 +120,9 @@ def format_array_from_datetime( a strftime capable string na_rep : optional, default is None a nat format + fast_strftime : bool, default True + If `True` (default) and the format permits it, a faster formatting + method will be used. See `convert_dtformat`. 
Returns ------- @@ -133,7 +142,7 @@ def format_array_from_datetime( # if we don't have a format nor tz, then choose # a format based on precision - basic_format = (format is None or format == "%Y-%m-%d %H:%M:%S") and tz is None + basic_format = format is None and tz is None if basic_format: consider_values = values[values != NPY_NAT] show_ns = (consider_values % 1000).any() @@ -146,6 +155,24 @@ def format_array_from_datetime( consider_values //= 1000 show_ms = (consider_values % 1000).any() + elif format == "%Y-%m-%d %H:%M:%S": + # Same format as default, but with hardcoded precision (s) + basic_format = True + show_ns = show_us = show_ms = False + + elif format == "%Y-%m-%d %H:%M:%S.%f": + # Same format as default, but with hardcoded precision (us) + basic_format = show_us = True + show_ns = show_ms = False + + elif fast_strftime: + try: + # Try to get the string formatting template for this format + str_format = convert_dtformat(format) + except UnsupportedDatetimeDirective: + # Unsupported directive: fallback to standard `strftime` + fast_strftime = False + for i in range(N): val = values[i] @@ -167,6 +194,16 @@ def format_array_from_datetime( result[i] = res + elif fast_strftime: + + dt64_to_dtstruct(val, &dts) + + # Use string formatting for faster strftime + result[i] = str_format % dict( + year=dts.year, month=dts.month, day=dts.day, hour=dts.hour, + min=dts.min, sec=dts.sec, us=dts.us + ) + else: ts = Timestamp(val, tz=tz) diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index 11de4e60f202d..eb09063185442 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -9,6 +9,8 @@ "OutOfBoundsTimedelta", "IncompatibleFrequency", "Period", + "convert_dtformat", + "UnsupportedDatetimeDirective", "Resolution", "Timedelta", "normalize_i8_timestamps", @@ -48,6 +50,10 @@ IncompatibleFrequency, Period, ) +from pandas._libs.tslibs.strftime import ( + convert_dtformat, + UnsupportedDatetimeDirective, +) from 
pandas._libs.tslibs.timedeltas import ( Timedelta, delta_to_nanoseconds, diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py new file mode 100644 index 0000000000000..9dfc07a730253 --- /dev/null +++ b/pandas/_libs/tslibs/strftime.py @@ -0,0 +1,99 @@ +"""Strftime-related classes and functions. +""" +class UnsupportedDatetimeDirective(ValueError): + """The strftime format contains a directive that is not supported in this context.""" + + +def convert_dtformat( + strftime_fmt: str, + new_style_fmt: bool = False, +) -> str: + """Convert a strftime formatting string into a string formatting template string. + + Parameters + ---------- + strftime_fmt : str + The strftime format string specification, e.g. `"%Y-%m-%d %H:%M:%S"`. Note + that not all directives are eligible to successful usage of string formatting. + Unsupported directives will lead to an `UnsupportedDatetimeDirective` being raised. + new_style_fmt : bool, default: False + Whether the output string should be new-style + e.g. "{year}-{month:02d}-{day:02d} {hour:02d}:{min:02d}:{sec:02d}" + or old-style e.g. "%(year)s-%(month)02d-%(day)02d %(hour)02d:%(min)02d:%(sec)02d" + + Returns + ------- + fmt_out : str + A string that may be used to format a `datetime` variable. The style of this string + is either old-style or new-style depending on `new_style_formatting`. + For old-style, it may be used as `fmt_out % get_datetime_fmt_dct(dt)`. + For new-style, it may be used as `fmt_out.format(**get_datetime_fmt_dct(dt))` + + Raises + ------ + UnsupportedDatetimeDirective + Raised when the received `strftime_fmt` format contains a directive for which the output + can not currently be created using string formatting. + + See Also + -------- + - `strftime format codes reference `_ + - `Stackoverflow post `_ explaining + how old-style formatting is faster than new-style formatting, itself faster than + `datetime.strftime`. 
+ """ + non_supported = ( + # All of these below are names and therefore are not in the numpy or datetime attr representation + "%a", # Weekday as locale’s abbreviated name. + "%A", # Weekday as locale’s full name. + "%w", # Weekday as a decimal number, where 0 is Sunday and 6 is Saturday. + "%b", # Month as locale’s abbreviated name. + "%B", # Month as locale’s full name. + # TODO All of those below can probably be derived easily from the numbers on the instance + "%y", # Year without century as a zero-padded decimal number. >> year % 100 + "%I", # Hour (12-hour clock) as a zero-padded decimal number. >> hour % 12 + "%p", # Locale’s equivalent of either AM or PM. >> "pm" if (hour // 12) else "am" + # TODO Below Time offset and timezone information ... but may be hard + "%z", # UTC offset in the form ±HHMM[SS[.ffffff]] (empty string if the object is naive). + "%Z", # Time zone name (empty string if the object is naive). + # We do not want to enter into these below, we do not want to re-create the datetime implementation + "%j", # Day of the year as a zero-padded decimal number. + "%U", # Week number of the year (Sunday as the first day of the week) as a zero-padded decimal number. All days in a new year preceding the first Sunday are considered to be in week 0. + "%W", # Week number of the year (Monday as the first day of the week) as a zero-padded decimal number. All days in a new year preceding the first Monday are considered to be in week 0. + "%c", # Locale’s appropriate date and time representation. + "%x", # Locale’s appropriate date representation. + "%X", # Locale’s appropriate time representation. 
+ ) + + # Raise if unsupported directive found in `strftime_fmt` + for key in non_supported: + if key in strftime_fmt: + raise UnsupportedDatetimeDirective(f"Unsupported datetime formatting directive: {key!r}") + + # Mapping between strftime and string formatting, according to both styles + if new_style_fmt: + supported = { + "%d": "{day:02d}", # Day of the month as a zero-padded decimal number. + "%m": "{month:02d}", # Month as a zero-padded decimal number. + "%Y": "{year}", # Year with century as a decimal number. + "%H": "{hour:02d}", # Hour (24-hour clock) as a zero-padded decimal number. + "%M": "{min:02d}", # Minute as a zero-padded decimal number. + "%S": "{sec:02d}", # Second as a zero-padded decimal number. + "%f": "{us:06d}", # Microsecond as a decimal number, zero-padded to 6 digits. + } + else: + supported = { + "%d": "%(day)02d", # Day of the month as a zero-padded decimal number. + "%m": "%(month)02d", # Month as a zero-padded decimal number. + "%Y": "%(year)s", # Year with century as a decimal number. + "%H": "%(hour)02d", # Hour (24-hour clock) as a zero-padded decimal number. + "%M": "%(min)02d", # Minute as a zero-padded decimal number. + "%S": "%(sec)02d", # Second as a zero-padded decimal number. + "%f": "%(us)06d", # Microsecond as a decimal number, zero-padded to 6 digits. 
+ } + + # Create the output by replacing all directives + for key, replacement in supported.items(): + strftime_fmt = strftime_fmt.replace(key, replacement) + + return strftime_fmt diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index aad49bd70b120..27697da3ed88c 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -819,7 +819,7 @@ cdef class _Timestamp(ABCTimestamp): @property def _date_repr(self) -> str: # Ideal here would be self.strftime("%Y-%m-%d"), but - # the datetime strftime() methods require year >= 1900 + # the datetime strftime() methods require year >= 1900 and is slower return f'{self.year}-{self.month:02d}-{self.day:02d}' @property diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index d7a36a6efdeea..dca88e9418ad0 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -28,6 +28,7 @@ nat_strings, parsing, timezones, + convert_dtformat, ) from pandas._libs.tslibs.parsing import ( # noqa:F401 DateParseError, @@ -1271,148 +1272,6 @@ def to_time(arg, format=None, infer_time_format=False, errors="raise"): return to_time(arg, format, infer_time_format, errors) -class UnsupportedDatetimeDirective(ValueError): - """The strftime format contains a directive that is not supported in this context.""" - - -def convert_dtformat( - strftime_fmt: str, - new_style_fmt: bool = False, -) -> str: - """Convert a strftime formatting string into a string formatting template string. - - Parameters - ---------- - strftime_fmt : str - The strftime format string specification, e.g. `"%Y-%m-%d %H:%M:%S"`. Note - that not all directives are eligible to successful usage of string formatting. - Unsupported directives will lead to an `UnsupportedDatetimeDirective` being raised. - new_style_fmt : bool, default: False - Whether the output string should be new-style - e.g. "{year}-{month:02d}-{day:02d} {hour:02d}:{min:02d}:{sec:02d}" - or old-style e.g. 
"%(year)s-%(month)02d-%(day)02d %(hour)02d:%(min)02d:%(sec)02d" - - Returns - ------- - fmt_out : str - A string that may be used to format a `datetime` variable. The style of this string - is either old-style or new-style depending on `new_style_formatting`. - For old-style, it may be used as `fmt_out % get_datetime_fmt_dct(dt)`. - For new-style, it may be used as `fmt_out.format(**get_datetime_fmt_dct(dt))` - - Raises - ------ - UnsupportedDatetimeDirective - Raised when the received `strftime_fmt` format contains a directive for which the output - can not currently be created using string formatting. - - See Also - -------- - - `strftime format codes reference `_ - - `Stackoverflow post `_ explaining - how old-style formatting is faster than new-style formatting, itself faster than - `datetime.strftime`. - """ - non_supported = ( - # All of these below are names and therefore are not in the numpy or datetime attr representation - "%a", # Weekday as locale’s abbreviated name. - "%A", # Weekday as locale’s full name. - "%w", # Weekday as a decimal number, where 0 is Sunday and 6 is Saturday. - "%b", # Month as locale’s abbreviated name. - "%B", # Month as locale’s full name. - # TODO All of those below can probably be derived easily from the numbers on the instance - "%y", # Year without century as a zero-padded decimal number. >> year % 100 - "%I", # Hour (12-hour clock) as a zero-padded decimal number. >> hour % 12 - "%p", # Locale’s equivalent of either AM or PM. >> "pm" if (hour // 12) else "am" - # TODO Below Time offset and timezone information ... but may be hard - "%z", # UTC offset in the form ±HHMM[SS[.ffffff]] (empty string if the object is naive). - "%Z", # Time zone name (empty string if the object is naive). - # We do not want to enter into these below, we do not want to re-create the datetime implementation - "%j", # Day of the year as a zero-padded decimal number. 
- "%U", # Week number of the year (Sunday as the first day of the week) as a zero-padded decimal number. All days in a new year preceding the first Sunday are considered to be in week 0. - "%W", # Week number of the year (Monday as the first day of the week) as a zero-padded decimal number. All days in a new year preceding the first Monday are considered to be in week 0. - "%c", # Locale’s appropriate date and time representation. - "%x", # Locale’s appropriate date representation. - "%X", # Locale’s appropriate time representation. - ) - - # Raise if unsupported directive found in `strftime_fmt` - for key in non_supported: - if key in strftime_fmt: - raise UnsupportedDatetimeDirective(f"Unsupported datetime formatting directive: {key!r}") - - # Mapping between strftime and string formatting, according to both styles - if new_style_fmt: - supported = { - "%d": "{day:02d}", # Day of the month as a zero-padded decimal number. - "%m": "{month:02d}", # Month as a zero-padded decimal number. - "%Y": "{year}", # Year with century as a decimal number. - "%H": "{hour:02d}", # Hour (24-hour clock) as a zero-padded decimal number. - "%M": "{min:02d}", # Minute as a zero-padded decimal number. - "%S": "{sec:02d}", # Second as a zero-padded decimal number. - "%f": "{us:06d}", # Microsecond as a decimal number, zero-padded to 6 digits. - } - else: - supported = { - "%d": "%(day)02d", # Day of the month as a zero-padded decimal number. - "%m": "%(month)02d", # Month as a zero-padded decimal number. - "%Y": "%(year)s", # Year with century as a decimal number. - "%H": "%(hour)02d", # Hour (24-hour clock) as a zero-padded decimal number. - "%M": "%(min)02d", # Minute as a zero-padded decimal number. - "%S": "%(sec)02d", # Second as a zero-padded decimal number. - "%f": "%(us)06d", # Microsecond as a decimal number, zero-padded to 6 digits. 
- } - - # Create the output by replacing all directives - for key, replacement in supported.items(): - strftime_fmt = strftime_fmt.replace(key, replacement) - - return strftime_fmt - - -def get_datetime_fmt_dct(dt: datetime): - """Create the dict required to use the `convert_dtformat` output to format `dt`. - - `convert_dtformat` returns a formatting string (new style or old style). - - Applyingthis formatting string to a datetime object requires to create a dictionary - containing the various fields used in the formatting string. This is what `get_datetime_fmt_dct` does. - - ```python - # new-style string formatting - new_style_format = convert_dtformat(strftime_fmt, new_style_fmt=True) - new_style_res = new_style_format.format(**fmt_dct) - - # old-style string formatting - old_style_format = convert_dtformat(strftime_fmt) - old_style_res = old_style_format % fmt_dct - ``` - - TODO raise an error if not tz naive and not utc - See datetime.datetime.tzinfo. - - Parameters - ---------- - dt : datetime - A `datetime` instance for which we wish to apply a formatting string - created using `convert_dtformat`. - - Returns - ------- - fmt_dct : dict - A dictionary created from `dt` and containing all the keys required by the - formatting operation. - - See Also - -------- - `datetime attributes `_ - """ - return dict( - year=dt.year, month=dt.month, day=dt.day, hour=dt.hour, - min=dt.minute, sec=dt.second, us=dt.microsecond - ) - - def fast_strftime( dt: datetime, fmt: str, @@ -1420,6 +1279,9 @@ def fast_strftime( ) -> str: """A faster version of `datetime.strftime` using python string formatting. + Note: this is used in tests, not in the pandas API. + TODO since this makes especially a lot of sense for arrays, maybe this should work on arrays. + Returns a string representing the date and time, controlled by an explicit format string. For a complete list of formatting directives, see `strftime() and strptime() Behavior `_. 
@@ -1453,7 +1315,10 @@ def fast_strftime( `datetime.strftime`. """ # common dict used for formatting - fmt_dct = get_datetime_fmt_dct(dt) + fmt_dct = dict( + year=dt.year, month=dt.month, day=dt.day, hour=dt.hour, + min=dt.minute, sec=dt.second, us=dt.microsecond + ) # get the formatting template fmt_str = convert_dtformat(fmt, new_style_fmt=new_style_fmt) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index d0cf70b341073..b32a2ebc121a2 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -19,15 +19,13 @@ import pytest import pytz +from pandas._libs.tslibs import convert_dtformat + from pandas.compat import ( IS64, is_platform_windows, ) -from pandas.core.tools.datetimes import ( - convert_dtformat, - get_datetime_fmt_dct, - fast_strftime -) +from pandas.core.tools.datetimes import fast_strftime import pandas.util._test_decorators as td import pandas as pd @@ -3101,7 +3099,10 @@ def test_fast_strftime_basic(self, strftime_format, new_style): strftime_res = dt.strftime(strftime_format) # common dict used for formatting - fmt_dct = get_datetime_fmt_dct(dt) + fmt_dct = dict( + year=dt.year, month=dt.month, day=dt.day, hour=dt.hour, + min=dt.minute, sec=dt.second, us=dt.microsecond + ) # get the formatting string style_format = convert_dtformat(strftime_format, new_style_fmt=new_style) From 93f0dbe33985839b6dcd5e94f25de7483f041588 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 28 Feb 2022 15:23:29 +0100 Subject: [PATCH 05/65] Modified `DatetimeLikeArrayMixin._format_native_types` so that it accepts `fast_strftime`. Modified `PeriodArray._format_native_types` so that it benefits from `fast_strftime` too. 
--- pandas/core/arrays/datetimelike.py | 12 +++++++++--- pandas/core/arrays/datetimes.py | 5 +++-- pandas/core/arrays/period.py | 22 ++++++++++++++++++++-- pandas/core/arrays/timedeltas.py | 4 +++- 4 files changed, 35 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 483878706db75..90f936359d183 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -294,7 +294,7 @@ def asi8(self) -> npt.NDArray[np.int64]: # ---------------------------------------------------------------- # Rendering Methods - def _format_native_types(self, *, na_rep="NaT", date_format=None): + def _format_native_types(self, *, na_rep="NaT", date_format=None, fast_strftime=True): """ Helper method for astype when converting to strings. @@ -1589,7 +1589,7 @@ class DatelikeOps(DatetimeLikeArrayMixin): URL="https://docs.python.org/3/library/datetime.html" "#strftime-and-strptime-behavior" ) - def strftime(self, date_format: str) -> npt.NDArray[np.object_]: + def strftime(self, date_format: str, fast_strftime: bool = True) -> npt.NDArray[np.object_]: """ Convert to Index using specified date_format. @@ -1603,6 +1603,12 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: date_format : str Date format string (e.g. "%%Y-%%m-%%d"). + fast_strftime : boolean, default: True + if `True` (default) and the format permits it, a faster formatting + method will be used. See `convert_dtformat`. + + .. 
versionadded:: 1.5.0 + Returns ------- ndarray[object] @@ -1624,7 +1630,7 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: 'March 10, 2018, 09:00:02 AM'], dtype='object') """ - result = self._format_native_types(date_format=date_format, na_rep=np.nan) + result = self._format_native_types(date_format=date_format, na_rep=np.nan, fast_strftime=fast_strftime) return result.astype(object, copy=False) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index e21f2e9d7b46e..3100c201d85ac 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -673,14 +673,15 @@ def astype(self, dtype, copy: bool = True): @dtl.ravel_compat def _format_native_types( - self, *, na_rep="NaT", date_format=None, **kwargs + self, *, na_rep="NaT", date_format=None, fast_strftime=True, **kwargs ) -> npt.NDArray[np.object_]: from pandas.io.formats.format import get_format_datetime64_from_values fmt = get_format_datetime64_from_values(self, date_format) return tslib.format_array_from_datetime( - self.asi8, tz=self.tz, format=fmt, na_rep=na_rep + self.asi8, tz=self.tz, format=fmt, na_rep=na_rep, + fast_strftime=fast_strftime, ) # ----------------------------------------------------------------- diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 6189584dff7f1..f391491234af0 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -25,6 +25,8 @@ parsing, period as libperiod, to_offset, + convert_dtformat, + UnsupportedDatetimeDirective, ) from pandas._libs.tslibs.dtypes import FreqGroup from pandas._libs.tslibs.fields import isleapyear_arr @@ -633,7 +635,7 @@ def _formatter(self, boxed: bool = False): @dtl.ravel_compat def _format_native_types( - self, *, na_rep="NaT", date_format=None, **kwargs + self, *, na_rep="NaT", date_format=None, fast_strftime=True, **kwargs ) -> np.ndarray: """ actually format my specific types @@ -641,7 +643,23 @@ def _format_native_types( values = 
self.astype(object) if date_format: - formatter = lambda dt: dt.strftime(date_format) + if fast_strftime: + try: + # Try to get the string formatting template for this format + str_format = convert_dtformat(date_format) + except UnsupportedDatetimeDirective: + # Unsupported directive: fallback to standard `strftime` + fast_strftime = False + + if fast_strftime: + # Faster: python old-style string formatting + formatter = lambda dt: str_format % dict( + year=dt.year, month=dt.month, day=dt.day, hour=dt.hour, + min=dt.min, sec=dt.sec, us=dt.us + ) + else: + # Slower: strftime + formatter = lambda dt: dt.strftime(date_format) else: formatter = lambda dt: str(dt) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index eb195e4facf4a..834351ea1a20e 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -426,10 +426,12 @@ def _formatter(self, boxed: bool = False): @dtl.ravel_compat def _format_native_types( - self, *, na_rep="NaT", date_format=None, **kwargs + self, *, na_rep="NaT", date_format=None, fast_strftime=True, **kwargs ) -> np.ndarray: from pandas.io.formats.format import get_format_timedelta64 + # Note: TimeDelta._repr_base already uses `fast_strftime` built-in + # (and does not take the `date_format` arg into account) formatter = get_format_timedelta64(self._ndarray, na_rep) return np.array([formatter(x) for x in self._ndarray]) From 5b9dd1ba58ac126ec79e667f6bd086222bc58ea7 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 28 Feb 2022 16:55:03 +0100 Subject: [PATCH 06/65] Added a `fast_strftime` argument to all formatters: `Datetime64Formatter`, `Datetime64TZFormatter` and `DatetimeIndex`. 
--- pandas/core/indexes/datetimelike.py | 7 +++--- pandas/core/indexes/datetimes.py | 4 ++-- pandas/io/formats/format.py | 35 ++++++++++++++++++++++++----- 3 files changed, 36 insertions(+), 10 deletions(-) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 2731c9ab447b7..db961acdc9919 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -160,6 +160,7 @@ def format( formatter: Callable | None = None, na_rep: str = "NaT", date_format: str | None = None, + fast_strftime: bool = True, ) -> list[str]: """ Render a string representation of the Index. @@ -175,14 +176,14 @@ def format( if formatter is not None: return header + list(self.map(formatter)) - return self._format_with_header(header, na_rep=na_rep, date_format=date_format) + return self._format_with_header(header, na_rep=na_rep, date_format=date_format, fast_strftime=fast_strftime) def _format_with_header( - self, header: list[str], na_rep: str = "NaT", date_format: str | None = None + self, header: list[str], na_rep: str = "NaT", date_format: str | None = None, fast_strftime: bool = True, ) -> list[str]: # matches base class except for whitespace padding and date_format return header + list( - self._format_native_types(na_rep=na_rep, date_format=date_format) + self._format_native_types(na_rep=na_rep, date_format=date_format, fast_strftime=fast_strftime) ) @property diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 004d860b20a6f..b827f05731086 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -259,8 +259,8 @@ class DatetimeIndex(DatetimeTimedeltaMixin): # methods that dispatch to DatetimeArray and wrap result @doc(DatetimeArray.strftime) - def strftime(self, date_format) -> Index: - arr = self._data.strftime(date_format) + def strftime(self, date_format, fast_strftime : bool = True) -> Index: + arr = self._data.strftime(date_format, 
fast_strftime=fast_strftime) return Index(arr, name=self.name) @doc(DatetimeArray.tz_convert) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 4bc45e290ce4a..5b69b7b75861c 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -43,6 +43,8 @@ Timedelta, Timestamp, iNaT, + convert_dtformat, + UnsupportedDatetimeDirective, ) from pandas._libs.tslibs.nattype import NaTType from pandas._typing import ( @@ -1613,11 +1615,13 @@ def __init__( values: np.ndarray | Series | DatetimeIndex | DatetimeArray, nat_rep: str = "NaT", date_format: None = None, + fast_strftime: bool = True, **kwargs, ): super().__init__(values, **kwargs) self.nat_rep = nat_rep self.date_format = date_format + self.fast_strftime = fast_strftime def _format_strings(self) -> list[str]: """we by definition have DO NOT have a TZ""" @@ -1630,7 +1634,7 @@ def _format_strings(self) -> list[str]: return [self.formatter(x) for x in values] fmt_values = values._data._format_native_types( - na_rep=self.nat_rep, date_format=self.date_format + na_rep=self.nat_rep, date_format=self.date_format, fast_strftime=self.fast_strftime ) return fmt_values.tolist() @@ -1766,12 +1770,21 @@ def _format_datetime64_dateonly( x: NaTType | Timestamp, nat_rep: str = "NaT", date_format: str | None = None, + str_date_fmt: str | None = None, ) -> str: if x is NaT: return nat_rep if date_format: - return x.strftime(date_format) + if str_date_fmt: + # Faster, using string formatting + return str_date_fmt % dict( + year=x.year, month=x.month, day=x.day, hour=x.hour, + min=x.min, sec=x.sec, us=x.us + ) + else: + # Slower + return x.strftime(date_format) else: # error: Item "NaTType" of "Union[NaTType, Any]" has no attribute "_date_repr" # The underlying problem here is that mypy doesn't understand that NaT @@ -1780,12 +1793,24 @@ def _format_datetime64_dateonly( def get_format_datetime64( - is_dates_only: bool, nat_rep: str = "NaT", date_format: str | None = None + is_dates_only: bool, 
nat_rep: str = "NaT", date_format: str | None = None, fast_strftime : bool = True, ) -> Callable: + """Return a formatter callable taking a datetime64 as input and providing + a string as output""" if is_dates_only: + str_date_fmt = None + if date_format is not None and fast_strftime: + try: + # Try to get the string formatting template for this format + str_date_fmt = convert_dtformat(date_format) + except UnsupportedDatetimeDirective: + # Unsupported directive: fallback to standard `strftime` + pass + return lambda x: _format_datetime64_dateonly( - x, nat_rep=nat_rep, date_format=date_format + x, nat_rep=nat_rep, date_format=date_format, + str_date_fmt=str_date_fmt ) else: return lambda x: _format_datetime64(x, nat_rep=nat_rep) @@ -1812,7 +1837,7 @@ def _format_strings(self) -> list[str]: values = self.values.astype(object) ido = is_dates_only(values) formatter = self.formatter or get_format_datetime64( - ido, date_format=self.date_format + ido, date_format=self.date_format, fast_strftime=self.fast_strftime ) fmt_values = [formatter(x) for x in values] From 8328febf4038ef2076eed0dd720e6d3371ace286 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 28 Feb 2022 16:55:51 +0100 Subject: [PATCH 07/65] Added a test in `TestDatetimeFastFormatter` so that it tests performance on arrays --- pandas/tests/io/formats/test_format.py | 192 ++++++++++++++++--------- 1 file changed, 122 insertions(+), 70 deletions(-) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index b32a2ebc121a2..dbdb0951f5350 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3082,76 +3082,6 @@ def test_zero(self): assert result[0].strip() == "'0 days'" -class TestDatetimeFastFormatter: - @pytest.mark.parametrize("strftime_format", ( - "%Y-%m-%d %H:%M:%S", - "%Y %Y", - "%Y-%m-%dT%H:%M:%S.fZ", - )) - @pytest.mark.parametrize("new_style", (False, True)) - def test_fast_strftime_basic(self, strftime_format, 
new_style): - """Test that formatting standard `datetime` objects with our utils works as good as strftime.""" - - # create a datetime instance - dt = datetime.now() - - # strftime - strftime_res = dt.strftime(strftime_format) - - # common dict used for formatting - fmt_dct = dict( - year=dt.year, month=dt.month, day=dt.day, hour=dt.hour, - min=dt.minute, sec=dt.second, us=dt.microsecond - ) - - # get the formatting string - style_format = convert_dtformat(strftime_format, new_style_fmt=new_style) - - # apply it and check the output - if new_style: - res = style_format.format(**fmt_dct) - else: - res = style_format % fmt_dct - assert res == strftime_res - - # fast_strftime is a shortcut function for the above - res2 = fast_strftime(dt, strftime_format, new_style_fmt=new_style) - assert res2 == res - - @pytest.mark.parametrize("strftime_format", ( - "%Y-%m-%d %H:%M:%S", - "%Y %Y", - "%Y-%m-%dT%H:%M:%S.fZ", - )) - def test_fast_strftime_perf(self, strftime_format): - """Test that formatting standard `datetime` objects with our utils is faster than strftime.""" - - # create a datetime instance - dt = datetime.now() - - # pre-compute the formats - new_style_format = convert_dtformat(strftime_format, new_style_fmt=True) - old_style_format = convert_dtformat(strftime_format) - - # Perf comparison: confirm the results in https://stackoverflow.com/a/43495629/7262247 - glob = globals() - glob.update(locals()) - strftime_best = min( - Timer("dt.strftime(strftime_format)", globals=glob).repeat(7, 1000) - ) - # Out[3]: 0.0062 - new_style_best = min( - Timer("new_style_format.format(**get_datetime_fmt_dct(dt))", globals=glob).repeat(7, 1000) - ) - # Out[4]: 0.0018 if get_datetime_fmt_dct is pre-evaluated, 0.0036 otherwise - old_style_best = min( - Timer("old_style_format % get_datetime_fmt_dct(dt)", globals=glob).repeat(7, 1000) - ) - # Out[5]: 0.0012 if get_datetime_fmt_dct is pre-evaluated, 0.0030 otherwise - assert new_style_best < strftime_best # much better - assert 
old_style_best < new_style_best # even better ! - - class TestDatetime64Formatter: def test_mixed(self): x = Series([datetime(2013, 1, 1), datetime(2013, 1, 1, 12), NaT]) @@ -3327,6 +3257,128 @@ def test_nat_representations(self): assert f(NaT) == "NaT" +class TestDatetimeFastFormatter: + """ + Test that the new `fast_strftime` mode improves formatting perf for all + kind of date/time objects. + """ + + @pytest.mark.parametrize("strftime_format", ( + "%Y-%m-%d %H:%M:%S", + "%Y %Y", + "%Y-%m-%dT%H:%M:%S.%fZ", + )) + @pytest.mark.parametrize("new_style", (False, True)) + def test_fast_strftime_basic(self, strftime_format, new_style): + """Test that formatting standard `datetime` objects with our utils works as good as strftime.""" + + # create a datetime instance + dt = datetime.now() + + # strftime + strftime_res = dt.strftime(strftime_format) + + # common dict used for formatting + fmt_dct = dict( + year=dt.year, month=dt.month, day=dt.day, hour=dt.hour, + min=dt.minute, sec=dt.second, us=dt.microsecond + ) + + # get the formatting string + style_format = convert_dtformat(strftime_format, new_style_fmt=new_style) + + # apply it and check the output + if new_style: + res = style_format.format(**fmt_dct) + else: + res = style_format % fmt_dct + assert res == strftime_res + + # fast_strftime is a shortcut function for the above + res2 = fast_strftime(dt, strftime_format, new_style_fmt=new_style) + assert res2 == res + + @pytest.mark.parametrize("strftime_format", ( + "%Y-%m-%d %H:%M:%S", + "%Y %Y", + "%Y-%m-%dT%H:%M:%S.%fZ", + )) + def test_fast_strftime_perf(self, strftime_format): + """Test that formatting standard `datetime` objects with our utils is faster than strftime.""" + + # create a datetime instance + dt = datetime.now() + + # pre-compute the formats + new_style_format = convert_dtformat(strftime_format, new_style_fmt=True) + old_style_format = convert_dtformat(strftime_format) + + # Perf comparison: confirm the results in 
https://stackoverflow.com/a/43495629/7262247 + def get_datetime_fmt_dct(dt): + return dict( + year=dt.year, month=dt.month, day=dt.day, hour=dt.hour, + min=dt.minute, sec=dt.second, us=dt.microsecond + ) + glob = globals() + glob.update(locals()) + strftime_best = min( + Timer("dt.strftime(strftime_format)", globals=glob).repeat(7, 1000) + ) + # Out[3]: 0.0062 + new_style_best = min( + Timer("new_style_format.format(**get_datetime_fmt_dct(dt))", globals=glob).repeat(7, 1000) + ) + # Out[4]: 0.0018 if get_datetime_fmt_dct is pre-evaluated, 0.0036 otherwise + old_style_best = min( + Timer("old_style_format % get_datetime_fmt_dct(dt)", globals=glob).repeat(7, 1000) + ) + # Out[5]: 0.0012 if get_datetime_fmt_dct is pre-evaluated, 0.0030 otherwise + assert new_style_best < strftime_best # much better + assert old_style_best < new_style_best # even better ! + + def test_bad_strftime_directive(self): + """Test which kind of error is output in case of bad `date_format` directive.""" + + # TODO make sure that it does not become very hard for users to understand + x = Series(date_range("20130101 09:00:00", periods=5, freq="us")) + + # This does not raise any error, while %D is not a correct directive ! + x.dt.strftime(date_format="%Y-%M-%D___", fast_strftime=False) + + # This raises a `TypeError: not enough arguments for format string` + with pytest.raises(TypeError): + x.dt.strftime(date_format="%Y-%M-%D___") + # TODO raise a more readable error ? 
+ + @pytest.mark.parametrize("date_format", ( + # note: "%Y-%m-%d %H:%M:%S and "%Y-%m-%d %H:%M:%S.%f are always accelerated (hardcoded) + "%Y-%m-%d__foo__%H:%M:%S", + "%Y %Y", + "%Y-%m-%dT%H:%M:%S.%fZ", + )) + def test_perf_datetime64_strftime(self, date_format): + x = Series(date_range("20130101 09:00:00", periods=100, freq="min")) + # res = x.dt.strftime(date_format=date_format) + # slow_res = x.dt.strftime(date_format=date_format, fast_strftime=False) + + glob = globals() + glob.update(locals()) + fast_best = min(Timer("x.dt.strftime(date_format=date_format)", globals=glob).repeat(3, 100)) + strftime_best = min(Timer("x.dt.strftime(date_format=date_format, fast_strftime=False)", globals=glob).repeat(3, 100)) + assert fast_best < strftime_best # much better + + # How many alternative are worth doing here ? + # probably datetime, date, period ? + + # fast_fmt = fmt.Datetime64Formatter(x) + # assert fast_fmt.fast_strftime is True + # fast_result = fast_fmt.get_result() + # slow_result = fmt.Datetime64Formatter(x, fast_strftime=False).get_result() + # for i in range(100): + # assert fast_result[i].strip() == "2013-01-01 00:00:00" + # assert slow_result[i].strip() == "2013-01-01 00:00:00" + + def test_format_percentiles(): result = fmt.format_percentiles([0.01999, 0.02001, 0.5, 0.666666, 0.9999]) expected = ["1.999%", "2.001%", "50%", "66.667%", "99.99%"] From 9bba33b44fa446c923298155df61022387752578 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 1 Mar 2022 13:06:34 +0100 Subject: [PATCH 08/65] Fixed issue in tests happening when format=None --- pandas/_libs/tslib.pyx | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 0e35da532e02f..8b960e9644e3e 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -166,12 +166,16 @@ def format_array_from_datetime( show_ns = show_ms = False elif fast_strftime: - try: - # Try to get the string formatting template for this format 
- str_format = convert_dtformat(format) - except UnsupportedDatetimeDirective: - # Unsupported directive: fallback to standard `strftime` + if format is None: + # We'll fallback to the Timestamp.str method fast_strftime = False + else: + try: + # Try to get the string formatting template for this format + str_format = convert_dtformat(format) + except UnsupportedDatetimeDirective: + # Unsupported directive: fallback to standard `strftime` + fast_strftime = False for i in range(N): val = values[i] From fe8aa3b57675f85d99363090827c36bcfbe397ea Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 1 Mar 2022 13:37:07 +0100 Subject: [PATCH 09/65] isort --- pandas/_libs/tslib.pyx | 5 ++--- pandas/_libs/tslibs/__init__.py | 2 +- pandas/core/arrays/period.py | 4 ++-- pandas/core/tools/datetimes.py | 2 +- pandas/io/formats/format.py | 4 ++-- pandas/tests/io/formats/test_format.py | 6 ++---- 6 files changed, 10 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 8b960e9644e3e..0e1d7fb7fa8c6 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -60,12 +60,11 @@ from pandas._libs.tslibs.nattype cimport ( ) from pandas._libs.tslibs.timestamps cimport _Timestamp -from pandas._libs.tslibs.timestamps import Timestamp - from pandas._libs.tslibs.strftime import ( - convert_dtformat, UnsupportedDatetimeDirective, + convert_dtformat, ) +from pandas._libs.tslibs.timestamps import Timestamp # Note: this is the only non-tslibs intra-pandas dependency here diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index eb09063185442..c6079d0df30e4 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -51,8 +51,8 @@ Period, ) from pandas._libs.tslibs.strftime import ( - convert_dtformat, UnsupportedDatetimeDirective, + convert_dtformat, ) from pandas._libs.tslibs.timedeltas import ( Timedelta, diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 
f391491234af0..f4491b888458f 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -19,14 +19,14 @@ NaT, NaTType, Timedelta, + UnsupportedDatetimeDirective, + convert_dtformat, delta_to_nanoseconds, dt64arr_to_periodarr as c_dt64arr_to_periodarr, iNaT, parsing, period as libperiod, to_offset, - convert_dtformat, - UnsupportedDatetimeDirective, ) from pandas._libs.tslibs.dtypes import FreqGroup from pandas._libs.tslibs.fields import isleapyear_arr diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index dca88e9418ad0..c4fa1fe38ce95 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -24,11 +24,11 @@ Timedelta, Timestamp, conversion, + convert_dtformat, iNaT, nat_strings, parsing, timezones, - convert_dtformat, ) from pandas._libs.tslibs.parsing import ( # noqa:F401 DateParseError, diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 5b69b7b75861c..b31d9e302a1da 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -42,9 +42,9 @@ NaT, Timedelta, Timestamp, - iNaT, - convert_dtformat, UnsupportedDatetimeDirective, + convert_dtformat, + iNaT, ) from pandas._libs.tslibs.nattype import NaTType from pandas._typing import ( diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index dbdb0951f5350..0a320a491d0d3 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -1,8 +1,6 @@ """ Test output formatting for Series/DataFrame, including to_string & reprs """ -from timeit import Timer - from datetime import datetime from io import StringIO import itertools @@ -13,6 +11,7 @@ from shutil import get_terminal_size import sys import textwrap +from timeit import Timer import dateutil import numpy as np @@ -20,12 +19,10 @@ import pytz from pandas._libs.tslibs import convert_dtformat - from pandas.compat import ( IS64, is_platform_windows, ) -from 
pandas.core.tools.datetimes import fast_strftime import pandas.util._test_decorators as td import pandas as pd @@ -44,6 +41,7 @@ set_option, ) import pandas._testing as tm +from pandas.core.tools.datetimes import fast_strftime import pandas.io.formats.format as fmt import pandas.io.formats.printing as printing From a3cadec9543d3ffbbc638ba5821bb64bfeeb6abc Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 1 Mar 2022 13:48:47 +0100 Subject: [PATCH 10/65] Fixed `BusinessHour._repr_attrs` so that it is faster too. --- pandas/_libs/tslibs/offsets.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index dc049e9195d3f..d84fae2f9579b 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -1548,8 +1548,10 @@ cdef class BusinessHour(BusinessMixin): def _repr_attrs(self) -> str: out = super()._repr_attrs() + # Use python string formatting to be faster than strftime + # f'{st.strftime("%H:%M")}-{en.strftime("%H:%M")}' hours = ",".join( - f'{st.strftime("%H:%M")}-{en.strftime("%H:%M")}' + f'{st.hour:02d}:{st.min:02d}-{en.hour:02d}:{en.min:02d}' for st, en in zip(self.start, self.end) ) attrs = [f"{self._prefix}={hours}"] From 90c836a2c8686d5e9c8043eb0dfe32a0aa7fc2dc Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 1 Mar 2022 13:50:07 +0100 Subject: [PATCH 11/65] Added TODO for possible improvement in timedeltas --- pandas/_libs/tslibs/timedeltas.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 8eaf86b3d193f..e120eb976af1e 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1195,6 +1195,7 @@ cdef class _Timedelta(timedelta): comp_dict = self.components._asdict() comp_dict['sign'] = sign + # TODO make faster using old-style formatting return fmt.format(**comp_dict) def __repr__(self) -> str: From 
d394a1beb7bce69daac59656b8a4e0e52adc6dba Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 1 Mar 2022 13:50:46 +0100 Subject: [PATCH 12/65] Added maintenance comments concerning using Timestamp.str --- pandas/_libs/tslib.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 0e1d7fb7fa8c6..26035c471fc3d 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -211,6 +211,7 @@ def format_array_from_datetime( ts = Timestamp(val, tz=tz) if format is None: + # Use datetime.str, that returns ts.isoformat(sep=' ') result[i] = str(ts) else: @@ -219,6 +220,7 @@ def format_array_from_datetime( try: result[i] = ts.strftime(format) except ValueError: + # Use datetime.str, that returns ts.isoformat(sep=' ') result[i] = str(ts) return result From 460504d4b609224a6b57bc4b4b71ad8284461257 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 1 Mar 2022 19:13:46 +0100 Subject: [PATCH 13/65] Improved/fix periods handling. New c function `_period_fast_strftime`. 
--- pandas/_libs/tslibs/period.pyx | 129 +++++++++++++++++++++++--------- pandas/_libs/tslibs/strftime.py | 5 ++ pandas/core/arrays/period.py | 20 +++-- 3 files changed, 110 insertions(+), 44 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 8c331b13f9735..aa448ec092d97 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1191,39 +1191,87 @@ cdef str period_format(int64_t value, int freq, object fmt=None): if value == NPY_NAT: return "NaT" - if isinstance(fmt, str): - fmt = fmt.encode("utf-8") - if fmt is None: - freq_group = get_freq_group(freq) - if freq_group == FR_ANN: - fmt = b'%Y' - elif freq_group == FR_QTR: - fmt = b'%FQ%q' - elif freq_group == FR_MTH: - fmt = b'%Y-%m' - elif freq_group == FR_WK: - left = period_asfreq(value, freq, FR_DAY, 0) - right = period_asfreq(value, freq, FR_DAY, 1) - return f"{period_format(left, FR_DAY)}/{period_format(right, FR_DAY)}" - elif freq_group == FR_BUS or freq_group == FR_DAY: - fmt = b'%Y-%m-%d' - elif freq_group == FR_HR: - fmt = b'%Y-%m-%d %H:00' - elif freq_group == FR_MIN: - fmt = b'%Y-%m-%d %H:%M' - elif freq_group == FR_SEC: - fmt = b'%Y-%m-%d %H:%M:%S' - elif freq_group == FR_MS: - fmt = b'%Y-%m-%d %H:%M:%S.%l' - elif freq_group == FR_US: - fmt = b'%Y-%m-%d %H:%M:%S.%u' - elif freq_group == FR_NS: - fmt = b'%Y-%m-%d %H:%M:%S.%n' - else: - raise ValueError(f"Unknown freq: {freq}") + return _period_fast_strftime(value, freq) + else: + if isinstance(fmt, str): + fmt = fmt.encode("utf-8") + + return _period_strftime(value, freq, fmt) + + +cdef str _period_fast_strftime(int64_t value, int freq): + """A faster strftime alternative leveraging string formatting.""" - return _period_strftime(value, freq, fmt) + cdef: + int freq_group + tm c_date + npy_datetimestruct dts + + # fill dts + get_date_info(value, freq, &dts) + + # get the appropriate format depending on frequency group + freq_group = get_freq_group(freq) + if freq_group == FR_ANN: + # 
fmt = b'%Y' + return f"{dts.year}" + + elif freq_group == FR_QTR: + # fmt = b'%FQ%q' + quarter = get_yq(value, freq, &dts) + # TODO check _period_strftime : this should be the fiscal year + # f"{(dts.year % 100):02d}Q{quarter}" + return "%02dQ%s" % ((dts.year % 100), quarter) + + elif freq_group == FR_MTH: + # fmt = b'%Y-%m' + return f"{dts.year}-{dts.month:02d}" + + elif freq_group == FR_WK: + # special: start_date/end_date. Recurse + left = period_asfreq(value, freq, FR_DAY, 0) + right = period_asfreq(value, freq, FR_DAY, 1) + return f"{period_format(left, FR_DAY)}/{period_format(right, FR_DAY)}" + + elif freq_group == FR_BUS or freq_group == FR_DAY: + # fmt = b'%Y-%m-%d' + return f"{dts.year}-{dts.month:02d}-{dts.day:02d}" + + elif freq_group == FR_HR: + # fmt = b'%Y-%m-%d %H:00' + return f"{dts.year}-{dts.month:02d}-{dts.day:02d} {dts.hour:02d}:00" + + elif freq_group == FR_MIN: + # fmt = b'%Y-%m-%d %H:%M' + return (f"{dts.year}-{dts.month:02d}-{dts.day:02d} " + f"{dts.hour:02d}:{dts.min:02d}") + + elif freq_group == FR_SEC: + # fmt = b'%Y-%m-%d %H:%M:%S' + return (f"{dts.year}-{dts.month:02d}-{dts.day:02d} " + f"{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}") + + elif freq_group == FR_MS: + # fmt = b'%Y-%m-%d %H:%M:%S.%l' + return (f"{dts.year}-{dts.month:02d}-{dts.day:02d} " + f"{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}" + f".{(value % 1_000):03d}") + + elif freq_group == FR_US: + # fmt = b'%Y-%m-%d %H:%M:%S.%u' + return (f"{dts.year}-{dts.month:02d}-{dts.day:02d} " + f"{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}" + f".{(value % 1_000_000):06d}") + + elif freq_group == FR_NS: + # fmt = b'%Y-%m-%d %H:%M:%S.%n' + return (f"{dts.year}-{dts.month:02d}-{dts.day:02d} " + f"{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}" + f".{(value % 1_000_000_000):09d}") + + else: + raise ValueError(f"Unknown freq: {freq}") cdef list extra_fmts = [(b"%q", b"^`AB`^"), @@ -1247,6 +1295,9 @@ cdef str _period_strftime(int64_t value, int freq, bytes fmt): str result, repl 
get_date_info(value, freq, &dts) + + # Find our additional directives in the pattern and replace them with + # placeholders that are not processed by c_strftime for i in range(len(extra_fmts)): pat = extra_fmts[i][0] brepl = extra_fmts[i][1] @@ -1254,27 +1305,31 @@ cdef str _period_strftime(int64_t value, int freq, bytes fmt): fmt = fmt.replace(pat, brepl) found_pat[i] = True + # Execute c_strftime to process the usual datetime directives formatted = c_strftime(&dts, fmt) result = util.char_to_string(formatted) free(formatted) + # Now fill the placeholders corresponding to our additional directives for i in range(len(extra_fmts)): if found_pat[i]: quarter = get_yq(value, freq, &dts) - if i == 0: + if i == 0: # %q, quarter repl = str(quarter) - elif i == 1: # %f, 2-digit year + elif i == 1: # %f, 2-digit 'Fiscal' year + # TODO why is this not the fiscal year ? repl = f"{(dts.year % 100):02d}" - elif i == 2: + elif i == 2: # %F, 'Fiscal' year with a century + # TODO why is this not the fiscal year ? repl = str(dts.year) - elif i == 3: + elif i == 3: # %l, milliseconds repl = f"{(value % 1_000):03d}" - elif i == 4: + elif i == 4: # %u, microseconds repl = f"{(value % 1_000_000):06d}" - elif i == 5: + elif i == 5: # %n, nanoseconds repl = f"{(value % 1_000_000_000):09d}" result = result.replace(str_extra_fmts[i], repl) diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index 9dfc07a730253..216a0f099e505 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -10,6 +10,11 @@ def convert_dtformat( ) -> str: """Convert a strftime formatting string into a string formatting template string. + TODO should we have one such method for each type (date, time, datetime, period), or + accept a parameter target='date'/'time'/'datetime'/'period' ? That + would allow us to validate the directives further, for example period microsecond + does not exist. 
See PeriodArray._format_native_types + Parameters ---------- strftime_fmt : str diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index f4491b888458f..423c06d83c6a3 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -642,8 +642,11 @@ def _format_native_types( """ values = self.astype(object) + # Create the formatter function if date_format: if fast_strftime: + # TODO update this using _period_strftime contents + # note: work on a test that fails currently (find one ?) try: # Try to get the string formatting template for this format str_format = convert_dtformat(date_format) @@ -653,23 +656,26 @@ def _format_native_types( if fast_strftime: # Faster: python old-style string formatting - formatter = lambda dt: str_format % dict( - year=dt.year, month=dt.month, day=dt.day, hour=dt.hour, - min=dt.min, sec=dt.sec, us=dt.us + formatter = lambda p: str_format % dict( + year=p.year, month=p.month, day=p.day, hour=p.hour, + min=p.minute, sec=p.second ) else: # Slower: strftime - formatter = lambda dt: dt.strftime(date_format) + formatter = lambda p: p.strftime(date_format) else: - formatter = lambda dt: str(dt) + # Uses `_Period.str>format_period` that is faster than strftime too + formatter = lambda p: str(p) + # Apply the formatter to all values in the array, possibly with a mask if self._hasna: mask = self._isnan values[mask] = na_rep imask = ~mask - values[imask] = np.array([formatter(dt) for dt in values[imask]]) + values[imask] = np.array([formatter(p) for p in values[imask]]) else: - values = np.array([formatter(dt) for dt in values]) + values = np.array([formatter(p) for p in values]) + return values # ------------------------------------------------------------------ From d700c446decf944589a71d6875085d4a6b516d57 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 1 Mar 2022 21:34:49 +0100 Subject: [PATCH 14/65] Fixed period string representation %FQ%q --- pandas/_libs/tslibs/period.pyx | 4 ++-- 1 file changed, 
2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index aa448ec092d97..740ad93aa9413 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1221,8 +1221,8 @@ cdef str _period_fast_strftime(int64_t value, int freq): # fmt = b'%FQ%q' quarter = get_yq(value, freq, &dts) # TODO check _period_strftime : this should be the fiscal year - # f"{(dts.year % 100):02d}Q{quarter}" - return "%02dQ%s" % ((dts.year % 100), quarter) + # f"{dts.year}Q{quarter}" + return "%sQ%s" % (dts.year, quarter) elif freq_group == FR_MTH: # fmt = b'%Y-%m' From 1a44d022d7990f1c4d88e372da13158530da341a Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 1 Mar 2022 21:35:23 +0100 Subject: [PATCH 15/65] Relaxed perf test for datetimes --- pandas/tests/io/formats/test_format.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 0a320a491d0d3..c33e4f2e21309 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3312,11 +3312,11 @@ def test_fast_strftime_perf(self, strftime_format): old_style_format = convert_dtformat(strftime_format) # Perf comparison: confirm the results in https://stackoverflow.com/a/43495629/7262247 - def get_datetime_fmt_dct(dt): - return dict( - year=dt.year, month=dt.month, day=dt.day, hour=dt.hour, - min=dt.minute, sec=dt.second, us=dt.microsecond - ) + fmt_dct = dict( + year=dt.year, month=dt.month, day=dt.day, hour=dt.hour, + min=dt.minute, sec=dt.second, us=dt.microsecond + ) + glob = globals() glob.update(locals()) strftime_best = min( @@ -3324,13 +3324,13 @@ def get_datetime_fmt_dct(dt): ) # Out[3]: 0.0062 new_style_best = min( - Timer("new_style_format.format(**get_datetime_fmt_dct(dt))", globals=glob).repeat(7, 1000) + Timer("new_style_format.format(**fmt_dct)", globals=glob).repeat(7, 1000) ) - # Out[4]: 0.0018 if 
get_datetime_fmt_dct is pre-evaluated, 0.0036 otherwise + # Out[4]: 0.0036 old_style_best = min( - Timer("old_style_format % get_datetime_fmt_dct(dt)", globals=glob).repeat(7, 1000) + Timer("old_style_format % fmt_dct", globals=glob).repeat(7, 1000) ) - # Out[5]: 0.0012 if get_datetime_fmt_dct is pre-evaluated, 0.0030 otherwise + # Out[5]: 0.0030 assert new_style_best < strftime_best # much better assert old_style_best < new_style_best # even better ! From 61ec6d6047fa5dd0fd4150e2f568f440729df52e Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 2 Mar 2022 11:23:54 +0100 Subject: [PATCH 16/65] Removed 2 useless cdefs and fixed docstring of strftime with missing extra directives --- pandas/_libs/tslibs/period.pyx | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 740ad93aa9413..8ed5f04691f12 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1185,9 +1185,6 @@ cdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) except? -1: cdef str period_format(int64_t value, int freq, object fmt=None): - cdef: - int freq_group - if value == NPY_NAT: return "NaT" @@ -1205,7 +1202,6 @@ cdef str _period_fast_strftime(int64_t value, int freq): cdef: int freq_group - tm c_date npy_datetimestruct dts # fill dts @@ -2402,7 +2398,8 @@ cdef class _Period(PeriodMixin): containing one or several directives. The method recognizes the same directives as the :func:`time.strftime` function of the standard Python distribution, as well as the specific additional directives ``%f``, - ``%F``, ``%q``. (formatting & docs originally from scikits.timeries). + ``%F``, ``%q``, ``%l``, ``%u``, ``%n``. + (formatting & docs originally from scikits.timeries). 
+-----------+--------------------------------+-------+ | Directive | Meaning | Notes | From 846a0fd3dcf510325339f4bbe7ca70f8de629bd9 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 2 Mar 2022 16:10:47 +0100 Subject: [PATCH 17/65] Fixed issue with BusinessHour formatting --- pandas/_libs/tslibs/offsets.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index d84fae2f9579b..0d52be5f6597e 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -1551,7 +1551,7 @@ cdef class BusinessHour(BusinessMixin): # Use python string formatting to be faster than strftime # f'{st.strftime("%H:%M")}-{en.strftime("%H:%M")}' hours = ",".join( - f'{st.hour:02d}:{st.min:02d}-{en.hour:02d}:{en.min:02d}' + f'{st.hour:02d}:{st.minute:02d}-{en.hour:02d}:{en.minute:02d}' for st, en in zip(self.start, self.end) ) attrs = [f"{self._prefix}={hours}"] From 84f4c7f074feceac58623fa62195e1338815c535 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 2 Mar 2022 16:22:45 +0100 Subject: [PATCH 18/65] Fixed _period_fast_strftime FR_QTR frequency default format, and _period_strftime %q format. Also accelerated slightly the _period_strftime. --- pandas/_libs/tslibs/period.pyx | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 8ed5f04691f12..363d05a670317 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1201,7 +1201,7 @@ cdef str _period_fast_strftime(int64_t value, int freq): """A faster strftime alternative leveraging string formatting.""" cdef: - int freq_group + int freq_group, quarter npy_datetimestruct dts # fill dts @@ -1215,10 +1215,9 @@ cdef str _period_fast_strftime(int64_t value, int freq): elif freq_group == FR_QTR: # fmt = b'%FQ%q' + # get quarter and modify dts.year to be the fiscal year (?) 
quarter = get_yq(value, freq, &dts) - # TODO check _period_strftime : this should be the fiscal year - # f"{dts.year}Q{quarter}" - return "%sQ%s" % (dts.year, quarter) + return f"{dts.year}Q{quarter:02d}" elif freq_group == FR_MTH: # fmt = b'%Y-%m' @@ -1287,7 +1286,7 @@ cdef str _period_strftime(int64_t value, int freq, bytes fmt): char *formatted bytes pat, brepl list found_pat = [False] * len(extra_fmts) - int year, quarter + int quarter str result, repl get_date_info(value, freq, &dts) @@ -1308,18 +1307,17 @@ cdef str _period_strftime(int64_t value, int freq, bytes fmt): free(formatted) # Now fill the placeholders corresponding to our additional directives + if found_pat[0] or found_pat[1] or found_pat[2]: + # get the quarter and modify the year in-place (fiscal?) + quarter = get_yq(value, freq, &dts) + for i in range(len(extra_fmts)): if found_pat[i]: - - quarter = get_yq(value, freq, &dts) - - if i == 0: # %q, quarter - repl = str(quarter) + if i == 0: # %q, 2-digit quarter. TODO add a test + repl = f"{quarter:02d}" elif i == 1: # %f, 2-digit 'Fiscal' year - # TODO why is this not the fiscal year ? repl = f"{(dts.year % 100):02d}" elif i == 2: # %F, 'Fiscal' year with a century - # TODO why is this not the fiscal year ? repl = str(dts.year) elif i == 3: # %l, milliseconds repl = f"{(value % 1_000):03d}" From e96fe8d0a947f21acaf7e61d7b3ca845792f8802 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 2 Mar 2022 16:28:54 +0100 Subject: [PATCH 19/65] Fixed the quarter issue introduced in previous commit. Apparently the code was right, not the docstring. 
Fixed the docstring --- pandas/_libs/tslibs/period.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 363d05a670317..83d57ffa7499f 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1217,7 +1217,7 @@ cdef str _period_fast_strftime(int64_t value, int freq): # fmt = b'%FQ%q' # get quarter and modify dts.year to be the fiscal year (?) quarter = get_yq(value, freq, &dts) - return f"{dts.year}Q{quarter:02d}" + return f"{dts.year}Q{quarter}" elif freq_group == FR_MTH: # fmt = b'%Y-%m' @@ -1313,8 +1313,8 @@ cdef str _period_strftime(int64_t value, int freq, bytes fmt): for i in range(len(extra_fmts)): if found_pat[i]: - if i == 0: # %q, 2-digit quarter. TODO add a test - repl = f"{quarter:02d}" + if i == 0: # %q, quarter. + repl = f"{quarter}" elif i == 1: # %f, 2-digit 'Fiscal' year repl = f"{(dts.year % 100):02d}" elif i == 2: # %F, 'Fiscal' year with a century @@ -2444,7 +2444,7 @@ cdef class _Period(PeriodMixin): | | AM or PM. | | +-----------+--------------------------------+-------+ | ``%q`` | Quarter as a decimal number | | - | | [01,04] | | + | | [1,4] | | +-----------+--------------------------------+-------+ | ``%S`` | Second as a decimal number | \(4) | | | [00,61]. 
| | From 707a0847d3a16271927f5ac7d5fdec873f29640a Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 2 Mar 2022 16:34:07 +0100 Subject: [PATCH 20/65] Improved `convert_dtformat` so that it safely escapes remaining formatting symbols (% or {} depending on formatting style) --- pandas/_libs/tslibs/strftime.py | 54 ++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index 216a0f099e505..88747f875a722 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -77,28 +77,48 @@ def convert_dtformat( # Mapping between strftime and string formatting, according to both styles if new_style_fmt: + esc_l = "+^_\\" + esc_r = "/_^+" supported = { - "%d": "{day:02d}", # Day of the month as a zero-padded decimal number. - "%m": "{month:02d}", # Month as a zero-padded decimal number. - "%Y": "{year}", # Year with century as a decimal number. - "%H": "{hour:02d}", # Hour (24-hour clock) as a zero-padded decimal number. - "%M": "{min:02d}", # Minute as a zero-padded decimal number. - "%S": "{sec:02d}", # Second as a zero-padded decimal number. - "%f": "{us:06d}", # Microsecond as a decimal number, zero-padded to 6 digits. + "%d": f"{esc_l}day:02df{esc_r}", # Day of the month as a zero-padded decimal number. + "%m": f"{esc_l}month:02d{esc_r}", # Month as a zero-padded decimal number. + "%Y": f"{esc_l}year{esc_r}", # Year with century as a decimal number. + "%H": f"{esc_l}hour:02d{esc_r}", # Hour (24-hour clock) as a zero-padded decimal number. + "%M": f"{esc_l}min:02d{esc_r}", # Minute as a zero-padded decimal number. + "%S": f"{esc_l}sec:02d{esc_r}", # Second as a zero-padded decimal number. + "%f": f"{esc_l}us:06d{esc_r}", # Microsecond as a decimal number, zero-padded to 6 digits. 
} + + # Create the output by replacing all directives + for key, replacement in supported.items(): + strftime_fmt = strftime_fmt.replace(key, replacement) + + # Escape remaining curly braces + strftime_fmt = strftime_fmt.replace("{", "{{").replace("}", "}}") + + # Finally replace our placeholders + strftime_fmt = strftime_fmt.replace(esc_l, "{").replace(esc_r, "}") + else: + esc = "/_^+" supported = { - "%d": "%(day)02d", # Day of the month as a zero-padded decimal number. - "%m": "%(month)02d", # Month as a zero-padded decimal number. - "%Y": "%(year)s", # Year with century as a decimal number. - "%H": "%(hour)02d", # Hour (24-hour clock) as a zero-padded decimal number. - "%M": "%(min)02d", # Minute as a zero-padded decimal number. - "%S": "%(sec)02d", # Second as a zero-padded decimal number. - "%f": "%(us)06d", # Microsecond as a decimal number, zero-padded to 6 digits. + "%d": f"{esc}(day)02d", # Day of the month as a zero-padded decimal number. + "%m": f"{esc}(month)02d", # Month as a zero-padded decimal number. + "%Y": f"{esc}(year)s", # Year with century as a decimal number. + "%H": f"{esc}(hour)02d", # Hour (24-hour clock) as a zero-padded decimal number. + "%M": f"{esc}(min)02d", # Minute as a zero-padded decimal number. + "%S": f"{esc}(sec)02d", # Second as a zero-padded decimal number. + "%f": f"{esc}(us)06d", # Microsecond as a decimal number, zero-padded to 6 digits. 
} - # Create the output by replacing all directives - for key, replacement in supported.items(): - strftime_fmt = strftime_fmt.replace(key, replacement) + # Create the output by replacing all directives + for key, replacement in supported.items(): + strftime_fmt = strftime_fmt.replace(key, replacement) + + # Escape remaining percent signs + strftime_fmt = strftime_fmt.replace("%", "%%") + + # Finally replace our placeholder + strftime_fmt = strftime_fmt.replace(esc, "%") return strftime_fmt From 37f15c9ed0681744c5ce68a28ff09f4489120bd0 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 2 Mar 2022 18:12:24 +0100 Subject: [PATCH 21/65] `convert_dtformat` now supports periods formatting and is robust to special characters. New `Period.fast_strftime` used in `PeriodArray`. Renamed `convert_dtformat` into `convert_strftime_format` and `UnsupportedDatetimeDirective` into `UnsupportedStrFmtDirective`. --- pandas/_libs/tslib.pyx | 10 +- pandas/_libs/tslibs/__init__.py | 8 +- pandas/_libs/tslibs/period.pyx | 41 ++++++ pandas/_libs/tslibs/strftime.py | 185 ++++++++++++++++--------- pandas/core/arrays/datetimelike.py | 2 +- pandas/core/arrays/period.py | 18 ++- pandas/core/tools/datetimes.py | 6 +- pandas/io/formats/format.py | 11 +- pandas/tests/io/formats/test_format.py | 11 +- 9 files changed, 197 insertions(+), 95 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 26035c471fc3d..8b1476a62ef0a 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -61,8 +61,8 @@ from pandas._libs.tslibs.nattype cimport ( from pandas._libs.tslibs.timestamps cimport _Timestamp from pandas._libs.tslibs.strftime import ( - UnsupportedDatetimeDirective, - convert_dtformat, + UnsupportedStrFmtDirective, + convert_strftime_format, ) from pandas._libs.tslibs.timestamps import Timestamp @@ -121,7 +121,7 @@ def format_array_from_datetime( a nat format fast_strftime : bool, default True If `True` (default) and the format permits it, a faster formatting - 
method will be used. See `convert_dtformat`. + method will be used. See `convert_strftime_format`. Returns ------- @@ -171,8 +171,8 @@ def format_array_from_datetime( else: try: # Try to get the string formatting template for this format - str_format = convert_dtformat(format) - except UnsupportedDatetimeDirective: + str_format = convert_strftime_format(format) + except UnsupportedStrFmtDirective: # Unsupported directive: fallback to standard `strftime` fast_strftime = False diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index c6079d0df30e4..80d7a9de19735 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -9,8 +9,8 @@ "OutOfBoundsTimedelta", "IncompatibleFrequency", "Period", - "convert_dtformat", - "UnsupportedDatetimeDirective", + "convert_strftime_format", + "UnsupportedStrFmtDirective", "Resolution", "Timedelta", "normalize_i8_timestamps", @@ -51,8 +51,8 @@ Period, ) from pandas._libs.tslibs.strftime import ( - UnsupportedDatetimeDirective, - convert_dtformat, + UnsupportedStrFmtDirective, + convert_strftime_format, ) from pandas._libs.tslibs.timedeltas import ( Timedelta, diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 83d57ffa7499f..82aafbc292714 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2389,6 +2389,47 @@ cdef class _Period(PeriodMixin): object_state = None, self.freq, self.ordinal return (Period, object_state) + def fast_strftime(self, fmt_str: str) -> str: + """A faster alternative to `strftime` using string formatting. + + `fmt_str` should be created using `convert_period_format(fmt)`. + + See also `self.strftime`, that relies on `period_format`. 
+ + Examples + -------- + + >>> a = Period(freq='Q-JUL', year=2006, quarter=1) + >>> a.strftime('%F-Q%q') + '2006-Q1' + >>> fast_fmt = convert_period_format('%F-Q%q') + >>> a.fast_strftime(fast_fmt) + '2006-Q1' + """ + freq = self._dtype._dtype_code + value = self.ordinal + + if value == NPY_NAT: + return "NaT" + + cdef: + npy_datetimestruct dts, dts2 + int quarter + + # Fill dts with all fields + get_date_info(value, freq, &dts) + + # Get the quarter and fiscal year + quarter = get_yq(value, freq, &dts2) + + # Finally use the string template + return fmt_str % dict( + year=dts.year, month=dts.month, day=dts.day, hour=dts.hour, + min=dts.min, sec=dts.sec, + ms=dts.us // 1000, us=dts.us, ns=dts.us * 1000, + q=quarter, Fyear=dts2.year, fyear=dts2.year % 100 + ) + def strftime(self, fmt: str) -> str: r""" Returns the string representation of the :class:`Period`, depending diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index 88747f875a722..0c5fff80a9e09 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -1,26 +1,100 @@ """Strftime-related classes and functions. """ -class UnsupportedDatetimeDirective(ValueError): - """The strftime format contains a directive that is not supported in this context.""" -def convert_dtformat( +class UnsupportedStrFmtDirective(ValueError): + """The format contains a directive that is not supported in this context. + """ + + +_COMMON_UNSUPPORTED = ( + # All of these below are names and therefore are not in the numpy or datetime attr representation + "%a", # Weekday as locale’s abbreviated name. + "%A", # Weekday as locale’s full name. + "%w", # Weekday as a decimal number, where 0 is Sunday and 6 is Saturday. + "%b", # Month as locale’s abbreviated name. + "%B", # Month as locale’s full name. + # TODO All of those below can probably be derived easily from the numbers on the instance + "%y", # Year without century as a zero-padded decimal number. 
>> year % 100 + "%I", # Hour (12-hour clock) as a zero-padded decimal number. >> hour % 12 + "%p", # Locale’s equivalent of either AM or PM. >> "pm" if (hour // 12) else "am" + # TODO Below Time offset and timezone information ... but may be hard + "%z", # UTC offset in the form ±HHMM[SS[.ffffff]] (empty string if the object is naive). + "%Z", # Time zone name (empty string if the object is naive). + # We do not want to enter into these below, we do not want to re-create the datetime implementation + "%j", # Day of the year as a zero-padded decimal number. + "%U", # Week number of the year (Sunday as the first day of the week) as a zero-padded decimal number. All days in a new year preceding the first Sunday are considered to be in week 0. + "%W", # Week number of the year (Monday as the first day of the week) as a zero-padded decimal number. All days in a new year preceding the first Monday are considered to be in week 0. + "%c", # Locale’s appropriate date and time representation. + "%x", # Locale’s appropriate date representation. + "%X", # Locale’s appropriate time representation. +) + + +_COMMON_MAP = { + "%d": ("day", "02d"), # Day of the month as a zero-padded decimal number. + "%m": ("month", "02d"), # Month as a zero-padded decimal number. + "%Y": ("year", "d"), # Year with century as a decimal number. + "%H": ("hour", "02d"), # Hour (24-hour clock) as a zero-padded decimal number. + "%M": ("min", "02d"), # Minute as a zero-padded decimal number. + "%S": ("sec", "02d"), # Second as a zero-padded decimal number. +} + +_DATETIME_MAP = { + "%f": ("us", "06d"), # Microsecond as a decimal number, zero-padded to 6 digits. +} + +_PERIOD_MAP = { + "%f": ("fyear", "02d"), # 'Fiscal' year without century as zero-padded decimal number [00,99] + "%F": ("Fyear", "d"), # 'Fiscal' year with century as a decimal number + "%q": ("q", "d"), # Quarter as a decimal number [1,4] + "%l": ("ms", "03d"), # Microsecond as a decimal number, zero-padded to 3 digits. 
+ "%u": ("us", "06d"), # Microsecond as a decimal number, zero-padded to 6 digits. + "%n": ("ns", "09d"), # Microsecond as a decimal number, zero-padded to 9 digits. +} + + +def convert_strftime_format( strftime_fmt: str, + target: str = "datetime", new_style_fmt: bool = False, ) -> str: - """Convert a strftime formatting string into a string formatting template string. + """Convert a strftime formatting string into a formatting template string. + + The set of supported directives varies according to the `target`. + + This method can be tested on a single instance of + + - `datetime` or `Timestamp`, through + `pandas.core.tools.datetimes.fast_strftime`. The + result may be compared with `datetime.strftime` or `Timestamp.strftime` + + - `Period` through `Period.fast_strftime`. The result may be compared + with `Period.strftime`. + + On array-like objects, this method is used in several places: + + - Subclasses of `DatelikeOps` now rely on this method in their + `self.strftime(fmt, fast_strftime=True)` default implementation, which + delegates to `_format_native_types`. - TODO should we have one such method for each type (date, time, datetime, period), or - accept a parameter target='date'/'time'/'datetime'/'period' ? That - would allow us to validate the directives further, for example period microsecond - does not exist. See PeriodArray._format_native_types + - `DatetimeArray._format_native_types` relies on + `tslib.format_array_from_datetime` which relies on this function + - `PeriodArray._format_native_types` directly relies on this function. + - `TimedeltaArray._format_native_types` does not currently support + custom formats. + + In addition, `Datetime64Formatter` and `Datetime64TZFormatter` also + rely on this when their attribute `fast_strftime` is `True` (default). Parameters ---------- strftime_fmt : str The strftime format string specification, e.g. `"%Y-%m-%d %H:%M:%S"`. Note that not all directives are eligible to successful usage of string formatting. 
- Unsupported directives will lead to an `UnsupportedDatetimeDirective` being raised. + Unsupported directives will lead to an `UnsupportedStrFmtDirective` being raised. + target : { "datetime", "date", "time", "period" }, default: "datetime" + The kind of data that will be formatted using this template. new_style_fmt : bool, default: False Whether the output string should be new-style e.g. "{year}-{month:02d}-{day:02d} {hour:02d}:{min:02d}:{sec:02d}" @@ -31,67 +105,58 @@ def convert_dtformat( fmt_out : str A string that may be used to format a `datetime` variable. The style of this string is either old-style or new-style depending on `new_style_formatting`. - For old-style, it may be used as `fmt_out % get_datetime_fmt_dct(dt)`. - For new-style, it may be used as `fmt_out.format(**get_datetime_fmt_dct(dt))` + For old-style, it may be used as `fmt_out % fmt_dct`. + For new-style, it may be used as `fmt_out.format(**fmt_dct)` Raises ------ - UnsupportedDatetimeDirective + UnsupportedStrFmtDirective Raised when the received `strftime_fmt` format contains a directive for which the output can not currently be created using string formatting. See Also -------- - - `strftime format codes reference <https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes>`_ - - `Stackoverflow post <https://stackoverflow.com/a/43495629/7262247>`_ explaining - how old-style formatting is faster than new-style formatting, itself faster than - `datetime.strftime`. + `strftime format codes reference <https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes>`_ + + `Stackoverflow post <https://stackoverflow.com/a/43495629/7262247>`_ + explaining how old-style formatting is faster than new-style formatting, + itself faster than `datetime.strftime`. + + See `Period.strftime` doc for all the supported period directives (same + directives as the :func:`time.strftime` function of the standard Python + distribution, as well as specific additional directives ``%f``, ``%F``, + ``%q``, ``%l``, ``%u``, ``%n``). """ - non_supported = ( - # All of these below are names and therefore are not in the numpy or datetime attr representation - "%a", # Weekday as locale’s abbreviated name. 
- "%A", # Weekday as locale’s full name. - "%w", # Weekday as a decimal number, where 0 is Sunday and 6 is Saturday. - "%b", # Month as locale’s abbreviated name. - "%B", # Month as locale’s full name. - # TODO All of those below can probably be derived easily from the numbers on the instance - "%y", # Year without century as a zero-padded decimal number. >> year % 100 - "%I", # Hour (12-hour clock) as a zero-padded decimal number. >> hour % 12 - "%p", # Locale’s equivalent of either AM or PM. >> "pm" if (hour // 12) else "am" - # TODO Below Time offset and timezone information ... but may be hard - "%z", # UTC offset in the form ±HHMM[SS[.ffffff]] (empty string if the object is naive). - "%Z", # Time zone name (empty string if the object is naive). - # We do not want to enter into these below, we do not want to re-create the datetime implementation - "%j", # Day of the year as a zero-padded decimal number. - "%U", # Week number of the year (Sunday as the first day of the week) as a zero-padded decimal number. All days in a new year preceding the first Sunday are considered to be in week 0. - "%W", # Week number of the year (Monday as the first day of the week) as a zero-padded decimal number. All days in a new year preceding the first Monday are considered to be in week 0. - "%c", # Locale’s appropriate date and time representation. - "%x", # Locale’s appropriate date representation. - "%X", # Locale’s appropriate time representation. 
- ) + if target in ("datetime", "date", "time"): + directive_maps = (_COMMON_MAP, _DATETIME_MAP) + elif target == "period": + directive_maps = (_COMMON_MAP, _PERIOD_MAP) + else: + raise ValueError(f"Invalid target: {target!r}") # Raise if unsupported directive found in `strftime_fmt` - for key in non_supported: + for key in _COMMON_UNSUPPORTED: if key in strftime_fmt: - raise UnsupportedDatetimeDirective(f"Unsupported datetime formatting directive: {key!r}") + raise UnsupportedStrFmtDirective(f"Unsupported directive: {key!r}") # Mapping between strftime and string formatting, according to both styles if new_style_fmt: + esc = "/_+\\" + + # Escape the %% before searching for directives, same as strftime + strftime_fmt = strftime_fmt.replace("%%", esc) + esc_l = "+^_\\" esc_r = "/_^+" - supported = { - "%d": f"{esc_l}day:02df{esc_r}", # Day of the month as a zero-padded decimal number. - "%m": f"{esc_l}month:02d{esc_r}", # Month as a zero-padded decimal number. - "%Y": f"{esc_l}year{esc_r}", # Year with century as a decimal number. - "%H": f"{esc_l}hour:02d{esc_r}", # Hour (24-hour clock) as a zero-padded decimal number. - "%M": f"{esc_l}min:02d{esc_r}", # Minute as a zero-padded decimal number. - "%S": f"{esc_l}sec:02d{esc_r}", # Second as a zero-padded decimal number. - "%f": f"{esc_l}us:06d{esc_r}", # Microsecond as a decimal number, zero-padded to 6 digits. 
- } # Create the output by replacing all directives - for key, replacement in supported.items(): - strftime_fmt = strftime_fmt.replace(key, replacement) + for _map in directive_maps: + for key, (_name, _fmt) in _map.items(): + # for example replace "%d" by "{day:02d}" but with escaped { and } + strftime_fmt = strftime_fmt.replace(key, f"{esc_l}{_name}:{_fmt}{esc_r}") + + # Restore the %% into % + strftime_fmt = strftime_fmt.replace(esc, "%") # Escape remaining curly braces strftime_fmt = strftime_fmt.replace("{", "{{").replace("}", "}}") @@ -101,19 +166,15 @@ def convert_dtformat( else: esc = "/_^+" - supported = { - "%d": f"{esc}(day)02d", # Day of the month as a zero-padded decimal number. - "%m": f"{esc}(month)02d", # Month as a zero-padded decimal number. - "%Y": f"{esc}(year)s", # Year with century as a decimal number. - "%H": f"{esc}(hour)02d", # Hour (24-hour clock) as a zero-padded decimal number. - "%M": f"{esc}(min)02d", # Minute as a zero-padded decimal number. - "%S": f"{esc}(sec)02d", # Second as a zero-padded decimal number. - "%f": f"{esc}(us)06d", # Microsecond as a decimal number, zero-padded to 6 digits. 
- } + + # Escape the %% before searching for directives, same as strftime + strftime_fmt = strftime_fmt.replace("%%", esc * 2) # Create the output by replacing all directives - for key, replacement in supported.items(): - strftime_fmt = strftime_fmt.replace(key, replacement) + for _map in directive_maps: + for key, (_name, _fmt) in _map.items(): + # for example replace "%d" by "%(day)02d" but with escaped % + strftime_fmt = strftime_fmt.replace(key, f"{esc}({_name}){_fmt}") # Escape remaining percent signs strftime_fmt = strftime_fmt.replace("%", "%%") diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 90f936359d183..2dc97b8b5023c 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1605,7 +1605,7 @@ def strftime(self, date_format: str, fast_strftime: bool = True) -> npt.NDArray[ fast_strftime : boolean, default: True if `True` (default) and the format permits it, a faster formatting - method will be used. See `convert_dtformat`. + method will be used. See `convert_strftime_format`. .. versionadded:: 1.5.0 diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 423c06d83c6a3..891681b138eb7 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -19,8 +19,8 @@ NaT, NaTType, Timedelta, - UnsupportedDatetimeDirective, - convert_dtformat, + UnsupportedStrFmtDirective, + convert_strftime_format, delta_to_nanoseconds, dt64arr_to_periodarr as c_dt64arr_to_periodarr, iNaT, @@ -639,27 +639,25 @@ def _format_native_types( ) -> np.ndarray: """ actually format my specific types + + TODO maybe rather align with the way it is done in datetimes.py ? + (delegate all to a tslib.format_array_from_period cython numpy method) """ values = self.astype(object) # Create the formatter function if date_format: if fast_strftime: - # TODO update this using _period_strftime contents - # note: work on a test that fails currently (find one ?) 
try: # Try to get the string formatting template for this format - str_format = convert_dtformat(date_format) - except UnsupportedDatetimeDirective: + str_format = convert_strftime_format(date_format, target="period") + except UnsupportedStrFmtDirective: # Unsupported directive: fallback to standard `strftime` fast_strftime = False if fast_strftime: # Faster: python old-style string formatting - formatter = lambda p: str_format % dict( - year=p.year, month=p.month, day=p.day, hour=p.hour, - min=p.minute, sec=p.second - ) + formatter = lambda p: p.fast_strftime(str_format) else: # Slower: strftime formatter = lambda p: p.strftime(date_format) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index c4fa1fe38ce95..4b89c766d68f4 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -24,7 +24,7 @@ Timedelta, Timestamp, conversion, - convert_dtformat, + convert_strftime_format, iNaT, nat_strings, parsing, @@ -1303,7 +1303,7 @@ def fast_strftime( Raises ------ - UnsupportedDatetimeDirective + UnsupportedStrFmtDirective Raised when the received `strftime_fmt` format contains a directive for which the output can not currently be created using string formatting. 
@@ -1321,7 +1321,7 @@ def fast_strftime( ) # get the formatting template - fmt_str = convert_dtformat(fmt, new_style_fmt=new_style_fmt) + fmt_str = convert_strftime_format(fmt, new_style_fmt=new_style_fmt) # execute return fmt_str.format(**fmt_dct) if new_style_fmt else (fmt_str % fmt_dct) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index b31d9e302a1da..a2d56c3149d19 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -42,8 +42,8 @@ NaT, Timedelta, Timestamp, - UnsupportedDatetimeDirective, - convert_dtformat, + UnsupportedStrFmtDirective, + convert_strftime_format, iNaT, ) from pandas._libs.tslibs.nattype import NaTType @@ -1634,7 +1634,8 @@ def _format_strings(self) -> list[str]: return [self.formatter(x) for x in values] fmt_values = values._data._format_native_types( - na_rep=self.nat_rep, date_format=self.date_format, fast_strftime=self.fast_strftime + na_rep=self.nat_rep, date_format=self.date_format, + fast_strftime=self.fast_strftime ) return fmt_values.tolist() @@ -1803,8 +1804,8 @@ def get_format_datetime64( if date_format is not None and fast_strftime: try: # Try to get the string formatting template for this format - str_date_fmt = convert_dtformat(date_format) - except UnsupportedDatetimeDirective: + str_date_fmt = convert_strftime_format(date_format) + except UnsupportedStrFmtDirective: # Unsupported directive: fallback to standard `strftime` pass diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index c33e4f2e21309..190ac6397fd4e 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -18,7 +18,7 @@ import pytest import pytz -from pandas._libs.tslibs import convert_dtformat +from pandas._libs.tslibs import convert_strftime_format from pandas.compat import ( IS64, is_platform_windows, @@ -3263,7 +3263,8 @@ class TestDatetimeFastFormatter: @pytest.mark.parametrize("strftime_format", ( "%Y-%m-%d %H:%M:%S", - "%Y 
%Y", + "%Y%%Y", + "%Y{%Y}", "%Y-%m-%dT%H:%M:%S.%fZ", )) @pytest.mark.parametrize("new_style", (False, True)) @@ -3283,7 +3284,7 @@ def test_fast_strftime_basic(self, strftime_format, new_style): ) # get the formatting string - style_format = convert_dtformat(strftime_format, new_style_fmt=new_style) + style_format = convert_strftime_format(strftime_format, new_style_fmt=new_style) # apply it and check the output if new_style: @@ -3308,8 +3309,8 @@ def test_fast_strftime_perf(self, strftime_format): dt = datetime.now() # pre-compute the formats - new_style_format = convert_dtformat(strftime_format, new_style_fmt=True) - old_style_format = convert_dtformat(strftime_format) + new_style_format = convert_strftime_format(strftime_format, new_style_fmt=True) + old_style_format = convert_strftime_format(strftime_format) # Perf comparison: confirm the results in https://stackoverflow.com/a/43495629/7262247 fmt_dct = dict( From e2e2d70ff4ca520230bda408f90eeca2c5c56389 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 2 Mar 2022 18:12:45 +0100 Subject: [PATCH 22/65] Fixed api test --- pandas/tests/tslibs/test_api.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/tslibs/test_api.py b/pandas/tests/tslibs/test_api.py index 755ac3d144246..10446a2e88288 100644 --- a/pandas/tests/tslibs/test_api.py +++ b/pandas/tests/tslibs/test_api.py @@ -17,6 +17,7 @@ def test_namespace(): "parsing", "period", "strptime", + "strftime", "vectorized", "timedeltas", "timestamps", @@ -33,6 +34,8 @@ def test_namespace(): "OutOfBoundsDatetime", "OutOfBoundsTimedelta", "Period", + "convert_strftime_format", + "UnsupportedStrFmtDirective", "IncompatibleFrequency", "Resolution", "Tick", From 087425f63b328f6e087d043f973f8d67343371f3 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 2 Mar 2022 22:16:59 +0100 Subject: [PATCH 23/65] Attempt to fix cython related uninitialized variable "quarter" warning --- pandas/_libs/tslibs/period.pyx | 3 +++ 1 file changed, 3 insertions(+) 
diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 82aafbc292714..59a880d301097 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1310,6 +1310,9 @@ cdef str _period_strftime(int64_t value, int freq, bytes fmt): if found_pat[0] or found_pat[1] or found_pat[2]: # get the quarter and modify the year in-place (fiscal?) quarter = get_yq(value, freq, &dts) + else: + # initialize it still, otherwise generated c code has an error + quarter = 0 for i in range(len(extra_fmts)): if found_pat[i]: From 98d4cb4fca1fc15a375e7aaccc1f3878e456b389 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 4 Mar 2022 15:59:31 +0100 Subject: [PATCH 24/65] New `Timestamp.fast_strftime`, leveraged by `format_array_from_datetime`. Fixes issue related to timezone-aware timestamp formatting. Added corresponding tests. --- pandas/_libs/tslib.pyx | 19 ++++++++++++------- pandas/_libs/tslibs/timestamps.pyx | 19 +++++++++++++++++++ pandas/tests/io/formats/test_format.py | 18 ++++++++++++++++++ 3 files changed, 49 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 8b1476a62ef0a..c53595170f4af 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -199,14 +199,19 @@ def format_array_from_datetime( elif fast_strftime: - dt64_to_dtstruct(val, &dts) - - # Use string formatting for faster strftime - result[i] = str_format % dict( - year=dts.year, month=dts.month, day=dts.day, hour=dts.hour, - min=dts.min, sec=dts.sec, us=dts.us - ) + if tz is None: + dt64_to_dtstruct(val, &dts) + + # Use string formatting for faster strftime + result[i] = str_format % dict( + year=dts.year, month=dts.month, day=dts.day, hour=dts.hour, + min=dts.min, sec=dts.sec, us=dts.us + ) + else: + ts = Timestamp(val, tz=tz) + # Use string formatting for faster strftime + result[i] = ts.fast_strftime(str_format) else: ts = Timestamp(val, tz=tz) diff --git a/pandas/_libs/tslibs/timestamps.pyx 
b/pandas/_libs/tslibs/timestamps.pyx index 27697da3ed88c..829da3f23107f 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1195,6 +1195,25 @@ class Timestamp(_Timestamp): tz = maybe_get_tz(tz) return cls(datetime.fromtimestamp(ts, tz)) + def fast_strftime(self, fmt_str: str) -> str: + """A faster alternative to `strftime` using string formatting. + + `fmt_str` should be created using `convert_strftime_format(fmt)`. + + See also `self.strftime`, that relies on `datetime.strftime`. + + Examples + -------- + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') + >>> fmt = convert_strftime_format('%Y-%m-%d %X') + >>> ts.fast_strftime(fmt) + '2020-03-14 15:32:52' + """ + return fmt_str % dict( + year=self.year, month=self.month, day=self.day, hour=self.hour, + min=self.minute, sec=self.second, us=self.microsecond + ) + def strftime(self, format): """ Timestamp.strftime(format) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 190ac6397fd4e..1b4edf11736d5 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3175,6 +3175,24 @@ def test_datetime(self): assert formatted[0] == "2003-01-01 12:00:00" assert formatted[1] == "NaT" + def test_datetime_tz(self): + # This timestamp is in 2013 in Europe/Paris but is 2012 in UTC + dt = pd.to_datetime(["2013-01-01 00:00:00+01:00"], utc=True) + # If tz is currently set as utc, we'll see 2012 + assert dt.format()[0] == "2012-12-31 23:00:00+00:00" + # If tz is currently set as paris, we'll see 2013 + dt = dt.tz_convert("Europe/Paris") + assert dt.format()[0] == "2013-01-01 00:00:00+01:00" + + def test_datetime_tz_custom(self): + # This timestamp is in 2013 in Europe/Paris but is 2012 in UTC + dt = pd.to_datetime(["2013-01-01 00:00:00+01:00"], utc=True) + # If tz is currently set as utc, we'll see 2012 + assert dt.format(date_format="%Y-%m-%d__foo__%H:%M:%S")[0] == "2012-12-31__foo__23:00:00" + # If 
tz is currently set as paris, we'll see 2013 + dt = dt.tz_convert("Europe/Paris") + assert dt.format(date_format="%Y-%m-%d__foo__%H:%M:%S")[0] == "2013-01-01__foo__00:00:00" + def test_date(self): formatted = pd.to_datetime([datetime(2003, 1, 1), NaT]).format() assert formatted[0] == "2003-01-01" From 93c8f9320f158deacf6eef2a599c2f676ba6f90b Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 4 Mar 2022 16:01:39 +0100 Subject: [PATCH 25/65] Fixed last failing test --- pandas/tests/io/formats/test_format.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 1b4edf11736d5..ab1693ca09eff 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3354,18 +3354,15 @@ def test_fast_strftime_perf(self, strftime_format): assert old_style_best < new_style_best # even better ! def test_bad_strftime_directive(self): - """Test which kind of error is output in case of bad `date_format` directive.""" + """Test what happens in case of bad `date_format` directive.""" - # TODO make sure that it does not become very hard for users to understand x = Series(date_range("20130101 09:00:00", periods=5, freq="us")) # This does not raise any error, while %D is not a correct directive ! x.dt.strftime(date_format="%Y-%M-%D___", fast_strftime=False) - # This raises a `TypeError: not enough arguments for format string` - with pytest.raises(TypeError): - x.dt.strftime(date_format="%Y-%M-%D___") - # TODO raise a more readable error ? 
+ # We align with the same behaviour + x.dt.strftime(date_format="%Y-%M-%D___") @pytest.mark.parametrize("date_format", ( # note: "%Y-%m-%d %H:%M:%S and "%Y-%m-%d %H:%M:%S.%f are always accelerated (hardcoded) From 7b74bd24293b97bb8ee08b575cb8922e36529b15 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 4 Mar 2022 17:15:34 +0100 Subject: [PATCH 26/65] Fixed `test_missing_public_nat_methods` --- pandas/tests/scalar/test_nat.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 873103b01f64d..2d7376f5b2361 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -190,7 +190,17 @@ def test_nat_iso_format(get_nat): @pytest.mark.parametrize( "klass,expected", [ - (Timestamp, ["freqstr", "normalize", "to_julian_date", "to_period", "tz"]), + ( + Timestamp, + [ + "fast_strftime", + "freqstr", + "normalize", + "to_julian_date", + "to_period", + "tz" + ] + ), ( Timedelta, [ From c093e863ff01627a4d52ea6b909ed2bfa643a898 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 4 Mar 2022 17:52:14 +0100 Subject: [PATCH 27/65] Blackened --- pandas/_libs/tslibs/strftime.py | 34 ++++++----- pandas/core/arrays/datetimelike.py | 12 +++- pandas/core/arrays/datetimes.py | 5 +- pandas/core/indexes/datetimelike.py | 14 ++++- pandas/core/indexes/datetimes.py | 2 +- pandas/core/tools/datetimes.py | 9 ++- pandas/io/formats/format.py | 22 ++++--- pandas/tests/io/formats/test_format.py | 84 ++++++++++++++++++-------- pandas/tests/scalar/test_nat.py | 2 +- 9 files changed, 125 insertions(+), 59 deletions(-) diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index 0c5fff80a9e09..f962e1d4e230d 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -3,8 +3,7 @@ class UnsupportedStrFmtDirective(ValueError): - """The format contains a directive that is not supported in this context. 
- """ + """The format contains a directive that is not supported in this context.""" _COMMON_UNSUPPORTED = ( @@ -32,25 +31,28 @@ class UnsupportedStrFmtDirective(ValueError): _COMMON_MAP = { - "%d": ("day", "02d"), # Day of the month as a zero-padded decimal number. + "%d": ("day", "02d"), # Day of the month as a zero-padded decimal number. "%m": ("month", "02d"), # Month as a zero-padded decimal number. - "%Y": ("year", "d"), # Year with century as a decimal number. - "%H": ("hour", "02d"), # Hour (24-hour clock) as a zero-padded decimal number. - "%M": ("min", "02d"), # Minute as a zero-padded decimal number. - "%S": ("sec", "02d"), # Second as a zero-padded decimal number. + "%Y": ("year", "d"), # Year with century as a decimal number. + "%H": ("hour", "02d"), # Hour (24-hour clock) as a zero-padded decimal number. + "%M": ("min", "02d"), # Minute as a zero-padded decimal number. + "%S": ("sec", "02d"), # Second as a zero-padded decimal number. } _DATETIME_MAP = { - "%f": ("us", "06d"), # Microsecond as a decimal number, zero-padded to 6 digits. + "%f": ("us", "06d"), # Microsecond as a decimal number, zero-padded to 6 digits. } _PERIOD_MAP = { - "%f": ("fyear", "02d"), # 'Fiscal' year without century as zero-padded decimal number [00,99] - "%F": ("Fyear", "d"), # 'Fiscal' year with century as a decimal number - "%q": ("q", "d"), # Quarter as a decimal number [1,4] - "%l": ("ms", "03d"), # Microsecond as a decimal number, zero-padded to 3 digits. - "%u": ("us", "06d"), # Microsecond as a decimal number, zero-padded to 6 digits. - "%n": ("ns", "09d"), # Microsecond as a decimal number, zero-padded to 9 digits. + "%f": ( + "fyear", + "02d", + ), # 'Fiscal' year without century as zero-padded decimal number [00,99] + "%F": ("Fyear", "d"), # 'Fiscal' year with century as a decimal number + "%q": ("q", "d"), # Quarter as a decimal number [1,4] + "%l": ("ms", "03d"), # Microsecond as a decimal number, zero-padded to 3 digits. 
+ "%u": ("us", "06d"), # Microsecond as a decimal number, zero-padded to 6 digits. + "%n": ("ns", "09d"), # Microsecond as a decimal number, zero-padded to 9 digits. } @@ -153,7 +155,9 @@ def convert_strftime_format( for _map in directive_maps: for key, (_name, _fmt) in _map.items(): # for example replace "%d" by "{day:02d}" but with escaped { and } - strftime_fmt = strftime_fmt.replace(key, f"{esc_l}{_name}:{_fmt}{esc_r}") + strftime_fmt = strftime_fmt.replace( + key, f"{esc_l}{_name}:{_fmt}{esc_r}" + ) # Restore the %% into % strftime_fmt = strftime_fmt.replace(esc, "%") diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 2dc97b8b5023c..3fb8f7f22bbe2 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -294,7 +294,9 @@ def asi8(self) -> npt.NDArray[np.int64]: # ---------------------------------------------------------------- # Rendering Methods - def _format_native_types(self, *, na_rep="NaT", date_format=None, fast_strftime=True): + def _format_native_types( + self, *, na_rep="NaT", date_format=None, fast_strftime=True + ): """ Helper method for astype when converting to strings. @@ -1589,7 +1591,9 @@ class DatelikeOps(DatetimeLikeArrayMixin): URL="https://docs.python.org/3/library/datetime.html" "#strftime-and-strptime-behavior" ) - def strftime(self, date_format: str, fast_strftime: bool = True) -> npt.NDArray[np.object_]: + def strftime( + self, date_format: str, fast_strftime: bool = True + ) -> npt.NDArray[np.object_]: """ Convert to Index using specified date_format. 
@@ -1630,7 +1634,9 @@ def strftime(self, date_format: str, fast_strftime: bool = True) -> npt.NDArray[ 'March 10, 2018, 09:00:02 AM'], dtype='object') """ - result = self._format_native_types(date_format=date_format, na_rep=np.nan, fast_strftime=fast_strftime) + result = self._format_native_types( + date_format=date_format, na_rep=np.nan, fast_strftime=fast_strftime + ) return result.astype(object, copy=False) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 3100c201d85ac..1f2328e21fb39 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -680,7 +680,10 @@ def _format_native_types( fmt = get_format_datetime64_from_values(self, date_format) return tslib.format_array_from_datetime( - self.asi8, tz=self.tz, format=fmt, na_rep=na_rep, + self.asi8, + tz=self.tz, + format=fmt, + na_rep=na_rep, fast_strftime=fast_strftime, ) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index db961acdc9919..d6f67f360f2c9 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -176,14 +176,22 @@ def format( if formatter is not None: return header + list(self.map(formatter)) - return self._format_with_header(header, na_rep=na_rep, date_format=date_format, fast_strftime=fast_strftime) + return self._format_with_header( + header, na_rep=na_rep, date_format=date_format, fast_strftime=fast_strftime + ) def _format_with_header( - self, header: list[str], na_rep: str = "NaT", date_format: str | None = None, fast_strftime: bool = True, + self, + header: list[str], + na_rep: str = "NaT", + date_format: str | None = None, + fast_strftime: bool = True, ) -> list[str]: # matches base class except for whitespace padding and date_format return header + list( - self._format_native_types(na_rep=na_rep, date_format=date_format, fast_strftime=fast_strftime) + self._format_native_types( + na_rep=na_rep, date_format=date_format, fast_strftime=fast_strftime + ) ) 
@property diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index b827f05731086..1e96b7d393cdd 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -259,7 +259,7 @@ class DatetimeIndex(DatetimeTimedeltaMixin): # methods that dispatch to DatetimeArray and wrap result @doc(DatetimeArray.strftime) - def strftime(self, date_format, fast_strftime : bool = True) -> Index: + def strftime(self, date_format, fast_strftime: bool = True) -> Index: arr = self._data.strftime(date_format, fast_strftime=fast_strftime) return Index(arr, name=self.name) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 4b89c766d68f4..c45f4af4f4535 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -1316,8 +1316,13 @@ def fast_strftime( """ # common dict used for formatting fmt_dct = dict( - year=dt.year, month=dt.month, day=dt.day, hour=dt.hour, - min=dt.minute, sec=dt.second, us=dt.microsecond + year=dt.year, + month=dt.month, + day=dt.day, + hour=dt.hour, + min=dt.minute, + sec=dt.second, + us=dt.microsecond, ) # get the formatting template diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index a2d56c3149d19..eb10994fb1db6 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1634,8 +1634,9 @@ def _format_strings(self) -> list[str]: return [self.formatter(x) for x in values] fmt_values = values._data._format_native_types( - na_rep=self.nat_rep, date_format=self.date_format, - fast_strftime=self.fast_strftime + na_rep=self.nat_rep, + date_format=self.date_format, + fast_strftime=self.fast_strftime, ) return fmt_values.tolist() @@ -1780,8 +1781,13 @@ def _format_datetime64_dateonly( if str_date_fmt: # Faster, using string formatting return str_date_fmt % dict( - year=x.year, month=x.month, day=x.day, hour=x.hour, - min=x.min, sec=x.sec, us=x.us + year=x.year, + month=x.month, + day=x.day, + hour=x.hour, + min=x.min, + 
sec=x.sec, + us=x.us, ) else: # Slower @@ -1794,7 +1800,10 @@ def _format_datetime64_dateonly( def get_format_datetime64( - is_dates_only: bool, nat_rep: str = "NaT", date_format: str | None = None, fast_strftime : bool = True, + is_dates_only: bool, + nat_rep: str = "NaT", + date_format: str | None = None, + fast_strftime: bool = True, ) -> Callable: """Return a formatter callable taking a datetime64 as input and providing a string as output""" @@ -1810,8 +1819,7 @@ def get_format_datetime64( pass return lambda x: _format_datetime64_dateonly( - x, nat_rep=nat_rep, date_format=date_format, - str_date_fmt=str_date_fmt + x, nat_rep=nat_rep, date_format=date_format, str_date_fmt=str_date_fmt ) else: return lambda x: _format_datetime64(x, nat_rep=nat_rep) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index ab1693ca09eff..79a88206091ae 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3188,10 +3188,16 @@ def test_datetime_tz_custom(self): # This timestamp is in 2013 in Europe/Paris but is 2012 in UTC dt = pd.to_datetime(["2013-01-01 00:00:00+01:00"], utc=True) # If tz is currently set as utc, we'll see 2012 - assert dt.format(date_format="%Y-%m-%d__foo__%H:%M:%S")[0] == "2012-12-31__foo__23:00:00" + assert ( + dt.format(date_format="%Y-%m-%d__foo__%H:%M:%S")[0] + == "2012-12-31__foo__23:00:00" + ) # If tz is currently set as paris, we'll see 2013 dt = dt.tz_convert("Europe/Paris") - assert dt.format(date_format="%Y-%m-%d__foo__%H:%M:%S")[0] == "2013-01-01__foo__00:00:00" + assert ( + dt.format(date_format="%Y-%m-%d__foo__%H:%M:%S")[0] + == "2013-01-01__foo__00:00:00" + ) def test_date(self): formatted = pd.to_datetime([datetime(2003, 1, 1), NaT]).format() @@ -3279,12 +3285,15 @@ class TestDatetimeFastFormatter: kind of date/time objects. 
""" - @pytest.mark.parametrize("strftime_format", ( - "%Y-%m-%d %H:%M:%S", - "%Y%%Y", - "%Y{%Y}", - "%Y-%m-%dT%H:%M:%S.%fZ", - )) + @pytest.mark.parametrize( + "strftime_format", + ( + "%Y-%m-%d %H:%M:%S", + "%Y%%Y", + "%Y{%Y}", + "%Y-%m-%dT%H:%M:%S.%fZ", + ), + ) @pytest.mark.parametrize("new_style", (False, True)) def test_fast_strftime_basic(self, strftime_format, new_style): """Test that formatting standard `datetime` objects with our utils works as good as strftime.""" @@ -3297,8 +3306,13 @@ def test_fast_strftime_basic(self, strftime_format, new_style): # common dict used for formatting fmt_dct = dict( - year=dt.year, month=dt.month, day=dt.day, hour=dt.hour, - min=dt.minute, sec=dt.second, us=dt.microsecond + year=dt.year, + month=dt.month, + day=dt.day, + hour=dt.hour, + min=dt.minute, + sec=dt.second, + us=dt.microsecond, ) # get the formatting string @@ -3315,11 +3329,14 @@ def test_fast_strftime_basic(self, strftime_format, new_style): res2 = fast_strftime(dt, strftime_format, new_style_fmt=new_style) assert res2 == res - @pytest.mark.parametrize("strftime_format", ( - "%Y-%m-%d %H:%M:%S", - "%Y %Y", - "%Y-%m-%dT%H:%M:%S.%fZ", - )) + @pytest.mark.parametrize( + "strftime_format", + ( + "%Y-%m-%d %H:%M:%S", + "%Y %Y", + "%Y-%m-%dT%H:%M:%S.%fZ", + ), + ) def test_fast_strftime_perf(self, strftime_format): """Test that formatting standard `datetime` objects with our utils is faster than strftime.""" @@ -3332,8 +3349,13 @@ def test_fast_strftime_perf(self, strftime_format): # Perf comparison: confirm the results in https://stackoverflow.com/a/43495629/7262247 fmt_dct = dict( - year=dt.year, month=dt.month, day=dt.day, hour=dt.hour, - min=dt.minute, sec=dt.second, us=dt.microsecond + year=dt.year, + month=dt.month, + day=dt.day, + hour=dt.hour, + min=dt.minute, + sec=dt.second, + us=dt.microsecond, ) glob = globals() @@ -3350,7 +3372,7 @@ def test_fast_strftime_perf(self, strftime_format): Timer("old_style_format % fmt_dct", globals=glob).repeat(7, 1000) ) # 
Out[5]: 0.0030 - assert new_style_best < strftime_best # much better + assert new_style_best < strftime_best # much better assert old_style_best < new_style_best # even better ! def test_bad_strftime_directive(self): @@ -3364,12 +3386,15 @@ def test_bad_strftime_directive(self): # We align with the same behaviour x.dt.strftime(date_format="%Y-%M-%D___") - @pytest.mark.parametrize("date_format", ( - # note: "%Y-%m-%d %H:%M:%S and "%Y-%m-%d %H:%M:%S.%f are always accelerated (hardcoded) - "%Y-%m-%d__foo__%H:%M:%S", - "%Y %Y", - "%Y-%m-%dT%H:%M:%S.%fZ", - )) + @pytest.mark.parametrize( + "date_format", + ( + # note: "%Y-%m-%d %H:%M:%S and "%Y-%m-%d %H:%M:%S.%f are always accelerated (hardcoded) + "%Y-%m-%d__foo__%H:%M:%S", + "%Y %Y", + "%Y-%m-%dT%H:%M:%S.%fZ", + ), + ) def test_perf_datetime64_strftime(self, date_format): x = Series(date_range("20130101 09:00:00", periods=100, freq="min")) # res = x.dt.strftime(date_format=date_format) @@ -3377,8 +3402,15 @@ def test_perf_datetime64_strftime(self, date_format): glob = globals() glob.update(locals()) - fast_best = min(Timer("x.dt.strftime(date_format=date_format)", globals=glob).repeat(3, 100)) - strftime_best = min(Timer("x.dt.strftime(date_format=date_format, fast_strftime=False)", globals=glob).repeat(3, 100)) + fast_best = min( + Timer("x.dt.strftime(date_format=date_format)", globals=glob).repeat(3, 100) + ) + strftime_best = min( + Timer( + "x.dt.strftime(date_format=date_format, fast_strftime=False)", + globals=glob, + ).repeat(3, 100) + ) assert fast_best < strftime_best # much better # How many alternative are worth doing here ? 
diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 2d7376f5b2361..40f6bbd3ccc50 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -198,7 +198,7 @@ def test_nat_iso_format(get_nat): "normalize", "to_julian_date", "to_period", - "tz" + "tz", ] ), ( From 2685aff3d21717d506b1789e2f36421e4338d2e4 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 4 Mar 2022 17:57:53 +0100 Subject: [PATCH 28/65] Fixed doctests --- pandas/_libs/tslibs/period.pyx | 5 +++-- pandas/_libs/tslibs/timestamps.pyx | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 59a880d301097..a824a30118429 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2395,17 +2395,18 @@ cdef class _Period(PeriodMixin): def fast_strftime(self, fmt_str: str) -> str: """A faster alternative to `strftime` using string formatting. - `fmt_str` should be created using `convert_period_format(fmt)`. + `fmt_str` should be created using `convert_strftime_format(fmt)`. See also `self.strftime`, that relies on `period_format`. 
Examples -------- + >>> from pandas._libs.tslibs import convert_strftime_format >>> a = Period(freq='Q-JUL', year=2006, quarter=1) >>> a.strftime('%F-Q%q') '2006-Q1' - >>> fast_fmt = convert_period_format('%F-Q%q') + >>> fast_fmt = convert_strftime_format('%F-Q%q', target="period") >>> a.fast_strftime(fast_fmt) '2006-Q1' """ diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 829da3f23107f..b792bca897547 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1204,6 +1204,7 @@ class Timestamp(_Timestamp): Examples -------- + >>> from pandas._libs.tslibs import convert_strftime_format >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') >>> fmt = convert_strftime_format('%Y-%m-%d %X') >>> ts.fast_strftime(fmt) From 2c23f48ef23d650ae62642ca5f192b6ed0fe986f Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 7 Mar 2022 11:34:07 +0100 Subject: [PATCH 29/65] Fixed doctest example --- pandas/_libs/tslibs/timestamps.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index b792bca897547..c7e36c65666ab 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1206,9 +1206,9 @@ class Timestamp(_Timestamp): -------- >>> from pandas._libs.tslibs import convert_strftime_format >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') - >>> fmt = convert_strftime_format('%Y-%m-%d %X') + >>> fmt = convert_strftime_format('%Y-%m-%dT%H:%M:%S') >>> ts.fast_strftime(fmt) - '2020-03-14 15:32:52' + '2020-03-14T15:32:52' """ return fmt_str % dict( year=self.year, month=self.month, day=self.day, hour=self.hour, From 1e17df6eab4b2fc4dc6f1f33783a674f8a28bc9f Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 7 Mar 2022 17:30:46 +0100 Subject: [PATCH 30/65] Now supporting %y (short year), %I (12h clock hour) and %p (AM/PM). Note that the latter is locale-specific... 
Fixed Period.strftime %l %m and %n : Fixed #46252 --- pandas/_libs/tslib.pyx | 20 +++++++--- pandas/_libs/tslibs/period.pyx | 39 ++++++++++++------- pandas/_libs/tslibs/strftime.py | 27 +++++++------ pandas/_libs/tslibs/timestamps.pyx | 18 +++++++-- pandas/core/tools/datetimes.py | 23 +++++++----- pandas/tests/io/formats/test_format.py | 52 ++++++++++++++++++++++++++ 6 files changed, 134 insertions(+), 45 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index c53595170f4af..40e1238f85051 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -128,7 +128,7 @@ def format_array_from_datetime( np.ndarray[object] """ cdef: - int64_t val, ns, N = len(values) + int64_t val, ns, y, h, N = len(values) ndarray[int64_t] consider_values bint show_ms = False, show_us = False, show_ns = False bint basic_format = False @@ -203,10 +203,20 @@ def format_array_from_datetime( dt64_to_dtstruct(val, &dts) # Use string formatting for faster strftime - result[i] = str_format % dict( - year=dts.year, month=dts.month, day=dts.day, hour=dts.hour, - min=dts.min, sec=dts.sec, us=dts.us - ) + y = dts.year + h = dts.hour + result[i] = str_format % { + "year": y, + "shortyear": y % 100, + "month": dts.month, + "day": dts.day, + "hour": dts.hour, + "hour12": 12 if h in (0, 12) else (h % 12), + "ampm": "PM" if (h // 12) else "AM", + "min": dts.min, + "sec": dts.sec, + "us": dts.us, + } else: ts = Timestamp(val, tz=tz) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index a824a30118429..bb1316d57d380 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1251,19 +1251,19 @@ cdef str _period_fast_strftime(int64_t value, int freq): # fmt = b'%Y-%m-%d %H:%M:%S.%l' return (f"{dts.year}-{dts.month:02d}-{dts.day:02d} " f"{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}" - f".{(value % 1_000):03d}") + f".{(dts.us // 1_000):03d}") elif freq_group == FR_US: # fmt = b'%Y-%m-%d %H:%M:%S.%u' return 
(f"{dts.year}-{dts.month:02d}-{dts.day:02d} " f"{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}" - f".{(value % 1_000_000):06d}") + f".{(dts.us):06d}") elif freq_group == FR_NS: # fmt = b'%Y-%m-%d %H:%M:%S.%n' return (f"{dts.year}-{dts.month:02d}-{dts.day:02d} " f"{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}" - f".{(value % 1_000_000_000):09d}") + f".{((dts.us * 1000) + (dts.ps // 1000)):09d}") else: raise ValueError(f"Unknown freq: {freq}") @@ -1323,11 +1323,11 @@ cdef str _period_strftime(int64_t value, int freq, bytes fmt): elif i == 2: # %F, 'Fiscal' year with a century repl = str(dts.year) elif i == 3: # %l, milliseconds - repl = f"{(value % 1_000):03d}" + repl = f"{(dts.us // 1_000):03d}" elif i == 4: # %u, microseconds - repl = f"{(value % 1_000_000):06d}" + repl = f"{(dts.us):06d}" elif i == 5: # %n, nanoseconds - repl = f"{(value % 1_000_000_000):09d}" + repl = f"{((dts.us * 1000) + (dts.ps // 1000)):09d}" result = result.replace(str_extra_fmts[i], repl) @@ -2418,7 +2418,7 @@ cdef class _Period(PeriodMixin): cdef: npy_datetimestruct dts, dts2 - int quarter + int quarter, y, h # Fill dts with all fields get_date_info(value, freq, &dts) @@ -2427,12 +2427,25 @@ cdef class _Period(PeriodMixin): quarter = get_yq(value, freq, &dts2) # Finally use the string template - return fmt_str % dict( - year=dts.year, month=dts.month, day=dts.day, hour=dts.hour, - min=dts.min, sec=dts.sec, - ms=dts.us // 1000, us=dts.us, ns=dts.us * 1000, - q=quarter, Fyear=dts2.year, fyear=dts2.year % 100 - ) + y = dts.year + h = dts.hour + return fmt_str % { + "year": y, + "shortyear": y % 100, + "month": dts.month, + "day": dts.day, + "hour": h, + "hour12": 12 if h in (0, 12) else (h % 12), + "ampm": "PM" if (h // 12) else "AM", + "min": dts.min, + "sec": dts.sec, + "ms": dts.us // 1000, + "us": dts.us, + "ns": (dts.us * 1000) + (dts.ps // 1000), + "q": quarter, + "Fyear": dts2.year, + "fyear": dts2.year % 100, + } def strftime(self, fmt: str) -> str: r""" diff --git 
a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index f962e1d4e230d..79a5a82864bbc 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -7,20 +7,16 @@ class UnsupportedStrFmtDirective(ValueError): _COMMON_UNSUPPORTED = ( - # All of these below are names and therefore are not in the numpy or datetime attr representation + # 1- Names not in the numpy or datetime attr representation "%a", # Weekday as locale’s abbreviated name. "%A", # Weekday as locale’s full name. "%w", # Weekday as a decimal number, where 0 is Sunday and 6 is Saturday. "%b", # Month as locale’s abbreviated name. "%B", # Month as locale’s full name. - # TODO All of those below can probably be derived easily from the numbers on the instance - "%y", # Year without century as a zero-padded decimal number. >> year % 100 - "%I", # Hour (12-hour clock) as a zero-padded decimal number. >> hour % 12 - "%p", # Locale’s equivalent of either AM or PM. >> "pm" if (hour // 12) else "am" - # TODO Below Time offset and timezone information ... but may be hard - "%z", # UTC offset in the form ±HHMM[SS[.ffffff]] (empty string if the object is naive). - "%Z", # Time zone name (empty string if the object is naive). - # We do not want to enter into these below, we do not want to re-create the datetime implementation + # 2- TODO Below Time offset and timezone information ... but may be hard + "%z", # UTC offset in the form ±HHMM[SS[.ffffff]] ("" if tz naive). + "%Z", # Time zone name ("" if tz naive). + # 3- Probably too complex ones for now "%j", # Day of the year as a zero-padded decimal number. "%U", # Week number of the year (Sunday as the first day of the week) as a zero-padded decimal number. All days in a new year preceding the first Sunday are considered to be in week 0. "%W", # Week number of the year (Monday as the first day of the week) as a zero-padded decimal number. All days in a new year preceding the first Monday are considered to be in week 0. 
@@ -34,13 +30,16 @@ class UnsupportedStrFmtDirective(ValueError): "%d": ("day", "02d"), # Day of the month as a zero-padded decimal number. "%m": ("month", "02d"), # Month as a zero-padded decimal number. "%Y": ("year", "d"), # Year with century as a decimal number. - "%H": ("hour", "02d"), # Hour (24-hour clock) as a zero-padded decimal number. + "%y": ("shortyear", "02d"), # Year without century as 0-padded decimal nb. + "%H": ("hour", "02d"), # Hour (24-hour clock) as 0-padded decimal number. + "%I": ("hour12", "02d"), # Hour (12-hour clock) as a 0-padded decimal nb. + "%p": ("ampm", "s"), # Locale’s equivalent of either AM or PM. "%M": ("min", "02d"), # Minute as a zero-padded decimal number. "%S": ("sec", "02d"), # Second as a zero-padded decimal number. } _DATETIME_MAP = { - "%f": ("us", "06d"), # Microsecond as a decimal number, zero-padded to 6 digits. + "%f": ("us", "06d"), # Microsecond as decimal number, 0-padded to 6 digits } _PERIOD_MAP = { @@ -50,9 +49,9 @@ class UnsupportedStrFmtDirective(ValueError): ), # 'Fiscal' year without century as zero-padded decimal number [00,99] "%F": ("Fyear", "d"), # 'Fiscal' year with century as a decimal number "%q": ("q", "d"), # Quarter as a decimal number [1,4] - "%l": ("ms", "03d"), # Microsecond as a decimal number, zero-padded to 3 digits. - "%u": ("us", "06d"), # Microsecond as a decimal number, zero-padded to 6 digits. - "%n": ("ns", "09d"), # Microsecond as a decimal number, zero-padded to 9 digits. 
+ "%l": ("ms", "03d"), # Millisecond as decimal number, 0-padded 3 digits + "%u": ("us", "06d"), # Microsecond as decimal number, 0-padded 6 digits + "%n": ("ns", "09d"), # Nanosecond as decimal number, 0-padded 9 digits } diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index c7e36c65666ab..d186ed7287bdb 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1210,10 +1210,20 @@ class Timestamp(_Timestamp): >>> ts.fast_strftime(fmt) '2020-03-14T15:32:52' """ - return fmt_str % dict( - year=self.year, month=self.month, day=self.day, hour=self.hour, - min=self.minute, sec=self.second, us=self.microsecond - ) + y = self.year + h = self.hour + return fmt_str % { + "year": y, + "shortyear": y % 100, + "month": self.month, + "day": self.day, + "hour": h, + "hour12": 12 if h in (0, 12) else (h % 12), + "ampm": "PM" if (h // 12) else "AM", + "min": self.minute, + "sec": self.second, + "us": self.microsecond, + } def strftime(self, format): """ diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index c45f4af4f4535..cab64f717cadd 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -1315,15 +1315,20 @@ def fast_strftime( `datetime.strftime`. 
""" # common dict used for formatting - fmt_dct = dict( - year=dt.year, - month=dt.month, - day=dt.day, - hour=dt.hour, - min=dt.minute, - sec=dt.second, - us=dt.microsecond, - ) + y = dt.year + h = dt.hour + fmt_dct = { + "year": y, + "shortyear": y % 100, + "month": dt.month, + "day": dt.day, + "hour": h, + "hour12": 12 if h in (0, 12) else (h % 12), + "ampm": "PM" if (h // 12) else "AM", + "min": dt.minute, + "sec": dt.second, + "us": dt.microsecond, + } # get the formatting template fmt_str = convert_strftime_format(fmt, new_style_fmt=new_style_fmt) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 79a88206091ae..797b29bfd1169 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3169,6 +3169,58 @@ def test_str(self): assert str(NaT) == "NaT" +class TestPeriodIndexFormat: + + def test_period(self): + p = pd.PeriodIndex([datetime(2003, 1, 1, 12), None], freq='H') + formatted = p.format() + assert formatted[0] == "2003-01-01 12:00" # default: minutes not shown + assert formatted[1] == "NaT" + + p = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, + freq="n") + formatted = p.format() + assert formatted[0] == '2003-01-01 12:01:01.123456789' + assert formatted[1] == '2003-01-01 12:01:01.123456790' + + @pytest.mark.parametrize("fast_strftime", (False, True)) + def test_period_custom(self, fast_strftime): + # GH46252 + p = pd.period_range("2003-01-01 12:01:01.123", periods=2, freq="l") + formatted = p.format(date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", + fast_strftime=fast_strftime) + assert formatted[0] == "03 12:01:01PM (ms=123 us=123000 ns=123000000)" + assert formatted[1] == "03 12:01:01PM (ms=124 us=124000 ns=124000000)" + + p = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="n") + formatted = p.format(date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", + fast_strftime=fast_strftime) + assert formatted[0] == "03 12:01:01PM (ms=123 
us=123456 ns=123456789)" + assert formatted[1] == "03 12:01:01PM (ms=123 us=123456 ns=123456790)" + + p = pd.period_range("2003-01-01 12:01:01.123456", periods=2, freq="u") + formatted = p.format(date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", + fast_strftime=fast_strftime) + assert formatted[0] == "03 12:01:01PM (ms=123 us=123456 ns=123456000)" + assert formatted[1] == "03 12:01:01PM (ms=123 us=123457 ns=123457000)" + + def test_period_tz(self): + """Test formatting periods created from a datetime with timezone""" + + # This timestamp is in 2013 in Europe/Paris but is 2012 in UTC + dt = pd.to_datetime(["2013-01-01 00:00:00+01:00"], utc=True) + + # Converting to a period loses the timezone information + # Since tz is currently set as utc, we'll see 2012 + p = dt.to_period(freq="H") + assert p.format()[0] == "2012-12-31 23:00" + + # If tz is currently set as paris before conversion, we'll see 2013 + dt = dt.tz_convert("Europe/Paris") + p = dt.to_period(freq="H") + assert p.format()[0] == "2013-01-01 00:00" + + class TestDatetimeIndexFormat: def test_datetime(self): formatted = pd.to_datetime([datetime(2003, 1, 1, 12), NaT]).format() From 1c06d5e04628756a14220b371509c3105049c39c Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 7 Mar 2022 17:32:29 +0100 Subject: [PATCH 31/65] Flake8 --- pandas/_libs/tslibs/strftime.py | 12 ++++++++---- pandas/core/tools/datetimes.py | 18 ++++++++---------- pandas/tests/io/formats/test_format.py | 22 +++++++++++++++++++--- pandas/tests/scalar/test_nat.py | 2 +- 4 files changed, 36 insertions(+), 18 deletions(-) diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index 79a5a82864bbc..c3b49af694e8a 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -18,8 +18,12 @@ class UnsupportedStrFmtDirective(ValueError): "%Z", # Time zone name ("" if tz naive). # 3- Probably too complex ones for now "%j", # Day of the year as a zero-padded decimal number. 
- "%U", # Week number of the year (Sunday as the first day of the week) as a zero-padded decimal number. All days in a new year preceding the first Sunday are considered to be in week 0. - "%W", # Week number of the year (Monday as the first day of the week) as a zero-padded decimal number. All days in a new year preceding the first Monday are considered to be in week 0. + "%U", # Week number of the year (Sunday as the first day of the week) as + # a zero-padded decimal number. All days in a new year preceding the first + # Sunday are considered to be in week 0. + "%W", # Week number of the year (Monday as the first day of the week) as + # a zero-padded decimal number. All days in a new year preceding the first + # Monday are considered to be in week 0. "%c", # Locale’s appropriate date and time representation. "%x", # Locale’s appropriate date representation. "%X", # Locale’s appropriate time representation. @@ -133,12 +137,12 @@ def convert_strftime_format( elif target == "period": directive_maps = (_COMMON_MAP, _PERIOD_MAP) else: - raise ValueError(f"Invalid target: {target!r}") + raise ValueError(f"Invalid target: {repr(target)}") # Raise if unsupported directive found in `strftime_fmt` for key in _COMMON_UNSUPPORTED: if key in strftime_fmt: - raise UnsupportedStrFmtDirective(f"Unsupported directive: {key!r}") + raise UnsupportedStrFmtDirective(f"Unsupported directive: '{key}'") # Mapping between strftime and string formatting, according to both styles if new_style_fmt: diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index cab64f717cadd..de747fe7cd44d 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -1279,10 +1279,8 @@ def fast_strftime( ) -> str: """A faster version of `datetime.strftime` using python string formatting. - Note: this is used in tests, not in the pandas API. - TODO since this makes especially a lot of sense for arrays, maybe this should work on arrays. 
- - Returns a string representing the date and time, controlled by an explicit format string. + Returns a string representing the date and time, controlled by an explicit + format string using named attributes of `dt`. For a complete list of formatting directives, see `strftime() and strptime() Behavior `_. @@ -1304,15 +1302,15 @@ def fast_strftime( Raises ------ UnsupportedStrFmtDirective - Raised when the received `strftime_fmt` format contains a directive for which the output - can not currently be created using string formatting. + Raised when the received `strftime_fmt` format contains a directive for + which the output can not currently be created using string formatting. See Also -------- - - `strftime format codes reference `_ - - `Stackoverflow post `_ explaining - how old-style formatting is faster than new-style formatting, itself faster than - `datetime.strftime`. + - `strftime format codes reference `_ # noqa + - `Stackoverflow post `_ + explaining how old-style formatting is faster than new-style formatting, + itself faster than `datetime.strftime`. 
""" # common dict used for formatting y = dt.year diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 797b29bfd1169..68aa5f126b688 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3239,17 +3239,29 @@ def test_datetime_tz(self): def test_datetime_tz_custom(self): # This timestamp is in 2013 in Europe/Paris but is 2012 in UTC dt = pd.to_datetime(["2013-01-01 00:00:00+01:00"], utc=True) + # If tz is currently set as utc, we'll see 2012 assert ( dt.format(date_format="%Y-%m-%d__foo__%H:%M:%S")[0] == "2012-12-31__foo__23:00:00" ) + # same with fancy format + assert ( + dt.format(date_format="20%y-%m-%d__foo__%I:%M:%S%p")[0] + == "2012-12-31__foo__11:00:00PM" + ) + # If tz is currently set as paris, we'll see 2013 dt = dt.tz_convert("Europe/Paris") assert ( dt.format(date_format="%Y-%m-%d__foo__%H:%M:%S")[0] == "2013-01-01__foo__00:00:00" ) + # same with fancy format + assert ( + dt.format(date_format="20%y-%m-%d__foo__%I:%M:%S%p")[0] + == "2013-01-01__foo__12:00:00AM" + ) def test_date(self): formatted = pd.to_datetime([datetime(2003, 1, 1), NaT]).format() @@ -3396,10 +3408,13 @@ def test_fast_strftime_perf(self, strftime_format): dt = datetime.now() # pre-compute the formats - new_style_format = convert_strftime_format(strftime_format, new_style_fmt=True) + new_style_format = convert_strftime_format( + strftime_format, new_style_fmt=True + ) old_style_format = convert_strftime_format(strftime_format) - # Perf comparison: confirm the results in https://stackoverflow.com/a/43495629/7262247 + # Perf comparison: confirm the results + # from https://stackoverflow.com/a/43495629/7262247 fmt_dct = dict( year=dt.year, month=dt.month, @@ -3417,7 +3432,8 @@ def test_fast_strftime_perf(self, strftime_format): ) # Out[3]: 0.0062 new_style_best = min( - Timer("new_style_format.format(**fmt_dct)", globals=glob).repeat(7, 1000) + Timer("new_style_format.format(**fmt_dct)", globals=glob) 
+ .repeat(7, 1000) ) # Out[4]: 0.0036 old_style_best = min( diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 40f6bbd3ccc50..2461739cdd876 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -199,7 +199,7 @@ def test_nat_iso_format(get_nat): "to_julian_date", "to_period", "tz", - ] + ], ), ( Timedelta, From c5827532f067c22dedce33cc7906a404d483a0ba Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 7 Mar 2022 17:36:28 +0100 Subject: [PATCH 32/65] Added `fast_strftime` argument to the csv-related tools (`DataFrame.to_csv`, `DataFrameRenderer.to_csv`, `CSVFormatter`) so that users can come back to the original method if needed. --- pandas/core/generic.py | 8 ++++++++ pandas/io/formats/csvs.py | 3 +++ pandas/io/formats/format.py | 3 +++ pandas/tests/frame/methods/test_to_csv.py | 22 +++++++++++++++------- 4 files changed, 29 insertions(+), 7 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9f9dffaaa399f..f2e46ea79d575 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3396,6 +3396,7 @@ def to_csv( lineterminator: str | None = None, chunksize: int | None = None, date_format: str | None = None, + fast_strftime: bool = True, doublequote: bool_t = True, escapechar: str | None = None, decimal: str = ".", @@ -3486,6 +3487,12 @@ def to_csv( Rows to write at a time. date_format : str, default None Format string for datetime objects. + fast_strftime : bool, default True + If `True` (default) and the format permits it, a faster formatting + method will be used. See `convert_strftime_format`. + + .. versionadded:: 1.5.0 + doublequote : bool, default True Control quoting of `quotechar` inside a field. 
escapechar : str, default None @@ -3568,6 +3575,7 @@ def to_csv( chunksize=chunksize, quotechar=quotechar, date_format=date_format, + fast_strftime=fast_strftime, doublequote=doublequote, escapechar=escapechar, storage_options=storage_options, diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index cfbd2d9c9c8da..c447060aad18c 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -63,6 +63,7 @@ def __init__( chunksize: int | None = None, quotechar: str | None = '"', date_format: str | None = None, + fast_strftime: bool = True, doublequote: bool = True, escapechar: str | None = None, storage_options: StorageOptions = None, @@ -86,6 +87,7 @@ def __init__( self.escapechar = escapechar self.lineterminator = lineterminator or os.linesep self.date_format = date_format + self.fast_strftime = fast_strftime self.cols = self._initialize_columns(cols) self.chunksize = self._initialize_chunksize(chunksize) @@ -172,6 +174,7 @@ def _number_format(self) -> dict[str, Any]: "na_rep": self.na_rep, "float_format": self.float_format, "date_format": self.date_format, + "fast_strftime": self.fast_strftime, "quoting": self.quoting, "decimal": self.decimal, } diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index eb10994fb1db6..425367d10775f 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1146,6 +1146,7 @@ def to_csv( lineterminator: str | None = None, chunksize: int | None = None, date_format: str | None = None, + fast_strftime: bool = True, doublequote: bool = True, escapechar: str | None = None, errors: str = "strict", @@ -1176,6 +1177,7 @@ def to_csv( chunksize=chunksize, quotechar=quotechar, date_format=date_format, + fast_strftime=fast_strftime, doublequote=doublequote, escapechar=escapechar, storage_options=storage_options, @@ -1836,6 +1838,7 @@ def get_format_datetime64_from_values( ido = is_dates_only(values) if ido: + # only dates and no timezone: provide a default format return date_format 
or "%Y-%m-%d" return date_format diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index b7874d51b6f33..8f20806bf85b7 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -993,13 +993,16 @@ def test_to_csv_compression(self, df, encoding, compression): with tm.decompress_file(filename, compression) as fh: tm.assert_frame_equal(df, read_csv(fh, index_col=0, encoding=encoding)) - def test_to_csv_date_format(self, datetime_frame): + @pytest.mark.parametrize("fast_strftime", (True, False)) + def test_to_csv_date_format(self, datetime_frame, fast_strftime): with tm.ensure_clean("__tmp_to_csv_date_format__") as path: dt_index = datetime_frame.index datetime_frame = DataFrame( {"A": dt_index, "B": dt_index.shift(1)}, index=dt_index ) - datetime_frame.to_csv(path, date_format="%Y%m%d") + datetime_frame.to_csv( + path, date_format="%Y%m%d", fast_strftime=fast_strftime + ) # Check that the data was put in the specified format test = read_csv(path, index_col=0) @@ -1013,7 +1016,9 @@ def test_to_csv_date_format(self, datetime_frame): tm.assert_frame_equal(test, datetime_frame_int) - datetime_frame.to_csv(path, date_format="%Y-%m-%d") + datetime_frame.to_csv( + path, date_format="%Y-%m-%d", fast_strftime=fast_strftime + ) # Check that the data was put in the specified format test = read_csv(path, index_col=0) @@ -1028,7 +1033,9 @@ def test_to_csv_date_format(self, datetime_frame): # Check that columns get converted datetime_frame_columns = datetime_frame.T - datetime_frame_columns.to_csv(path, date_format="%Y%m%d") + datetime_frame_columns.to_csv( + path, date_format="%Y%m%d", fast_strftime=fast_strftime + ) test = read_csv(path, index_col=0) @@ -1047,14 +1054,15 @@ def test_to_csv_date_format(self, datetime_frame): ["NaT"] * 10 + ["2000-01-01", "1/1/2000", "1-1-2000"] ) nat_frame = DataFrame({"A": nat_index}, index=nat_index) - nat_frame.to_csv(path, date_format="%Y-%m-%d") + 
nat_frame.to_csv(path, date_format="%Y-%m-%d", fast_strftime=fast_strftime) test = read_csv(path, parse_dates=[0, 1], index_col=0) tm.assert_frame_equal(test, nat_frame) + @pytest.mark.parametrize("fast_strftime", (True, False)) @pytest.mark.parametrize("td", [pd.Timedelta(0), pd.Timedelta("10s")]) - def test_to_csv_with_dst_transitions(self, td): + def test_to_csv_with_dst_transitions(self, td, fast_strftime): with tm.ensure_clean("csv_date_format_with_dst") as path: # make sure we are not failing on transitions @@ -1069,7 +1077,7 @@ def test_to_csv_with_dst_transitions(self, td): i = i._with_freq(None) # freq is not preserved by read_csv time_range = np.array(range(len(i)), dtype="int64") df = DataFrame({"A": time_range}, index=i) - df.to_csv(path, index=True) + df.to_csv(path, index=True, fast_strftime=fast_strftime) # we have to reconvert the index as we # don't parse the tz's result = read_csv(path, index_col=0) From 48c5e935ace2cb81208937293fe3586c2fbb4c6c Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 7 Mar 2022 17:57:09 +0100 Subject: [PATCH 33/65] `SQLiteTable` now converts time objects faster thanks to string formatting instead of `strftime` --- pandas/io/sql.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index e004e9c1ecbcc..9518b2052d6ee 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -1833,7 +1833,17 @@ def __init__(self, *args, **kwargs): # this will transform time(12,34,56,789) into '12:34:56.000789' # (this is what sqlalchemy does) - sqlite3.register_adapter(time, lambda _: _.strftime("%H:%M:%S.%f")) + # sqlite3.register_adapter(time, lambda _: _.strftime("%H:%M:%S.%f")) + def _adapt(dt): + if dt.tzinfo is None: + # This is faster than strftime + return "%02d:%02d:%02d.%06d" % ( + dt.hour, dt.minute, dt.second, dt.microsecond + ) + else: + return dt.strftime("%H:%M:%S.%f") + + sqlite3.register_adapter(time, _adapt) super().__init__(*args, **kwargs) def 
sql_schema(self): From 62add6e9ccc95baf486fda54a5a2cb9abadcf25f Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 7 Mar 2022 18:10:34 +0100 Subject: [PATCH 34/65] Fixed `_format_datetime64_dateonly` so that it now relies on `Timestamp.fast_strftime`. Added a few comments on speed of some implementations. --- pandas/io/formats/format.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 425367d10775f..5667063f90044 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1767,6 +1767,8 @@ def _format_datetime64(x: NaTType | Timestamp, nat_rep: str = "NaT") -> str: if x is NaT: return nat_rep + # Note: this seems to rely on datetime.datetime.__str__ = isoformat + # so it already uses string templating rather than strftime (faster). return str(x) @@ -1782,15 +1784,7 @@ def _format_datetime64_dateonly( if date_format: if str_date_fmt: # Faster, using string formatting - return str_date_fmt % dict( - year=x.year, - month=x.month, - day=x.day, - hour=x.hour, - min=x.min, - sec=x.sec, - us=x.us, - ) + return x.fast_strftime(str_date_fmt) else: # Slower return x.strftime(date_format) @@ -1798,6 +1792,8 @@ def _format_datetime64_dateonly( # error: Item "NaTType" of "Union[NaTType, Any]" has no attribute "_date_repr" # The underlying problem here is that mypy doesn't understand that NaT # is a singleton, so that the check above excludes it here. 
+ # + # Note: this relies on string templating (faster than strftime) return x._date_repr # type: ignore[union-attr] @@ -1821,9 +1817,11 @@ def get_format_datetime64( pass return lambda x: _format_datetime64_dateonly( - x, nat_rep=nat_rep, date_format=date_format, str_date_fmt=str_date_fmt + x, nat_rep=nat_rep, date_format=date_format, + str_date_fmt=str_date_fmt ) else: + # Relies on datetime.str, which is fast already return lambda x: _format_datetime64(x, nat_rep=nat_rep) @@ -1869,6 +1867,7 @@ def __init__( self.box = box def _format_strings(self) -> list[str]: + # Note: `get_format_timedelta64` uses fast formatting formatter = self.formatter or get_format_timedelta64( self.values, nat_rep=self.nat_rep, box=self.box ) @@ -1912,6 +1911,8 @@ def _formatter(x): if not isinstance(x, Timedelta): x = Timedelta(x) + + # Note: this does not use strftime but string formatting (faster) result = x._repr_base(format=format) if box: result = f"'{result}'" From d92a1328021236fcb12261c692612c522d04a018 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 10 Mar 2022 12:20:35 +0100 Subject: [PATCH 35/65] Added `AM_LOCAL` and `PM_LOCAL` constants fed at initial import time using `strftime`, and reused in fast formatting ops. New function `get_local_ampm` gives an access for inspection. 
--- pandas/_libs/tslib.pyx | 7 ++++++- pandas/_libs/tslibs/__init__.py | 2 ++ pandas/_libs/tslibs/base.pxd | 4 ++++ pandas/_libs/tslibs/base.pyi | 4 ++++ pandas/_libs/tslibs/base.pyx | 18 +++++++++++++++++- pandas/_libs/tslibs/period.pyx | 9 ++++++++- pandas/_libs/tslibs/timestamps.pyx | 10 ++++++++-- pandas/core/tools/datetimes.py | 7 ++++++- 8 files changed, 55 insertions(+), 6 deletions(-) create mode 100644 pandas/_libs/tslibs/base.pyi diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 40e1238f85051..fc0511d0de211 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -46,6 +46,10 @@ from pandas._libs.util cimport ( from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas._libs.tslibs.parsing import parse_datetime_string +from pandas._libs.tslibs.base cimport ( + AM_LOCAL, + PM_LOCAL, +) from pandas._libs.tslibs.conversion cimport ( _TSObject, cast_from_unit, @@ -127,6 +131,7 @@ def format_array_from_datetime( ------- np.ndarray[object] """ + global AM_LOCAL, PM_LOCAL cdef: int64_t val, ns, y, h, N = len(values) ndarray[int64_t] consider_values @@ -212,7 +217,7 @@ def format_array_from_datetime( "day": dts.day, "hour": dts.hour, "hour12": 12 if h in (0, 12) else (h % 12), - "ampm": "PM" if (h // 12) else "AM", + "ampm": PM_LOCAL if (h // 12) else AM_LOCAL, "min": dts.min, "sec": dts.sec, "us": dts.us, diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index 80d7a9de19735..a56d08639bd89 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -1,5 +1,6 @@ __all__ = [ "dtypes", + "get_local_ampm", "localize_pydatetime", "NaT", "NaTType", @@ -29,6 +30,7 @@ ] from pandas._libs.tslibs import dtypes +from pandas._libs.tslibs.base import get_local_ampm from pandas._libs.tslibs.conversion import ( OutOfBoundsTimedelta, localize_pydatetime, diff --git a/pandas/_libs/tslibs/base.pxd b/pandas/_libs/tslibs/base.pxd index 3bffff7aca43e..e93a18d02e854 100644 --- 
a/pandas/_libs/tslibs/base.pxd +++ b/pandas/_libs/tslibs/base.pxd @@ -3,3 +3,7 @@ from cpython.datetime cimport datetime cdef class ABCTimestamp(datetime): pass + + +cdef bytes AM_LOCAL +cdef bytes PM_LOCAL diff --git a/pandas/_libs/tslibs/base.pyi b/pandas/_libs/tslibs/base.pyi new file mode 100644 index 0000000000000..164bc9f747141 --- /dev/null +++ b/pandas/_libs/tslibs/base.pyi @@ -0,0 +1,4 @@ +from typing import Tuple + + +def get_local_ampm() -> Tuple[bytes, bytes]: ... diff --git a/pandas/_libs/tslibs/base.pyx b/pandas/_libs/tslibs/base.pyx index 1677a8b0be1ec..07342e1db0ccf 100644 --- a/pandas/_libs/tslibs/base.pyx +++ b/pandas/_libs/tslibs/base.pyx @@ -5,8 +5,24 @@ in order to allow for fast isinstance checks without circular dependency issues. This is analogous to core.dtypes.generic. """ -from cpython.datetime cimport datetime +from cpython.datetime cimport datetime, time cdef class ABCTimestamp(datetime): pass + + +cdef: + # Use strftime to get the locale-specific version of am and pm + # we will use these when formatting datetime as string + bytes AM_LOCAL = time(1).strftime("%p").encode("utf-8") + bytes PM_LOCAL = time(13).strftime("%p").encode("utf-8") + + +def get_local_ampm(): + """Return the utf-8 encoded am and pm strings used in the current locale + + Note that the considered locale is the one active at module import time. 
+ """ + global AM_LOCAL, PM_LOCAL + return AM_LOCAL, PM_LOCAL diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index bb1316d57d380..6a2d2113a8440 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -61,6 +61,11 @@ cdef extern from "src/datetime/np_datetime.h": cimport pandas._libs.tslibs.util as util +from pandas._libs.tslibs.base cimport ( + AM_LOCAL, + PM_LOCAL, +) + from pandas._libs.tslibs.timedeltas import Timedelta from pandas._libs.tslibs.timestamps import Timestamp @@ -2410,6 +2415,8 @@ cdef class _Period(PeriodMixin): >>> a.fast_strftime(fast_fmt) '2006-Q1' """ + global AM_LOCAL, PM_LOCAL + freq = self._dtype._dtype_code value = self.ordinal @@ -2436,7 +2443,7 @@ cdef class _Period(PeriodMixin): "day": dts.day, "hour": h, "hour12": 12 if h in (0, 12) else (h % 12), - "ampm": "PM" if (h // 12) else "AM", + "ampm": PM_LOCAL if (h // 12) else AM_LOCAL, "min": dts.min, "sec": dts.sec, "ms": dts.us // 1000, diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index d186ed7287bdb..6a45bd4c7e622 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -46,7 +46,11 @@ from cpython.object cimport ( PyDateTime_IMPORT from pandas._libs.tslibs cimport ccalendar -from pandas._libs.tslibs.base cimport ABCTimestamp +from pandas._libs.tslibs.base cimport ( + ABCTimestamp, + AM_LOCAL, + PM_LOCAL, +) from pandas._libs.tslibs.conversion cimport ( _TSObject, convert_datetime_to_tsobject, @@ -1210,6 +1214,8 @@ class Timestamp(_Timestamp): >>> ts.fast_strftime(fmt) '2020-03-14T15:32:52' """ + global AM_LOCAL, PM_LOCAL + y = self.year h = self.hour return fmt_str % { @@ -1219,7 +1225,7 @@ class Timestamp(_Timestamp): "day": self.day, "hour": h, "hour12": 12 if h in (0, 12) else (h % 12), - "ampm": "PM" if (h // 12) else "AM", + "ampm": PM_LOCAL if (h // 12) else AM_LOCAL, "min": self.minute, "sec": self.second, "us": self.microsecond, diff --git 
a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index de747fe7cd44d..6d169e271eacd 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -29,6 +29,7 @@ nat_strings, parsing, timezones, + get_local_ampm ) from pandas._libs.tslibs.parsing import ( # noqa:F401 DateParseError, @@ -1272,6 +1273,10 @@ def to_time(arg, format=None, infer_time_format=False, errors="raise"): return to_time(arg, format, infer_time_format, errors) +AM_LOCAL, PM_LOCAL = get_local_ampm() +"""Get the locale-specific versions of am and pm strings.""" + + def fast_strftime( dt: datetime, fmt: str, @@ -1322,7 +1327,7 @@ def fast_strftime( "day": dt.day, "hour": h, "hour12": 12 if h in (0, 12) else (h % 12), - "ampm": "PM" if (h // 12) else "AM", + "ampm": PM_LOCAL if (h // 12) else AM_LOCAL, "min": dt.minute, "sec": dt.second, "us": dt.microsecond, From f797a6f3a1274ac5ed58727505afe8be514f9995 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 10 Mar 2022 12:24:39 +0100 Subject: [PATCH 36/65] Accelerated ODS writer when dates need to be written: at least one of the strftime operations was removed --- pandas/io/excel/_odswriter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py index 94f173c1469e0..5b186dd063b2e 100644 --- a/pandas/io/excel/_odswriter.py +++ b/pandas/io/excel/_odswriter.py @@ -196,7 +196,8 @@ def _make_table_cell(self, cell) -> tuple[object, Any]: TableCell(valuetype="date", datevalue=value, attributes=attributes), ) elif isinstance(val, datetime.date): - value = val.strftime("%Y-%m-%d") + # value = val.strftime("%Y-%m-%d") (slower) + value = f"{val.year}-{val.month:02d}-{val.day:02d}" # faster pvalue = val.strftime("%x") return ( pvalue, From 3c9620f2d42f6e9971512539591cbd6926971388 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 10 Mar 2022 12:48:47 +0100 Subject: [PATCH 37/65] Using `bool_t` instead of `bool` in type hints of 
core/generic.py --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 23bc3368b0f0c..b7f45981dd278 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3387,7 +3387,7 @@ def to_csv( lineterminator: str | None = None, chunksize: int | None = None, date_format: str | None = None, - fast_strftime: bool = True, + fast_strftime: bool_t = True, doublequote: bool_t = True, escapechar: str | None = None, decimal: str = ".", From 7b378120b4055473f9be282e8106fd6eeee3757f Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 10 Mar 2022 12:49:29 +0100 Subject: [PATCH 38/65] isort --- pandas/_libs/tslibs/base.pyx | 5 ++++- pandas/_libs/tslibs/period.pyx | 1 - pandas/_libs/tslibs/timestamps.pyx | 2 +- pandas/core/tools/datetimes.py | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/tslibs/base.pyx b/pandas/_libs/tslibs/base.pyx index 07342e1db0ccf..7b49d02b08760 100644 --- a/pandas/_libs/tslibs/base.pyx +++ b/pandas/_libs/tslibs/base.pyx @@ -5,7 +5,10 @@ in order to allow for fast isinstance checks without circular dependency issues. This is analogous to core.dtypes.generic. 
""" -from cpython.datetime cimport datetime, time +from cpython.datetime cimport ( + datetime, + time, +) cdef class ABCTimestamp(datetime): diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 163bf1cbf9838..a8931d671da0b 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -60,7 +60,6 @@ cdef extern from "src/datetime/np_datetime.h": npy_datetimestruct *d) nogil cimport pandas._libs.tslibs.util as util - from pandas._libs.tslibs.base cimport ( AM_LOCAL, PM_LOCAL, diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 6a45bd4c7e622..67d838945d819 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -47,9 +47,9 @@ PyDateTime_IMPORT from pandas._libs.tslibs cimport ccalendar from pandas._libs.tslibs.base cimport ( - ABCTimestamp, AM_LOCAL, PM_LOCAL, + ABCTimestamp, ) from pandas._libs.tslibs.conversion cimport ( _TSObject, diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index dcd3b84640f7e..d30e5bb70f036 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -25,11 +25,11 @@ Timestamp, conversion, convert_strftime_format, + get_local_ampm, iNaT, nat_strings, parsing, timezones, - get_local_ampm ) from pandas._libs.tslibs.parsing import ( # noqa:F401 DateParseError, From 5c1854db68bb6ba1fbe44f16914ddb0847063520 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 10 Mar 2022 12:53:55 +0100 Subject: [PATCH 39/65] flake8 and black --- pandas/_libs/tslibs/base.pyi | 1 - pandas/_libs/tslibs/strftime.py | 27 ++++++++++++++++----------- pandas/io/formats/format.py | 3 +-- pandas/io/sql.py | 5 ++++- 4 files changed, 21 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/tslibs/base.pyi b/pandas/_libs/tslibs/base.pyi index 164bc9f747141..1c38be34ee408 100644 --- a/pandas/_libs/tslibs/base.pyi +++ b/pandas/_libs/tslibs/base.pyi @@ -1,4 +1,3 @@ from typing import Tuple - def 
get_local_ampm() -> Tuple[bytes, bytes]: ... diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index c3b49af694e8a..8171da0c7279c 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -95,33 +95,36 @@ def convert_strftime_format( Parameters ---------- strftime_fmt : str - The strftime format string specification, e.g. `"%Y-%m-%d %H:%M:%S"`. Note - that not all directives are eligible to successful usage of string formatting. - Unsupported directives will lead to an `UnsupportedStrFmtDirective` being raised. + The strftime format string specification, e.g. `"%Y-%m-%d %H:%M:%S"`. + Note that not all directives are eligible to successful usage of string + formatting. Unsupported directives will lead to an + `UnsupportedStrFmtDirective` being raised. target : { "datetime", "date", "time", "period" }, default: "datetime" The kind of data that will be formatted using this template. new_style_fmt : bool, default: False Whether the output string should be new-style e.g. "{year}-{month:02d}-{day:02d} {hour:02d}:{min:02d}:{sec:02d}" - or old-style e.g. "%(year)s-%(month)02d-%(day)02d %(hour)02d:%(min)02d:%(sec)02d" + or old-style + e.g. "%(year)s-%(month)02d-%(day)02d %(hour)02d:%(min)02d:%(sec)02d" Returns ------- fmt_out : str - A string that may be used to format a `datetime` variable. The style of this string - is either old-style or new-style depending on `new_style_formatting`. + A string that may be used to format a `datetime` variable. The style of + this string is either old-style or new-style depending on + `new_style_formatting`. For old-style, it may be used as `fmt_out % fmt_dct`. For new-style, it may be used as `fmt_out.format(**fmt_dct)` Raises ------ UnsupportedStrFmtDirective - Raised when the received `strftime_fmt` format contains a directive for which the output - can not currently be created using string formatting. 
+ Raised when the received `strftime_fmt` format contains a directive for + which the output can not currently be created using string formatting. See Also -------- - `strftime format codes reference `_ + `strftime format codes reference `_ # noqa `Stackoverflow post `_ explaining how old-style formatting is faster than new-style formatting, @@ -157,7 +160,7 @@ def convert_strftime_format( # Create the output by replacing all directives for _map in directive_maps: for key, (_name, _fmt) in _map.items(): - # for example replace "%d" by "{day:02d}" but with escaped { and } + # for example replace "%d" by "{day:02d}" but with escaped { } strftime_fmt = strftime_fmt.replace( key, f"{esc_l}{_name}:{_fmt}{esc_r}" ) @@ -181,7 +184,9 @@ def convert_strftime_format( for _map in directive_maps: for key, (_name, _fmt) in _map.items(): # for example replace "%d" by "%(day)02d" but with escaped % - strftime_fmt = strftime_fmt.replace(key, f"{esc}({_name}){_fmt}") + strftime_fmt = strftime_fmt.replace( + key, f"{esc}({_name}){_fmt}" + ) # Escape remaining percent signs strftime_fmt = strftime_fmt.replace("%", "%%") diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 1359fd7ed8597..0aaec93a288fb 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1817,8 +1817,7 @@ def get_format_datetime64( pass return lambda x: _format_datetime64_dateonly( - x, nat_rep=nat_rep, date_format=date_format, - str_date_fmt=str_date_fmt + x, nat_rep=nat_rep, date_format=date_format, str_date_fmt=str_date_fmt ) else: # Relies on datetime.str, which is fast already diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 9518b2052d6ee..f4091a241f817 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -1838,7 +1838,10 @@ def _adapt(dt): if dt.tzinfo is None: # This is faster than strftime return "%02d:%02d:%02d.%06d" % ( - dt.hour, dt.minute, dt.second, dt.microsecond + dt.hour, + dt.minute, + dt.second, + dt.microsecond, ) else: return 
dt.strftime("%H:%M:%S.%f") From 17255b98576d22ab336cd1e3c1bdbcc1f43f8120 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 10 Mar 2022 12:58:56 +0100 Subject: [PATCH 40/65] Docstring fix --- pandas/core/arrays/datetimelike.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index c580cffa87915..fb14722dbf18c 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1601,8 +1601,8 @@ def strftime( date_format : str Date format string (e.g. "%%Y-%%m-%%d"). - fast_strftime : boolean, default: True - if `True` (default) and the format permits it, a faster formatting + fast_strftime : bool, default True + If `True` (default) and the format permits it, a faster formatting method will be used. See `convert_strftime_format`. .. versionadded:: 1.5.0 From 12b9041b0159a5cc17f34d350cd2dfc21ea00bdc Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 10 Mar 2022 13:17:36 +0100 Subject: [PATCH 41/65] Fixed test namespace --- pandas/tests/tslibs/test_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/tslibs/test_api.py b/pandas/tests/tslibs/test_api.py index 10446a2e88288..5685a6bdaab34 100644 --- a/pandas/tests/tslibs/test_api.py +++ b/pandas/tests/tslibs/test_api.py @@ -52,6 +52,7 @@ def test_namespace(): "tz_convert_from_utc_single", "to_offset", "tz_compare", + "get_local_ampm", ] expected = set(submodules + api) From 743aad8484ed466b92b57b9da828afa128bb9774 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 10 Mar 2022 13:19:44 +0100 Subject: [PATCH 42/65] `AM_LOCAL` and `PM_LOCAL` are now `str` - this should fix the issue on arm and win32 platforms where cython does not seem to intelligently fix the string formatting incompatibility. 
--- pandas/_libs/tslibs/base.pxd | 4 ++-- pandas/_libs/tslibs/base.pyx | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/tslibs/base.pxd b/pandas/_libs/tslibs/base.pxd index e93a18d02e854..d251bf9aa04c3 100644 --- a/pandas/_libs/tslibs/base.pxd +++ b/pandas/_libs/tslibs/base.pxd @@ -5,5 +5,5 @@ cdef class ABCTimestamp(datetime): pass -cdef bytes AM_LOCAL -cdef bytes PM_LOCAL +cdef str AM_LOCAL +cdef str PM_LOCAL diff --git a/pandas/_libs/tslibs/base.pyx b/pandas/_libs/tslibs/base.pyx index 7b49d02b08760..5c45a268b30c6 100644 --- a/pandas/_libs/tslibs/base.pyx +++ b/pandas/_libs/tslibs/base.pyx @@ -18,12 +18,12 @@ cdef class ABCTimestamp(datetime): cdef: # Use strftime to get the locale-specific version of am and pm # we will use these when formatting datetime as string - bytes AM_LOCAL = time(1).strftime("%p").encode("utf-8") - bytes PM_LOCAL = time(13).strftime("%p").encode("utf-8") + str AM_LOCAL = time(1).strftime("%p") + str PM_LOCAL = time(13).strftime("%p") def get_local_ampm(): - """Return the utf-8 encoded am and pm strings used in the current locale + """Return the am and pm strings used in the current locale Note that the considered locale is the one active at module import time. 
""" From cef0a69f0f0172f8abcfaa2feac5b2447bab5945 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 10 Mar 2022 13:23:01 +0100 Subject: [PATCH 43/65] Fixed test_format and removed the performance test, as it would fail on ci when parallel cpus/tests are used --- pandas/tests/io/formats/test_format.py | 120 +++++++++---------------- 1 file changed, 42 insertions(+), 78 deletions(-) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 68aa5f126b688..ef87e7056ca2a 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -18,7 +18,7 @@ import pytest import pytz -from pandas._libs.tslibs import convert_strftime_format +from pandas._libs.tslibs import convert_strftime_format, get_local_ampm from pandas.compat import ( IS64, is_platform_windows, @@ -3170,37 +3170,38 @@ def test_str(self): class TestPeriodIndexFormat: - def test_period(self): - p = pd.PeriodIndex([datetime(2003, 1, 1, 12), None], freq='H') + p = pd.PeriodIndex([datetime(2003, 1, 1, 12), None], freq="H") formatted = p.format() assert formatted[0] == "2003-01-01 12:00" # default: minutes not shown assert formatted[1] == "NaT" - p = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, - freq="n") + p = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="n") formatted = p.format() - assert formatted[0] == '2003-01-01 12:01:01.123456789' - assert formatted[1] == '2003-01-01 12:01:01.123456790' + assert formatted[0] == "2003-01-01 12:01:01.123456789" + assert formatted[1] == "2003-01-01 12:01:01.123456790" @pytest.mark.parametrize("fast_strftime", (False, True)) def test_period_custom(self, fast_strftime): # GH46252 p = pd.period_range("2003-01-01 12:01:01.123", periods=2, freq="l") - formatted = p.format(date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", - fast_strftime=fast_strftime) + formatted = p.format( + date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", fast_strftime=fast_strftime + ) assert 
formatted[0] == "03 12:01:01PM (ms=123 us=123000 ns=123000000)" assert formatted[1] == "03 12:01:01PM (ms=124 us=124000 ns=124000000)" p = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="n") - formatted = p.format(date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", - fast_strftime=fast_strftime) + formatted = p.format( + date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", fast_strftime=fast_strftime + ) assert formatted[0] == "03 12:01:01PM (ms=123 us=123456 ns=123456789)" assert formatted[1] == "03 12:01:01PM (ms=123 us=123456 ns=123456790)" p = pd.period_range("2003-01-01 12:01:01.123456", periods=2, freq="u") - formatted = p.format(date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", - fast_strftime=fast_strftime) + formatted = p.format( + date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", fast_strftime=fast_strftime + ) assert formatted[0] == "03 12:01:01PM (ms=123 us=123456 ns=123456000)" assert formatted[1] == "03 12:01:01PM (ms=123 us=123457 ns=123457000)" @@ -3247,8 +3248,8 @@ def test_datetime_tz_custom(self): ) # same with fancy format assert ( - dt.format(date_format="20%y-%m-%d__foo__%I:%M:%S%p")[0] - == "2012-12-31__foo__11:00:00PM" + dt.format(date_format="20%y-%m-%d__foo__%I:%M:%S%p")[0] + == "2012-12-31__foo__11:00:00PM" ) # If tz is currently set as paris, we'll see 2013 @@ -3259,8 +3260,8 @@ def test_datetime_tz_custom(self): ) # same with fancy format assert ( - dt.format(date_format="20%y-%m-%d__foo__%I:%M:%S%p")[0] - == "2013-01-01__foo__12:00:00AM" + dt.format(date_format="20%y-%m-%d__foo__%I:%M:%S%p")[0] + == "2013-01-01__foo__12:00:00AM" ) def test_date(self): @@ -3360,7 +3361,10 @@ class TestDatetimeFastFormatter: ) @pytest.mark.parametrize("new_style", (False, True)) def test_fast_strftime_basic(self, strftime_format, new_style): - """Test that formatting standard `datetime` objects with our utils works as good as strftime.""" + """ + Test that formatting standard `datetime` objects with our utils works + as good as strftime. 
+ """ # create a datetime instance dt = datetime.now() @@ -3368,19 +3372,29 @@ def test_fast_strftime_basic(self, strftime_format, new_style): # strftime strftime_res = dt.strftime(strftime_format) + # Get the locale-specific versions of am and pm strings. + AM_LOCAL, PM_LOCAL = get_local_ampm() + # common dict used for formatting - fmt_dct = dict( - year=dt.year, - month=dt.month, - day=dt.day, - hour=dt.hour, - min=dt.minute, - sec=dt.second, - us=dt.microsecond, - ) + fmt_dct = { + "year": dt.year, + "shortyear": dt.year % 100, + "month": dt.month, + "day": dt.day, + "hour": dt.hour, + "hour12": 12 if dt.hour in (0, 12) else (dt.hour % 12), + "ampm": PM_LOCAL if (dt.hour // 12) else AM_LOCAL, + "min": dt.minute, + "sec": dt.second, + "us": dt.microsecond, + } # get the formatting string - style_format = convert_strftime_format(strftime_format, new_style_fmt=new_style) + # fmt: off + style_format = convert_strftime_format( + strftime_format, new_style_fmt=new_style + ) + # fmt: on # apply it and check the output if new_style: @@ -3393,56 +3407,6 @@ def test_fast_strftime_basic(self, strftime_format, new_style): res2 = fast_strftime(dt, strftime_format, new_style_fmt=new_style) assert res2 == res - @pytest.mark.parametrize( - "strftime_format", - ( - "%Y-%m-%d %H:%M:%S", - "%Y %Y", - "%Y-%m-%dT%H:%M:%S.%fZ", - ), - ) - def test_fast_strftime_perf(self, strftime_format): - """Test that formatting standard `datetime` objects with our utils is faster than strftime.""" - - # create a datetime instance - dt = datetime.now() - - # pre-compute the formats - new_style_format = convert_strftime_format( - strftime_format, new_style_fmt=True - ) - old_style_format = convert_strftime_format(strftime_format) - - # Perf comparison: confirm the results - # from https://stackoverflow.com/a/43495629/7262247 - fmt_dct = dict( - year=dt.year, - month=dt.month, - day=dt.day, - hour=dt.hour, - min=dt.minute, - sec=dt.second, - us=dt.microsecond, - ) - - glob = globals() - 
glob.update(locals()) - strftime_best = min( - Timer("dt.strftime(strftime_format)", globals=glob).repeat(7, 1000) - ) - # Out[3]: 0.0062 - new_style_best = min( - Timer("new_style_format.format(**fmt_dct)", globals=glob) - .repeat(7, 1000) - ) - # Out[4]: 0.0036 - old_style_best = min( - Timer("old_style_format % fmt_dct", globals=glob).repeat(7, 1000) - ) - # Out[5]: 0.0030 - assert new_style_best < strftime_best # much better - assert old_style_best < new_style_best # even better ! - def test_bad_strftime_directive(self): """Test what happens in case of bad `date_format` directive.""" From ff501337801b313d08ceb638502bd6ad2642f3c0 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 10 Mar 2022 14:22:31 +0100 Subject: [PATCH 44/65] flake8 black and isort --- pandas/_libs/tslibs/strftime.py | 4 +--- pandas/tests/io/formats/test_format.py | 8 ++++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index 8171da0c7279c..6411649792ece 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -184,9 +184,7 @@ def convert_strftime_format( for _map in directive_maps: for key, (_name, _fmt) in _map.items(): # for example replace "%d" by "%(day)02d" but with escaped % - strftime_fmt = strftime_fmt.replace( - key, f"{esc}({_name}){_fmt}" - ) + strftime_fmt = strftime_fmt.replace(key, f"{esc}({_name}){_fmt}") # Escape remaining percent signs strftime_fmt = strftime_fmt.replace("%", "%%") diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index ef87e7056ca2a..3d19d6cc6d4ad 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -18,7 +18,10 @@ import pytest import pytz -from pandas._libs.tslibs import convert_strftime_format, get_local_ampm +from pandas._libs.tslibs import ( + convert_strftime_format, + get_local_ampm, +) from pandas.compat import ( IS64, is_platform_windows, @@ 
-3421,7 +3424,8 @@ def test_bad_strftime_directive(self): @pytest.mark.parametrize( "date_format", ( - # note: "%Y-%m-%d %H:%M:%S and "%Y-%m-%d %H:%M:%S.%f are always accelerated (hardcoded) + # note: "%Y-%m-%d %H:%M:%S and "%Y-%m-%d %H:%M:%S.%f + # are always accelerated (hardcoded) "%Y-%m-%d__foo__%H:%M:%S", "%Y %Y", "%Y-%m-%dT%H:%M:%S.%fZ", From 98ae42989177f7e409c01fcf05a683da1486bed3 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 10 Mar 2022 14:25:16 +0100 Subject: [PATCH 45/65] Added missing type hint declaration for the `fast_strftime` methods (mypy) --- pandas/_libs/tslibs/period.pyi | 1 + pandas/_libs/tslibs/timestamps.pyi | 1 + 2 files changed, 2 insertions(+) diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi index 7d6e31f339e29..787ea66f37ed9 100644 --- a/pandas/_libs/tslibs/period.pyi +++ b/pandas/_libs/tslibs/period.pyi @@ -76,6 +76,7 @@ class Period: def _from_ordinal(cls, ordinal: int, freq) -> Period: ... @classmethod def now(cls, freq: BaseOffset = ...) -> Period: ... + def fast_strftime(self, fmt_str: str) -> str: ... def strftime(self, fmt: str) -> str: ... def to_timestamp( self, diff --git a/pandas/_libs/tslibs/timestamps.pyi b/pandas/_libs/tslibs/timestamps.pyi index 7a33be1793797..bd1206fe8ce28 100644 --- a/pandas/_libs/tslibs/timestamps.pyi +++ b/pandas/_libs/tslibs/timestamps.pyi @@ -104,6 +104,7 @@ class Timestamp(datetime): def combine(cls, date: _date, time: _time) -> datetime: ... # type: ignore[override] @classmethod def fromisoformat(cls: type[_DatetimeT], date_string: str) -> _DatetimeT: ... + def fast_strftime(self, fmt_str: str) -> str: ... def strftime(self, format: str) -> str: ... def __format__(self, fmt: str) -> str: ... def toordinal(self) -> int: ... 
From 64fa9de29d76d0f3687b2a1aebaba48039eb0d8b Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 10 Mar 2022 14:33:28 +0100 Subject: [PATCH 46/65] Now catching the warnings about time zone being dropped in test during dt to period conversion --- pandas/tests/io/formats/test_format.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 3d19d6cc6d4ad..80dda7a7b4e0a 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3216,12 +3216,18 @@ def test_period_tz(self): # Converting to a period looses the timezone information # Since tz is currently set as utc, we'll see 2012 - p = dt.to_period(freq="H") + with pytest.warns(UserWarning) as record: + p = dt.to_period(freq="H") + assert len(record) == 1 + assert "will drop timezone information" in record[0].message.args[0] assert p.format()[0] == "2012-12-31 23:00" # If tz is currently set as paris before conversion, we'll see 2013 dt = dt.tz_convert("Europe/Paris") - p = dt.to_period(freq="H") + with pytest.warns(UserWarning) as record: + p = dt.to_period(freq="H") + assert len(record) == 1 + assert "will drop timezone information" in record[0].message.args[0] assert p.format()[0] == "2013-01-01 00:00" From c200ef180f3a2d099f19892384159c79ecff9bea Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 10 Mar 2022 15:20:09 +0100 Subject: [PATCH 47/65] Fixed locale-related issue: AM_LOCAL and PM_LOCAL were created at compile time so they were not using the right (runtime) locale. Added a test. 
--- pandas/_libs/tslib.pyx | 9 +++++---- pandas/_libs/tslibs/base.pxd | 4 ---- pandas/_libs/tslibs/base.pyi | 2 +- pandas/_libs/tslibs/base.pyx | 19 ++++++++----------- pandas/_libs/tslibs/period.pyx | 9 +++++---- pandas/_libs/tslibs/timestamps.pyx | 9 ++++----- pandas/tests/io/formats/test_format.py | 15 ++++++++++++++- 7 files changed, 37 insertions(+), 30 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index fc0511d0de211..09bfcabd9cb3a 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -46,10 +46,7 @@ from pandas._libs.util cimport ( from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas._libs.tslibs.parsing import parse_datetime_string -from pandas._libs.tslibs.base cimport ( - AM_LOCAL, - PM_LOCAL, -) +from pandas._libs.tslibs.base import get_local_ampm from pandas._libs.tslibs.conversion cimport ( _TSObject, cast_from_unit, @@ -76,6 +73,10 @@ from pandas._libs.missing cimport checknull_with_nat_and_na from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single +AM_LOCAL, PM_LOCAL = get_local_ampm() +"""Get the locale-specific versions of am and pm strings.""" + + def _test_parse_iso8601(ts: str): """ TESTING ONLY: Parse string into Timestamp using iso8601 parser. Used diff --git a/pandas/_libs/tslibs/base.pxd b/pandas/_libs/tslibs/base.pxd index d251bf9aa04c3..3bffff7aca43e 100644 --- a/pandas/_libs/tslibs/base.pxd +++ b/pandas/_libs/tslibs/base.pxd @@ -3,7 +3,3 @@ from cpython.datetime cimport datetime cdef class ABCTimestamp(datetime): pass - - -cdef str AM_LOCAL -cdef str PM_LOCAL diff --git a/pandas/_libs/tslibs/base.pyi b/pandas/_libs/tslibs/base.pyi index 1c38be34ee408..d33c416d78dfa 100644 --- a/pandas/_libs/tslibs/base.pyi +++ b/pandas/_libs/tslibs/base.pyi @@ -1,3 +1,3 @@ from typing import Tuple -def get_local_ampm() -> Tuple[bytes, bytes]: ... +def get_local_ampm() -> Tuple[str, str]: ... 
diff --git a/pandas/_libs/tslibs/base.pyx b/pandas/_libs/tslibs/base.pyx index 5c45a268b30c6..5c2cc213d71ea 100644 --- a/pandas/_libs/tslibs/base.pyx +++ b/pandas/_libs/tslibs/base.pyx @@ -15,17 +15,14 @@ cdef class ABCTimestamp(datetime): pass -cdef: - # Use strftime to get the locale-specific version of am and pm - # we will use these when formatting datetime as string - str AM_LOCAL = time(1).strftime("%p") - str PM_LOCAL = time(13).strftime("%p") - - def get_local_ampm(): - """Return the am and pm strings used in the current locale + """Return the am and pm strings used in the current locale. + + We will use these when formatting datetime as string using string templates, which + is faster than strftime when executed on arrays. The strftime directive where we + need these is `%p`. - Note that the considered locale is the one active at module import time. + Note that the considered locale is the one active when this function is called + (not at cython compilation time). """ - global AM_LOCAL, PM_LOCAL - return AM_LOCAL, PM_LOCAL + return time(1).strftime("%p"), time(13).strftime("%p") diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index a8931d671da0b..acccf907924cd 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -60,10 +60,7 @@ cdef extern from "src/datetime/np_datetime.h": npy_datetimestruct *d) nogil cimport pandas._libs.tslibs.util as util -from pandas._libs.tslibs.base cimport ( - AM_LOCAL, - PM_LOCAL, -) +from pandas._libs.tslibs.base import get_local_ampm from pandas._libs.tslibs.timedeltas import Timedelta from pandas._libs.tslibs.timestamps import Timestamp @@ -125,6 +122,10 @@ cdef: INT32_MIN = -2_147_483_648LL +AM_LOCAL, PM_LOCAL = get_local_ampm() +"""Get the locale-specific versions of am and pm strings.""" + + ctypedef struct asfreq_info: int64_t intraday_conversion_factor int is_end diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 
67d838945d819..f726dc442d7e1 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -46,11 +46,8 @@ from cpython.object cimport ( PyDateTime_IMPORT from pandas._libs.tslibs cimport ccalendar -from pandas._libs.tslibs.base cimport ( - AM_LOCAL, - PM_LOCAL, - ABCTimestamp, -) +from pandas._libs.tslibs.base import get_local_ampm +from pandas._libs.tslibs.base cimport ABCTimestamp from pandas._libs.tslibs.conversion cimport ( _TSObject, convert_datetime_to_tsobject, @@ -114,6 +111,8 @@ from pandas._libs.tslibs.tzconversion cimport ( # Constants _zero_time = time(0, 0) _no_input = object() +AM_LOCAL, PM_LOCAL = get_local_ampm() +"""Get the locale-specific versions of am and pm strings.""" # ---------------------------------------------------------------------- diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 80dda7a7b4e0a..c0764fccc9ecc 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -1,7 +1,10 @@ """ Test output formatting for Series/DataFrame, including to_string & reprs """ -from datetime import datetime +from datetime import ( + datetime, + time, +) from io import StringIO import itertools from operator import methodcaller @@ -3184,6 +3187,16 @@ def test_period(self): assert formatted[0] == "2003-01-01 12:01:01.123456789" assert formatted[1] == "2003-01-01 12:01:01.123456790" + def test_period_locale(self): + """Test that `get_local_ampm` relies on runtime locale, not compile-time one + + If this test fails, all tests using %p format strftime will fail when + the runtime locale is different from the compile-time one. 
+ """ + AM_LOCAL, PM_LOCAL = get_local_ampm() + assert AM_LOCAL == time(1).strftime("%p") + assert PM_LOCAL == time(13).strftime("%p") + @pytest.mark.parametrize("fast_strftime", (False, True)) def test_period_custom(self, fast_strftime): # GH46252 From cea3e0739c9e8afda91d6550042b6123962eae2e Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 10 Mar 2022 15:23:57 +0100 Subject: [PATCH 48/65] Replaced `pytest.warns` with pandas `assert_produces_warning` --- pandas/tests/io/formats/test_format.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index c0764fccc9ecc..a8a3087956da3 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3229,18 +3229,14 @@ def test_period_tz(self): # Converting to a period looses the timezone information # Since tz is currently set as utc, we'll see 2012 - with pytest.warns(UserWarning) as record: + with tm.assert_produces_warning(UserWarning, match="will drop timezone"): p = dt.to_period(freq="H") - assert len(record) == 1 - assert "will drop timezone information" in record[0].message.args[0] assert p.format()[0] == "2012-12-31 23:00" # If tz is currently set as paris before conversion, we'll see 2013 dt = dt.tz_convert("Europe/Paris") - with pytest.warns(UserWarning) as record: + with tm.assert_produces_warning(UserWarning, match="will drop timezone"): p = dt.to_period(freq="H") - assert len(record) == 1 - assert "will drop timezone information" in record[0].message.args[0] assert p.format()[0] == "2013-01-01 00:00" From ca3a6ccecad8523f30138093aa7abbb5ea7b7f18 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 10 Mar 2022 16:46:53 +0100 Subject: [PATCH 49/65] Fixed `CSVFormatter` (used in `to_csv`) so that `DatetimeIndex` or `PeriodIndex` are now formatted using the same routine than columns of the same type. 
In particular, `strftime` is now avoided whenever possible, which makes the export faster. Added two tests. --- pandas/io/formats/csvs.py | 9 +--- pandas/tests/io/formats/test_to_csv.py | 63 ++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 8 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index c447060aad18c..0e04a1b156daf 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -182,14 +182,7 @@ def _number_format(self) -> dict[str, Any]: @cache_readonly def data_index(self) -> Index: data_index = self.obj.index - if ( - isinstance(data_index, (ABCDatetimeIndex, ABCPeriodIndex)) - and self.date_format is not None - ): - data_index = Index( - [x.strftime(self.date_format) if notna(x) else "" for x in data_index] - ) - elif isinstance(data_index, ABCMultiIndex): + if isinstance(data_index, ABCMultiIndex): data_index = data_index.remove_unused_levels() return data_index diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 70fa071fe8fb6..8b69baad18c52 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -284,6 +284,69 @@ def test_to_csv_date_format(self): df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"]) assert df_sec_grouped.mean().to_csv(date_format="%Y-%m-%d") == expected_ymd_sec + def test_to_csv_datetime_format_index(self): + """Test that formatting also works for datetime index""" + df_sec = DataFrame({"A": pd.date_range("20130101", periods=5, freq="s")}) + df_sec = df_sec.set_index("A") + + # default date_format + res = df_sec.to_csv() + expected_rows = [ + "A", + "2013-01-01 00:00:00", + "2013-01-01 00:00:01", + "2013-01-01 00:00:02", + "2013-01-01 00:00:03", + "2013-01-01 00:00:04", + ] + expected_default_sec = tm.convert_rows_list_to_csv_str(expected_rows) + assert res == expected_default_sec + + # custom date_format + res = df_sec.to_csv(date_format="%Y-%m-%d %H:%M:%S.%f") + expected_rows
= [ + "A", + "2013-01-01 00:00:00.000000", + "2013-01-01 00:00:01.000000", + "2013-01-01 00:00:02.000000", + "2013-01-01 00:00:03.000000", + "2013-01-01 00:00:04.000000", + ] + expected_default_sec = tm.convert_rows_list_to_csv_str(expected_rows) + assert res == expected_default_sec + + def test_to_csv_period_format_index(self): + """Test that formatting also works for period index""" + # same for periods + df_month = DataFrame({"A": pd.period_range("20130101", periods=5, freq="m")}) + df_month = df_month.set_index("A") + + # default date_format + res = df_month.to_csv() + expected_rows = [ + "A", + "2013-01", + "2013-02", + "2013-03", + "2013-04", + "2013-05", + ] + expected_default_mon = tm.convert_rows_list_to_csv_str(expected_rows) + assert res == expected_default_mon + + # custom format + res = df_month.to_csv(date_format="%F : %q") + expected_rows = [ + "A", + "2013 : 1", + "2013 : 1", + "2013 : 1", + "2013 : 2", + "2013 : 2", + ] + expected_ymdhms_month = tm.convert_rows_list_to_csv_str(expected_rows) + assert res == expected_ymdhms_month + def test_to_csv_different_datetime_formats(self): # GH#21734 df = DataFrame( From 53cefd0e30e6e19f4738b10ad0b8058ef9b3bcf2 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 10 Mar 2022 17:32:02 +0100 Subject: [PATCH 50/65] Fixed locale problem and improved test --- pandas/_libs/tslib.pyx | 7 ++----- pandas/_libs/tslibs/period.pyx | 10 +++------- pandas/_libs/tslibs/timestamps.pyx | 9 +++------ pandas/tests/io/formats/test_format.py | 19 +++++++++++++++++++ 4 files changed, 27 insertions(+), 18 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 09bfcabd9cb3a..9cfc30b9efc7c 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -73,10 +73,6 @@ from pandas._libs.missing cimport checknull_with_nat_and_na from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single -AM_LOCAL, PM_LOCAL = get_local_ampm() -"""Get the locale-specific versions of am and pm strings.""" - - 
def _test_parse_iso8601(ts: str): """ TESTING ONLY: Parse string into Timestamp using iso8601 parser. Used @@ -132,7 +128,6 @@ def format_array_from_datetime( ------- np.ndarray[object] """ - global AM_LOCAL, PM_LOCAL cdef: int64_t val, ns, y, h, N = len(values) ndarray[int64_t] consider_values @@ -141,6 +136,8 @@ def format_array_from_datetime( ndarray[object] result = np.empty(N, dtype=object) object ts, res npy_datetimestruct dts + str AM_LOCAL = Timestamp._AM_LOCAL + str PM_LOCAL = Timestamp._PM_LOCAL if na_rep is None: na_rep = 'NaT' diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index acccf907924cd..8dc94f94d1127 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -122,10 +122,6 @@ cdef: INT32_MIN = -2_147_483_648LL -AM_LOCAL, PM_LOCAL = get_local_ampm() -"""Get the locale-specific versions of am and pm strings.""" - - ctypedef struct asfreq_info: int64_t intraday_conversion_factor int is_end @@ -2415,8 +2411,6 @@ cdef class _Period(PeriodMixin): >>> a.fast_strftime(fast_fmt) '2006-Q1' """ - global AM_LOCAL, PM_LOCAL - freq = self._dtype._dtype_code value = self.ordinal @@ -2443,7 +2437,7 @@ cdef class _Period(PeriodMixin): "day": dts.day, "hour": h, "hour12": 12 if h in (0, 12) else (h % 12), - "ampm": PM_LOCAL if (h // 12) else AM_LOCAL, + "ampm": Period._PM_LOCAL if (h // 12) else Period._AM_LOCAL, "min": dts.min, "sec": dts.sec, "ms": dts.us // 1000, @@ -2626,6 +2620,8 @@ class Period(_Period): Second value of the period. 
""" + _AM_LOCAL, _PM_LOCAL = get_local_ampm() + def __new__(cls, value=None, freq=None, ordinal=None, year=None, month=None, quarter=None, day=None, hour=None, minute=None, second=None): diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index f726dc442d7e1..2c041d986235b 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -111,9 +111,6 @@ from pandas._libs.tslibs.tzconversion cimport ( # Constants _zero_time = time(0, 0) _no_input = object() -AM_LOCAL, PM_LOCAL = get_local_ampm() -"""Get the locale-specific versions of am and pm strings.""" - # ---------------------------------------------------------------------- @@ -1065,6 +1062,8 @@ class Timestamp(_Timestamp): Timestamp('2017-01-01 12:00:00') """ + _AM_LOCAL, _PM_LOCAL = get_local_ampm() + @classmethod def fromordinal(cls, ordinal, freq=None, tz=None): """ @@ -1213,8 +1212,6 @@ class Timestamp(_Timestamp): >>> ts.fast_strftime(fmt) '2020-03-14T15:32:52' """ - global AM_LOCAL, PM_LOCAL - y = self.year h = self.hour return fmt_str % { @@ -1224,7 +1221,7 @@ class Timestamp(_Timestamp): "day": self.day, "hour": h, "hour12": 12 if h in (0, 12) else (h % 12), - "ampm": PM_LOCAL if (h // 12) else AM_LOCAL, + "ampm": Timestamp._PM_LOCAL if (h // 12) else Timestamp._AM_LOCAL, "min": self.minute, "sec": self.second, "us": self.microsecond, diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index a8a3087956da3..6ae98206d40ac 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -37,6 +37,7 @@ Index, MultiIndex, NaT, + Period, Series, Timestamp, date_range, @@ -3193,10 +3194,28 @@ def test_period_locale(self): If this test fails, all tests using %p format strftime will fail when the runtime locale is different from the compile-time one. 
""" + # Make sure the function is ok AM_LOCAL, PM_LOCAL = get_local_ampm() assert AM_LOCAL == time(1).strftime("%p") assert PM_LOCAL == time(13).strftime("%p") + # Now what about the classes ? + # Timestamp + am_ts = Timestamp(2020, 1, 1, 1) + assert AM_LOCAL == am_ts.strftime("%p") + assert AM_LOCAL == am_ts.fast_strftime("%(ampm)s") + pm_ts = Timestamp(2020, 1, 1, 13) + assert PM_LOCAL == pm_ts.strftime("%p") + assert PM_LOCAL == pm_ts.fast_strftime("%(ampm)s") + + # Period + am_per = Period("2018-03-11 01:00", freq="H") + assert AM_LOCAL == am_per.strftime("%p") + assert AM_LOCAL == am_per.fast_strftime("%(ampm)s") + pm_per = Period("2018-03-11 13:00", freq="H") + assert PM_LOCAL == pm_per.strftime("%p") + assert PM_LOCAL == pm_ts.fast_strftime("%(ampm)s") + @pytest.mark.parametrize("fast_strftime", (False, True)) def test_period_custom(self, fast_strftime): # GH46252 From 0bdb9b2a5d3a1e7dfc7e5c7cdaf1204ccb9feb15 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 10 Mar 2022 17:33:01 +0100 Subject: [PATCH 51/65] isort --- pandas/_libs/tslib.pyx | 2 +- pandas/_libs/tslibs/period.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 9cfc30b9efc7c..ce874a962624c 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -43,10 +43,10 @@ from pandas._libs.util cimport ( is_integer_object, ) +from pandas._libs.tslibs.base import get_local_ampm from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas._libs.tslibs.parsing import parse_datetime_string -from pandas._libs.tslibs.base import get_local_ampm from pandas._libs.tslibs.conversion cimport ( _TSObject, cast_from_unit, diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 8dc94f94d1127..08494abed297d 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -60,8 +60,8 @@ cdef extern from "src/datetime/np_datetime.h": npy_datetimestruct *d) nogil cimport 
pandas._libs.tslibs.util as util -from pandas._libs.tslibs.base import get_local_ampm +from pandas._libs.tslibs.base import get_local_ampm from pandas._libs.tslibs.timedeltas import Timedelta from pandas._libs.tslibs.timestamps import Timestamp From d98154222dbdab320a545b2f3cc0b8bfefcca15f Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 10 Mar 2022 17:38:16 +0100 Subject: [PATCH 52/65] Reverted the mods as it appears it was not this problem --- pandas/_libs/tslib.pyx | 7 +++++-- pandas/_libs/tslibs/period.pyx | 10 +++++++--- pandas/_libs/tslibs/timestamps.pyx | 9 ++++++--- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index ce874a962624c..151dcbf5f3f0b 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -73,6 +73,10 @@ from pandas._libs.missing cimport checknull_with_nat_and_na from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single +AM_LOCAL, PM_LOCAL = get_local_ampm() +"""Get the locale-specific versions of am and pm strings.""" + + def _test_parse_iso8601(ts: str): """ TESTING ONLY: Parse string into Timestamp using iso8601 parser. 
Used @@ -128,6 +132,7 @@ def format_array_from_datetime( ------- np.ndarray[object] """ + global AM_LOCAL, PM_LOCAL cdef: int64_t val, ns, y, h, N = len(values) ndarray[int64_t] consider_values @@ -136,8 +141,6 @@ def format_array_from_datetime( ndarray[object] result = np.empty(N, dtype=object) object ts, res npy_datetimestruct dts - str AM_LOCAL = Timestamp._AM_LOCAL - str PM_LOCAL = Timestamp._PM_LOCAL if na_rep is None: na_rep = 'NaT' diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 08494abed297d..b2896bbd40ba9 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -122,6 +122,10 @@ cdef: INT32_MIN = -2_147_483_648LL +AM_LOCAL, PM_LOCAL = get_local_ampm() +"""Get the locale-specific versions of am and pm strings.""" + + ctypedef struct asfreq_info: int64_t intraday_conversion_factor int is_end @@ -2411,6 +2415,8 @@ cdef class _Period(PeriodMixin): >>> a.fast_strftime(fast_fmt) '2006-Q1' """ + global AM_LOCAL, PM_LOCAL + freq = self._dtype._dtype_code value = self.ordinal @@ -2437,7 +2443,7 @@ cdef class _Period(PeriodMixin): "day": dts.day, "hour": h, "hour12": 12 if h in (0, 12) else (h % 12), - "ampm": Period._PM_LOCAL if (h // 12) else Period._AM_LOCAL, + "ampm": PM_LOCAL if (h // 12) else AM_LOCAL, "min": dts.min, "sec": dts.sec, "ms": dts.us // 1000, @@ -2620,8 +2626,6 @@ class Period(_Period): Second value of the period. 
""" - _AM_LOCAL, _PM_LOCAL = get_local_ampm() - def __new__(cls, value=None, freq=None, ordinal=None, year=None, month=None, quarter=None, day=None, hour=None, minute=None, second=None): diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 2c041d986235b..f726dc442d7e1 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -111,6 +111,9 @@ from pandas._libs.tslibs.tzconversion cimport ( # Constants _zero_time = time(0, 0) _no_input = object() +AM_LOCAL, PM_LOCAL = get_local_ampm() +"""Get the locale-specific versions of am and pm strings.""" + # ---------------------------------------------------------------------- @@ -1062,8 +1065,6 @@ class Timestamp(_Timestamp): Timestamp('2017-01-01 12:00:00') """ - _AM_LOCAL, _PM_LOCAL = get_local_ampm() - @classmethod def fromordinal(cls, ordinal, freq=None, tz=None): """ @@ -1212,6 +1213,8 @@ class Timestamp(_Timestamp): >>> ts.fast_strftime(fmt) '2020-03-14T15:32:52' """ + global AM_LOCAL, PM_LOCAL + y = self.year h = self.hour return fmt_str % { @@ -1221,7 +1224,7 @@ class Timestamp(_Timestamp): "day": self.day, "hour": h, "hour12": 12 if h in (0, 12) else (h % 12), - "ampm": Timestamp._PM_LOCAL if (h // 12) else Timestamp._AM_LOCAL, + "ampm": PM_LOCAL if (h // 12) else AM_LOCAL, "min": self.minute, "sec": self.second, "us": self.microsecond, From 37a622caa7ac2155954141b9d333c0366fcd2c54 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 10 Mar 2022 17:41:05 +0100 Subject: [PATCH 53/65] Fixed failing test on non-en locales --- pandas/tests/io/formats/test_format.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 6ae98206d40ac..d5f771c0de859 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3223,22 +3223,23 @@ def test_period_custom(self, fast_strftime): formatted = 
p.format( date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", fast_strftime=fast_strftime ) - assert formatted[0] == "03 12:01:01PM (ms=123 us=123000 ns=123000000)" - assert formatted[1] == "03 12:01:01PM (ms=124 us=124000 ns=124000000)" + AM_LOCAL, PM_LOCAL = get_local_ampm() + assert formatted[0] == f"03 12:01:01{PM_LOCAL} (ms=123 us=123000 ns=123000000)" + assert formatted[1] == f"03 12:01:01{PM_LOCAL} (ms=124 us=124000 ns=124000000)" p = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="n") formatted = p.format( date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", fast_strftime=fast_strftime ) - assert formatted[0] == "03 12:01:01PM (ms=123 us=123456 ns=123456789)" - assert formatted[1] == "03 12:01:01PM (ms=123 us=123456 ns=123456790)" + assert formatted[0] == f"03 12:01:01{PM_LOCAL} (ms=123 us=123456 ns=123456789)" + assert formatted[1] == f"03 12:01:01{PM_LOCAL} (ms=123 us=123456 ns=123456790)" p = pd.period_range("2003-01-01 12:01:01.123456", periods=2, freq="u") formatted = p.format( date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", fast_strftime=fast_strftime ) - assert formatted[0] == "03 12:01:01PM (ms=123 us=123456 ns=123456000)" - assert formatted[1] == "03 12:01:01PM (ms=123 us=123457 ns=123457000)" + assert formatted[0] == f"03 12:01:01{PM_LOCAL} (ms=123 us=123456 ns=123456000)" + assert formatted[1] == f"03 12:01:01{PM_LOCAL} (ms=123 us=123457 ns=123457000)" def test_period_tz(self): """Test formatting periods created from a datetime with timezone""" @@ -3275,6 +3276,8 @@ def test_datetime_tz(self): assert dt.format()[0] == "2013-01-01 00:00:00+01:00" def test_datetime_tz_custom(self): + AM_LOCAL, PM_LOCAL = get_local_ampm() + # This timestamp is in 2013 in Europe/Paris but is 2012 in UTC dt = pd.to_datetime(["2013-01-01 00:00:00+01:00"], utc=True) @@ -3286,7 +3289,7 @@ def test_datetime_tz_custom(self): # same with fancy format assert ( dt.format(date_format="20%y-%m-%d__foo__%I:%M:%S%p")[0] - == "2012-12-31__foo__11:00:00PM" + == 
f"2012-12-31__foo__11:00:00{PM_LOCAL}" ) # If tz is currently set as paris, we'll see 2013 @@ -3298,7 +3301,7 @@ def test_datetime_tz_custom(self): # same with fancy format assert ( dt.format(date_format="20%y-%m-%d__foo__%I:%M:%S%p")[0] - == "2013-01-01__foo__12:00:00AM" + == f"2013-01-01__foo__12:00:00{AM_LOCAL}" ) def test_date(self): From ec870073f5a8c33c9c8ee8ff2a26f3835390436b Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 10 Mar 2022 19:21:39 +0100 Subject: [PATCH 54/65] Revert "Reverted the mods as it appears it was not this problem" This reverts commit d98154222dbdab320a545b2f3cc0b8bfefcca15f. --- pandas/_libs/tslib.pyx | 7 ++----- pandas/_libs/tslibs/period.pyx | 10 +++------- pandas/_libs/tslibs/timestamps.pyx | 9 +++------ 3 files changed, 8 insertions(+), 18 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 151dcbf5f3f0b..ce874a962624c 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -73,10 +73,6 @@ from pandas._libs.missing cimport checknull_with_nat_and_na from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single -AM_LOCAL, PM_LOCAL = get_local_ampm() -"""Get the locale-specific versions of am and pm strings.""" - - def _test_parse_iso8601(ts: str): """ TESTING ONLY: Parse string into Timestamp using iso8601 parser. 
Used @@ -132,7 +128,6 @@ def format_array_from_datetime( ------- np.ndarray[object] """ - global AM_LOCAL, PM_LOCAL cdef: int64_t val, ns, y, h, N = len(values) ndarray[int64_t] consider_values @@ -141,6 +136,8 @@ def format_array_from_datetime( ndarray[object] result = np.empty(N, dtype=object) object ts, res npy_datetimestruct dts + str AM_LOCAL = Timestamp._AM_LOCAL + str PM_LOCAL = Timestamp._PM_LOCAL if na_rep is None: na_rep = 'NaT' diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index b2896bbd40ba9..08494abed297d 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -122,10 +122,6 @@ cdef: INT32_MIN = -2_147_483_648LL -AM_LOCAL, PM_LOCAL = get_local_ampm() -"""Get the locale-specific versions of am and pm strings.""" - - ctypedef struct asfreq_info: int64_t intraday_conversion_factor int is_end @@ -2415,8 +2411,6 @@ cdef class _Period(PeriodMixin): >>> a.fast_strftime(fast_fmt) '2006-Q1' """ - global AM_LOCAL, PM_LOCAL - freq = self._dtype._dtype_code value = self.ordinal @@ -2443,7 +2437,7 @@ cdef class _Period(PeriodMixin): "day": dts.day, "hour": h, "hour12": 12 if h in (0, 12) else (h % 12), - "ampm": PM_LOCAL if (h // 12) else AM_LOCAL, + "ampm": Period._PM_LOCAL if (h // 12) else Period._AM_LOCAL, "min": dts.min, "sec": dts.sec, "ms": dts.us // 1000, @@ -2626,6 +2620,8 @@ class Period(_Period): Second value of the period. 
""" + _AM_LOCAL, _PM_LOCAL = get_local_ampm() + def __new__(cls, value=None, freq=None, ordinal=None, year=None, month=None, quarter=None, day=None, hour=None, minute=None, second=None): diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index f726dc442d7e1..2c041d986235b 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -111,9 +111,6 @@ from pandas._libs.tslibs.tzconversion cimport ( # Constants _zero_time = time(0, 0) _no_input = object() -AM_LOCAL, PM_LOCAL = get_local_ampm() -"""Get the locale-specific versions of am and pm strings.""" - # ---------------------------------------------------------------------- @@ -1065,6 +1062,8 @@ class Timestamp(_Timestamp): Timestamp('2017-01-01 12:00:00') """ + _AM_LOCAL, _PM_LOCAL = get_local_ampm() + @classmethod def fromordinal(cls, ordinal, freq=None, tz=None): """ @@ -1213,8 +1212,6 @@ class Timestamp(_Timestamp): >>> ts.fast_strftime(fmt) '2020-03-14T15:32:52' """ - global AM_LOCAL, PM_LOCAL - y = self.year h = self.hour return fmt_str % { @@ -1224,7 +1221,7 @@ class Timestamp(_Timestamp): "day": self.day, "hour": h, "hour12": 12 if h in (0, 12) else (h % 12), - "ampm": PM_LOCAL if (h // 12) else AM_LOCAL, + "ampm": Timestamp._PM_LOCAL if (h // 12) else Timestamp._AM_LOCAL, "min": self.minute, "sec": self.second, "us": self.microsecond, From 98cbfca7603889a140529b73f3f49301910aa686 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 11 Mar 2022 12:21:05 +0100 Subject: [PATCH 55/65] New `LocaleSpecificDtStrings` objects are returned by `convert_strftime_format` so that the caller has the correct elements to format in other locales. 
--- pandas/_libs/tslib.pyx | 10 +-- pandas/_libs/tslibs/__init__.py | 2 - pandas/_libs/tslibs/base.pyi | 3 - pandas/_libs/tslibs/base.pyx | 18 +--- pandas/_libs/tslibs/period.pyi | 2 +- pandas/_libs/tslibs/period.pyx | 13 ++- pandas/_libs/tslibs/strftime.py | 73 ++++++++++++++- pandas/_libs/tslibs/timestamps.pyi | 2 +- pandas/_libs/tslibs/timestamps.pyx | 13 ++- pandas/core/arrays/period.py | 5 +- pandas/core/tools/datetimes.py | 15 ++-- pandas/tests/io/formats/test_format.py | 119 +++++++++++++++---------- pandas/tests/tslibs/test_api.py | 1 - 13 files changed, 168 insertions(+), 108 deletions(-) delete mode 100644 pandas/_libs/tslibs/base.pyi diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index ce874a962624c..3db419437ad24 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -43,7 +43,6 @@ from pandas._libs.util cimport ( is_integer_object, ) -from pandas._libs.tslibs.base import get_local_ampm from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas._libs.tslibs.parsing import parse_datetime_string @@ -136,8 +135,7 @@ def format_array_from_datetime( ndarray[object] result = np.empty(N, dtype=object) object ts, res npy_datetimestruct dts - str AM_LOCAL = Timestamp._AM_LOCAL - str PM_LOCAL = Timestamp._PM_LOCAL + object str_format, loc_s if na_rep is None: na_rep = 'NaT' @@ -174,7 +172,7 @@ def format_array_from_datetime( else: try: # Try to get the string formatting template for this format - str_format = convert_strftime_format(format) + str_format, loc_s = convert_strftime_format(format) except UnsupportedStrFmtDirective: # Unsupported directive: fallback to standard `strftime` fast_strftime = False @@ -215,7 +213,7 @@ def format_array_from_datetime( "day": dts.day, "hour": dts.hour, "hour12": 12 if h in (0, 12) else (h % 12), - "ampm": PM_LOCAL if (h // 12) else AM_LOCAL, + "ampm": loc_s.pm if (h // 12) else loc_s.am, "min": dts.min, "sec": dts.sec, "us": dts.us, @@ -224,7 +222,7 @@ def 
format_array_from_datetime( ts = Timestamp(val, tz=tz) # Use string formatting for faster strftime - result[i] = ts.fast_strftime(str_format) + result[i] = ts.fast_strftime(str_format, loc_s) else: ts = Timestamp(val, tz=tz) diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index a56d08639bd89..80d7a9de19735 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -1,6 +1,5 @@ __all__ = [ "dtypes", - "get_local_ampm", "localize_pydatetime", "NaT", "NaTType", @@ -30,7 +29,6 @@ ] from pandas._libs.tslibs import dtypes -from pandas._libs.tslibs.base import get_local_ampm from pandas._libs.tslibs.conversion import ( OutOfBoundsTimedelta, localize_pydatetime, diff --git a/pandas/_libs/tslibs/base.pyi b/pandas/_libs/tslibs/base.pyi deleted file mode 100644 index d33c416d78dfa..0000000000000 --- a/pandas/_libs/tslibs/base.pyi +++ /dev/null @@ -1,3 +0,0 @@ -from typing import Tuple - -def get_local_ampm() -> Tuple[str, str]: ... diff --git a/pandas/_libs/tslibs/base.pyx b/pandas/_libs/tslibs/base.pyx index 5c2cc213d71ea..1677a8b0be1ec 100644 --- a/pandas/_libs/tslibs/base.pyx +++ b/pandas/_libs/tslibs/base.pyx @@ -5,24 +5,8 @@ in order to allow for fast isinstance checks without circular dependency issues. This is analogous to core.dtypes.generic. """ -from cpython.datetime cimport ( - datetime, - time, -) +from cpython.datetime cimport datetime cdef class ABCTimestamp(datetime): pass - - -def get_local_ampm(): - """Return the am and pm strings used in the current locale. - - We will use these when formatting datetime as string using string templates, which - is faster than strftime when executed on arrays. The strftime directive where we - need these is `%p`. - - Note that the considered locale is the one active when this function is called - (not at cython compilation time). 
- """ - return time(1).strftime("%p"), time(13).strftime("%p") diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi index 787ea66f37ed9..33bbab7b24190 100644 --- a/pandas/_libs/tslibs/period.pyi +++ b/pandas/_libs/tslibs/period.pyi @@ -76,7 +76,7 @@ class Period: def _from_ordinal(cls, ordinal: int, freq) -> Period: ... @classmethod def now(cls, freq: BaseOffset = ...) -> Period: ... - def fast_strftime(self, fmt_str: str) -> str: ... + def fast_strftime(self, fmt_str: str, loc_s: object) -> str: ... def strftime(self, fmt: str) -> str: ... def to_timestamp( self, diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 08494abed297d..6e3dc887d3438 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -61,7 +61,6 @@ cdef extern from "src/datetime/np_datetime.h": cimport pandas._libs.tslibs.util as util -from pandas._libs.tslibs.base import get_local_ampm from pandas._libs.tslibs.timedeltas import Timedelta from pandas._libs.tslibs.timestamps import Timestamp @@ -2393,10 +2392,10 @@ cdef class _Period(PeriodMixin): object_state = None, self.freq, self.ordinal return (Period, object_state) - def fast_strftime(self, fmt_str: str) -> str: + def fast_strftime(self, fmt_str: str, loc_s: object) -> str: """A faster alternative to `strftime` using string formatting. - `fmt_str` should be created using `convert_strftime_format(fmt)`. + `fmt_str` and `loc_s` should be created using `convert_strftime_format(fmt)`. See also `self.strftime`, that relies on `period_format`. 
@@ -2407,8 +2406,8 @@ cdef class _Period(PeriodMixin): >>> a = Period(freq='Q-JUL', year=2006, quarter=1) >>> a.strftime('%F-Q%q') '2006-Q1' - >>> fast_fmt = convert_strftime_format('%F-Q%q', target="period") - >>> a.fast_strftime(fast_fmt) + >>> fast_fmt, loc_s = convert_strftime_format('%F-Q%q', target="period") + >>> a.fast_strftime(fast_fmt, loc_s) '2006-Q1' """ freq = self._dtype._dtype_code @@ -2437,7 +2436,7 @@ cdef class _Period(PeriodMixin): "day": dts.day, "hour": h, "hour12": 12 if h in (0, 12) else (h % 12), - "ampm": Period._PM_LOCAL if (h // 12) else Period._AM_LOCAL, + "ampm": loc_s.pm if (h // 12) else loc_s.am, "min": dts.min, "sec": dts.sec, "ms": dts.us // 1000, @@ -2620,8 +2619,6 @@ class Period(_Period): Second value of the period. """ - _AM_LOCAL, _PM_LOCAL = get_local_ampm() - def __new__(cls, value=None, freq=None, ordinal=None, year=None, month=None, quarter=None, day=None, hour=None, minute=None, second=None): diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index 6411649792ece..1f26f40276783 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -1,5 +1,9 @@ """Strftime-related classes and functions. """ +import locale + +from datetime import time +from typing import Tuple, Dict class UnsupportedStrFmtDirective(ValueError): @@ -59,11 +63,72 @@ class UnsupportedStrFmtDirective(ValueError): } +class LocaleSpecificDtStrings: + """A container for date/time strings used in a specific locale. + + We will use these when formatting datetime as string using string templates, which + is faster than strftime when executed on arrays. + + `get_current_locale_specific_string()` is the recommended way to get an instance, + as it provides caching. + + Attributes + ---------- + am : str + Used in the %p strftime directive. Locale’s equivalent of AM. + pm : str + Used in the %p strftime directive. Locale’s equivalent of PM. 
+ """ + + __slots__ = ("am", "pm") + + def __init__(self, am: str, pm: str): + self.am = am + self.pm = pm + + def __repr__(self): + attrs = ", ".join( + f"{k}={repr(getattr(self, k))}" + for k in type(self).__slots__ + ) + return f"{type(self).__name__}({attrs})" + + @classmethod + def get_current(cls): + return LocaleSpecificDtStrings( + am=time(1).strftime("%p"), + pm=time(13).strftime("%p"), + ) + + +_locale_specifics: Dict[str, LocaleSpecificDtStrings] = dict() + + +def get_current_locale_specific_string() -> LocaleSpecificDtStrings: + """Return a `LocaleSpecificDtStrings` for the current locale. + + This function caches results in the `_locale_specifics` dict. + """ + global _locale_specifics + + # Get current locale + current_locale = locale.setlocale(locale.LC_ALL) + + try: + # Any entry in cache for current locale ? + return _locale_specifics[current_locale] + except KeyError: + # Create it using current locale, and cache it + o = LocaleSpecificDtStrings.get_current() + _locale_specifics[current_locale] = o + return o + + def convert_strftime_format( strftime_fmt: str, target: str = "datetime", new_style_fmt: bool = False, -) -> str: +) -> Tuple[str, LocaleSpecificDtStrings]: """Convert a strftime formatting string into a formatting template string. The set of supported directives varies according to the `target`. @@ -115,6 +180,10 @@ def convert_strftime_format( `new_style_formatting`. For old-style, it may be used as `fmt_out % fmt_dct`. For new-style, it may be used as `fmt_out.format(**fmt_dct)` + loc_s : LocaleSpecificDtStrings + An object containing the locale-specific strings needed for some of the + directives. For example loc_s.am and loc_s.pm should be used to fill the "ampm" + part of the template, induced by directive %p. 
Raises ------ @@ -192,4 +261,4 @@ def convert_strftime_format( # Finally replace our placeholder strftime_fmt = strftime_fmt.replace(esc, "%") - return strftime_fmt + return strftime_fmt, get_current_locale_specific_string() diff --git a/pandas/_libs/tslibs/timestamps.pyi b/pandas/_libs/tslibs/timestamps.pyi index bd1206fe8ce28..71af54fbf2419 100644 --- a/pandas/_libs/tslibs/timestamps.pyi +++ b/pandas/_libs/tslibs/timestamps.pyi @@ -104,7 +104,7 @@ class Timestamp(datetime): def combine(cls, date: _date, time: _time) -> datetime: ... # type: ignore[override] @classmethod def fromisoformat(cls: type[_DatetimeT], date_string: str) -> _DatetimeT: ... - def fast_strftime(self, fmt_str: str) -> str: ... + def fast_strftime(self, fmt_str: str, loc_s: object) -> str: ... def strftime(self, format: str) -> str: ... def __format__(self, fmt: str) -> str: ... def toordinal(self) -> int: ... diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 2c041d986235b..bc2ff4d536a4e 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -46,7 +46,6 @@ from cpython.object cimport ( PyDateTime_IMPORT from pandas._libs.tslibs cimport ccalendar -from pandas._libs.tslibs.base import get_local_ampm from pandas._libs.tslibs.base cimport ABCTimestamp from pandas._libs.tslibs.conversion cimport ( _TSObject, @@ -1062,8 +1061,6 @@ class Timestamp(_Timestamp): Timestamp('2017-01-01 12:00:00') """ - _AM_LOCAL, _PM_LOCAL = get_local_ampm() - @classmethod def fromordinal(cls, ordinal, freq=None, tz=None): """ @@ -1197,10 +1194,10 @@ class Timestamp(_Timestamp): tz = maybe_get_tz(tz) return cls(datetime.fromtimestamp(ts, tz)) - def fast_strftime(self, fmt_str: str) -> str: + def fast_strftime(self, fmt_str: str, loc_s: object) -> str: """A faster alternative to `strftime` using string formatting. - `fmt_str` should be created using `convert_strftime_format(fmt)`. 
+ `fmt_str` and `loc_s` should be created using `convert_strftime_format(fmt)`. See also `self.strftime`, that relies on `datetime.strftime`. @@ -1208,8 +1205,8 @@ class Timestamp(_Timestamp): -------- >>> from pandas._libs.tslibs import convert_strftime_format >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') - >>> fmt = convert_strftime_format('%Y-%m-%dT%H:%M:%S') - >>> ts.fast_strftime(fmt) + >>> fmt, loc_s = convert_strftime_format('%Y-%m-%dT%H:%M:%S') + >>> ts.fast_strftime(fmt, loc_s) '2020-03-14T15:32:52' """ y = self.year @@ -1221,7 +1218,7 @@ class Timestamp(_Timestamp): "day": self.day, "hour": h, "hour12": 12 if h in (0, 12) else (h % 12), - "ampm": Timestamp._PM_LOCAL if (h // 12) else Timestamp._AM_LOCAL, + "ampm": loc_s.pm if (h // 12) else loc_s.am, "min": self.minute, "sec": self.second, "us": self.microsecond, diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 1e388da8c6fc7..c35ca6fab1768 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -652,14 +652,15 @@ def _format_native_types( if fast_strftime: try: # Try to get the string formatting template for this format - str_format = convert_strftime_format(date_format, target="period") + str_format, loc_s = convert_strftime_format(date_format, + target="period") except UnsupportedStrFmtDirective: # Unsupported directive: fallback to standard `strftime` fast_strftime = False if fast_strftime: # Faster: python old-style string formatting - formatter = lambda p: p.fast_strftime(str_format) + formatter = lambda p: p.fast_strftime(str_format, loc_s) else: # Slower: strftime formatter = lambda p: p.strftime(date_format) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index d30e5bb70f036..9d499d2533703 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -25,7 +25,6 @@ Timestamp, conversion, convert_strftime_format, - get_local_ampm, iNaT, nat_strings, parsing, @@ -1282,13 +1281,10 @@ def 
to_time(arg, format=None, infer_time_format=False, errors="raise"): return to_time(arg, format, infer_time_format, errors) -AM_LOCAL, PM_LOCAL = get_local_ampm() -"""Get the locale-specific versions of am and pm strings.""" - - def fast_strftime( dt: datetime, fmt: str, + loc_s: LocalizedDtStrings, new_style_fmt: bool = False, ) -> str: """A faster version of `datetime.strftime` using python string formatting. @@ -1329,6 +1325,10 @@ def fast_strftime( # common dict used for formatting y = dt.year h = dt.hour + + # get the formatting template + fmt_str, loc_s = convert_strftime_format(fmt, new_style_fmt=new_style_fmt) + fmt_dct = { "year": y, "shortyear": y % 100, @@ -1336,14 +1336,11 @@ def fast_strftime( "day": dt.day, "hour": h, "hour12": 12 if h in (0, 12) else (h % 12), - "ampm": PM_LOCAL if (h // 12) else AM_LOCAL, + "ampm": loc_s.pm if (h // 12) else loc_s.am, "min": dt.minute, "sec": dt.second, "us": dt.microsecond, } - # get the formatting template - fmt_str = convert_strftime_format(fmt, new_style_fmt=new_style_fmt) - # execute return fmt_str.format(**fmt_dct) if new_style_fmt else (fmt_str % fmt_dct) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index d5f771c0de859..05396efb42cd4 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -7,6 +7,7 @@ ) from io import StringIO import itertools +import locale from operator import methodcaller import os from pathlib import Path @@ -21,10 +22,7 @@ import pytest import pytz -from pandas._libs.tslibs import ( - convert_strftime_format, - get_local_ampm, -) +from pandas._libs.tslibs import convert_strftime_format from pandas.compat import ( IS64, is_platform_windows, @@ -37,7 +35,6 @@ Index, MultiIndex, NaT, - Period, Series, Timestamp, date_range, @@ -56,6 +53,13 @@ use_32bit_repr = is_platform_windows() or not IS64 +def get_local_am_pm(): + """Return the AM and PM strings returned by strftime in current locale""" + am_local 
= time(1).strftime("%p") + pm_local = time(13).strftime("%p") + return am_local, pm_local + + @pytest.fixture(params=["string", "pathlike", "buffer"]) def filepath_or_buffer_id(request): """ @@ -3188,58 +3192,69 @@ def test_period(self): assert formatted[0] == "2003-01-01 12:01:01.123456789" assert formatted[1] == "2003-01-01 12:01:01.123456790" - def test_period_locale(self): - """Test that `get_local_ampm` relies on runtime locale, not compile-time one + def test_period_locale(self, overridden_locale): + """Test that `convert_strftime_format` relies on runtime locale If this test fails, all tests using %p format strftime will fail when the runtime locale is different from the compile-time one. """ - # Make sure the function is ok - AM_LOCAL, PM_LOCAL = get_local_ampm() - assert AM_LOCAL == time(1).strftime("%p") - assert PM_LOCAL == time(13).strftime("%p") + # Get locale-specific reference + am_local, pm_local = get_local_am_pm() + + # Use the function + str_tmp, loc_s = convert_strftime_format("%p") + assert str_tmp == "%(ampm)s" # Now what about the classes ? 
# Timestamp am_ts = Timestamp(2020, 1, 1, 1) - assert AM_LOCAL == am_ts.strftime("%p") - assert AM_LOCAL == am_ts.fast_strftime("%(ampm)s") + assert am_local == am_ts.strftime("%p") + assert am_local == am_ts.fast_strftime(str_tmp, loc_s) pm_ts = Timestamp(2020, 1, 1, 13) - assert PM_LOCAL == pm_ts.strftime("%p") - assert PM_LOCAL == pm_ts.fast_strftime("%(ampm)s") + assert pm_local == pm_ts.strftime("%p") + assert pm_local == pm_ts.fast_strftime(str_tmp, loc_s) # Period - am_per = Period("2018-03-11 01:00", freq="H") - assert AM_LOCAL == am_per.strftime("%p") - assert AM_LOCAL == am_per.fast_strftime("%(ampm)s") - pm_per = Period("2018-03-11 13:00", freq="H") - assert PM_LOCAL == pm_per.strftime("%p") - assert PM_LOCAL == pm_ts.fast_strftime("%(ampm)s") + am_per = pd.Period("2018-03-11 01:00", freq="H") + assert am_local == am_per.strftime("%p") + assert am_local == am_per.fast_strftime(str_tmp, loc_s) + pm_per = pd.Period("2018-03-11 13:00", freq="H") + assert pm_local == pm_per.strftime("%p") + assert pm_local == pm_ts.fast_strftime(str_tmp, loc_s) @pytest.mark.parametrize("fast_strftime", (False, True)) - def test_period_custom(self, fast_strftime): + def test_period_custom(self, fast_strftime, overridden_locale): # GH46252 + + # Get locale-specific reference + am_local, pm_local = get_local_am_pm() + + # 3 digits p = pd.period_range("2003-01-01 12:01:01.123", periods=2, freq="l") formatted = p.format( - date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", fast_strftime=fast_strftime + date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", + fast_strftime=fast_strftime ) - AM_LOCAL, PM_LOCAL = get_local_ampm() - assert formatted[0] == f"03 12:01:01{PM_LOCAL} (ms=123 us=123000 ns=123000000)" - assert formatted[1] == f"03 12:01:01{PM_LOCAL} (ms=124 us=124000 ns=124000000)" + assert formatted[0] == f"03 12:01:01{pm_local} (ms=123 us=123000 ns=123000000)" + assert formatted[1] == f"03 12:01:01{pm_local} (ms=124 us=124000 ns=124000000)" - p = pd.period_range("2003-01-01 
12:01:01.123456789", periods=2, freq="n") + # 6 digits + p = pd.period_range("2003-01-01 12:01:01.123456", periods=2, freq="u") formatted = p.format( - date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", fast_strftime=fast_strftime + date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", + fast_strftime=fast_strftime ) - assert formatted[0] == f"03 12:01:01{PM_LOCAL} (ms=123 us=123456 ns=123456789)" - assert formatted[1] == f"03 12:01:01{PM_LOCAL} (ms=123 us=123456 ns=123456790)" + assert formatted[0] == f"03 12:01:01{pm_local} (ms=123 us=123456 ns=123456000)" + assert formatted[1] == f"03 12:01:01{pm_local} (ms=123 us=123457 ns=123457000)" - p = pd.period_range("2003-01-01 12:01:01.123456", periods=2, freq="u") + # 9 digits + p = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="n") formatted = p.format( - date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", fast_strftime=fast_strftime + date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", + fast_strftime=fast_strftime ) - assert formatted[0] == f"03 12:01:01{PM_LOCAL} (ms=123 us=123456 ns=123456000)" - assert formatted[1] == f"03 12:01:01{PM_LOCAL} (ms=123 us=123457 ns=123457000)" + assert formatted[0] == f"03 12:01:01{pm_local} (ms=123 us=123456 ns=123456789)" + assert formatted[1] == f"03 12:01:01{pm_local} (ms=123 us=123456 ns=123456790)" def test_period_tz(self): """Test formatting periods created from a datetime with timezone""" @@ -3260,6 +3275,16 @@ def test_period_tz(self): assert p.format()[0] == "2013-01-01 00:00" +@pytest.fixture(params=["fr_FR", "zh_CN", "it_IT"]) +def overridden_locale(request): + """A fixture used to change the locale temporarily""" + old = locale.setlocale(locale.LC_ALL) + target = request.param + locale.setlocale(locale.LC_ALL, target) + yield target + locale.setlocale(locale.LC_ALL, old) + + class TestDatetimeIndexFormat: def test_datetime(self): formatted = pd.to_datetime([datetime(2003, 1, 1, 12), NaT]).format() @@ -3276,7 +3301,8 @@ def test_datetime_tz(self): assert 
dt.format()[0] == "2013-01-01 00:00:00+01:00" def test_datetime_tz_custom(self): - AM_LOCAL, PM_LOCAL = get_local_ampm() + # Get locale-specific reference + am_local, pm_local = get_local_am_pm() # This timestamp is in 2013 in Europe/Paris but is 2012 in UTC dt = pd.to_datetime(["2013-01-01 00:00:00+01:00"], utc=True) @@ -3289,7 +3315,7 @@ def test_datetime_tz_custom(self): # same with fancy format assert ( dt.format(date_format="20%y-%m-%d__foo__%I:%M:%S%p")[0] - == f"2012-12-31__foo__11:00:00{PM_LOCAL}" + == f"2012-12-31__foo__11:00:00{pm_local}" ) # If tz is currently set as paris, we'll see 2013 @@ -3301,7 +3327,7 @@ def test_datetime_tz_custom(self): # same with fancy format assert ( dt.format(date_format="20%y-%m-%d__foo__%I:%M:%S%p")[0] - == f"2013-01-01__foo__12:00:00{AM_LOCAL}" + == f"2013-01-01__foo__12:00:00{am_local}" ) def test_date(self): @@ -3412,8 +3438,12 @@ def test_fast_strftime_basic(self, strftime_format, new_style): # strftime strftime_res = dt.strftime(strftime_format) - # Get the locale-specific versions of am and pm strings. 
- AM_LOCAL, PM_LOCAL = get_local_ampm() + # get the formatting string + # fmt: off + style_format, loc_s = convert_strftime_format( + strftime_format, new_style_fmt=new_style + ) + # fmt: on # common dict used for formatting fmt_dct = { @@ -3423,19 +3453,12 @@ def test_fast_strftime_basic(self, strftime_format, new_style): "day": dt.day, "hour": dt.hour, "hour12": 12 if dt.hour in (0, 12) else (dt.hour % 12), - "ampm": PM_LOCAL if (dt.hour // 12) else AM_LOCAL, + "ampm": loc_s.pm if (dt.hour // 12) else loc_s.am, "min": dt.minute, "sec": dt.second, "us": dt.microsecond, } - # get the formatting string - # fmt: off - style_format = convert_strftime_format( - strftime_format, new_style_fmt=new_style - ) - # fmt: on - # apply it and check the output if new_style: res = style_format.format(**fmt_dct) @@ -3444,7 +3467,7 @@ def test_fast_strftime_basic(self, strftime_format, new_style): assert res == strftime_res # fast_strftime is a shortcut function for the above - res2 = fast_strftime(dt, strftime_format, new_style_fmt=new_style) + res2 = fast_strftime(dt, strftime_format, loc_s=loc_s, new_style_fmt=new_style) assert res2 == res def test_bad_strftime_directive(self): diff --git a/pandas/tests/tslibs/test_api.py b/pandas/tests/tslibs/test_api.py index 5685a6bdaab34..10446a2e88288 100644 --- a/pandas/tests/tslibs/test_api.py +++ b/pandas/tests/tslibs/test_api.py @@ -52,7 +52,6 @@ def test_namespace(): "tz_convert_from_utc_single", "to_offset", "tz_compare", - "get_local_ampm", ] expected = set(submodules + api) From c80bbecd62daa68866240a4a6cc6f9625cf7d1b0 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 11 Mar 2022 12:28:20 +0100 Subject: [PATCH 56/65] Removed useless argument in fast_strftime --- pandas/core/tools/datetimes.py | 1 - pandas/tests/io/formats/test_format.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 9d499d2533703..7930897e28019 100644 --- 
a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -1284,7 +1284,6 @@ def to_time(arg, format=None, infer_time_format=False, errors="raise"): def fast_strftime( dt: datetime, fmt: str, - loc_s: LocalizedDtStrings, new_style_fmt: bool = False, ) -> str: """A faster version of `datetime.strftime` using python string formatting. diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 05396efb42cd4..9cd8f7f1e6c56 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3467,7 +3467,8 @@ def test_fast_strftime_basic(self, strftime_format, new_style): assert res == strftime_res # fast_strftime is a shortcut function for the above - res2 = fast_strftime(dt, strftime_format, loc_s=loc_s, new_style_fmt=new_style) + # but it would actually only be fast if it could receive lists :) + res2 = fast_strftime(dt, strftime_format, new_style_fmt=new_style) assert res2 == res def test_bad_strftime_directive(self): From b173dc6f5638e020abb3f2199b84569ab77d28bd Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 11 Mar 2022 12:33:47 +0100 Subject: [PATCH 57/65] Fixed #46319 by using `PyUnicode_DecodeLocale` to decode the string returned by `c_strftime`. 
--- pandas/_libs/tslibs/period.pyx | 2 +- pandas/_libs/tslibs/util.pxd | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 6e3dc887d3438..f30d9c28aaa83 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1303,7 +1303,7 @@ cdef str _period_strftime(int64_t value, int freq, bytes fmt): # Execute c_strftime to process the usual datetime directives formatted = c_strftime(&dts, fmt) - result = util.char_to_string(formatted) + result = util.char_to_string_locale(formatted) free(formatted) # Now fill the placeholders corresponding to our additional directives diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index 150516aadffc6..420ebef622f97 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -11,6 +11,15 @@ cdef extern from *: object char_to_string(const char* data) +cdef extern from *: + object PyUnicode_DecodeLocale(const char *str, const char *errors) + + +cdef inline object char_to_string_locale(const char* data): + """As opposed to PyUnicode_FromString, use the locale to decode.""" + return PyUnicode_DecodeLocale(data, NULL) + + cdef extern from "Python.h": # Note: importing extern-style allows us to declare these as nogil # functions, whereas `from cpython cimport` does not. From b44c6cf96e94c5a91cd847c6e186164e28a78f33 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 11 Mar 2022 12:34:40 +0100 Subject: [PATCH 58/65] flake8 black and isort --- pandas/_libs/tslibs/strftime.py | 13 +++++++------ pandas/io/formats/csvs.py | 3 --- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index 1f26f40276783..2bcc69ae05e1c 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -1,9 +1,11 @@ """Strftime-related classes and functions. 
""" -import locale - from datetime import time -from typing import Tuple, Dict +import locale +from typing import ( + Dict, + Tuple, +) class UnsupportedStrFmtDirective(ValueError): @@ -88,8 +90,7 @@ def __init__(self, am: str, pm: str): def __repr__(self): attrs = ", ".join( - f"{k}={repr(getattr(self, k))}" - for k in type(self).__slots__ + [f"{k}={repr(getattr(self, k))}" for k in type(self).__slots__] ) return f"{type(self).__name__}({attrs})" @@ -101,7 +102,7 @@ def get_current(cls): ) -_locale_specifics: Dict[str, LocaleSpecificDtStrings] = dict() +_locale_specifics: Dict[str, LocaleSpecificDtStrings] = {} def get_current_locale_specific_string() -> LocaleSpecificDtStrings: diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 0e04a1b156daf..c4a422ed91292 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -29,12 +29,9 @@ from pandas.util._decorators import cache_readonly from pandas.core.dtypes.generic import ( - ABCDatetimeIndex, ABCIndex, ABCMultiIndex, - ABCPeriodIndex, ) -from pandas.core.dtypes.missing import notna from pandas.core.indexes.api import Index From 4956ff1ad168acf3c03d2e72626904bc7ba26960 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 11 Mar 2022 12:53:24 +0100 Subject: [PATCH 59/65] black --- pandas/core/arrays/period.py | 5 +++-- pandas/tests/io/formats/test_format.py | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index c35ca6fab1768..408f7467a903d 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -652,8 +652,9 @@ def _format_native_types( if fast_strftime: try: # Try to get the string formatting template for this format - str_format, loc_s = convert_strftime_format(date_format, - target="period") + str_format, loc_s = convert_strftime_format( + date_format, target="period" + ) except UnsupportedStrFmtDirective: # Unsupported directive: fallback to standard `strftime` 
fast_strftime = False diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 9cd8f7f1e6c56..5b2e25a705bca 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3225,6 +3225,7 @@ def test_period_locale(self, overridden_locale): @pytest.mark.parametrize("fast_strftime", (False, True)) def test_period_custom(self, fast_strftime, overridden_locale): # GH46252 + # fmt: off # Get locale-specific reference am_local, pm_local = get_local_am_pm() @@ -3255,6 +3256,7 @@ def test_period_custom(self, fast_strftime, overridden_locale): ) assert formatted[0] == f"03 12:01:01{pm_local} (ms=123 us=123456 ns=123456789)" assert formatted[1] == f"03 12:01:01{pm_local} (ms=123 us=123456 ns=123456790)" + # fmt: on def test_period_tz(self): """Test formatting periods created from a datetime with timezone""" From 399878a94bbf5b57c407de35714a03cd75e59cd4 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 11 Mar 2022 22:16:27 +0100 Subject: [PATCH 60/65] Fixed `_format_datetime64_dateonly` used in `get_format_datetime64`. It was not covered by any test! 
TODO --- pandas/io/formats/format.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 0aaec93a288fb..463da1ee2d54e 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1777,6 +1777,7 @@ def _format_datetime64_dateonly( nat_rep: str = "NaT", date_format: str | None = None, str_date_fmt: str | None = None, + loc_s: object | None = None, ) -> str: if isinstance(x, NaTType): return nat_rep @@ -1784,7 +1785,7 @@ def _format_datetime64_dateonly( if date_format: if str_date_fmt: # Faster, using string formatting - return x.fast_strftime(str_date_fmt) + return x.fast_strftime(str_date_fmt, loc_s) else: # Slower return x.strftime(date_format) @@ -1811,13 +1812,17 @@ def get_format_datetime64( if date_format is not None and fast_strftime: try: # Try to get the string formatting template for this format - str_date_fmt = convert_strftime_format(date_format) + str_date_fmt, loc_s = convert_strftime_format(date_format) except UnsupportedStrFmtDirective: # Unsupported directive: fallback to standard `strftime` pass return lambda x: _format_datetime64_dateonly( - x, nat_rep=nat_rep, date_format=date_format, str_date_fmt=str_date_fmt + x, + nat_rep=nat_rep, + date_format=date_format, + str_date_fmt=str_date_fmt, + loc_s=loc_s, ) else: # Relies on datetime.str, which is fast already From 5b6e07e4f10f091e6b78f32327d5ddcf7b715ef0 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 11 Mar 2022 22:17:14 +0100 Subject: [PATCH 61/65] Removed the multi-locale test as it fails on some ci targets --- pandas/tests/io/formats/test_format.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 5b2e25a705bca..097666317c9f7 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3192,7 +3192,7 @@ def 
test_period(self): assert formatted[0] == "2003-01-01 12:01:01.123456789" assert formatted[1] == "2003-01-01 12:01:01.123456790" - def test_period_locale(self, overridden_locale): + def test_period_locale(self): """Test that `convert_strftime_format` relies on runtime locale If this test fails, all tests using %p format strftime will fail when @@ -3223,7 +3223,7 @@ def test_period_locale(self, overridden_locale): assert pm_local == pm_ts.fast_strftime(str_tmp, loc_s) @pytest.mark.parametrize("fast_strftime", (False, True)) - def test_period_custom(self, fast_strftime, overridden_locale): + def test_period_custom(self, fast_strftime): # GH46252 # fmt: off @@ -3277,14 +3277,14 @@ def test_period_tz(self): assert p.format()[0] == "2013-01-01 00:00" -@pytest.fixture(params=["fr_FR", "zh_CN", "it_IT"]) -def overridden_locale(request): - """A fixture used to change the locale temporarily""" - old = locale.setlocale(locale.LC_ALL) - target = request.param - locale.setlocale(locale.LC_ALL, target) - yield target - locale.setlocale(locale.LC_ALL, old) +# @pytest.fixture(params=["fr_FR", "zh_CN", "it_IT"]) +# def overridden_locale(request): +# """A fixture used to change the locale temporarily""" +# old = locale.setlocale(locale.LC_ALL) +# target = request.param +# locale.setlocale(locale.LC_ALL, target) +# yield target +# locale.setlocale(locale.LC_ALL, old) class TestDatetimeIndexFormat: From 743106f055e1ca53151fe3b30ffbd7c2d89c31c7 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 11 Mar 2022 22:33:29 +0100 Subject: [PATCH 62/65] Fixed var init issue --- pandas/io/formats/format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 463da1ee2d54e..d869c0410d03c 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1808,7 +1808,7 @@ def get_format_datetime64( a string as output""" if is_dates_only: - str_date_fmt = None + str_date_fmt = loc_s = None if date_format 
is not None and fast_strftime: try: # Try to get the string formatting template for this format From 44724af46cf7386c5367aedddd9f29eaffd899ff Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sat, 12 Mar 2022 14:28:38 +0100 Subject: [PATCH 63/65] Flake8 --- pandas/tests/io/formats/test_format.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 097666317c9f7..e738880173cec 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -7,7 +7,6 @@ ) from io import StringIO import itertools -import locale from operator import methodcaller import os from pathlib import Path From 718a6474c111bd1b31eda1234bd2dd143be20cb8 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sat, 12 Mar 2022 14:53:37 +0100 Subject: [PATCH 64/65] Improved `Datetime64TZFormatter` readability by replacing useless call to `is_dates_only` by an assert that the result is False. Added a corresponding test --- pandas/io/formats/format.py | 7 +++++-- pandas/tests/arrays/categorical/test_repr.py | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index d869c0410d03c..95b74c56af8ee 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1849,9 +1849,12 @@ class Datetime64TZFormatter(Datetime64Formatter): def _format_strings(self) -> list[str]: """we by definition have a TZ""" values = self.values.astype(object) - ido = is_dates_only(values) + # When there is a timezone `is_dates_only` always returns `False` since dates + # are not universal dates but 00:00:00 timestamps in the given timezone. 
+ assert not is_dates_only(values) formatter = self.formatter or get_format_datetime64( - ido, date_format=self.date_format, fast_strftime=self.fast_strftime + is_dates_only=False, date_format=self.date_format, + fast_strftime=self.fast_strftime ) fmt_values = [formatter(x) for x in values] diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index 3454c8bb90941..b78df1041d399 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -207,6 +207,22 @@ def test_categorical_repr_datetime(self): assert repr(c) == exp + # same with dates only: since there is a timezone, dates become datetimes + idx = date_range("2011-01-01", freq="D", periods=5, tz="US/Eastern") + c = Categorical(idx) + exp = ( + "[2011-01-01 00:00:00-05:00, 2011-01-02 00:00:00-05:00, " + "2011-01-03 00:00:00-05:00, 2011-01-04 00:00:00-05:00, " + "2011-01-05 00:00:00-05:00]\n" + "Categories (5, datetime64[ns, US/Eastern]): " + "[2011-01-01 00:00:00-05:00, 2011-01-02 00:00:00-05:00,\n" + " " + "2011-01-03 00:00:00-05:00, 2011-01-04 00:00:00-05:00,\n" + " " + "2011-01-05 00:00:00-05:00]" + ) + assert repr(c) == exp + def test_categorical_repr_datetime_ordered(self): idx = date_range("2011-01-01 09:00", freq="H", periods=5) c = Categorical(idx, ordered=True) From e6bb843496dcf4ce7284910ffb43f962b0a7cc28 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sat, 12 Mar 2022 15:47:31 +0100 Subject: [PATCH 65/65] Flake8 fix ! --- pandas/io/formats/format.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 95b74c56af8ee..9113e40768fd4 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1853,8 +1853,9 @@ def _format_strings(self) -> list[str]: # are not universal dates but 00:00:00 timestamps in the given timezone. 
assert not is_dates_only(values) formatter = self.formatter or get_format_datetime64( - is_dates_only=False, date_format=self.date_format, - fast_strftime=self.fast_strftime + is_dates_only=False, + date_format=self.date_format, + fast_strftime=self.fast_strftime, ) fmt_values = [formatter(x) for x in values]