diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index ff0ccffced0f3..baf44fb260dcb 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -79,7 +79,13 @@ def setup(self): rng = date_range("1/1/2000", periods=1000) self.data = DataFrame(rng, index=rng) - def time_frame_date_formatting(self): + def time_frame_date_formatting_default(self): + self.data.to_csv(self.fname) + + def time_frame_date_formatting_default_explicit(self): + self.data.to_csv(self.fname, date_format="%Y-%m-%d") + + def time_frame_date_formatting_custom(self): self.data.to_csv(self.fname, date_format="%Y%m%d") @@ -90,11 +96,14 @@ def setup(self): rng = date_range("2000", periods=100_000, freq="s") self.data = DataFrame({"a": 1}, index=rng) - def time_frame_date_formatting_index(self): + def time_frame_date_formatting_index_default(self): + self.data.to_csv(self.fname) + + def time_frame_date_formatting_index_default_explicit(self): self.data.to_csv(self.fname, date_format="%Y-%m-%d %H:%M:%S") - def time_frame_date_no_format_index(self): - self.data.to_csv(self.fname) + def time_frame_date_formatting_index_custom(self): + self.data.to_csv(self.fname, date_format="%Y-%m-%d__%H:%M:%S") class ToCSVPeriod(BaseIO): @@ -117,7 +126,7 @@ def time_frame_period_formatting_default(self, nobs, freq): def time_frame_period_formatting_default_explicit(self, nobs, freq): self.data.to_csv(self.fname, date_format=self.default_fmt) - def time_frame_period_formatting(self, nobs, freq): + def time_frame_period_formatting_custom(self, nobs, freq): # Nb: `date_format` is not actually taken into account here today, so the # performance is currently identical to `time_frame_period_formatting_default` # above. This timer is therefore expected to degrade when GH#51621 is fixed. 
@@ -139,15 +148,15 @@ def setup(self, nobs, freq): elif freq == "h": self.default_fmt = "%Y-%m-%d %H:00" - def time_frame_period_formatting_index(self, nobs, freq): - self.data.to_csv(self.fname, date_format="%Y-%m-%d___%H:%M:%S") - def time_frame_period_formatting_index_default(self, nobs, freq): self.data.to_csv(self.fname) def time_frame_period_formatting_index_default_explicit(self, nobs, freq): self.data.to_csv(self.fname, date_format=self.default_fmt) + def time_frame_period_formatting_index_custom(self, nobs, freq): + self.data.to_csv(self.fname, date_format="%Y-%m-%d___%H:%M:%S") + class ToCSVDatetimeBig(BaseIO): fname = "__test__.csv" @@ -166,9 +175,12 @@ def setup(self, nobs): } ) - def time_frame(self, nobs): + def time_frame_formatting_default(self, nobs): self.data.to_csv(self.fname) + def time_frame_date_formatting_custom(self, nobs): + self.data.to_csv(self.fname, date_format="%Y%m%d__%H%M%S") + class ToCSVIndexes(BaseIO): fname = "__test__.csv" diff --git a/asv_bench/benchmarks/strftime.py b/asv_bench/benchmarks/strftime.py index 5f2832e361eb5..19d9d0a587ee1 100644 --- a/asv_bench/benchmarks/strftime.py +++ b/asv_bench/benchmarks/strftime.py @@ -6,10 +6,10 @@ class DatetimeStrftime: timeout = 1500 - params = [1000, 10000] - param_names = ["nobs"] + params = ([1000, 10000], [False, True]) + param_names = ["nobs", "tz_aware"] - def setup(self, nobs): + def setup(self, nobs, tz_aware): d = "2018-11-29" dt = "2018-11-26 11:18:27.0" self.data = pd.DataFrame( @@ -19,37 +19,68 @@ def setup(self, nobs): "r": [np.random.uniform()] * nobs, } ) + if tz_aware: + self.data["dt"] = self.data["dt"].dt.tz_localize("UTC") + self.data["d"] = self.data["d"].dt.tz_localize("UTC") + + self.data["i"] = self.data["dt"] + self.data.set_index("i", inplace=True) - def time_frame_date_to_str(self, nobs): + def time_frame_date_to_str(self, nobs, tz_aware): self.data["d"].astype(str) - def time_frame_date_formatting_default(self, nobs): + def 
time_frame_date_formatting_default(self, nobs, tz_aware): self.data["d"].dt.strftime(date_format=None) - def time_frame_date_formatting_default_explicit(self, nobs): - self.data["d"].dt.strftime(date_format="%Y-%m-%d") + def time_frame_date_formatting_index_to_str(self, nobs, tz_aware): + self.data.index.astype(str) + + def time_frame_date_formatting_index_default(self, nobs, tz_aware): + self.data.index.strftime(date_format=None) - def time_frame_date_formatting_custom(self, nobs): + def time_frame_date_formatting_custom(self, nobs, tz_aware): self.data["d"].dt.strftime(date_format="%Y---%m---%d") - def time_frame_datetime_to_str(self, nobs): + def time_frame_date_formatting_index_custom(self, nobs, tz_aware): + self.data.index.strftime(date_format="%Y---%m---%d") + + def time_frame_datetime_to_str(self, nobs, tz_aware): self.data["dt"].astype(str) - def time_frame_datetime_formatting_default(self, nobs): + def time_frame_datetime_formatting_default(self, nobs, tz_aware): self.data["dt"].dt.strftime(date_format=None) - def time_frame_datetime_formatting_default_explicit_date_only(self, nobs): + def time_frame_datetime_formatting_default_explicit_date_only(self, nobs, tz_aware): self.data["dt"].dt.strftime(date_format="%Y-%m-%d") - def time_frame_datetime_formatting_default_explicit(self, nobs): + def time_frame_datetime_formatting_default_explicit(self, nobs, tz_aware): self.data["dt"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S") - def time_frame_datetime_formatting_default_with_float(self, nobs): + def time_frame_datetime_formatting_default_with_float(self, nobs, tz_aware): self.data["dt"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S.%f") - def time_frame_datetime_formatting_custom(self, nobs): + def time_frame_datetime_formatting_index_to_str(self, nobs, tz_aware): + self.data.set_index("dt").index.astype(str) + + def time_frame_datetime_formatting_index_default(self, nobs, tz_aware): + self.data.set_index("dt").index.strftime(date_format=None) + + def 
time_frame_datetime_formatting_custom(self, nobs, tz_aware): self.data["dt"].dt.strftime(date_format="%Y-%m-%d --- %H:%M:%S") + def time_frame_datetime_formatting_index_custom(self, nobs, tz_aware): + self.data.set_index("dt").index.strftime(date_format="%Y-%m-%d --- %H:%M:%S") + + def time_frame_datetime_formatting_iso8601_map(self, nobs, tz_aware): + self.data["dt"].map(lambda timestamp: timestamp.isoformat()) + + def time_frame_datetime_formatting_iso8601_strftime_Z(self, nobs, tz_aware): + self.data["dt"].dt.strftime(date_format="%Y-%m-%dT%H:%M:%SZ") + + def time_frame_datetime_formatting_iso8601_strftime_offset(self, nobs, tz_aware): + """Not optimized yet as %z is not supported by `convert_strftime_format`""" + self.data["dt"].dt.strftime(date_format="%Y-%m-%dT%H:%M:%S%z") + class PeriodStrftime: timeout = 1500 @@ -73,6 +104,12 @@ def setup(self, nobs, freq): def time_frame_period_to_str(self, nobs, freq): self.data["p"].astype(str) + def time_frame_period_str(self, nobs, freq): + self.data["p"].apply(str) + + def time_frame_period_repr(self, nobs, freq): + self.data["p"].apply(repr) + def time_frame_period_formatting_default(self, nobs, freq): self.data["p"].dt.strftime(date_format=None) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index be4b9c218f9f5..0a88a0d57a8d3 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -441,6 +441,13 @@ Other Removals Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ + +- Performance improvement in all datetime formatting procedures, achieved by using Python string formatting instead of OS ``strftime`` (:issue:`51298`): + + - in :meth:`DatetimeLikeArrayMixin.strftime`. Classes :class:`DatetimeArray`, :class:`PeriodArray`, :class:`DatetimeIndex`, :class:`PeriodIndex` benefit from the improvement. :class:`TimedeltaArray.strftime` and :class:`TimedeltaArray.format` are not impacted as their ``date_format`` argument is currently ignored. 
+ - in :meth:`NDFrame.to_csv`, :meth:`DataFrameRenderer.to_csv` and :class:`CSVFormatter` + - This is achieved thanks to new :func:`pd.tseries.api.convert_strftime_format` to convert a strftime formatting template into a python string formatting template. strftime templates that can not be converted to such a fast python string template continue to be processed with OS ``strftime`` as fallback. + - Eliminated circular reference in to original pandas object in accessor attributes (e.g. :attr:`Series.str`). However, accessor instantiation is no longer cached (:issue:`47667`, :issue:`41357`) - :attr:`Categorical.categories` returns a :class:`RangeIndex` columns instead of an :class:`Index` if the constructed ``values`` was a ``range``. (:issue:`57787`) - :class:`DataFrame` returns a :class:`RangeIndex` columns when possible when ``data`` is a ``dict`` (:issue:`57943`) diff --git a/pandas/_libs/__init__.py b/pandas/_libs/__init__.py index 26a872a90e493..3bf8f5ca004cb 100644 --- a/pandas/_libs/__init__.py +++ b/pandas/_libs/__init__.py @@ -5,6 +5,7 @@ "Period", "Timedelta", "Timestamp", + "convert_strftime_format", "iNaT", "Interval", ] @@ -23,5 +24,6 @@ Period, Timedelta, Timestamp, + convert_strftime_format, iNaT, ) diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index 7e3372a80db9d..b4c81629102b3 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -10,6 +10,7 @@ def format_array_from_datetime( format: str | None = ..., na_rep: str | float = ..., reso: int = ..., # NPY_DATETIMEUNIT + _use_pystr_engine: bool = ..., ) -> npt.NDArray[np.object_]: ... def first_non_null(values: np.ndarray) -> int: ... 
def array_to_datetime( diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 928d253bf3169..eef978cad6c10 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -77,6 +77,10 @@ from pandas._libs.tslibs import ( Resolution, get_resolution, ) +from pandas._libs.tslibs.strftime import ( + UnsupportedStrFmtDirective, + convert_strftime_format, +) from pandas._libs.tslibs.timestamps import Timestamp # Note: this is the only non-tslibs intra-pandas dependency here @@ -118,6 +122,7 @@ def format_array_from_datetime( str format=None, na_rep: str | float = "NaT", NPY_DATETIMEUNIT reso=NPY_FR_ns, + _use_pystr_engine=True, ) -> np.ndarray: """ return a np object array of the string formatted values @@ -131,18 +136,22 @@ def format_array_from_datetime( na_rep : optional, default is None a nat format reso : NPY_DATETIMEUNIT, default NPY_FR_ns + _use_pystr_engine : bool, default True + If `True` (default) and the format permits it, a faster formatting + method will be used. See `convert_strftime_format`. 
Returns ------- np.ndarray[object] """ cdef: - int64_t val, ns, N = values.size + int64_t val, ns, y, h, N = values.size bint show_ms = False, show_us = False, show_ns = False bint basic_format = False, basic_format_day = False _Timestamp ts object res npy_datetimestruct dts + object str_format, locale_dt_strings # Note that `result` (and thus `result_flat`) is C-order and # `it` iterates C-order as well, so the iteration matches @@ -176,8 +185,24 @@ def format_array_from_datetime( # Default format for dates basic_format_day = True + # Sanity check - these flags are exclusive assert not (basic_format_day and basic_format) + if not basic_format_day and not basic_format and _use_pystr_engine: + # Preprocessing for _use_pystr_engine + if format is None: + # We'll fallback to the Timestamp.str method + _use_pystr_engine = False + else: + try: + # Try to get the string formatting template for this format + str_format, locale_dt_strings = convert_strftime_format( + format, target="datetime" + ) + except UnsupportedStrFmtDirective: + # Unsupported directive: fallback to standard `strftime` + _use_pystr_engine = False + for i in range(N): # Analogous to: utc_val = values[i] val = (cnp.PyArray_ITER_DATA(it))[0] @@ -203,6 +228,35 @@ def format_array_from_datetime( elif show_ms: res += f".{dts.us // 1000:03d}" + elif _use_pystr_engine: + + if tz is None: + pandas_datetime_to_datetimestruct(val, reso, &dts) + + # Use string formatting for faster strftime + y = dts.year + shortyear = y % 100 + if y < 0 and shortyear != 0: + # Fix negative modulo to adopt C-style modulo + shortyear -= 100 + h = dts.hour + res = str_format % { + "year": y, + "shortyear": shortyear, + "month": dts.month, + "day": dts.day, + "hour": h, + "hour12": 12 if h in (0, 12) else (h % 12), + "ampm": locale_dt_strings.pm if (h // 12) else locale_dt_strings.am, + "min": dts.min, + "sec": dts.sec, + "us": dts.us, + } + else: + ts = Timestamp._from_value_and_reso(val, reso=reso, tz=tz) + + # Use string 
formatting for faster strftime + res = ts._strftime_pystr(str_format, locale_dt_strings) else: ts = Timestamp._from_value_and_reso(val, reso=reso, tz=tz) diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index 31979b293a940..944c7efea0935 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -9,6 +9,8 @@ "OutOfBoundsTimedelta", "IncompatibleFrequency", "Period", + "convert_strftime_format", + "UnsupportedStrFmtDirective", "Resolution", "Timedelta", "normalize_i8_timestamps", @@ -69,6 +71,10 @@ IncompatibleFrequency, Period, ) +from pandas._libs.tslibs.strftime import ( + UnsupportedStrFmtDirective, + convert_strftime_format, +) from pandas._libs.tslibs.timedeltas import ( Timedelta, delta_to_nanoseconds, diff --git a/pandas/_libs/tslibs/meson.build b/pandas/_libs/tslibs/meson.build index 85410f771233f..5bf60ab0ab093 100644 --- a/pandas/_libs/tslibs/meson.build +++ b/pandas/_libs/tslibs/meson.build @@ -51,6 +51,7 @@ sources_to_install = [ 'offsets.pyi', 'parsing.pyi', 'period.pyi', + 'strftime.py', 'strptime.pyi', 'timedeltas.pyi', 'timestamps.pyi', diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi index 22f3bdbe668de..1ae7b51141271 100644 --- a/pandas/_libs/tslibs/period.pyi +++ b/pandas/_libs/tslibs/period.pyi @@ -48,6 +48,7 @@ def period_array_strftime( dtype_code: int, na_rep, date_format: str | None, + _use_pystr_engine: bool, ) -> npt.NDArray[np.object_]: ... 
# exposed for tests diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 023a0f52e320f..bb9e66cf4dab8 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -57,6 +57,10 @@ from pandas._libs.tslibs.np_datetime cimport ( npy_datetimestruct_to_datetime, pandas_datetime_to_datetimestruct, ) +from pandas._libs.tslibs.strftime import ( + UnsupportedStrFmtDirective, + convert_strftime_format, +) import_pandas_datetime() @@ -1170,11 +1174,20 @@ cdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) except? -1: return result -cdef str period_format(int64_t value, int freq, object fmt=None): +cdef str period_format( + int64_t value, + int freq, + str fmt=None, + bytes fmt_bytes=None, + object fast_fmt=None, + object fast_loc_am=None, + object fast_loc_pm=None, +): + """Important: please provide a dummy non-None fmt if fast_fmt is non-None""" cdef: int freq_group, quarter - npy_datetimestruct dts + npy_datetimestruct dts, dts2 bint is_fmt_none if value == NPY_NAT: @@ -1239,13 +1252,42 @@ cdef str period_format(int64_t value, int freq, object fmt=None): # `freq_group` is invalid, raise raise ValueError(f"Unknown freq: {freq}") + elif fast_fmt is not None: + # A custom format is requested using python string formatting + + # Get the quarter and fiscal year + quarter = get_yq(value, freq, &dts2) + + # Finally use the string template. 
Note: handling of non-utf8 chars is directly + # done in python here, no need to encode as for c-strftime + y = dts.year + h = dts.hour + return fast_fmt % { + "year": y, + "shortyear": y % 100, + "month": dts.month, + "day": dts.day, + "hour": h, + "hour12": 12 if h in (0, 12) else (h % 12), + "ampm": fast_loc_pm if (h // 12) else fast_loc_am, + "min": dts.min, + "sec": dts.sec, + "ms": dts.us // 1000, + "us": dts.us, + "ns": (dts.us * 1000) + (dts.ps // 1000), + "q": quarter, + "Fyear": dts2.year, + "fyear": dts2.year % 100, + } else: - # A custom format is requested - if isinstance(fmt, str): - # Encode using current locale, in case fmt contains non-utf8 chars - fmt = util.string_encode_locale(fmt) + # A custom format is requested using strftime (slower) - return _period_strftime(value, freq, fmt, dts) + if fmt_bytes is None and isinstance(fmt, str): + # If not already done, + # encode using current locale, in case fmt contains non-utf8 chars + fmt_bytes = util.string_encode_locale(fmt) + + return _period_strftime(value, freq, fmt_bytes, dts) cdef list extra_fmts = [(b"%q", b"^`AB`^"), @@ -1320,7 +1362,11 @@ cdef str _period_strftime(int64_t value, int freq, bytes fmt, npy_datetimestruct def period_array_strftime( - ndarray values, int dtype_code, object na_rep, str date_format + ndarray values, + int dtype_code, + object na_rep, + str date_format, + bint _use_pystr_engine, ): """ Vectorized Period.strftime used for PeriodArray._format_native_types. @@ -1332,6 +1378,9 @@ def period_array_strftime( Corresponds to PeriodDtype._dtype_code na_rep : any date_format : str or None + _use_pystr_engine : bool + If `True` and the format permits it, a faster formatting + method will be used. See `convert_strftime_format`. 
""" cdef: Py_ssize_t i, n = values.size @@ -1342,6 +1391,28 @@ def period_array_strftime( ) object[::1] out_flat = out.ravel() cnp.broadcast mi = cnp.PyArray_MultiIterNew2(out, values) + object date_fmt_bytes + object fast_fmt = None + object fast_loc + object fast_loc_am = None + object fast_loc_pm = None + + if isinstance(date_format, str): + # Encode string using current locale, in case it contains non-utf8 chars + date_fmt_bytes = util.string_encode_locale(date_format) + else: + # None or bytes already + date_fmt_bytes = date_format + + if _use_pystr_engine and date_format is not None: + try: + # Try to get the string formatting template for this format + fast_fmt, fast_loc = convert_strftime_format(date_format, target="period") + fast_loc_am = fast_loc.am + fast_loc_pm = fast_loc.pm + except UnsupportedStrFmtDirective: + # Unsupported directive: fallback to standard `strftime` + pass for i in range(n): # Analogous to: ordinal = values[i] @@ -1353,11 +1424,21 @@ def period_array_strftime( # This is equivalent to # freq = frequency_corresponding_to_dtype_code(dtype_code) # per = Period(ordinal, freq=freq) - # if date_format: + # if fast_fmt: + # item_repr = per._strftime_pystr(fast_fmt, fast_loc) + # elif date_format: # item_repr = per.strftime(date_format) # else: # item_repr = str(per) - item_repr = period_format(ordinal, dtype_code, date_format) + item_repr = period_format( + ordinal, + dtype_code, + date_format, + date_fmt_bytes, + fast_fmt, + fast_loc_am, + fast_loc_pm, + ) # Analogous to: ordinals[i] = ordinal out_flat[i] = item_repr @@ -2519,6 +2600,41 @@ cdef class _Period(PeriodMixin): object_state = None, self.freq, self.ordinal return (Period, object_state) + def _strftime_pystr(self, fmt_str: str, locale_dt_strings: object) -> str: + """A faster alternative to `strftime` using string formatting. + + `fmt_str` and `locale_dt_strings` should be created using + `convert_strftime_format(fmt, target="period")`. 
+ + See also `self.strftime`, that relies on `period_format`. + + Examples + -------- + + >>> from pandas._libs.tslibs import convert_strftime_format + >>> a = Period(freq='Q-JUL', year=2006, quarter=1) + >>> a.strftime('%F-Q%q') + '2006-Q1' + >>> fast_fmt, loc_dt_strs = convert_strftime_format('%F-Q%q', target="period") + >>> a._strftime_pystr(fast_fmt, loc_dt_strs) + '2006-Q1' + """ + value = self.ordinal + + if value == NPY_NAT: + return "NaT" + else: + freq = self._dtype._dtype_code + return period_format( + value, + freq, + "dummy" if fmt_str is not None else None, + None, + fmt_str, + locale_dt_strings.am, + locale_dt_strings.pm + ) + def strftime(self, fmt: str | None) -> str: r""" Returns a formatted string representation of the :class:`Period`. diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py new file mode 100644 index 0000000000000..61ee9e1617a9a --- /dev/null +++ b/pandas/_libs/tslibs/strftime.py @@ -0,0 +1,277 @@ +"""Strftime-related classes and functions.""" + +from __future__ import annotations + +from datetime import time +import locale + + +class UnsupportedStrFmtDirective(ValueError): + """The format contains a directive that is not supported in this context.""" + + +_COMMON_UNSUPPORTED = ( + # 1- Names not in the numpy or datetime attr representation + "%a", # Weekday as locale's abbreviated name. + "%A", # Weekday as locale's full name. + "%w", # Weekday as a decimal number, where 0 is Sunday and 6 is Saturday. + "%b", # Month as locale's abbreviated name. + "%B", # Month as locale's full name. + # 2- TODO Below Time offset and timezone information ... but may be hard + "%z", # UTC offset in the form ±HHMM[SS[.ffffff]] ("" if tz naive). + "%Z", # Time zone name ("" if tz naive). + # 3- Probably too complex ones for now + "%j", # Day of the year as a zero-padded decimal number. + "%U", # Week number of the year (Sunday as the first day of the week) as + # a zero-padded decimal number. 
All days in a new year preceding the first + # Sunday are considered to be in week 0. + "%W", # Week number of the year (Monday as the first day of the week) as + # a zero-padded decimal number. All days in a new year preceding the first + # Monday are considered to be in week 0. + "%c", # Locale's appropriate date and time representation. + "%x", # Locale's appropriate date representation. + "%X", # Locale's appropriate time representation. +) + + +_COMMON_MAP = { + "%d": ("day", "02d"), # Day of the month as a zero-padded decimal number. + "%m": ("month", "02d"), # Month as a zero-padded decimal number. + "%Y": ("year", "04d"), # Year with century as a 0-padded decimal number. + "%y": ("shortyear", "02d"), # Year without century as 0-padded decimal nb. + "%H": ("hour", "02d"), # Hour (24-hour clock) as 0-padded decimal number. + "%I": ("hour12", "02d"), # Hour (12-hour clock) as a 0-padded decimal nb. + "%p": ("ampm", "s"), # Locale's equivalent of either AM or PM. + "%M": ("min", "02d"), # Minute as a zero-padded decimal number. + "%S": ("sec", "02d"), # Second as a zero-padded decimal number. +} + +_DATETIME_MAP = { + "%f": ("us", "06d"), # Microsecond as decimal number, 0-padded to 6 digits +} +_DATETIME_UNSUPPORTED = ( + "%F", + "%q", + "%l", + "%u", + "%n", +) + +_PERIOD_MAP = { + "%f": ( + "fyear", + "02d", + ), # 'Fiscal' year without century as zero-padded decimal number [00,99] + "%F": ("Fyear", "d"), # 'Fiscal' year with century as a decimal number + "%q": ("q", "d"), # Quarter as a decimal number [1,4] + "%l": ("ms", "03d"), # Millisecond as decimal number, 0-padded 3 digits + "%u": ("us", "06d"), # Microsecond as decimal number, 0-padded 6 digits + "%n": ("ns", "09d"), # Nanosecond as decimal number, 0-padded 9 digits +} +_PERIOD_UNSUPPORTED = () + + +class LocaleSpecificDtStrings: + """A container for date/time strings used in a specific locale. 
+ + We will use these when formatting datetime as string using string templates, which + is faster than strftime when executed on arrays. + + `get_current_locale_specific_string()` is the recommended way to get an instance, + as it provides caching. + + Attributes + ---------- + am : str + Used in the %p strftime directive. Locale's equivalent of AM. + pm : str + Used in the %p strftime directive. Locale's equivalent of PM. + """ + + __slots__ = ("am", "pm") + + def __init__(self, am: str, pm: str) -> None: + self.am = am + self.pm = pm + + def __repr__(self) -> str: + attrs = ", ".join([f"{k}={getattr(self, k)!r}" for k in type(self).__slots__]) + return f"{type(self).__name__}({attrs})" + + @classmethod + def get_current(cls): + return LocaleSpecificDtStrings( + am=time(1).strftime("%p"), + pm=time(13).strftime("%p"), + ) + + +_locale_specifics: dict[str, LocaleSpecificDtStrings] = {} + + +def get_current_locale_specific_string() -> LocaleSpecificDtStrings: + """Return a `LocaleSpecificDtStrings` for the current locale. + + This function caches results in the `_locale_specifics` dict. + """ + + # Get current locale + current_locale = locale.setlocale(locale.LC_ALL) + + try: + # Any entry in cache for current locale ? + return _locale_specifics[current_locale] + except KeyError: + # Create it using current locale, and cache it + o = LocaleSpecificDtStrings.get_current() + _locale_specifics[current_locale] = o + return o + + +def convert_strftime_format( + strftime_fmt: str, + target: str, + new_style_fmt: bool = False, +) -> tuple[str, LocaleSpecificDtStrings]: + """Convert a strftime formatting string into a formatting template string. + + The set of supported directives varies according to the `target`. + + This method can be tested on a single instance of + + - `Timestamp`, through `Timestamp._strftime_pystr`. The result may be compared + with `Timestamp.strftime` + + - `Period` through `Period._strftime_pystr`. 
The result may be compared + with `Period.strftime`. + + On array-like objects, this method is used in several places: + + - Subclasses of `DatelikeOps` now rely on this method in their + `self.strftime(fmt)` default implementation, which delegates to + `_format_native_types`. + + - `DatetimeArray._format_native_types` relies on + `tslib.format_array_from_datetime` which relies on this function + - `PeriodArray._format_native_types` directly relies on this function. + - `TimedeltaArray._format_native_types` does not currently support + custom formats. + + In addition, `Datetime64Formatter` and `Datetime64TZFormatter` rely on this + too. + + Parameters + ---------- + strftime_fmt : str + The strftime format string specification, e.g. `"%Y-%m-%d %H:%M:%S"`. + Note that not all directives are eligible to successful usage of string + formatting. Unsupported directives will lead to an + `UnsupportedStrFmtDirective` being raised. + target : { "datetime", "date", "time", "period" } + The kind of data that will be formatted using this template. + new_style_fmt : bool, default: False + Whether the output string should be new-style + e.g. "{year:04d}-{month:02d}-{day:02d} {hour:02d}:{min:02d}:{sec:02d}" + or old-style + e.g. "%(year)04d-%(month)02d-%(day)02d %(hour)02d:%(min)02d:%(sec)02d" + + Returns + ------- + fmt_out : str + A string that may be used to format a `datetime` variable. The style of + this string is either old-style or new-style depending on + `new_style_fmt`. + For old-style, it may be used as `fmt_out % fmt_dct`. + For new-style, it may be used as `fmt_out.format(**fmt_dct)` + locale_dt_strings : LocaleSpecificDtStrings + An object containing the locale-specific strings needed for some of the + directives. For example locale_dt_strings.am and locale_dt_strings.pm should be + used to fill the "ampm" part of the template, induced by directive %p. 
+ + Raises + ------ + UnsupportedStrFmtDirective + Raised when the received `strftime_fmt` format contains a directive for + which the output can not currently be created using string formatting. + + See Also + -------- + `strftime format codes reference + `_ + + `Stackoverflow post `_ + explaining how old-style formatting is faster than new-style formatting, + itself faster than datetime.strftime`. + + See `Period.strftime` doc for all the supported period directives (same + directives as the :func:`time.strftime` function of the standard Python + distribution, as well as specific additional directives ``%f``, ``%F``, + ``%q``, ``%l``, ``%u``, ``%n``). + """ + unsupported: tuple[tuple[str, ...], ...] + if target in ("datetime", "date", "time"): + directive_maps = (_COMMON_MAP, _DATETIME_MAP) + unsupported = (_COMMON_UNSUPPORTED, _DATETIME_UNSUPPORTED) + elif target == "period": + directive_maps = (_COMMON_MAP, _PERIOD_MAP) + unsupported = (_COMMON_UNSUPPORTED, _PERIOD_UNSUPPORTED) + else: + raise ValueError(f"Invalid target: {target!r}") + + # Raise if unsupported directive found in `strftime_fmt` + for _u in unsupported: + for key in _u: + if key in strftime_fmt: + raise UnsupportedStrFmtDirective(f"Unsupported directive: '{key}'") + + # Find an escape sequence, that we will use to replace all '%' signs + esc = _create_escape_sequence(strftime_fmt, init_esc="-+", prefix="-") + + # Escape the %% before searching for directives (we will put them back at the end) + strftime_fmt = strftime_fmt.replace("%%", esc * 2) + + # Mapping between strftime and string formatting, according to both styles + if new_style_fmt: + # Escape single curly braces + strftime_fmt = strftime_fmt.replace("{", "{{").replace("}", "}}") + + # Create the output by replacing all directives + for _map in directive_maps: + for key, (_name, _fmt) in _map.items(): + # for example replace "%d" by "{day:02d}" + strftime_fmt = strftime_fmt.replace(key, f"{{{_name}:{_fmt}}}") + + # If there are 
remaining percent signs, be conservative and fallback + if "%" in strftime_fmt: + raise UnsupportedStrFmtDirective("Unsupported directive found") + + else: + # Create the output by replacing all directives + for _map in directive_maps: + for key, (_name, _fmt) in _map.items(): + # for example replace "%d" by "%(day)02d" but with escaped % + strftime_fmt = strftime_fmt.replace(key, f"{esc}({_name}){_fmt}") + + # If there are remaining percent signs, raise 'unsupported directive' so that + # the caller can fallback to OS C strftime engine. + if "%" in strftime_fmt: + raise UnsupportedStrFmtDirective("Unsupported directive found") + + # Restore the escaped %% + strftime_fmt = strftime_fmt.replace(esc, "%") + + return strftime_fmt, get_current_locale_specific_string() + + +def _create_escape_sequence(txt: str, init_esc: str = "+", prefix: str = "-") -> str: + """Return a unique string that does not exist in txt, by prepending as many + `prefix` as necessary to the initial proposal `init_esc`.""" + + if init_esc in prefix: + raise ValueError("`ini_esc` must not be a subset of `prefix`") + + # Prepend `prefix` to `init_esc` as many times as necessary + while init_esc in txt: + init_esc = prefix + init_esc + return init_esc diff --git a/pandas/_libs/tslibs/timestamps.pyi b/pandas/_libs/tslibs/timestamps.pyi index c769b09d1b7a1..ac612d837c57f 100644 --- a/pandas/_libs/tslibs/timestamps.pyi +++ b/pandas/_libs/tslibs/timestamps.pyi @@ -107,6 +107,7 @@ class Timestamp(datetime): ) -> datetime: ... @classmethod def fromisoformat(cls, date_string: str) -> Self: ... + def _strftime_pystr(self, fmt_str: str, locale_dt_strings: object) -> str: ... def strftime(self, format: str) -> str: ... def __format__(self, fmt: str) -> str: ... def toordinal(self) -> int: ... 
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index c4bd9e1b47bbe..2104c92938d84 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1598,6 +1598,41 @@ class Timestamp(_Timestamp): tz = maybe_get_tz(tz) return cls(datetime.fromtimestamp(ts, tz)) + def _strftime_pystr(self, fmt_str: str, locale_dt_strings: object) -> str: + """A faster alternative to `strftime` using string formatting. + + `fmt_str` and `locale_dt_strings` should be created using + `convert_strftime_format(fmt, target="datetime")`. + + See also `self.strftime`, that relies on `datetime.strftime`. + + Examples + -------- + >>> from pandas._libs.tslibs import convert_strftime_format + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') + >>> fmt, loc_s = convert_strftime_format('%Y-%m-%dT%H:%M:%S', target="datetime") + >>> ts._strftime_pystr(fmt, loc_s) + '2020-03-14T15:32:52' + """ + y = self.year + shortyear = y % 100 + if y < 0 and shortyear != 0: + # Fix negative modulo to adopt C-style modulo + shortyear -= 100 + h = self.hour + return fmt_str % { + "year": y, + "shortyear": shortyear, + "month": self.month, + "day": self.day, + "hour": h, + "hour12": 12 if h in (0, 12) else (h % 12), + "ampm": locale_dt_strings.pm if (h // 12) else locale_dt_strings.am, + "min": self.minute, + "sec": self.second, + "us": self.microsecond, + } + def strftime(self, format): """ Return a formatted string of the Timestamp. 
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 34d25f04b69e1..b50fd4bb0570f 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -771,7 +771,12 @@ def _format_native_types( date_format = "%Y-%m-%d" return tslib.format_array_from_datetime( - self.asi8, tz=self.tz, format=date_format, na_rep=na_rep, reso=self._creso + self.asi8, + tz=self.tz, + format=date_format, + na_rep=na_rep, + reso=self._creso, + _use_pystr_engine=True, ) # ----------------------------------------------------------------- diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index e762c3e547819..a6632a067f8bd 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -773,7 +773,11 @@ def _format_native_types( actually format my specific types """ return libperiod.period_array_strftime( - self.asi8, self.dtype._dtype_code, na_rep, date_format + self.asi8, + self.dtype._dtype_code, + na_rep, + date_format, + _use_pystr_engine=True, ) # ------------------------------------------------------------------ diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 15bfe442ca87f..030bede09e998 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -447,7 +447,7 @@ def _format_native_types( ) -> npt.NDArray[np.object_]: from pandas.io.formats.format import get_format_timedelta64 - # Relies on TimeDelta._repr_base + # Relies on TimeDelta._repr_base (and does not use the `date_format` arg) formatter = get_format_timedelta64(self, na_rep) # equiv: np.array([formatter(x) for x in self._ndarray]) # but independent of dimension diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index edd1fdd4da943..2096c8600ceea 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -76,7 +76,7 @@ def _new_PeriodIndex(cls, **d): @inherit_names( - ["strftime", "start_time", "end_time"] + PeriodArray._field_ops, 
+ ["start_time", "end_time"] + PeriodArray._field_ops, PeriodArray, wrap=True, ) @@ -177,6 +177,11 @@ def _resolution_obj(self) -> Resolution: # methods that dispatch to array and wrap result in Index # These are defined here instead of via inherit_names for mypy + @doc(PeriodArray.strftime) + def strftime(self, date_format: str) -> Index: + arr = self._data.strftime(date_format) + return Index(arr, name=self.name, dtype=object) + @doc( PeriodArray.asfreq, other="arrays.PeriodArray", diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 50503e862ef43..5c461da6ffa60 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -30,9 +30,6 @@ ABCMultiIndex, ABCPeriodIndex, ) -from pandas.core.dtypes.missing import notna - -from pandas.core.indexes.api import Index from pandas.io.common import get_handle @@ -47,6 +44,8 @@ npt, ) + from pandas.core.indexes.api import Index + from pandas.io.formats.format import DataFrameFormatter @@ -194,9 +193,9 @@ def data_index(self) -> Index: isinstance(data_index, (ABCDatetimeIndex, ABCPeriodIndex)) and self.date_format is not None ): - data_index = Index( - [x.strftime(self.date_format) if notna(x) else "" for x in data_index] - ) + # Format and replace missing entries with empty string + data_index = data_index.strftime(date_format=self.date_format) + data_index = data_index.fillna("") # type: ignore[no-untyped-call] elif isinstance(data_index, ABCMultiIndex): data_index = data_index.remove_unused_levels() return data_index @@ -317,9 +316,12 @@ def _save_chunk(self, start_i: int, end_i: int) -> None: slicer = slice(start_i, end_i) df = self.obj.iloc[slicer] + # Format the values. Note: `self._number_format` includes `date_format` if any res = df._get_values_for_csv(**self._number_format) data = list(res._iter_column_arrays()) + # Format the index. Note that if `self.date_format` is not None the actual + # formatting is done beforehand, inside the `.data_index` property accessor. 
ix = self.data_index[slicer]._get_values_for_csv(**self._number_format) libwriters.write_csv_rows( data, diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 9ad5ac83e9eae..888a3f132a7e7 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -40,6 +40,8 @@ NaT, Timedelta, Timestamp, + UnsupportedStrFmtDirective, + convert_strftime_format, ) from pandas._libs.tslibs.nattype import NaTType @@ -1610,12 +1612,20 @@ def _format_datetime64_dateonly( x: NaTType | Timestamp, nat_rep: str = "NaT", date_format: str | None = None, + str_date_fmt: str | None = None, + locale_dt_strings: object | None = None, ) -> str: + """str_date_fmt, locale_dt_strings are for fast strftime""" if isinstance(x, NaTType): return nat_rep if date_format: - return x.strftime(date_format) + if str_date_fmt: + # Faster, using string formatting + return x._strftime_pystr(str_date_fmt, locale_dt_strings) + else: + # Slower + return x.strftime(date_format) else: # Timestamp._date_repr relies on string formatting (faster than strftime) return x._date_repr @@ -1628,10 +1638,26 @@ def get_format_datetime64( a string as output""" if is_dates_only: + str_date_fmt = locale_dt_strings = None + if date_format is not None: + try: + # Try to get the string formatting template for this format + str_date_fmt, locale_dt_strings = convert_strftime_format( + date_format, target="datetime" + ) + except UnsupportedStrFmtDirective: + # Unsupported directive: fallback to standard `strftime` + pass return lambda x: _format_datetime64_dateonly( - x, nat_rep=nat_rep, date_format=date_format + x, + nat_rep=nat_rep, + date_format=date_format, + str_date_fmt=str_date_fmt, + locale_dt_strings=locale_dt_strings, ) else: + # Relies on datetime.str, which is fast already + # TODO why is date_format not used in this case ? This seems like a bug? 
return lambda x: _format_datetime64(x, nat_rep=nat_rep) @@ -1640,10 +1666,10 @@ class _Datetime64TZFormatter(_Datetime64Formatter): def _format_strings(self) -> list[str]: """we by definition have a TZ""" - ido = self.values._is_dates_only values = self.values.astype(object) formatter = self.formatter or get_format_datetime64( - ido, date_format=self.date_format + is_dates_only=False, + date_format=self.date_format, ) fmt_values = [formatter(x) for x in values] @@ -1664,6 +1690,7 @@ def __init__( self.nat_rep = nat_rep def _format_strings(self) -> list[str]: + # Note: `get_format_timedelta64` uses fast formatting formatter = self.formatter or get_format_timedelta64( self.values, nat_rep=self.nat_rep, box=False ) diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index ef0315130215c..5d64ea0ffde80 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -224,6 +224,22 @@ def test_categorical_repr_datetime(self): assert repr(c) == exp + # same with dates only: since there is a timezone, dates become datetimes + idx = date_range("2011-01-01", freq="D", periods=5, tz="US/Eastern") + c = Categorical(idx) + exp = ( + "[2011-01-01 00:00:00-05:00, 2011-01-02 00:00:00-05:00, " + "2011-01-03 00:00:00-05:00, 2011-01-04 00:00:00-05:00, " + "2011-01-05 00:00:00-05:00]\n" + "Categories (5, datetime64[ns, US/Eastern]): " + "[2011-01-01 00:00:00-05:00, 2011-01-02 00:00:00-05:00,\n" + " " + "2011-01-03 00:00:00-05:00, 2011-01-04 00:00:00-05:00,\n" + " " + "2011-01-05 00:00:00-05:00]" + ) + assert repr(c) == exp + def test_categorical_repr_datetime_ordered(self): idx = date_range("2011-01-01 09:00", freq="h", periods=5) c = Categorical(idx, ordered=True) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 3d8f8d791b763..d637d172a9e3b 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ 
b/pandas/tests/arrays/test_datetimelike.py @@ -1,8 +1,11 @@ from __future__ import annotations +import datetime as dt import re import warnings +from hypothesis import given +import hypothesis.strategies as st import numpy as np import pytest @@ -12,6 +15,7 @@ Timestamp, ) from pandas._libs.tslibs import to_offset +from pandas.compat import is_platform_windows from pandas.compat.numpy import np_version_gt2 from pandas.core.dtypes.dtypes import PeriodDtype @@ -608,6 +612,65 @@ def test_from_integer_array(self): tm.assert_extension_array_equal(result, expected) +def _is_supported_directive(d: str) -> bool: + """ + Return True if strftime directive 'd' is supported on current platform + See https://strftime.org/ and https://stackoverflow.com/a/2073189/7262247 + """ + try: + dt.datetime(1700, 1, 1).strftime(d) + except ValueError: + return False + else: + return True + + +DT_STRFTIME_DIRECTIVES = [ + d + for d in [ + "%a", + "%A", + "%w", + "%d", + "%#d" if is_platform_windows() else "%-d", + "%b", + "%B", + "%m", + "%#m" if is_platform_windows() else "%-m", + "%y", + "%Y", + "%H", + "%#H" if is_platform_windows() else "%-H", + "%I", + "%#I" if is_platform_windows() else "%-I", + "%p", + "%M", + "%#M" if is_platform_windows() else "%-M", + "%S", + "%#S" if is_platform_windows() else "%-S", + "%f", + "%z", + "%Z", + "%j", + "%#j" if is_platform_windows() else "%-j", + "%U", + "%#U" if is_platform_windows() else "%-U", + "%W", + "%#W" if is_platform_windows() else "%-W", + "%c", + "%x", + "%X", + "%%", + ] + if _is_supported_directive(d) +] +PERIOD_STRFTIME_DIRECTIVES = [ + d for d in DT_STRFTIME_DIRECTIVES if d not in ("%X", "%f", "%z", "%Z") +] +"""Note that even though periods are not timezone-aware (GH#45736), %z and %Z return +unexpected non empty result, hence their exclusion from this testing list.""" + + class TestDatetimeArray(SharedTests): index_cls = DatetimeIndex array_cls = DatetimeArray @@ -903,6 +966,56 @@ def test_strftime_nat(self): expected = 
np.array(["2019-01-01", np.nan], dtype=object) tm.assert_numpy_array_equal(result, expected) + def test_strftime_small_years(self): + np_dts = np.array( + ["220-01-01", "10-01-01", "1-01-01", "0-01-01"], "datetime64[s]" + ) + arr = DatetimeIndex(np_dts)._data + + result = arr.strftime("%Y-%m-%d__%y") + # Note that either -20 or 20 could be considered correct for the first %y. + # We opt for 20 to preserve the "two digits" property of %y. + expected = np.array( + ["0220-01-01__20", "0010-01-01__10", "0001-01-01__01", "0000-01-01__00"], + dtype=object, + ) + tm.assert_numpy_array_equal(result, expected) + + def test_strftime_negative(self): + np_neg = np.array( + ["-2020-01-01", "-220-01-01", "-200-01-01", "-1-01-01"], "datetime64[s]" + ) + arr = DatetimeIndex(np_neg)._data + + result = arr.strftime("%Y-%m-%d__%y") + # Note that either -20 or 20 could be considered correct for the first %y. + # We opt for -20 to have the same property as %Y. + # Similarly note that we have -1 instead of -01 + expected = np.array( + ["-2020-01-01__-20", "-220-01-01__-20", "-200-01-01__00", "-001-01-01__-1"], + dtype=object, + ) + tm.assert_numpy_array_equal(result, expected) + + @given( + datetimes=st.lists( + st.datetimes( + min_value=dt.datetime(1700, 1, 1), max_value=dt.datetime(2200, 1, 1) + ) + ), + fmt=st.sets(st.sampled_from(DT_STRFTIME_DIRECTIVES), min_size=1), + ) + @pytest.mark.parametrize("tz_aware", (False, True)) + def test_strftime_hypo(self, datetimes, fmt, tz_aware): + """Test that idx.strftime's content is equivalent to datetime.strftime""" + fmt = "".join(fmt) + if tz_aware: + datetimes = [_dt.replace(tzinfo=dt.timezone.utc) for _dt in datetimes] + idx = DatetimeIndex(datetimes) + result = idx.strftime(fmt) + expected = pd.Index([i.strftime(fmt) for i in datetimes]) + tm.assert_index_equal(result, expected) + class TestTimedeltaArray(SharedTests): index_cls = TimedeltaIndex @@ -1173,6 +1286,24 @@ def test_strftime_nat(self): expected = 
np.nan], dtype=object) tm.assert_numpy_array_equal(result, expected) + @given( + datetimes=st.lists( + st.datetimes( + min_value=dt.datetime(1700, 1, 1), max_value=dt.datetime(2200, 1, 1) + ) + ), + fmt=st.sets(st.sampled_from(PERIOD_STRFTIME_DIRECTIVES), min_size=1), + ) + def test_strftime_hypo(self, datetimes, fmt): + """Test that idx.strftime's content is equivalent to datetime.strftime + Note that periods are not timezone-aware see GH#45736 + """ + fmt = "".join(fmt) + idx = PeriodIndex(datetimes, freq="s") + result = idx.strftime(fmt) + expected = pd.Index([i.strftime(fmt) for i in datetimes]) + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize( "arr,casting_nats", diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 49776d532db1d..262b92d3c79e2 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -281,6 +281,69 @@ def test_to_csv_date_format(self): df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"]) assert df_sec_grouped.mean().to_csv(date_format="%Y-%m-%d") == expected_ymd_sec + def test_to_csv_datetime_format_index(self): + """Test that formatting also works for datetime index""" + df_sec = DataFrame({"A": pd.date_range("20130101", periods=5, freq="s")}) + df_sec = df_sec.set_index("A") + + # default date_format + res = df_sec.to_csv() + expected_rows = [ + "A", + "2013-01-01 00:00:00", + "2013-01-01 00:00:01", + "2013-01-01 00:00:02", + "2013-01-01 00:00:03", + "2013-01-01 00:00:04", + ] + expected_default_sec = tm.convert_rows_list_to_csv_str(expected_rows) + assert res == expected_default_sec + + # custom date_format + res = df_sec.to_csv(date_format="%Y-%m-%d %H:%M:%S.%f") + expected_rows = [ + "A", + "2013-01-01 00:00:00.000000", + "2013-01-01 00:00:01.000000", + "2013-01-01 00:00:02.000000", + "2013-01-01 00:00:03.000000", + "2013-01-01 00:00:04.000000", + ] + expected_default_sec = 
tm.convert_rows_list_to_csv_str(expected_rows) + assert res == expected_default_sec + + def test_to_csv_period_format_index(self): + """Test that formatting also works for period index""" + # same for periods + df_month = DataFrame({"A": pd.period_range("20130101", periods=5, freq="M")}) + df_month = df_month.set_index("A") + + # default date_format + res = df_month.to_csv() + expected_rows = [ + "A", + "2013-01", + "2013-02", + "2013-03", + "2013-04", + "2013-05", + ] + expected_default_mon = tm.convert_rows_list_to_csv_str(expected_rows) + assert res == expected_default_mon + + # custom format + res = df_month.to_csv(date_format="%F : %q") + expected_rows = [ + "A", + "2013 : 1", + "2013 : 1", + "2013 : 1", + "2013 : 2", + "2013 : 2", + ] + expected_ymdhms_month = tm.convert_rows_list_to_csv_str(expected_rows) + assert res == expected_ymdhms_month + def test_to_csv_different_datetime_formats(self): # GH#21734 df = DataFrame( diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 49bd48b40e67a..331f565e33fc0 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -1,14 +1,20 @@ +from contextlib import nullcontext from datetime import ( date, datetime, + time, timedelta, ) +import locale import re import numpy as np import pytest -from pandas._libs.tslibs import iNaT +from pandas._libs.tslibs import ( + convert_strftime_format, + iNaT, +) from pandas._libs.tslibs.ccalendar import ( DAYS, MONTHS, @@ -614,6 +620,13 @@ def test_period_large_ordinal(self, hour): assert p.hour == hour +def get_local_am_pm(): + """Return the AM and PM strings returned by strftime in current locale.""" + am_local = time(1).strftime("%p") + pm_local = time(13).strftime("%p") + return am_local, pm_local + + class TestPeriodMethods: def test_round_trip(self): p = Period("2000Q1") @@ -785,6 +798,43 @@ def test_strftime(self): assert res == "2000-01-01 12:34:12" assert isinstance(res, str) + 
@pytest.mark.parametrize( + "locale_str", + [ + pytest.param(None, id=str(locale.getlocale())), + "it_IT.utf8", + "it_IT", # Note: encoding will be 'ISO8859-1' + "zh_CN.utf8", + "zh_CN", # Note: encoding will be 'gb2312' + ], + ) + def test_strftime_locale(self, locale_str): + """ + Test that `convert_strftime_format` and `_strftime_pystr` + work well together and rely on runtime locale + """ + + # Skip if locale cannot be set + if locale_str is not None and not tm.can_set_locale(locale_str, locale.LC_ALL): + pytest.skip(f"Skipping as locale '{locale_str}' cannot be set on host.") + + # Change locale temporarily for this test. + with tm.set_locale(locale_str, locale.LC_ALL) if locale_str else nullcontext(): + # Get locale-specific reference + am_local, pm_local = get_local_am_pm() + + # Use the function + str_tmp, loc_s = convert_strftime_format("%p", target="period") + assert str_tmp == "%(ampm)s" + + # Period + am_per = Period("2018-03-11 01:00", freq="h") + assert am_local == am_per.strftime("%p") + assert am_local == am_per._strftime_pystr(str_tmp, loc_s) + pm_per = Period("2018-03-11 13:00", freq="h") + assert pm_local == pm_per.strftime("%p") + assert pm_local == pm_per._strftime_pystr(str_tmp, loc_s) + class TestPeriodProperties: """Test properties such as year, month, weekday, etc....""" diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index b20df43dd49a6..fe8523390529b 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -182,7 +182,15 @@ def test_nat_iso_format(get_nat): @pytest.mark.parametrize( "klass,expected", [ - (Timestamp, ["normalize", "to_julian_date", "to_period", "unit"]), + ( + Timestamp, + [ + "normalize", + "to_julian_date", + "to_period", + "unit", + ], + ), ( Timedelta, [ diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index 7b20f0a17556d..194c1000bc232 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ 
b/pandas/tests/scalar/timestamp/test_formats.py @@ -1,15 +1,26 @@ +from contextlib import nullcontext from datetime import ( datetime, + time, timezone, ) +import locale import pprint import dateutil.tz import pytest +from pytz.exceptions import NonExistentTimeError -from pandas.compat import WASM +from pandas.compat import ( + ISMUSL, + WASM, + is_platform_linux, +) from pandas import Timestamp +import pandas._testing as tm + +from pandas.tseries.api import convert_strftime_format ts_no_ns = Timestamp( year=2019, @@ -91,6 +102,13 @@ def test_isoformat(ts, timespec, expected_iso): assert ts.isoformat(timespec=timespec) == expected_iso +def get_local_am_pm(): + """Return the AM and PM strings returned by strftime in current locale.""" + am_local = time(1).strftime("%p") + pm_local = time(13).strftime("%p") + return am_local, pm_local + + class TestTimestampRendering: @pytest.mark.parametrize( "tz", ["UTC", "Asia/Tokyo", "US/Eastern", "dateutil/America/Los_Angeles"] @@ -204,3 +222,87 @@ def test_repr_matches_pydatetime_tz_dateutil(self): dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45, tzinfo=utc) assert str(dt_datetime_us) == str(Timestamp(dt_datetime_us)) + + def test_timestamp_repr_strftime_small_year_date(self): + stamp = Timestamp("0002-01-01") + assert repr(stamp) == "Timestamp('2-01-01 00:00:00')" + + # Make sure we have zero-padding, consistent with python strftime doc + # Note: current behaviour of strftime with %Y is OS-dependent + if WASM or (is_platform_linux() and not ISMUSL): + assert stamp.strftime("%Y") == "2", f"Actual: {stamp.strftime('%Y')}" + else: + assert stamp.strftime("%Y") == "0002", f"Actual: {stamp.strftime('%Y')}" + + # This is not OS-dependent + str_tmp, loc_dt_s = convert_strftime_format("%Y", target="datetime") + assert stamp._strftime_pystr(str_tmp, loc_dt_s) == "0002" + + def test_timestamp_repr_strftime_small_shortyear_date(self): + stamp = Timestamp("1902-01-01") + assert repr(stamp) == "Timestamp('1902-01-01 00:00:00')" + + # 
Make sure we have zero-padding, consistent with python strftime doc + assert stamp.strftime("%y") == "02", f"actual: {stamp.strftime('%y')}" + str_tmp, loc_dt_s = convert_strftime_format("%y", target="datetime") + assert stamp._strftime_pystr(str_tmp, loc_dt_s) == "02" + + def test_timestamp_repr_strftime_negative_date(self): + stamp = Timestamp("-0020-01-01") + assert repr(stamp) == "Timestamp('-20-01-01 00:00:00')" + + with pytest.raises( + NotImplementedError, + match="strftime not yet supported on Timestamps which are outside the " + "range of Python's standard library.", + ): + stamp.strftime("%y") + + str_tmp, loc_dt_s = convert_strftime_format("%y", target="datetime") + assert Timestamp("-0020-01-01")._strftime_pystr(str_tmp, loc_dt_s) == "-20" + assert Timestamp("-0002-01-01")._strftime_pystr(str_tmp, loc_dt_s) == "-2" + + str_tmp, loc_dt_s = convert_strftime_format("%Y", target="datetime") + assert Timestamp("-2020-01-01")._strftime_pystr(str_tmp, loc_dt_s) == "-2020" + assert Timestamp("-0200-01-01")._strftime_pystr(str_tmp, loc_dt_s) == "-200" + + with pytest.raises(NonExistentTimeError): + Timestamp("-0020-01-01", tz="US/Eastern") + + @pytest.mark.parametrize( + "locale_str", + [ + pytest.param(None, id=str(locale.getlocale())), + "it_IT.utf8", + "it_IT", # Note: encoding will be 'ISO8859-1' + "zh_CN.utf8", + "zh_CN", # Note: encoding will be 'gb2312' + ], + ) + def test_strftime_locale(self, locale_str): + """ + Test that `convert_strftime_format` and `_strftime_pystr` + work well together and rely on runtime locale + """ + + # Skip if locale cannot be set + if locale_str is not None and not tm.can_set_locale(locale_str, locale.LC_ALL): + pytest.skip(f"Skipping as locale '{locale_str}' cannot be set on host.") + + # Change locale temporarily for this test. 
+ with tm.set_locale(locale_str, locale.LC_ALL) if locale_str else nullcontext(): + # Get locale-specific reference + am_local, pm_local = get_local_am_pm() + + # Use the function + str_tmp, loc_s = convert_strftime_format("%p", target="datetime") + assert str_tmp == "%(ampm)s" + + # Now what about the classes ? + # Timestamp + am_ts = Timestamp(2020, 1, 1, 1) + assert am_local == am_ts.strftime("%p") + assert am_local == am_ts._strftime_pystr(str_tmp, loc_s) + pm_ts = Timestamp(2020, 1, 1, 13) + assert pm_local == pm_ts.strftime("%p") + assert pm_local == pm_ts._strftime_pystr(str_tmp, loc_s) diff --git a/pandas/tests/tslibs/test_api.py b/pandas/tests/tslibs/test_api.py index 42d055326c2a5..af14539de497a 100644 --- a/pandas/tests/tslibs/test_api.py +++ b/pandas/tests/tslibs/test_api.py @@ -16,6 +16,7 @@ def test_namespace(): "parsing", "period", "strptime", + "strftime", "vectorized", "timedeltas", "timestamps", @@ -32,6 +33,8 @@ def test_namespace(): "OutOfBoundsDatetime", "OutOfBoundsTimedelta", "Period", + "convert_strftime_format", + "UnsupportedStrFmtDirective", "IncompatibleFrequency", "Resolution", "Tick", diff --git a/pandas/tests/tslibs/test_strftime.py b/pandas/tests/tslibs/test_strftime.py new file mode 100644 index 0000000000000..ffdbff678247e --- /dev/null +++ b/pandas/tests/tslibs/test_strftime.py @@ -0,0 +1,187 @@ +""" +Test datetime formatting low-level routines +""" + +from contextlib import nullcontext +from datetime import time +import locale + +import pytest + +from pandas._libs.tslibs.strftime import ( + UnsupportedStrFmtDirective, + _create_escape_sequence, + get_current_locale_specific_string, +) + +import pandas._testing as tm + +from pandas.tseries.api import convert_strftime_format + + +def get_local_am_pm(): + """Return the AM and PM strings returned by strftime in current locale.""" + am_local = time(1).strftime("%p") + pm_local = time(13).strftime("%p") + return am_local, pm_local + + +@pytest.mark.parametrize( + "locale_str", + [ + 
pytest.param(None, id=str(locale.getlocale())), + "it_IT.utf8", + "it_IT", # Note: encoding will be 'ISO8859-1' + "zh_CN.utf8", + "zh_CN", # Note: encoding will be 'gb2312' + ], +) +def test_get_current_locale_specific_string(locale_str): + """Test that `get_current_locale_specific_string` relies on runtime locale.""" + + # Skip if locale cannot be set + if locale_str is not None and not tm.can_set_locale(locale_str, locale.LC_ALL): + pytest.skip(f"Skipping as locale '{locale_str}' cannot be set on host.") + + # Change locale temporarily for this test. + with tm.set_locale(locale_str, locale.LC_ALL) if locale_str else nullcontext(): + # Get locale-specific reference + am_local, pm_local = get_local_am_pm() + + # Test that the function returns the correct ones + res = get_current_locale_specific_string() + assert res.am == am_local + assert res.pm == pm_local + + +class TestConvertStrftimeFormat: + """Tests for `convert_strftime_format`.""" + + @pytest.mark.parametrize( + "strftime_fmt,res_fmt_old,res_fmt_new", + ( + ("%p", "%(ampm)s", "{ampm:s}"), + ( + "%m-%d-%Y", + "%(month)02d-%(day)02d-%(year)04d", + "{month:02d}-{day:02d}-{year:04d}", + ), + ( + "20%y-%m-%d__foo__%I:%M:%S%p", + "20%(shortyear)02d-%(month)02d-%(day)02d__foo__" + "%(hour12)02d:%(min)02d:%(sec)02d%(ampm)s", + "20{shortyear:02d}-{month:02d}-{day:02d}__foo__" + "{hour12:02d}:{min:02d}:{sec:02d}{ampm:s}", + ), + ), + ) + def test_format_datetime(self, strftime_fmt, res_fmt_old, res_fmt_new): + """Test that `convert_strftime_format` returns the correct template""" + str_tmp, loc_s = convert_strftime_format( + strftime_fmt, target="datetime", new_style_fmt=False + ) + assert str_tmp == res_fmt_old + + str_tmp_new, loc_s2 = convert_strftime_format( + strftime_fmt, target="datetime", new_style_fmt=True + ) + assert loc_s2 == loc_s + assert str_tmp_new == res_fmt_new + + @pytest.mark.parametrize( + "strftime_fmt,res_fmt_old,res_fmt_new", + ( + ("%p", "%(ampm)s", "{ampm:s}"), + ( + "%m-%d-%Y", + 
"%(month)02d-%(day)02d-%(year)04d", + "{month:02d}-{day:02d}-{year:04d}", + ), + ( + "%y %I:%M:%S%p (ms=%l us=%u ns=%n)", + "%(shortyear)02d %(hour12)02d:%(min)02d:%(sec)02d%(ampm)s " + "(ms=%(ms)03d us=%(us)06d ns=%(ns)09d)", + "{shortyear:02d} {hour12:02d}:{min:02d}:{sec:02d}{ampm:s} " + "(ms={ms:03d} us={us:06d} ns={ns:09d})", + ), + ( + "20%y-%m-%d__f{o}o__%I:%M:%S%%%p", + "20%(shortyear)02d-%(month)02d-%(day)02d__f{o}o__" + "%(hour12)02d:%(min)02d:%(sec)02d%%%(ampm)s", + "20{shortyear:02d}-{month:02d}-{day:02d}__f{{o}}o__" + "{hour12:02d}:{min:02d}:{sec:02d}%%{ampm:s}", + ), + ), + ) + def test_format_period(self, strftime_fmt, res_fmt_old, res_fmt_new): + """Test that `convert_strftime_format` returns the correct template""" + str_tmp, loc_s = convert_strftime_format( + strftime_fmt, target="period", new_style_fmt=False + ) + assert str_tmp == res_fmt_old + + str_tmp_new, loc_s2 = convert_strftime_format( + strftime_fmt, target="period", new_style_fmt=True + ) + assert loc_s2 == loc_s + assert str_tmp_new == res_fmt_new + + @pytest.mark.parametrize( + "locale_str", + [ + pytest.param(None, id=str(locale.getlocale())), + "it_IT.utf8", + "it_IT", # Note: encoding will be 'ISO8859-1' + "zh_CN.utf8", + "zh_CN", # Note: encoding will be 'gb2312' + ], + ) + @pytest.mark.parametrize("target", ("datetime", "date", "time", "period")) + def test_format_non_ascii(self, locale_str, target): + """Test that `convert_strftime_format` is robust to locale and fmt encoding""" + + # Skip if locale cannot be set + if locale_str is not None and not tm.can_set_locale(locale_str, locale.LC_ALL): + pytest.skip(f"Skipping as locale '{locale_str}' cannot be set on host.") + + # Change locale temporarily for this test. 
+ with tm.set_locale(locale_str, locale.LC_ALL) if locale_str else nullcontext(): + strftime_fmt = "%y é" + + str_tmp, _ = convert_strftime_format( + strftime_fmt, target="datetime", new_style_fmt=False + ) + assert str_tmp == "%(shortyear)02d é" + + str_tmp_new, _ = convert_strftime_format( + strftime_fmt, target="datetime", new_style_fmt=True + ) + assert str_tmp_new == "{shortyear:02d} é" + + def test_invalid_datetime_directive(self): + """Test that using invalid strftime directives for datetime raises an error""" + with pytest.raises(UnsupportedStrFmtDirective, match="Unsupported directive"): + convert_strftime_format("%F", target="datetime") + + # Make sure that the same directive is valid for periods + assert convert_strftime_format("%F", target="period")[0] == "%(Fyear)d" + + def test_invalid_period_directive(self): + """Test that using invalid strftime directives for period raises an error""" + with pytest.raises(UnsupportedStrFmtDirective, match="Unsupported directive"): + convert_strftime_format("%j", target="period") + + def test_unknown_directive(self): + """Test that unknown/not available strftime directives lead to an error.""" + with pytest.raises(ValueError, match="Unsupported directive"): + convert_strftime_format("%O", target="datetime") + + with pytest.raises(ValueError, match="Unsupported directive"): + convert_strftime_format("%O", target="datetime", new_style_fmt=True) + + +def test_create_escape_sequence(): + txt = "-*" + esc = _create_escape_sequence(txt, init_esc="*", prefix="-") + assert esc not in txt + assert esc == "--*" diff --git a/pandas/tseries/api.py b/pandas/tseries/api.py index ec2d7d2304839..201f5f662f084 100644 --- a/pandas/tseries/api.py +++ b/pandas/tseries/api.py @@ -2,9 +2,10 @@ Timeseries API """ +from pandas._libs.tslibs import convert_strftime_format from pandas._libs.tslibs.parsing import guess_datetime_format from pandas.tseries import offsets from pandas.tseries.frequencies import infer_freq -__all__ = ["infer_freq", 
"offsets", "guess_datetime_format"] +__all__ = ["convert_strftime_format", "infer_freq", "offsets", "guess_datetime_format"]