diff --git a/.github/actions/run-tests/action.yml b/.github/actions/run-tests/action.yml index f5d6abdf0f186..b60245d20e8e4 100644 --- a/.github/actions/run-tests/action.yml +++ b/.github/actions/run-tests/action.yml @@ -14,7 +14,7 @@ runs: if: failure() - name: Upload coverage to Codecov - uses: codecov/codecov-action@v4 + uses: codecov/codecov-action@v5 with: flags: unittests name: codecov-pandas diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml index 3eb68bdd2a15c..a09ac1a4e5ffb 100644 --- a/.github/actions/setup-conda/action.yml +++ b/.github/actions/setup-conda/action.yml @@ -7,7 +7,7 @@ runs: using: composite steps: - name: Install ${{ inputs.environment-file }} - uses: mamba-org/setup-micromamba@v1 + uses: mamba-org/setup-micromamba@v2 with: environment-file: ${{ inputs.environment-file }} environment-name: test diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml index 908baa87815ab..294334ca1d54b 100644 --- a/.github/workflows/docbuild-and-upload.yml +++ b/.github/workflows/docbuild-and-upload.yml @@ -59,6 +59,10 @@ jobs: - name: Build documentation run: doc/make.py --warnings-are-errors + - name: Build the interactive terminal + working-directory: web/interactive_terminal + run: jupyter lite build + - name: Build documentation zip run: doc/make.py zip_html diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c13c38b20a7f8..09bfda1755e03 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ ci: skip: [pyright, mypy] repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.9.4 + rev: v0.9.9 hooks: - id: ruff args: [--exit-non-zero-on-fix] @@ -70,7 +70,7 @@ repos: - id: trailing-whitespace args: [--markdown-linebreak-ext=md] - repo: https://github.com/PyCQA/isort - rev: 6.0.0 + rev: 6.0.1 hooks: - id: isort - repo: https://github.com/asottile/pyupgrade diff --git a/Dockerfile b/Dockerfile index dead3a494e52d..4090a4adb1af8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,5 +13,5 @@ COPY requirements-dev.txt /tmp RUN python -m pip install -r /tmp/requirements-dev.txt RUN git config --global --add safe.directory /home/pandas -ENV SHELL "/bin/bash" +ENV SHELL="/bin/bash" CMD ["/bin/bash"] diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index bd4da00bfd2ad..f9a5f38c2e349 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -10,7 +10,19 @@ class Methods: ["DataFrame", "Series"], [("rolling", {"window": 10}), ("rolling", {"window": 1000}), ("expanding", {})], ["int", "float"], - ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum", "sem"], + [ + "median", + "mean", + "max", + "min", + "std", + "count", + "skew", + "kurt", + "sum", + "sem", + "nunique", + ], ) param_names = ["constructor", "window_kwargs", "dtype", "method"] diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 5782b2b171e07..6ce43725fecc9 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -83,6 +83,16 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ -i "pandas.core.resample.Resampler.quantile PR01,PR07" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ + -i "pandas.tseries.offsets.BHalfYearBegin.is_on_offset GL08" \ + -i "pandas.tseries.offsets.BHalfYearBegin.n GL08" \ + -i "pandas.tseries.offsets.BHalfYearBegin.normalize GL08" \ + -i "pandas.tseries.offsets.BHalfYearBegin.rule_code GL08" \ + -i 
"pandas.tseries.offsets.BHalfYearBegin.startingMonth GL08" \ + -i "pandas.tseries.offsets.BHalfYearEnd.is_on_offset GL08" \ + -i "pandas.tseries.offsets.BHalfYearEnd.n GL08" \ + -i "pandas.tseries.offsets.BHalfYearEnd.normalize GL08" \ + -i "pandas.tseries.offsets.BHalfYearEnd.rule_code GL08" \ + -i "pandas.tseries.offsets.BHalfYearEnd.startingMonth GL08" \ -i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.BQuarterBegin.n GL08" \ -i "pandas.tseries.offsets.BQuarterBegin.normalize GL08" \ @@ -185,6 +195,16 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.FY5253Quarter.variation GL08" \ -i "pandas.tseries.offsets.FY5253Quarter.weekday GL08" \ -i "pandas.tseries.offsets.FY5253Quarter.year_has_extra_week GL08" \ + -i "pandas.tseries.offsets.HalfYearBegin.is_on_offset GL08" \ + -i "pandas.tseries.offsets.HalfYearBegin.n GL08" \ + -i "pandas.tseries.offsets.HalfYearBegin.normalize GL08" \ + -i "pandas.tseries.offsets.HalfYearBegin.rule_code GL08" \ + -i "pandas.tseries.offsets.HalfYearBegin.startingMonth GL08" \ + -i "pandas.tseries.offsets.HalfYearEnd.is_on_offset GL08" \ + -i "pandas.tseries.offsets.HalfYearEnd.n GL08" \ + -i "pandas.tseries.offsets.HalfYearEnd.normalize GL08" \ + -i "pandas.tseries.offsets.HalfYearEnd.rule_code GL08" \ + -i "pandas.tseries.offsets.HalfYearEnd.startingMonth GL08" \ -i "pandas.tseries.offsets.Hour.is_on_offset GL08" \ -i "pandas.tseries.offsets.Hour.n GL08" \ -i "pandas.tseries.offsets.Hour.normalize GL08" \ diff --git a/doc/source/getting_started/overview.rst b/doc/source/getting_started/overview.rst index a8b7a387d80ec..98a68080d33ef 100644 --- a/doc/source/getting_started/overview.rst +++ b/doc/source/getting_started/overview.rst @@ -174,3 +174,4 @@ License ------- .. literalinclude:: ../../../LICENSE + :language: none diff --git a/doc/source/reference/offset_frequency.rst b/doc/source/reference/offset_frequency.rst index 8bb2c6ffe73be..5876e005574fd 100644 --- a/doc/source/reference/offset_frequency.rst +++ b/doc/source/reference/offset_frequency.rst @@ -776,6 +776,146 @@ Methods QuarterBegin.is_year_start QuarterBegin.is_year_end +BHalfYearEnd +------------ +.. autosummary:: + :toctree: api/ + + BHalfYearEnd + +Properties +~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + BHalfYearEnd.freqstr + BHalfYearEnd.kwds + BHalfYearEnd.name + BHalfYearEnd.nanos + BHalfYearEnd.normalize + BHalfYearEnd.rule_code + BHalfYearEnd.n + BHalfYearEnd.startingMonth + +Methods +~~~~~~~ +.. autosummary:: + :toctree: api/ + + BHalfYearEnd.copy + BHalfYearEnd.is_on_offset + BHalfYearEnd.is_month_start + BHalfYearEnd.is_month_end + BHalfYearEnd.is_quarter_start + BHalfYearEnd.is_quarter_end + BHalfYearEnd.is_year_start + BHalfYearEnd.is_year_end + +BHalfYearBegin +-------------- +.. autosummary:: + :toctree: api/ + + BHalfYearBegin + +Properties +~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + BHalfYearBegin.freqstr + BHalfYearBegin.kwds + BHalfYearBegin.name + BHalfYearBegin.nanos + BHalfYearBegin.normalize + BHalfYearBegin.rule_code + BHalfYearBegin.n + BHalfYearBegin.startingMonth + +Methods +~~~~~~~ +.. autosummary:: + :toctree: api/ + + BHalfYearBegin.copy + BHalfYearBegin.is_on_offset + BHalfYearBegin.is_month_start + BHalfYearBegin.is_month_end + BHalfYearBegin.is_quarter_start + BHalfYearBegin.is_quarter_end + BHalfYearBegin.is_year_start + BHalfYearBegin.is_year_end + +HalfYearEnd +----------- +.. autosummary:: + :toctree: api/ + + HalfYearEnd + +Properties +~~~~~~~~~~ +.. 
autosummary:: + :toctree: api/ + + HalfYearEnd.freqstr + HalfYearEnd.kwds + HalfYearEnd.name + HalfYearEnd.nanos + HalfYearEnd.normalize + HalfYearEnd.rule_code + HalfYearEnd.n + HalfYearEnd.startingMonth + +Methods +~~~~~~~ +.. autosummary:: + :toctree: api/ + + HalfYearEnd.copy + HalfYearEnd.is_on_offset + HalfYearEnd.is_month_start + HalfYearEnd.is_month_end + HalfYearEnd.is_quarter_start + HalfYearEnd.is_quarter_end + HalfYearEnd.is_year_start + HalfYearEnd.is_year_end + +HalfYearBegin +------------- +.. autosummary:: + :toctree: api/ + + HalfYearBegin + +Properties +~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + HalfYearBegin.freqstr + HalfYearBegin.kwds + HalfYearBegin.name + HalfYearBegin.nanos + HalfYearBegin.normalize + HalfYearBegin.rule_code + HalfYearBegin.n + HalfYearBegin.startingMonth + +Methods +~~~~~~~ +.. autosummary:: + :toctree: api/ + + HalfYearBegin.copy + HalfYearBegin.is_on_offset + HalfYearBegin.is_month_start + HalfYearBegin.is_month_end + HalfYearBegin.is_quarter_start + HalfYearBegin.is_quarter_end + HalfYearBegin.is_year_start + HalfYearBegin.is_year_end + BYearEnd -------- .. autosummary:: diff --git a/doc/source/reference/window.rst b/doc/source/reference/window.rst index 2aeb57faac112..2bd63f02faf69 100644 --- a/doc/source/reference/window.rst +++ b/doc/source/reference/window.rst @@ -42,6 +42,7 @@ Rolling window functions Rolling.quantile Rolling.sem Rolling.rank + Rolling.nunique .. _api.functions_window: @@ -86,6 +87,7 @@ Expanding window functions Expanding.quantile Expanding.sem Expanding.rank + Expanding.nunique .. _api.functions_ewm: diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 07d06f61b3fd6..23da52f26358f 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -18,10 +18,10 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like :widths: 30, 100, 60, 60 text,`CSV `__, :ref:`read_csv`, :ref:`to_csv` - text,Fixed-Width Text File, :ref:`read_fwf` , NA + text,Fixed-Width Text File, :ref:`read_fwf`, NA text,`JSON `__, :ref:`read_json`, :ref:`to_json` text,`HTML `__, :ref:`read_html`, :ref:`to_html` - text,`LaTeX `__, :ref:`Styler.to_latex` , NA + text,`LaTeX `__, NA, :ref:`Styler.to_latex` text,`XML `__, :ref:`read_xml`, :ref:`to_xml` text, Local clipboard, :ref:`read_clipboard`, :ref:`to_clipboard` binary,`MS Excel `__ , :ref:`read_excel`, :ref:`to_excel` diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index e96faecd9a266..3bb151a2dd339 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -90,7 +90,7 @@ Behavior differences These are places where the behavior of ``StringDtype`` objects differ from ``object`` dtype: -l. For ``StringDtype``, :ref:`string accessor methods` +1. For ``StringDtype``, :ref:`string accessor methods` that return **numeric** output will always return a nullable integer dtype, rather than either int or float dtype, depending on the presence of NA values. Methods returning **boolean** output will return a nullable boolean dtype. @@ -332,8 +332,8 @@ regular expression object will raise a ``ValueError``. 
---------------------------------------------------------------------------
ValueError: case and flags cannot be set when pat is a compiled regex

-``removeprefix`` and ``removesuffix`` have the same effect as ``str.removeprefix`` and ``str.removesuffix`` added in Python 3.9
-`__:
+``removeprefix`` and ``removesuffix`` have the same effect as ``str.removeprefix`` and ``str.removesuffix`` added in
+`Python 3.9 `__:

 .. versionadded:: 1.4.0

diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst
index d046d13f71daf..10260cb011d90 100644
--- a/doc/source/user_guide/timeseries.rst
+++ b/doc/source/user_guide/timeseries.rst
@@ -891,6 +891,10 @@ into ``freq`` keyword arguments. The available date offsets and associated frequ
     :class:`~pandas.tseries.offsets.BQuarterEnd`, ``'BQE'``, "business quarter end"
     :class:`~pandas.tseries.offsets.BQuarterBegin`, ``'BQS'``, "business quarter begin"
     :class:`~pandas.tseries.offsets.FY5253Quarter`, ``'REQ'``, "retail (aka 52-53 week) quarter"
+    :class:`~pandas.tseries.offsets.HalfYearEnd`, ``'HYE'``, "calendar half year end"
+    :class:`~pandas.tseries.offsets.HalfYearBegin`, ``'HYS'``, "calendar half year begin"
+    :class:`~pandas.tseries.offsets.BHalfYearEnd`, ``'BHYE'``, "business half year end"
+    :class:`~pandas.tseries.offsets.BHalfYearBegin`, ``'BHYS'``, "business half year begin"
     :class:`~pandas.tseries.offsets.YearEnd`, ``'YE'``, "calendar year end"
     :class:`~pandas.tseries.offsets.YearBegin`, ``'YS'`` or ``'BYS'``,"calendar year begin"
     :class:`~pandas.tseries.offsets.BYearEnd`, ``'BYE'``, "business year end"
diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
index 09134763977c3..9352063ad3a79 100644
--- a/doc/source/whatsnew/v2.3.0.rst
+++ b/doc/source/whatsnew/v2.3.0.rst
@@ -118,6 +118,7 @@ Conversion

 Strings
 ^^^^^^^
+- Bug in :meth:`DataFrame.sum` with ``axis=1``, :meth:`.DataFrameGroupBy.sum` or :meth:`.SeriesGroupBy.sum` with ``skipna=True``, and :meth:`.Resampler.sum` on :class:`StringDtype` with all NA values resulted in ``0`` and is now the empty string ``""`` (:issue:`60229`)
 - Bug in :meth:`Series.__pos__` and :meth:`DataFrame.__pos__` did not raise for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`60710`)
 - Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
 - Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`)
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index c967b97cb2ef6..7800e7a5a4241 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -62,9 +62,11 @@ Other enhancements
 - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
 - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
 - :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
+- :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`)
 - :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` 
(:issue:`33155`) - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`) - :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`) +- :meth:`DataFrame.apply` supports using third-party execution engines like the Bodo.ai JIT compiler (:issue:`60668`) - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`) - :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`) - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`) @@ -72,8 +74,10 @@ Other enhancements - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) - :py:class:`frozenset` elements in pandas objects are now natively printed (:issue:`60690`) - Add ``"delete_rows"`` option to ``if_exists`` argument in :meth:`DataFrame.to_sql` deleting all records of the table before inserting data (:issue:`37210`). +- Added half-year offset classes :class:`HalfYearBegin`, :class:`HalfYearEnd`, :class:`BHalfYearBegin` and :class:`BHalfYearEnd` (:issue:`60928`) - Errors occurring during SQL I/O will now throw a generic :class:`.DatabaseError` instead of the raw Exception type from the underlying driver manager library (:issue:`60748`) - Implemented :meth:`Series.str.isascii` and :meth:`Series.str.isascii` (:issue:`59091`) +- Improved deprecation message for offset aliases (:issue:`60820`) - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`) - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`) - Support passing a :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`) @@ -692,8 +696,10 @@ Interval Indexing ^^^^^^^^ - Bug in :meth:`DataFrame.__getitem__` returning modified columns when called with ``slice`` in Python 3.12 (:issue:`57500`) +- Bug in :meth:`DataFrame.__getitem__` when slicing a :class:`DataFrame` with many rows raised an ``OverflowError`` (:issue:`59531`) - Bug in :meth:`DataFrame.from_records` throwing a ``ValueError`` when passed an empty list in ``index`` (:issue:`58594`) - Bug in :meth:`DataFrame.loc` with inconsistent behavior of loc-set with 2 given indexes to Series (:issue:`59933`) +- Bug in :meth:`Index.get_indexer` and similar methods when ``NaN`` is located at or after position 128 (:issue:`58924`) - Bug in :meth:`MultiIndex.insert` when a new value inserted to a datetime-like level gets cast to ``NaT`` and fails indexing (:issue:`60388`) - Bug in printing :attr:`Index.names` and :attr:`MultiIndex.levels` would not escape single quotes (:issue:`60190`) @@ -710,12 +716,13 @@ MultiIndex - :func:`MultiIndex.get_level_values` accessing a :class:`DatetimeIndex` does not carry the frequency 
attribute along (:issue:`58327`, :issue:`57949`)
 - Bug in :class:`DataFrame` arithmetic operations in case of unaligned MultiIndex columns (:issue:`60498`)
 - Bug in :class:`DataFrame` arithmetic operations with :class:`Series` in case of unaligned MultiIndex (:issue:`61009`)
--
+- Bug in :meth:`MultiIndex.from_tuples` producing incorrect output for tuples containing NaN values (:issue:`60695`, :issue:`60988`)

 I/O
 ^^^
 - Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping`` elements. (:issue:`57915`)
 - Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.column` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`)
+- Bug in :meth:`.io.common.is_fsspec_url` not recognizing chained fsspec URLs (:issue:`48978`)
 - Bug in :meth:`DataFrame._repr_html_` which ignored the ``"display.float_format"`` option (:issue:`59876`)
 - Bug in :meth:`DataFrame.from_records` where ``columns`` parameter with numpy structured array was not reordering and filtering out the columns (:issue:`59717`)
 - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`)
@@ -784,6 +791,7 @@ Reshaping
 - Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`)
 - Bug in :meth:`DataFrame.stack` with the new implementation where ``ValueError`` is raised when ``level=[]`` (:issue:`60740`)
 - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`)
+- Bug in :meth:`concat` where concatenating DataFrame and Series with ``ignore_index=True`` drops the series name (:issue:`60723`, :issue:`56257`)

 Sparse
 ^^^^^^
diff --git a/environment.yml b/environment.yml
index 69647a436e3ad..a8c8b20e20fe4 100644
--- a/environment.yml
+++ b/environment.yml
@@ -116,6 +116,13 @@ dependencies:
   - requests
   - pygments  # Code highlighting

+  # web interactive REPL
+  # see the following links for more context:
+  # 1. https://jupyterlite-pyodide-kernel.readthedocs.io/en/stable/#compatibility
+  # 2. 
https://pyodide.org/en/stable/usage/packages-in-pyodide.html + - jupyterlite-core + - jupyterlite-pyodide-kernel + - pip: - adbc-driver-postgresql>=0.10.0 - adbc-driver-sqlite>=0.8.0 diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index a5a3edad63403..0480ee54ffb4e 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -41,7 +41,7 @@ cdef class HashTable: cdef class UInt64HashTable(HashTable): cdef kh_uint64_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, uint64_t val) @@ -51,7 +51,7 @@ cdef class UInt64HashTable(HashTable): cdef class Int64HashTable(HashTable): cdef kh_int64_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, int64_t val) @@ -61,7 +61,7 @@ cdef class Int64HashTable(HashTable): cdef class UInt32HashTable(HashTable): cdef kh_uint32_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, uint32_t val) @@ -71,7 +71,7 @@ cdef class UInt32HashTable(HashTable): cdef class Int32HashTable(HashTable): cdef kh_int32_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, int32_t val) @@ -81,7 +81,7 @@ cdef class Int32HashTable(HashTable): cdef class UInt16HashTable(HashTable): cdef kh_uint16_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, uint16_t val) @@ -91,7 +91,7 @@ cdef class UInt16HashTable(HashTable): cdef class Int16HashTable(HashTable): cdef kh_int16_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, int16_t val) @@ -101,7 +101,7 @@ cdef class Int16HashTable(HashTable): cdef class UInt8HashTable(HashTable): cdef kh_uint8_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, uint8_t val) @@ -111,7 +111,7 @@ cdef class UInt8HashTable(HashTable): cdef class Int8HashTable(HashTable): cdef kh_int8_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, int8_t val) @@ -121,7 +121,7 @@ cdef class Int8HashTable(HashTable): cdef class Float64HashTable(HashTable): cdef kh_float64_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, float64_t val) @@ -131,7 +131,7 @@ cdef class Float64HashTable(HashTable): cdef class Float32HashTable(HashTable): cdef kh_float32_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, float32_t val) @@ -141,7 +141,7 @@ cdef class Float32HashTable(HashTable): cdef class Complex64HashTable(HashTable): cdef kh_complex64_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, complex64_t val) @@ -151,7 +151,7 @@ cdef class Complex64HashTable(HashTable): cdef class Complex128HashTable(HashTable): cdef kh_complex128_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, complex128_t val) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 210df09f07db6..eae393f33bfd3 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -535,7 +535,7 @@ cdef class {{name}}HashTable(HashTable): int ret = 0 {{c_type}} val khiter_t k - int8_t na_position = self.na_position + Py_ssize_t na_position = 
self.na_position if self.uses_mask and mask is None: raise NotImplementedError # pragma: no cover @@ -567,7 +567,7 @@ cdef class {{name}}HashTable(HashTable): Int64Vector self_locs = Int64Vector() Int64VectorData *l Int64VectorData *sl - int8_t na_position = self.na_position + Py_ssize_t na_position = self.na_position l = &locs.data sl = &self_locs.data @@ -609,7 +609,7 @@ cdef class {{name}}HashTable(HashTable): {{c_type}} val khiter_t k intp_t[::1] locs = np.empty(n, dtype=np.intp) - int8_t na_position = self.na_position + Py_ssize_t na_position = self.na_position if self.uses_mask and mask is None: raise NotImplementedError # pragma: no cover diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 3c509a3eae11a..63d70f4ce59c9 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -502,7 +502,7 @@ def has_only_ints_or_nan(const floating[:] arr) -> bool: return True -def maybe_indices_to_slice(ndarray[intp_t, ndim=1] indices, int max_len): +def maybe_indices_to_slice(ndarray[intp_t, ndim=1] indices, intp_t max_len): cdef: Py_ssize_t i, n = len(indices) intp_t k, vstart, vlast, v diff --git a/pandas/_libs/tslibs/offsets.pyi b/pandas/_libs/tslibs/offsets.pyi index 3f942d6aa3622..f9f56d38c5e0a 100644 --- a/pandas/_libs/tslibs/offsets.pyi +++ b/pandas/_libs/tslibs/offsets.pyi @@ -168,6 +168,16 @@ class BQuarterEnd(QuarterOffset): ... class BQuarterBegin(QuarterOffset): ... class QuarterEnd(QuarterOffset): ... class QuarterBegin(QuarterOffset): ... + +class HalfYearOffset(SingleConstructorOffset): + def __init__( + self, n: int = ..., normalize: bool = ..., startingMonth: int | None = ... + ) -> None: ... + +class BHalfYearEnd(HalfYearOffset): ... +class BHalfYearBegin(HalfYearOffset): ... +class HalfYearEnd(HalfYearOffset): ... +class HalfYearBegin(HalfYearOffset): ... class MonthOffset(SingleConstructorOffset): ... class MonthEnd(MonthOffset): ... class MonthBegin(MonthOffset): ... diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 36b431974c121..a16964435ef50 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -32,6 +32,8 @@ cnp.import_array() # TODO: formalize having _libs.properties "above" tslibs in the dependency structure +from typing import ClassVar + from pandas._libs.properties import cache_readonly from pandas._libs.tslibs cimport util @@ -2524,8 +2526,7 @@ cdef class YearOffset(SingleConstructorOffset): """ _attributes = tuple(["n", "normalize", "month"]) - # FIXME(cython#4446): python annotation here gives compile-time errors - # _default_month: int + _default_month: ClassVar[int] cdef readonly: int month @@ -2788,9 +2789,8 @@ cdef class QuarterOffset(SingleConstructorOffset): # point. 
Also apply_index, is_on_offset, rule_code if # startingMonth vs month attr names are resolved - # FIXME(cython#4446): python annotation here gives compile-time errors - # _default_starting_month: int - # _from_name_starting_month: int + _default_starting_month: ClassVar[int] + _from_name_starting_month: ClassVar[int] cdef readonly: int startingMonth @@ -3011,6 +3011,227 @@ cdef class QuarterBegin(QuarterOffset): _day_opt = "start" +# ---------------------------------------------------------------------- +# HalfYear-Based Offset Classes + +cdef class HalfYearOffset(SingleConstructorOffset): + _attributes = tuple(["n", "normalize", "startingMonth"]) + # TODO: Consider combining HalfYearOffset, QuarterOffset and YearOffset + + _default_starting_month: ClassVar[int] + _from_name_starting_month: ClassVar[int] + + cdef readonly: + int startingMonth + + def __init__(self, n=1, normalize=False, startingMonth=None): + BaseOffset.__init__(self, n, normalize) + + if startingMonth is None: + startingMonth = self._default_starting_month + self.startingMonth = startingMonth + + cpdef __setstate__(self, state): + self.startingMonth = state.pop("startingMonth") + self.n = state.pop("n") + self.normalize = state.pop("normalize") + + @classmethod + def _from_name(cls, suffix=None): + kwargs = {} + if suffix: + kwargs["startingMonth"] = MONTH_TO_CAL_NUM[suffix] + else: + if cls._from_name_starting_month is not None: + kwargs["startingMonth"] = cls._from_name_starting_month + return cls(**kwargs) + + @property + def rule_code(self) -> str: + month = MONTH_ALIASES[self.startingMonth] + return f"{self._prefix}-{month}" + + def is_on_offset(self, dt: datetime) -> bool: + if self.normalize and not _is_normalized(dt): + return False + mod_month = (dt.month - self.startingMonth) % 6 + return mod_month == 0 and dt.day == self._get_offset_day(dt) + + @apply_wraps + def _apply(self, other: datetime) -> datetime: + # months_since: find the calendar half containing other.month, + # e.g. if other.month == 8, the calendar half is [Jul, Aug, Sep, ..., Dec]. + # Then find the month in that half containing an is_on_offset date for + # self. `months_since` is the number of months to shift other.month + # to get to this on-offset month. + months_since = other.month % 6 - self.startingMonth % 6 + hlvs = roll_qtrday( + other, self.n, self.startingMonth, day_opt=self._day_opt, modby=6 + ) + months = hlvs * 6 - months_since + return shift_month(other, months, self._day_opt) + + def _apply_array(self, dtarr: np.ndarray) -> np.ndarray: + reso = get_unit_from_dtype(dtarr.dtype) + shifted = shift_quarters( + dtarr.view("i8"), + self.n, + self.startingMonth, + self._day_opt, + modby=6, + reso=reso, + ) + return shifted + + +cdef class BHalfYearEnd(HalfYearOffset): + """ + DateOffset increments between the last business day of each half-year. + + startingMonth = 1 corresponds to dates like 1/31/2007, 7/31/2007, ... + startingMonth = 2 corresponds to dates like 2/28/2007, 8/31/2007, ... + startingMonth = 6 corresponds to dates like 6/30/2007, 12/31/2007, ... + + Attributes + ---------- + n : int, default 1 + The number of half-years represented. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. + startingMonth : int, default 6 + A specific integer for the month of the year from which we start half-years. + + See Also + -------- + :class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment. 
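+
+    Notes
+    -----
+    In rough terms (following the shared ``HalfYearOffset.is_on_offset``
+    above): a timestamp is on-offset for this class when
+    ``(ts.month - startingMonth) % 6 == 0`` and ``ts.day`` falls on the
+    last business day of that month (``_day_opt = "business_end"``).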
+ + Examples + -------- + >>> from pandas.tseries.offsets import BHalfYearEnd + >>> ts = pd.Timestamp('2020-05-24 05:01:15') + >>> ts + BHalfYearEnd() + Timestamp('2020-06-30 05:01:15') + >>> ts + BHalfYearEnd(2) + Timestamp('2020-12-31 05:01:15') + >>> ts + BHalfYearEnd(1, startingMonth=2) + Timestamp('2020-08-31 05:01:15') + >>> ts + BHalfYearEnd(startingMonth=2) + Timestamp('2020-08-31 05:01:15') + """ + _output_name = "BusinessHalfYearEnd" + _default_starting_month = 6 + _from_name_starting_month = 12 + _prefix = "BHYE" + _day_opt = "business_end" + + +cdef class BHalfYearBegin(HalfYearOffset): + """ + DateOffset increments between the first business day of each half-year. + + startingMonth = 1 corresponds to dates like 1/01/2007, 7/01/2007, ... + startingMonth = 2 corresponds to dates like 2/01/2007, 8/01/2007, ... + startingMonth = 3 corresponds to dates like 3/01/2007, 9/01/2007, ... + + Attributes + ---------- + n : int, default 1 + The number of half-years represented. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. + startingMonth : int, default 1 + A specific integer for the month of the year from which we start half-years. + + See Also + -------- + :class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment. + + Examples + -------- + >>> from pandas.tseries.offsets import BHalfYearBegin + >>> ts = pd.Timestamp('2020-05-24 05:01:15') + >>> ts + BHalfYearBegin() + Timestamp('2020-07-01 05:01:15') + >>> ts + BHalfYearBegin(2) + Timestamp('2021-01-01 05:01:15') + >>> ts + BHalfYearBegin(startingMonth=2) + Timestamp('2020-08-03 05:01:15') + >>> ts + BHalfYearBegin(-1) + Timestamp('2020-01-01 05:01:15') + """ + _output_name = "BusinessHalfYearBegin" + _default_starting_month = 1 + _from_name_starting_month = 1 + _prefix = "BHYS" + _day_opt = "business_start" + + +cdef class HalfYearEnd(HalfYearOffset): + """ + DateOffset increments between half-year end dates. + + startingMonth = 1 corresponds to dates like 1/31/2007, 7/31/2007, ... + startingMonth = 2 corresponds to dates like 2/28/2007, 8/31/2007, ... + startingMonth = 6 corresponds to dates like 6/30/2007, 12/31/2007, ... + + Attributes + ---------- + n : int, default 1 + The number of half-years represented. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. + startingMonth : int, default 6 + A specific integer for the month of the year from which we start half-years. + + See Also + -------- + :class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment. + + Examples + -------- + >>> ts = pd.Timestamp(2022, 1, 1) + >>> ts + pd.offsets.HalfYearEnd() + Timestamp('2022-06-30 00:00:00') + """ + _default_starting_month = 6 + _from_name_starting_month = 12 + _prefix = "HYE" + _day_opt = "end" + + +cdef class HalfYearBegin(HalfYearOffset): + """ + DateOffset increments between half-year start dates. + + startingMonth = 1 corresponds to dates like 1/01/2007, 7/01/2007, ... + startingMonth = 2 corresponds to dates like 2/01/2007, 8/01/2007, ... + startingMonth = 3 corresponds to dates like 3/01/2007, 9/01/2007, ... + + Attributes + ---------- + n : int, default 1 + The number of half-years represented. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. + startingMonth : int, default 1 + A specific integer for the month of the year from which we start half-years. 
+ + See Also + -------- + :class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment. + + Examples + -------- + >>> ts = pd.Timestamp(2022, 2, 1) + >>> ts + pd.offsets.HalfYearBegin() + Timestamp('2022-07-01 00:00:00') + """ + _default_starting_month = 1 + _from_name_starting_month = 1 + _prefix = "HYS" + _day_opt = "start" + + # ---------------------------------------------------------------------- # Month-Based Offset Classes @@ -4823,6 +5044,8 @@ prefix_mapping = { BusinessMonthEnd, # 'BME' BQuarterEnd, # 'BQE' BQuarterBegin, # 'BQS' + BHalfYearEnd, # 'BHYE' + BHalfYearBegin, # 'BHYS' BusinessHour, # 'bh' CustomBusinessDay, # 'C' CustomBusinessMonthEnd, # 'CBME' @@ -4839,6 +5062,8 @@ prefix_mapping = { Micro, # 'us' QuarterEnd, # 'QE' QuarterBegin, # 'QS' + HalfYearEnd, # 'HYE' + HalfYearBegin, # 'HYS' Milli, # 'ms' Hour, # 'h' Day, # 'D' @@ -4883,7 +5108,7 @@ def _warn_about_deprecated_aliases(name: str, is_period: bool) -> str: warnings.warn( f"\'{name}\' is deprecated and will be removed " f"in a future version, please use " - f"\'{c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name)}\' " + f"\'{c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name)}\'" f" instead.", FutureWarning, stacklevel=find_stack_level(), @@ -4897,7 +5122,7 @@ def _warn_about_deprecated_aliases(name: str, is_period: bool) -> str: warnings.warn( f"\'{name}\' is deprecated and will be removed " f"in a future version, please use " - f"\'{_name}\' " + f"\'{_name}\'" f" instead.", FutureWarning, stacklevel=find_stack_level(), diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index e320aca04683c..36fe29b2146b7 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1740,7 +1740,8 @@ cdef class _Timedelta(timedelta): Format the Timedelta as ISO 8601 Duration. ``P[n]Y[n]M[n]DT[n]H[n]M[n]S``, where the ``[n]`` s are replaced by the - values. See https://en.wikipedia.org/wiki/ISO_8601#Durations. + values. See Wikipedia: + `ISO 8601 § Durations `_. Returns ------- diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 6b4b90167e625..452ba0fe869ee 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1309,7 +1309,7 @@ cdef class _Timestamp(ABCTimestamp): By default, the fractional part is omitted if self.microsecond == 0 and self._nanosecond == 0. - If self.tzinfo is not None, the UTC offset is also attached, giving + If self.tzinfo is not None, the UTC offset is also attached, giving a full format of 'YYYY-MM-DD HH:MM:SS.mmmmmmnnn+HH:MM'. Parameters diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi index ee735761e3dc6..b4bdd7e05cf0e 100644 --- a/pandas/_libs/window/aggregations.pyi +++ b/pandas/_libs/window/aggregations.pyi @@ -89,6 +89,12 @@ def roll_rank( method: WindowingRankType, ascending: bool, ) -> np.ndarray: ... # np.ndarray[float] +def roll_nunique( + values: np.ndarray, # const float64_t[:] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t +) -> np.ndarray: ... 
# np.ndarray[float] def roll_apply( obj: object, start: np.ndarray, # np.ndarray[np.int64] diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index d33c840371d2a..2baed13cbd7be 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -6,6 +6,7 @@ from libc.math cimport ( sqrt, ) from libcpp.deque cimport deque +from libcpp.unordered_map cimport unordered_map from pandas._libs.algos cimport TiebreakEnumType @@ -1470,6 +1471,66 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start, return np.asarray(output) +def roll_nunique(const float64_t[:] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp) -> np.ndarray: + """ + Rolling number of unique elements in the window + """ + cdef: + Py_ssize_t i, j, s, e, N = len(start) + int64_t nobs = 0 + float64_t val + float64_t[::1] output + unordered_map[float64_t, int64_t] value_counts + + is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( + start, end + ) + output = np.empty(N, dtype=np.float64) + value_counts = unordered_map[float64_t, int64_t]() + + with nogil: + for i in range(N): + s = start[i] + e = end[i] + + if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]: + if i != 0: + nobs = 0 + value_counts.clear() + + # setup + for j in range(s, e): + val = values[j] + if val == val: + nobs += 1 + value_counts[val] += 1 + + else: + # calculate deletes + for j in range(start[i - 1], s): + val = values[j] + if val == val: + value_counts[val] -= 1 + if value_counts[val] == 0: + value_counts.erase(val) + nobs -= 1 + + # calculate adds + for j in range(end[i - 1], e): + val = values[j] + if val == val: + nobs += 1 + value_counts[val] += 1 + + if nobs >= minp: + output[i] = value_counts.size() + else: + output[i] = NaN + + return np.asarray(output) + + def roll_apply(object obj, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp, diff --git a/pandas/api/__init__.py b/pandas/api/__init__.py index 8f659e3cd14c8..a016e67a41360 100644 --- a/pandas/api/__init__.py +++ b/pandas/api/__init__.py @@ -1,6 +1,7 @@ """public toolkit API""" from pandas.api import ( + executors, extensions, indexers, interchange, @@ -9,6 +10,7 @@ ) __all__ = [ + "executors", "extensions", "indexers", "interchange", diff --git a/pandas/api/executors/__init__.py b/pandas/api/executors/__init__.py new file mode 100644 index 0000000000000..04c94ee688332 --- /dev/null +++ b/pandas/api/executors/__init__.py @@ -0,0 +1,7 @@ +""" +Public API for function executor engines to be used with ``map`` and ``apply``. 
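+
+A minimal sketch of the intended shape of an engine (``MyEngine`` and
+``my_engine`` are illustrative names, not part of this API): subclass
+``BaseExecutionEngine`` and expose it through a ``__pandas_udf__`` attribute
+on the object passed as ``engine=`` to ``DataFrame.apply``::
+
+    class MyEngine(BaseExecutionEngine):
+        @staticmethod
+        def map(data, func, args, kwargs, decorator, skip_na): ...
+
+        @staticmethod
+        def apply(data, func, args, kwargs, decorator, axis): ...
+
+    class my_engine:
+        __pandas_udf__ = MyEngine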
+""" + +from pandas.core.apply import BaseExecutionEngine + +__all__ = ["BaseExecutionEngine"] diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 138456f877c5f..9f3bfdc205498 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -35,6 +35,7 @@ pa_version_under17p0, pa_version_under18p0, pa_version_under19p0, + pa_version_under20p0, ) if TYPE_CHECKING: @@ -168,4 +169,5 @@ def is_ci_environment() -> bool: "pa_version_under17p0", "pa_version_under18p0", "pa_version_under19p0", + "pa_version_under20p0", ] diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index c501c06b93813..163934bee509c 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -19,6 +19,7 @@ pa_version_under17p0 = _palv < Version("17.0.0") pa_version_under18p0 = _palv < Version("18.0.0") pa_version_under19p0 = _palv < Version("19.0.0") + pa_version_under20p0 = _palv < Version("20.0.0") HAS_PYARROW = True except ImportError: pa_version_under10p1 = True @@ -32,4 +33,5 @@ pa_version_under17p0 = True pa_version_under18p0 = True pa_version_under19p0 = True + pa_version_under20p0 = True HAS_PYARROW = False diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index aafd802b827a5..0c0232bdc6d4c 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1647,6 +1647,8 @@ def map_array( If the function returns a tuple with more than one element a MultiIndex will be returned. """ + from pandas import Index + if na_action not in (None, "ignore"): msg = f"na_action must either be 'ignore' or None, {na_action} was passed" raise ValueError(msg) @@ -1676,6 +1678,10 @@ def map_array( if len(mapper) == 0: mapper = Series(mapper, dtype=np.float64) + elif isinstance(mapper, dict): + mapper = Series( + mapper.values(), index=Index(mapper.keys(), tupleize_cols=False) + ) else: mapper = Series(mapper) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index f36fc82fb1a11..da6124307e3f1 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -74,6 +74,110 @@ ResType = dict[int, Any] +class BaseExecutionEngine(abc.ABC): + """ + Base class for execution engines for map and apply methods. + + An execution engine receives all the parameters of a call to + ``apply`` or ``map``, such as the data container, the function, + etc. and takes care of running the execution. + + Supporting different engines allows functions to be JIT compiled, + run in parallel, and others. Besides the default executor which + simply runs the code with the Python interpreter and pandas. + """ + + @staticmethod + @abc.abstractmethod + def map( + data: Series | DataFrame | np.ndarray, + func: AggFuncType, + args: tuple, + kwargs: dict[str, Any], + decorator: Callable | None, + skip_na: bool, + ): + """ + Executor method to run functions elementwise. + + In general, pandas uses ``map`` for running functions elementwise, + but ``Series.apply`` with the default ``by_row='compat'`` will also + call this executor function. + + Parameters + ---------- + data : Series, DataFrame or NumPy ndarray + The object to use for the data. Some methods implement a ``raw`` + parameter which will convert the original pandas object to a + NumPy array, which will then be passed here to the executor. + func : function or NumPy ufunc + The function to execute. + args : tuple + Positional arguments to be passed to ``func``. + kwargs : dict + Keyword arguments to be passed to ``func``. 
+        decorator : function, optional
+            For JIT compilers and other engines that need to decorate the
+            function ``func``, this is the decorator to use. While the
+            executor may already know which is the decorator to use, this
+            is useful because, for a single executor, the user can specify
+            for example ``numba.jit`` or ``numba.njit(nogil=True)``, and this
+            decorator parameter will contain the exact decorator from the
+            executor the user wants to use.
+        skip_na : bool
+            Whether the function should be called for missing values or not.
+            This is specified by the pandas user as ``map(na_action=None)``
+            or ``map(na_action='ignore')``.
+        """
+
+    @staticmethod
+    @abc.abstractmethod
+    def apply(
+        data: Series | DataFrame | np.ndarray,
+        func: AggFuncType,
+        args: tuple,
+        kwargs: dict[str, Any],
+        decorator: Callable,
+        axis: Axis,
+    ):
+        """
+        Executor method to run functions by an axis.
+
+        While we can see ``map`` as executing the function for each cell
+        in a ``DataFrame`` (or ``Series``), ``apply`` will execute the
+        function for each column (or row).
+
+        Parameters
+        ----------
+        data : Series, DataFrame or NumPy ndarray
+            The object to use for the data. Some methods implement a ``raw``
+            parameter which will convert the original pandas object to a
+            NumPy array, which will then be passed here to the executor.
+        func : function or NumPy ufunc
+            The function to execute.
+        args : tuple
+            Positional arguments to be passed to ``func``.
+        kwargs : dict
+            Keyword arguments to be passed to ``func``.
+        decorator : function, optional
+            For JIT compilers and other engines that need to decorate the
+            function ``func``, this is the decorator to use. While the
+            executor may already know which is the decorator to use, this
+            is useful because, for a single executor, the user can specify
+            for example ``numba.jit`` or ``numba.njit(nogil=True)``, and this
+            decorator parameter will contain the exact decorator from the
+            executor the user wants to use.
+        axis : {0 or 'index', 1 or 'columns'}
+            0 or 'index' should execute the function passing each column as
+            a parameter. 1 or 'columns' should execute the function passing
+            each row as a parameter. The default executor engine passes rows
+            as pandas ``Series``. Other executor engines should probably
+            expect functions to be implemented this way for compatibility.
+            But passing rows as other data structures is technically possible
+            as long as the function ``func`` is implemented accordingly.
+        """
+
+
 def frame_apply(
     obj: DataFrame,
     func: AggFuncType,
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 33745438e2aea..dbf2090e53579 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -2628,7 +2628,15 @@ def _groupby_op(
             if op.how not in ["any", "all"]:
                 # Fail early to avoid conversion to object
                 op._get_cython_function(op.kind, op.how, np.dtype(object), False)
-            npvalues = self.to_numpy(object, na_value=np.nan)
+
+            arr = self
+            if op.how == "sum":
+                # https://github.com/pandas-dev/pandas/issues/60229
+                # All NA should result in the empty string.
+                assert "skipna" in kwargs
+                if kwargs["skipna"] and min_count == 0:
+                    arr = arr.fillna("")
+            npvalues = arr.to_numpy(object, na_value=np.nan)
         else:
             raise NotImplementedError(
                 f"function is not implemented for this dtype: {self.dtype}"
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 4a86048bc20e2..8f65277f660f7 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -5880,6 +5880,8 @@ def set_index(
             Delete columns to be used as the new index. 
         append : bool, default False
             Whether to append columns to existing index.
+            Setting to True will add the new columns to the existing index.
+            When set to False, the current index will be dropped from the DataFrame.
         inplace : bool, default False
             Whether to modify the DataFrame rather than creating a new one.
         verify_integrity : bool, default False
@@ -5953,6 +5955,25 @@ def set_index(
         2  4  4  2014  40
         3  9  7  2013  84
         4 16 10  2014  31
+
+        Append a column to the existing index:
+
+        >>> df = df.set_index("month")
+        >>> df.set_index("year", append=True)
+                    sale
+        month year
+        1     2012    55
+        4     2014    40
+        7     2013    84
+        10    2014    31
+
+        >>> df.set_index("year", append=False)
+              sale
+        year
+        2012    55
+        2014    40
+        2013    84
+        2014    31
         """
         inplace = validate_bool_kwarg(inplace, "inplace")
         self._check_inplace_and_allows_duplicate_labels(inplace)
@@ -10254,7 +10275,7 @@ def apply(
         result_type: Literal["expand", "reduce", "broadcast"] | None = None,
         args=(),
         by_row: Literal[False, "compat"] = "compat",
-        engine: Literal["python", "numba"] = "python",
+        engine: Callable | None | Literal["python", "numba"] = None,
         engine_kwargs: dict[str, bool] | None = None,
         **kwargs,
     ):
@@ -10265,7 +10286,9 @@ def apply(
         either the DataFrame's index (``axis=0``) or the DataFrame's columns
         (``axis=1``). By default (``result_type=None``), the final return type
         is inferred from the return type of the applied function. Otherwise,
-        it depends on the `result_type` argument.
+        it depends on the `result_type` argument. The return type of the applied
+        function is inferred based on the first computed result obtained after
+        applying the function to a Series object.

         Parameters
         ----------
@@ -10316,28 +10339,24 @@ def apply(

             .. versionadded:: 2.1.0

-        engine : {'python', 'numba'}, default 'python'
-            Choose between the python (default) engine or the numba engine in apply.
-
-            The numba engine will attempt to JIT compile the passed function,
-            which may result in speedups for large DataFrames.
-            It also supports the following engine_kwargs :
+        engine : decorator or {'python', 'numba'}, optional
+            Choose the execution engine to use. If not provided, the function
+            will be executed by the regular Python interpreter.

-            - nopython (compile the function in nopython mode)
-            - nogil (release the GIL inside the JIT compiled function)
-            - parallel (try to apply the function in parallel over the DataFrame)
+            Other options include JIT compilers such as Numba and Bodo, which in some
+            cases can speed up the execution. To use an executor you can provide
+            the decorators ``numba.jit``, ``numba.njit`` or ``bodo.jit``. You can
+            also provide the decorator with parameters, like ``numba.jit(nogil=True)``.

-            Note: Due to limitations within numba/how pandas interfaces with numba,
-            you should only use this if raw=True
+            Not all functions can be executed with all execution engines. In general,
+            JIT compilers will require type stability in the function (no variable
+            should change data type during the execution), and not all pandas and
+            NumPy APIs are supported. Check the engine documentation [1]_ and [2]_
+            for limitations.

-            Note: The numba compiler only supports a subset of
-            valid Python/numpy operations.
+            .. warning::

-            Please read more about the `supported python features
-            `_
-            and `supported numpy features
-            `_
-            in numba to learn what you can or cannot use in the passed function.
+                String parameters will stop being supported in a future pandas version.

             .. versionadded:: 2.2.0

@@ -10345,6 +10364,7 @@ def apply(
             Pass keyword arguments to the engine. 
            This is currently only used by the numba engine,
            see the documentation for the engine argument for more information.
+
         **kwargs
             Additional keyword arguments to pass as keywords arguments to
             `func`.
@@ -10367,6 +10387,13 @@ def apply(
         behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
         for more details.

+        References
+        ----------
+        .. [1] `Numba documentation
+           `_
+        .. [2] `Bodo documentation
+           `_
+
         Examples
         --------
         >>> df = pd.DataFrame([[4, 9]] * 3, columns=["A", "B"])
@@ -10435,22 +10462,99 @@ def apply(
         0  1  2
         1  1  2
         2  1  2
+
+        Advanced users can speed up their code by using a Just-in-time (JIT) compiler
+        with ``apply``. The main JIT compilers available for pandas are Numba and Bodo.
+        In general, JIT compilation is only possible when the function passed to
+        ``apply`` has type stability (variables in the function do not change their
+        type during the execution).
+
+        >>> import bodo
+        >>> df.apply(lambda x: x.A + x.B, axis=1, engine=bodo.jit)
+
+        Note that JIT compilation is only recommended for functions that take a
+        significant amount of time to run. Fast functions are unlikely to run faster
+        with JIT compilation.
         """
-        from pandas.core.apply import frame_apply
+        if engine is None or isinstance(engine, str):
+            from pandas.core.apply import frame_apply

-        op = frame_apply(
-            self,
-            func=func,
-            axis=axis,
-            raw=raw,
-            result_type=result_type,
-            by_row=by_row,
-            engine=engine,
-            engine_kwargs=engine_kwargs,
-            args=args,
-            kwargs=kwargs,
-        )
-        return op.apply().__finalize__(self, method="apply")
+            if engine is None:
+                engine = "python"
+
+            if engine not in ["python", "numba"]:
+                raise ValueError(f"Unknown engine '{engine}'")
+
+            op = frame_apply(
+                self,
+                func=func,
+                axis=axis,
+                raw=raw,
+                result_type=result_type,
+                by_row=by_row,
+                engine=engine,
+                engine_kwargs=engine_kwargs,
+                args=args,
+                kwargs=kwargs,
+            )
+            return op.apply().__finalize__(self, method="apply")
+        elif hasattr(engine, "__pandas_udf__"):
+            if result_type is not None:
+                raise NotImplementedError(
+                    f"{result_type=} only implemented for the default engine"
+                )
+
+            agg_axis = self._get_agg_axis(self._get_axis_number(axis))
+
+            # one axis is empty
+            if not all(self.shape):
+                func = cast(Callable, func)
+                try:
+                    if axis == 0:
+                        r = func(Series([], dtype=np.float64), *args, **kwargs)
+                    else:
+                        r = func(
+                            Series(index=self.columns, dtype=np.float64),
+                            *args,
+                            **kwargs,
+                        )
+                except Exception:
+                    pass
+                else:
+                    if not isinstance(r, Series):
+                        if len(agg_axis):
+                            r = func(Series([], dtype=np.float64), *args, **kwargs)
+                        else:
+                            r = np.nan
+
+                        return self._constructor_sliced(r, index=agg_axis)
+                return self.copy()
+
+            data: DataFrame | np.ndarray = self
+            if raw:
+                # This will upcast the whole DataFrame to the same type,
+                # and likely result in an object 2D array. 
+                # We should probably pass a list of 1D arrays instead, at
+                # least for ``axis=0``
+                data = self.values
+            result = engine.__pandas_udf__.apply(
+                data=data,
+                func=func,
+                args=args,
+                kwargs=kwargs,
+                decorator=engine,
+                axis=axis,
+            )
+            if raw:
+                if result.ndim == 2:
+                    return self._constructor(
+                        result, index=self.index, columns=self.columns
+                    )
+                else:
+                    return self._constructor_sliced(result, index=agg_axis)
+            return result
+        else:
+            raise ValueError(f"Unknown engine {engine}")

     def map(
         self, func: PythonFuncType, na_action: Literal["ignore"] | None = None, **kwargs
@@ -10567,9 +10671,11 @@ def _append(

         index = Index(
             [other.name],
-            name=self.index.names
-            if isinstance(self.index, MultiIndex)
-            else self.index.name,
+            name=(
+                self.index.names
+                if isinstance(self.index, MultiIndex)
+                else self.index.name
+            ),
         )

         row_df = other.to_frame().T
         # infer_objects is needed for
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 79eb1b693d866..29b34f560ab2e 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -9,6 +9,7 @@
     Sequence,
 )
 from functools import wraps
+from itertools import zip_longest
 from sys import getsizeof
 from typing import (
     TYPE_CHECKING,
@@ -588,7 +589,7 @@ def from_tuples(
         elif isinstance(tuples, list):
             arrays = list(lib.to_object_array_tuples(tuples).T)
         else:
-            arrs = zip(*tuples)
+            arrs = zip_longest(*tuples, fillvalue=np.nan)
             arrays = cast(list[Sequence[Hashable]], arrs)

         return cls.from_arrays(arrays, sortorder=sortorder, names=names)
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index e7cb7069bbc26..5efaf0dc051bd 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -477,18 +477,23 @@ def _sanitize_mixed_ndim(
             else:
                 name = getattr(obj, "name", None)
+                rename_columns = False
                 if ignore_index or name is None:
                     if axis == 1:
                         # doing a row-wise concatenation so need everything
                         # to line up
-                        name = 0
+                        if name is None:
+                            name = 0
+                        rename_columns = True
                     else:
                         # doing a column-wise concatenation so need series
                         # to have unique names
-                        name = current_column
-                        current_column += 1
+                        if name is None:
+                            rename_columns = True
+                        name = current_column
+                        current_column += 1
                     obj = sample._constructor(obj, copy=False)
-                    if isinstance(obj, ABCDataFrame):
+                    if isinstance(obj, ABCDataFrame) and rename_columns:
                         obj.columns = range(name, name + 1, 1)
                 else:
                     obj = sample._constructor({name: obj}, copy=False)
diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py
index 81c89e1ef5428..bff3485c9cb86 100644
--- a/pandas/core/window/expanding.py
+++ b/pandas/core/window/expanding.py
@@ -927,6 +927,41 @@ def rank(
             numeric_only=numeric_only,
         )

+    @doc(
+        template_header,
+        ".. 
versionadded:: 3.0.0 \n\n", + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([1, 4, 2, 3, 5, 3]) + >>> s.expanding().nunique() + 0 1.0 + 1 2.0 + 2 3.0 + 3 4.0 + 4 5.0 + 5 5.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="expanding", + aggregation_description="nunique", + agg_method="nunique", + ) + def nunique( + self, + numeric_only: bool = False, + ): + return super().nunique( + numeric_only=numeric_only, + ) + @doc( template_header, create_section_header("Parameters"), diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 69fce8cf2137e..03534bbee4c58 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1799,6 +1799,16 @@ def rank( return self._apply(window_func, name="rank", numeric_only=numeric_only) + def nunique( + self, + numeric_only: bool = False, + ): + window_func = partial( + window_aggregations.roll_nunique, + ) + + return self._apply(window_func, name="nunique", numeric_only=numeric_only) + def cov( self, other: DataFrame | Series | None = None, @@ -2855,6 +2865,43 @@ def rank( numeric_only=numeric_only, ) + @doc( + template_header, + ".. versionadded:: 3.0.0 \n\n", + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([1, 4, 2, np.nan, 3, 3, 4, 5]) + >>> s.rolling(3).nunique() + 0 NaN + 1 NaN + 2 3.0 + 3 NaN + 4 NaN + 5 NaN + 6 2.0 + 7 3.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="nunique", + agg_method="nunique", + ) + def nunique( + self, + numeric_only: bool = False, + ): + return super().nunique( + numeric_only=numeric_only, + ) + @doc( template_header, create_section_header("Parameters"), diff --git a/pandas/io/common.py b/pandas/io/common.py index e0076eb486976..1a9e6b472463d 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -71,7 +71,7 @@ _VALID_URLS = set(uses_relative + uses_netloc + uses_params) _VALID_URLS.discard("") -_RFC_3986_PATTERN = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*://") +_FSSPEC_URL_PATTERN = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*(::[A-Za-z0-9+\-+.]+)*://") BaseBufferT = TypeVar("BaseBufferT", bound=BaseBuffer) @@ -291,7 +291,7 @@ def is_fsspec_url(url: FilePath | BaseBuffer) -> bool: """ return ( isinstance(url, str) - and bool(_RFC_3986_PATTERN.match(url)) + and bool(_FSSPEC_URL_PATTERN.match(url)) and not url.startswith(("http://", "https://")) ) diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 45c8876dbe3e5..642408b35ba24 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -147,7 +147,7 @@ def nested_to_record( return new_ds -def _normalise_json( +def _normalize_json( data: Any, key_string: str, normalized_dict: dict[str, Any], @@ -177,7 +177,7 @@ def _normalise_json( if not key_string: new_key = new_key.removeprefix(separator) - _normalise_json( + _normalize_json( data=value, key_string=new_key, normalized_dict=normalized_dict, @@ -188,7 +188,7 @@ def _normalise_json( return normalized_dict -def _normalise_json_ordered(data: dict[str, Any], separator: str) -> dict[str, Any]: +def _normalize_json_ordered(data: dict[str, Any], separator: str) -> dict[str, 
Any]: """ Order the top level keys and then recursively go to depth @@ -201,10 +201,10 @@ def _normalise_json_ordered(data: dict[str, Any], separator: str) -> dict[str, A Returns ------- - dict or list of dicts, matching `normalised_json_object` + dict or list of dicts, matching `normalized_json_object` """ top_dict_ = {k: v for k, v in data.items() if not isinstance(v, dict)} - nested_dict_ = _normalise_json( + nested_dict_ = _normalize_json( data={k: v for k, v in data.items() if isinstance(v, dict)}, key_string="", normalized_dict={}, @@ -235,7 +235,7 @@ def _simple_json_normalize( Returns ------- frame : DataFrame - d - dict or list of dicts, matching `normalised_json_object` + d - dict or list of dicts, matching `normalized_json_object` Examples -------- @@ -256,14 +256,14 @@ def _simple_json_normalize( } """ - normalised_json_object = {} + normalized_json_object = {} # expect a dictionary, as most jsons are. However, lists are perfectly valid if isinstance(ds, dict): - normalised_json_object = _normalise_json_ordered(data=ds, separator=sep) + normalized_json_object = _normalize_json_ordered(data=ds, separator=sep) elif isinstance(ds, list): - normalised_json_list = [_simple_json_normalize(row, sep=sep) for row in ds] - return normalised_json_list - return normalised_json_object + normalized_json_list = [_simple_json_normalize(row, sep=sep) for row in ds] + return normalized_json_list + return normalized_json_object def json_normalize( diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 4a05259a98087..2ba90948be399 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -6,6 +6,7 @@ from pandas import api import pandas._testing as tm from pandas.api import ( + executors as api_executors, extensions as api_extensions, indexers as api_indexers, interchange as api_interchange, @@ -243,6 +244,7 @@ def test_depr(self): class TestApi(Base): allowed_api_dirs = [ + "executors", "types", "extensions", "indexers", @@ -338,6 +340,7 @@ class TestApi(Base): "ExtensionArray", "ExtensionScalarOpsMixin", ] + allowed_api_executors = ["BaseExecutionEngine"] def test_api(self): self.check(api, self.allowed_api_dirs) @@ -357,6 +360,9 @@ def test_api_indexers(self): def test_api_extensions(self): self.check(api_extensions, self.allowed_api_extensions) + def test_api_executors(self): + self.check(api_executors, self.allowed_api_executors) + class TestErrors(Base): def test_errors(self): diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index b9e407adc3051..2d47cd851ad10 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -17,10 +17,63 @@ date_range, ) import pandas._testing as tm +from pandas.api.executors import BaseExecutionEngine from pandas.tests.frame.common import zip_frames from pandas.util.version import Version +class MockExecutionEngine(BaseExecutionEngine): + """ + Execution Engine to test if the execution engine interface receives and + uses all parameters provided by the user. + + Making this engine work as the default Python engine by calling it, no extra + functionality is implemented here. + + When testing, this will be called when this engine is provided, and then the + same pandas.map and pandas.apply function will be called, but without engine, + executing the default behavior from the python engine. 
+    """
+
+    def map(data, func, args, kwargs, decorator, skip_na):
+        kwargs_to_pass = kwargs if isinstance(data, DataFrame) else {}
+        # Series.map/DataFrame.map take ``na_action`` (None or "ignore"), so
+        # mirror the default engine by passing None when NaNs should be mapped.
+        return data.map(
+            func, na_action="ignore" if skip_na else None, **kwargs_to_pass
+        )
+
+    def apply(data, func, args, kwargs, decorator, axis):
+        if isinstance(data, Series):
+            return data.apply(func, convert_dtype=True, args=args, by_row=False)
+        elif isinstance(data, DataFrame):
+            return data.apply(
+                func,
+                axis=axis,
+                raw=False,
+                result_type=None,
+                args=args,
+                by_row="compat",
+                **kwargs,
+            )
+        else:
+            assert isinstance(data, np.ndarray)
+
+            def wrap_function(func):
+                # https://github.com/numpy/numpy/issues/8352
+                def wrapper(*args, **kwargs):
+                    result = func(*args, **kwargs)
+                    if isinstance(result, str):
+                        result = np.array(result, dtype=object)
+                    return result
+
+                return wrapper
+
+            return np.apply_along_axis(wrap_function(func), axis, data, *args, **kwargs)
+
+
+class MockEngineDecorator:
+    __pandas_udf__ = MockExecutionEngine
+
+
 @pytest.fixture
 def int_frame_const_col():
     """
@@ -35,7 +88,13 @@ def int_frame_const_col():
     return df
 
 
-@pytest.fixture(params=["python", pytest.param("numba", marks=pytest.mark.single_cpu)])
+@pytest.fixture(
+    params=[
+        "python",
+        pytest.param("numba", marks=pytest.mark.single_cpu),
+        MockEngineDecorator,
+    ]
+)
 def engine(request):
     if request.param == "numba":
         pytest.importorskip("numba")
@@ -1079,12 +1138,21 @@ def test_result_type_broadcast(int_frame_const_col, request, engine):
         mark = pytest.mark.xfail(reason="numba engine doesn't support list return")
         request.node.add_marker(mark)
     df = int_frame_const_col
-    # broadcast result
-    result = df.apply(
-        lambda x: [1, 2, 3], axis=1, result_type="broadcast", engine=engine
-    )
-    expected = df.copy()
-    tm.assert_frame_equal(result, expected)
+    if engine is MockEngineDecorator:
+        with pytest.raises(
+            NotImplementedError,
+            match="result_type='broadcast' only implemented for the default engine",
+        ):
+            df.apply(
+                lambda x: [1, 2, 3], axis=1, result_type="broadcast", engine=engine
+            )
+    else:
+        # broadcast result
+        result = df.apply(
+            lambda x: [1, 2, 3], axis=1, result_type="broadcast", engine=engine
+        )
+        expected = df.copy()
+        tm.assert_frame_equal(result, expected)
 
 
 def test_result_type_broadcast_series_func(int_frame_const_col, engine, request):
@@ -1097,14 +1165,27 @@ def test_result_type_broadcast_series_func(int_frame_const_col, engine, request)
         request.node.add_marker(mark)
     df = int_frame_const_col
     columns = ["other", "col", "names"]
-    result = df.apply(
-        lambda x: Series([1, 2, 3], index=columns),
-        axis=1,
-        result_type="broadcast",
-        engine=engine,
-    )
-    expected = df.copy()
-    tm.assert_frame_equal(result, expected)
+
+    if engine is MockEngineDecorator:
+        with pytest.raises(
+            NotImplementedError,
+            match="result_type='broadcast' only implemented for the default engine",
+        ):
+            df.apply(
+                lambda x: Series([1, 2, 3], index=columns),
+                axis=1,
+                result_type="broadcast",
+                engine=engine,
+            )
+    else:
+        result = df.apply(
+            lambda x: Series([1, 2, 3], index=columns),
+            axis=1,
+            result_type="broadcast",
+            engine=engine,
+        )
+        expected = df.copy()
+        tm.assert_frame_equal(result, expected)
 
 
 def test_result_type_series_result(int_frame_const_col, engine, request):
@@ -1791,3 +1872,9 @@ def test_agg_dist_like_and_nonunique_columns():
     result = df.agg({"A": "count"})
     expected = df["A"].count()
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("engine_name", ["unknown", 25])
+def test_wrong_engine(engine_name):
+    with 
pytest.raises(ValueError, match="Unknown engine "): + DataFrame().apply(lambda x: x, engine=engine_name) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index e3f49d04a0ff2..87505b1b22fc4 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -769,8 +769,10 @@ def test_date_range_frequency_M_Q_Y_raises(self, freq): @pytest.mark.parametrize("freq_depr", ["2MIN", "2nS", "2Us"]) def test_date_range_uppercase_frequency_deprecated(self, freq_depr): # GH#9586, GH#54939 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " - f"future version. Please use '{freq_depr.lower()[1:]}' instead." + depr_msg = ( + f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version, please use '{freq_depr.lower()[1:]}' instead." + ) expected = pd.date_range("1/1/2000", periods=4, freq=freq_depr.lower()) with tm.assert_produces_warning(FutureWarning, match=depr_msg): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index fbd3868f62899..7b7c2a632aba2 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -42,6 +42,7 @@ pa_version_under11p0, pa_version_under13p0, pa_version_under14p0, + pa_version_under20p0, ) from pandas.core.dtypes.dtypes import ( @@ -453,31 +454,24 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna, reques self.check_accumulate(ser, op_name, skipna) def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: - if op_name in ["kurt", "skew"]: + if op_name == "kurt" or (pa_version_under20p0 and op_name == "skew"): return False dtype = ser.dtype # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has # no attribute "pyarrow_dtype" pa_dtype = dtype.pyarrow_dtype # type: ignore[union-attr] - if pa.types.is_temporal(pa_dtype) and op_name in ["sum", "var", "prod"]: + if pa.types.is_temporal(pa_dtype) and op_name in ["sum", "var", "prod", "skew"]: if pa.types.is_duration(pa_dtype) and op_name in ["sum"]: # summing timedeltas is one case that *is* well-defined pass else: return False - elif pa.types.is_binary(pa_dtype) and op_name == "sum": + elif pa.types.is_binary(pa_dtype) and op_name in ["sum", "skew"]: return False elif ( pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype) - ) and op_name in [ - "mean", - "median", - "prod", - "std", - "sem", - "var", - ]: + ) and op_name in ["mean", "median", "prod", "std", "sem", "var", "skew"]: return False if ( @@ -561,7 +555,7 @@ def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): else: cmp_dtype = arr.dtype elif arr.dtype.name == "decimal128(7, 3)[pyarrow]": - if op_name not in ["median", "var", "std", "sem"]: + if op_name not in ["median", "var", "std", "sem", "skew"]: cmp_dtype = arr.dtype else: cmp_dtype = "float64[pyarrow]" @@ -579,10 +573,29 @@ def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): }[arr.dtype.kind] return cmp_dtype + @pytest.mark.filterwarnings("ignore::RuntimeWarning") + @pytest.mark.parametrize("skipna", [True, False]) + def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, request): + if ( + not pa_version_under20p0 + and skipna + and all_numeric_reductions == "skew" + and ( + pa.types.is_integer(data.dtype.pyarrow_dtype) + or pa.types.is_floating(data.dtype.pyarrow_dtype) + ) + ): + request.applymarker( + pytest.mark.xfail( + reason="https://github.com/apache/arrow/issues/45733", + ) + ) + return 
super().test_reduce_series_numeric(data, all_numeric_reductions, skipna) + @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_frame(self, data, all_numeric_reductions, skipna, request): op_name = all_numeric_reductions - if op_name == "skew": + if op_name == "skew" and pa_version_under20p0: if data.dtype._is_numeric: mark = pytest.mark.xfail(reason="skew not implemented") request.applymarker(mark) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 64e686d25faa7..127f0fc50a747 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -835,6 +835,16 @@ def test_axis_1_empty(self, all_reductions, index): expected = Series([], index=index, dtype=expected_dtype) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("min_count", [0, 1]) + def test_axis_1_sum_na(self, string_dtype_no_object, skipna, min_count): + # https://github.com/pandas-dev/pandas/issues/60229 + dtype = string_dtype_no_object + df = DataFrame({"a": [pd.NA]}, dtype=dtype) + result = df.sum(axis=1, skipna=skipna, min_count=min_count) + value = "" if skipna and min_count == 0 else pd.NA + expected = Series([value], dtype=dtype) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)]) @pytest.mark.parametrize("numeric_only", [None, True, False]) def test_sum_prod_nanops(self, method, unit, numeric_only): diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index ea876cfdf4933..45047fe004aa0 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -955,6 +955,20 @@ def test_min_empty_string_dtype(func, string_dtype_no_object): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("min_count", [0, 1]) +def test_string_dtype_empty_sum(string_dtype_no_object, skipna, min_count): + # https://github.com/pandas-dev/pandas/issues/60229 + dtype = string_dtype_no_object + df = DataFrame({"a": ["x"], "b": [pd.NA]}, dtype=dtype) + gb = df.groupby("a") + result = gb.sum(skipna=skipna, min_count=min_count) + value = "" if skipna and min_count == 0 else pd.NA + expected = DataFrame( + {"b": value}, index=pd.Index(["x"], name="a", dtype=dtype), dtype=dtype + ) + tm.assert_frame_equal(result, expected) + + def test_max_nan_bug(): df = DataFrame( { diff --git a/pandas/tests/indexes/multi/test_constructors.py b/pandas/tests/indexes/multi/test_constructors.py index b2867d4ac8e68..92827cf154394 100644 --- a/pandas/tests/indexes/multi/test_constructors.py +++ b/pandas/tests/indexes/multi/test_constructors.py @@ -410,6 +410,19 @@ def test_from_tuples_with_tuple_label(): tm.assert_frame_equal(expected, result) +@pytest.mark.parametrize( + "keys, expected", + [ + ((("l1",), ("l1", "l2")), (("l1", np.nan), ("l1", "l2"))), + ((("l1", "l2"), ("l1",)), (("l1", "l2"), ("l1", np.nan))), + ], +) +def test_from_tuples_with_various_tuple_lengths(keys, expected): + # GH 60695 + idx = MultiIndex.from_tuples(keys) + assert tuple(idx) == expected + + # ---------------------------------------------------------------------------- # from_product # ---------------------------------------------------------------------------- diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 144b36166261b..e64fab21b85a5 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1753,6 +1753,7 @@ def 
test_read_timezone_information(self): [ "s3://example-fsspec/", "gcs://another-fsspec/file.json", + "filecache::s3://yet-another-fsspec/file.json", "https://example-site.com/data", "some-protocol://data.txt", ], diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index e162815271ab3..99af421d5aa48 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -501,6 +501,18 @@ def test_is_fsspec_url(): assert icom.is_fsspec_url("RFC-3986+compliant.spec://something") +def test_is_fsspec_url_chained(): + # GH#48978 Support chained fsspec URLs + # See https://filesystem-spec.readthedocs.io/en/latest/features.html#url-chaining. + assert icom.is_fsspec_url("filecache::s3://pandas/test.csv") + assert icom.is_fsspec_url("zip://test.csv::filecache::gcs://bucket/file.zip") + assert icom.is_fsspec_url("filecache::zip://test.csv::gcs://bucket/file.zip") + assert icom.is_fsspec_url("filecache::dask::s3://pandas/test.csv") + assert not icom.is_fsspec_url("filecache:s3://pandas/test.csv") + assert not icom.is_fsspec_url("filecache:::s3://pandas/test.csv") + assert not icom.is_fsspec_url("filecache::://pandas/test.csv") + + @pytest.mark.parametrize("encoding", [None, "utf-8"]) @pytest.mark.parametrize("format", ["csv", "json"]) def test_codecs_encoding(encoding, format): diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 56a8e4c439164..78f39b649cb9a 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -18,6 +18,7 @@ pa_version_under15p0, pa_version_under17p0, pa_version_under19p0, + pa_version_under20p0, ) import pandas as pd @@ -1075,27 +1076,34 @@ def test_read_dtype_backend_pyarrow_config_index(self, pa): expected=expected, ) - @pytest.mark.xfail( - pa_version_under17p0, reason="pa.pandas_compat passes 'datetime64' to .astype" + @pytest.mark.parametrize( + "columns", + [ + [0, 1], + pytest.param( + [b"foo", b"bar"], + marks=pytest.mark.xfail( + pa_version_under20p0, + raises=NotImplementedError, + reason="https://github.com/apache/arrow/pull/44171", + ), + ), + pytest.param( + [ + datetime.datetime(2011, 1, 1, 0, 0), + datetime.datetime(2011, 1, 1, 1, 1), + ], + marks=pytest.mark.xfail( + pa_version_under17p0, + reason="pa.pandas_compat passes 'datetime64' to .astype", + ), + ), + ], ) - def test_columns_dtypes_not_invalid(self, pa): + def test_columns_dtypes_not_invalid(self, pa, columns): df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) - # numeric - df.columns = [0, 1] - check_round_trip(df, pa) - - # bytes - df.columns = [b"foo", b"bar"] - with pytest.raises(NotImplementedError, match="|S3"): - # Bytes fails on read_parquet - check_round_trip(df, pa) - - # python object - df.columns = [ - datetime.datetime(2011, 1, 1, 0, 0), - datetime.datetime(2011, 1, 1, 1, 1), - ] + df.columns = columns check_round_trip(df, pa) def test_empty_columns(self, pa): diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 50b561aefcf49..6a95cfc7355d8 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -149,18 +149,19 @@ def test_map_locations(self, table_type, dtype, writable): def test_map_locations_mask(self, table_type, dtype, writable): if table_type == ht.PyObjectHashTable: pytest.skip("Mask not supported for object") - N = 3 + N = 129 # must be > 128 to test GH#58924 table = table_type(uses_mask=True) keys = (np.arange(N) + N).astype(dtype) keys.flags.writeable = writable - 
table.map_locations(keys, np.array([False, False, True])) + mask = np.concatenate([np.repeat(False, N - 1), [True]], axis=0) + table.map_locations(keys, mask) for i in range(N - 1): assert table.get_item(keys[i]) == i with pytest.raises(KeyError, match=re.escape(str(keys[N - 1]))): table.get_item(keys[N - 1]) - assert table.get_na() == 2 + assert table.get_na() == N - 1 def test_lookup(self, table_type, dtype, writable): N = 3 diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index b2d9f6c0e3eb0..0db5c0c82d4d4 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -223,6 +223,31 @@ def test_resample_empty_series(freq, index, resample_method): assert result.index.freq == expected.index.freq +@pytest.mark.parametrize("min_count", [0, 1]) +def test_resample_empty_sum_string(string_dtype_no_object, min_count): + # https://github.com/pandas-dev/pandas/issues/60229 + dtype = string_dtype_no_object + ser = Series( + pd.NA, + index=DatetimeIndex( + [ + "2000-01-01 00:00:00", + "2000-01-01 00:00:10", + "2000-01-01 00:00:20", + "2000-01-01 00:00:30", + ] + ), + dtype=dtype, + ) + rs = ser.resample("20s") + result = rs.sum(min_count=min_count) + + value = "" if min_count == 0 else pd.NA + index = date_range(start="2000-01-01", freq="20s", periods=2, unit="s") + expected = Series(value, index=index, dtype=dtype) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "freq", [ diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index e7850f96b3b0f..7870c5a9d3e17 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -494,6 +494,26 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("min_count", [0, 1]) +def test_groupby_resample_empty_sum_string( + string_dtype_no_object, test_frame, min_count +): + # https://github.com/pandas-dev/pandas/issues/60229 + dtype = string_dtype_no_object + test_frame = test_frame.assign(B=pd.array([pd.NA] * len(test_frame), dtype=dtype)) + gbrs = test_frame.groupby("A").resample("40s") + result = gbrs.sum(min_count=min_count) + + index = pd.MultiIndex( + levels=[[1, 2, 3], [pd.to_datetime("2000-01-01", unit="ns")]], + codes=[[0, 1, 2], [0, 0, 0]], + names=["A", None], + ) + value = "" if min_count == 0 else pd.NA + expected = DataFrame({"B": value}, index=index, dtype=dtype) + tm.assert_frame_equal(result, expected) + + def test_groupby_resample_with_list_of_keys(): # GH 47362 df = DataFrame( diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index d3edee17366f7..2d0eb5d14a1d9 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -326,6 +326,8 @@ def test_concat_mixed_objs_index(self): def test_concat_mixed_objs_index_names(self): # Test row-wise concat for mixed series/frames with distinct names # GH2385, GH15047 + # GH #60723 & GH #56257 (Updated the test case, + # as the above GH PR ones were incorrect) index = date_range("01-Jan-2013", periods=10, freq="h") arr = np.arange(10, dtype="int64") @@ -341,8 +343,11 @@ def test_concat_mixed_objs_index_names(self): result = concat([s1, df, s2]) tm.assert_frame_equal(result, expected) - # Rename all series to 0 when ignore_index=True - expected = DataFrame(np.tile(arr, 3).reshape(-1, 1), columns=[0]) + expected = 
DataFrame( + np.kron(np.where(np.identity(3) == 1, 1, np.nan), arr).T, + index=np.arange(30, dtype=np.int64), + columns=["foo", 0, "bar"], + ) result = concat([s1, df, s2], ignore_index=True) tm.assert_frame_equal(result, expected) @@ -943,3 +948,56 @@ def test_concat_with_moot_ignore_index_and_keys(): msg = f"Cannot set {ignore_index=} and specify keys. Either should be used." with pytest.raises(ValueError, match=msg): concat([df1, df2], keys=keys, ignore_index=ignore_index) + + +@pytest.mark.parametrize( + "inputs, ignore_index, axis, expected", + [ + # Concatenating DataFrame and named Series without ignore_index + ( + [DataFrame({"a": [0, 1], "b": [2, 3]}), Series([4, 5], name="c")], + False, + 0, + DataFrame( + { + "a": [0, 1, None, None], + "b": [2, 3, None, None], + "c": [None, None, 4, 5], + }, + index=[0, 1, 0, 1], + ), + ), + # Concatenating DataFrame and named Series with ignore_index + ( + [DataFrame({"a": [0, 1], "b": [2, 3]}), Series([4, 5], name="c")], + True, + 0, + DataFrame( + { + "a": [0, 1, None, None], + "b": [2, 3, None, None], + "c": [None, None, 4, 5], + }, + index=[0, 1, 2, 3], + ), + ), + # Concatenating DataFrame and unnamed Series along columns + ( + [DataFrame({"a": [0, 1], "b": [2, 3]}), Series([4, 5]), Series([4, 5])], + False, + 1, + DataFrame({"a": [0, 1], "b": [2, 3], 0: [4, 5], 1: [4, 5]}, index=[0, 1]), + ), + # Concatenating DataFrame and unnamed Series along columns with ignore_index + ( + [DataFrame({"a": [0, 1], "b": [2, 3]}), Series([4, 5]), Series([4, 5])], + True, + 1, + DataFrame({0: [0, 1], 1: [2, 3], 2: [4, 5], 3: [4, 5]}, index=[0, 1]), + ), + ], +) +def test_concat_of_series_and_frame(inputs, ignore_index, axis, expected): + # GH #60723 and #56257 + result = concat(inputs, ignore_index=ignore_index, axis=axis) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index a2be698c0ec28..5f4a100e7ccc7 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1441,10 +1441,17 @@ def test_constructor_tuple_of_tuples(self): s = Series(data) assert tuple(s) == data - def test_constructor_dict_of_tuples(self): - data = {(1, 2): 3, (None, 5): 6} + @pytest.mark.parametrize( + "data, expected_values, expected_index", + [ + ({(1, 2): 3, (None, 5): 6}, [3, 6], [(1, 2), (None, 5)]), + ({(1,): 3, (4, 5): 6}, [3, 6], [(1, None), (4, 5)]), + ], + ) + def test_constructor_dict_of_tuples(self, data, expected_values, expected_index): + # GH 60695 result = Series(data).sort_values() - expected = Series([3, 6], index=MultiIndex.from_tuples([(1, 2), (None, 5)])) + expected = Series(expected_values, index=MultiIndex.from_tuples(expected_index)) tm.assert_series_equal(result, expected) # https://github.com/pandas-dev/pandas/issues/22698 @@ -1860,23 +1867,30 @@ class A(OrderedDict): series = Series(A(data)) tm.assert_series_equal(series, expected) - def test_constructor_dict_multiindex(self): - d = {("a", "a"): 0.0, ("b", "a"): 1.0, ("b", "c"): 2.0} - _d = sorted(d.items()) - result = Series(d) - expected = Series( - [x[1] for x in _d], index=MultiIndex.from_tuples([x[0] for x in _d]) - ) - tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "data, expected_index_multi", + [ + ({("a", "a"): 0.0, ("b", "a"): 1.0, ("b", "c"): 2.0}, True), + ({("a",): 0.0, ("a", "b"): 1.0}, True), + ({"z": 111.0, ("a", "a"): 0.0, ("b", "a"): 1.0, ("b", "c"): 2.0}, False), + ], + ) + def test_constructor_dict_multiindex(self, 
data, expected_index_multi):
+        # GH#60695
+        result = Series(data)
 
-        d["z"] = 111.0
-        _d.insert(0, ("z", d["z"]))
-        result = Series(d)
-        expected = Series(
-            [x[1] for x in _d], index=Index([x[0] for x in _d], tupleize_cols=False)
-        )
-        result = result.reindex(index=expected.index)
-        tm.assert_series_equal(result, expected)
+        if expected_index_multi:
+            expected = Series(
+                list(data.values()),
+                index=MultiIndex.from_tuples(list(data.keys())),
+            )
+            tm.assert_series_equal(result, expected)
+        else:
+            expected = Series(
+                list(data.values()),
+                index=Index(list(data.keys())),
+            )
+            tm.assert_series_equal(result, expected)
 
     def test_constructor_dict_multiindex_reindex_flat(self):
         # construction involves reindexing with a MultiIndex corner case
diff --git a/pandas/tests/tseries/offsets/test_business_halfyear.py b/pandas/tests/tseries/offsets/test_business_halfyear.py
new file mode 100644
index 0000000000000..9ea336b3d13f8
--- /dev/null
+++ b/pandas/tests/tseries/offsets/test_business_halfyear.py
@@ -0,0 +1,329 @@
+"""
+Tests for the following offsets:
+- BHalfYearBegin
+- BHalfYearEnd
+"""
+
+from __future__ import annotations
+
+from datetime import datetime
+
+import pytest
+
+from pandas.tests.tseries.offsets.common import (
+    assert_is_on_offset,
+    assert_offset_equal,
+)
+
+from pandas.tseries.offsets import (
+    BHalfYearBegin,
+    BHalfYearEnd,
+)
+
+
+@pytest.mark.parametrize("klass", (BHalfYearBegin, BHalfYearEnd))
+def test_halfyearly_dont_normalize(klass):
+    date = datetime(2012, 3, 31, 5, 30)
+    result = date + klass()
+    assert result.time() == date.time()
+
+
+@pytest.mark.parametrize("offset", [BHalfYearBegin(), BHalfYearEnd()])
+@pytest.mark.parametrize(
+    "date",
+    [
+        datetime(2016, m, d)
+        for m in [7, 8, 9, 10, 11, 12]
+        for d in [1, 2, 3, 28, 29, 30, 31]
+        if not (m in {9, 11} and d == 31)
+    ],
+)
+def test_on_offset(offset, date):
+    res = offset.is_on_offset(date)
+    slow_version = date == (date + offset) - offset
+    assert res == slow_version
+
+
+class TestBHalfYearBegin:
+    def test_repr(self):
+        expected = "<BHalfYearBegin: startingMonth=1>"
+        assert repr(BHalfYearBegin()) == expected
+        expected = "<BHalfYearBegin: startingMonth=3>"
+        assert repr(BHalfYearBegin(startingMonth=3)) == expected
+        expected = "<BHalfYearBegin: startingMonth=1>"
+        assert repr(BHalfYearBegin(startingMonth=1)) == expected
+
+    def test_offset_corner_case(self):
+        # corner
+        offset = BHalfYearBegin(n=-1, startingMonth=1)
+        assert datetime(2010, 2, 1) + offset == datetime(2010, 1, 1)
+
+    offset_cases = []
+    offset_cases.append(
+        (
+            BHalfYearBegin(startingMonth=1),
+            {
+                datetime(2007, 12, 1): datetime(2008, 1, 1),
+                datetime(2008, 1, 1): datetime(2008, 7, 1),
+                datetime(2008, 2, 15): datetime(2008, 7, 1),
+                datetime(2008, 2, 29): datetime(2008, 7, 1),
+                datetime(2008, 3, 15): datetime(2008, 7, 1),
+                datetime(2008, 3, 31): datetime(2008, 7, 1),
+                datetime(2008, 4, 15): datetime(2008, 7, 1),
+                datetime(2008, 4, 1): datetime(2008, 7, 1),
+                datetime(2008, 7, 1): datetime(2009, 1, 1),
+                datetime(2008, 7, 15): datetime(2009, 1, 1),
+            },
+        )
+    )
+
+    offset_cases.append(
+        (
+            BHalfYearBegin(startingMonth=2),
+            {
+                datetime(2008, 1, 1): datetime(2008, 2, 1),
+                datetime(2008, 1, 31): datetime(2008, 2, 1),
+                datetime(2008, 1, 15): datetime(2008, 2, 1),
+                datetime(2008, 2, 29): datetime(2008, 8, 1),
+                datetime(2008, 3, 15): datetime(2008, 8, 1),
+                datetime(2008, 3, 31): datetime(2008, 8, 1),
+                datetime(2008, 4, 15): datetime(2008, 8, 1),
+                datetime(2008, 4, 30): datetime(2008, 8, 1),
+            },
+        )
+    )
+
+    offset_cases.append(
+        (
+            BHalfYearBegin(startingMonth=1, n=0),
+            {
+                datetime(2008, 1, 1): datetime(2008, 1, 
1), + datetime(2008, 12, 1): datetime(2009, 1, 1), + datetime(2008, 2, 15): datetime(2008, 7, 1), + datetime(2008, 2, 29): datetime(2008, 7, 1), + datetime(2008, 3, 15): datetime(2008, 7, 1), + datetime(2008, 3, 31): datetime(2008, 7, 1), + datetime(2008, 4, 15): datetime(2008, 7, 1), + datetime(2008, 4, 30): datetime(2008, 7, 1), + datetime(2008, 7, 1): datetime(2008, 7, 1), + datetime(2008, 7, 15): datetime(2009, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + BHalfYearBegin(startingMonth=1, n=-1), + { + datetime(2008, 1, 1): datetime(2007, 7, 2), + datetime(2008, 1, 31): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 1, 1), + datetime(2008, 2, 29): datetime(2008, 1, 1), + datetime(2008, 3, 15): datetime(2008, 1, 1), + datetime(2008, 3, 31): datetime(2008, 1, 1), + datetime(2008, 4, 15): datetime(2008, 1, 1), + datetime(2008, 4, 30): datetime(2008, 1, 1), + datetime(2008, 7, 1): datetime(2008, 1, 1), + datetime(2008, 7, 15): datetime(2008, 7, 1), + }, + ) + ) + + offset_cases.append( + ( + BHalfYearBegin(startingMonth=1, n=2), + { + datetime(2008, 1, 1): datetime(2009, 1, 1), + datetime(2008, 2, 15): datetime(2009, 1, 1), + datetime(2008, 2, 29): datetime(2009, 1, 1), + datetime(2008, 3, 15): datetime(2009, 1, 1), + datetime(2008, 3, 31): datetime(2009, 1, 1), + datetime(2008, 4, 15): datetime(2009, 1, 1), + datetime(2008, 4, 1): datetime(2009, 1, 1), + datetime(2008, 7, 15): datetime(2009, 7, 1), + datetime(2008, 7, 1): datetime(2009, 7, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [ + (BHalfYearBegin(1, startingMonth=1), datetime(2008, 1, 1), True), + (BHalfYearBegin(1, startingMonth=1), datetime(2007, 12, 1), False), + (BHalfYearBegin(1, startingMonth=1), datetime(2008, 2, 1), False), + (BHalfYearBegin(1, startingMonth=1), datetime(2007, 3, 1), False), + (BHalfYearBegin(1, startingMonth=1), datetime(2008, 4, 1), False), + (BHalfYearBegin(1, startingMonth=1), datetime(2008, 5, 1), False), + (BHalfYearBegin(1, startingMonth=1), datetime(2007, 6, 1), False), + (BHalfYearBegin(1, startingMonth=3), datetime(2008, 1, 1), False), + (BHalfYearBegin(1, startingMonth=3), datetime(2007, 12, 1), False), + (BHalfYearBegin(1, startingMonth=3), datetime(2008, 2, 1), False), + (BHalfYearBegin(1, startingMonth=3), datetime(2007, 3, 1), True), + (BHalfYearBegin(1, startingMonth=3), datetime(2008, 4, 1), False), + (BHalfYearBegin(1, startingMonth=3), datetime(2008, 5, 1), False), + (BHalfYearBegin(1, startingMonth=3), datetime(2008, 5, 2), False), + (BHalfYearBegin(1, startingMonth=3), datetime(2007, 6, 1), False), + (BHalfYearBegin(1, startingMonth=3), datetime(2007, 6, 2), False), + (BHalfYearBegin(1, startingMonth=6), datetime(2008, 1, 1), False), + (BHalfYearBegin(1, startingMonth=6), datetime(2007, 12, 3), True), + (BHalfYearBegin(1, startingMonth=6), datetime(2008, 2, 1), False), + (BHalfYearBegin(1, startingMonth=6), datetime(2007, 3, 1), False), + (BHalfYearBegin(1, startingMonth=6), datetime(2007, 3, 2), False), + (BHalfYearBegin(1, startingMonth=6), datetime(2008, 4, 1), False), + (BHalfYearBegin(1, startingMonth=6), datetime(2008, 5, 1), False), + (BHalfYearBegin(1, startingMonth=6), datetime(2008, 5, 2), False), + (BHalfYearBegin(1, startingMonth=6), datetime(2007, 6, 1), True), + ] + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + offset, dt, 
expected = case
+        assert_is_on_offset(offset, dt, expected)
+
+
+class TestBHalfYearEnd:
+    def test_repr(self):
+        expected = "<BHalfYearEnd: startingMonth=6>"
+        assert repr(BHalfYearEnd()) == expected
+        expected = "<BHalfYearEnd: startingMonth=3>"
+        assert repr(BHalfYearEnd(startingMonth=3)) == expected
+        expected = "<BHalfYearEnd: startingMonth=1>"
+        assert repr(BHalfYearEnd(startingMonth=1)) == expected
+
+    def test_offset_corner_case(self):
+        # corner
+        offset = BHalfYearEnd(n=-1, startingMonth=1)
+        assert datetime(2010, 1, 30) + offset == datetime(2010, 1, 29)
+
+    offset_cases = []
+    offset_cases.append(
+        (
+            BHalfYearEnd(startingMonth=1),
+            {
+                datetime(2008, 1, 1): datetime(2008, 1, 31),
+                datetime(2008, 1, 31): datetime(2008, 7, 31),
+                datetime(2008, 2, 15): datetime(2008, 7, 31),
+                datetime(2008, 2, 29): datetime(2008, 7, 31),
+                datetime(2008, 3, 15): datetime(2008, 7, 31),
+                datetime(2008, 3, 31): datetime(2008, 7, 31),
+                datetime(2008, 4, 15): datetime(2008, 7, 31),
+                datetime(2008, 7, 31): datetime(2009, 1, 30),
+            },
+        )
+    )
+
+    offset_cases.append(
+        (
+            BHalfYearEnd(startingMonth=2),
+            {
+                datetime(2008, 1, 1): datetime(2008, 2, 29),
+                datetime(2008, 1, 31): datetime(2008, 2, 29),
+                datetime(2008, 2, 15): datetime(2008, 2, 29),
+                datetime(2008, 2, 29): datetime(2008, 8, 29),
+                datetime(2008, 3, 15): datetime(2008, 8, 29),
+                datetime(2008, 3, 31): datetime(2008, 8, 29),
+                datetime(2008, 4, 15): datetime(2008, 8, 29),
+                datetime(2008, 8, 28): datetime(2008, 8, 29),
+                datetime(2008, 8, 29): datetime(2009, 2, 27),
+            },
+        )
+    )
+
+    offset_cases.append(
+        (
+            BHalfYearEnd(startingMonth=1, n=0),
+            {
+                datetime(2008, 1, 1): datetime(2008, 1, 31),
+                datetime(2008, 1, 31): datetime(2008, 1, 31),
+                datetime(2008, 2, 15): datetime(2008, 7, 31),
+                datetime(2008, 2, 29): datetime(2008, 7, 31),
+                datetime(2008, 3, 15): datetime(2008, 7, 31),
+                datetime(2008, 3, 31): datetime(2008, 7, 31),
+                datetime(2008, 4, 15): datetime(2008, 7, 31),
+                datetime(2008, 7, 31): datetime(2008, 7, 31),
+            },
+        )
+    )
+
+    offset_cases.append(
+        (
+            BHalfYearEnd(startingMonth=1, n=-1),
+            {
+                datetime(2008, 1, 1): datetime(2007, 7, 31),
+                datetime(2008, 1, 31): datetime(2007, 7, 31),
+                datetime(2008, 2, 15): datetime(2008, 1, 31),
+                datetime(2008, 2, 29): datetime(2008, 1, 31),
+                datetime(2008, 3, 15): datetime(2008, 1, 31),
+                datetime(2008, 3, 31): datetime(2008, 1, 31),
+                datetime(2008, 7, 15): datetime(2008, 1, 31),
+                datetime(2008, 7, 30): datetime(2008, 1, 31),
+                datetime(2008, 7, 31): datetime(2008, 1, 31),
+                datetime(2008, 8, 1): datetime(2008, 7, 31),
+            },
+        )
+    )
+
+    offset_cases.append(
+        (
+            BHalfYearEnd(startingMonth=6, n=2),
+            {
+                datetime(2008, 1, 31): datetime(2008, 12, 31),
+                datetime(2008, 2, 15): datetime(2008, 12, 31),
+                datetime(2008, 2, 29): datetime(2008, 12, 31),
+                datetime(2008, 3, 15): datetime(2008, 12, 31),
+                datetime(2008, 3, 31): datetime(2008, 12, 31),
+                datetime(2008, 4, 15): datetime(2008, 12, 31),
+                datetime(2008, 4, 30): datetime(2008, 12, 31),
+                datetime(2008, 6, 30): datetime(2009, 6, 30),
+            },
+        )
+    )
+
+    @pytest.mark.parametrize("case", offset_cases)
+    def test_offset(self, case):
+        offset, cases = case
+        for base, expected in cases.items():
+            assert_offset_equal(offset, base, expected)
+
+    on_offset_cases = [
+        (BHalfYearEnd(1, startingMonth=1), datetime(2008, 1, 31), True),
+        (BHalfYearEnd(1, startingMonth=1), datetime(2007, 12, 31), False),
+        (BHalfYearEnd(1, startingMonth=1), datetime(2008, 2, 29), False),
+        (BHalfYearEnd(1, startingMonth=1), datetime(2007, 3, 30), False),
+        (BHalfYearEnd(1, startingMonth=1), datetime(2007, 3, 31), False),
+        (BHalfYearEnd(1, 
startingMonth=1), datetime(2008, 4, 30), False),
+        (BHalfYearEnd(1, startingMonth=1), datetime(2008, 5, 30), False),
+        (BHalfYearEnd(1, startingMonth=1), datetime(2008, 5, 31), False),
+        (BHalfYearEnd(1, startingMonth=1), datetime(2007, 6, 29), False),
+        (BHalfYearEnd(1, startingMonth=1), datetime(2007, 6, 30), False),
+        (BHalfYearEnd(1, startingMonth=3), datetime(2008, 1, 31), False),
+        (BHalfYearEnd(1, startingMonth=3), datetime(2007, 12, 31), False),
+        (BHalfYearEnd(1, startingMonth=3), datetime(2008, 2, 29), False),
+        (BHalfYearEnd(1, startingMonth=3), datetime(2007, 3, 30), True),
+        (BHalfYearEnd(1, startingMonth=3), datetime(2007, 3, 31), False),
+        (BHalfYearEnd(1, startingMonth=3), datetime(2008, 4, 30), False),
+        (BHalfYearEnd(1, startingMonth=3), datetime(2008, 5, 30), False),
+        (BHalfYearEnd(1, startingMonth=3), datetime(2008, 5, 31), False),
+        (BHalfYearEnd(1, startingMonth=3), datetime(2007, 6, 29), False),
+        (BHalfYearEnd(1, startingMonth=3), datetime(2007, 6, 30), False),
+        (BHalfYearEnd(1, startingMonth=6), datetime(2008, 1, 31), False),
+        (BHalfYearEnd(1, startingMonth=6), datetime(2007, 12, 31), True),
+        (BHalfYearEnd(1, startingMonth=6), datetime(2008, 2, 29), False),
+        (BHalfYearEnd(1, startingMonth=6), datetime(2007, 3, 30), False),
+        (BHalfYearEnd(1, startingMonth=6), datetime(2007, 3, 31), False),
+        (BHalfYearEnd(1, startingMonth=6), datetime(2008, 4, 30), False),
+        (BHalfYearEnd(1, startingMonth=6), datetime(2008, 5, 30), False),
+        (BHalfYearEnd(1, startingMonth=6), datetime(2008, 5, 31), False),
+        (BHalfYearEnd(1, startingMonth=6), datetime(2007, 6, 29), True),
+        (BHalfYearEnd(1, startingMonth=6), datetime(2007, 6, 30), False),
+    ]
+
+    @pytest.mark.parametrize("case", on_offset_cases)
+    def test_is_on_offset(self, case):
+        offset, dt, expected = case
+        assert_is_on_offset(offset, dt, expected)
diff --git a/pandas/tests/tseries/offsets/test_halfyear.py b/pandas/tests/tseries/offsets/test_halfyear.py
new file mode 100644
index 0000000000000..5bb3821fd07f8
--- /dev/null
+++ b/pandas/tests/tseries/offsets/test_halfyear.py
@@ -0,0 +1,329 @@
+"""
+Tests for the following offsets:
+- HalfYearBegin
+- HalfYearEnd
+"""
+
+from __future__ import annotations
+
+from datetime import datetime
+
+import pytest
+
+from pandas.tests.tseries.offsets.common import (
+    assert_is_on_offset,
+    assert_offset_equal,
+)
+
+from pandas.tseries.offsets import (
+    HalfYearBegin,
+    HalfYearEnd,
+)
+
+
+@pytest.mark.parametrize("klass", (HalfYearBegin, HalfYearEnd))
+def test_halfyearly_dont_normalize(klass):
+    date = datetime(2012, 3, 31, 5, 30)
+    result = date + klass()
+    assert result.time() == date.time()
+
+
+@pytest.mark.parametrize("offset", [HalfYearBegin(), HalfYearEnd()])
+@pytest.mark.parametrize(
+    "date",
+    [
+        datetime(2016, m, d)
+        for m in [7, 8, 9, 10, 11, 12]
+        for d in [1, 2, 3, 28, 29, 30, 31]
+        if not (m in {9, 11} and d == 31)
+    ],
+)
+def test_on_offset(offset, date):
+    res = offset.is_on_offset(date)
+    slow_version = date == (date + offset) - offset
+    assert res == slow_version
+
+
+class TestHalfYearBegin:
+    def test_repr(self):
+        expected = "<HalfYearBegin: startingMonth=1>"
+        assert repr(HalfYearBegin()) == expected
+        expected = "<HalfYearBegin: startingMonth=3>"
+        assert repr(HalfYearBegin(startingMonth=3)) == expected
+        expected = "<HalfYearBegin: startingMonth=1>"
+        assert repr(HalfYearBegin(startingMonth=1)) == expected
+
+    def test_offset_corner_case(self):
+        # corner
+        offset = HalfYearBegin(n=-1, startingMonth=1)
+        assert datetime(2010, 2, 1) + offset == datetime(2010, 1, 1)
+
+    offset_cases = []
+    offset_cases.append(
+        (
+            
HalfYearBegin(startingMonth=1), + { + datetime(2007, 12, 1): datetime(2008, 1, 1), + datetime(2008, 1, 1): datetime(2008, 7, 1), + datetime(2008, 2, 15): datetime(2008, 7, 1), + datetime(2008, 2, 29): datetime(2008, 7, 1), + datetime(2008, 3, 15): datetime(2008, 7, 1), + datetime(2008, 3, 31): datetime(2008, 7, 1), + datetime(2008, 4, 15): datetime(2008, 7, 1), + datetime(2008, 4, 1): datetime(2008, 7, 1), + datetime(2008, 7, 1): datetime(2009, 1, 1), + datetime(2008, 7, 15): datetime(2009, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + HalfYearBegin(startingMonth=2), + { + datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2008, 1, 15): datetime(2008, 2, 1), + datetime(2008, 2, 29): datetime(2008, 8, 1), + datetime(2008, 3, 15): datetime(2008, 8, 1), + datetime(2008, 3, 31): datetime(2008, 8, 1), + datetime(2008, 4, 15): datetime(2008, 8, 1), + datetime(2008, 4, 30): datetime(2008, 8, 1), + }, + ) + ) + + offset_cases.append( + ( + HalfYearBegin(startingMonth=1, n=0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 12, 1): datetime(2009, 1, 1), + datetime(2008, 2, 15): datetime(2008, 7, 1), + datetime(2008, 2, 29): datetime(2008, 7, 1), + datetime(2008, 3, 15): datetime(2008, 7, 1), + datetime(2008, 3, 31): datetime(2008, 7, 1), + datetime(2008, 4, 15): datetime(2008, 7, 1), + datetime(2008, 4, 30): datetime(2008, 7, 1), + datetime(2008, 7, 1): datetime(2008, 7, 1), + datetime(2008, 7, 15): datetime(2009, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + HalfYearBegin(startingMonth=1, n=-1), + { + datetime(2008, 1, 1): datetime(2007, 7, 1), + datetime(2008, 1, 31): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 1, 1), + datetime(2008, 2, 29): datetime(2008, 1, 1), + datetime(2008, 3, 15): datetime(2008, 1, 1), + datetime(2008, 3, 31): datetime(2008, 1, 1), + datetime(2008, 4, 15): datetime(2008, 1, 1), + datetime(2008, 4, 30): datetime(2008, 1, 1), + datetime(2008, 7, 1): datetime(2008, 1, 1), + datetime(2008, 7, 15): datetime(2008, 7, 1), + }, + ) + ) + + offset_cases.append( + ( + HalfYearBegin(startingMonth=1, n=2), + { + datetime(2008, 1, 1): datetime(2009, 1, 1), + datetime(2008, 2, 15): datetime(2009, 1, 1), + datetime(2008, 2, 29): datetime(2009, 1, 1), + datetime(2008, 3, 15): datetime(2009, 1, 1), + datetime(2008, 3, 31): datetime(2009, 1, 1), + datetime(2008, 4, 15): datetime(2009, 1, 1), + datetime(2008, 4, 1): datetime(2009, 1, 1), + datetime(2008, 7, 15): datetime(2009, 7, 1), + datetime(2008, 7, 1): datetime(2009, 7, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [ + (HalfYearBegin(1, startingMonth=1), datetime(2008, 1, 1), True), + (HalfYearBegin(1, startingMonth=1), datetime(2007, 12, 1), False), + (HalfYearBegin(1, startingMonth=1), datetime(2008, 2, 1), False), + (HalfYearBegin(1, startingMonth=1), datetime(2007, 3, 1), False), + (HalfYearBegin(1, startingMonth=1), datetime(2008, 4, 1), False), + (HalfYearBegin(1, startingMonth=1), datetime(2008, 5, 1), False), + (HalfYearBegin(1, startingMonth=1), datetime(2007, 6, 1), False), + (HalfYearBegin(1, startingMonth=3), datetime(2008, 1, 1), False), + (HalfYearBegin(1, startingMonth=3), datetime(2007, 12, 1), False), + (HalfYearBegin(1, startingMonth=3), datetime(2008, 2, 1), False), + (HalfYearBegin(1, startingMonth=3), datetime(2007, 3, 1), True), + 
(HalfYearBegin(1, startingMonth=3), datetime(2008, 4, 1), False),
+        (HalfYearBegin(1, startingMonth=3), datetime(2008, 5, 1), False),
+        (HalfYearBegin(1, startingMonth=3), datetime(2008, 5, 2), False),
+        (HalfYearBegin(1, startingMonth=3), datetime(2007, 6, 1), False),
+        (HalfYearBegin(1, startingMonth=3), datetime(2007, 6, 2), False),
+        (HalfYearBegin(1, startingMonth=6), datetime(2008, 1, 1), False),
+        (HalfYearBegin(1, startingMonth=6), datetime(2007, 12, 1), True),
+        (HalfYearBegin(1, startingMonth=6), datetime(2008, 2, 1), False),
+        (HalfYearBegin(1, startingMonth=6), datetime(2007, 3, 1), False),
+        (HalfYearBegin(1, startingMonth=6), datetime(2007, 3, 2), False),
+        (HalfYearBegin(1, startingMonth=6), datetime(2008, 4, 1), False),
+        (HalfYearBegin(1, startingMonth=6), datetime(2008, 5, 1), False),
+        (HalfYearBegin(1, startingMonth=6), datetime(2008, 5, 2), False),
+        (HalfYearBegin(1, startingMonth=6), datetime(2007, 6, 1), True),
+    ]
+
+    @pytest.mark.parametrize("case", on_offset_cases)
+    def test_is_on_offset(self, case):
+        offset, dt, expected = case
+        assert_is_on_offset(offset, dt, expected)
+
+
+class TestHalfYearEnd:
+    def test_repr(self):
+        expected = "<HalfYearEnd: startingMonth=6>"
+        assert repr(HalfYearEnd()) == expected
+        expected = "<HalfYearEnd: startingMonth=3>"
+        assert repr(HalfYearEnd(startingMonth=3)) == expected
+        expected = "<HalfYearEnd: startingMonth=1>"
+        assert repr(HalfYearEnd(startingMonth=1)) == expected
+
+    def test_offset_corner_case(self):
+        # corner
+        offset = HalfYearEnd(n=-1, startingMonth=1)
+        assert datetime(2010, 2, 1) + offset == datetime(2010, 1, 31)
+
+    offset_cases = []
+    offset_cases.append(
+        (
+            HalfYearEnd(startingMonth=1),
+            {
+                datetime(2008, 1, 1): datetime(2008, 1, 31),
+                datetime(2008, 1, 31): datetime(2008, 7, 31),
+                datetime(2008, 2, 15): datetime(2008, 7, 31),
+                datetime(2008, 2, 29): datetime(2008, 7, 31),
+                datetime(2008, 3, 15): datetime(2008, 7, 31),
+                datetime(2008, 3, 31): datetime(2008, 7, 31),
+                datetime(2008, 4, 15): datetime(2008, 7, 31),
+                datetime(2008, 7, 31): datetime(2009, 1, 31),
+            },
+        )
+    )
+
+    offset_cases.append(
+        (
+            HalfYearEnd(startingMonth=2),
+            {
+                datetime(2008, 1, 1): datetime(2008, 2, 29),
+                datetime(2008, 1, 31): datetime(2008, 2, 29),
+                datetime(2008, 2, 15): datetime(2008, 2, 29),
+                datetime(2008, 2, 29): datetime(2008, 8, 31),
+                datetime(2008, 3, 15): datetime(2008, 8, 31),
+                datetime(2008, 3, 31): datetime(2008, 8, 31),
+                datetime(2008, 4, 15): datetime(2008, 8, 31),
+                datetime(2008, 8, 30): datetime(2008, 8, 31),
+                datetime(2008, 8, 31): datetime(2009, 2, 28),
+            },
+        )
+    )
+
+    offset_cases.append(
+        (
+            HalfYearEnd(startingMonth=1, n=0),
+            {
+                datetime(2008, 1, 1): datetime(2008, 1, 31),
+                datetime(2008, 1, 31): datetime(2008, 1, 31),
+                datetime(2008, 2, 15): datetime(2008, 7, 31),
+                datetime(2008, 2, 29): datetime(2008, 7, 31),
+                datetime(2008, 3, 15): datetime(2008, 7, 31),
+                datetime(2008, 3, 31): datetime(2008, 7, 31),
+                datetime(2008, 4, 15): datetime(2008, 7, 31),
+                datetime(2008, 7, 31): datetime(2008, 7, 31),
+            },
+        )
+    )
+
+    offset_cases.append(
+        (
+            HalfYearEnd(startingMonth=1, n=-1),
+            {
+                datetime(2008, 1, 1): datetime(2007, 7, 31),
+                datetime(2008, 1, 31): datetime(2007, 7, 31),
+                datetime(2008, 2, 15): datetime(2008, 1, 31),
+                datetime(2008, 2, 29): datetime(2008, 1, 31),
+                datetime(2008, 3, 15): datetime(2008, 1, 31),
+                datetime(2008, 3, 31): datetime(2008, 1, 31),
+                datetime(2008, 7, 15): datetime(2008, 1, 31),
+                datetime(2008, 7, 30): datetime(2008, 1, 31),
+                datetime(2008, 7, 31): datetime(2008, 1, 31),
+                datetime(2008, 8, 1): datetime(2008, 7, 31),
+            },
+        )
+    )
+
+    
offset_cases.append( + ( + HalfYearEnd(startingMonth=6, n=2), + { + datetime(2008, 1, 31): datetime(2008, 12, 31), + datetime(2008, 2, 15): datetime(2008, 12, 31), + datetime(2008, 2, 29): datetime(2008, 12, 31), + datetime(2008, 3, 15): datetime(2008, 12, 31), + datetime(2008, 3, 31): datetime(2008, 12, 31), + datetime(2008, 4, 15): datetime(2008, 12, 31), + datetime(2008, 4, 30): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2009, 6, 30), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [ + (HalfYearEnd(1, startingMonth=1), datetime(2008, 1, 31), True), + (HalfYearEnd(1, startingMonth=1), datetime(2007, 12, 31), False), + (HalfYearEnd(1, startingMonth=1), datetime(2008, 2, 29), False), + (HalfYearEnd(1, startingMonth=1), datetime(2007, 3, 30), False), + (HalfYearEnd(1, startingMonth=1), datetime(2007, 3, 31), False), + (HalfYearEnd(1, startingMonth=1), datetime(2008, 4, 30), False), + (HalfYearEnd(1, startingMonth=1), datetime(2008, 5, 30), False), + (HalfYearEnd(1, startingMonth=1), datetime(2008, 5, 31), False), + (HalfYearEnd(1, startingMonth=1), datetime(2007, 6, 29), False), + (HalfYearEnd(1, startingMonth=1), datetime(2007, 6, 30), False), + (HalfYearEnd(1, startingMonth=3), datetime(2008, 1, 31), False), + (HalfYearEnd(1, startingMonth=3), datetime(2007, 12, 31), False), + (HalfYearEnd(1, startingMonth=3), datetime(2008, 2, 29), False), + (HalfYearEnd(1, startingMonth=3), datetime(2007, 3, 30), False), + (HalfYearEnd(1, startingMonth=3), datetime(2007, 3, 31), True), + (HalfYearEnd(1, startingMonth=3), datetime(2008, 4, 30), False), + (HalfYearEnd(1, startingMonth=3), datetime(2008, 5, 30), False), + (HalfYearEnd(1, startingMonth=3), datetime(2008, 5, 31), False), + (HalfYearEnd(1, startingMonth=3), datetime(2007, 6, 29), False), + (HalfYearEnd(1, startingMonth=3), datetime(2007, 6, 30), False), + (HalfYearEnd(1, startingMonth=6), datetime(2008, 1, 31), False), + (HalfYearEnd(1, startingMonth=6), datetime(2007, 12, 31), True), + (HalfYearEnd(1, startingMonth=6), datetime(2008, 2, 29), False), + (HalfYearEnd(1, startingMonth=6), datetime(2007, 3, 30), False), + (HalfYearEnd(1, startingMonth=6), datetime(2007, 3, 31), False), + (HalfYearEnd(1, startingMonth=6), datetime(2008, 4, 30), False), + (HalfYearEnd(1, startingMonth=6), datetime(2008, 5, 30), False), + (HalfYearEnd(1, startingMonth=6), datetime(2008, 5, 31), False), + (HalfYearEnd(1, startingMonth=6), datetime(2007, 6, 29), False), + (HalfYearEnd(1, startingMonth=6), datetime(2007, 6, 30), True), + ] + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + offset, dt, expected = case + assert_is_on_offset(offset, dt, expected) diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 7480b99595066..f5c2c06162fcb 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -160,6 +160,10 @@ def expecteds(): "BQuarterBegin": Timestamp("2011-03-01 09:00:00"), "QuarterEnd": Timestamp("2011-03-31 09:00:00"), "BQuarterEnd": Timestamp("2011-03-31 09:00:00"), + "HalfYearBegin": Timestamp("2011-07-01 09:00:00"), + "HalfYearEnd": Timestamp("2011-06-30 09:00:00"), + "BHalfYearBegin": Timestamp("2011-01-03 09:00:00"), + "BHalfYearEnd": Timestamp("2011-06-30 09:00:00"), "BusinessHour": Timestamp("2011-01-03 
10:00:00"), "CustomBusinessHour": Timestamp("2011-01-03 10:00:00"), "WeekOfMonth": Timestamp("2011-01-08 09:00:00"), @@ -325,6 +329,7 @@ def test_rollforward(self, offset_types, expecteds): "MonthBegin", "SemiMonthBegin", "YearBegin", + "HalfYearBegin", "Week", "Hour", "Minute", @@ -351,6 +356,7 @@ def test_rollforward(self, offset_types, expecteds): "MonthBegin": Timestamp("2011-02-01 00:00:00"), "SemiMonthBegin": Timestamp("2011-01-15 00:00:00"), "YearBegin": Timestamp("2012-01-01 00:00:00"), + "HalfYearBegin": Timestamp("2011-07-01 00:00:00"), "Week": Timestamp("2011-01-08 00:00:00"), "Hour": Timestamp("2011-01-01 00:00:00"), "Minute": Timestamp("2011-01-01 00:00:00"), @@ -388,6 +394,10 @@ def test_rollback(self, offset_types): "BQuarterBegin": Timestamp("2010-12-01 09:00:00"), "QuarterEnd": Timestamp("2010-12-31 09:00:00"), "BQuarterEnd": Timestamp("2010-12-31 09:00:00"), + "HalfYearBegin": Timestamp("2010-07-01 09:00:00"), + "HalfYearEnd": Timestamp("2010-12-31 09:00:00"), + "BHalfYearBegin": Timestamp("2010-07-01 09:00:00"), + "BHalfYearEnd": Timestamp("2010-12-31 09:00:00"), "BusinessHour": Timestamp("2010-12-31 17:00:00"), "CustomBusinessHour": Timestamp("2010-12-31 17:00:00"), "WeekOfMonth": Timestamp("2010-12-11 09:00:00"), @@ -403,6 +413,7 @@ def test_rollback(self, offset_types): "MonthBegin", "SemiMonthBegin", "YearBegin", + "HalfYearBegin", "Week", "Hour", "Minute", @@ -425,6 +436,7 @@ def test_rollback(self, offset_types): "MonthBegin": Timestamp("2010-12-01 00:00:00"), "SemiMonthBegin": Timestamp("2010-12-15 00:00:00"), "YearBegin": Timestamp("2010-01-01 00:00:00"), + "HalfYearBegin": Timestamp("2010-07-01 00:00:00"), "Week": Timestamp("2010-12-25 00:00:00"), "Hour": Timestamp("2011-01-01 00:00:00"), "Minute": Timestamp("2011-01-01 00:00:00"), @@ -849,7 +861,20 @@ def test_rule_code(self): "NOV", "DEC", ] - base_lst = ["YE", "YS", "BYE", "BYS", "QE", "QS", "BQE", "BQS"] + base_lst = [ + "YE", + "YS", + "BYE", + "BYS", + "QE", + "QS", + "BQE", + "BQS", + "HYE", + "HYS", + "BHYE", + "BHYS", + ] for base in base_lst: for v in suffix_lst: alias = "-".join([base, v]) @@ -868,7 +893,20 @@ def test_freq_offsets(): class TestReprNames: def test_str_for_named_is_name(self): # look at all the amazing combinations! 
- month_prefixes = ["YE", "YS", "BYE", "BYS", "QE", "BQE", "BQS", "QS"] + month_prefixes = [ + "YE", + "YS", + "BYE", + "BYS", + "QE", + "BQE", + "BQS", + "QS", + "HYE", + "HYS", + "BHYE", + "BHYS", + ] names = [ prefix + "-" + month for prefix in month_prefixes diff --git a/pandas/tests/window/test_cython_aggregations.py b/pandas/tests/window/test_cython_aggregations.py index feb25a294c540..39811ea3ec5b9 100644 --- a/pandas/tests/window/test_cython_aggregations.py +++ b/pandas/tests/window/test_cython_aggregations.py @@ -32,6 +32,7 @@ def _get_rolling_aggregations(): ("roll_min", window_aggregations.roll_min), ("roll_first", window_aggregations.roll_first), ("roll_last", window_aggregations.roll_last), + ("roll_nunique", window_aggregations.roll_nunique), ] + [ ( diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index 39cedc3b692da..2c96ce01c6328 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -255,6 +255,43 @@ def test_rank(window, method, pct, ascending, test_data): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("window", [1, 3, 10, 20]) +@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans", "precision"]) +def test_nunique(window, test_data): + length = 20 + if test_data == "default": + ser = Series(data=np.random.default_rng(2).random(length)) + elif test_data == "duplicates": + ser = Series(data=np.random.default_rng(2).choice(3, length)) + elif test_data == "nans": + ser = Series( + data=np.random.default_rng(2).choice( + [1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length + ) + ) + elif test_data == "precision": + ser = Series( + data=[ + 0.3, + 0.1 * 3, # Not necessarily exactly 0.3 + 0.6, + 0.2 * 3, # Not necessarily exactly 0.6 + 0.9, + 0.3 * 3, # Not necessarily exactly 0.9 + 0.5, + 0.1 * 5, # Not necessarily exactly 0.5 + 0.8, + 0.2 * 4, # Not necessarily exactly 0.8 + ], + dtype=np.float64, + ) + + expected = ser.expanding(window).apply(lambda x: x.nunique()) + result = ser.expanding(window).nunique() + + tm.assert_series_equal(result, expected) + + def test_expanding_corr(series): A = series.dropna() B = (A + np.random.default_rng(2).standard_normal(len(A)))[:-5] diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 392239b8adadd..1dcdad2bfd73d 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -96,6 +96,7 @@ def test_getitem_multiple(self, roll_frame): "count", "kurt", "skew", + "nunique", ], ) def test_rolling(self, f, roll_frame): @@ -1034,7 +1035,19 @@ def frame(self): return DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}) @pytest.mark.parametrize( - "f", ["sum", "mean", "min", "max", "first", "last", "count", "kurt", "skew"] + "f", + [ + "sum", + "mean", + "min", + "max", + "first", + "last", + "count", + "kurt", + "skew", + "nunique", + ], ) def test_expanding(self, f, frame): g = frame.groupby("A", group_keys=False) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 2aaa35ec5ec2c..8c57781c1447c 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1586,6 +1586,43 @@ def test_rank(window, method, pct, ascending, test_data): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("window", [1, 3, 10, 20]) +@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans", "precision"]) +def test_nunique(window, test_data): + length = 20 + if 
test_data == "default": + ser = Series(data=np.random.default_rng(2).random(length)) + elif test_data == "duplicates": + ser = Series(data=np.random.default_rng(2).choice(3, length)) + elif test_data == "nans": + ser = Series( + data=np.random.default_rng(2).choice( + [1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length + ) + ) + elif test_data == "precision": + ser = Series( + data=[ + 0.3, + 0.1 * 3, # Not necessarily exactly 0.3 + 0.6, + 0.2 * 3, # Not necessarily exactly 0.6 + 0.9, + 0.3 * 3, # Not necessarily exactly 0.9 + 0.5, + 0.1 * 5, # Not necessarily exactly 0.5 + 0.8, + 0.2 * 4, # Not necessarily exactly 0.8 + ], + dtype=np.float64, + ) + + expected = ser.rolling(window).apply(lambda x: x.nunique()) + result = ser.rolling(window).nunique() + + tm.assert_series_equal(result, expected) + + def test_rolling_quantile_np_percentile(): # #9413: Tests that rolling window's quantile default behavior # is analogous to Numpy's percentile diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index a065137e6971c..1f0c4281ffc77 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -4,6 +4,8 @@ FY5253, BaseOffset, BDay, + BHalfYearBegin, + BHalfYearEnd, BMonthBegin, BMonthEnd, BQuarterBegin, @@ -25,6 +27,8 @@ Day, Easter, FY5253Quarter, + HalfYearBegin, + HalfYearEnd, Hour, LastWeekOfMonth, Micro, @@ -48,6 +52,8 @@ __all__ = [ "FY5253", "BDay", + "BHalfYearBegin", + "BHalfYearEnd", "BMonthBegin", "BMonthEnd", "BQuarterBegin", @@ -70,6 +76,8 @@ "Day", "Easter", "FY5253Quarter", + "HalfYearBegin", + "HalfYearEnd", "Hour", "LastWeekOfMonth", "Micro", diff --git a/requirements-dev.txt b/requirements-dev.txt index fb4d9cdb589ca..990901958cd9e 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -85,6 +85,8 @@ feedparser pyyaml requests pygments +jupyterlite-core +jupyterlite-pyodide-kernel adbc-driver-postgresql>=0.10.0 adbc-driver-sqlite>=0.8.0 typing_extensions; python_version<"3.11" diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index 44318cd797163..73a90f4fca0f6 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -92,6 +92,11 @@ "BYearBegin", "BYearEnd", "YearOffset", + "HalfYearBegin", + "HalfYearEnd", + "BHalfYearBegin", + "BHalfYearEnd", + "HalfYearOffset", "QuarterBegin", "QuarterEnd", "BQuarterBegin", diff --git a/web/interactive_terminal/README.md b/web/interactive_terminal/README.md new file mode 100644 index 0000000000000..6457cbccf2016 --- /dev/null +++ b/web/interactive_terminal/README.md @@ -0,0 +1,38 @@ +# The interactive `pandas` REPL + +An interactive REPL to easily try `pandas` in the browser, powered by JupyterLite. + +![image](https://user-images.githubusercontent.com/591645/175000291-e8c69f6f-5f2c-48d7-817c-cff05ab2cde9.png) + +## Build + +The interactive REPL is built with the `jupyter lite` CLI. 
+
+First make sure `jupyterlite` and a kernel are installed:
+
+```bash
+python -m pip install jupyterlite-core
+python -m pip install jupyterlite-pyodide-kernel
+```
+
+Then in `web/interactive_terminal`, run the following command:
+
+```bash
+jupyter lite build
+```
+
+## Configuration
+
+This folder contains configuration files for the interactive terminal powered by JupyterLite:
+
+- `jupyter_lite_config.json`: build-time configuration, used when building the assets with the `jupyter lite build` command
+- `jupyter-lite.json`: run-time configuration, applied when launching the application in the browser
+
+This interactive `pandas` JupyterLite deployment enables two optimizations: it includes only the `repl` app in the generated static assets, and it disables source maps. This makes the assets smaller and faster to load, at the cost of debugging capabilities.
+
+To learn more, check out the JupyterLite documentation:
+
+- Optimizations: https://jupyterlite.readthedocs.io/en/latest/howto/configure/advanced/optimizations.html
+- JupyterLite schema: https://jupyterlite.readthedocs.io/en/latest/reference/schema-v0.html
+- CLI reference: https://jupyterlite.readthedocs.io/en/latest/reference/cli.html
diff --git a/web/interactive_terminal/jupyter-lite.json b/web/interactive_terminal/jupyter-lite.json
new file mode 100644
index 0000000000000..2199acf1d368a
--- /dev/null
+++ b/web/interactive_terminal/jupyter-lite.json
@@ -0,0 +1,10 @@
+{
+  "jupyter-lite-schema-version": 0,
+  "jupyter-config-data": {
+    "appName": "Pandas REPL",
+    "appUrl": "./repl",
+    "enableMemoryStorage": true,
+    "settingsStorageDrivers": ["memoryStorageDriver"],
+    "contentsStorageDrivers": ["memoryStorageDriver"]
+  }
+}
diff --git a/web/interactive_terminal/jupyter_lite_config.json b/web/interactive_terminal/jupyter_lite_config.json
new file mode 100644
index 0000000000000..42e64f26f2356
--- /dev/null
+++ b/web/interactive_terminal/jupyter_lite_config.json
@@ -0,0 +1,8 @@
+{
+  "LiteBuildConfig": {
+    "apps": ["repl"],
+    "no_unused_shared_packages": true,
+    "output_dir": "../build/lite",
+    "no_sourcemaps": true
+  }
+}
diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md
index 2ad8d6243db55..dc663654be985 100644
--- a/web/pandas/community/ecosystem.md
+++ b/web/pandas/community/ecosystem.md
@@ -590,7 +590,7 @@ df = pd.read_csv("big.csv")  # use all your cores!
 ### [Pandarallel](https://github.com/nalepae/pandarallel)
 
 Pandarallel provides a simple way to parallelize your pandas operations on all your CPUs by changing only one line of code.
-If also displays progress bars.
+It also displays progress bars.
 
 ```python
 from pandarallel import pandarallel
diff --git a/web/pandas/getting_started.md b/web/pandas/getting_started.md
index 801081a9ef391..c556eda57ac31 100644
--- a/web/pandas/getting_started.md
+++ b/web/pandas/getting_started.md
@@ -22,11 +22,23 @@ by [Wes McKinney](https://wesmckinney.com/), creator of pandas.
 
 ## Videos
 
-[video embed markup stripped in extraction]
+[updated video embed markup stripped in extraction]
 
 ## Cheat sheet
 
 [pandas cheat sheet](https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf)
+
+## Try pandas in your browser (experimental)
+
+You can try pandas in your browser with the following interactive shell
+without needing to install anything on your system.
+
+["Try it in your browser" button markup stripped in extraction]
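Editor's note: the test additions above pin down the new `nunique` window aggregation only through parametrized comparisons against `apply`, so here is a minimal, hedged sketch of the public API they exercise; the data values are invented for illustration:

```python
import numpy as np
import pandas as pd

ser = pd.Series([1.0, 1.0, 2.0, 2.0, 3.0, np.nan])

# Cython-backed rolling nunique (dispatching to window_aggregations.roll_nunique).
result = ser.rolling(3, min_periods=1).nunique()

# The slower formulation the tests above compare against; since Series.nunique()
# drops NaN by default, NaN is not counted as a distinct value here either.
expected = ser.rolling(3, min_periods=1).apply(lambda x: x.nunique())

assert result.equals(expected)

# The same aggregation is available on expanding windows:
ser.expanding(2).nunique()
```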
diff --git a/web/pandas/index.html b/web/pandas/index.html
index e8aab9e11144c..4df4d73fb64ec 100644
--- a/web/pandas/index.html
+++ b/web/pandas/index.html
@@ -18,9 +18,9 @@
[hunk body stripped in extraction; only the text "pandas" survives from the changed HTML markup]
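Editor's note: for the `HalfYear*` offsets exported in `pandas/tseries/offsets.py` above, here is a short sketch of likely usage. The `HYE-DEC` alias spelling comes straight from the `month_prefixes` test at the top of this section, while the `startingMonth` keyword is an assumption borrowed from the quarterly offsets' signature:

```python
import pandas as pd
from pandas.tseries.offsets import BHalfYearEnd, HalfYearEnd

# Half-year period ends anchored on December (and, implicitly, June),
# assuming the HYE-<MONTH> aliases behave like their QE-<MONTH> counterparts.
idx = pd.date_range("2024-01-01", periods=4, freq="HYE-DEC")

# Offsets can also be applied directly to timestamps; startingMonth is an
# assumption mirroring QuarterEnd's constructor.
ts = pd.Timestamp("2024-02-15") + HalfYearEnd(startingMonth=6)

# The B-prefixed variants presumably anchor on business days, as with BQuarterEnd.
bts = pd.Timestamp("2024-02-15") + BHalfYearEnd(startingMonth=6)
```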
diff --git a/web/pandas/try.md b/web/pandas/try.md
new file mode 100644
index 0000000000000..ee2f98b05aa64
--- /dev/null
+++ b/web/pandas/try.md
@@ -0,0 +1,12 @@
+# Try pandas in your browser (experimental)
+
+Try our experimental [JupyterLite](https://jupyterlite.readthedocs.io/en/stable/) live shell with `pandas`, powered by [Pyodide](https://pyodide.org/en/stable/).
+
+**Please note that it can take a while (>30 seconds) before the shell is initialized and ready to run commands.**
+
+**Running it requires a reasonable amount of bandwidth and resources (>70 MiB on the first load), so it may not work properly on all devices or networks.**
+
+[embedded JupyterLite REPL iframe markup stripped in extraction]
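Editor's note: finally, the `test_groupby.py` changes show `nunique` dispatching through grouped windows as well; a small sketch under the same caveats (invented data, public API only):

```python
import pandas as pd

df = pd.DataFrame({"A": [1, 1, 1, 2, 2], "B": [10, 10, 20, 30, 30]})

# Per-group rolling and expanding distinct counts over column "B",
# matching the aggregations parametrized in test_groupby.py above.
rolling_counts = df.groupby("A").rolling(2).nunique()
expanding_counts = df.groupby("A").expanding().nunique()
```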