diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 56f9c93ac63a5..93e7667ccec58 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -96,6 +96,6 @@ Other enhancements - :meth:`pd.concat` now raises when ``levels`` is given but ``keys`` is None (:issue:`46653`) - :meth:`pd.concat` now raises when ``levels`` contains duplicate values (:issue:`46653`) - Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, and :meth:`DataFrame.cov` (:issue:`46560`) - Added ``validate`` argument to :meth:`DataFrame.join` (:issue:`46622`) -- A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`) +- A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`, :issue:`46725`) - Added ``numeric_only`` argument to :meth:`Resampler.sum`, :meth:`Resampler.prod`, :meth:`Resampler.min`, :meth:`Resampler.max`, :meth:`Resampler.first`, and :meth:`Resampler.last` (:issue:`46442`) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 0a62ee956be61..96f37bd47e10c 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -64,7 +64,10 @@ rands_array, randu_array, ) -from pandas._testing._warnings import assert_produces_warning # noqa:F401 +from pandas._testing._warnings import ( # noqa:F401 + assert_produces_warning, + maybe_produces_warning, +) from pandas._testing.asserters import ( # noqa:F401 assert_almost_equal, assert_attr_equal, diff --git a/pandas/_testing/_warnings.py b/pandas/_testing/_warnings.py index f9443f80e585c..9e89e09e418b3 100644 --- a/pandas/_testing/_warnings.py +++ b/pandas/_testing/_warnings.py @@ -1,6 +1,9 @@ from __future__ import annotations -from contextlib import contextmanager +from contextlib import ( + contextmanager, 
+ nullcontext, +) import re import sys from typing import ( @@ -97,6 +100,16 @@ class for all warnings. To check that no warning is returned, ) +def maybe_produces_warning(warning: type[Warning], condition: bool, **kwargs): + """ + Return a context manager that possibly checks a warning based on the condition + """ + if condition: + return assert_produces_warning(warning, **kwargs) + else: + return nullcontext() + + def _assert_caught_expected_warning( *, caught_warnings: Sequence[warnings.WarningMessage], diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 5078d87bc91c7..bc8948cc8aee1 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -23,6 +23,8 @@ pa_version_under3p0, pa_version_under4p0, pa_version_under5p0, + pa_version_under6p0, + pa_version_under7p0, ) PY39 = sys.version_info >= (3, 9) @@ -150,4 +152,6 @@ def get_lzma_file(): "pa_version_under3p0", "pa_version_under4p0", "pa_version_under5p0", + "pa_version_under6p0", + "pa_version_under7p0", ] diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 0a48638f5cf05..fdd505e259dd9 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -16,6 +16,7 @@ pa_version_under1p01, pa_version_under2p0, pa_version_under5p0, + pa_version_under6p0, ) from pandas.util._decorators import doc @@ -37,6 +38,8 @@ import pyarrow as pa import pyarrow.compute as pc + from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning + if TYPE_CHECKING: from pandas import Series @@ -104,6 +107,20 @@ def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: """ return type(self)(self._data) + def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: + """ + Return ArrowExtensionArray without NA values. 
+ + Returns + ------- + ArrowExtensionArray + """ + if pa_version_under6p0: + fallback_performancewarning(version="6") + return super().dropna() + else: + return type(self)(pc.drop_null(self._data)) + @doc(ExtensionArray.factorize) def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: encoded = self._data.dictionary_encode() @@ -219,6 +236,20 @@ def take( indices_array[indices_array < 0] += len(self._data) return type(self)(self._data.take(indices_array)) + def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: + """ + Compute the ArrowExtensionArray of unique values. + + Returns + ------- + ArrowExtensionArray + """ + if pa_version_under2p0: + fallback_performancewarning(version="2") + return super().unique() + else: + return type(self)(pc.unique(self._data)) + def value_counts(self, dropna: bool = True) -> Series: """ Return a Series containing counts of each unique value. diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index c21319f6de6ef..5442f96ab2d22 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -2,8 +2,6 @@ This module tests the functionality of StringArray and ArrowStringArray. 
Tests for the str accessors are in pandas/tests/strings/test_string_array.py """ -from contextlib import nullcontext - import numpy as np import pytest @@ -18,13 +16,6 @@ from pandas.core.arrays.string_arrow import ArrowStringArray -def maybe_perf_warn(using_pyarrow): - if using_pyarrow: - return tm.assert_produces_warning(PerformanceWarning, match="Falling back") - else: - return nullcontext() - - @pytest.fixture def dtype(string_storage): return pd.StringDtype(storage=string_storage) @@ -568,22 +559,30 @@ def test_to_numpy_na_value(dtype, nulls_fixture): def test_isin(dtype, fixed_now_ts): s = pd.Series(["a", "b", None], dtype=dtype) - with maybe_perf_warn(dtype == "pyarrow" and pa_version_under2p0): + with tm.maybe_produces_warning( + PerformanceWarning, dtype == "pyarrow" and pa_version_under2p0 + ): result = s.isin(["a", "c"]) expected = pd.Series([True, False, False]) tm.assert_series_equal(result, expected) - with maybe_perf_warn(dtype == "pyarrow" and pa_version_under2p0): + with tm.maybe_produces_warning( + PerformanceWarning, dtype == "pyarrow" and pa_version_under2p0 + ): result = s.isin(["a", pd.NA]) expected = pd.Series([True, False, True]) tm.assert_series_equal(result, expected) - with maybe_perf_warn(dtype == "pyarrow" and pa_version_under2p0): + with tm.maybe_produces_warning( + PerformanceWarning, dtype == "pyarrow" and pa_version_under2p0 + ): result = s.isin([]) expected = pd.Series([False, False, False]) tm.assert_series_equal(result, expected) - with maybe_perf_warn(dtype == "pyarrow" and pa_version_under2p0): + with tm.maybe_produces_warning( + PerformanceWarning, dtype == "pyarrow" and pa_version_under2p0 + ): result = s.isin(["a", fixed_now_ts]) expected = pd.Series([True, False, False]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index 66cc000b9f458..eac1e35699585 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -1,6 +1,9 @@ 
import numpy as np import pytest +from pandas.compat import pa_version_under2p0 +from pandas.errors import PerformanceWarning + from pandas.core.dtypes.common import is_datetime64tz_dtype import pandas as pd @@ -12,7 +15,11 @@ def test_unique(index_or_series_obj): obj = index_or_series_obj obj = np.repeat(obj, range(1, len(obj) + 1)) - result = obj.unique() + with tm.maybe_produces_warning( + PerformanceWarning, + pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", + ): + result = obj.unique() # dict.fromkeys preserves the order unique_values = list(dict.fromkeys(obj.values)) @@ -50,7 +57,11 @@ def test_unique_null(null_obj, index_or_series_obj): klass = type(obj) repeated_values = np.repeat(values, range(1, len(values) + 1)) obj = klass(repeated_values, dtype=obj.dtype) - result = obj.unique() + with tm.maybe_produces_warning( + PerformanceWarning, + pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", + ): + result = obj.unique() unique_values_raw = dict.fromkeys(obj.values) # because np.nan == np.nan is False, but None == None is True @@ -75,7 +86,11 @@ def test_unique_null(null_obj, index_or_series_obj): def test_nunique(index_or_series_obj): obj = index_or_series_obj obj = np.repeat(obj, range(1, len(obj) + 1)) - expected = len(obj.unique()) + with tm.maybe_produces_warning( + PerformanceWarning, + pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", + ): + expected = len(obj.unique()) assert obj.nunique(dropna=False) == expected @@ -99,9 +114,21 @@ def test_nunique_null(null_obj, index_or_series_obj): assert obj.nunique() == len(obj.categories) assert obj.nunique(dropna=False) == len(obj.categories) + 1 else: - num_unique_values = len(obj.unique()) - assert obj.nunique() == max(0, num_unique_values - 1) - assert obj.nunique(dropna=False) == max(0, num_unique_values) + with tm.maybe_produces_warning( + PerformanceWarning, + pa_version_under2p0 and str(index_or_series_obj.dtype) == 
"string[pyarrow]", + ): + num_unique_values = len(obj.unique()) + with tm.maybe_produces_warning( + PerformanceWarning, + pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", + ): + assert obj.nunique() == max(0, num_unique_values - 1) + with tm.maybe_produces_warning( + PerformanceWarning, + pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", + ): + assert obj.nunique(dropna=False) == max(0, num_unique_values) @pytest.mark.single_cpu diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index a4c22e016581d..8a8bdee90e467 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -18,7 +18,11 @@ import numpy as np import pytest +from pandas.compat import pa_version_under6p0 +from pandas.errors import PerformanceWarning + import pandas as pd +import pandas._testing as tm from pandas.core.arrays import ArrowStringArray from pandas.core.arrays.string_ import StringDtype from pandas.tests.extension import base @@ -139,7 +143,14 @@ class TestIndex(base.BaseIndexTests): class TestMissing(base.BaseMissingTests): - pass + def test_dropna_array(self, data_missing): + with tm.maybe_produces_warning( + PerformanceWarning, + pa_version_under6p0 and data_missing.dtype.storage == "pyarrow", + ): + result = data_missing.dropna() + expected = data_missing[[1]] + self.assert_extension_array_equal(result, expected) class TestNoReduce(base.BaseNoReduceTests): diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 05c66191ca3a2..d582a469eaf0e 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -8,7 +8,10 @@ import numpy as np import pytest -from pandas.compat import IS64 +from pandas.compat import ( + IS64, + pa_version_under2p0, +) from pandas.core.dtypes.common import is_integer_dtype @@ -395,7 +398,10 @@ def test_astype_preserves_name(self, index, dtype): try: # Some of these 
conversions cannot succeed so we use a try / except - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning( + warn, + raise_on_extra_warnings=not pa_version_under2p0, + ): result = index.astype(dtype) except (ValueError, TypeError, NotImplementedError, SystemError): return diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index c1d96ca7993e1..a6e51cc2f98d6 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -1,4 +1,3 @@ -from contextlib import nullcontext from datetime import datetime import re @@ -19,13 +18,6 @@ # -------------------------------------------------------------------------------------- -def maybe_perf_warn(using_pyarrow): - if using_pyarrow: - return tm.assert_produces_warning(PerformanceWarning, match="Falling back") - else: - return nullcontext() - - def test_contains(any_string_dtype): values = np.array( ["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_ @@ -52,7 +44,10 @@ def test_contains(any_string_dtype): np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=object), dtype=any_string_dtype, ) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = values.str.contains(pat) expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) @@ -63,7 +58,9 @@ def test_contains(any_string_dtype): np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object), dtype=any_string_dtype, ) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = values.str.contains("FOO|mmm", case=False) expected = Series(np.array([True, False, True, True]), 
dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -80,7 +77,10 @@ def test_contains(any_string_dtype): ) pat = "mmm[_]+" - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = values.str.contains(pat) expected_dtype = "object" if any_string_dtype == "object" else "boolean" expected = Series( @@ -97,7 +97,10 @@ def test_contains(any_string_dtype): np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=np.object_), dtype=any_string_dtype, ) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = values.str.contains(pat) expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -162,8 +165,9 @@ def test_contains_na_kwarg_for_nullable_string_dtype( # https://github.com/pandas-dev/pandas/pull/41025#issuecomment-824062416 values = Series(["a", "b", "c", "a", np.nan], dtype=nullable_string_dtype) - with maybe_perf_warn( - nullable_string_dtype == "string[pyarrow]" and pa_version_under4p0 and regex + with tm.maybe_produces_warning( + PerformanceWarning, + nullable_string_dtype == "string[pyarrow]" and pa_version_under4p0 and regex, ): result = values.str.contains("a", na=na, regex=regex) expected = Series([True, False, False, True, expected], dtype="boolean") @@ -185,7 +189,9 @@ def test_contains_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = s.str.contains("a", case=False) expected = Series( [True, False, False, True, True, False, np.nan, True, False, True], @@ -193,7 +199,10 @@ 
def test_contains_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = s.str.contains("Aa") expected = Series( [False, False, False, True, False, False, np.nan, False, False, False], @@ -201,7 +210,10 @@ def test_contains_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = s.str.contains("ba") expected = Series( [False, False, False, True, False, False, np.nan, False, False, False], @@ -209,7 +221,9 @@ def test_contains_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = s.str.contains("ba", case=False) expected = Series( [False, False, False, True, True, False, np.nan, True, False, False], @@ -222,18 +236,27 @@ def test_contains_nan(any_string_dtype): # PR #14171 s = Series([np.nan, np.nan, np.nan], dtype=any_string_dtype) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = s.str.contains("foo", na=False) expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" expected = Series([False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == 
"string[pyarrow]" and pa_version_under4p0, + ): result = s.str.contains("foo", na=True) expected = Series([True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = s.str.contains("foo", na="foo") if any_string_dtype == "object": expected = Series(["foo", "foo", "foo"], dtype=np.object_) @@ -241,7 +264,10 @@ def test_contains_nan(any_string_dtype): expected = Series([True, True, True], dtype="boolean") tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = s.str.contains("foo") expected_dtype = "object" if any_string_dtype == "object" else "boolean" expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype) @@ -287,8 +313,9 @@ def test_startswith_nullable_string_dtype(nullable_string_dtype, na): ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."], dtype=nullable_string_dtype, ) - with maybe_perf_warn( - nullable_string_dtype == "string[pyarrow]" and pa_version_under4p0 + with tm.maybe_produces_warning( + PerformanceWarning, + nullable_string_dtype == "string[pyarrow]" and pa_version_under4p0, ): result = values.str.startswith("foo", na=na) exp = Series( @@ -296,8 +323,9 @@ def test_startswith_nullable_string_dtype(nullable_string_dtype, na): ) tm.assert_series_equal(result, exp) - with maybe_perf_warn( - nullable_string_dtype == "string[pyarrow]" and pa_version_under4p0 + with tm.maybe_produces_warning( + PerformanceWarning, + nullable_string_dtype == "string[pyarrow]" and pa_version_under4p0, ): result = values.str.startswith("rege.", na=na) exp = Series( @@ -345,8 +373,9 @@ def 
test_endswith_nullable_string_dtype(nullable_string_dtype, na): ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."], dtype=nullable_string_dtype, ) - with maybe_perf_warn( - nullable_string_dtype == "string[pyarrow]" and pa_version_under4p0 + with tm.maybe_produces_warning( + PerformanceWarning, + nullable_string_dtype == "string[pyarrow]" and pa_version_under4p0, ): result = values.str.endswith("foo", na=na) exp = Series( @@ -354,8 +383,9 @@ def test_endswith_nullable_string_dtype(nullable_string_dtype, na): ) tm.assert_series_equal(result, exp) - with maybe_perf_warn( - nullable_string_dtype == "string[pyarrow]" and pa_version_under4p0 + with tm.maybe_produces_warning( + PerformanceWarning, + nullable_string_dtype == "string[pyarrow]" and pa_version_under4p0, ): result = values.str.endswith("rege.", na=na) exp = Series( @@ -372,7 +402,10 @@ def test_endswith_nullable_string_dtype(nullable_string_dtype, na): def test_replace(any_string_dtype): ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = ser.str.replace("BAD[_]*", "", regex=True) expected = Series(["foobar", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -382,12 +415,18 @@ def test_replace_max_replacements(any_string_dtype): ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = ser.str.replace("BAD[_]*", "", n=1, regex=True) tm.assert_series_equal(result, expected) expected = Series(["foo__barBAD", np.nan], dtype=any_string_dtype) - 
with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = ser.str.replace("BAD", "", n=1, regex=False) tm.assert_series_equal(result, expected) @@ -404,7 +443,9 @@ def test_replace_mixed_object(): def test_replace_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) tm.assert_series_equal(result, expected) @@ -425,7 +466,9 @@ def test_replace_callable(any_string_dtype): # test with callable repl = lambda m: m.group(0).swapcase() - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -444,7 +487,9 @@ def test_replace_callable_raises(any_string_dtype, repl): r"(?(3)required )positional arguments?" 
) with pytest.raises(TypeError, match=msg): - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): values.str.replace("a", repl) @@ -453,7 +498,9 @@ def test_replace_callable_named_groups(any_string_dtype): ser = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype) pat = r"(?P\w+) (?P\w+) (?P\w+)" repl = lambda m: m.group("middle").swapcase() - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = ser.str.replace(pat, repl, regex=True) expected = Series(["bAR", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -465,12 +512,16 @@ def test_replace_compiled_regex(any_string_dtype): # test with compiled regex pat = re.compile(r"BAD_*") - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = ser.str.replace(pat, "", regex=True) expected = Series(["foobar", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = ser.str.replace(pat, "", n=1, regex=True) expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -490,7 +541,9 @@ def test_replace_compiled_regex_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = ser.str.replace(pat, 
", ") tm.assert_series_equal(result, expected) @@ -518,7 +571,9 @@ def test_replace_compiled_regex_callable(any_string_dtype): ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) repl = lambda m: m.group(0).swapcase() pat = re.compile("[a-z][A-Z]{2}") - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = ser.str.replace(pat, repl, n=2) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -531,7 +586,10 @@ def test_replace_literal(regex, expected, any_string_dtype): # GH16808 literal replace (regex=False vs regex=True) ser = Series(["f.o", "foo", np.nan], dtype=any_string_dtype) expected = Series(expected, dtype=any_string_dtype) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = ser.str.replace("f.", "ba", regex=regex) tm.assert_series_equal(result, expected) @@ -568,7 +626,9 @@ def test_replace_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = ser.str.replace("A", "YYY", case=False) expected = Series( [ @@ -587,7 +647,9 @@ def test_replace_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) expected = Series( [ @@ -611,12 +673,16 @@ def test_replace_not_case_sensitive_not_regex(any_string_dtype): # https://github.com/pandas-dev/pandas/issues/41602 ser = Series(["A.", "a.", "Ab", 
"ab", np.nan], dtype=any_string_dtype) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = ser.str.replace("a", "c", case=False, regex=False) expected = Series(["c.", "c.", "cb", "cb", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = ser.str.replace("a.", "c.", case=False, regex=False) expected = Series(["c.", "c.", "Ab", "ab", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -677,7 +743,10 @@ def test_match(any_string_dtype): expected_dtype = "object" if any_string_dtype == "object" else "boolean" values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = values.str.match(".*(BAD[_]+).*(BAD)") expected = Series([True, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -685,12 +754,18 @@ def test_match(any_string_dtype): values = Series( ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = values.str.match(".*BAD[_]+.*BAD") expected = Series([True, True, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and 
pa_version_under4p0, + ): result = values.str.match("BAD[_]+.*BAD") expected = Series([False, True, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -698,12 +773,18 @@ def test_match(any_string_dtype): values = Series( ["fooBAD__barBAD", "^BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = values.str.match("^BAD[_]+.*BAD") expected = Series([False, False, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = values.str.match("\\^BAD[_]+.*BAD") expected = Series([False, True, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -735,13 +816,19 @@ def test_match_na_kwarg(any_string_dtype): # GH #6609 s = Series(["a", "b", np.nan], dtype=any_string_dtype) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = s.str.match("a", na=False) expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" expected = Series([True, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = s.str.match("a") expected_dtype = "object" if any_string_dtype == "object" else "boolean" expected = Series([True, False, np.nan], 
dtype=expected_dtype) @@ -750,7 +837,9 @@ def test_match_na_kwarg(any_string_dtype): def test_match_case_kwarg(any_string_dtype): values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = values.str.match("ab", case=False) expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" expected = Series([True, True, True, True], dtype=expected_dtype) @@ -767,7 +856,10 @@ def test_fullmatch(any_string_dtype): ser = Series( ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = ser.str.fullmatch(".*BAD[_]+.*BAD") expected_dtype = "object" if any_string_dtype == "object" else "boolean" expected = Series([True, False, np.nan, False], dtype=expected_dtype) @@ -778,7 +870,10 @@ def test_fullmatch_na_kwarg(any_string_dtype): ser = Series( ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = ser.str.fullmatch(".*BAD[_]+.*BAD", na=False) expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" expected = Series([True, False, False, False], dtype=expected_dtype) @@ -791,17 +886,24 @@ def test_fullmatch_case_kwarg(any_string_dtype): expected = Series([True, False, False, False], dtype=expected_dtype) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and 
pa_version_under4p0, + ): result = ser.str.fullmatch("ab", case=True) tm.assert_series_equal(result, expected) expected = Series([True, True, False, False], dtype=expected_dtype) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = ser.str.fullmatch("ab", case=False) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = ser.str.fullmatch("ab", flags=re.IGNORECASE) tm.assert_series_equal(result, expected) @@ -984,11 +1086,11 @@ def test_flags_kwarg(any_string_dtype): result = data.str.extract(pat, flags=re.IGNORECASE, expand=True) assert result.iloc[0].tolist() == ["dave", "google", "com"] - with maybe_perf_warn(using_pyarrow): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow): result = data.str.match(pat, flags=re.IGNORECASE) assert result[0] - with maybe_perf_warn(using_pyarrow): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow): result = data.str.fullmatch(pat, flags=re.IGNORECASE) assert result[0] diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 6d6d69280b9dd..db99ba8368a8a 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -1,4 +1,3 @@ -from contextlib import nullcontext from datetime import ( datetime, timedelta, @@ -23,13 +22,6 @@ import pandas._testing as tm -def maybe_perf_warn(using_pyarrow): - if using_pyarrow: - return tm.assert_produces_warning(PerformanceWarning, match="Falling back") - else: - return nullcontext() - - @pytest.mark.parametrize("pattern", [0, True, Series(["foo", "bar"])]) def test_startswith_endswith_non_str_patterns(pattern): # GH3485 @@ -190,18 +182,33 @@ def test_empty_str_methods(any_string_dtype): assert "" == empty.str.cat() 
tm.assert_series_equal(empty_str, empty.str.title()) tm.assert_series_equal(empty_int, empty.str.count("a")) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): tm.assert_series_equal(empty_bool, empty.str.contains("a")) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): tm.assert_series_equal(empty_bool, empty.str.startswith("a")) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): tm.assert_series_equal(empty_bool, empty.str.endswith("a")) tm.assert_series_equal(empty_str, empty.str.lower()) tm.assert_series_equal(empty_str, empty.str.upper()) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): tm.assert_series_equal(empty_str, empty.str.replace("a", "b")) tm.assert_series_equal(empty_str, empty.str.repeat(3)) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): tm.assert_series_equal(empty_bool, empty.str.match("^a")) tm.assert_frame_equal( DataFrame(columns=[0], dtype=any_string_dtype), @@ -218,7 +225,10 @@ def test_empty_str_methods(any_string_dtype): ) tm.assert_frame_equal(empty_df, empty.str.get_dummies()) tm.assert_series_equal(empty_str, empty_str.str.join("")) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with 
tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): tm.assert_series_equal(empty_int, empty.str.len()) tm.assert_series_equal(empty_object, empty_str.str.findall("a")) tm.assert_series_equal(empty_int, empty.str.find("a")) @@ -233,11 +243,20 @@ def test_empty_str_methods(any_string_dtype): tm.assert_frame_equal(empty_df, empty.str.rpartition("a")) tm.assert_series_equal(empty_str, empty.str.slice(stop=1)) tm.assert_series_equal(empty_str, empty.str.slice(step=1)) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): tm.assert_series_equal(empty_str, empty.str.strip()) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): tm.assert_series_equal(empty_str, empty.str.lstrip()) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): tm.assert_series_equal(empty_str, empty.str.rstrip()) tm.assert_series_equal(empty_str, empty.str.wrap(42)) tm.assert_series_equal(empty_str, empty.str.get(0)) @@ -247,7 +266,10 @@ def test_empty_str_methods(any_string_dtype): tm.assert_series_equal(empty_bool, empty.str.isalnum()) tm.assert_series_equal(empty_bool, empty.str.isalpha()) tm.assert_series_equal(empty_bool, empty.str.isdigit()) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under2p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under2p0, + ): tm.assert_series_equal(empty_bool, empty.str.isspace()) tm.assert_series_equal(empty_bool, 
empty.str.islower()) tm.assert_series_equal(empty_bool, empty.str.isupper()) @@ -299,10 +321,11 @@ def test_ismethods(method, expected, any_string_dtype): ) expected_dtype = "bool" if any_string_dtype == "object" else "boolean" expected = Series(expected, dtype=expected_dtype) - with maybe_perf_warn( + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" and pa_version_under2p0 - and method == "isspace" + and method == "isspace", ): result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -374,7 +397,10 @@ def test_len(any_string_dtype): ["foo", "fooo", "fooooo", np.nan, "fooooooo", "foo\n", "あ"], dtype=any_string_dtype, ) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = ser.str.len() expected_dtype = "float64" if any_string_dtype == "object" else "Int64" expected = Series([3, 4, 6, np.nan, 8, 4, 1], dtype=expected_dtype) @@ -462,7 +488,10 @@ def test_pipe_failures(any_string_dtype): expected = Series([["A", "B", "C"]], dtype=object) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = ser.str.replace("|", " ", regex=False) expected = Series(["A B C"], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -565,7 +594,10 @@ def test_strip_lstrip_rstrip_mixed_object(method, exp): def test_strip_lstrip_rstrip_args(any_string_dtype, method, exp): ser = Series(["xxABCxx", "xx BNSD", "LDFJH xx"], dtype=any_string_dtype) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and 
pa_version_under4p0, + ): result = getattr(ser.str, method)("x") expected = Series(exp, dtype=any_string_dtype) tm.assert_series_equal(result, expected)