From 4c010aa34408c41fa486f5781f3371df1ca5d020 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sat, 9 Apr 2022 20:06:18 -0700 Subject: [PATCH 01/10] ENH: Use pyarrow.compute for unique, dropna --- pandas/compat/__init__.py | 4 ++++ pandas/core/arrays/arrow/array.py | 27 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 5078d87bc91c7..bc8948cc8aee1 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -23,6 +23,8 @@ pa_version_under3p0, pa_version_under4p0, pa_version_under5p0, + pa_version_under6p0, + pa_version_under7p0, ) PY39 = sys.version_info >= (3, 9) @@ -150,4 +152,6 @@ def get_lzma_file(): "pa_version_under3p0", "pa_version_under4p0", "pa_version_under5p0", + "pa_version_under6p0", + "pa_version_under7p0", ] diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 0a48638f5cf05..3e9f39881b945 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -16,6 +16,7 @@ pa_version_under1p01, pa_version_under2p0, pa_version_under5p0, + pa_version_under6p0, ) from pandas.util._decorators import doc @@ -104,6 +105,19 @@ def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: """ return type(self)(self._data) + def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: + """ + Return ArrowExtensionArray without NA values. + + Returns + ------- + valid : ArrowExtensionArray + """ + if pa_version_under6p0: + return super().dropna() + else: + return type(self)(pc.drop_null(self._data)) + @doc(ExtensionArray.factorize) def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: encoded = self._data.dictionary_encode() @@ -219,6 +233,19 @@ def take( indices_array[indices_array < 0] += len(self._data) return type(self)(self._data.take(indices_array)) + def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: + """ + Compute the ArrowExtensionArray of unique values. + + Returns + ------- + uniques : ArrowExtensionArray + """ + if pa_version_under2p0: + return super().unique() + else: + return type(self)(pc.unique(self._data)) + def value_counts(self, dropna: bool = True) -> Series: """ Return a Series containing counts of each unique value. From 315f59a43bf8538db4b27ab47f3c9ce0f30c274d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sat, 23 Apr 2022 20:31:24 -0700 Subject: [PATCH 02/10] Add fallback warning --- pandas/core/arrays/arrow/array.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 3e9f39881b945..7271ac2bd43b0 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -38,6 +38,8 @@ import pyarrow as pa import pyarrow.compute as pc + from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning + if TYPE_CHECKING: from pandas import Series @@ -114,6 +116,7 @@ def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: valid : ArrowExtensionArray """ if pa_version_under6p0: + fallback_performancewarning(version="6") return super().dropna() else: return type(self)(pc.drop_null(self._data)) @@ -242,6 +245,7 @@ def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: uniques : ArrowExtensionArray """ if pa_version_under2p0: + fallback_performancewarning(version="2") return super().unique() else: return type(self)(pc.unique(self._data)) From 2dc591840f0888de76d88a74a99f4d565413aa71 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 24 Apr 2022 20:18:30 -0700 Subject: [PATCH 03/10] Fix extra warning test --- pandas/tests/indexes/test_common.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 05c66191ca3a2..fe94d44d21698 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -8,7 +8,10 @@ import numpy as np import pytest -from pandas.compat import IS64 +from pandas.compat import ( + IS64, + pa_version_under2p0, +) from pandas.core.dtypes.common import is_integer_dtype @@ -395,7 +398,9 @@ def test_astype_preserves_name(self, index, dtype): try: # Some of these conversions cannot succeed so we use a try / except - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning( + warn, raise_on_extra_warnings=pa_version_under2p0 + ): result = index.astype(dtype) except (ValueError, TypeError, NotImplementedError, SystemError): return From ebf62e86933b78a387bf709627f7936f622d1e4c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 24 Apr 2022 20:38:43 -0700 Subject: [PATCH 04/10] Fix again --- pandas/tests/indexes/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index fe94d44d21698..7be3527ac186b 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -399,7 +399,7 @@ def test_astype_preserves_name(self, index, dtype): try: # Some of these conversions cannot succeed so we use a try / except with tm.assert_produces_warning( - warn, raise_on_extra_warnings=pa_version_under2p0 + warn, raise_on_extra_warnings=not pa_version_under2p0 ): result = index.astype(dtype) except (ValueError, TypeError, NotImplementedError, SystemError): From ea4e9e9ddedbb5b69b343a6da2580f05f42fa231 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 24 Apr 2022 20:49:28 -0700 Subject: [PATCH 05/10] Test some warnings --- pandas/tests/base/test_unique.py | 42 +++++++++++++++++++++++---- pandas/tests/extension/test_string.py | 20 ++++++++++++- 2 files changed, 55 insertions(+), 7 deletions(-) diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index 66cc000b9f458..0cb7a510e4073 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -1,6 +1,11 @@ +from contextlib import nullcontext + import numpy as np import pytest +from pandas.compat import pa_version_under2p0 +from pandas.errors import PerformanceWarning + from pandas.core.dtypes.common import is_datetime64tz_dtype import pandas as pd @@ -9,10 +14,20 @@ from pandas.tests.base.common import allow_na_ops +def maybe_perf_warn(using_pyarrow): + if using_pyarrow: + return tm.assert_produces_warning(PerformanceWarning, match="Falling back") + else: + return nullcontext() + + def test_unique(index_or_series_obj): obj = index_or_series_obj obj = np.repeat(obj, range(1, len(obj) + 1)) - result = obj.unique() + with maybe_perf_warn( + pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]" + ): + result = obj.unique() # dict.fromkeys preserves the order unique_values = list(dict.fromkeys(obj.values)) @@ -50,7 +65,10 @@ def test_unique_null(null_obj, index_or_series_obj): klass = type(obj) repeated_values = np.repeat(values, range(1, len(values) + 1)) obj = klass(repeated_values, dtype=obj.dtype) - result = obj.unique() + with maybe_perf_warn( + pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]" + ): + result = obj.unique() unique_values_raw = dict.fromkeys(obj.values) # because np.nan == np.nan is False, but None == None is True @@ -75,7 +93,10 @@ def test_unique_null(null_obj, index_or_series_obj): def test_nunique(index_or_series_obj): obj = index_or_series_obj obj = np.repeat(obj, range(1, len(obj) + 1)) - expected = len(obj.unique()) + with maybe_perf_warn( + pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]" + ): + expected = len(obj.unique()) assert obj.nunique(dropna=False) == expected @@ -99,9 +120,18 @@ def test_nunique_null(null_obj, index_or_series_obj): assert obj.nunique() == len(obj.categories) assert obj.nunique(dropna=False) == len(obj.categories) + 1 else: - num_unique_values = len(obj.unique()) - assert obj.nunique() == max(0, num_unique_values - 1) - assert obj.nunique(dropna=False) == max(0, num_unique_values) + with maybe_perf_warn( + pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]" + ): + num_unique_values = len(obj.unique()) + with maybe_perf_warn( + pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]" + ): + assert obj.nunique() == max(0, num_unique_values - 1) + with maybe_perf_warn( + pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]" + ): + assert obj.nunique(dropna=False) == max(0, num_unique_values) @pytest.mark.single_cpu diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index a4c22e016581d..a43f5b29f5adf 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -13,17 +13,29 @@ be added to the array-specific tests in `pandas/tests/arrays/`. """ +from contextlib import nullcontext import string import numpy as np import pytest +from pandas.compat import pa_version_under6p0 +from pandas.errors import PerformanceWarning + import pandas as pd +import pandas._testing as tm from pandas.core.arrays import ArrowStringArray from pandas.core.arrays.string_ import StringDtype from pandas.tests.extension import base +def maybe_perf_warn(using_pyarrow): + if using_pyarrow: + return tm.assert_produces_warning(PerformanceWarning, match="Falling back") + else: + return nullcontext() + + def split_array(arr): if arr.dtype.storage != "pyarrow": pytest.skip("only applicable for pyarrow chunked array n/a") @@ -139,7 +151,13 @@ class TestIndex(base.BaseIndexTests): class TestMissing(base.BaseMissingTests): - pass + def test_dropna_array(self, data_missing): + with maybe_perf_warn( + pa_version_under6p0 and data_missing.dtype.storage == "pyarrow" + ): + result = data_missing.dropna() + expected = data_missing[[1]] + self.assert_extension_array_equal(result, expected) class TestNoReduce(base.BaseNoReduceTests): From e2a093f7a1c4c929ff4da1e941ae9f4aaf6c14bc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 25 Apr 2022 20:47:41 -0700 Subject: [PATCH 06/10] Add and use maybe_produces_warning --- pandas/_testing/__init__.py | 5 +- pandas/_testing/_warnings.py | 15 +- pandas/tests/arrays/string_/test_string.py | 25 ++- pandas/tests/base/test_unique.py | 39 ++-- pandas/tests/extension/test_string.py | 13 +- pandas/tests/strings/test_find_replace.py | 224 +++++++++++++++------ pandas/tests/strings/test_strings.py | 78 ++++--- 7 files changed, 269 insertions(+), 130 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 0a62ee956be61..96f37bd47e10c 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -64,7 +64,10 @@ rands_array, randu_array, ) -from pandas._testing._warnings import assert_produces_warning # noqa:F401 +from pandas._testing._warnings import ( # noqa:F401 + assert_produces_warning, + maybe_produces_warning, +) from pandas._testing.asserters import ( # noqa:F401 assert_almost_equal, assert_attr_equal, diff --git a/pandas/_testing/_warnings.py b/pandas/_testing/_warnings.py index f9443f80e585c..9e89e09e418b3 100644 --- a/pandas/_testing/_warnings.py +++ b/pandas/_testing/_warnings.py @@ -1,6 +1,9 @@ from __future__ import annotations -from contextlib import contextmanager +from contextlib import ( + contextmanager, + nullcontext, +) import re import sys from typing import ( @@ -97,6 +100,16 @@ class for all warnings. To check that no warning is returned, ) +def maybe_produces_warning(warning: type[Warning], condition: bool, **kwargs): + """ + Return a context manager that possibly checks a warning based on the condition + """ + if condition: + return assert_produces_warning(warning, **kwargs) + else: + return nullcontext() + + def _assert_caught_expected_warning( *, caught_warnings: Sequence[warnings.WarningMessage], diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index c21319f6de6ef..5442f96ab2d22 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -2,8 +2,6 @@ This module tests the functionality of StringArray and ArrowStringArray. Tests for the str accessors are in pandas/tests/strings/test_string_array.py """ -from contextlib import nullcontext - import numpy as np import pytest @@ -18,13 +16,6 @@ from pandas.core.arrays.string_arrow import ArrowStringArray -def maybe_perf_warn(using_pyarrow): - if using_pyarrow: - return tm.assert_produces_warning(PerformanceWarning, match="Falling back") - else: - return nullcontext() - - @pytest.fixture def dtype(string_storage): return pd.StringDtype(storage=string_storage) @@ -568,22 +559,30 @@ def test_to_numpy_na_value(dtype, nulls_fixture): def test_isin(dtype, fixed_now_ts): s = pd.Series(["a", "b", None], dtype=dtype) - with maybe_perf_warn(dtype == "pyarrow" and pa_version_under2p0): + with tm.maybe_produces_warning( + PerformanceWarning, dtype == "pyarrow" and pa_version_under2p0 + ): result = s.isin(["a", "c"]) expected = pd.Series([True, False, False]) tm.assert_series_equal(result, expected) - with maybe_perf_warn(dtype == "pyarrow" and pa_version_under2p0): + with tm.maybe_produces_warning( + PerformanceWarning, dtype == "pyarrow" and pa_version_under2p0 + ): result = s.isin(["a", pd.NA]) expected = pd.Series([True, False, True]) tm.assert_series_equal(result, expected) - with maybe_perf_warn(dtype == "pyarrow" and pa_version_under2p0): + with tm.maybe_produces_warning( + PerformanceWarning, dtype == "pyarrow" and pa_version_under2p0 + ): result = s.isin([]) expected = pd.Series([False, False, False]) tm.assert_series_equal(result, expected) - with maybe_perf_warn(dtype == "pyarrow" and pa_version_under2p0): + with tm.maybe_produces_warning( + PerformanceWarning, dtype == "pyarrow" and pa_version_under2p0 + ): result = s.isin(["a", fixed_now_ts]) expected = pd.Series([True, False, False]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index 0cb7a510e4073..eac1e35699585 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -1,5 +1,3 @@ -from contextlib import nullcontext - import numpy as np import pytest @@ -14,18 +12,12 @@ from pandas.tests.base.common import allow_na_ops -def maybe_perf_warn(using_pyarrow): - if using_pyarrow: - return tm.assert_produces_warning(PerformanceWarning, match="Falling back") - else: - return nullcontext() - - def test_unique(index_or_series_obj): obj = index_or_series_obj obj = np.repeat(obj, range(1, len(obj) + 1)) - with maybe_perf_warn( - pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]" + with tm.maybe_produces_warning( + PerformanceWarning, + pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", ): result = obj.unique() @@ -65,8 +57,9 @@ def test_unique_null(null_obj, index_or_series_obj): klass = type(obj) repeated_values = np.repeat(values, range(1, len(values) + 1)) obj = klass(repeated_values, dtype=obj.dtype) - with maybe_perf_warn( - pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]" + with tm.maybe_produces_warning( + PerformanceWarning, + pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", ): result = obj.unique() @@ -93,8 +86,9 @@ def test_unique_null(null_obj, index_or_series_obj): def test_nunique(index_or_series_obj): obj = index_or_series_obj obj = np.repeat(obj, range(1, len(obj) + 1)) - with maybe_perf_warn( - pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]" + with tm.maybe_produces_warning( + PerformanceWarning, + pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", ): expected = len(obj.unique()) assert obj.nunique(dropna=False) == expected @@ -120,16 +114,19 @@ def test_nunique_null(null_obj, index_or_series_obj): assert obj.nunique() == len(obj.categories) assert obj.nunique(dropna=False) == len(obj.categories) + 1 else: - with maybe_perf_warn( - pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]" + with tm.maybe_produces_warning( + PerformanceWarning, + pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", ): num_unique_values = len(obj.unique()) - with maybe_perf_warn( - pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]" + with tm.maybe_produces_warning( + PerformanceWarning, + pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", ): assert obj.nunique() == max(0, num_unique_values - 1) - with maybe_perf_warn( - pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]" + with tm.maybe_produces_warning( + PerformanceWarning, + pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", ): assert obj.nunique(dropna=False) == max(0, num_unique_values) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index a43f5b29f5adf..8a8bdee90e467 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -13,7 +13,6 @@ be added to the array-specific tests in `pandas/tests/arrays/`. """ -from contextlib import nullcontext import string import numpy as np @@ -29,13 +28,6 @@ from pandas.tests.extension import base -def maybe_perf_warn(using_pyarrow): - if using_pyarrow: - return tm.assert_produces_warning(PerformanceWarning, match="Falling back") - else: - return nullcontext() - - def split_array(arr): if arr.dtype.storage != "pyarrow": pytest.skip("only applicable for pyarrow chunked array n/a") @@ -152,8 +144,9 @@ class TestIndex(base.BaseIndexTests): class TestMissing(base.BaseMissingTests): def test_dropna_array(self, data_missing): - with maybe_perf_warn( - pa_version_under6p0 and data_missing.dtype.storage == "pyarrow" + with tm.maybe_produces_warning( + PerformanceWarning, + pa_version_under6p0 and data_missing.dtype.storage == "pyarrow", ): result = data_missing.dropna() expected = data_missing[[1]] diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index c1d96ca7993e1..a6e51cc2f98d6 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -1,4 +1,3 @@ -from contextlib import nullcontext from datetime import datetime import re @@ -19,13 +18,6 @@ # -------------------------------------------------------------------------------------- -def maybe_perf_warn(using_pyarrow): - if using_pyarrow: - return tm.assert_produces_warning(PerformanceWarning, match="Falling back") - else: - return nullcontext() - - def test_contains(any_string_dtype): values = np.array( ["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_ @@ -52,7 +44,10 @@ def test_contains(any_string_dtype): np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=object), dtype=any_string_dtype, ) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = values.str.contains(pat) expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) @@ -63,7 +58,9 @@ def test_contains(any_string_dtype): np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object), dtype=any_string_dtype, ) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = values.str.contains("FOO|mmm", case=False) expected = Series(np.array([True, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -80,7 +77,10 @@ def test_contains(any_string_dtype): ) pat = "mmm[_]+" - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = values.str.contains(pat) expected_dtype = "object" if any_string_dtype == "object" else "boolean" expected = Series( @@ -97,7 +97,10 @@ def test_contains(any_string_dtype): np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=np.object_), dtype=any_string_dtype, ) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = values.str.contains(pat) expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -162,8 +165,9 @@ def test_contains_na_kwarg_for_nullable_string_dtype( # https://github.com/pandas-dev/pandas/pull/41025#issuecomment-824062416 values = Series(["a", "b", "c", "a", np.nan], dtype=nullable_string_dtype) - with maybe_perf_warn( - nullable_string_dtype == "string[pyarrow]" and pa_version_under4p0 and regex + with tm.maybe_produces_warning( + PerformanceWarning, + nullable_string_dtype == "string[pyarrow]" and pa_version_under4p0 and regex, ): result = values.str.contains("a", na=na, regex=regex) expected = Series([True, False, False, True, expected], dtype="boolean") @@ -185,7 +189,9 @@ def test_contains_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = s.str.contains("a", case=False) expected = Series( [True, False, False, True, True, False, np.nan, True, False, True], @@ -193,7 +199,10 @@ def test_contains_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = s.str.contains("Aa") expected = Series( [False, False, False, True, False, False, np.nan, False, False, False], @@ -201,7 +210,10 @@ def test_contains_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = s.str.contains("ba") expected = Series( [False, False, False, True, False, False, np.nan, False, False, False], @@ -209,7 +221,9 @@ def test_contains_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = s.str.contains("ba", case=False) expected = Series( [False, False, False, True, True, False, np.nan, True, False, False], @@ -222,18 +236,27 @@ def test_contains_nan(any_string_dtype): # PR #14171 s = Series([np.nan, np.nan, np.nan], dtype=any_string_dtype) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = s.str.contains("foo", na=False) expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" expected = Series([False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = s.str.contains("foo", na=True) expected = Series([True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = s.str.contains("foo", na="foo") if any_string_dtype == "object": expected = Series(["foo", "foo", "foo"], dtype=np.object_) @@ -241,7 +264,10 @@ def test_contains_nan(any_string_dtype): expected = Series([True, True, True], dtype="boolean") tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = s.str.contains("foo") expected_dtype = "object" if any_string_dtype == "object" else "boolean" expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype) @@ -287,8 +313,9 @@ def test_startswith_nullable_string_dtype(nullable_string_dtype, na): ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."], dtype=nullable_string_dtype, ) - with maybe_perf_warn( - nullable_string_dtype == "string[pyarrow]" and pa_version_under4p0 + with tm.maybe_produces_warning( + PerformanceWarning, + nullable_string_dtype == "string[pyarrow]" and pa_version_under4p0, ): result = values.str.startswith("foo", na=na) exp = Series( @@ -296,8 +323,9 @@ def test_startswith_nullable_string_dtype(nullable_string_dtype, na): ) tm.assert_series_equal(result, exp) - with maybe_perf_warn( - nullable_string_dtype == "string[pyarrow]" and pa_version_under4p0 + with tm.maybe_produces_warning( + PerformanceWarning, + nullable_string_dtype == "string[pyarrow]" and pa_version_under4p0, ): result = values.str.startswith("rege.", na=na) exp = Series( @@ -345,8 +373,9 @@ def test_endswith_nullable_string_dtype(nullable_string_dtype, na): ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."], dtype=nullable_string_dtype, ) - with maybe_perf_warn( - nullable_string_dtype == "string[pyarrow]" and pa_version_under4p0 + with tm.maybe_produces_warning( + PerformanceWarning, + nullable_string_dtype == "string[pyarrow]" and pa_version_under4p0, ): result = values.str.endswith("foo", na=na) exp = Series( @@ -354,8 +383,9 @@ def test_endswith_nullable_string_dtype(nullable_string_dtype, na): ) tm.assert_series_equal(result, exp) - with maybe_perf_warn( - nullable_string_dtype == "string[pyarrow]" and pa_version_under4p0 + with tm.maybe_produces_warning( + PerformanceWarning, + nullable_string_dtype == "string[pyarrow]" and pa_version_under4p0, ): result = values.str.endswith("rege.", na=na) exp = Series( @@ -372,7 +402,10 @@ def test_endswith_nullable_string_dtype(nullable_string_dtype, na): def test_replace(any_string_dtype): ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = ser.str.replace("BAD[_]*", "", regex=True) expected = Series(["foobar", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -382,12 +415,18 @@ def test_replace_max_replacements(any_string_dtype): ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = ser.str.replace("BAD[_]*", "", n=1, regex=True) tm.assert_series_equal(result, expected) expected = Series(["foo__barBAD", np.nan], dtype=any_string_dtype) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = ser.str.replace("BAD", "", n=1, regex=False) tm.assert_series_equal(result, expected) @@ -404,7 +443,9 @@ def test_replace_mixed_object(): def test_replace_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) tm.assert_series_equal(result, expected) @@ -425,7 +466,9 @@ def test_replace_callable(any_string_dtype): # test with callable repl = lambda m: m.group(0).swapcase() - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -444,7 +487,9 @@ def test_replace_callable_raises(any_string_dtype, repl): r"(?(3)required )positional arguments?" ) with pytest.raises(TypeError, match=msg): - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): values.str.replace("a", repl) @@ -453,7 +498,9 @@ def test_replace_callable_named_groups(any_string_dtype): ser = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype) pat = r"(?P\w+) (?P\w+) (?P\w+)" repl = lambda m: m.group("middle").swapcase() - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = ser.str.replace(pat, repl, regex=True) expected = Series(["bAR", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -465,12 +512,16 @@ def test_replace_compiled_regex(any_string_dtype): # test with compiled regex pat = re.compile(r"BAD_*") - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = ser.str.replace(pat, "", regex=True) expected = Series(["foobar", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = ser.str.replace(pat, "", n=1, regex=True) expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -490,7 +541,9 @@ def test_replace_compiled_regex_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = ser.str.replace(pat, ", ") tm.assert_series_equal(result, expected) @@ -518,7 +571,9 @@ def test_replace_compiled_regex_callable(any_string_dtype): ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) repl = lambda m: m.group(0).swapcase() pat = re.compile("[a-z][A-Z]{2}") - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = ser.str.replace(pat, repl, n=2) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -531,7 +586,10 @@ def test_replace_literal(regex, expected, any_string_dtype): # GH16808 literal replace (regex=False vs regex=True) ser = Series(["f.o", "foo", np.nan], dtype=any_string_dtype) expected = Series(expected, dtype=any_string_dtype) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = ser.str.replace("f.", "ba", regex=regex) tm.assert_series_equal(result, expected) @@ -568,7 +626,9 @@ def test_replace_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = ser.str.replace("A", "YYY", case=False) expected = Series( [ @@ -587,7 +647,9 @@ def test_replace_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) expected = Series( [ @@ -611,12 +673,16 @@ def test_replace_not_case_sensitive_not_regex(any_string_dtype): # https://github.com/pandas-dev/pandas/issues/41602 ser = Series(["A.", "a.", "Ab", "ab", np.nan], dtype=any_string_dtype) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = ser.str.replace("a", "c", case=False, regex=False) expected = Series(["c.", "c.", "cb", "cb", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = ser.str.replace("a.", "c.", case=False, regex=False) expected = Series(["c.", "c.", "Ab", "ab", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -677,7 +743,10 @@ def test_match(any_string_dtype): expected_dtype = "object" if any_string_dtype == "object" else "boolean" values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = values.str.match(".*(BAD[_]+).*(BAD)") expected = Series([True, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -685,12 +754,18 @@ def test_match(any_string_dtype): values = Series( ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = values.str.match(".*BAD[_]+.*BAD") expected = Series([True, True, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = values.str.match("BAD[_]+.*BAD") expected = Series([False, True, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -698,12 +773,18 @@ def test_match(any_string_dtype): values = Series( ["fooBAD__barBAD", "^BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = values.str.match("^BAD[_]+.*BAD") expected = Series([False, False, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = values.str.match("\\^BAD[_]+.*BAD") expected = Series([False, True, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -735,13 +816,19 @@ def test_match_na_kwarg(any_string_dtype): # GH #6609 s = Series(["a", "b", np.nan], dtype=any_string_dtype) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = s.str.match("a", na=False) expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" expected = Series([True, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = s.str.match("a") expected_dtype = "object" if any_string_dtype == "object" else "boolean" expected = Series([True, False, np.nan], dtype=expected_dtype) @@ -750,7 +837,9 @@ def test_match_na_kwarg(any_string_dtype): def test_match_case_kwarg(any_string_dtype): values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = values.str.match("ab", case=False) expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" expected = Series([True, True, True, True], dtype=expected_dtype) @@ -767,7 +856,10 @@ def test_fullmatch(any_string_dtype): ser = Series( ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = ser.str.fullmatch(".*BAD[_]+.*BAD") expected_dtype = "object" if any_string_dtype == "object" else "boolean" expected = Series([True, False, np.nan, False], dtype=expected_dtype) @@ -778,7 +870,10 @@ def test_fullmatch_na_kwarg(any_string_dtype): ser = Series( ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = ser.str.fullmatch(".*BAD[_]+.*BAD", na=False) expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" expected = Series([True, False, False, False], dtype=expected_dtype) @@ -791,17 +886,24 @@ def test_fullmatch_case_kwarg(any_string_dtype): expected = Series([True, False, False, False], dtype=expected_dtype) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = ser.str.fullmatch("ab", case=True) tm.assert_series_equal(result, expected) expected = Series([True, True, False, False], dtype=expected_dtype) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = ser.str.fullmatch("ab", case=False) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]"): + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" + ): result = ser.str.fullmatch("ab", flags=re.IGNORECASE) tm.assert_series_equal(result, expected) @@ -984,11 +1086,11 @@ def test_flags_kwarg(any_string_dtype): result = data.str.extract(pat, flags=re.IGNORECASE, expand=True) assert result.iloc[0].tolist() == ["dave", "google", "com"] - with maybe_perf_warn(using_pyarrow): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow): result = data.str.match(pat, flags=re.IGNORECASE) assert result[0] - with maybe_perf_warn(using_pyarrow): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow): result = data.str.fullmatch(pat, flags=re.IGNORECASE) assert result[0] diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 6d6d69280b9dd..db99ba8368a8a 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -1,4 +1,3 @@ -from contextlib import nullcontext from datetime import ( datetime, timedelta, @@ -23,13 +22,6 @@ import pandas._testing as tm -def maybe_perf_warn(using_pyarrow): - if using_pyarrow: - return tm.assert_produces_warning(PerformanceWarning, match="Falling back") - else: - return nullcontext() - - @pytest.mark.parametrize("pattern", [0, True, Series(["foo", "bar"])]) def test_startswith_endswith_non_str_patterns(pattern): # GH3485 @@ -190,18 +182,33 @@ def test_empty_str_methods(any_string_dtype): assert "" == empty.str.cat() tm.assert_series_equal(empty_str, empty.str.title()) tm.assert_series_equal(empty_int, empty.str.count("a")) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): tm.assert_series_equal(empty_bool, empty.str.contains("a")) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): tm.assert_series_equal(empty_bool, empty.str.startswith("a")) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): tm.assert_series_equal(empty_bool, empty.str.endswith("a")) tm.assert_series_equal(empty_str, empty.str.lower()) tm.assert_series_equal(empty_str, empty.str.upper()) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): tm.assert_series_equal(empty_str, empty.str.replace("a", "b")) tm.assert_series_equal(empty_str, empty.str.repeat(3)) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): tm.assert_series_equal(empty_bool, empty.str.match("^a")) tm.assert_frame_equal( DataFrame(columns=[0], dtype=any_string_dtype), @@ -218,7 +225,10 @@ def test_empty_str_methods(any_string_dtype): ) tm.assert_frame_equal(empty_df, empty.str.get_dummies()) tm.assert_series_equal(empty_str, empty_str.str.join("")) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): tm.assert_series_equal(empty_int, empty.str.len()) tm.assert_series_equal(empty_object, empty_str.str.findall("a")) tm.assert_series_equal(empty_int, empty.str.find("a")) @@ -233,11 +243,20 @@ def test_empty_str_methods(any_string_dtype): tm.assert_frame_equal(empty_df, empty.str.rpartition("a")) tm.assert_series_equal(empty_str, empty.str.slice(stop=1)) tm.assert_series_equal(empty_str, empty.str.slice(step=1)) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): tm.assert_series_equal(empty_str, empty.str.strip()) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): tm.assert_series_equal(empty_str, empty.str.lstrip()) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): tm.assert_series_equal(empty_str, empty.str.rstrip()) tm.assert_series_equal(empty_str, empty.str.wrap(42)) tm.assert_series_equal(empty_str, empty.str.get(0)) @@ -247,7 +266,10 @@ def test_empty_str_methods(any_string_dtype): tm.assert_series_equal(empty_bool, empty.str.isalnum()) tm.assert_series_equal(empty_bool, empty.str.isalpha()) tm.assert_series_equal(empty_bool, empty.str.isdigit()) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under2p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under2p0, + ): tm.assert_series_equal(empty_bool, empty.str.isspace()) tm.assert_series_equal(empty_bool, empty.str.islower()) tm.assert_series_equal(empty_bool, empty.str.isupper()) @@ -299,10 +321,11 @@ def test_ismethods(method, expected, any_string_dtype): ) expected_dtype = "bool" if any_string_dtype == "object" else "boolean" expected = Series(expected, dtype=expected_dtype) - with maybe_perf_warn( + with tm.maybe_produces_warning( + PerformanceWarning, any_string_dtype == "string[pyarrow]" and pa_version_under2p0 - and method == "isspace" + and method == "isspace", ): result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -374,7 +397,10 @@ def test_len(any_string_dtype): ["foo", "fooo", "fooooo", np.nan, "fooooooo", "foo\n", "あ"], dtype=any_string_dtype, ) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = ser.str.len() expected_dtype = "float64" if any_string_dtype == "object" else "Int64" expected = Series([3, 4, 6, np.nan, 8, 4, 1], dtype=expected_dtype) @@ -462,7 +488,10 @@ def test_pipe_failures(any_string_dtype): expected = Series([["A", "B", "C"]], dtype=object) tm.assert_series_equal(result, expected) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = ser.str.replace("|", " ", regex=False) expected = Series(["A B C"], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -565,7 +594,10 @@ def test_strip_lstrip_rstrip_mixed_object(method, exp): def test_strip_lstrip_rstrip_args(any_string_dtype, method, exp): ser = Series(["xxABCxx", "xx BNSD", "LDFJH xx"], dtype=any_string_dtype) - with maybe_perf_warn(any_string_dtype == "string[pyarrow]" and pa_version_under4p0): + with tm.maybe_produces_warning( + PerformanceWarning, + any_string_dtype == "string[pyarrow]" and pa_version_under4p0, + ): result = getattr(ser.str, method)("x") expected = Series(exp, dtype=any_string_dtype) tm.assert_series_equal(result, expected) From ecefbeed0204ba3137da1541c1247556061580ad Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 25 Apr 2022 20:48:38 -0700 Subject: [PATCH 07/10] Add additional issue number --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index e4879a6c41515..57aad64f9b155 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -95,7 +95,7 @@ Other enhancements - :meth:`pd.concat` now raises when ``levels`` is given but ``keys`` is None (:issue:`46653`) - :meth:`pd.concat` now raises when ``levels`` contains duplicate values (:issue:`46653`) - Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, and :meth:`DataFrame.cov` (:issue:`46560`) -- A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`) +- A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`, :issue:`46725`) - Added ``numeric_only`` argument to :meth:`Resampler.sum`, :meth:`Resampler.prod`, :meth:`Resampler.min`, :meth:`Resampler.max`, :meth:`Resampler.first`, and :meth:`Resampler.last` (:issue:`46442`) .. --------------------------------------------------------------------------- From 7a5d4fbd98975b3b0d79b5cbd6767bad66ed329b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 26 Apr 2022 13:15:18 -0700 Subject: [PATCH 08/10] Address review --- pandas/core/arrays/arrow/array.py | 4 ++-- pandas/tests/base/test_unique.py | 12 ++++++------ pandas/tests/indexes/test_common.py | 4 +++- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 7271ac2bd43b0..fdd505e259dd9 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -113,7 +113,7 @@ def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: Returns ------- - valid : ArrowExtensionArray + ArrowExtensionArray """ if pa_version_under6p0: fallback_performancewarning(version="6") @@ -242,7 +242,7 @@ def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: Returns ------- - uniques : ArrowExtensionArray + ArrowExtensionArray """ if pa_version_under2p0: fallback_performancewarning(version="2") diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index eac1e35699585..0bddb1b3b4fa3 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -17,7 +17,7 @@ def test_unique(index_or_series_obj): obj = np.repeat(obj, range(1, len(obj) + 1)) with tm.maybe_produces_warning( PerformanceWarning, - pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", + pa_version_under2p0 and index_or_series_obj.dtype == "string[pyarrow]", ): result = obj.unique() @@ -59,7 +59,7 @@ def test_unique_null(null_obj, index_or_series_obj): obj = klass(repeated_values, dtype=obj.dtype) with tm.maybe_produces_warning( PerformanceWarning, - pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", + pa_version_under2p0 and index_or_series_obj.dtype == "string[pyarrow]", ): result = obj.unique() @@ -88,7 +88,7 @@ def test_nunique(index_or_series_obj): obj = np.repeat(obj, range(1, len(obj) + 1)) with tm.maybe_produces_warning( PerformanceWarning, - pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", + pa_version_under2p0 and index_or_series_obj.dtype == "string[pyarrow]", ): expected = len(obj.unique()) assert obj.nunique(dropna=False) == expected @@ -116,17 +116,17 @@ def test_nunique_null(null_obj, index_or_series_obj): else: with tm.maybe_produces_warning( PerformanceWarning, - pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", + pa_version_under2p0 and index_or_series_obj.dtype == "string[pyarrow]", ): num_unique_values = len(obj.unique()) with tm.maybe_produces_warning( PerformanceWarning, - pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", + pa_version_under2p0 and index_or_series_obj.dtype == "string[pyarrow]", ): assert obj.nunique() == max(0, num_unique_values - 1) with tm.maybe_produces_warning( PerformanceWarning, - pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", + pa_version_under2p0 and index_or_series_obj.dtype == "string[pyarrow]", ): assert obj.nunique(dropna=False) == max(0, num_unique_values) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 7be3527ac186b..c4bc66e9240c1 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -399,7 +399,9 @@ def test_astype_preserves_name(self, index, dtype): try: # Some of these conversions cannot succeed so we use a try / except with tm.assert_produces_warning( - warn, raise_on_extra_warnings=not pa_version_under2p0 + warn, + raise_on_extra_warnings=index.dtype == "string[pyarrow]" + and not pa_version_under2p0, ): result = index.astype(dtype) except (ValueError, TypeError, NotImplementedError, SystemError): From de815df6cd4f9d7777968e19204a4b41bfc1beee Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 26 Apr 2022 21:34:19 -0700 Subject: [PATCH 09/10] Use str again --- pandas/tests/base/test_unique.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index 0bddb1b3b4fa3..eac1e35699585 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -17,7 +17,7 @@ def test_unique(index_or_series_obj): obj = np.repeat(obj, range(1, len(obj) + 1)) with tm.maybe_produces_warning( PerformanceWarning, - pa_version_under2p0 and index_or_series_obj.dtype == "string[pyarrow]", + pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", ): result = obj.unique() @@ -59,7 +59,7 @@ def test_unique_null(null_obj, index_or_series_obj): obj = klass(repeated_values, dtype=obj.dtype) with tm.maybe_produces_warning( PerformanceWarning, - pa_version_under2p0 and index_or_series_obj.dtype == "string[pyarrow]", + pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", ): result = obj.unique() @@ -88,7 +88,7 @@ def test_nunique(index_or_series_obj): obj = np.repeat(obj, range(1, len(obj) + 1)) with tm.maybe_produces_warning( PerformanceWarning, - pa_version_under2p0 and index_or_series_obj.dtype == "string[pyarrow]", + pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", ): expected = len(obj.unique()) assert obj.nunique(dropna=False) == expected @@ -116,17 +116,17 @@ def test_nunique_null(null_obj, index_or_series_obj): else: with tm.maybe_produces_warning( PerformanceWarning, - pa_version_under2p0 and index_or_series_obj.dtype == "string[pyarrow]", + pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", ): num_unique_values = len(obj.unique()) with tm.maybe_produces_warning( PerformanceWarning, - pa_version_under2p0 and index_or_series_obj.dtype == "string[pyarrow]", + pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", ): assert obj.nunique() == max(0, num_unique_values - 1) with tm.maybe_produces_warning( PerformanceWarning, - pa_version_under2p0 and index_or_series_obj.dtype == "string[pyarrow]", + pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", ): assert obj.nunique(dropna=False) == max(0, num_unique_values) From 40bd857b267b16f1a1b4239dcad3e3cceee98369 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 26 Apr 2022 21:36:21 -0700 Subject: [PATCH 10/10] revert another check raises an error --- pandas/tests/indexes/test_common.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index c4bc66e9240c1..d582a469eaf0e 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -400,8 +400,7 @@ def test_astype_preserves_name(self, index, dtype): # Some of these conversions cannot succeed so we use a try / except with tm.assert_produces_warning( warn, - raise_on_extra_warnings=index.dtype == "string[pyarrow]" - and not pa_version_under2p0, + raise_on_extra_warnings=not pa_version_under2p0, ): result = index.astype(dtype) except (ValueError, TypeError, NotImplementedError, SystemError):