diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 1c5680dbca431..e2fb4a3635387 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1500,7 +1500,7 @@ cdef class Validator: f'must define is_value_typed') cdef bint is_valid_null(self, object value) except -1: - return value is None or util.is_nan(value) + return value is None or value is C_NA or util.is_nan(value) cdef bint is_array_typed(self) except -1: return False diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index f848310d961e1..adbc25b96ea42 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -180,13 +180,15 @@ cpdef assert_almost_equal(a, b, # classes can't be the same, to raise error assert_class_equal(a, b, obj=obj) - if a == b: - # object comparison - return True if isna(a) and isna(b): # TODO: Should require same-dtype NA? # nan / None comparison return True + + if a == b: + # object comparison + return True + if is_comparable_as_number(a) and is_comparable_as_number(b): if array_equivalent(a, b, strict_nan=True): # inf comparison diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 6f2bb095a014d..8ba5cd7565850 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -278,6 +278,9 @@ def fillna(self, value=None, method=None, limit=None): return new_values def take(self, indices, allow_fill=False, fill_value=None): + if fill_value is None: + # Primarily for subclasses + fill_value = self.dtype.na_value result = take( self._ndarray, indices, allow_fill=allow_fill, fill_value=fill_value ) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 7c487b227de20..bb2a9a22e601f 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,9 +1,9 @@ import operator -from typing import TYPE_CHECKING, Type +from typing import Type import numpy as np -from pandas._libs import lib +from pandas._libs import lib, missing as libmissing from pandas.core.dtypes.base 
import ExtensionDtype from pandas.core.dtypes.common import pandas_dtype @@ -17,9 +17,6 @@ from pandas.core.construction import extract_array from pandas.core.missing import isna -if TYPE_CHECKING: - from pandas._typing import Scalar - @register_extension_dtype class StringDtype(ExtensionDtype): @@ -50,16 +47,8 @@ class StringDtype(ExtensionDtype): StringDtype """ - @property - def na_value(self) -> "Scalar": - """ - StringDtype uses :attr:`numpy.nan` as the missing NA value. - - .. warning:: - - `na_value` may change in a future release. - """ - return np.nan + #: StringDtype.na_value uses pandas.NA + na_value = libmissing.NA @property def type(self) -> Type: @@ -172,10 +161,10 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): if dtype: assert dtype == "string" result = super()._from_sequence(scalars, dtype=object, copy=copy) - # convert None to np.nan + # Standardize all missing-like values to NA # TODO: it would be nice to do this in _validate / lib.is_string_array # We are already doing a scan over the values there. 
- result[result.isna()] = np.nan + result[result.isna()] = StringDtype.na_value return result @classmethod @@ -192,6 +181,12 @@ def __arrow_array__(self, type=None): type = pa.string() return pa.array(self._ndarray, type=type, from_pandas=True) + def _values_for_factorize(self): + arr = self._ndarray.copy() + mask = self.isna() + arr[mask] = -1 + return arr, -1 + def __setitem__(self, key, value): value = extract_array(value, extract_numpy=True) if isinstance(value, type(self)): @@ -205,9 +200,9 @@ def __setitem__(self, key, value): # validate new items if scalar_value: - if scalar_value is None: - value = np.nan - elif not (isinstance(value, str) or np.isnan(value)): + if isna(value): + value = StringDtype.na_value + elif not isinstance(value, str): raise ValueError( "Cannot set non-string value '{}' into a StringArray.".format(value) ) @@ -265,7 +260,7 @@ def method(self, other): other = other[valid] result = np.empty_like(self._ndarray, dtype="object") - result[mask] = np.nan + result[mask] = StringDtype.na_value result[valid] = op(self._ndarray[valid], other) if op.__name__ in {"add", "radd", "mul", "rmul"}: diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 0a8f636b4cb2a..e5234c06c9ba0 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -440,6 +440,9 @@ def array_equivalent(left, right, strict_nan=False): if left_value is NaT and right_value is not NaT: return False + elif left_value is libmissing.NA and right_value is not libmissing.NA: + return False + elif isinstance(left_value, float) and np.isnan(left_value): if not isinstance(right_value, float) or not np.isnan(right_value): return False @@ -451,6 +454,8 @@ def array_equivalent(left, right, strict_nan=False): if "Cannot compare tz-naive" in str(err): # tzawareness compat failure, see GH#28507 return False + elif "boolean value of NA is ambiguous" in str(err): + return False raise return True diff --git a/pandas/core/strings.py 
b/pandas/core/strings.py index 7194d1cf08e4a..0a0f1c4d2527e 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -123,6 +123,13 @@ def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object): if na_mask: mask = isna(arr) convert = not np.all(mask) + if convert: + # XXX: This converts pd.NA to np.nan to match the output of + # object-dtype ops that return numeric, like str.count + # We probably want to return Int64Dtype instead. + # NA -> nan + arr[mask] = np.nan + try: result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert) except (TypeError, AttributeError) as e: diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index efe2b4e0b2deb..3029f3007929b 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -12,7 +12,7 @@ def test_none_to_nan(): a = pd.arrays.StringArray._from_sequence(["a", None, "b"]) assert a[1] is not None - assert np.isnan(a[1]) + assert a[1] is pd.NA def test_setitem_validates(): @@ -24,6 +24,15 @@ def test_setitem_validates(): a[:] = np.array([1, 2]) +def test_setitem_with_scalar_string(): + # is_float_dtype considers some strings, like 'd', to be floats + # which can cause issues. 
+ arr = pd.array(["a", "c"], dtype="string") + arr[0] = "d" + expected = pd.array(["d", "c"], dtype="string") + tm.assert_extension_array_equal(arr, expected) + + @pytest.mark.parametrize( "input, method", [ diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 5b872d5b72227..471a1b79d23bc 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -25,7 +25,7 @@ def data(): @pytest.fixture def data_missing(): """Length 2 array with [NA, Valid]""" - return StringArray._from_sequence([np.nan, "A"]) + return StringArray._from_sequence([pd.NA, "A"]) @pytest.fixture @@ -35,17 +35,17 @@ def data_for_sorting(): @pytest.fixture def data_missing_for_sorting(): - return StringArray._from_sequence(["B", np.nan, "A"]) + return StringArray._from_sequence(["B", pd.NA, "A"]) @pytest.fixture def na_value(): - return np.nan + return pd.NA @pytest.fixture def data_for_grouping(): - return StringArray._from_sequence(["B", "B", np.nan, np.nan, "A", "A", "B", "C"]) + return StringArray._from_sequence(["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"]) class TestDtype(base.BaseDtypeTests): diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index f5d28ec82d1d4..9b8ccdf4a7d9e 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3507,3 +3507,11 @@ def test_string_array(any_string_method): assert all(result[columns].dtypes == "string") result[columns] = result[columns].astype(object) tm.assert_equal(result, expected) + + +@pytest.mark.xfail(reason="not implemented yet") +def test_string_dtype_numeric(): + s = Series(["a", "aa", None], dtype="string") + result = s.str.count("a") + expected = Series([1, 2, None], dtype="Int64") + tm.assert_series_equal(result, expected)