diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 63902b53ea36d..155486953f4ba 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -570,6 +570,7 @@ Conversion - Bug in creating a :class:`DataFrame` from an empty ``np.recarray`` not retaining the original dtypes (:issue:`40121`) - Bug in :class:`DataFrame` failing to raise ``TypeError`` when constructing from a ``frozenset`` (:issue:`40163`) - Bug in :class:`Index` construction silently ignoring a passed ``dtype`` when the data cannot be cast to that dtype (:issue:`21311`) +- Bug in :meth:`StringArray.astype` falling back to numpy and raising when converting to ``dtype='category'`` (:issue:`40450`) - Bug in :class:`DataFrame` construction with a dictionary containing an arraylike with ``ExtensionDtype`` and ``copy=True`` failing to make a copy (:issue:`38939`) - diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 0a2893ac49a49..666afb65e19ff 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -327,6 +327,9 @@ def astype(self, dtype, copy=True): arr[mask] = "0" values = arr.astype(dtype.numpy_dtype) return FloatingArray(values, mask, copy=False) + elif isinstance(dtype, ExtensionDtype): + cls = dtype.construct_array_type() + return cls._from_sequence(self, dtype=dtype, copy=copy) elif np.issubdtype(dtype, np.floating): arr = self._ndarray.copy() mask = self.isna() diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 4e068690c41e5..d23c44733949a 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -350,6 +350,38 @@ def test_astype_bytes(self): assert result.dtypes == np.dtype("S3") +class TestAstypeString: + @pytest.mark.parametrize( + "data, dtype", + [ + ([True, NA], "boolean"), + (["A", NA], "category"), + (["2020-10-10", "2020-10-10"], "datetime64[ns]"), + (["2020-10-10", 
"2020-10-10", NaT], "datetime64[ns]"), + ( + ["2012-01-01 00:00:00-05:00", NaT], + "datetime64[ns, US/Eastern]", + ), + ([1, None], "UInt16"), + (["1/1/2021", "2/1/2021"], "period[M]"), + (["1/1/2021", "2/1/2021", NaT], "period[M]"), + (["1 Day", "59 Days", NaT], "timedelta64[ns]"), + # currently no way to parse IntervalArray from a list of strings + ], + ) + def test_astype_string_to_extension_dtype_roundtrip(self, data, dtype, request): + if dtype == "boolean" or ( + dtype in ("period[M]", "datetime64[ns]", "timedelta64[ns]") and NaT in data + ): + mark = pytest.mark.xfail( + reason="TODO StringArray.astype() with missing values #GH40566" + ) + request.node.add_marker(mark) + # GH-40351 + s = Series(data, dtype=dtype) + tm.assert_series_equal(s, s.astype("string").astype(dtype)) + + class TestAstypeCategorical: def test_astype_categorical_to_other(self): cat = Categorical([f"{i} - {i + 499}" for i in range(0, 10000, 500)])