From ca7ab6a24f2aad22c208f90b254e3806cc396755 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 8 May 2022 15:25:29 -0700 Subject: [PATCH 1/2] Add numeric and integer array and dtype --- pandas/core/arrays/arrow/dtype.py | 4 +- pandas/core/arrays/arrow/integer.py | 150 ++++++++++++++++++++++++++++ pandas/core/arrays/arrow/numeric.py | 71 +++++++++++++ 3 files changed, 223 insertions(+), 2 deletions(-) create mode 100644 pandas/core/arrays/arrow/integer.py create mode 100644 pandas/core/arrays/arrow/numeric.py diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index c0ecb0856f27f..0dda908ab2abf 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -6,12 +6,12 @@ from pandas._typing import DtypeObj from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.base import StorageExtensionDtype +from pandas.core.dtypes.base import ExtensionDtype from pandas.core.arrays.arrow import ArrowExtensionArray -class ArrowDtype(StorageExtensionDtype): +class ArrowDtype(ExtensionDtype): """ Base class for dtypes for BaseArrowArray subclasses. Modeled after BaseMaskedDtype diff --git a/pandas/core/arrays/arrow/integer.py b/pandas/core/arrays/arrow/integer.py new file mode 100644 index 0000000000000..40a65356f14c8 --- /dev/null +++ b/pandas/core/arrays/arrow/integer.py @@ -0,0 +1,150 @@ +from __future__ import annotations + +import pyarrow as pa + +from pandas.core.dtypes.base import register_extension_dtype + +from pandas.core.arrays.arrow.numeric import ( + NumericArrowArray, + NumericArrowDtype, +) + + +class IntegerArrowDtype(NumericArrowDtype): + """ + An ExtensionDtype to hold a single size & kind of integer Arrow dtype. + + These specific implementations are subclasses of the non-public + IntegerArrowDtype. For example we have Int8ArrowDtype to represent signed int 8s. + + The attributes name & type are set when these subclasses are created. 
+ """ + + _default_pa_dtype = pa.int64() + _checker = pa.types.is_integer + + @classmethod + def construct_array_type(cls) -> type[IntegerArrowArray]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + return IntegerArrowArray + + @classmethod + def _str_to_dtype_mapping(cls): + return INT_STR_TO_DTYPE + + +class IntegerArrowArray(NumericArrowArray): + """ + Array of pyarrow integer values. + + To construct an IntegerArray from generic array-like input, use + :func:`pandas.array` with one of the integer dtypes (see examples). + + Parameters + ---------- + values : pa.ChunkedArray + A 1-d integer-dtype array. + + Attributes + ---------- + None + + Methods + ------- + None + + Returns + ------- + IntegerArrowArray + """ + + _dtype_cls = IntegerArrowDtype + + +_dtype_docstring = """ +An ExtensionDtype for {dtype} integer pyarrow data. + +Attributes +---------- +None + +Methods +------- +None +""" + +# create the Dtype + + +@register_extension_dtype +class Int8ArrowDtype(IntegerArrowDtype): + type = pa.int8() + name = "int8[pyarrow]" + __doc__ = _dtype_docstring.format(dtype="int8") + + +@register_extension_dtype +class Int16ArrowDtype(IntegerArrowDtype): + type = pa.int16() + name = "int16[pyarrow]" + __doc__ = _dtype_docstring.format(dtype="int16") + + +@register_extension_dtype +class Int32ArrowDtype(IntegerArrowDtype): + type = pa.int32() + name = "int32[pyarrow]" + __doc__ = _dtype_docstring.format(dtype="int32") + + +@register_extension_dtype +class Int64ArrowDtype(IntegerArrowDtype): + type = pa.int64() + name = "int64[pyarrow]" + __doc__ = _dtype_docstring.format(dtype="int64") + + +@register_extension_dtype +class UInt8ArrowDtype(IntegerArrowDtype): + type = pa.uint8() + name = "uint8[pyarrow]" + __doc__ = _dtype_docstring.format(dtype="uint8") + + +@register_extension_dtype +class UInt16ArrowDtype(IntegerArrowDtype): + type = pa.uint16() + name = "uint16[pyarrow]" + __doc__ = _dtype_docstring.format(dtype="uint16") 
+ + +@register_extension_dtype +class UInt32ArrowDtype(IntegerArrowDtype): + type = pa.uint32() + name = "uint32[pyarrow]" + __doc__ = _dtype_docstring.format(dtype="uint32") + + +@register_extension_dtype +class UInt64ArrowDtype(IntegerArrowDtype): + type = pa.uint64() + name = "uint64[pyarrow]" + __doc__ = _dtype_docstring.format(dtype="uint64") + + +INT_STR_TO_DTYPE: dict[str, IntegerArrowDtype] = { + "int8": Int8ArrowDtype(), + "int16": Int16ArrowDtype(), + "int32": Int32ArrowDtype(), + "int64": Int64ArrowDtype(), + "uint8": UInt8ArrowDtype(), + "uint16": UInt16ArrowDtype(), + "uint32": UInt32ArrowDtype(), + "uint64": UInt64ArrowDtype(), +} diff --git a/pandas/core/arrays/arrow/numeric.py b/pandas/core/arrays/arrow/numeric.py new file mode 100644 index 0000000000000..a194829ff6977 --- /dev/null +++ b/pandas/core/arrays/arrow/numeric.py @@ -0,0 +1,71 @@ +from typing import ( + Any, + Callable, + TypeVar, +) + +import pyarrow as pa + +from pandas.errors import AbstractMethodError +from pandas.util._decorators import cache_readonly + +from pandas.core.arrays.arrow.array import ArrowExtensionArray +from pandas.core.arrays.arrow.dtype import ArrowDtype + +T = TypeVar("T", bound="NumericArrowArray") + + +class NumericArrowDtype(ArrowDtype): + _default_pa_dtype: pa.null() + _dtype_checker: Callable[[Any], bool] # pa.types.is_ + + @property + def _is_numeric(self) -> bool: + return True + + @cache_readonly + def is_signed_integer(self) -> bool: + return self.kind == "i" + + @cache_readonly + def is_unsigned_integer(self) -> bool: + return self.kind == "u" + + @classmethod + def _str_to_dtype_mapping(cls): + raise AbstractMethodError(cls) + + +class NumericArrowArray(ArrowExtensionArray): + """ + Base class for Integer and Floating and Boolean dtypes. 
+ """ + + _dtype_cls: type[NumericArrowDtype] + + def __init__(self, values: pa.ChunkedArray) -> None: + checker = self._dtype_cls._dtype_checker + if not (isinstance(values, pa.ChunkedArray) and checker(values.type)): + descr = ( + "floating" + if self._dtype_cls.kind == "f" # type: ignore[comparison-overlap] + else "integer" + ) + raise TypeError(f"values should be {descr} arrow array.") + super().__init__(values) + + @cache_readonly + def dtype(self) -> NumericArrowDtype: + mapping = self._dtype_cls._str_to_dtype_mapping() + return mapping[str(self._data.type)] + + @classmethod + def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False): + return cls(pa.chunked_array(scalars, type=cls._dtype_cls._default_pa_dtype)) + + @classmethod + def _from_sequence_of_strings(cls, strings, *, dtype=None, copy: bool = False): + from pandas.core.tools.numeric import to_numeric + + scalars = to_numeric(strings, errors="raise") + return cls._from_sequence(scalars, dtype=dtype, copy=copy) From af77d8fa044c1c46dd5e439d1e894ea4ba90859a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 8 May 2022 20:04:52 -0700 Subject: [PATCH 2/2] Add imports --- pandas/__init__.py | 8 ++++++++ pandas/core/api.py | 10 ++++++++++ pandas/core/arrays/arrow/dtype.py | 4 ++-- pandas/core/arrays/arrow/integer.py | 18 +++++++++--------- pandas/core/arrays/arrow/numeric.py | 6 +++++- 5 files changed, 34 insertions(+), 12 deletions(-) diff --git a/pandas/__init__.py b/pandas/__init__.py index 3645e8744d8af..1423e65bf52b7 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -47,6 +47,14 @@ from pandas.core.api import ( # dtype + Int8ArrowDtype, + Int16ArrowDtype, + Int32ArrowDtype, + Int64ArrowDtype, + UInt8ArrowDtype, + UInt16ArrowDtype, + UInt32ArrowDtype, + UInt64ArrowDtype, Int8Dtype, Int16Dtype, Int32Dtype, diff --git a/pandas/core/api.py b/pandas/core/api.py index cf082d2013d3b..373003e2b42a1 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -27,6 +27,16 @@ 
value_counts, ) from pandas.core.arrays import Categorical +from pandas.core.arrays.arrow.integer import ( + Int8ArrowDtype, + Int16ArrowDtype, + Int32ArrowDtype, + Int64ArrowDtype, + UInt8ArrowDtype, + UInt16ArrowDtype, + UInt32ArrowDtype, + UInt64ArrowDtype, +) from pandas.core.arrays.boolean import BooleanDtype from pandas.core.arrays.floating import ( Float32Dtype, diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index 0dda908ab2abf..c0ecb0856f27f 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -6,12 +6,12 @@ from pandas._typing import DtypeObj from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.base import StorageExtensionDtype from pandas.core.arrays.arrow import ArrowExtensionArray -class ArrowDtype(ExtensionDtype): +class ArrowDtype(StorageExtensionDtype): """ Base class for dtypes for BaseArrowArray subclasses. Modeled after BaseMaskedDtype diff --git a/pandas/core/arrays/arrow/integer.py b/pandas/core/arrays/arrow/integer.py index 40a65356f14c8..e7de9c03ec711 100644 --- a/pandas/core/arrays/arrow/integer.py +++ b/pandas/core/arrays/arrow/integer.py @@ -21,7 +21,7 @@ class IntegerArrowDtype(NumericArrowDtype): """ _default_pa_dtype = pa.int64() - _checker = pa.types.is_integer + _dtype_checker = pa.types.is_integer @classmethod def construct_array_type(cls) -> type[IntegerArrowArray]: @@ -85,56 +85,56 @@ class IntegerArrowArray(NumericArrowArray): @register_extension_dtype class Int8ArrowDtype(IntegerArrowDtype): type = pa.int8() - name = "int8[pyarrow]" + name = "int8" __doc__ = _dtype_docstring.format(dtype="int8") @register_extension_dtype class Int16ArrowDtype(IntegerArrowDtype): type = pa.int16() - name = "int16[pyarrow]" + name = "int16" __doc__ = _dtype_docstring.format(dtype="int16") @register_extension_dtype class Int32ArrowDtype(IntegerArrowDtype): type = pa.int32() - name = "int32[pyarrow]" + 
name = "int32" __doc__ = _dtype_docstring.format(dtype="int32") @register_extension_dtype class Int64ArrowDtype(IntegerArrowDtype): type = pa.int64() - name = "int64[pyarrow]" + name = "int64" __doc__ = _dtype_docstring.format(dtype="int64") @register_extension_dtype class UInt8ArrowDtype(IntegerArrowDtype): type = pa.uint8() - name = "uint8[pyarrow]" + name = "uint8" __doc__ = _dtype_docstring.format(dtype="uint8") @register_extension_dtype class UInt16ArrowDtype(IntegerArrowDtype): type = pa.uint16() - name = "uint16[pyarrow]" + name = "uint16" __doc__ = _dtype_docstring.format(dtype="uint16") @register_extension_dtype class UInt32ArrowDtype(IntegerArrowDtype): type = pa.uint32() - name = "uint32[pyarrow]" + name = "uint32" __doc__ = _dtype_docstring.format(dtype="uint32") @register_extension_dtype class UInt64ArrowDtype(IntegerArrowDtype): type = pa.uint64() - name = "uint64[pyarrow]" + name = "uint64" __doc__ = _dtype_docstring.format(dtype="uint64") diff --git a/pandas/core/arrays/arrow/numeric.py b/pandas/core/arrays/arrow/numeric.py index a194829ff6977..3b55d34f6e67b 100644 --- a/pandas/core/arrays/arrow/numeric.py +++ b/pandas/core/arrays/arrow/numeric.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import ( Any, Callable, @@ -61,7 +63,9 @@ def dtype(self) -> NumericArrowDtype: @classmethod def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False): - return cls(pa.chunked_array(scalars, type=cls._dtype_cls._default_pa_dtype)) + if dtype is None: + dtype = cls._dtype_cls._default_pa_dtype + return cls(pa.chunked_array([scalars], type=dtype.type)) @classmethod def _from_sequence_of_strings(cls, strings, *, dtype=None, copy: bool = False):