diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c911edfa03670..11173a54cdf72 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -46,6 +46,10 @@ repos: - id: codespell types_or: [python, rst, markdown, cython, c] additional_dependencies: [tomli] + exclude: | + (?x) + ^pandas/_libs/include/pandas/vendored/nanoarrow.h + |pandas/_libs/src/vendored/nanoarrow.c - repo: https://github.com/MarcoGorelli/cython-lint rev: v0.15.0 hooks: @@ -74,7 +78,11 @@ repos: rev: 1.6.1 hooks: - id: cpplint - exclude: ^pandas/_libs/include/pandas/vendored/klib + exclude: | + (?x) + ^pandas/_libs/include/pandas/vendored/klib + |pandas/_libs/include/pandas/vendored/nanoarrow.h + |pandas/_libs/src/vendored/nanoarrow.c args: [ --quiet, '--extensions=c,h', diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py index 0229cf15fbfb8..b23c1b6d2a342 100644 --- a/asv_bench/benchmarks/array.py +++ b/asv_bench/benchmarks/array.py @@ -28,6 +28,17 @@ def time_from_float_array(self): pd.array(self.values_float, dtype="boolean") +class BooleanArrayMem: + def setup_cache(self): + N = 250_000 + data = np.array([True] * N) + mask = np.array([False] * N) + return [pd.arrays.BooleanArray(data, mask)] * 500 + + def peakmem_array(self, arrays): + return [~x for x in arrays] + + class IntegerArray: def setup(self): N = 250_000 diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi index 78fee8f01319c..7a6d16c8cefd7 100644 --- a/pandas/_libs/arrays.pyi +++ b/pandas/_libs/arrays.pyi @@ -3,10 +3,13 @@ from typing import Sequence import numpy as np from pandas._typing import ( + ArrayLike, AxisInt, DtypeObj, + PositionalIndexer, Self, Shape, + type_t, ) class NDArrayBacked: @@ -38,3 +41,35 @@ class NDArrayBacked: def _concat_same_type( cls, to_concat: Sequence[Self], axis: AxisInt = ... ) -> Self: ... + +class BitmaskArray: + parent: Self + def __init__(self, data: np.ndarray | Self) -> None: ... + def __len__(self) -> int: ... + def __setitem__(self, key: PositionalIndexer, value: ArrayLike | bool) -> None: ... + def __getitem__(self, key: PositionalIndexer) -> bool: ... + def __invert__(self) -> Self: ... + def __and__(self, other: np.ndarray | Self | bool) -> np.ndarray: ... + def __or__(self, other: np.ndarray | Self | bool) -> np.ndarray: ... + def __xor__(self, other: np.ndarray | Self | bool) -> np.ndarray: ... + def __getstate__(self) -> dict: ... + def __setstate__(self, other: dict) -> None: ... + def __iter__(self): ... + @classmethod + def concatenate(cls, objs: list[Self], axis: int) -> Self: ... + @property + def size(self) -> int: ... + @property + def nbytes(self) -> int: ... + @property + def bytes(self) -> bytes: ... + @property + def shape(self) -> tuple[int, ...]: ... + @property + def dtype(self) -> type_t[bool]: ... + def any(self) -> bool: ... + def all(self) -> bool: ... + def sum(self) -> int: ... + def take_1d(self, indices: np.ndarray, axis: int) -> Self: ... + def copy(self) -> Self: ... + def to_numpy(self) -> np.ndarray: ... 
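The `BitmaskArray` stub added to `pandas/_libs/arrays.pyi` is the whole Python-facing surface of the new class. As a review aid, here is a rough pure-Python model of the semantics the stub implies: one bit per element, LSB-first within each byte (Arrow's bit order), and `nbytes == ceil(len / 8)`. This is a sketch of the layout only, not the Cython implementation; the bit arithmetic is the standard Arrow convention used by nanoarrow's `ArrowBitGet`/`ArrowBitSetTo`.

```python
import numpy as np

class PyBitmask:
    """Toy model of the stubbed BitmaskArray semantics: one bit per element,
    LSB-first within each byte (Arrow's bit order). Not the real class."""

    def __init__(self, values: np.ndarray) -> None:
        self._n = len(values)
        # np.packbits(..., bitorder="little") matches Arrow's bit layout
        self._buf = bytearray(np.packbits(values.astype(bool), bitorder="little"))

    def __len__(self) -> int:
        return self._n

    def __getitem__(self, i: int) -> bool:
        # same arithmetic as nanoarrow's ArrowBitGet
        return (self._buf[i // 8] >> (i % 8)) & 1 == 1

    def __setitem__(self, i: int, value: bool) -> None:
        # same arithmetic as nanoarrow's ArrowBitSetTo
        if value:
            self._buf[i // 8] |= 1 << (i % 8)
        else:
            self._buf[i // 8] &= ~(1 << (i % 8)) & 0xFF

    @property
    def nbytes(self) -> int:
        return (self._n + 7) // 8  # eight elements per byte

    def to_numpy(self) -> np.ndarray:
        packed = np.frombuffer(bytes(self._buf), dtype=np.uint8)
        return np.unpackbits(packed, count=self._n, bitorder="little").astype(bool)

mask = PyBitmask(np.array([True, False] * 150))
mask[1] = True
assert mask[1] and mask.nbytes == 38  # 300 bits in 38 bytes, vs 300 bytes as bool ndarray
assert mask.to_numpy()[:3].tolist() == [True, True, True]
```

The roughly 8x reduction versus one byte per element in a bool ndarray is the effect the new `BooleanArrayMem` peakmem benchmark is positioned to observe.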
diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 718fb358e26bc..8f0f0f58db983 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -7,10 +7,59 @@ import numpy as np cimport numpy as cnp from cpython cimport PyErr_Clear -from numpy cimport ndarray +from cpython.slice cimport PySlice_Unpack +from libc.stdlib cimport ( + free, + malloc, +) +from libc.string cimport memcpy +from numpy cimport ( + int8_t, + int64_t, + ndarray, + uint8_t, +) + +from pandas.core.common import is_null_slice cnp.import_array() +cdef extern from "pandas/vendored/nanoarrow.h": + struct ArrowBuffer: + uint8_t* data + int64_t size_bytes + int64_t capacity_bytes + + struct ArrowBitmap: + ArrowBuffer buffer + int64_t size_bits + + void ArrowBitmapInit(ArrowBitmap*) + void ArrowBitmapReserve(ArrowBitmap*, int64_t) + void ArrowBitmapAppendUnsafe(ArrowBitmap*, uint8_t, int64_t) + void ArrowBitmapAppendInt8Unsafe(ArrowBitmap*, const int8_t *, int64_t) + void ArrowBitmapReset(ArrowBitmap*) + void ArrowBitsUnpackInt8(const uint8_t*, int64_t, int64_t, int8_t*) + int8_t ArrowBitGet(const uint8_t*, int64_t) + void ArrowBitSetTo(uint8_t*, int64_t, uint8_t) + void ArrowBitsSetTo(uint8_t*, int64_t, int64_t, uint8_t) + int64_t ArrowBitCountSet(const uint8_t*, int64_t, int64_t) + void ArrowBitmapReset(ArrowBitmap*) + +cdef extern from "pandas/bitmask_algorithms.h": + void ConcatenateBitmapData(const ArrowBitmap**, size_t, ArrowBitmap*) + bint BitmapAny(const ArrowBitmap*) + bint BitmapAll(const ArrowBitmap*) + bint BitmapOr(const ArrowBitmap*, const ArrowBitmap*, ArrowBitmap*) + bint BitmapXor(const ArrowBitmap*, const ArrowBitmap*, ArrowBitmap*) + bint BitmapAnd(const ArrowBitmap*, const ArrowBitmap*, ArrowBitmap*) + bint BitmapOrBool(const ArrowBitmap*, bint, ArrowBitmap*) + bint BitmapXorBool(const ArrowBitmap*, bint, ArrowBitmap*) + bint BitmapAndBool(const ArrowBitmap*, bint, ArrowBitmap*) + bint BitmapInvert(const ArrowBitmap*, ArrowBitmap*) + bint BitmapTake(const ArrowBitmap*, const int64_t*, size_t, ArrowBitmap*) + bint BitmapPutFromBufferMask(ArrowBitmap*, const uint8_t*, size_t, uint8_t) + @cython.freelist(16) cdef class NDArrayBacked: @@ -189,3 +238,666 @@ cdef class NDArrayBacked: new_values = [obj._ndarray for obj in to_concat] new_arr = cnp.PyArray_Concatenate(new_values, axis) return to_concat[0]._from_backing_data(new_arr) + + +cdef class BitmaskArray: + cdef: + ArrowBitmap bitmap + bint buffer_owner # set when parent is None, but gives C-level access + # NumPy compatibility + cdef Py_ssize_t ndim + cdef Py_ssize_t[2] shape + cdef Py_ssize_t[2] strides + # Buffer protocol support + int n_consumers + uint8_t* memview_buf + cdef public: + object parent # assignments gives RC to ensure proper buffer lifecycle + + @cython.boundscheck(False) + @cython.wraparound(False) + cdef void init_from_ndarray(self, const uint8_t[::1] arr) noexcept: + cdef ArrowBitmap bitmap + # As long as we have a 1D arr argument we can use .shape[0] to avoid + # a call to Python via .size + cdef int64_t nobs = arr.shape[0] + ArrowBitmapInit(&bitmap) + ArrowBitmapReserve(&bitmap, nobs) + ArrowBitmapAppendInt8Unsafe(&bitmap, &arr[0], nobs) + self.bitmap = bitmap + self.buffer_owner = True + + cdef void init_from_bitmaskarray(self, BitmaskArray bma) noexcept: + self.bitmap = bma.bitmap + self.buffer_owner = False + self.ndim = bma.ndim + self.shape[0] = bma.shape[0] + self.strides[0] = bma.strides[0] + if self.ndim == 2: + self.shape[1] = bma.shape[1] + self.strides[1] = bma.strides[1] + + def __init__(self, data): 
+        cdef ndarray arr
+        if cnp.PyArray_Check(data):
+            arr = data
+            if not cnp.PyArray_IS_C_CONTIGUOUS(arr):
+                arr = cnp.PyArray_GETCONTIGUOUS(arr)
+
+            self.init_from_ndarray(arr.ravel())
+            self.ndim = arr.ndim
+            self.shape[0] = arr.shape[0]
+            self.strides[0] = arr.strides[0]
+            if self.ndim == 2:
+                self.shape[1] = arr.shape[1]
+                self.strides[1] = arr.strides[1]
+            self.parent = None
+        elif isinstance(data, BitmaskArray):
+            self.init_from_bitmaskarray(data)
+            self.parent = data
+        else:
+            raise TypeError("Unsupported argument to BitmaskArray constructor")
+
+    def __dealloc__(self):
+        if self.buffer_owner:
+            ArrowBitmapReset(&self.bitmap)
+
+    @staticmethod
+    cdef BitmaskArray copy_from_bitmaskarray(BitmaskArray old_bma):
+        """
+        Construct a new BitmaskArray by copying the bitmap of an existing
+        instance. The copy owns its buffer and manages the subsequent
+        lifecycle of the bitmap.
+        """
+        # Bypass __init__ calls
+        cdef BitmaskArray bma = BitmaskArray.__new__(BitmaskArray)
+        cdef uint8_t* buf
+        cdef ArrowBitmap bitmap
+        # TODO: this leaks a bit into the internals of the nanoarrow bitmap
+        # We may want to upstream a BitmapCopy function instead
+        ArrowBitmapInit(&bitmap)
+        buf = <uint8_t*>malloc(old_bma.bitmap.buffer.size_bytes)
+        memcpy(buf, old_bma.bitmap.buffer.data, old_bma.bitmap.buffer.size_bytes)
+        bitmap.buffer.size_bytes = old_bma.bitmap.buffer.size_bytes
+        bitmap.buffer.capacity_bytes = old_bma.bitmap.buffer.capacity_bytes
+        bitmap.size_bits = old_bma.bitmap.size_bits
+        bitmap.buffer.data = buf
+
+        bma.bitmap = bitmap
+        bma.buffer_owner = True
+        bma.ndim = old_bma.ndim
+        bma.shape = old_bma.shape
+        bma.strides = old_bma.strides
+        bma.parent = None
+
+        return bma
+
+    def __len__(self):
+        return self.bitmap.size_bits
+
+    def __repr__(self):
+        if self.parent:
+            par = object.__repr__(self.parent)
+        else:
+            par = None
+
+        data = self.bytes
+
+        return (
+            f"{object.__repr__(self)}\nparent: {par}\ndata: {data}\n"
+        )
+
+    @cython.wraparound(False)
+    @cython.boundscheck(False)
+    @staticmethod
+    cdef BitmaskArray c_concatenate(list objs):
+        cdef Py_ssize_t i
+        cdef int64_t total_bits = 0
+        cdef BitmaskArray current_bma
+        cdef Py_ssize_t nbitmaps = len(objs)
+
+        cdef BitmaskArray first_bma = objs[0]
+        cdef int expected_ndim = first_bma.ndim
+        cdef Py_ssize_t expected_stride0 = first_bma.strides[0]
+        cdef Py_ssize_t expected_shape1, expected_stride1
+        if expected_ndim == 2:
+            expected_stride1 = first_bma.strides[1]
+            expected_shape1 = first_bma.shape[1]
+
+        cdef Py_ssize_t dim0shape = 0
+
+        cdef ArrowBitmap** bitmaps = <ArrowBitmap**>malloc(
+            sizeof(ArrowBitmap*) * nbitmaps
+        )
+
+        for i in range(nbitmaps):
+            current_bma = objs[i]
+            if (
+                current_bma.ndim != expected_ndim
+                or current_bma.strides[0] != expected_stride0
+                or (
+                    expected_ndim == 2 and (
+                        current_bma.shape[1] != expected_shape1
+                        or current_bma.strides[1] != expected_stride1
+                    )
+                )
+            ):
+                free(bitmaps)
+                raise NotImplementedError(
+                    "BitmaskArray.concatenate does not support broadcasting"
+                )
+            total_bits += current_bma.bitmap.size_bits
+            bitmaps[i] = &current_bma.bitmap
+            dim0shape += current_bma.shape[0]
+
+        # Bypass __init__ calls
+        cdef BitmaskArray bma = BitmaskArray.__new__(BitmaskArray)
+        cdef ArrowBitmap bitmap
+
+        ArrowBitmapInit(&bitmap)
+        ArrowBitmapReserve(&bitmap, total_bits)
+
+        ConcatenateBitmapData(bitmaps, nbitmaps, &bitmap)
+        free(bitmaps)
+
+        bma.bitmap = bitmap
+        bma.buffer_owner = True
+
+        bma.ndim = expected_ndim
+        bma.shape[0] = dim0shape  # only allowed because of axis=0 assumption
+        bma.strides[0] = expected_stride0
+        if expected_ndim == 2:
+            bma.shape[1] = expected_shape1
+ bma.strides[1] = expected_stride1 + + bma.parent = None + + return bma + + @classmethod + def concatenate(cls, objs, axis): + if axis != 0: + raise NotImplementedError( + "BitmaskArray.concatenate only implemented for axis=0" + ) + + return BitmaskArray.c_concatenate(objs) + + def __setitem__(self, key, value): + cdef const uint8_t[:] keymask + cdef const uint8_t[:] arr1d + cdef Py_ssize_t i = 0 + cdef Py_ssize_t ckey + cdef bint cvalue + cdef BitmaskArray self_ = self + + if isinstance(key, int): + ckey = key + cvalue = value + if ckey >= 0 and ckey < self.bitmap.size_bits: + ArrowBitSetTo(self.bitmap.buffer.data, ckey, cvalue) + return + + if is_null_slice(key) and isinstance(value, (int, bool)): + cvalue = value # blindly assuming ints are 0 or 1 + ArrowBitsSetTo( + self.bitmap.buffer.data, + 0, + self.bitmap.size_bits, + cvalue + ) + elif ( + isinstance(key, np.ndarray) + and key.dtype == bool + and isinstance(value, (int, bool)) + and len(key) == len(self) + ): + keymask = key + if BitmapPutFromBufferMask( + &self_.bitmap, + &keymask[0], + keymask.shape[0], + value + ) != 0: + raise ValueError("BitmaskArray.__setitem__ failed!") + else: + arr = self.to_numpy() + arr[key] = value + arr1d = arr.ravel() + for i in range(arr1d.shape[0]): + ArrowBitSetTo(self.bitmap.buffer.data, i, arr1d[i]) + + def __getitem__(self, key): + cdef Py_ssize_t ckey + cdef Py_ssize_t start, stop, step + cdef BitmaskArray bma + cdef ArrowBitmap bitmap + cdef int64_t nbytes, nbits + cdef BitmaskArray self_ = self + cdef bint result + # to_numpy can be expensive, so try to avoid for simple cases + if isinstance(key, int) and self.ndim == 1: + ckey = key + if ckey >= 0 and ckey < self.bitmap.size_bits: + result = ArrowBitGet(self.bitmap.buffer.data, ckey) + return result + elif is_null_slice(key): + return self + elif isinstance(key, slice) and self.ndim == 1: + # fastpath for slices that start at 0 and step 1 at a time + # towards a positive number. 
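Stepping back to the concatenate path above: joining bitmaps lands in a dedicated C routine (`ConcatenateBitmapData`) because the operation is only byte-aligned when every input's length is a multiple of 8; otherwise every subsequent buffer must be shifted bit-by-bit. A small numpy sketch of the semantics the C routine has to implement (the C code presumably works on whole bytes where it can; this expansion to one byte per bit is purely illustrative):

```python
import numpy as np

def concat_bitmaps(parts: list[tuple[bytes, int]]) -> tuple[bytes, int]:
    # Unpack to one byte per bit, join, and repack. The C routine avoids
    # this expansion, but the bit-shuffling requirement is the same.
    bits = np.concatenate([
        np.unpackbits(np.frombuffer(buf, dtype=np.uint8), count=n, bitorder="little")
        for buf, n in parts
    ])
    return np.packbits(bits, bitorder="little").tobytes(), bits.size

# 3 bits (1,1,1) followed by 2 bits (1,0): the second part starts at bit 3,
# not at a byte boundary, so its bits move within the output byte.
out, n = concat_bitmaps([(b"\x07", 3), (b"\x01", 2)])
assert n == 5 and out == b"\x0f"
```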
+ # TODO: upstream generic ArrowBitsGet function in nanoarrow + PySlice_Unpack(key, &start, &stop, &step) + if start == 0 and stop > 0 and step == 1: + if stop > self_.bitmap.size_bits: + nbits = self_.bitmap.size_bits + else: + nbits = stop + + nbytes = (nbits + 7) // 8 + + bma = BitmaskArray.__new__(BitmaskArray) + ArrowBitmapInit(&bitmap) + ArrowBitmapReserve(&bitmap, nbits) + memcpy(bitmap.buffer.data, self_.bitmap.buffer.data, nbytes) + bitmap.buffer.size_bytes = nbytes + bitmap.size_bits = nbits + + bma.bitmap = bitmap + bma.buffer_owner = True + bma.ndim = self_.ndim + bma.shape[0] = nbits + bma.strides = self_.strides + bma.parent = False + + return bma + + return self.to_numpy()[key] + + def __invert__(self): + # note that this inverts the entire byte, even if the + # bitmap only uses a few of the bits within that byte + # the remaining bits of the byte are of undefined value + # so be sure to only check bytes we need + cdef BitmaskArray self_ = self + cdef BitmaskArray bma = BitmaskArray.__new__(BitmaskArray) + cdef ArrowBitmap bitmap + cdef int ret + + ArrowBitmapInit(&bitmap) + ArrowBitmapReserve(&bitmap, self_.bitmap.size_bits) + + ret = BitmapInvert(&self_.bitmap, &bitmap) + if ret == -1: + raise RuntimeError("BitmapInvert failed") + + bma.bitmap = bitmap + bma.buffer_owner = True + bma.ndim = self_.ndim + bma.shape = self_.shape + bma.strides = self_.strides + bma.parent = None + + return bma + + def __and__(self, other): + cdef BitmaskArray other_bma, self_ = self # self_ required for Cython < 3 + cdef BitmaskArray bma + cdef ArrowBitmap bitmap + cdef bint bval + cdef int ret + + if isinstance(other, BitmaskArray): + # TODO: maybe should return Self here instead of ndarray + other_bma = other + if self_.bitmap.size_bits != other_bma.bitmap.size_bits: + raise ValueError("bitmaps are not equal size") + + ArrowBitmapInit(&bitmap) + ArrowBitmapReserve(&bitmap, self_.bitmap.size_bits) + ret = BitmapAnd(&self_.bitmap, &other_bma.bitmap, &bitmap) + if ret == -1: + raise RuntimeError("BitmapAnd failed") + + bma = BitmaskArray.__new__(BitmaskArray) + bma.bitmap = bitmap + bma.buffer_owner = True + bma.ndim = self_.ndim + bma.shape = self_.shape + bma.strides = self_.strides + + return bma + elif isinstance(other, bool): + bval = other + ArrowBitmapInit(&bitmap) + ArrowBitmapReserve(&bitmap, self_.bitmap.size_bits) + ret = BitmapAndBool(&self_.bitmap, bval, &bitmap) + if ret == -1: + raise RuntimeError("BitmapAndBool failed") + + bma = BitmaskArray.__new__(BitmaskArray) + bma.bitmap = bitmap + bma.buffer_owner = True + bma.ndim = self_.ndim + bma.shape = self_.shape + bma.strides = self_.strides + + return bma + + return self.to_numpy() & other + + def __or__(self, other): + cdef ndarray[uint8_t] result + cdef BitmaskArray other_bma, self_ = self # self_ required for Cython < 3 + cdef BitmaskArray bma + cdef ArrowBitmap bitmap + cdef bint bval + cdef int ret + + if isinstance(other, BitmaskArray): + other_bma = other + if self_.bitmap.size_bits == 0: + result = np.empty([], dtype=bool) + if self_.ndim == 2: + return result.reshape(self_.shape[0], self_.shape[1]) + return result + + if self_.bitmap.size_bits != other_bma.bitmap.size_bits: + raise ValueError("bitmaps are not equal size") + + ArrowBitmapInit(&bitmap) + ArrowBitmapReserve(&bitmap, self_.bitmap.size_bits) + ret = BitmapOr(&self_.bitmap, &other_bma.bitmap, &bitmap) + if ret == -1: + raise RuntimeError("BitmapOr failed") + + bma = BitmaskArray.__new__(BitmaskArray) + bma.bitmap = bitmap + bma.buffer_owner = True + bma.ndim 
= self_.ndim + bma.shape = self_.shape + bma.strides = self_.strides + + return bma + elif isinstance(other, bool): + bval = other + ArrowBitmapInit(&bitmap) + ArrowBitmapReserve(&bitmap, self_.bitmap.size_bits) + ret = BitmapOrBool(&self_.bitmap, bval, &bitmap) + if ret == -1: + raise RuntimeError("BitmapOrBool failed") + + bma = BitmaskArray.__new__(BitmaskArray) + bma.bitmap = bitmap + bma.buffer_owner = True + bma.ndim = self_.ndim + bma.shape = self_.shape + bma.strides = self_.strides + + return bma + + return self.to_numpy() | other + + def __xor__(self, other): + cdef BitmaskArray other_bma, self_ = self # self_ required for Cython < 3 + cdef BitmaskArray bma + cdef ArrowBitmap bitmap + cdef bint bval + cdef int ret + + if isinstance(other, BitmaskArray): + # TODO: maybe should return Self here instead of ndarray + other_bma = other + if self_.bitmap.size_bits != other_bma.bitmap.size_bits: + raise ValueError("bitmaps are not equal size") + + ArrowBitmapInit(&bitmap) + ArrowBitmapReserve(&bitmap, self_.bitmap.size_bits) + ret = BitmapXor(&self_.bitmap, &other_bma.bitmap, &bitmap) + if ret == -1: + raise RuntimeError("BitmapXor failed") + + bma = BitmaskArray.__new__(BitmaskArray) + bma.bitmap = bitmap + bma.buffer_owner = True + bma.ndim = self_.ndim + bma.shape = self_.shape + bma.strides = self_.strides + + return bma + elif isinstance(other, bool): + bval = other + ArrowBitmapInit(&bitmap) + ArrowBitmapReserve(&bitmap, self_.bitmap.size_bits) + ret = BitmapXorBool(&self_.bitmap, bval, &bitmap) + if ret == -1: + raise RuntimeError("BitmapXorBool failed") + + bma = BitmaskArray.__new__(BitmaskArray) + bma.bitmap = bitmap + bma.buffer_owner = True + bma.ndim = self_.ndim + bma.shape = self_.shape + bma.strides = self_.strides + + return bma + + return self.to_numpy() ^ other + + def __getstate__(self): + cdef BitmaskArray self_ = self + state = { + "parent": self.parent, + "ndim": self_.ndim, + "shape0": self_.shape[0], + "stride0": self_.strides[0], + "n_consumers": self_.n_consumers, + "buffer_owner": self_.buffer_owner, + # Private ArrowBitmap attributes below + "bitmap.buffer.size_bytes": self_.bitmap.buffer.size_bytes, + "bitmap.buffer.capacity_bytes": self_.bitmap.buffer.capacity_bytes, + "bitmap.size_bits": self_.bitmap.size_bits + } + + if self_.ndim == 2: + state["shape1"] = self_.shape[1] + state["stride1"] = self_.strides[1] + + # memview should only exist when n_consumers > 0 + if self_.n_consumers > 0: + memview_buf_data = bytearray(len(self)) + for i in range(len(self)): + memview_buf_data[i] = self_.memview_buf[i] + + state["memview_buf_data"] = memview_buf_data + + # Only parents own data + if self_.buffer_owner: + bitmap_data = bytearray(self_.bitmap.buffer.size_bytes) + for i in range(self_.bitmap.buffer.size_bytes): + bitmap_data[i] = self_.bitmap.buffer.data[i] + + state["bitmap_data"] = bitmap_data + + return state + + def __setstate__(self, state): + cdef ArrowBitmap bitmap + cdef BitmaskArray self_ = self, other + self.parent = state["parent"] + self_.ndim = state["ndim"] + self_.shape[0] = state["shape0"] + self_.strides[0] = state["stride0"] + self_.n_consumers = state["n_consumers"] + self_.buffer_owner = state["buffer_owner"] + + nbytes = state["bitmap.buffer.size_bytes"] + capacity_bytes = state["bitmap.buffer.capacity_bytes"] + nbits = state["bitmap.size_bits"] + + if self_.ndim == 2: + self_.shape[1] = state["shape1"] + self_.strides[1] = state["stride1"] + + if self_.n_consumers > 0: + self_.memview_buf = malloc(nbits) + memview_buf_data = 
state["memview_buf_data"] + for i in range(nbits): + self_.memview_buf[i] = memview_buf_data[i] + + if not self_.buffer_owner: + other = self.parent + self_.bitmap = other.bitmap + self_.bitmap.size_bits = nbits + self_.bitmap.buffer.size_bytes = nbytes + self_.bitmap.buffer.capacity_bytes = capacity_bytes + else: + ArrowBitmapInit(&bitmap) + + buf = malloc(nbytes) + data = state["bitmap_data"] + for i in range(nbytes): + buf[i] = data[i] + + bitmap.buffer.data = buf + bitmap.buffer.size_bytes = nbytes + bitmap.buffer.capacity_bytes = nbytes + bitmap.size_bits = nbits + self_.bitmap = bitmap + + @cython.boundscheck(False) + @cython.wraparound(False) + def __iter__(self): + cdef Py_ssize_t i + cdef BitmaskArray self_ = self # self_ required for Cython < 3 + cdef bint result + for i in range(self_.bitmap.size_bits): + result = ArrowBitGet(self_.bitmap.buffer.data, i) + yield result + + def __getbuffer__(self, Py_buffer *buffer, int flags): + cdef BitmaskArray self_ = self + + if self_.n_consumers == 0: + self_.memview_buf = malloc(self_.bitmap.size_bits) + ArrowBitsUnpackInt8( + self_.bitmap.buffer.data, + 0, + self_.bitmap.size_bits, + self_.memview_buf + ) + + buffer.buf = self_.memview_buf + buffer.format = "?" + buffer.internal = NULL + buffer.itemsize = 1 + buffer.len = self_.bitmap.size_bits + buffer.ndim = self_.ndim + buffer.obj = self + buffer.readonly = 1 + buffer.shape = self_.shape + buffer.strides = self_.strides + buffer.suboffsets = NULL + + self_.n_consumers += 1 + + def __releasebuffer__(self, Py_buffer *buffer): + cdef BitmaskArray self_ = self + self_.n_consumers -= 1 + if self_.n_consumers == 0: + free(self_.memview_buf) + + @property + def size(self) -> int: + return self.bitmap.size_bits + + @property + def nbytes(self) -> int: + return self.bitmap.buffer.size_bytes + + @property + def bytes(self): + cdef Py_ssize_t i, nbytes = self.bitmap.buffer.size_bytes + arr_bytes = bytearray(nbytes) + for i in range(nbytes): + arr_bytes[i] = self.bitmap.buffer.data[i] + + return bytes(arr_bytes) + + @property + def shape(self): + """Strictly for NumPy compat in mask_ops""" + cdef BitmaskArray self_ = self + if self_.ndim == 1: + return tuple((self_.shape[0],)) + return tuple((self_.shape[0], self_.shape[1])) + + @property + def dtype(self): + """Strictly for NumPy compat in mask_ops""" + return np.dtype("bool") + + def any(self) -> bool: + cdef bint result = BitmapAny(&self.bitmap) + return result + + def all(self) -> bool: + return BitmapAll(&self.bitmap) + + def sum(self) -> int: + return ArrowBitCountSet(self.bitmap.buffer.data, 0, self.bitmap.size_bits) + + def take_1d( + self, + const int64_t[:] indices, + const int axis=0, + ): + cdef BitmaskArray self_ = self + cdef Py_ssize_t nindices = len(indices) + if axis != 0: + raise NotImplementedError( + "BitmaskArray.take_1d only implemented for axis=0" + ) + + if nindices <= 0: + raise NotImplementedError( + "take_1d does not support empty takes" + ) + + cdef ArrowBitmap bitmap + cdef BitmaskArray bma = BitmaskArray.__new__(BitmaskArray) + + ArrowBitmapInit(&bitmap) + ArrowBitmapReserve(&bitmap, nindices) + + if BitmapTake(&self_.bitmap, &indices[0], nindices, &bitmap) != 0: + ArrowBitmapReset(&bitmap) + raise ValueError("take_1d does not support negative indexing") + + bma.bitmap = bitmap + bma.buffer_owner = True + + bma.ndim = self_.ndim + bma.shape[0] = indices.shape[0] + bma.strides = self_.strides + + bma.parent = None + + return bma + + def copy(self): + return BitmaskArray.copy_from_bitmaskarray(self) + + def 
to_numpy(self) -> ndarray: + cdef BitmaskArray self_ = self + cdef ndarray[uint8_t] result = np.empty(self_.bitmap.size_bits, dtype=bool) + + ArrowBitsUnpackInt8( + self_.bitmap.buffer.data, + 0, + self_.bitmap.size_bits, + cnp.PyArray_BYTES(result), + ) + + if self_.ndim == 2: + return result.reshape(self_.shape[0], self_.shape[1]) + return result diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index 0ac914e86f699..c63c8ab6e19e0 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -6,6 +6,7 @@ from typing import ( import numpy as np +from pandas._libs.arrays import BitmaskArray from pandas._typing import npt def unique_label_indices( @@ -231,7 +232,7 @@ class IntpHashTable(HashTable): ... def duplicated( values: np.ndarray, keep: Literal["last", "first", False] = ..., - mask: npt.NDArray[np.bool_] | None = ..., + mask: npt.NDArray[np.bool_] | BitmaskArray | None = ..., ) -> npt.NDArray[np.bool_]: ... def mode( values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = ... @@ -239,7 +240,7 @@ def mode( def value_count( values: np.ndarray, dropna: bool, - mask: npt.NDArray[np.bool_] | None = ..., + mask: npt.NDArray[np.bool_] | BitmaskArray | None = ..., ) -> tuple[np.ndarray, npt.NDArray[np.int64], int]: ... # np.ndarray[same-as-values] # arr and values should have same dtype diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 1cf5d734705af..4a94e5c256eb3 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -665,7 +665,7 @@ cdef class {{name}}HashTable(HashTable): rmd = result_mask.data if use_mask: - mask_values = mask.view("uint8") + mask_values = mask if use_na_value: # We need this na_value2 because we want to allow users diff --git a/pandas/_libs/include/pandas/bitmask_algorithms.h b/pandas/_libs/include/pandas/bitmask_algorithms.h new file mode 100644 index 0000000000000..fa70b1a472fc4 --- /dev/null +++ b/pandas/_libs/include/pandas/bitmask_algorithms.h @@ -0,0 +1,64 @@ +/* + +Copyright (c) 2023, PyData Development Team +All rights reserved. + +Distributed under the terms of the BSD Simplified License. + +The full license is in the LICENSE file, distributed with this software. + +*/ + +#pragma once + +#include +#include +#include + +#include "pandas/vendored/nanoarrow.h" + +/* + Concatenates the data from an array of bitmaps with size nbitmaps + into a buffer "out". Order is preserved and out is assumed to have + enough bytes to hold all elements. +*/ +void ConcatenateBitmapData(const struct ArrowBitmap **bitmaps, size_t nbitmaps, + struct ArrowBitmap *out); + +bool BitmapAny(const struct ArrowBitmap *bitmap); +bool BitmapAll(const struct ArrowBitmap *bitmap); + +/* Returns -1 on failure. On success returns 0 and writes to out */ +int BitmapOr(const struct ArrowBitmap *bitmap1, + const struct ArrowBitmap *bitmap2, struct ArrowBitmap *out); + +/* Returns -1 on failure. On success returns 0 and writes to out */ +int BitmapOrBool(const struct ArrowBitmap *bitmap1, bool, + struct ArrowBitmap *out); + +/* Returns -1 on failure. On success returns 0 and writes to out */ +int BitmapXor(const struct ArrowBitmap *bitmap1, + const struct ArrowBitmap *bitmap2, struct ArrowBitmap *out); + +/* Returns -1 on failure. On success returns 0 and writes to out */ +int BitmapXorBool(const struct ArrowBitmap *bitmap1, bool, + struct ArrowBitmap *out); + +/* Returns -1 on failure. 
On success returns 0 and writes to out */ +int BitmapAnd(const struct ArrowBitmap *bitmap1, + const struct ArrowBitmap *bitmap2, struct ArrowBitmap *out); + +/* Returns -1 on failure. On success returns 0 and writes to out */ +int BitmapAndBool(const struct ArrowBitmap *bitmap1, bool, + struct ArrowBitmap *out); + +/* Returns -1 on failure. On success returns 0 and writes to out */ +int BitmapInvert(const struct ArrowBitmap *bitmap, struct ArrowBitmap *out); + +/* Returns -1 on failure. On success returns 0 and writes to out */ +int BitmapTake(const struct ArrowBitmap *bitmap, const int64_t *indices, + size_t nindices, struct ArrowBitmap *out); + +/* Returns -1 on failure. On success returns 0 and writes to out */ +int BitmapPutFromBufferMask(struct ArrowBitmap *bitmap, const uint8_t *buf, + size_t n, uint8_t value); diff --git a/pandas/_libs/include/pandas/vendored/nanoarrow.h b/pandas/_libs/include/pandas/vendored/nanoarrow.h new file mode 100644 index 0000000000000..30fcf04008eba --- /dev/null +++ b/pandas/_libs/include/pandas/vendored/nanoarrow.h @@ -0,0 +1,3433 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef NANOARROW_BUILD_ID_H_INCLUDED +#define NANOARROW_BUILD_ID_H_INCLUDED + +#define NANOARROW_VERSION_MAJOR 0 +#define NANOARROW_VERSION_MINOR 3 +#define NANOARROW_VERSION_PATCH 0 +#define NANOARROW_VERSION "0.3.0-SNAPSHOT" + +#define NANOARROW_VERSION_INT \ + (NANOARROW_VERSION_MAJOR * 10000 + NANOARROW_VERSION_MINOR * 100 + \ + NANOARROW_VERSION_PATCH) + +// #define NANOARROW_NAMESPACE YourNamespaceHere + +#endif +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
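The logical operations declared in `pandas/_libs/include/pandas/bitmask_algorithms.h` above can work a byte (or word) at a time because AND/OR/XOR act independently on each bit, so operating on packed bytes is equivalent to operating elementwise on the booleans. A numpy check of that equivalence, including the trailing-bit caveat that the Cython `__invert__` comment calls out (this is a verification sketch, not how the C code is written):

```python
import numpy as np

rng = np.random.default_rng(42)
a = rng.integers(0, 2, 75).astype(bool)  # 75 bits: last byte only partly used
b = rng.integers(0, 2, 75).astype(bool)
pa, pb = (np.packbits(x, bitorder="little") for x in (a, b))

for byte_op, bool_op in [(np.bitwise_and, np.logical_and),
                         (np.bitwise_or, np.logical_or),
                         (np.bitwise_xor, np.logical_xor)]:
    packed = byte_op(pa, pb)  # one op per byte covers eight elements
    unpacked = np.unpackbits(packed, count=75, bitorder="little").astype(bool)
    assert (unpacked == bool_op(a, b)).all()

# Inversion also flips the unused padding bits of the final byte, which is
# why consumers must only read the first size_bits bits afterwards.
inv = np.unpackbits(~pa, count=75, bitorder="little").astype(bool)
assert (inv == ~a).all()
```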
+ +#ifndef NANOARROW_NANOARROW_TYPES_H_INCLUDED +#define NANOARROW_NANOARROW_TYPES_H_INCLUDED + +#include +#include + + + +#if defined(NANOARROW_DEBUG) && !defined(NANOARROW_PRINT_AND_DIE) +#include +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +// Extra guard for versions of Arrow without the canonical guard +#ifndef ARROW_FLAG_DICTIONARY_ORDERED + +/// \defgroup nanoarrow-arrow-cdata Arrow C Data interface +/// +/// The Arrow C Data (https://arrow.apache.org/docs/format/CDataInterface.html) +/// and Arrow C Stream (https://arrow.apache.org/docs/format/CStreamInterface.html) +/// interfaces are part of the +/// Arrow Columnar Format specification +/// (https://arrow.apache.org/docs/format/Columnar.html). See the Arrow documentation for +/// documentation of these structures. +/// +/// @{ + +#ifndef ARROW_C_DATA_INTERFACE +#define ARROW_C_DATA_INTERFACE + +#define ARROW_FLAG_DICTIONARY_ORDERED 1 +#define ARROW_FLAG_NULLABLE 2 +#define ARROW_FLAG_MAP_KEYS_SORTED 4 + +struct ArrowSchema { + // Array type description + const char* format; + const char* name; + const char* metadata; + int64_t flags; + int64_t n_children; + struct ArrowSchema** children; + struct ArrowSchema* dictionary; + + // Release callback + void (*release)(struct ArrowSchema*); + // Opaque producer-specific data + void* private_data; +}; + +struct ArrowArray { + // Array data description + int64_t length; + int64_t null_count; + int64_t offset; + int64_t n_buffers; + int64_t n_children; + const void** buffers; + struct ArrowArray** children; + struct ArrowArray* dictionary; + + // Release callback + void (*release)(struct ArrowArray*); + // Opaque producer-specific data + void* private_data; +}; + +#endif // ARROW_C_DATA_INTERFACE + +#ifndef ARROW_C_STREAM_INTERFACE +#define ARROW_C_STREAM_INTERFACE + +struct ArrowArrayStream { + // Callback to get the stream type + // (will be the same for all arrays in the stream). + // + // Return value: 0 if successful, an `errno`-compatible error code otherwise. + // + // If successful, the ArrowSchema must be released independently from the stream. + int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out); + + // Callback to get the next array + // (if no error and the array is released, the stream has ended) + // + // Return value: 0 if successful, an `errno`-compatible error code otherwise. + // + // If successful, the ArrowArray must be released independently from the stream. + int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out); + + // Callback to get optional detailed error information. + // This must only be called if the last stream operation failed + // with a non-0 return code. + // + // Return value: pointer to a null-terminated character array describing + // the last error, or NULL if no description is available. + // + // The returned pointer is only valid until the next operation on this stream + // (including release). + const char* (*get_last_error)(struct ArrowArrayStream*); + + // Release callback: release the stream's own resources. + // Note that arrays returned by `get_next` must be individually released. 
+ void (*release)(struct ArrowArrayStream*); + + // Opaque producer-specific data + void* private_data; +}; + +#endif // ARROW_C_STREAM_INTERFACE +#endif // ARROW_FLAG_DICTIONARY_ORDERED + +/// \brief Move the contents of src into dst and set src->release to NULL +static inline void ArrowSchemaMove(struct ArrowSchema* src, struct ArrowSchema* dst) { + memcpy(dst, src, sizeof(struct ArrowSchema)); + src->release = NULL; +} + +/// \brief Move the contents of src into dst and set src->release to NULL +static inline void ArrowArrayMove(struct ArrowArray* src, struct ArrowArray* dst) { + memcpy(dst, src, sizeof(struct ArrowArray)); + src->release = NULL; +} + +/// \brief Move the contents of src into dst and set src->release to NULL +static inline void ArrowArrayStreamMove(struct ArrowArrayStream* src, + struct ArrowArrayStream* dst) { + memcpy(dst, src, sizeof(struct ArrowArrayStream)); + src->release = NULL; +} + +/// @} + +// Utility macros +#define _NANOARROW_CONCAT(x, y) x##y +#define _NANOARROW_MAKE_NAME(x, y) _NANOARROW_CONCAT(x, y) + +#define _NANOARROW_RETURN_NOT_OK_IMPL(NAME, EXPR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) return NAME; \ + } while (0) + +#define _NANOARROW_CHECK_RANGE(x_, min_, max_) \ + NANOARROW_RETURN_NOT_OK((x_ >= min_ && x_ <= max_) ? NANOARROW_OK : EINVAL) + +#define _NANOARROW_CHECK_UPPER_LIMIT(x_, max_) \ + NANOARROW_RETURN_NOT_OK((x_ <= max_) ? NANOARROW_OK : EINVAL) + +#if defined(NANOARROW_DEBUG) +#define _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL(NAME, EXPR, ERROR_PTR_EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) { \ + ArrowErrorSet((ERROR_PTR_EXPR), "%s failed with errno %d\n* %s:%d", EXPR_STR, \ + NAME, __FILE__, __LINE__); \ + return NAME; \ + } \ + } while (0) +#else +#define _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL(NAME, EXPR, ERROR_PTR_EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) { \ + ArrowErrorSet((ERROR_PTR_EXPR), "%s failed with errno %d", EXPR_STR, NAME); \ + return NAME; \ + } \ + } while (0) +#endif + +/// \brief Return code for success. +/// \ingroup nanoarrow-errors +#define NANOARROW_OK 0 + +/// \brief Represents an errno-compatible error code +/// \ingroup nanoarrow-errors +typedef int ArrowErrorCode; + +/// \brief Check the result of an expression and return it if not NANOARROW_OK +/// \ingroup nanoarrow-errors +#define NANOARROW_RETURN_NOT_OK(EXPR) \ + _NANOARROW_RETURN_NOT_OK_IMPL(_NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR) + +/// \brief Check the result of an expression and return it if not NANOARROW_OK, +/// adding an auto-generated message to an ArrowError. +/// \ingroup nanoarrow-errors +/// +/// This macro is used to ensure that functions that accept an ArrowError +/// as input always set its message when returning an error code (e.g., when calling +/// a nanoarrow function that does *not* accept ArrowError). 
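The vendored header re-declares the Arrow C Data Interface structs behind the `ARROW_C_DATA_INTERFACE` guard so that nanoarrow can coexist with other Arrow headers in the same translation unit. As a review aid only (pandas does not use this), here is a minimal ctypes mirror of `ArrowSchema`; field order and types must match the ABI above exactly, and the self-reference requires assigning `_fields_` after the class exists:

```python
import ctypes

class ArrowSchema(ctypes.Structure):
    """ctypes mirror of the struct declared above, for illustration only."""
    pass

ArrowSchema._fields_ = [
    ("format", ctypes.c_char_p),
    ("name", ctypes.c_char_p),
    ("metadata", ctypes.c_char_p),
    ("flags", ctypes.c_int64),
    ("n_children", ctypes.c_int64),
    ("children", ctypes.POINTER(ctypes.POINTER(ArrowSchema))),
    ("dictionary", ctypes.POINTER(ArrowSchema)),
    ("release", ctypes.c_void_p),  # void (*release)(struct ArrowSchema*)
    ("private_data", ctypes.c_void_p),
]

ARROW_FLAG_NULLABLE = 2  # matches the header's #define
schema = ArrowSchema(format=b"b", name=b"mask", flags=ARROW_FLAG_NULLABLE)
assert schema.flags & ARROW_FLAG_NULLABLE
```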
+#define NANOARROW_RETURN_NOT_OK_WITH_ERROR(EXPR, ERROR_EXPR) \ + _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL( \ + _NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR, ERROR_EXPR, #EXPR) + +#if defined(NANOARROW_DEBUG) && !defined(NANOARROW_PRINT_AND_DIE) +#define NANOARROW_PRINT_AND_DIE(VALUE, EXPR_STR) \ + do { \ + fprintf(stderr, "%s failed with errno %d\n* %s:%d\n", EXPR_STR, (int)(VALUE), \ + __FILE__, (int)__LINE__); \ + abort(); \ + } while (0) +#endif + +#if defined(NANOARROW_DEBUG) +#define _NANOARROW_ASSERT_OK_IMPL(NAME, EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) NANOARROW_PRINT_AND_DIE(NAME, EXPR_STR); \ + } while (0) + +/// \brief Assert that an expression's value is NANOARROW_OK +/// \ingroup nanoarrow-errors +/// +/// If nanoarrow was built in debug mode (i.e., defined(NANOARROW_DEBUG) is true), +/// print a message to stderr and abort. If nanoarrow was built in release mode, +/// this statement has no effect. You can customize fatal error behaviour +/// be defining the NANOARROW_PRINT_AND_DIE macro before including nanoarrow.h +/// This macro is provided as a convenience for users and is not used internally. +#define NANOARROW_ASSERT_OK(EXPR) \ + _NANOARROW_ASSERT_OK_IMPL(_NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR, #EXPR) +#else +#define NANOARROW_ASSERT_OK(EXPR) EXPR +#endif + +static char _ArrowIsLittleEndian(void) { + uint32_t check = 1; + char first_byte; + memcpy(&first_byte, &check, sizeof(char)); + return first_byte; +} + +/// \brief Arrow type enumerator +/// \ingroup nanoarrow-utils +/// +/// These names are intended to map to the corresponding arrow::Type::type +/// enumerator; however, the numeric values are specifically not equal +/// (i.e., do not rely on numeric comparison). +enum ArrowType { + NANOARROW_TYPE_UNINITIALIZED = 0, + NANOARROW_TYPE_NA = 1, + NANOARROW_TYPE_BOOL, + NANOARROW_TYPE_UINT8, + NANOARROW_TYPE_INT8, + NANOARROW_TYPE_UINT16, + NANOARROW_TYPE_INT16, + NANOARROW_TYPE_UINT32, + NANOARROW_TYPE_INT32, + NANOARROW_TYPE_UINT64, + NANOARROW_TYPE_INT64, + NANOARROW_TYPE_HALF_FLOAT, + NANOARROW_TYPE_FLOAT, + NANOARROW_TYPE_DOUBLE, + NANOARROW_TYPE_STRING, + NANOARROW_TYPE_BINARY, + NANOARROW_TYPE_FIXED_SIZE_BINARY, + NANOARROW_TYPE_DATE32, + NANOARROW_TYPE_DATE64, + NANOARROW_TYPE_TIMESTAMP, + NANOARROW_TYPE_TIME32, + NANOARROW_TYPE_TIME64, + NANOARROW_TYPE_INTERVAL_MONTHS, + NANOARROW_TYPE_INTERVAL_DAY_TIME, + NANOARROW_TYPE_DECIMAL128, + NANOARROW_TYPE_DECIMAL256, + NANOARROW_TYPE_LIST, + NANOARROW_TYPE_STRUCT, + NANOARROW_TYPE_SPARSE_UNION, + NANOARROW_TYPE_DENSE_UNION, + NANOARROW_TYPE_DICTIONARY, + NANOARROW_TYPE_MAP, + NANOARROW_TYPE_EXTENSION, + NANOARROW_TYPE_FIXED_SIZE_LIST, + NANOARROW_TYPE_DURATION, + NANOARROW_TYPE_LARGE_STRING, + NANOARROW_TYPE_LARGE_BINARY, + NANOARROW_TYPE_LARGE_LIST, + NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO +}; + +/// \brief Get a string value of an enum ArrowType value +/// \ingroup nanoarrow-utils +/// +/// Returns NULL for invalid values for type +static inline const char* ArrowTypeString(enum ArrowType type); + +static inline const char* ArrowTypeString(enum ArrowType type) { + switch (type) { + case NANOARROW_TYPE_NA: + return "na"; + case NANOARROW_TYPE_BOOL: + return "bool"; + case NANOARROW_TYPE_UINT8: + return "uint8"; + case NANOARROW_TYPE_INT8: + return "int8"; + case NANOARROW_TYPE_UINT16: + return "uint16"; + case NANOARROW_TYPE_INT16: + return "int16"; + case NANOARROW_TYPE_UINT32: + return "uint32"; + case NANOARROW_TYPE_INT32: + return "int32"; + case 
NANOARROW_TYPE_UINT64: + return "uint64"; + case NANOARROW_TYPE_INT64: + return "int64"; + case NANOARROW_TYPE_HALF_FLOAT: + return "half_float"; + case NANOARROW_TYPE_FLOAT: + return "float"; + case NANOARROW_TYPE_DOUBLE: + return "double"; + case NANOARROW_TYPE_STRING: + return "string"; + case NANOARROW_TYPE_BINARY: + return "binary"; + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + return "fixed_size_binary"; + case NANOARROW_TYPE_DATE32: + return "date32"; + case NANOARROW_TYPE_DATE64: + return "date64"; + case NANOARROW_TYPE_TIMESTAMP: + return "timestamp"; + case NANOARROW_TYPE_TIME32: + return "time32"; + case NANOARROW_TYPE_TIME64: + return "time64"; + case NANOARROW_TYPE_INTERVAL_MONTHS: + return "interval_months"; + case NANOARROW_TYPE_INTERVAL_DAY_TIME: + return "interval_day_time"; + case NANOARROW_TYPE_DECIMAL128: + return "decimal128"; + case NANOARROW_TYPE_DECIMAL256: + return "decimal256"; + case NANOARROW_TYPE_LIST: + return "list"; + case NANOARROW_TYPE_STRUCT: + return "struct"; + case NANOARROW_TYPE_SPARSE_UNION: + return "sparse_union"; + case NANOARROW_TYPE_DENSE_UNION: + return "dense_union"; + case NANOARROW_TYPE_DICTIONARY: + return "dictionary"; + case NANOARROW_TYPE_MAP: + return "map"; + case NANOARROW_TYPE_EXTENSION: + return "extension"; + case NANOARROW_TYPE_FIXED_SIZE_LIST: + return "fixed_size_list"; + case NANOARROW_TYPE_DURATION: + return "duration"; + case NANOARROW_TYPE_LARGE_STRING: + return "large_string"; + case NANOARROW_TYPE_LARGE_BINARY: + return "large_binary"; + case NANOARROW_TYPE_LARGE_LIST: + return "large_list"; + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: + return "interval_month_day_nano"; + default: + return NULL; + } +} + +/// \brief Arrow time unit enumerator +/// \ingroup nanoarrow-utils +/// +/// These names and values map to the corresponding arrow::TimeUnit::type +/// enumerator. +enum ArrowTimeUnit { + NANOARROW_TIME_UNIT_SECOND = 0, + NANOARROW_TIME_UNIT_MILLI = 1, + NANOARROW_TIME_UNIT_MICRO = 2, + NANOARROW_TIME_UNIT_NANO = 3 +}; + +/// \brief Validation level enumerator +/// \ingroup nanoarrow-array +enum ArrowValidationLevel { + /// \brief Do not validate buffer sizes or content. + NANOARROW_VALIDATION_LEVEL_NONE = 0, + + /// \brief Validate buffer sizes that depend on array length but do not validate buffer + /// sizes that depend on buffer data access. + NANOARROW_VALIDATION_LEVEL_MINIMAL = 1, + + /// \brief Validate all buffer sizes, including those that require buffer data access, + /// but do not perform any checks that are O(1) along the length of the buffers. + NANOARROW_VALIDATION_LEVEL_DEFAULT = 2, + + /// \brief Validate all buffer sizes and all buffer content. This is useful in the + /// context of untrusted input or input that may have been corrupted in transit. 
+ NANOARROW_VALIDATION_LEVEL_FULL = 3 +}; + +/// \brief Get a string value of an enum ArrowTimeUnit value +/// \ingroup nanoarrow-utils +/// +/// Returns NULL for invalid values for time_unit +static inline const char* ArrowTimeUnitString(enum ArrowTimeUnit time_unit); + +static inline const char* ArrowTimeUnitString(enum ArrowTimeUnit time_unit) { + switch (time_unit) { + case NANOARROW_TIME_UNIT_SECOND: + return "s"; + case NANOARROW_TIME_UNIT_MILLI: + return "ms"; + case NANOARROW_TIME_UNIT_MICRO: + return "us"; + case NANOARROW_TIME_UNIT_NANO: + return "ns"; + default: + return NULL; + } +} + +/// \brief Functional types of buffers as described in the Arrow Columnar Specification +/// \ingroup nanoarrow-array-view +enum ArrowBufferType { + NANOARROW_BUFFER_TYPE_NONE, + NANOARROW_BUFFER_TYPE_VALIDITY, + NANOARROW_BUFFER_TYPE_TYPE_ID, + NANOARROW_BUFFER_TYPE_UNION_OFFSET, + NANOARROW_BUFFER_TYPE_DATA_OFFSET, + NANOARROW_BUFFER_TYPE_DATA +}; + +/// \brief An non-owning view of a string +/// \ingroup nanoarrow-utils +struct ArrowStringView { + /// \brief A pointer to the start of the string + /// + /// If size_bytes is 0, this value may be NULL. + const char* data; + + /// \brief The size of the string in bytes, + /// + /// (Not including the null terminator.) + int64_t size_bytes; +}; + +/// \brief Return a view of a const C string +/// \ingroup nanoarrow-utils +static inline struct ArrowStringView ArrowCharView(const char* value); + +static inline struct ArrowStringView ArrowCharView(const char* value) { + struct ArrowStringView out; + + out.data = value; + if (value) { + out.size_bytes = (int64_t)strlen(value); + } else { + out.size_bytes = 0; + } + + return out; +} + +union ArrowBufferViewData { + const void* data; + const int8_t* as_int8; + const uint8_t* as_uint8; + const int16_t* as_int16; + const uint16_t* as_uint16; + const int32_t* as_int32; + const uint32_t* as_uint32; + const int64_t* as_int64; + const uint64_t* as_uint64; + const double* as_double; + const float* as_float; + const char* as_char; +}; + +/// \brief An non-owning view of a buffer +/// \ingroup nanoarrow-utils +struct ArrowBufferView { + /// \brief A pointer to the start of the buffer + /// + /// If size_bytes is 0, this value may be NULL. + union ArrowBufferViewData data; + + /// \brief The size of the buffer in bytes + int64_t size_bytes; +}; + +/// \brief Array buffer allocation and deallocation +/// \ingroup nanoarrow-buffer +/// +/// Container for allocate, reallocate, and free methods that can be used +/// to customize allocation and deallocation of buffers when constructing +/// an ArrowArray. +struct ArrowBufferAllocator { + /// \brief Reallocate a buffer or return NULL if it cannot be reallocated + uint8_t* (*reallocate)(struct ArrowBufferAllocator* allocator, uint8_t* ptr, + int64_t old_size, int64_t new_size); + + /// \brief Deallocate a buffer allocated by this allocator + void (*free)(struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t size); + + /// \brief Opaque data specific to the allocator + void* private_data; +}; + +/// \brief An owning mutable view of a buffer +/// \ingroup nanoarrow-buffer +struct ArrowBuffer { + /// \brief A pointer to the start of the buffer + /// + /// If capacity_bytes is 0, this value may be NULL. 
+ uint8_t* data; + + /// \brief The size of the buffer in bytes + int64_t size_bytes; + + /// \brief The capacity of the buffer in bytes + int64_t capacity_bytes; + + /// \brief The allocator that will be used to reallocate and/or free the buffer + struct ArrowBufferAllocator allocator; +}; + +/// \brief An owning mutable view of a bitmap +/// \ingroup nanoarrow-bitmap +struct ArrowBitmap { + /// \brief An ArrowBuffer to hold the allocated memory + struct ArrowBuffer buffer; + + /// \brief The number of bits that have been appended to the bitmap + int64_t size_bits; +}; + +/// \brief A description of an arrangement of buffers +/// \ingroup nanoarrow-utils +/// +/// Contains the minimum amount of information required to +/// calculate the size of each buffer in an ArrowArray knowing only +/// the length and offset of the array. +struct ArrowLayout { + /// \brief The function of each buffer + enum ArrowBufferType buffer_type[3]; + + /// \brief The data type of each buffer + enum ArrowType buffer_data_type[3]; + + /// \brief The size of an element each buffer or 0 if this size is variable or unknown + int64_t element_size_bits[3]; + + /// \brief The number of elements in the child array per element in this array for a + /// fixed-size list + int64_t child_size_elements; +}; + +/// \brief A non-owning view of an ArrowArray +/// \ingroup nanoarrow-array-view +/// +/// This data structure provides access to the values contained within +/// an ArrowArray with fields provided in a more readily-extractible +/// form. You can re-use an ArrowArrayView for multiple ArrowArrays +/// with the same storage type, use it to represent a hypothetical +/// ArrowArray that does not exist yet, or use it to validate the buffers +/// of a future ArrowArray. +struct ArrowArrayView { + /// \brief The underlying ArrowArray or NULL if it has not been set or + /// if the buffers in this ArrowArrayView are not backed by an ArrowArray. + struct ArrowArray* array; + + /// \brief The number of elements from the physical start of the buffers. + int64_t offset; + + /// \brief The number of elements in this view. + int64_t length; + + /// \brief A cached null count or -1 to indicate that this value is unknown. + int64_t null_count; + + /// \brief The type used to store values in this array + /// + /// This type represents only the minimum required information to + /// extract values from the array buffers (e.g., for a Date32 array, + /// this value will be NANOARROW_TYPE_INT32). For dictionary-encoded + /// arrays, this will be the index type. + enum ArrowType storage_type; + + /// \brief The buffer types, strides, and sizes of this Array's buffers + struct ArrowLayout layout; + + /// \brief This Array's buffers as ArrowBufferView objects + struct ArrowBufferView buffer_views[3]; + + /// \brief The number of children of this view + int64_t n_children; + + /// \brief Pointers to views of this array's children + struct ArrowArrayView** children; + + /// \brief Pointer to a view of this array's dictionary + struct ArrowArrayView* dictionary; + + /// \brief Union type id to child index mapping + /// + /// If storage_type is a union type, a 256-byte ArrowMalloc()ed buffer + /// such that child_index == union_type_id_map[type_id] and + /// type_id == union_type_id_map[128 + child_index]. This value may be + /// NULL in the case where child_id == type_id. + int8_t* union_type_id_map; +}; + +// Used as the private data member for ArrowArrays allocated here and accessed +// internally within inline ArrowArray* helpers. 
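`ArrowBuffer` and `ArrowBitmap` above are the two types this PR actually builds on: a bitmap is just an owned byte buffer plus a count of valid bits, and `BitmaskArray.size`/`BitmaskArray.nbytes` read `size_bits` and `buffer.size_bytes` directly. A dataclass restating the invariant the Cython code relies on (a model, not the real struct):

```python
from dataclasses import dataclass

@dataclass
class BitmapModel:
    """Model of struct ArrowBitmap: an owned byte buffer plus the number of
    valid bits. size_bytes tracks ceil(size_bits / 8) <= capacity_bytes."""
    data: bytearray
    size_bits: int

    @property
    def size_bytes(self) -> int:
        return (self.size_bits + 7) // 8

bm = BitmapModel(bytearray(2), size_bits=11)  # 11 bits occupy both bytes
assert bm.size_bytes == 2 == len(bm.data)
```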
+struct ArrowArrayPrivateData { + // Holder for the validity buffer (or first buffer for union types, which are + // the only type whose first buffer is not a valdiity buffer) + struct ArrowBitmap bitmap; + + // Holder for additional buffers as required + struct ArrowBuffer buffers[2]; + + // The array of pointers to buffers. This must be updated after a sequence + // of appends to synchronize its values with the actual buffer addresses + // (which may have ben reallocated uring that time) + const void* buffer_data[3]; + + // The storage data type, or NANOARROW_TYPE_UNINITIALIZED if unknown + enum ArrowType storage_type; + + // The buffer arrangement for the storage type + struct ArrowLayout layout; + + // Flag to indicate if there are non-sequence union type ids. + // In the future this could be replaced with a type id<->child mapping + // to support constructing unions in append mode where type_id != child_index + int8_t union_type_id_is_child_index; +}; + +/// \brief A representation of an interval. +/// \ingroup nanoarrow-utils +struct ArrowInterval { + /// \brief The type of interval being used + enum ArrowType type; + /// \brief The number of months represented by the interval + int32_t months; + /// \brief The number of days represented by the interval + int32_t days; + /// \brief The number of ms represented by the interval + int32_t ms; + /// \brief The number of ns represented by the interval + int64_t ns; +}; + +/// \brief Zero initialize an Interval with a given unit +/// \ingroup nanoarrow-utils +static inline void ArrowIntervalInit(struct ArrowInterval* interval, + enum ArrowType type) { + memset(interval, 0, sizeof(struct ArrowInterval)); + interval->type = type; +} + +/// \brief A representation of a fixed-precision decimal number +/// \ingroup nanoarrow-utils +/// +/// This structure should be initialized with ArrowDecimalInit() once and +/// values set using ArrowDecimalSetInt(), ArrowDecimalSetBytes128(), +/// or ArrowDecimalSetBytes256(). +struct ArrowDecimal { + /// \brief An array of 64-bit integers of n_words length defined in native-endian order + uint64_t words[4]; + + /// \brief The number of significant digits this decimal number can represent + int32_t precision; + + /// \brief The number of digits after the decimal point. This can be negative. + int32_t scale; + + /// \brief The number of words in the words array + int n_words; + + /// \brief Cached value used by the implementation + int high_word_index; + + /// \brief Cached value used by the implementation + int low_word_index; +}; + +/// \brief Initialize a decimal with a given set of type parameters +/// \ingroup nanoarrow-utils +static inline void ArrowDecimalInit(struct ArrowDecimal* decimal, int32_t bitwidth, + int32_t precision, int32_t scale) { + memset(decimal->words, 0, sizeof(decimal->words)); + decimal->precision = precision; + decimal->scale = scale; + decimal->n_words = bitwidth / 8 / sizeof(uint64_t); + + if (_ArrowIsLittleEndian()) { + decimal->low_word_index = 0; + decimal->high_word_index = decimal->n_words - 1; + } else { + decimal->low_word_index = decimal->n_words - 1; + decimal->high_word_index = 0; + } +} + +/// \brief Get a signed integer value of a sufficiently small ArrowDecimal +/// +/// This does not check if the decimal's precision sufficiently small to fit +/// within the signed 64-bit integer range (A precision less than or equal +/// to 18 is sufficiently small). 
+static inline int64_t ArrowDecimalGetIntUnsafe(struct ArrowDecimal* decimal) { + return (int64_t)decimal->words[decimal->low_word_index]; +} + +/// \brief Copy the bytes of this decimal into a sufficiently large buffer +/// \ingroup nanoarrow-utils +static inline void ArrowDecimalGetBytes(struct ArrowDecimal* decimal, uint8_t* out) { + memcpy(out, decimal->words, decimal->n_words * sizeof(uint64_t)); +} + +/// \brief Returns 1 if the value represented by decimal is >= 0 or -1 otherwise +/// \ingroup nanoarrow-utils +static inline int64_t ArrowDecimalSign(struct ArrowDecimal* decimal) { + return 1 | ((int64_t)(decimal->words[decimal->high_word_index]) >> 63); +} + +/// \brief Sets the integer value of this decimal +/// \ingroup nanoarrow-utils +static inline void ArrowDecimalSetInt(struct ArrowDecimal* decimal, int64_t value) { + if (value < 0) { + memset(decimal->words, 0xff, decimal->n_words * sizeof(uint64_t)); + } else { + memset(decimal->words, 0, decimal->n_words * sizeof(uint64_t)); + } + + decimal->words[decimal->low_word_index] = value; +} + +/// \brief Copy bytes from a buffer into this decimal +/// \ingroup nanoarrow-utils +static inline void ArrowDecimalSetBytes(struct ArrowDecimal* decimal, + const uint8_t* value) { + memcpy(decimal->words, value, decimal->n_words * sizeof(uint64_t)); +} + +#ifdef __cplusplus +} +#endif + +#endif +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef NANOARROW_H_INCLUDED +#define NANOARROW_H_INCLUDED + +#include +#include +#include + + + +// If using CMake, optionally pass -DNANOARROW_NAMESPACE=MyNamespace which will set this +// define in nanoarrow_config.h. If not, you can optionally #define NANOARROW_NAMESPACE +// MyNamespace here. + +// This section remaps the non-prefixed symbols to the prefixed symbols so that +// code written against this build can be used independent of the value of +// NANOARROW_NAMESPACE. 
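The decimal helpers above lean on two's-complement idioms: `ArrowDecimalSetInt` sign-extends by `memset`-ing the words to `0xff` for negatives, and `ArrowDecimalSign` computes `1 | (high_word >> 63)`, an arithmetic shift that smears the sign bit so the result is exactly +1 or -1. A Python check of the sign trick, emulating int64 arithmetic shift with Python's unbounded ints (illustrative only):

```python
def decimal_sign(high_word: int) -> int:
    """Model of ArrowDecimalSign for a single unsigned 64-bit high word."""
    # Reinterpret the unsigned word as a signed int64, then arithmetic-shift
    as_i64 = high_word - (1 << 64) if high_word >= (1 << 63) else high_word
    return 1 | (as_i64 >> 63)

assert decimal_sign(0x0000000000000001) == 1    # positive decimal
assert decimal_sign(0xFFFFFFFFFFFFFFFF) == -1   # negative decimal
```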
+#ifdef NANOARROW_NAMESPACE +#define NANOARROW_CAT(A, B) A##B +#define NANOARROW_SYMBOL(A, B) NANOARROW_CAT(A, B) + +#define ArrowNanoarrowVersion NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowNanoarrowVersion) +#define ArrowNanoarrowVersionInt \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowNanoarrowVersionInt) +#define ArrowErrorMessage NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowErrorMessage) +#define ArrowMalloc NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMalloc) +#define ArrowRealloc NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowRealloc) +#define ArrowFree NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowFree) +#define ArrowBufferAllocatorDefault \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBufferAllocatorDefault) +#define ArrowBufferDeallocator \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBufferDeallocator) +#define ArrowErrorSet NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowErrorSet) +#define ArrowLayoutInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowLayoutInit) +#define ArrowSchemaInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaInit) +#define ArrowSchemaInitFromType \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaInitFromType) +#define ArrowSchemaSetType NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetType) +#define ArrowSchemaSetTypeStruct \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeStruct) +#define ArrowSchemaSetTypeFixedSize \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeFixedSize) +#define ArrowSchemaSetTypeDecimal \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDecimal) +#define ArrowSchemaSetTypeDateTime \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDateTime) +#define ArrowSchemaSetTypeUnion \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeUnion) +#define ArrowSchemaDeepCopy NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaDeepCopy) +#define ArrowSchemaSetFormat NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetFormat) +#define ArrowSchemaSetName NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetName) +#define ArrowSchemaSetMetadata \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetMetadata) +#define ArrowSchemaAllocateChildren \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaAllocateChildren) +#define ArrowSchemaAllocateDictionary \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaAllocateDictionary) +#define ArrowMetadataReaderInit \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataReaderInit) +#define ArrowMetadataReaderRead \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataReaderRead) +#define ArrowMetadataSizeOf NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataSizeOf) +#define ArrowMetadataHasKey NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataHasKey) +#define ArrowMetadataGetValue NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataGetValue) +#define ArrowMetadataBuilderInit \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderInit) +#define ArrowMetadataBuilderAppend \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderAppend) +#define ArrowMetadataBuilderSet \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderSet) +#define ArrowMetadataBuilderRemove \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderRemove) +#define ArrowSchemaViewInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaViewInit) +#define ArrowSchemaToString NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaToString) +#define ArrowArrayInitFromType \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromType) +#define ArrowArrayInitFromSchema \ + 
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromSchema)
+#define ArrowArrayInitFromArrayView \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromArrayView)
+#define ArrowArrayAllocateChildren \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayAllocateChildren)
+#define ArrowArrayAllocateDictionary \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayAllocateDictionary)
+#define ArrowArraySetValidityBitmap \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArraySetValidityBitmap)
+#define ArrowArraySetBuffer NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArraySetBuffer)
+#define ArrowArrayReserve NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayReserve)
+#define ArrowArrayFinishBuilding \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayFinishBuilding)
+#define ArrowArrayFinishBuildingDefault \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayFinishBuildingDefault)
+#define ArrowArrayViewInitFromType \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewInitFromType)
+#define ArrowArrayViewInitFromSchema \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewInitFromSchema)
+#define ArrowArrayViewAllocateChildren \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewAllocateChildren)
+#define ArrowArrayViewAllocateDictionary \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewAllocateDictionary)
+#define ArrowArrayViewSetLength \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetLength)
+#define ArrowArrayViewSetArray \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetArray)
+#define ArrowArrayViewSetArrayMinimal \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetArrayMinimal)
+#define ArrowArrayViewValidate \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewValidate)
+#define ArrowArrayViewReset NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewReset)
+#define ArrowBasicArrayStreamInit \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamInit)
+#define ArrowBasicArrayStreamSetArray \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamSetArray)
+#define ArrowBasicArrayStreamValidate \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamValidate)
+
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// \defgroup nanoarrow Nanoarrow C library
+///
+/// Except where noted, objects are not thread-safe and clients should
+/// take care to serialize accesses to methods.
+///
+/// Because this library is intended to be vendored, it provides full type
+/// definitions and encourages clients to stack or statically allocate
+/// where convenient.
+
+/// \defgroup nanoarrow-malloc Memory management
+///
+/// Non-buffer members of a struct ArrowSchema and struct ArrowArray
+/// must be allocated using ArrowMalloc() or ArrowRealloc() and freed
+/// using ArrowFree() for schemas and arrays allocated here. Buffer members
+/// are allocated using an ArrowBufferAllocator.
+///
+/// @{
+
+/// \brief Allocate like malloc()
+void* ArrowMalloc(int64_t size);
+
+/// \brief Reallocate like realloc()
+void* ArrowRealloc(void* ptr, int64_t size);
+
+/// \brief Free a pointer allocated using ArrowMalloc() or ArrowRealloc().
+void ArrowFree(void* ptr);
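+
+/// \par Example
+/// An editor's sketch of the allocation contract above; it is not part of
+/// upstream nanoarrow, and memset assumes <string.h>.
+/// \code
+/// uint8_t* data = (uint8_t*)ArrowMalloc(64);
+/// if (data != NULL) {
+///   memset(data, 0, 64);
+///   ArrowFree(data);
+/// }
+/// \endcode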
+
+/// \brief Return the default allocator
+///
+/// The default allocator uses ArrowMalloc(), ArrowRealloc(), and
+/// ArrowFree().
+struct ArrowBufferAllocator ArrowBufferAllocatorDefault(void);
+
+/// \brief Create a custom deallocator
+///
+/// Creates a buffer allocator with only a free method that can be used to
+/// attach a custom deallocator to an ArrowBuffer. This may be used to
+/// avoid copying an existing buffer that was not allocated using the
+/// infrastructure provided here (e.g., by an R or Python object).
+struct ArrowBufferAllocator ArrowBufferDeallocator(
+    void (*custom_free)(struct ArrowBufferAllocator* allocator, uint8_t* ptr,
+                        int64_t size),
+    void* private_data);
+
+/// @}
+
+/// \defgroup nanoarrow-errors Error handling
+///
+/// Functions generally return an errno-compatible error code; functions that
+/// need to communicate more verbose error information accept a pointer
+/// to an ArrowError. This can be stack or statically allocated. The
+/// content of the message is undefined unless an error code has been
+/// returned. If a nanoarrow function is passed a non-null ArrowError pointer, the
+/// ArrowError pointed to by the argument will be populated with a
+/// null-terminated error message. It is safe to pass a NULL ArrowError anywhere
+/// in the nanoarrow API.
+///
+/// Except where documented, it is generally not safe to continue after a
+/// function has returned a non-zero ArrowErrorCode. The NANOARROW_RETURN_NOT_OK and
+/// NANOARROW_ASSERT_OK macros are provided to help propagate errors. C++ clients can use
+/// the helpers provided in the nanoarrow.hpp header to facilitate using C++ idioms
+/// for memory management and error propagation.
+///
+/// @{
+
+/// \brief Error type containing a UTF-8 encoded message.
+struct ArrowError {
+  /// \brief A character buffer with space for an error message.
+  char message[1024];
+};
+
+/// \brief Ensure an ArrowError is null-terminated by zeroing the first character.
+///
+/// If error is NULL, this function does nothing.
+static inline void ArrowErrorInit(struct ArrowError* error) {
+  if (error) {
+    error->message[0] = '\0';
+  }
+}
+
+/// \brief Set the contents of an error using printf syntax.
+///
+/// If error is NULL, this function does nothing and returns NANOARROW_OK.
+ArrowErrorCode ArrowErrorSet(struct ArrowError* error, const char* fmt, ...);
+
+/// \brief Get the contents of an error
+///
+/// If error is NULL, returns "", or returns the contents of the error message
+/// otherwise.
+const char* ArrowErrorMessage(struct ArrowError* error);
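+
+/// \par Example
+/// An editor's sketch of the error-handling contract described above; it is
+/// not part of upstream nanoarrow, and printf assumes <stdio.h>.
+/// \code
+/// struct ArrowError error;
+/// ArrowErrorInit(&error);
+/// ArrowErrorSet(&error, "unexpected value: %d", 42);
+/// printf("%s\n", ArrowErrorMessage(&error));  // prints "unexpected value: 42"
+/// \endcode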
+
+/// @}
+
+/// \defgroup nanoarrow-utils Utility data structures
+///
+/// @{
+
+/// \brief Return a version string in the form "major.minor.patch"
+const char* ArrowNanoarrowVersion(void);
+
+/// \brief Return an integer that can be used to compare versions sequentially
+int ArrowNanoarrowVersionInt(void);
+
+/// \brief Initialize a description of buffer arrangements from a storage type
+void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type);
+
+/// \brief Create a string view from a null-terminated string
+static inline struct ArrowStringView ArrowCharView(const char* value);
+
+/// @}
+
+/// \defgroup nanoarrow-schema Creating schemas
+///
+/// These functions allocate, copy, and destroy ArrowSchema structures
+///
+/// @{
+
+/// \brief Initialize an ArrowSchema
+///
+/// Initializes the fields and release callback of schema_out. Caller
+/// is responsible for calling the schema->release callback if
+/// NANOARROW_OK is returned.
+void ArrowSchemaInit(struct ArrowSchema* schema);
+
+/// \brief Initialize an ArrowSchema from an ArrowType
+///
+/// A convenience constructor that calls ArrowSchemaInit() and
+/// ArrowSchemaSetType() for the common case of constructing an
+/// unparameterized type. The caller is responsible for calling the schema->release
+/// callback if NANOARROW_OK is returned.
+ArrowErrorCode ArrowSchemaInitFromType(struct ArrowSchema* schema, enum ArrowType type);
+
+/// \brief Get a human-readable summary of a Schema
+///
+/// Writes a summary of an ArrowSchema to out (up to n - 1 characters)
+/// and returns the number of characters required for the output if
+/// n were sufficiently large. If recursive is non-zero, the result will
+/// also include children.
+int64_t ArrowSchemaToString(struct ArrowSchema* schema, char* out, int64_t n,
+                            char recursive);
+
+/// \brief Set the format field of a schema from an ArrowType
+///
+/// Initializes the fields and release callback of schema_out. For
+/// NANOARROW_TYPE_LIST, NANOARROW_TYPE_LARGE_LIST, and
+/// NANOARROW_TYPE_MAP, the appropriate number of children are
+/// allocated, initialized, and named; however, the caller must call
+/// ArrowSchemaSetType() on the preinitialized children. Schema must have been initialized
+/// using ArrowSchemaInit() or ArrowSchemaDeepCopy().
+ArrowErrorCode ArrowSchemaSetType(struct ArrowSchema* schema, enum ArrowType type);
+
+/// \brief Set the format field and initialize children of a struct schema
+///
+/// The specified number of children are initialized; however, the caller is responsible
+/// for calling ArrowSchemaSetType() and ArrowSchemaSetName() on each child.
+/// Schema must have been initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy().
+ArrowErrorCode ArrowSchemaSetTypeStruct(struct ArrowSchema* schema, int64_t n_children);
+
+/// \brief Set the format field of a fixed-size schema
+///
+/// Returns EINVAL for fixed_size <= 0 or for type that is not
+/// NANOARROW_TYPE_FIXED_SIZE_BINARY or NANOARROW_TYPE_FIXED_SIZE_LIST.
+/// For NANOARROW_TYPE_FIXED_SIZE_LIST, the appropriate number of children are
+/// allocated, initialized, and named; however, the caller must call
+/// ArrowSchemaSetType() on the first child. Schema must have been initialized using
+/// ArrowSchemaInit() or ArrowSchemaDeepCopy().
+ArrowErrorCode ArrowSchemaSetTypeFixedSize(struct ArrowSchema* schema,
+                                           enum ArrowType type, int32_t fixed_size);
+
+/// \brief Set the format field of a decimal schema
+///
+/// Returns EINVAL for scale <= 0 or for type that is not
+/// NANOARROW_TYPE_DECIMAL128 or NANOARROW_TYPE_DECIMAL256. Schema must have been
+/// initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy().
+ArrowErrorCode ArrowSchemaSetTypeDecimal(struct ArrowSchema* schema, enum ArrowType type,
+                                         int32_t decimal_precision,
+                                         int32_t decimal_scale);
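+
+/// \par Example
+/// A minimal sketch, added during editing and not part of upstream nanoarrow,
+/// of building a decimal128 schema (precision 19, scale 4).
+/// \code
+/// struct ArrowSchema schema;
+/// ArrowSchemaInit(&schema);
+/// if (ArrowSchemaSetTypeDecimal(&schema, NANOARROW_TYPE_DECIMAL128, 19, 4) ==
+///     NANOARROW_OK) {
+///   // use the schema
+/// }
+/// schema.release(&schema);
+/// \endcode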
+
+/// \brief Set the format field of a time, timestamp, or duration schema
+///
+/// Returns EINVAL for type that is not
+/// NANOARROW_TYPE_TIME32, NANOARROW_TYPE_TIME64,
+/// NANOARROW_TYPE_TIMESTAMP, or NANOARROW_TYPE_DURATION. The
+/// timezone parameter must be NULL for a non-timestamp type. Schema must have been
+/// initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy().
+ArrowErrorCode ArrowSchemaSetTypeDateTime(struct ArrowSchema* schema, enum ArrowType type,
+                                          enum ArrowTimeUnit time_unit,
+                                          const char* timezone);
+
+/// \brief Set the format field of a union schema
+///
+/// Returns EINVAL for a type that is not NANOARROW_TYPE_DENSE_UNION
+/// or NANOARROW_TYPE_SPARSE_UNION. The specified number of children are
+/// allocated and initialized.
+ArrowErrorCode ArrowSchemaSetTypeUnion(struct ArrowSchema* schema, enum ArrowType type,
+                                       int64_t n_children);
+
+/// \brief Make a (recursive) copy of a schema
+///
+/// Allocates and copies fields of schema into schema_out.
+ArrowErrorCode ArrowSchemaDeepCopy(struct ArrowSchema* schema,
+                                   struct ArrowSchema* schema_out);
+
+/// \brief Copy format into schema->format
+///
+/// schema must have been allocated using ArrowSchemaInitFromType() or
+/// ArrowSchemaDeepCopy().
+ArrowErrorCode ArrowSchemaSetFormat(struct ArrowSchema* schema, const char* format);
+
+/// \brief Copy name into schema->name
+///
+/// schema must have been allocated using ArrowSchemaInitFromType() or
+/// ArrowSchemaDeepCopy().
+ArrowErrorCode ArrowSchemaSetName(struct ArrowSchema* schema, const char* name);
+
+/// \brief Copy metadata into schema->metadata
+///
+/// schema must have been allocated using ArrowSchemaInitFromType() or
+/// ArrowSchemaDeepCopy().
+ArrowErrorCode ArrowSchemaSetMetadata(struct ArrowSchema* schema, const char* metadata);
+
+/// \brief Allocate the schema->children array
+///
+/// Includes the memory for each child struct ArrowSchema.
+/// schema must have been allocated using ArrowSchemaInitFromType() or
+/// ArrowSchemaDeepCopy().
+ArrowErrorCode ArrowSchemaAllocateChildren(struct ArrowSchema* schema,
+                                           int64_t n_children);
+
+/// \brief Allocate the schema->dictionary member
+///
+/// schema must have been allocated using ArrowSchemaInitFromType() or
+/// ArrowSchemaDeepCopy().
+ArrowErrorCode ArrowSchemaAllocateDictionary(struct ArrowSchema* schema);
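+
+/// \par Example
+/// An editor's sketch, not part of upstream nanoarrow, of a one-field struct
+/// schema; error codes are ignored here for brevity.
+/// \code
+/// struct ArrowSchema schema;
+/// ArrowSchemaInit(&schema);
+/// ArrowSchemaSetTypeStruct(&schema, 1);  // allocates and initializes one child
+/// ArrowSchemaSetType(schema.children[0], NANOARROW_TYPE_INT32);
+/// ArrowSchemaSetName(schema.children[0], "x");
+/// schema.release(&schema);
+/// \endcode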
+
+/// @}
+
+/// \defgroup nanoarrow-metadata Create, read, and modify schema metadata
+///
+/// @{
+
+/// \brief Reader for key/value pairs in schema metadata
+///
+/// The ArrowMetadataReader does not own any data and is only valid
+/// for the lifetime of the underlying metadata pointer.
+struct ArrowMetadataReader {
+  /// \brief A metadata string from a schema->metadata field.
+  const char* metadata;
+
+  /// \brief The current offset into the metadata string
+  int64_t offset;
+
+  /// \brief The number of remaining keys
+  int32_t remaining_keys;
+};
+
+/// \brief Initialize an ArrowMetadataReader
+ArrowErrorCode ArrowMetadataReaderInit(struct ArrowMetadataReader* reader,
+                                       const char* metadata);
+
+/// \brief Read the next key/value pair from an ArrowMetadataReader
+ArrowErrorCode ArrowMetadataReaderRead(struct ArrowMetadataReader* reader,
+                                       struct ArrowStringView* key_out,
+                                       struct ArrowStringView* value_out);
+
+/// \brief The number of bytes in a key/value metadata string
+int64_t ArrowMetadataSizeOf(const char* metadata);
+
+/// \brief Check for a key in schema metadata
+char ArrowMetadataHasKey(const char* metadata, struct ArrowStringView key);
+
+/// \brief Extract a value from schema metadata
+///
+/// If key does not exist in metadata, value_out is unmodified
+ArrowErrorCode ArrowMetadataGetValue(const char* metadata, struct ArrowStringView key,
+                                     struct ArrowStringView* value_out);
+
+/// \brief Initialize a builder for schema metadata from key/value pairs
+///
+/// metadata can be an existing metadata string or NULL to initialize
+/// an empty metadata string.
+ArrowErrorCode ArrowMetadataBuilderInit(struct ArrowBuffer* buffer, const char* metadata);
+
+/// \brief Append a key/value pair to a buffer containing serialized metadata
+ArrowErrorCode ArrowMetadataBuilderAppend(struct ArrowBuffer* buffer,
+                                          struct ArrowStringView key,
+                                          struct ArrowStringView value);
+
+/// \brief Set a key/value pair to a buffer containing serialized metadata
+///
+/// Ensures that the only entry for key in the metadata is set to value.
+/// This function maintains the existing position of (the first instance of)
+/// key if present in the data.
+ArrowErrorCode ArrowMetadataBuilderSet(struct ArrowBuffer* buffer,
+                                       struct ArrowStringView key,
+                                       struct ArrowStringView value);
+
+/// \brief Remove a key from a buffer containing serialized metadata
+ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer,
+                                          struct ArrowStringView key);
+
+/// @}
+
+/// \defgroup nanoarrow-schema-view Reading schemas
+///
+/// @{
+
+/// \brief A non-owning view of a parsed ArrowSchema
+///
+/// Contains more readily extractable values than a raw ArrowSchema.
+/// Clients can stack or statically allocate this structure but are
+/// encouraged to use the provided getters to ensure forward
+/// compatibility.
+struct ArrowSchemaView {
+  /// \brief A pointer to the schema represented by this view
+  struct ArrowSchema* schema;
+
+  /// \brief The data type represented by the schema
+  ///
+  /// This value may be NANOARROW_TYPE_DICTIONARY if the schema has a
+  /// non-null dictionary member; datetime types are valid values.
+  /// This value will never be NANOARROW_TYPE_EXTENSION (see
+  /// extension_name and/or extension_metadata to check for
+  /// an extension type).
+  enum ArrowType type;
+
+  /// \brief The storage data type represented by the schema
+  ///
+  /// This value will never be NANOARROW_TYPE_DICTIONARY, NANOARROW_TYPE_EXTENSION
+  /// or any datetime type. This value represents only the type required to
+  /// interpret the buffers in the array.
+  enum ArrowType storage_type;
+
+  /// \brief The storage layout represented by the schema
+  struct ArrowLayout layout;
+
+  /// \brief The extension type name if it exists
+  ///
+  /// If the ARROW:extension:name key is present in schema.metadata,
+  /// extension_name.data will be non-NULL.
+ struct ArrowStringView extension_name; + + /// \brief The extension type metadata if it exists + /// + /// If the ARROW:extension:metadata key is present in schema.metadata, + /// extension_metadata.data will be non-NULL. + struct ArrowStringView extension_metadata; + + /// \brief Format fixed size parameter + /// + /// This value is set when parsing a fixed-size binary or fixed-size + /// list schema; this value is undefined for other types. For a + /// fixed-size binary schema this value is in bytes; for a fixed-size + /// list schema this value refers to the number of child elements for + /// each element of the parent. + int32_t fixed_size; + + /// \brief Decimal bitwidth + /// + /// This value is set when parsing a decimal type schema; + /// this value is undefined for other types. + int32_t decimal_bitwidth; + + /// \brief Decimal precision + /// + /// This value is set when parsing a decimal type schema; + /// this value is undefined for other types. + int32_t decimal_precision; + + /// \brief Decimal scale + /// + /// This value is set when parsing a decimal type schema; + /// this value is undefined for other types. + int32_t decimal_scale; + + /// \brief Format time unit parameter + /// + /// This value is set when parsing a date/time type. The value is + /// undefined for other types. + enum ArrowTimeUnit time_unit; + + /// \brief Format timezone parameter + /// + /// This value is set when parsing a timestamp type and represents + /// the timezone format parameter. This value points to + /// data within the schema and is undefined for other types. + const char* timezone; + + /// \brief Union type ids parameter + /// + /// This value is set when parsing a union type and represents + /// type ids parameter. This value points to + /// data within the schema and is undefined for other types. + const char* union_type_ids; +}; + +/// \brief Initialize an ArrowSchemaView +ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view, + struct ArrowSchema* schema, struct ArrowError* error); + +/// @} + +/// \defgroup nanoarrow-buffer Owning, growable buffers +/// +/// @{ + +/// \brief Initialize an ArrowBuffer +/// +/// Initialize a buffer with a NULL, zero-size buffer using the default +/// buffer allocator. +static inline void ArrowBufferInit(struct ArrowBuffer* buffer); + +/// \brief Set a newly-initialized buffer's allocator +/// +/// Returns EINVAL if the buffer has already been allocated. +static inline ArrowErrorCode ArrowBufferSetAllocator( + struct ArrowBuffer* buffer, struct ArrowBufferAllocator allocator); + +/// \brief Reset an ArrowBuffer +/// +/// Releases the buffer using the allocator's free method if +/// the buffer's data member is non-null, sets the data member +/// to NULL, and sets the buffer's size and capacity to 0. +static inline void ArrowBufferReset(struct ArrowBuffer* buffer); + +/// \brief Move an ArrowBuffer +/// +/// Transfers the buffer data and lifecycle management to another +/// address and resets buffer. +static inline void ArrowBufferMove(struct ArrowBuffer* src, struct ArrowBuffer* dst); + +/// \brief Grow or shrink a buffer to a given capacity +/// +/// When shrinking the capacity of the buffer, the buffer is only reallocated +/// if shrink_to_fit is non-zero. Calling ArrowBufferResize() does not +/// adjust the buffer's size member except to ensure that the invariant +/// capacity >= size remains true. 
+static inline ArrowErrorCode ArrowBufferResize(struct ArrowBuffer* buffer,
+                                               int64_t new_capacity_bytes,
+                                               char shrink_to_fit);
+
+/// \brief Ensure a buffer has at least a given additional capacity
+///
+/// Ensures that the buffer has space to append at least
+/// additional_size_bytes, overallocating when required.
+static inline ArrowErrorCode ArrowBufferReserve(struct ArrowBuffer* buffer,
+                                                int64_t additional_size_bytes);
+
+/// \brief Write data to buffer and increment the buffer size
+///
+/// This function does not check that buffer has the required capacity
+static inline void ArrowBufferAppendUnsafe(struct ArrowBuffer* buffer, const void* data,
+                                           int64_t size_bytes);
+
+/// \brief Write data to buffer and increment the buffer size
+///
+/// This function writes and ensures that the buffer has the required capacity,
+/// possibly by reallocating the buffer. Like ArrowBufferReserve, this will
+/// overallocate when reallocation is required.
+static inline ArrowErrorCode ArrowBufferAppend(struct ArrowBuffer* buffer,
+                                               const void* data, int64_t size_bytes);
+
+/// \brief Write fill to buffer and increment the buffer size
+///
+/// This function writes the specified number of fill bytes and
+/// ensures that the buffer has the required capacity.
+static inline ArrowErrorCode ArrowBufferAppendFill(struct ArrowBuffer* buffer,
+                                                   uint8_t value, int64_t size_bytes);
+
+/// \brief Write an 8-bit integer to a buffer
+static inline ArrowErrorCode ArrowBufferAppendInt8(struct ArrowBuffer* buffer,
+                                                   int8_t value);
+
+/// \brief Write an unsigned 8-bit integer to a buffer
+static inline ArrowErrorCode ArrowBufferAppendUInt8(struct ArrowBuffer* buffer,
+                                                    uint8_t value);
+
+/// \brief Write a 16-bit integer to a buffer
+static inline ArrowErrorCode ArrowBufferAppendInt16(struct ArrowBuffer* buffer,
+                                                    int16_t value);
+
+/// \brief Write an unsigned 16-bit integer to a buffer
+static inline ArrowErrorCode ArrowBufferAppendUInt16(struct ArrowBuffer* buffer,
+                                                     uint16_t value);
+
+/// \brief Write a 32-bit integer to a buffer
+static inline ArrowErrorCode ArrowBufferAppendInt32(struct ArrowBuffer* buffer,
+                                                    int32_t value);
+
+/// \brief Write an unsigned 32-bit integer to a buffer
+static inline ArrowErrorCode ArrowBufferAppendUInt32(struct ArrowBuffer* buffer,
+                                                     uint32_t value);
+
+/// \brief Write a 64-bit integer to a buffer
+static inline ArrowErrorCode ArrowBufferAppendInt64(struct ArrowBuffer* buffer,
+                                                    int64_t value);
+
+/// \brief Write an unsigned 64-bit integer to a buffer
+static inline ArrowErrorCode ArrowBufferAppendUInt64(struct ArrowBuffer* buffer,
+                                                     uint64_t value);
+
+/// \brief Write a double to a buffer
+static inline ArrowErrorCode ArrowBufferAppendDouble(struct ArrowBuffer* buffer,
+                                                     double value);
+
+/// \brief Write a float to a buffer
+static inline ArrowErrorCode ArrowBufferAppendFloat(struct ArrowBuffer* buffer,
+                                                    float value);
+
+/// \brief Write an ArrowStringView to a buffer
+static inline ArrowErrorCode ArrowBufferAppendStringView(struct ArrowBuffer* buffer,
+                                                         struct ArrowStringView value);
+
+/// \brief Write an ArrowBufferView to a buffer
+static inline ArrowErrorCode ArrowBufferAppendBufferView(struct ArrowBuffer* buffer,
+                                                         struct ArrowBufferView value);
+
+/// @}
+
+/// \defgroup nanoarrow-bitmap Bitmap utilities
+///
+/// @{
+
+/// \brief Extract a boolean value from a bitmap
+static inline int8_t ArrowBitGet(const uint8_t* bits, int64_t i);
+
+/// \brief Set a boolean value to a bitmap to true
+static inline void ArrowBitSet(uint8_t* bits, int64_t i);
+
+/// \brief 
Set a boolean value to a bitmap to false +static inline void ArrowBitClear(uint8_t* bits, int64_t i); + +/// \brief Set a boolean value to a bitmap +static inline void ArrowBitSetTo(uint8_t* bits, int64_t i, uint8_t value); + +/// \brief Set a boolean value to a range in a bitmap +static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t length, + uint8_t bits_are_set); + +/// \brief Count true values in a bitmap +static inline int64_t ArrowBitCountSet(const uint8_t* bits, int64_t i_from, int64_t i_to); + + +/// \brief Extract int8 boolean values from a range in a bitmap +static inline void ArrowBitsUnpackInt8(const uint8_t* bits, int64_t start_offset, + int64_t length, int8_t* out); + +/// \brief Initialize an ArrowBitmap +/// +/// Initialize the builder's buffer, empty its cache, and reset the size to zero +static inline void ArrowBitmapInit(struct ArrowBitmap* bitmap); + +/// \brief Move an ArrowBitmap +/// +/// Transfers the underlying buffer data and lifecycle management to another +/// address and resets the bitmap. +static inline void ArrowBitmapMove(struct ArrowBitmap* src, struct ArrowBitmap* dst); + +/// \brief Ensure a bitmap builder has at least a given additional capacity +/// +/// Ensures that the buffer has space to append at least +/// additional_size_bits, overallocating when required. +static inline ArrowErrorCode ArrowBitmapReserve(struct ArrowBitmap* bitmap, + int64_t additional_size_bits); + +/// \brief Grow or shrink a bitmap to a given capacity +/// +/// When shrinking the capacity of the bitmap, the bitmap is only reallocated +/// if shrink_to_fit is non-zero. Calling ArrowBitmapResize() does not +/// adjust the buffer's size member except when shrinking new_capacity_bits +/// to a value less than the current number of bits in the bitmap. +static inline ArrowErrorCode ArrowBitmapResize(struct ArrowBitmap* bitmap, + int64_t new_capacity_bits, + char shrink_to_fit); + +/// \brief Reserve space for and append zero or more of the same boolean value to a bitmap +static inline ArrowErrorCode ArrowBitmapAppend(struct ArrowBitmap* bitmap, + uint8_t bits_are_set, int64_t length); + +/// \brief Append zero or more of the same boolean value to a bitmap +static inline void ArrowBitmapAppendUnsafe(struct ArrowBitmap* bitmap, + uint8_t bits_are_set, int64_t length); + +/// \brief Append boolean values encoded as int8_t to a bitmap +/// +/// The values must all be 0 or 1. +static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap, + const int8_t* values, int64_t n_values); + +/// \brief Append boolean values encoded as int32_t to a bitmap +/// +/// The values must all be 0 or 1. +static inline void ArrowBitmapAppendInt32Unsafe(struct ArrowBitmap* bitmap, + const int32_t* values, int64_t n_values); + +/// \brief Reset a bitmap builder +/// +/// Releases any memory held by buffer, empties the cache, and resets the size to zero +static inline void ArrowBitmapReset(struct ArrowBitmap* bitmap); + +/// @} + +/// \defgroup nanoarrow-array Creating arrays +/// +/// These functions allocate, copy, and destroy ArrowArray structures. +/// Once an ArrowArray has been initialized via ArrowArrayInitFromType() +/// or ArrowArrayInitFromSchema(), the caller is responsible for releasing +/// it using the embedded release callback. +/// +/// @{ + +/// \brief Initialize the fields of an array +/// +/// Initializes the fields and release callback of array. Caller +/// is responsible for calling the array->release callback if +/// NANOARROW_OK is returned. 
+ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array,
+                                      enum ArrowType storage_type);
+
+/// \brief Initialize the contents of an ArrowArray from an ArrowSchema
+///
+/// Caller is responsible for calling the array->release callback if
+/// NANOARROW_OK is returned.
+ArrowErrorCode ArrowArrayInitFromSchema(struct ArrowArray* array,
+                                        struct ArrowSchema* schema,
+                                        struct ArrowError* error);
+
+/// \brief Initialize the contents of an ArrowArray from an ArrowArrayView
+///
+/// Caller is responsible for calling the array->release callback if
+/// NANOARROW_OK is returned.
+ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array,
+                                           struct ArrowArrayView* array_view,
+                                           struct ArrowError* error);
+
+/// \brief Allocate the array->children array
+///
+/// Includes the memory for each child struct ArrowArray,
+/// whose members are marked as released and may be subsequently initialized
+/// with ArrowArrayInitFromType() or moved from an existing ArrowArray.
+/// array must have been allocated using ArrowArrayInitFromType().
+ArrowErrorCode ArrowArrayAllocateChildren(struct ArrowArray* array, int64_t n_children);
+
+/// \brief Allocate the array->dictionary member
+///
+/// Includes the memory for the struct ArrowArray, whose contents
+/// are marked as released and may be subsequently initialized
+/// with ArrowArrayInitFromType() or moved from an existing ArrowArray.
+/// array must have been allocated using ArrowArrayInitFromType()
+ArrowErrorCode ArrowArrayAllocateDictionary(struct ArrowArray* array);
+
+/// \brief Set the validity bitmap of an ArrowArray
+///
+/// array must have been allocated using ArrowArrayInitFromType()
+void ArrowArraySetValidityBitmap(struct ArrowArray* array, struct ArrowBitmap* bitmap);
+
+/// \brief Set a buffer of an ArrowArray
+///
+/// array must have been allocated using ArrowArrayInitFromType()
+ArrowErrorCode ArrowArraySetBuffer(struct ArrowArray* array, int64_t i,
+                                   struct ArrowBuffer* buffer);
+
+/// \brief Get the validity bitmap of an ArrowArray
+///
+/// array must have been allocated using ArrowArrayInitFromType()
+static inline struct ArrowBitmap* ArrowArrayValidityBitmap(struct ArrowArray* array);
+
+/// \brief Get a buffer of an ArrowArray
+///
+/// array must have been allocated using ArrowArrayInitFromType()
+static inline struct ArrowBuffer* ArrowArrayBuffer(struct ArrowArray* array, int64_t i);
+
+/// \brief Start element-wise appending to an ArrowArray
+///
+/// Initializes any values needed to use ArrowArrayAppend*() functions.
+/// All element-wise appenders append by value and return EINVAL if the exact value
+/// cannot be represented by the underlying storage type.
+/// array must have been allocated using ArrowArrayInitFromType()
+static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array);
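+
+/// \par Example
+/// An editor's sketch, not part of upstream nanoarrow, of element-wise
+/// building; the append and finish functions used here are declared below,
+/// and error codes are ignored for brevity.
+/// \code
+/// struct ArrowArray array;
+/// ArrowArrayInitFromType(&array, NANOARROW_TYPE_INT64);
+/// ArrowArrayStartAppending(&array);
+/// ArrowArrayAppendInt(&array, 123);
+/// ArrowArrayAppendNull(&array, 1);
+/// ArrowArrayFinishBuildingDefault(&array, NULL);
+/// array.release(&array);
+/// \endcode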
+
+/// \brief Reserve space for future appends
+///
+/// For buffer sizes that can be calculated (i.e., not string data buffers or
+/// child array sizes for non-fixed-size arrays), recursively reserve space for
+/// additional elements. This is useful for reducing the number of reallocations
+/// that occur using the item-wise appenders.
+ArrowErrorCode ArrowArrayReserve(struct ArrowArray* array,
+                                 int64_t additional_size_elements);
+
+/// \brief Append a null value to an array
+static inline ArrowErrorCode ArrowArrayAppendNull(struct ArrowArray* array, int64_t n);
+
+/// \brief Append an empty, non-null value to an array
+static inline ArrowErrorCode ArrowArrayAppendEmpty(struct ArrowArray* array, int64_t n);
+
+/// \brief Append a signed integer value to an array
+///
+/// Returns NANOARROW_OK if value can be exactly represented by
+/// the underlying storage type or EINVAL otherwise (e.g., value
+/// is outside the valid array range).
+static inline ArrowErrorCode ArrowArrayAppendInt(struct ArrowArray* array, int64_t value);
+
+/// \brief Append an unsigned integer value to an array
+///
+/// Returns NANOARROW_OK if value can be exactly represented by
+/// the underlying storage type or EINVAL otherwise (e.g., value
+/// is outside the valid array range).
+static inline ArrowErrorCode ArrowArrayAppendUInt(struct ArrowArray* array,
+                                                  uint64_t value);
+
+/// \brief Append a double value to an array
+///
+/// Returns NANOARROW_OK if value can be exactly represented by
+/// the underlying storage type or EINVAL otherwise (e.g., value
+/// is outside the valid array range or there is an attempt to append
+/// a non-integer to an array with an integer storage type).
+static inline ArrowErrorCode ArrowArrayAppendDouble(struct ArrowArray* array,
+                                                    double value);
+
+/// \brief Append a string of bytes to an array
+///
+/// Returns NANOARROW_OK if value can be exactly represented by
+/// the underlying storage type or EINVAL otherwise (e.g.,
+/// the underlying array is not a binary, string, large binary, large string,
+/// or fixed-size binary array, or value is the wrong size for a fixed-size
+/// binary array).
+static inline ArrowErrorCode ArrowArrayAppendBytes(struct ArrowArray* array,
+                                                   struct ArrowBufferView value);
+
+/// \brief Append a string value to an array
+///
+/// Returns NANOARROW_OK if value can be exactly represented by
+/// the underlying storage type or EINVAL otherwise (e.g.,
+/// the underlying array is not a string or large string array).
+static inline ArrowErrorCode ArrowArrayAppendString(struct ArrowArray* array,
+                                                    struct ArrowStringView value);
+
+/// \brief Append an interval to an array
+///
+/// Returns NANOARROW_OK if value can be exactly represented by
+/// the underlying storage type or EINVAL otherwise.
+static inline ArrowErrorCode ArrowArrayAppendInterval(struct ArrowArray* array,
+                                                      struct ArrowInterval* value);
+
+/// \brief Append a decimal value to an array
+///
+/// Returns NANOARROW_OK if array is a decimal array with the appropriate
+/// bitwidth or EINVAL otherwise.
+static inline ArrowErrorCode ArrowArrayAppendDecimal(struct ArrowArray* array,
+                                                     struct ArrowDecimal* value);
+
+/// \brief Finish a nested array element
+///
+/// Appends a non-null element to the array based on the first child's current
+/// length. Returns NANOARROW_OK if the item was successfully added or EINVAL
+/// if the underlying storage type is not a struct, list, large list, or fixed-size
+/// list, or if there was an attempt to add a struct or fixed-size list element where the
+/// length of the child array(s) did not match the expected length.
+static inline ArrowErrorCode ArrowArrayFinishElement(struct ArrowArray* array);
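+
+/// \par Example
+/// An editor's sketch, not part of upstream nanoarrow, of appending string
+/// values with ArrowCharView() from the utilities section; error codes are
+/// ignored for brevity.
+/// \code
+/// struct ArrowArray array;
+/// ArrowArrayInitFromType(&array, NANOARROW_TYPE_STRING);
+/// ArrowArrayStartAppending(&array);
+/// ArrowArrayAppendString(&array, ArrowCharView("hello"));
+/// ArrowArrayAppendString(&array, ArrowCharView("world"));
+/// ArrowArrayFinishBuildingDefault(&array, NULL);
+/// array.release(&array);
+/// \endcode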
+
+/// \brief Finish a union array element
+///
+/// Appends an element to the union type ids buffer and increments array->length.
+/// For sparse unions, up to one element is added to non type-id children. Returns
+/// EINVAL if the underlying storage type is not a union, if type_id is not valid,
+/// or if child sizes after appending are inconsistent.
+static inline ArrowErrorCode ArrowArrayFinishUnionElement(struct ArrowArray* array,
+                                                          int8_t type_id);
+
+/// \brief Shrink buffer capacity to the size required
+///
+/// Also applies shrinking to any child arrays. array must have been allocated using
+/// ArrowArrayInitFromType
+static inline ArrowErrorCode ArrowArrayShrinkToFit(struct ArrowArray* array);
+
+/// \brief Finish building an ArrowArray
+///
+/// Flushes any pointers from internal buffers that may have been reallocated
+/// into array->buffers and checks the actual size of the buffers
+/// against the expected size based on the final length.
+/// array must have been allocated using ArrowArrayInitFromType()
+ArrowErrorCode ArrowArrayFinishBuildingDefault(struct ArrowArray* array,
+                                               struct ArrowError* error);
+
+/// \brief Finish building an ArrowArray with explicit validation
+///
+/// Finish building with an explicit validation level. This could perform less validation
+/// (i.e. NANOARROW_VALIDATION_LEVEL_NONE or NANOARROW_VALIDATION_LEVEL_MINIMAL) if CPU
+/// buffer data access is not possible or more validation (i.e.,
+/// NANOARROW_VALIDATION_LEVEL_FULL) if buffer content was obtained from an untrusted or
+/// corruptible source.
+ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array,
+                                        enum ArrowValidationLevel validation_level,
+                                        struct ArrowError* error);
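+
+/// \par Example
+/// An editor's sketch, not part of upstream nanoarrow, of finishing with full
+/// validation; `array` is assumed to be built as in the sketches above.
+/// \code
+/// struct ArrowError error;
+/// ArrowErrorInit(&error);
+/// if (ArrowArrayFinishBuilding(&array, NANOARROW_VALIDATION_LEVEL_FULL,
+///                              &error) != NANOARROW_OK) {
+///   // ArrowErrorMessage(&error) describes the failure
+/// }
+/// \endcode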
+
+/// @}
+
+/// \defgroup nanoarrow-array-view Reading arrays
+///
+/// These functions read and validate the contents of ArrowArray structures.
+///
+/// @{
+
+/// \brief Initialize the contents of an ArrowArrayView
+void ArrowArrayViewInitFromType(struct ArrowArrayView* array_view,
+                                enum ArrowType storage_type);
+
+/// \brief Move an ArrowArrayView
+///
+/// Transfers the ArrowArrayView data and lifecycle management to another
+/// address and resets the contents of src.
+static inline void ArrowArrayViewMove(struct ArrowArrayView* src,
+                                      struct ArrowArrayView* dst);
+
+/// \brief Initialize the contents of an ArrowArrayView from an ArrowSchema
+ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view,
+                                            struct ArrowSchema* schema,
+                                            struct ArrowError* error);
+
+/// \brief Allocate the array_view->children array
+///
+/// Includes the memory for each child struct ArrowArrayView
+ArrowErrorCode ArrowArrayViewAllocateChildren(struct ArrowArrayView* array_view,
+                                              int64_t n_children);
+
+/// \brief Allocate array_view->dictionary
+ArrowErrorCode ArrowArrayViewAllocateDictionary(struct ArrowArrayView* array_view);
+
+/// \brief Set data-independent buffer sizes from length
+void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length);
+
+/// \brief Set buffer sizes and data pointers from an ArrowArray
+ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view,
+                                      struct ArrowArray* array, struct ArrowError* error);
+
+/// \brief Set buffer sizes and data pointers from an ArrowArray except for those
+/// that require dereferencing buffer content.
+ArrowErrorCode ArrowArrayViewSetArrayMinimal(struct ArrowArrayView* array_view,
+                                             struct ArrowArray* array,
+                                             struct ArrowError* error);
+
+/// \brief Performs checks on the content of an ArrowArrayView
+///
+/// If using ArrowArrayViewSetArray() to back array_view with an ArrowArray,
+/// the buffer sizes and some content (first and last offset) have already
+/// been validated at the "default" level. If setting the buffer pointers
+/// and sizes otherwise, you may wish to perform checks at a different level. See
+/// documentation for ArrowValidationLevel for the details of checks performed
+/// at each level.
+ArrowErrorCode ArrowArrayViewValidate(struct ArrowArrayView* array_view,
+                                      enum ArrowValidationLevel validation_level,
+                                      struct ArrowError* error);
+
+/// \brief Reset the contents of an ArrowArrayView and frees resources
+void ArrowArrayViewReset(struct ArrowArrayView* array_view);
+
+/// \brief Check for a null element in an ArrowArrayView
+static inline int8_t ArrowArrayViewIsNull(struct ArrowArrayView* array_view, int64_t i);
+
+/// \brief Get the type id of a union array element
+static inline int8_t ArrowArrayViewUnionTypeId(struct ArrowArrayView* array_view,
+                                               int64_t i);
+
+/// \brief Get the child index of a union array element
+static inline int8_t ArrowArrayViewUnionChildIndex(struct ArrowArrayView* array_view,
+                                                   int64_t i);
+
+/// \brief Get the index to use into the relevant union child array
+static inline int64_t ArrowArrayViewUnionChildOffset(struct ArrowArrayView* array_view,
+                                                     int64_t i);
+
+/// \brief Get an element in an ArrowArrayView as an integer
+///
+/// This function does not check for null values, that values are actually integers, or
+/// that values are within a valid range for an int64.
+static inline int64_t ArrowArrayViewGetIntUnsafe(struct ArrowArrayView* array_view,
+                                                 int64_t i);
+
+/// \brief Get an element in an ArrowArrayView as an unsigned integer
+///
+/// This function does not check for null values, that values are actually integers, or
+/// that values are within a valid range for a uint64.
+static inline uint64_t ArrowArrayViewGetUIntUnsafe(struct ArrowArrayView* array_view,
+                                                   int64_t i);
+
+/// \brief Get an element in an ArrowArrayView as a double
+///
+/// This function does not check for null values, or
+/// that values are within a valid range for a double.
+static inline double ArrowArrayViewGetDoubleUnsafe(struct ArrowArrayView* array_view,
+                                                   int64_t i);
+
+/// \brief Get an element in an ArrowArrayView as an ArrowStringView
+///
+/// This function does not check for null values.
+static inline struct ArrowStringView ArrowArrayViewGetStringUnsafe(
+    struct ArrowArrayView* array_view, int64_t i);
+
+/// \brief Get an element in an ArrowArrayView as an ArrowBufferView
+///
+/// This function does not check for null values.
+static inline struct ArrowBufferView ArrowArrayViewGetBytesUnsafe(
+    struct ArrowArrayView* array_view, int64_t i);
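+
+/// \par Example
+/// An editor's sketch, not part of upstream nanoarrow, of reading int64
+/// values through a view; `array` is assumed to be a finished int64 array.
+/// \code
+/// struct ArrowArrayView view;
+/// ArrowArrayViewInitFromType(&view, NANOARROW_TYPE_INT64);
+/// if (ArrowArrayViewSetArray(&view, &array, NULL) == NANOARROW_OK) {
+///   for (int64_t i = 0; i < array.length; i++) {
+///     if (!ArrowArrayViewIsNull(&view, i)) {
+///       int64_t value = ArrowArrayViewGetIntUnsafe(&view, i);
+///       (void)value;  // use the value
+///     }
+///   }
+/// }
+/// ArrowArrayViewReset(&view);
+/// \endcode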
+
+/// \brief Get an element in an ArrowArrayView as an ArrowDecimal
+///
+/// This function does not check for null values. The out parameter must
+/// be initialized with ArrowDecimalInit() with the proper parameters for this
+/// type before calling this for the first time.
+static inline void ArrowArrayViewGetDecimalUnsafe(struct ArrowArrayView* array_view,
+                                                  int64_t i, struct ArrowDecimal* out);
+
+/// @}
+
+/// \defgroup nanoarrow-basic-array-stream Basic ArrowArrayStream implementation
+///
+/// An implementation of an ArrowArrayStream based on a collection of
+/// zero or more previously-existing ArrowArray objects. Users should
+/// initialize and/or validate the contents before transferring the
+/// responsibility of the ArrowArrayStream elsewhere.
+///
+/// @{
+
+/// \brief Initialize an ArrowArrayStream backed by this implementation
+///
+/// This function moves the ownership of schema to the array_stream. If
+/// this function returns NANOARROW_OK, the caller is responsible for
+/// releasing the ArrowArrayStream.
+ArrowErrorCode ArrowBasicArrayStreamInit(struct ArrowArrayStream* array_stream,
+                                         struct ArrowSchema* schema, int64_t n_arrays);
+
+/// \brief Set the ith ArrowArray in this ArrowArrayStream.
+///
+/// array_stream must have been initialized with ArrowBasicArrayStreamInit().
+/// This function moves the ownership of array to the array_stream. i must
+/// be greater than or equal to zero and less than the value of n_arrays passed in
+/// ArrowBasicArrayStreamInit(). Callers are not required to fill all
+/// n_arrays members (i.e., n_arrays is a maximum bound).
+void ArrowBasicArrayStreamSetArray(struct ArrowArrayStream* array_stream, int64_t i,
+                                   struct ArrowArray* array);
+
+/// \brief Validate the contents of this ArrowArrayStream
+///
+/// array_stream must have been initialized with ArrowBasicArrayStreamInit().
+/// This function uses ArrowArrayViewInitFromSchema() and ArrowArrayViewSetArray()
+/// to validate the contents of the arrays.
+ArrowErrorCode ArrowBasicArrayStreamValidate(struct ArrowArrayStream* array_stream,
+                                             struct ArrowError* error);
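+
+/// \par Example
+/// An editor's sketch, not part of upstream nanoarrow, of wrapping one array
+/// in a stream; `schema` and `array` are assumed initialized, ownership of
+/// both moves into the stream, and error codes are ignored for brevity.
+/// \code
+/// struct ArrowArrayStream stream;
+/// ArrowBasicArrayStreamInit(&stream, &schema, 1);
+/// ArrowBasicArrayStreamSetArray(&stream, 0, &array);
+/// struct ArrowArray chunk;
+/// stream.get_next(&stream, &chunk);
+/// if (chunk.release != NULL) chunk.release(&chunk);
+/// stream.release(&stream);
+/// \endcode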
+
+/// @}
+
+// Inline function definitions
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef NANOARROW_BUFFER_INLINE_H_INCLUDED
+#define NANOARROW_BUFFER_INLINE_H_INCLUDED
+
+#include <errno.h>
+#include <stdint.h>
+#include <string.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static inline int64_t _ArrowGrowByFactor(int64_t current_capacity, int64_t new_capacity) {
+  int64_t doubled_capacity = current_capacity * 2;
+  if (doubled_capacity > new_capacity) {
+    return doubled_capacity;
+  } else {
+    return new_capacity;
+  }
+}
+
+static inline void ArrowBufferInit(struct ArrowBuffer* buffer) {
+  buffer->data = NULL;
+  buffer->size_bytes = 0;
+  buffer->capacity_bytes = 0;
+  buffer->allocator = ArrowBufferAllocatorDefault();
+}
+
+static inline ArrowErrorCode ArrowBufferSetAllocator(
+    struct ArrowBuffer* buffer, struct ArrowBufferAllocator allocator) {
+  if (buffer->data == NULL) {
+    buffer->allocator = allocator;
+    return NANOARROW_OK;
+  } else {
+    return EINVAL;
+  }
+}
+
+static inline void ArrowBufferReset(struct ArrowBuffer* buffer) {
+  if (buffer->data != NULL) {
+    buffer->allocator.free(&buffer->allocator, (uint8_t*)buffer->data,
+                           buffer->capacity_bytes);
+    buffer->data = NULL;
+  }
+
+  buffer->capacity_bytes = 0;
+  buffer->size_bytes = 0;
+}
+
+static inline void ArrowBufferMove(struct ArrowBuffer* src, struct ArrowBuffer* dst) {
+  memcpy(dst, src, sizeof(struct ArrowBuffer));
+  src->data = NULL;
+  ArrowBufferReset(src);
+}
+
+static inline ArrowErrorCode ArrowBufferResize(struct ArrowBuffer* buffer,
+                                               int64_t new_capacity_bytes,
+                                               char shrink_to_fit) {
+  if (new_capacity_bytes < 0) {
+    return EINVAL;
+  }
+
+  if (new_capacity_bytes > buffer->capacity_bytes || shrink_to_fit) {
+    buffer->data = buffer->allocator.reallocate(
+        &buffer->allocator, buffer->data, buffer->capacity_bytes, new_capacity_bytes);
+    if (buffer->data == NULL && new_capacity_bytes > 0) {
+      buffer->capacity_bytes = 0;
+      buffer->size_bytes = 0;
+      return ENOMEM;
+    }
+
+    buffer->capacity_bytes = new_capacity_bytes;
+  }
+
+  // Ensures that size <= capacity when shrinking
+  if (new_capacity_bytes < buffer->size_bytes) {
+    buffer->size_bytes = new_capacity_bytes;
+  }
+
+  return NANOARROW_OK;
+}
+
+static inline ArrowErrorCode ArrowBufferReserve(struct ArrowBuffer* buffer,
+                                                int64_t additional_size_bytes) {
+  int64_t min_capacity_bytes = buffer->size_bytes + additional_size_bytes;
+  if (min_capacity_bytes <= buffer->capacity_bytes) {
+    return NANOARROW_OK;
+  }
+
+  return ArrowBufferResize(
+      buffer, _ArrowGrowByFactor(buffer->capacity_bytes, min_capacity_bytes), 0);
+}
+
+static inline void ArrowBufferAppendUnsafe(struct ArrowBuffer* buffer, const void* data,
+                                           int64_t size_bytes) {
+  if (size_bytes > 0) {
+    memcpy(buffer->data + buffer->size_bytes, data, size_bytes);
+    buffer->size_bytes += size_bytes;
+  }
+}
+
+static inline ArrowErrorCode ArrowBufferAppend(struct ArrowBuffer* buffer,
+                                               const void* data, int64_t size_bytes) {
+  NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes));
+
+  ArrowBufferAppendUnsafe(buffer, data, size_bytes);
+  return NANOARROW_OK;
+}
+
+static inline ArrowErrorCode ArrowBufferAppendInt8(struct ArrowBuffer* buffer,
+                                                   int8_t value) {
+  return ArrowBufferAppend(buffer, &value, sizeof(int8_t));
+}
+
+static inline ArrowErrorCode ArrowBufferAppendUInt8(struct ArrowBuffer* buffer,
+                                                    uint8_t value) {
+  return ArrowBufferAppend(buffer, &value, sizeof(uint8_t));
+}
+
+static inline ArrowErrorCode ArrowBufferAppendInt16(struct ArrowBuffer* buffer,
+                                                    int16_t value) {
+  return ArrowBufferAppend(buffer, &value, sizeof(int16_t));
+}
+
+static inline ArrowErrorCode ArrowBufferAppendUInt16(struct
ArrowBuffer* buffer, + uint16_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(uint16_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendInt32(struct ArrowBuffer* buffer, + int32_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(int32_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendUInt32(struct ArrowBuffer* buffer, + uint32_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(uint32_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendInt64(struct ArrowBuffer* buffer, + int64_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(int64_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendUInt64(struct ArrowBuffer* buffer, + uint64_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(uint64_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendDouble(struct ArrowBuffer* buffer, + double value) { + return ArrowBufferAppend(buffer, &value, sizeof(double)); +} + +static inline ArrowErrorCode ArrowBufferAppendFloat(struct ArrowBuffer* buffer, + float value) { + return ArrowBufferAppend(buffer, &value, sizeof(float)); +} + +static inline ArrowErrorCode ArrowBufferAppendStringView(struct ArrowBuffer* buffer, + struct ArrowStringView value) { + return ArrowBufferAppend(buffer, value.data, value.size_bytes); +} + +static inline ArrowErrorCode ArrowBufferAppendBufferView(struct ArrowBuffer* buffer, + struct ArrowBufferView value) { + return ArrowBufferAppend(buffer, value.data.data, value.size_bytes); +} + +static inline ArrowErrorCode ArrowBufferAppendFill(struct ArrowBuffer* buffer, + uint8_t value, int64_t size_bytes) { + NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes)); + + memset(buffer->data + buffer->size_bytes, value, size_bytes); + buffer->size_bytes += size_bytes; + return NANOARROW_OK; +} + +static const uint8_t _ArrowkBitmask[] = {1, 2, 4, 8, 16, 32, 64, 128}; +static const uint8_t _ArrowkFlippedBitmask[] = {254, 253, 251, 247, 239, 223, 191, 127}; +static const uint8_t _ArrowkPrecedingBitmask[] = {0, 1, 3, 7, 15, 31, 63, 127}; +static const uint8_t _ArrowkTrailingBitmask[] = {255, 254, 252, 248, 240, 224, 192, 128}; + +static const uint8_t _ArrowkBytePopcount[] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, + 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, + 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, + 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, + 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, + 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, + 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, + 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; + +static inline int64_t _ArrowRoundUpToMultipleOf8(int64_t value) { + return (value + 7) & ~((int64_t)7); +} + +static inline int64_t _ArrowRoundDownToMultipleOf8(int64_t value) { + return (value / 8) * 8; +} + +static inline int64_t _ArrowBytesForBits(int64_t bits) { + return (bits >> 3) + ((bits & 7) != 0); +} + +static inline void _ArrowBitsUnpackInt8(const uint8_t word, int8_t* out) { + out[0] = (word & 0x1) != 0; + out[1] = (word & 0x2) != 0; + out[2] = (word & 0x4) != 0; + out[3] = (word & 0x8) != 0; + out[4] = (word & 0x10) != 0; + out[5] = (word & 0x20) != 0; 
+ out[6] = (word & 0x40) != 0; + out[7] = (word & 0x80) != 0; +} + +static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) { + *out = (values[0] + | ((values[1] + 0x1) & 0x2) + | ((values[2] + 0x3) & 0x4) + | ((values[3] + 0x7) & 0x8) + | ((values[4] + 0xf) & 0x10) + | ((values[5] + 0x1f) & 0x20) + | ((values[6] + 0x3f) & 0x40) + | ((values[7] + 0x7f) & 0x80) + ); +} + +static inline void _ArrowBitmapPackInt32(const int32_t* values, uint8_t* out) { + *out = (values[0] | values[1] << 1 | values[2] << 2 | values[3] << 3 | values[4] << 4 | + values[5] << 5 | values[6] << 6 | values[7] << 7); +} + +static inline int8_t ArrowBitGet(const uint8_t* bits, int64_t i) { + return (bits[i >> 3] >> (i & 0x07)) & 1; +} + +static inline void ArrowBitsUnpackInt8(const uint8_t* bits, int64_t start_offset, + int64_t length, int8_t* out) { + if (length == 0) { + return; + } + + const int64_t i_begin = start_offset; + const int64_t i_end = start_offset + length; + const int64_t i_last_valid = i_end - 1; + + const int64_t bytes_begin = i_begin / 8; + const int64_t bytes_last_valid = i_last_valid / 8; + + if (bytes_begin == bytes_last_valid) { + for (int i = 0; i < length; i++) { + out[i] = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } + + return; + } + + // first byte + for (int i = 0; i < 8 - (i_begin % 8); i++) { + *out++ = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } + + // middle bytes + for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { + _ArrowBitsUnpackInt8(bits[i], out); + out += 8; + } + + // last byte + const int bits_remaining = i_end % 8 == 0 ? 8 : i_end % 8; + for (int i = 0; i < bits_remaining; i++) { + *out++ = ArrowBitGet(&bits[bytes_last_valid], i); + } +} + +static inline void ArrowBitSet(uint8_t* bits, int64_t i) { + bits[i / 8] |= _ArrowkBitmask[i % 8]; +} + +static inline void ArrowBitClear(uint8_t* bits, int64_t i) { + bits[i / 8] &= _ArrowkFlippedBitmask[i % 8]; +} + +static inline void ArrowBitSetTo(uint8_t* bits, int64_t i, uint8_t bit_is_set) { + bits[i / 8] ^= + ((uint8_t)(-((uint8_t)(bit_is_set != 0)) ^ bits[i / 8])) & _ArrowkBitmask[i % 8]; +} + +static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t length, + uint8_t bits_are_set) { + const int64_t i_begin = start_offset; + const int64_t i_end = start_offset + length; + const uint8_t fill_byte = (uint8_t)(-bits_are_set); + + const int64_t bytes_begin = i_begin / 8; + const int64_t bytes_end = i_end / 8 + 1; + + const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_begin % 8]; + const uint8_t last_byte_mask = _ArrowkTrailingBitmask[i_end % 8]; + + if (bytes_end == bytes_begin + 1) { + // set bits within a single byte + const uint8_t only_byte_mask = + i_end % 8 == 0 ? 
first_byte_mask : (uint8_t)(first_byte_mask | last_byte_mask); + bits[bytes_begin] &= only_byte_mask; + bits[bytes_begin] |= (uint8_t)(fill_byte & ~only_byte_mask); + return; + } + + // set/clear trailing bits of first byte + bits[bytes_begin] &= first_byte_mask; + bits[bytes_begin] |= (uint8_t)(fill_byte & ~first_byte_mask); + + if (bytes_end - bytes_begin > 2) { + // set/clear whole bytes + memset(bits + bytes_begin + 1, fill_byte, (size_t)(bytes_end - bytes_begin - 2)); + } + + if (i_end % 8 == 0) { + return; + } + + // set/clear leading bits of last byte + bits[bytes_end - 1] &= last_byte_mask; + bits[bytes_end - 1] |= (uint8_t)(fill_byte & ~last_byte_mask); +} + +static inline int64_t ArrowBitCountSet(const uint8_t* bits, int64_t start_offset, + int64_t length) { + if (length == 0) { + return 0; + } + + const int64_t i_begin = start_offset; + const int64_t i_end = start_offset + length; + const int64_t i_last_valid = i_end - 1; + + const int64_t bytes_begin = i_begin / 8; + const int64_t bytes_last_valid = i_last_valid / 8; + + if (bytes_begin == bytes_last_valid) { + // count bits within a single byte + const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_end % 8]; + const uint8_t last_byte_mask = _ArrowkTrailingBitmask[i_begin % 8]; + + const uint8_t only_byte_mask = + i_end % 8 == 0 ? last_byte_mask : (uint8_t)(first_byte_mask & last_byte_mask); + + const uint8_t byte_masked = bits[bytes_begin] & only_byte_mask; + return _ArrowkBytePopcount[byte_masked]; + } + + const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_begin % 8]; + const uint8_t last_byte_mask = i_end % 8 == 0 ? 0 : _ArrowkTrailingBitmask[i_end % 8]; + int64_t count = 0; + + // first byte + count += _ArrowkBytePopcount[bits[bytes_begin] & ~first_byte_mask]; + + // middle bytes + for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { + count += _ArrowkBytePopcount[bits[i]]; + } + + // last byte + count += _ArrowkBytePopcount[bits[bytes_last_valid] & ~last_byte_mask]; + + return count; +} + +static inline void ArrowBitmapInit(struct ArrowBitmap* bitmap) { + ArrowBufferInit(&bitmap->buffer); + bitmap->size_bits = 0; +} + +static inline void ArrowBitmapMove(struct ArrowBitmap* src, struct ArrowBitmap* dst) { + ArrowBufferMove(&src->buffer, &dst->buffer); + dst->size_bits = src->size_bits; + src->size_bits = 0; +} + +static inline ArrowErrorCode ArrowBitmapReserve(struct ArrowBitmap* bitmap, + int64_t additional_size_bits) { + int64_t min_capacity_bits = bitmap->size_bits + additional_size_bits; + if (min_capacity_bits <= (bitmap->buffer.capacity_bytes * 8)) { + return NANOARROW_OK; + } + + NANOARROW_RETURN_NOT_OK( + ArrowBufferReserve(&bitmap->buffer, _ArrowBytesForBits(additional_size_bits))); + + bitmap->buffer.data[bitmap->buffer.capacity_bytes - 1] = 0; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowBitmapResize(struct ArrowBitmap* bitmap, + int64_t new_capacity_bits, + char shrink_to_fit) { + if (new_capacity_bits < 0) { + return EINVAL; + } + + int64_t new_capacity_bytes = _ArrowBytesForBits(new_capacity_bits); + NANOARROW_RETURN_NOT_OK( + ArrowBufferResize(&bitmap->buffer, new_capacity_bytes, shrink_to_fit)); + + if (new_capacity_bits < bitmap->size_bits) { + bitmap->size_bits = new_capacity_bits; + } + + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowBitmapAppend(struct ArrowBitmap* bitmap, + uint8_t bits_are_set, int64_t length) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(bitmap, length)); + + ArrowBitmapAppendUnsafe(bitmap, bits_are_set, length); + return 
NANOARROW_OK; +} + +static inline void ArrowBitmapAppendUnsafe(struct ArrowBitmap* bitmap, + uint8_t bits_are_set, int64_t length) { + ArrowBitsSetTo(bitmap->buffer.data, bitmap->size_bits, length, bits_are_set); + bitmap->size_bits += length; + bitmap->buffer.size_bytes = _ArrowBytesForBits(bitmap->size_bits); +} + +static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap, + const int8_t* values, int64_t n_values) { + if (n_values == 0) { + return; + } + + const int8_t* values_cursor = values; + int64_t n_remaining = n_values; + int64_t out_i_cursor = bitmap->size_bits; + uint8_t* out_cursor = bitmap->buffer.data + bitmap->size_bits / 8; + + // First byte + if ((out_i_cursor % 8) != 0) { + int64_t n_partial_bits = _ArrowRoundUpToMultipleOf8(out_i_cursor) - out_i_cursor; + for (int i = 0; i < n_partial_bits; i++) { + ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values[i]); + } + + out_cursor++; + values_cursor += n_partial_bits; + n_remaining -= n_partial_bits; + } + + // Middle bytes + int64_t n_full_bytes = n_remaining / 8; + for (int64_t i = 0; i < n_full_bytes; i++) { + _ArrowBitmapPackInt8(values_cursor, out_cursor); + values_cursor += 8; + out_cursor++; + } + + // Last byte + out_i_cursor += n_full_bytes * 8; + n_remaining -= n_full_bytes * 8; + if (n_remaining > 0) { + // Zero out the last byte + *out_cursor = 0x00; + for (int i = 0; i < n_remaining; i++) { + ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values_cursor[i]); + } + out_cursor++; + } + + bitmap->size_bits += n_values; + bitmap->buffer.size_bytes = out_cursor - bitmap->buffer.data; +} + +static inline void ArrowBitmapAppendInt32Unsafe(struct ArrowBitmap* bitmap, + const int32_t* values, int64_t n_values) { + if (n_values == 0) { + return; + } + + const int32_t* values_cursor = values; + int64_t n_remaining = n_values; + int64_t out_i_cursor = bitmap->size_bits; + uint8_t* out_cursor = bitmap->buffer.data + bitmap->size_bits / 8; + + // First byte + if ((out_i_cursor % 8) != 0) { + int64_t n_partial_bits = _ArrowRoundUpToMultipleOf8(out_i_cursor) - out_i_cursor; + for (int i = 0; i < n_partial_bits; i++) { + ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values[i]); + } + + out_cursor++; + values_cursor += n_partial_bits; + n_remaining -= n_partial_bits; + } + + // Middle bytes + int64_t n_full_bytes = n_remaining / 8; + for (int64_t i = 0; i < n_full_bytes; i++) { + _ArrowBitmapPackInt32(values_cursor, out_cursor); + values_cursor += 8; + out_cursor++; + } + + // Last byte + out_i_cursor += n_full_bytes * 8; + n_remaining -= n_full_bytes * 8; + if (n_remaining > 0) { + // Zero out the last byte + *out_cursor = 0x00; + for (int i = 0; i < n_remaining; i++) { + ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values_cursor[i]); + } + out_cursor++; + } + + bitmap->size_bits += n_values; + bitmap->buffer.size_bytes = out_cursor - bitmap->buffer.data; +} + +static inline void ArrowBitmapReset(struct ArrowBitmap* bitmap) { + ArrowBufferReset(&bitmap->buffer); + bitmap->size_bits = 0; +} + +#ifdef __cplusplus +} +#endif + +#endif +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef NANOARROW_ARRAY_INLINE_H_INCLUDED +#define NANOARROW_ARRAY_INLINE_H_INCLUDED + +#include +#include +#include +#include +#include + + + + +#ifdef __cplusplus +extern "C" { +#endif + +static inline struct ArrowBitmap* ArrowArrayValidityBitmap(struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + return &private_data->bitmap; +} + +static inline struct ArrowBuffer* ArrowArrayBuffer(struct ArrowArray* array, int64_t i) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + switch (i) { + case 0: + return &private_data->bitmap.buffer; + default: + return private_data->buffers + i - 1; + } +} + +// We don't currently support the case of unions where type_id != child_index; +// however, these functions are used to keep track of where that assumption +// is made. +static inline int8_t _ArrowArrayUnionChildIndex(struct ArrowArray* array, + int8_t type_id) { + return type_id; +} + +static inline int8_t _ArrowArrayUnionTypeId(struct ArrowArray* array, + int8_t child_index) { + return child_index; +} + +static inline int8_t _ArrowParseUnionTypeIds(const char* type_ids, int8_t* out) { + if (*type_ids == '\0') { + return 0; + } + + int32_t i = 0; + long type_id; + char* end_ptr; + do { + type_id = strtol(type_ids, &end_ptr, 10); + if (end_ptr == type_ids || type_id < 0 || type_id > 127) { + return -1; + } + + if (out != NULL) { + out[i] = (int8_t)type_id; + } + + i++; + + type_ids = end_ptr; + if (*type_ids == '\0') { + return i; + } else if (*type_ids != ',') { + return -1; + } else { + type_ids++; + } + } while (1); + + return -1; +} + +static inline int8_t _ArrowParsedUnionTypeIdsWillEqualChildIndices(const int8_t* type_ids, + int64_t n_type_ids, + int64_t n_children) { + if (n_type_ids != n_children) { + return 0; + } + + for (int8_t i = 0; i < n_type_ids; i++) { + if (type_ids[i] != i) { + return 0; + } + } + + return 1; +} + +static inline int8_t _ArrowUnionTypeIdsWillEqualChildIndices(const char* type_id_str, + int64_t n_children) { + int8_t type_ids[128]; + int8_t n_type_ids = _ArrowParseUnionTypeIds(type_id_str, type_ids); + return _ArrowParsedUnionTypeIdsWillEqualChildIndices(type_ids, n_type_ids, n_children); +} + +static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + switch (private_data->storage_type) { + case NANOARROW_TYPE_UNINITIALIZED: + return EINVAL; + case NANOARROW_TYPE_SPARSE_UNION: + case NANOARROW_TYPE_DENSE_UNION: + // Note that this value could be -1 if the type_ids string was invalid + if (private_data->union_type_id_is_child_index != 1) { + return EINVAL; + } else { + break; + } + default: + break; + } + if (private_data->storage_type == NANOARROW_TYPE_UNINITIALIZED) { + return EINVAL; + } + + // Initialize any data offset buffer with a single zero + for (int i = 0; i < 3; i++) { + if (private_data->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_DATA_OFFSET && + 
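+          // (Illustrative: priming each offset buffer with a single zero
+          // is what lets element i span offsets[i]..offsets[i+1]; a
+          // STRING array's offset buffer starts as [0] and grows to
+          // [0, 2, 3] after appending "ab" and then "c".)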
private_data->layout.element_size_bits[i] == 64) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt64(ArrowArrayBuffer(array, i), 0)); + } else if (private_data->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_DATA_OFFSET && + private_data->layout.element_size_bits[i] == 32) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(ArrowArrayBuffer(array, i), 0)); + } + } + + // Start building any child arrays or dictionaries + for (int64_t i = 0; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array->children[i])); + } + + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array->dictionary)); + } + + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayShrinkToFit(struct ArrowArray* array) { + for (int64_t i = 0; i < 3; i++) { + struct ArrowBuffer* buffer = ArrowArrayBuffer(array, i); + NANOARROW_RETURN_NOT_OK(ArrowBufferResize(buffer, buffer->size_bytes, 1)); + } + + for (int64_t i = 0; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayShrinkToFit(array->children[i])); + } + + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayShrinkToFit(array->dictionary)); + } + + return NANOARROW_OK; +} + +static inline ArrowErrorCode _ArrowArrayAppendBits(struct ArrowArray* array, + int64_t buffer_i, uint8_t value, + int64_t n) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + struct ArrowBuffer* buffer = ArrowArrayBuffer(array, buffer_i); + int64_t bytes_required = + _ArrowRoundUpToMultipleOf8(private_data->layout.element_size_bits[buffer_i] * + (array->length + 1)) / + 8; + if (bytes_required > buffer->size_bytes) { + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendFill(buffer, 0, bytes_required - buffer->size_bytes)); + } + + ArrowBitsSetTo(buffer->data, array->length, n, value); + return NANOARROW_OK; +} + +static inline ArrowErrorCode _ArrowArrayAppendEmptyInternal(struct ArrowArray* array, + int64_t n, uint8_t is_valid) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + if (n == 0) { + return NANOARROW_OK; + } + + // Some type-specific handling + switch (private_data->storage_type) { + case NANOARROW_TYPE_NA: + // (An empty value for a null array *is* a null) + array->null_count += n; + array->length += n; + return NANOARROW_OK; + + case NANOARROW_TYPE_DENSE_UNION: { + // Add one null to the first child and append n references to that child + int8_t type_id = _ArrowArrayUnionTypeId(array, 0); + NANOARROW_RETURN_NOT_OK( + _ArrowArrayAppendEmptyInternal(array->children[0], 1, is_valid)); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendFill(ArrowArrayBuffer(array, 0), type_id, n)); + for (int64_t i = 0; i < n; i++) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32( + ArrowArrayBuffer(array, 1), (int32_t)array->children[0]->length - 1)); + } + // For the purposes of array->null_count, union elements are never considered "null" + // even if some children contain nulls. 
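+      // (Illustrative: appending n empties to a dense union therefore
+      // costs one null slot in children[0] plus n type ids and n int32
+      // offsets that all reference that same child slot.)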
+ array->length += n; + return NANOARROW_OK; + } + + case NANOARROW_TYPE_SPARSE_UNION: { + // Add n nulls to the first child and append n references to that child + int8_t type_id = _ArrowArrayUnionTypeId(array, 0); + NANOARROW_RETURN_NOT_OK( + _ArrowArrayAppendEmptyInternal(array->children[0], n, is_valid)); + for (int64_t i = 1; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], n)); + } + + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendFill(ArrowArrayBuffer(array, 0), type_id, n)); + // For the purposes of array->null_count, union elements are never considered "null" + // even if some children contain nulls. + array->length += n; + return NANOARROW_OK; + } + + case NANOARROW_TYPE_FIXED_SIZE_LIST: + NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty( + array->children[0], n * private_data->layout.child_size_elements)); + break; + case NANOARROW_TYPE_STRUCT: + for (int64_t i = 0; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], n)); + } + break; + + default: + break; + } + + // Append n is_valid bits to the validity bitmap. If we haven't allocated a bitmap yet + // and we need to append nulls, do it now. + if (!is_valid && private_data->bitmap.buffer.data == NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(&private_data->bitmap, array->length + n)); + ArrowBitmapAppendUnsafe(&private_data->bitmap, 1, array->length); + ArrowBitmapAppendUnsafe(&private_data->bitmap, is_valid, n); + } else if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(&private_data->bitmap, n)); + ArrowBitmapAppendUnsafe(&private_data->bitmap, is_valid, n); + } + + // Add appropriate buffer fill + struct ArrowBuffer* buffer; + int64_t size_bytes; + + for (int i = 0; i < 3; i++) { + buffer = ArrowArrayBuffer(array, i); + size_bytes = private_data->layout.element_size_bits[i] / 8; + + switch (private_data->layout.buffer_type[i]) { + case NANOARROW_BUFFER_TYPE_NONE: + case NANOARROW_BUFFER_TYPE_VALIDITY: + continue; + case NANOARROW_BUFFER_TYPE_DATA_OFFSET: + // Append the current value at the end of the offset buffer for each element + NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes * n)); + + for (int64_t j = 0; j < n; j++) { + ArrowBufferAppendUnsafe(buffer, buffer->data + size_bytes * (array->length + j), + size_bytes); + } + + // Skip the data buffer + i++; + continue; + case NANOARROW_BUFFER_TYPE_DATA: + // Zero out the next bit of memory + if (private_data->layout.element_size_bits[i] % 8 == 0) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFill(buffer, 0, size_bytes * n)); + } else { + NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, i, 0, n)); + } + continue; + + case NANOARROW_BUFFER_TYPE_TYPE_ID: + case NANOARROW_BUFFER_TYPE_UNION_OFFSET: + // These cases return above + return EINVAL; + } + } + + array->length += n; + array->null_count += n * !is_valid; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendNull(struct ArrowArray* array, int64_t n) { + return _ArrowArrayAppendEmptyInternal(array, n, 0); +} + +static inline ArrowErrorCode ArrowArrayAppendEmpty(struct ArrowArray* array, int64_t n) { + return _ArrowArrayAppendEmptyInternal(array, n, 1); +} + +static inline ArrowErrorCode ArrowArrayAppendInt(struct ArrowArray* array, + int64_t value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch 
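+// Illustrative append loop for the value appenders that follow.
+// ArrowArrayInitFromType and the final "finish building" call belong to
+// the wider nanoarrow API, are not shown in this excerpt, and their exact
+// spelling varies by nanoarrow version:
+//
+//   struct ArrowArray array;
+//   ArrowArrayInitFromType(&array, NANOARROW_TYPE_INT32);
+//   ArrowArrayStartAppending(&array);
+//   ArrowArrayAppendInt(&array, 42);  // range-checked into int32
+//   ArrowArrayAppendNull(&array, 1);  // allocates the validity bitmap
+//   ArrowArrayFinishBuildingDefault(&array, NULL);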
(private_data->storage_type) { + case NANOARROW_TYPE_INT64: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &value, sizeof(int64_t))); + break; + case NANOARROW_TYPE_INT32: + _NANOARROW_CHECK_RANGE(value, INT32_MIN, INT32_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, (int32_t)value)); + break; + case NANOARROW_TYPE_INT16: + _NANOARROW_CHECK_RANGE(value, INT16_MIN, INT16_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt16(data_buffer, (int16_t)value)); + break; + case NANOARROW_TYPE_INT8: + _NANOARROW_CHECK_RANGE(value, INT8_MIN, INT8_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt8(data_buffer, (int8_t)value)); + break; + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_UINT8: + _NANOARROW_CHECK_RANGE(value, 0, INT64_MAX); + return ArrowArrayAppendUInt(array, value); + case NANOARROW_TYPE_DOUBLE: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendDouble(data_buffer, (double)value)); + break; + case NANOARROW_TYPE_FLOAT: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); + break; + case NANOARROW_TYPE_BOOL: + NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, 1, value != 0, 1)); + break; + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendUInt(struct ArrowArray* array, + uint64_t value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_UINT64: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &value, sizeof(uint64_t))); + break; + case NANOARROW_TYPE_UINT32: + _NANOARROW_CHECK_UPPER_LIMIT(value, UINT32_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt32(data_buffer, (uint32_t)value)); + break; + case NANOARROW_TYPE_UINT16: + _NANOARROW_CHECK_UPPER_LIMIT(value, UINT16_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt16(data_buffer, (uint16_t)value)); + break; + case NANOARROW_TYPE_UINT8: + _NANOARROW_CHECK_UPPER_LIMIT(value, UINT8_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt8(data_buffer, (uint8_t)value)); + break; + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_INT8: + _NANOARROW_CHECK_UPPER_LIMIT(value, INT64_MAX); + return ArrowArrayAppendInt(array, value); + case NANOARROW_TYPE_DOUBLE: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendDouble(data_buffer, (double)value)); + break; + case NANOARROW_TYPE_FLOAT: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); + break; + case NANOARROW_TYPE_BOOL: + NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, 1, value != 0, 1)); + break; + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendDouble(struct ArrowArray* array, + double value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_DOUBLE: + 
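+      // (Note on the two integer appenders above: each forwards to the
+      // other on mismatched signedness, so ArrowArrayAppendUInt(a, 7) on an
+      // INT32 column is stored as int32, while ArrowArrayAppendInt(a, -1)
+      // on a UINT* column fails _NANOARROW_CHECK_RANGE(value, 0, INT64_MAX)
+      // and returns EINVAL.)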
NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &value, sizeof(double))); + break; + case NANOARROW_TYPE_FLOAT: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); + break; + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendBytes(struct ArrowArray* array, + struct ArrowBufferView value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + struct ArrowBuffer* offset_buffer = ArrowArrayBuffer(array, 1); + struct ArrowBuffer* data_buffer = ArrowArrayBuffer( + array, 1 + (private_data->storage_type != NANOARROW_TYPE_FIXED_SIZE_BINARY)); + int32_t offset; + int64_t large_offset; + int64_t fixed_size_bytes = private_data->layout.element_size_bits[1] / 8; + + switch (private_data->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_BINARY: + offset = ((int32_t*)offset_buffer->data)[array->length]; + if ((offset + value.size_bytes) > INT32_MAX) { + return EINVAL; + } + + offset += (int32_t)value.size_bytes; + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(offset_buffer, &offset, sizeof(int32_t))); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); + break; + + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_LARGE_BINARY: + large_offset = ((int64_t*)offset_buffer->data)[array->length]; + large_offset += value.size_bytes; + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(offset_buffer, &large_offset, sizeof(int64_t))); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); + break; + + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + if (value.size_bytes != fixed_size_bytes) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); + break; + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendString(struct ArrowArray* array, + struct ArrowStringView value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + struct ArrowBufferView buffer_view; + buffer_view.data.data = value.data; + buffer_view.size_bytes = value.size_bytes; + + switch (private_data->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_BINARY: + case NANOARROW_TYPE_LARGE_BINARY: + return ArrowArrayAppendBytes(array, buffer_view); + default: + return EINVAL; + } +} + +static inline ArrowErrorCode ArrowArrayAppendInterval(struct ArrowArray* array, + struct ArrowInterval* value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_INTERVAL_MONTHS: { + if (value->type != NANOARROW_TYPE_INTERVAL_MONTHS) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->months)); + break; + } + case NANOARROW_TYPE_INTERVAL_DAY_TIME: { + if (value->type != NANOARROW_TYPE_INTERVAL_DAY_TIME) { + return EINVAL; + } + + 
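+      // (Illustrative, for ArrowArrayAppendBytes above: on a STRING column
+      // whose int32 offset buffer currently ends in 5, appending the two
+      // bytes "hi" copies them into the data buffer and writes 7 as the
+      // new end offset, so the offset buffer always holds length + 1
+      // entries.)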
NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->days)); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->ms)); + break; + } + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: { + if (value->type != NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->months)); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->days)); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt64(data_buffer, value->ns)); + break; + } + default: + return EINVAL; + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendDecimal(struct ArrowArray* array, + struct ArrowDecimal* value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_DECIMAL128: + if (value->n_words != 2) { + return EINVAL; + } else { + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, value->words, 2 * sizeof(uint64_t))); + break; + } + case NANOARROW_TYPE_DECIMAL256: + if (value->n_words != 4) { + return EINVAL; + } else { + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, value->words, 4 * sizeof(uint64_t))); + break; + } + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayFinishElement(struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + int64_t child_length; + + switch (private_data->storage_type) { + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_MAP: + child_length = array->children[0]->length; + if (child_length > INT32_MAX) { + return EINVAL; + } + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt32(ArrowArrayBuffer(array, 1), (int32_t)child_length)); + break; + case NANOARROW_TYPE_LARGE_LIST: + child_length = array->children[0]->length; + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt64(ArrowArrayBuffer(array, 1), child_length)); + break; + case NANOARROW_TYPE_FIXED_SIZE_LIST: + child_length = array->children[0]->length; + if (child_length != + ((array->length + 1) * private_data->layout.child_size_elements)) { + return EINVAL; + } + break; + case NANOARROW_TYPE_STRUCT: + for (int64_t i = 0; i < array->n_children; i++) { + child_length = array->children[i]->length; + if (child_length != (array->length + 1)) { + return EINVAL; + } + } + break; + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayFinishUnionElement(struct ArrowArray* array, + int8_t type_id) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + int64_t child_index = _ArrowArrayUnionChildIndex(array, type_id); + if (child_index < 0 || child_index >= array->n_children) { + return EINVAL; + } + + switch (private_data->storage_type) { + case NANOARROW_TYPE_DENSE_UNION: + // Append the target child length to the union offsets buffer + _NANOARROW_CHECK_RANGE(array->children[child_index]->length, 0, INT32_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32( + 
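+          // (Illustrative, for ArrowArrayFinishElement above: to append the
+          // list element [1, 2] to a LIST<int32> array, append 1 and 2 to
+          // array->children[0] and then call ArrowArrayFinishElement(array);
+          // the child length becomes the next int32 offset and the parent
+          // length is incremented.)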
ArrowArrayBuffer(array, 1), (int32_t)array->children[child_index]->length - 1)); + break; + case NANOARROW_TYPE_SPARSE_UNION: + // Append one empty to any non-target column that isn't already the right length + // or abort if appending a null will result in a column with invalid length + for (int64_t i = 0; i < array->n_children; i++) { + if (i == child_index || array->children[i]->length == (array->length + 1)) { + continue; + } + + if (array->children[i]->length != array->length) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], 1)); + } + + break; + default: + return EINVAL; + } + + // Write to the type_ids buffer + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt8(ArrowArrayBuffer(array, 0), (int8_t)type_id)); + array->length++; + return NANOARROW_OK; +} + +static inline void ArrowArrayViewMove(struct ArrowArrayView* src, + struct ArrowArrayView* dst) { + memcpy(dst, src, sizeof(struct ArrowArrayView)); + ArrowArrayViewInitFromType(src, NANOARROW_TYPE_UNINITIALIZED); +} + +static inline int8_t ArrowArrayViewIsNull(struct ArrowArrayView* array_view, int64_t i) { + const uint8_t* validity_buffer = array_view->buffer_views[0].data.as_uint8; + i += array_view->offset; + switch (array_view->storage_type) { + case NANOARROW_TYPE_NA: + return 0x01; + case NANOARROW_TYPE_DENSE_UNION: + case NANOARROW_TYPE_SPARSE_UNION: + // Unions are "never null" in Arrow land + return 0x00; + default: + return validity_buffer != NULL && !ArrowBitGet(validity_buffer, i); + } +} + +static inline int8_t ArrowArrayViewUnionTypeId(struct ArrowArrayView* array_view, + int64_t i) { + switch (array_view->storage_type) { + case NANOARROW_TYPE_DENSE_UNION: + case NANOARROW_TYPE_SPARSE_UNION: + return array_view->buffer_views[0].data.as_int8[i]; + default: + return -1; + } +} + +static inline int8_t ArrowArrayViewUnionChildIndex(struct ArrowArrayView* array_view, + int64_t i) { + int8_t type_id = ArrowArrayViewUnionTypeId(array_view, i); + if (array_view->union_type_id_map == NULL) { + return type_id; + } else { + return array_view->union_type_id_map[type_id]; + } +} + +static inline int64_t ArrowArrayViewUnionChildOffset(struct ArrowArrayView* array_view, + int64_t i) { + switch (array_view->storage_type) { + case NANOARROW_TYPE_DENSE_UNION: + return array_view->buffer_views[1].data.as_int32[i]; + case NANOARROW_TYPE_SPARSE_UNION: + return i; + default: + return -1; + } +} + +static inline int64_t ArrowArrayViewListChildOffset(struct ArrowArrayView* array_view, + int64_t i) { + switch (array_view->storage_type) { + case NANOARROW_TYPE_LIST: + return array_view->buffer_views[1].data.as_int32[i]; + case NANOARROW_TYPE_LARGE_LIST: + return array_view->buffer_views[1].data.as_int64[i]; + default: + return -1; + } +} + +static inline int64_t ArrowArrayViewGetIntUnsafe(struct ArrowArrayView* array_view, + int64_t i) { + struct ArrowBufferView* data_view = &array_view->buffer_views[1]; + i += array_view->offset; + switch (array_view->storage_type) { + case NANOARROW_TYPE_INT64: + return data_view->data.as_int64[i]; + case NANOARROW_TYPE_UINT64: + return data_view->data.as_uint64[i]; + case NANOARROW_TYPE_INT32: + return data_view->data.as_int32[i]; + case NANOARROW_TYPE_UINT32: + return data_view->data.as_uint32[i]; + case NANOARROW_TYPE_INT16: + return data_view->data.as_int16[i]; + case NANOARROW_TYPE_UINT16: + return data_view->data.as_uint16[i]; + case NANOARROW_TYPE_INT8: + return data_view->data.as_int8[i]; + case NANOARROW_TYPE_UINT8: + return data_view->data.as_uint8[i]; + 
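+    // (Usage note: the *Unsafe getters perform no null or bounds checks,
+    // so callers pair them with ArrowArrayViewIsNull, e.g.
+    //   if (!ArrowArrayViewIsNull(view, i))
+    //     total += ArrowArrayViewGetIntUnsafe(view, i);
+    // Unsupported storage types fall through to the INT64_MAX sentinel
+    // below.)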
case NANOARROW_TYPE_DOUBLE: + return (int64_t)data_view->data.as_double[i]; + case NANOARROW_TYPE_FLOAT: + return (int64_t)data_view->data.as_float[i]; + case NANOARROW_TYPE_BOOL: + return ArrowBitGet(data_view->data.as_uint8, i); + default: + return INT64_MAX; + } +} + +static inline uint64_t ArrowArrayViewGetUIntUnsafe(struct ArrowArrayView* array_view, + int64_t i) { + i += array_view->offset; + struct ArrowBufferView* data_view = &array_view->buffer_views[1]; + switch (array_view->storage_type) { + case NANOARROW_TYPE_INT64: + return data_view->data.as_int64[i]; + case NANOARROW_TYPE_UINT64: + return data_view->data.as_uint64[i]; + case NANOARROW_TYPE_INT32: + return data_view->data.as_int32[i]; + case NANOARROW_TYPE_UINT32: + return data_view->data.as_uint32[i]; + case NANOARROW_TYPE_INT16: + return data_view->data.as_int16[i]; + case NANOARROW_TYPE_UINT16: + return data_view->data.as_uint16[i]; + case NANOARROW_TYPE_INT8: + return data_view->data.as_int8[i]; + case NANOARROW_TYPE_UINT8: + return data_view->data.as_uint8[i]; + case NANOARROW_TYPE_DOUBLE: + return (uint64_t)data_view->data.as_double[i]; + case NANOARROW_TYPE_FLOAT: + return (uint64_t)data_view->data.as_float[i]; + case NANOARROW_TYPE_BOOL: + return ArrowBitGet(data_view->data.as_uint8, i); + default: + return UINT64_MAX; + } +} + +static inline double ArrowArrayViewGetDoubleUnsafe(struct ArrowArrayView* array_view, + int64_t i) { + i += array_view->offset; + struct ArrowBufferView* data_view = &array_view->buffer_views[1]; + switch (array_view->storage_type) { + case NANOARROW_TYPE_INT64: + return (double)data_view->data.as_int64[i]; + case NANOARROW_TYPE_UINT64: + return (double)data_view->data.as_uint64[i]; + case NANOARROW_TYPE_INT32: + return data_view->data.as_int32[i]; + case NANOARROW_TYPE_UINT32: + return data_view->data.as_uint32[i]; + case NANOARROW_TYPE_INT16: + return data_view->data.as_int16[i]; + case NANOARROW_TYPE_UINT16: + return data_view->data.as_uint16[i]; + case NANOARROW_TYPE_INT8: + return data_view->data.as_int8[i]; + case NANOARROW_TYPE_UINT8: + return data_view->data.as_uint8[i]; + case NANOARROW_TYPE_DOUBLE: + return data_view->data.as_double[i]; + case NANOARROW_TYPE_FLOAT: + return data_view->data.as_float[i]; + case NANOARROW_TYPE_BOOL: + return ArrowBitGet(data_view->data.as_uint8, i); + default: + return DBL_MAX; + } +} + +static inline struct ArrowStringView ArrowArrayViewGetStringUnsafe( + struct ArrowArrayView* array_view, int64_t i) { + i += array_view->offset; + struct ArrowBufferView* offsets_view = &array_view->buffer_views[1]; + const char* data_view = array_view->buffer_views[2].data.as_char; + + struct ArrowStringView view; + switch (array_view->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_BINARY: + view.data = data_view + offsets_view->data.as_int32[i]; + view.size_bytes = + offsets_view->data.as_int32[i + 1] - offsets_view->data.as_int32[i]; + break; + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_LARGE_BINARY: + view.data = data_view + offsets_view->data.as_int64[i]; + view.size_bytes = + offsets_view->data.as_int64[i + 1] - offsets_view->data.as_int64[i]; + break; + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + view.size_bytes = array_view->layout.element_size_bits[1] / 8; + view.data = array_view->buffer_views[1].data.as_char + (i * view.size_bytes); + break; + default: + view.data = NULL; + view.size_bytes = 0; + break; + } + + return view; +} + +static inline struct ArrowBufferView ArrowArrayViewGetBytesUnsafe( + struct ArrowArrayView* 
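+// (Usage note for ArrowArrayViewGetStringUnsafe above: the returned view
+// is not NUL-terminated, so print it with an explicit length, e.g.
+//   struct ArrowStringView sv = ArrowArrayViewGetStringUnsafe(view, i);
+//   printf("%.*s", (int)sv.size_bytes, sv.data);
+// where `view` is an assumed, already-validated ArrowArrayView*.)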
array_view, int64_t i) { + i += array_view->offset; + struct ArrowBufferView* offsets_view = &array_view->buffer_views[1]; + const uint8_t* data_view = array_view->buffer_views[2].data.as_uint8; + + struct ArrowBufferView view; + switch (array_view->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_BINARY: + view.size_bytes = + offsets_view->data.as_int32[i + 1] - offsets_view->data.as_int32[i]; + view.data.as_uint8 = data_view + offsets_view->data.as_int32[i]; + break; + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_LARGE_BINARY: + view.size_bytes = + offsets_view->data.as_int64[i + 1] - offsets_view->data.as_int64[i]; + view.data.as_uint8 = data_view + offsets_view->data.as_int64[i]; + break; + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + view.size_bytes = array_view->layout.element_size_bits[1] / 8; + view.data.as_uint8 = + array_view->buffer_views[1].data.as_uint8 + (i * view.size_bytes); + break; + default: + view.data.data = NULL; + view.size_bytes = 0; + break; + } + + return view; +} + +static inline void ArrowArrayViewGetIntervalUnsafe(struct ArrowArrayView* array_view, + int64_t i, struct ArrowInterval* out) { + const uint8_t* data_view = array_view->buffer_views[1].data.as_uint8; + switch (array_view->storage_type) { + case NANOARROW_TYPE_INTERVAL_MONTHS: { + const size_t size = sizeof(int32_t); + memcpy(&out->months, data_view + i * size, sizeof(int32_t)); + break; + } + case NANOARROW_TYPE_INTERVAL_DAY_TIME: { + const size_t size = sizeof(int32_t) + sizeof(int32_t); + memcpy(&out->days, data_view + i * size, sizeof(int32_t)); + memcpy(&out->ms, data_view + i * size + 4, sizeof(int32_t)); + break; + } + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: { + const size_t size = sizeof(int32_t) + sizeof(int32_t) + sizeof(int64_t); + memcpy(&out->months, data_view + i * size, sizeof(int32_t)); + memcpy(&out->days, data_view + i * size + 4, sizeof(int32_t)); + memcpy(&out->ns, data_view + i * size + 8, sizeof(int64_t)); + break; + } + default: + break; + } +} + +static inline void ArrowArrayViewGetDecimalUnsafe(struct ArrowArrayView* array_view, + int64_t i, struct ArrowDecimal* out) { + i += array_view->offset; + const uint8_t* data_view = array_view->buffer_views[1].data.as_uint8; + switch (array_view->storage_type) { + case NANOARROW_TYPE_DECIMAL128: + ArrowDecimalSetBytes(out, data_view + (i * 16)); + break; + case NANOARROW_TYPE_DECIMAL256: + ArrowDecimalSetBytes(out, data_view + (i * 32)); + break; + default: + memset(out->words, 0, sizeof(out->words)); + break; + } +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 5c7680bc6fb6c..c3d7769e6a612 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -1182,7 +1182,7 @@ cdef class MaskedIndexEngine(IndexEngine): def _get_mask(self, object values) -> np.ndarray: if hasattr(values, "_mask"): - return values._mask + return values._mask.to_numpy() # We are an ArrowExtensionArray return values.isna() diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index fd632790546f6..68f3c383499ac 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -62,7 +62,7 @@ libs_sources = { # Dict of extension name -> dict of {sources, include_dirs, and deps} # numpy include dir is implicitly included 'algos': {'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper, _khash_primitive_helper]}, - 'arrays': {'sources': ['arrays.pyx']}, + 'arrays': {'sources': ['arrays.pyx', 'src/vendored/nanoarrow.c', 
'src/bitmask_algorithms.c'], 'includes': ['include/pandas/vendored']}, 'groupby': {'sources': ['groupby.pyx']}, 'hashing': {'sources': ['hashing.pyx']}, 'hashtable': {'sources': ['hashtable.pyx', _khash_primitive_helper, _hashtable_class_helper, _hashtable_func_helper]}, @@ -107,7 +107,7 @@ foreach ext_name, ext_dict : libs_sources ext_name, ext_dict.get('sources'), cython_args: ['--include-dir', meson.current_build_dir(), '-X always_allow_keywords=true'], - include_directories: [inc_np, inc_pd], + include_directories: [inc_np, inc_pd] + ext_dict.get('includes', []), dependencies: ext_dict.get('deps', ''), subdir: 'pandas/_libs', install: true diff --git a/pandas/_libs/src/bitmask_algorithms.c b/pandas/_libs/src/bitmask_algorithms.c new file mode 100644 index 0000000000000..6b944729445d0 --- /dev/null +++ b/pandas/_libs/src/bitmask_algorithms.c @@ -0,0 +1,416 @@ +/* + +Copyright (c) 2023, PyData Development Team +All rights reserved. + +Distributed under the terms of the BSD Simplified License. + +The full license is in the LICENSE file, distributed with this software. + +*/ + + +#include + +#include "pandas/bitmask_algorithms.h" + +static const uint8_t clear_mask[8] = {0x0, 0x1, 0x3, 0x7, + 0xf, 0x1f, 0x3f, 0x7f}; + +void ConcatenateBitmapData(const struct ArrowBitmap **bitmaps, size_t nbitmaps, + struct ArrowBitmap *out) { + if (nbitmaps == 0) { + return; + } + + int64_t bits_processed = 0; + uint8_t *out_cursor = out->buffer.data; + size_t start_bit_pos = 0; + for (size_t i = 0; i < nbitmaps; i++) { + const struct ArrowBitmap *bitmap = bitmaps[i]; + const int64_t nbytes = bitmap->buffer.size_bytes; + if (nbytes == 0) { + continue; + } + const size_t trailing_nbits = bitmap->size_bits % 8; + + // As we loop through each array, any time we end up starting + // on a word boundary we can simply use memcpy. If we are not + // so lucky we fall back to bit shifting each element + if (start_bit_pos == 0) { + memcpy(out_cursor, bitmap->buffer.data, nbytes); + } else { + for (int64_t j = 0; j < nbytes - 1; j++) { + const uint8_t lshifted = bitmap->buffer.data[j] << start_bit_pos; + out_cursor[j] = (out_cursor[j] & clear_mask[start_bit_pos]) | lshifted; + + const uint8_t rshifted = bitmap->buffer.data[j] >> (8 - start_bit_pos); + out_cursor[j + 1] = rshifted; + } + + // last byte can overrun - check outside loop for performance + const size_t index = nbytes - 1; + const uint8_t lshifted = bitmap->buffer.data[index] << start_bit_pos; + out_cursor[index] = + (out_cursor[index] & clear_mask[start_bit_pos]) | lshifted; + + if (out_cursor - out->buffer.data < out->buffer.capacity_bytes - 1) { + const uint8_t rshifted = + bitmap->buffer.data[index] >> (8 - start_bit_pos); + out_cursor[index + 1] = rshifted; + } + } + + out_cursor += nbytes; + const int64_t next_bit_pos = start_bit_pos + trailing_nbits; + if ((next_bit_pos > 0) && (next_bit_pos < 8)) { + out_cursor--; + } + + start_bit_pos = next_bit_pos % 8; + bits_processed += bitmap->size_bits; + } + + out->size_bits = bits_processed; + out->buffer.size_bytes = bits_processed / 8; + if ((bits_processed % 8) > 0) { + out->buffer.size_bytes += 1; + } +} + +bool BitmapAny(const struct ArrowBitmap *bitmap) { + const size_t nbits = bitmap->size_bits; + const size_t size_bytes = bitmap->buffer.size_bytes; + if (nbits < 1) { + return false; + } + + const size_t overflow_limit = SIZE_MAX - sizeof(size_t); + const size_t limit = + size_bytes > overflow_limit ? 
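+  // (Worked example for ConcatenateBitmapData above, illustrative: joining
+  // a 3-bit map [1,0,1] with a 2-bit map [1,1] cannot take the memcpy fast
+  // path for the second map, so each of its bytes is shifted left by 3 and
+  // merged into the current output byte, with the displaced high bits
+  // shifted right by 5 into the following byte, yielding the packed bits
+  // 1,0,1,1,1.)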
overflow_limit : size_bytes; + size_t i = 0; + for (; i + sizeof(size_t) - 1 < limit; i += sizeof(size_t)) { + size_t value; + memcpy(&value, &bitmap->buffer.data[i], sizeof(size_t)); + if (value != 0x0) { + return true; + } + } + + for (; i < bitmap->buffer.size_bytes - 1; i++) { + if (bitmap->buffer.data[i] != 0x0) { + return true; + } + } + + const size_t bits_remaining = nbits - ((size_bytes - 1) * 8); + for (size_t i = 0; i < bits_remaining; i++) { + if (ArrowBitGet(bitmap->buffer.data, nbits - i - 1) == 1) { + return true; + } + } + + return false; +} + +bool BitmapAll(const struct ArrowBitmap *bitmap) { + const size_t nbits = bitmap->size_bits; + const size_t size_bytes = bitmap->buffer.size_bytes; + if (nbits < 1) { + return true; + } + + const size_t overflow_limit = SIZE_MAX - sizeof(size_t); + const size_t limit = + size_bytes > overflow_limit ? overflow_limit : size_bytes; + size_t i = 0; + for (; i + sizeof(size_t) - 1 < limit; i += sizeof(size_t)) { + size_t value; + memcpy(&value, &bitmap->buffer.data[i], sizeof(size_t)); + if (value != SIZE_MAX) { + return false; + } + } + + for (; i < bitmap->buffer.size_bytes - 1; i++) { + if (bitmap->buffer.data[i] != 0xff) { + return false; + } + } + + const size_t bits_remaining = nbits - ((size_bytes - 1) * 8); + for (size_t i = 0; i < bits_remaining; i++) { + if (ArrowBitGet(bitmap->buffer.data, nbits - i - 1) == 0) { + return false; + } + } + + return true; +} + +int BitmapOr(const struct ArrowBitmap *bitmap1, + const struct ArrowBitmap *bitmap2, struct ArrowBitmap *out) { + const size_t size_bytes = bitmap1->buffer.size_bytes; + if (bitmap1->size_bits != bitmap2->size_bits) { + return -1; + } else if (!(out->buffer.capacity_bytes >= size_bytes)) { + return -1; + } + + const size_t overflow_limit = SIZE_MAX - sizeof(size_t); + const size_t limit = + size_bytes > overflow_limit ? overflow_limit : size_bytes; + size_t i = 0; + for (; i + sizeof(size_t) - 1 < limit; i += sizeof(size_t)) { + size_t value1; + size_t value2; + size_t result; + memcpy(&value1, &bitmap1->buffer.data[i], sizeof(size_t)); + memcpy(&value2, &bitmap2->buffer.data[i], sizeof(size_t)); + result = value1 | value2; + memcpy(&out->buffer.data[i], &result, sizeof(size_t)); + } + + for (; i < bitmap1->buffer.size_bytes; i++) { + out->buffer.data[i] = bitmap1->buffer.data[i] | bitmap2->buffer.data[i]; + } + + out->size_bits = bitmap1->size_bits; + out->buffer.size_bytes = bitmap1->buffer.size_bytes; + + return 0; +} + +int BitmapOrBool(const struct ArrowBitmap *bitmap1, bool other, + struct ArrowBitmap *out) { + const size_t size_bytes = bitmap1->buffer.size_bytes; + if (!(out->buffer.capacity_bytes >= size_bytes)) { + return -1; + } + + const size_t mask = other ? SIZE_MAX : 0; + const uint8_t umask = other ? UINT8_MAX : 0; + + const size_t overflow_limit = SIZE_MAX - sizeof(size_t); + const size_t limit = + size_bytes > overflow_limit ? 
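+  // Illustrative calling convention for the Bitmap* kernels in this file
+  // (`a` and `b` are assumed equal-length, already-built ArrowBitmaps; the
+  // caller owns and pre-sizes the output):
+  //
+  //   struct ArrowBitmap out;
+  //   ArrowBitmapInit(&out);
+  //   ArrowBitmapReserve(&out, a.size_bits);          // capacity first
+  //   if (BitmapOr(&a, &b, &out) != 0) { /* length/capacity error */ }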
overflow_limit : size_bytes; + size_t i = 0; + for (; i + sizeof(size_t) - 1 < limit; i += sizeof(size_t)) { + size_t value1; + size_t result; + memcpy(&value1, &bitmap1->buffer.data[i], sizeof(size_t)); + result = value1 | mask; + memcpy(&out->buffer.data[i], &result, sizeof(size_t)); + } + + for (; i < bitmap1->buffer.size_bytes; i++) { + out->buffer.data[i] = bitmap1->buffer.data[i] | umask; + } + + out->size_bits = bitmap1->size_bits; + out->buffer.size_bytes = bitmap1->buffer.size_bytes; + + return 0; +} + +int BitmapAnd(const struct ArrowBitmap *bitmap1, + const struct ArrowBitmap *bitmap2, struct ArrowBitmap *out) { + const size_t size_bytes = bitmap1->buffer.size_bytes; + if (bitmap1->size_bits != bitmap2->size_bits) { + return -1; + } else if (!(out->buffer.capacity_bytes >= size_bytes)) { + return -1; + } + + const size_t overflow_limit = SIZE_MAX - sizeof(size_t); + const size_t limit = + size_bytes > overflow_limit ? overflow_limit : size_bytes; + size_t i = 0; + for (; i + sizeof(size_t) - 1 < limit; i += sizeof(size_t)) { + size_t value1; + size_t value2; + size_t result; + memcpy(&value1, &bitmap1->buffer.data[i], sizeof(size_t)); + memcpy(&value2, &bitmap2->buffer.data[i], sizeof(size_t)); + result = value1 & value2; + memcpy(&out->buffer.data[i], &result, sizeof(size_t)); + } + + for (; i < bitmap1->buffer.size_bytes; i++) { + out->buffer.data[i] = bitmap1->buffer.data[i] & bitmap2->buffer.data[i]; + } + + out->size_bits = bitmap1->size_bits; + out->buffer.size_bytes = bitmap1->buffer.size_bytes; + + return 0; +} + +int BitmapAndBool(const struct ArrowBitmap *bitmap1, bool other, + struct ArrowBitmap *out) { + const size_t size_bytes = bitmap1->buffer.size_bytes; + if (!(out->buffer.capacity_bytes >= bitmap1->buffer.size_bytes)) { + return -1; + } + + const size_t mask = other ? SIZE_MAX : 0; + const uint8_t umask = other ? UINT8_MAX : 0; + + const size_t overflow_limit = SIZE_MAX - sizeof(size_t); + const size_t limit = + size_bytes > overflow_limit ? overflow_limit : size_bytes; + size_t i = 0; + for (; i + sizeof(size_t) - 1 < limit; i += sizeof(size_t)) { + size_t value1; + size_t result; + memcpy(&value1, &bitmap1->buffer.data[i], sizeof(size_t)); + result = value1 & mask; + memcpy(&out->buffer.data[i], &result, sizeof(size_t)); + } + + for (; i < bitmap1->buffer.size_bytes; i++) { + out->buffer.data[i] = bitmap1->buffer.data[i] & umask; + } + + out->size_bits = bitmap1->size_bits; + out->buffer.size_bytes = bitmap1->buffer.size_bytes; + + return 0; +} + +int BitmapXor(const struct ArrowBitmap *bitmap1, + const struct ArrowBitmap *bitmap2, struct ArrowBitmap *out) { + const size_t size_bytes = bitmap1->buffer.size_bytes; + if (bitmap1->size_bits != bitmap2->size_bits) { + return -1; + } else if (!(out->buffer.capacity_bytes >= size_bytes)) { + return -1; + } + + const size_t overflow_limit = SIZE_MAX - sizeof(size_t); + const size_t limit = + size_bytes > overflow_limit ? 
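+  // (Note on the *Bool variants in this file: `other` is widened to an
+  // all-ones or all-zeros mask, so OR-with-true sets every byte,
+  // AND-with-false clears every byte, and XOR-with-true degenerates to a
+  // byte-wise inversion.)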
overflow_limit : size_bytes; + size_t i = 0; + for (; i + sizeof(size_t) - 1 < limit; i += sizeof(size_t)) { + size_t value1; + size_t value2; + size_t result; + memcpy(&value1, &bitmap1->buffer.data[i], sizeof(size_t)); + memcpy(&value2, &bitmap2->buffer.data[i], sizeof(size_t)); + result = value1 ^ value2; + memcpy(&out->buffer.data[i], &result, sizeof(size_t)); + } + + for (; i < bitmap1->buffer.size_bytes; i++) { + out->buffer.data[i] = bitmap1->buffer.data[i] ^ bitmap2->buffer.data[i]; + } + + out->size_bits = bitmap1->size_bits; + out->buffer.size_bytes = bitmap1->buffer.size_bytes; + + return 0; +} + +int BitmapXorBool(const struct ArrowBitmap *bitmap1, bool other, + struct ArrowBitmap *out) { + const size_t size_bytes = bitmap1->buffer.size_bytes; + if (!(out->buffer.capacity_bytes >= bitmap1->buffer.size_bytes)) { + return -1; + } + + const size_t mask = other ? SIZE_MAX : 0; + const uint8_t umask = other ? UINT8_MAX : 0; + + const size_t overflow_limit = SIZE_MAX - sizeof(size_t); + const size_t limit = + size_bytes > overflow_limit ? overflow_limit : size_bytes; + size_t i = 0; + for (; i + sizeof(size_t) - 1 < limit; i += sizeof(size_t)) { + size_t value1; + size_t result; + memcpy(&value1, &bitmap1->buffer.data[i], sizeof(size_t)); + result = value1 ^ mask; + memcpy(&out->buffer.data[i], &result, sizeof(size_t)); + } + + for (; i < bitmap1->buffer.size_bytes; i++) { + out->buffer.data[i] = bitmap1->buffer.data[i] ^ umask; + } + + out->size_bits = bitmap1->size_bits; + out->buffer.size_bytes = bitmap1->buffer.size_bytes; + + return 0; +} + +int BitmapInvert(const struct ArrowBitmap *bitmap, struct ArrowBitmap *out) { + const size_t size_bytes = bitmap->buffer.size_bytes; + if (!(out->buffer.capacity_bytes >= size_bytes)) { + return -1; + } + + const size_t overflow_limit = SIZE_MAX - sizeof(size_t); + const size_t limit = + size_bytes > overflow_limit ? 
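+  // (Note: the byte-wise inversion here also flips any unused bits past
+  // size_bits; consumers must keep honoring size_bits rather than the raw
+  // byte contents.)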
overflow_limit : size_bytes; + size_t i = 0; + for (; i + sizeof(size_t) - 1 < limit; i += sizeof(size_t)) { + size_t value; + size_t result; + memcpy(&value, &bitmap->buffer.data[i], sizeof(size_t)); + result = ~value; + memcpy(&out->buffer.data[i], &result, sizeof(size_t)); + } + + for (; i < bitmap->buffer.size_bytes; i++) { + out->buffer.data[i] = ~bitmap->buffer.data[i]; + } + + out->size_bits = bitmap->size_bits; + out->buffer.size_bytes = bitmap->buffer.size_bytes; + + return 0; +} + +int BitmapTake(const struct ArrowBitmap *bitmap, const int64_t *indices, + size_t nindices, struct ArrowBitmap *out) { + int64_t bytes_needed = nindices / 8; + if ((nindices % 8) > 0) { + bytes_needed += 1; + } + + if (!(out->buffer.capacity_bytes >= bytes_needed)) { + return -1; + } + + for (size_t i = 0; i < nindices; i++) { + int64_t index = indices[i]; + if (index < 0) { + return -1; + } + + int8_t value = ArrowBitGet(bitmap->buffer.data, index); + ArrowBitmapAppendUnsafe(out, value, 1); + } + + return 0; +} + +int BitmapPutFromBufferMask(struct ArrowBitmap *bitmap, const uint8_t *buf, + size_t n, uint8_t value) { + int64_t bytes_needed = n / 8; + if ((n % 8) > 0) { + bytes_needed += 1; + } + + if (bytes_needed > bitmap->buffer.capacity_bytes) { + return -1; + } + + for (size_t i = 0; i < n; i++) { + if (buf[i]) { + ArrowBitSetTo(bitmap->buffer.data, i, value); + } + } + + return 0; +} diff --git a/pandas/_libs/src/vendored/nanoarrow.c b/pandas/_libs/src/vendored/nanoarrow.c new file mode 100644 index 0000000000000..fc23c71992c4b --- /dev/null +++ b/pandas/_libs/src/vendored/nanoarrow.c @@ -0,0 +1,3107 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include + +#include "nanoarrow.h" + +const char* ArrowNanoarrowVersion(void) { return NANOARROW_VERSION; } + +int ArrowNanoarrowVersionInt(void) { return NANOARROW_VERSION_INT; } + +int ArrowErrorSet(struct ArrowError* error, const char* fmt, ...) 
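+// Illustrative use of the error helpers defined here:
+//
+//   struct ArrowError error;
+//   ArrowErrorSet(&error, "expected %d buffers, got %d", 3, 2);
+//   fprintf(stderr, "%s\n", ArrowErrorMessage(&error));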
{ + if (error == NULL) { + return NANOARROW_OK; + } + + memset(error->message, 0, sizeof(error->message)); + + va_list args; + va_start(args, fmt); + int chars_needed = vsnprintf(error->message, sizeof(error->message), fmt, args); + va_end(args); + + if (chars_needed < 0) { + return EINVAL; + } else if (((size_t)chars_needed) >= sizeof(error->message)) { + return ERANGE; + } else { + return NANOARROW_OK; + } +} + +const char* ArrowErrorMessage(struct ArrowError* error) { + if (error == NULL) { + return ""; + } else { + return error->message; + } +} + +void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type) { + layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_VALIDITY; + layout->buffer_data_type[0] = NANOARROW_TYPE_BOOL; + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[1] = storage_type; + layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[2] = NANOARROW_TYPE_UNINITIALIZED; + + layout->element_size_bits[0] = 1; + layout->element_size_bits[1] = 0; + layout->element_size_bits[2] = 0; + + layout->child_size_elements = 0; + + switch (storage_type) { + case NANOARROW_TYPE_UNINITIALIZED: + case NANOARROW_TYPE_NA: + layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[0] = NANOARROW_TYPE_UNINITIALIZED; + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[1] = NANOARROW_TYPE_UNINITIALIZED; + layout->element_size_bits[0] = 0; + break; + + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_MAP: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; + layout->element_size_bits[1] = 32; + break; + + case NANOARROW_TYPE_LARGE_LIST: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT64; + layout->element_size_bits[1] = 64; + break; + + case NANOARROW_TYPE_STRUCT: + case NANOARROW_TYPE_FIXED_SIZE_LIST: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[1] = NANOARROW_TYPE_UNINITIALIZED; + break; + + case NANOARROW_TYPE_BOOL: + layout->element_size_bits[1] = 1; + break; + + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT8: + layout->element_size_bits[1] = 8; + break; + + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_HALF_FLOAT: + layout->element_size_bits[1] = 16; + break; + + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_FLOAT: + layout->element_size_bits[1] = 32; + break; + case NANOARROW_TYPE_INTERVAL_MONTHS: + layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; + layout->element_size_bits[1] = 32; + break; + + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_DOUBLE: + case NANOARROW_TYPE_INTERVAL_DAY_TIME: + layout->element_size_bits[1] = 64; + break; + + case NANOARROW_TYPE_DECIMAL128: + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: + layout->element_size_bits[1] = 128; + break; + + case NANOARROW_TYPE_DECIMAL256: + layout->element_size_bits[1] = 256; + break; + + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + layout->buffer_data_type[1] = NANOARROW_TYPE_BINARY; + break; + + case NANOARROW_TYPE_DENSE_UNION: + layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_TYPE_ID; + layout->buffer_data_type[0] = NANOARROW_TYPE_INT8; + layout->element_size_bits[0] = 8; + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_UNION_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; + layout->element_size_bits[1] = 32; + break; + + case 
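+    // (Reading this table, illustrative: the STRING entry below resolves to
+    // buffer 0 = validity bits, buffer 1 = int32 end offsets and buffer 2 =
+    // character data, i.e. the three-buffer layout consumed by the append
+    // and view routines earlier in this diff.)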
NANOARROW_TYPE_SPARSE_UNION: + layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_TYPE_ID; + layout->buffer_data_type[0] = NANOARROW_TYPE_INT8; + layout->element_size_bits[0] = 8; + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[1] = NANOARROW_TYPE_UNINITIALIZED; + break; + + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_BINARY: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; + layout->element_size_bits[1] = 32; + layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[2] = storage_type; + break; + + case NANOARROW_TYPE_LARGE_STRING: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT64; + layout->element_size_bits[1] = 64; + layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[2] = NANOARROW_TYPE_STRING; + break; + case NANOARROW_TYPE_LARGE_BINARY: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT64; + layout->element_size_bits[1] = 64; + layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[2] = NANOARROW_TYPE_BINARY; + break; + + default: + break; + } +} + +void* ArrowMalloc(int64_t size) { return malloc(size); } + +void* ArrowRealloc(void* ptr, int64_t size) { return realloc(ptr, size); } + +void ArrowFree(void* ptr) { free(ptr); } + +static uint8_t* ArrowBufferAllocatorMallocReallocate( + struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size, + int64_t new_size) { + return (uint8_t*)ArrowRealloc(ptr, new_size); +} + +static void ArrowBufferAllocatorMallocFree(struct ArrowBufferAllocator* allocator, + uint8_t* ptr, int64_t size) { + ArrowFree(ptr); +} + +static struct ArrowBufferAllocator ArrowBufferAllocatorMalloc = { + &ArrowBufferAllocatorMallocReallocate, &ArrowBufferAllocatorMallocFree, NULL}; + +struct ArrowBufferAllocator ArrowBufferAllocatorDefault(void) { + return ArrowBufferAllocatorMalloc; +} + +static uint8_t* ArrowBufferAllocatorNeverReallocate( + struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size, + int64_t new_size) { + return NULL; +} + +struct ArrowBufferAllocator ArrowBufferDeallocator( + void (*custom_free)(struct ArrowBufferAllocator* allocator, uint8_t* ptr, + int64_t size), + void* private_data) { + struct ArrowBufferAllocator allocator; + allocator.reallocate = &ArrowBufferAllocatorNeverReallocate; + allocator.free = custom_free; + allocator.private_data = private_data; + return allocator; +} +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
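+// Illustrative use of ArrowBufferDeallocator above: wrapping an externally
+// owned buffer so nanoarrow may free it exactly once but never reallocate
+// it (release_external and my_free are hypothetical names):
+//
+//   static void release_external(struct ArrowBufferAllocator* allocator,
+//                                uint8_t* ptr, int64_t size) {
+//     my_free(ptr);
+//   }
+//   ...
+//   buffer.allocator = ArrowBufferDeallocator(&release_external, NULL);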
+ +#include +#include +#include +#include + +#include "nanoarrow.h" + +static void ArrowSchemaRelease(struct ArrowSchema* schema) { + if (schema->format != NULL) ArrowFree((void*)schema->format); + if (schema->name != NULL) ArrowFree((void*)schema->name); + if (schema->metadata != NULL) ArrowFree((void*)schema->metadata); + + // This object owns the memory for all the children, but those + // children may have been generated elsewhere and might have + // their own release() callback. + if (schema->children != NULL) { + for (int64_t i = 0; i < schema->n_children; i++) { + if (schema->children[i] != NULL) { + if (schema->children[i]->release != NULL) { + schema->children[i]->release(schema->children[i]); + } + + ArrowFree(schema->children[i]); + } + } + + ArrowFree(schema->children); + } + + // This object owns the memory for the dictionary but it + // may have been generated somewhere else and have its own + // release() callback. + if (schema->dictionary != NULL) { + if (schema->dictionary->release != NULL) { + schema->dictionary->release(schema->dictionary); + } + + ArrowFree(schema->dictionary); + } + + // private data not currently used + if (schema->private_data != NULL) { + ArrowFree(schema->private_data); + } + + schema->release = NULL; +} + +static const char* ArrowSchemaFormatTemplate(enum ArrowType type) { + switch (type) { + case NANOARROW_TYPE_UNINITIALIZED: + return NULL; + case NANOARROW_TYPE_NA: + return "n"; + case NANOARROW_TYPE_BOOL: + return "b"; + + case NANOARROW_TYPE_UINT8: + return "C"; + case NANOARROW_TYPE_INT8: + return "c"; + case NANOARROW_TYPE_UINT16: + return "S"; + case NANOARROW_TYPE_INT16: + return "s"; + case NANOARROW_TYPE_UINT32: + return "I"; + case NANOARROW_TYPE_INT32: + return "i"; + case NANOARROW_TYPE_UINT64: + return "L"; + case NANOARROW_TYPE_INT64: + return "l"; + + case NANOARROW_TYPE_HALF_FLOAT: + return "e"; + case NANOARROW_TYPE_FLOAT: + return "f"; + case NANOARROW_TYPE_DOUBLE: + return "g"; + + case NANOARROW_TYPE_STRING: + return "u"; + case NANOARROW_TYPE_LARGE_STRING: + return "U"; + case NANOARROW_TYPE_BINARY: + return "z"; + case NANOARROW_TYPE_LARGE_BINARY: + return "Z"; + + case NANOARROW_TYPE_DATE32: + return "tdD"; + case NANOARROW_TYPE_DATE64: + return "tdm"; + case NANOARROW_TYPE_INTERVAL_MONTHS: + return "tiM"; + case NANOARROW_TYPE_INTERVAL_DAY_TIME: + return "tiD"; + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: + return "tin"; + + case NANOARROW_TYPE_LIST: + return "+l"; + case NANOARROW_TYPE_LARGE_LIST: + return "+L"; + case NANOARROW_TYPE_STRUCT: + return "+s"; + case NANOARROW_TYPE_MAP: + return "+m"; + + default: + return NULL; + } +} + +static int ArrowSchemaInitChildrenIfNeeded(struct ArrowSchema* schema, + enum ArrowType type) { + switch (type) { + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_LARGE_LIST: + case NANOARROW_TYPE_FIXED_SIZE_LIST: + NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, 1)); + ArrowSchemaInit(schema->children[0]); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[0], "item")); + break; + case NANOARROW_TYPE_MAP: + NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, 1)); + NANOARROW_RETURN_NOT_OK( + ArrowSchemaInitFromType(schema->children[0], NANOARROW_TYPE_STRUCT)); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[0], "entries")); + schema->children[0]->flags &= ~ARROW_FLAG_NULLABLE; + NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema->children[0], 2)); + ArrowSchemaInit(schema->children[0]->children[0]); + 
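+      // (Shape produced by this branch: a "+m" map with one non-nullable
+      // "entries" struct child that holds a non-nullable "key" and a
+      // nullable "value", matching the Arrow C data interface convention.)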
ArrowSchemaInit(schema->children[0]->children[1]); + NANOARROW_RETURN_NOT_OK( + ArrowSchemaSetName(schema->children[0]->children[0], "key")); + schema->children[0]->children[0]->flags &= ~ARROW_FLAG_NULLABLE; + NANOARROW_RETURN_NOT_OK( + ArrowSchemaSetName(schema->children[0]->children[1], "value")); + break; + default: + break; + } + + return NANOARROW_OK; +} + +void ArrowSchemaInit(struct ArrowSchema* schema) { + schema->format = NULL; + schema->name = NULL; + schema->metadata = NULL; + schema->flags = ARROW_FLAG_NULLABLE; + schema->n_children = 0; + schema->children = NULL; + schema->dictionary = NULL; + schema->private_data = NULL; + schema->release = &ArrowSchemaRelease; +} + +ArrowErrorCode ArrowSchemaSetType(struct ArrowSchema* schema, enum ArrowType type) { + // We don't allocate the dictionary because it has to be nullptr + // for non-dictionary-encoded arrays. + + // Set the format to a valid format string for type + const char* template_format = ArrowSchemaFormatTemplate(type); + + // If type isn't recognized and not explicitly unset + if (template_format == NULL && type != NANOARROW_TYPE_UNINITIALIZED) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, template_format)); + + // For types with an umabiguous child structure, allocate children + return ArrowSchemaInitChildrenIfNeeded(schema, type); +} + +ArrowErrorCode ArrowSchemaSetTypeStruct(struct ArrowSchema* schema, int64_t n_children) { + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_STRUCT)); + NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, n_children)); + for (int64_t i = 0; i < n_children; i++) { + ArrowSchemaInit(schema->children[i]); + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaInitFromType(struct ArrowSchema* schema, enum ArrowType type) { + ArrowSchemaInit(schema); + + int result = ArrowSchemaSetType(schema, type); + if (result != NANOARROW_OK) { + schema->release(schema); + return result; + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaSetTypeFixedSize(struct ArrowSchema* schema, + enum ArrowType type, int32_t fixed_size) { + if (fixed_size <= 0) { + return EINVAL; + } + + char buffer[64]; + int n_chars; + switch (type) { + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + n_chars = snprintf(buffer, sizeof(buffer), "w:%d", (int)fixed_size); + break; + case NANOARROW_TYPE_FIXED_SIZE_LIST: + n_chars = snprintf(buffer, sizeof(buffer), "+w:%d", (int)fixed_size); + break; + default: + return EINVAL; + } + + buffer[n_chars] = '\0'; + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, buffer)); + + if (type == NANOARROW_TYPE_FIXED_SIZE_LIST) { + NANOARROW_RETURN_NOT_OK(ArrowSchemaInitChildrenIfNeeded(schema, type)); + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaSetTypeDecimal(struct ArrowSchema* schema, enum ArrowType type, + int32_t decimal_precision, + int32_t decimal_scale) { + if (decimal_precision <= 0) { + return EINVAL; + } + + char buffer[64]; + int n_chars; + switch (type) { + case NANOARROW_TYPE_DECIMAL128: + n_chars = + snprintf(buffer, sizeof(buffer), "d:%d,%d", decimal_precision, decimal_scale); + break; + case NANOARROW_TYPE_DECIMAL256: + n_chars = snprintf(buffer, sizeof(buffer), "d:%d,%d,256", decimal_precision, + decimal_scale); + break; + default: + return EINVAL; + } + + buffer[n_chars] = '\0'; + return ArrowSchemaSetFormat(schema, buffer); +} + +static const char* ArrowTimeUnitFormatString(enum ArrowTimeUnit time_unit) { + switch (time_unit) { + case NANOARROW_TIME_UNIT_SECOND: + return "s"; + case 
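+    // (Illustrative format strings assembled from these unit codes by
+    // ArrowSchemaSetTypeDateTime below: TIMESTAMP/MILLI with timezone "UTC"
+    // -> "tsm:UTC"; TIME32/SECOND -> "tts"; DURATION/NANO -> "tDn".)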
NANOARROW_TIME_UNIT_MILLI: + return "m"; + case NANOARROW_TIME_UNIT_MICRO: + return "u"; + case NANOARROW_TIME_UNIT_NANO: + return "n"; + default: + return NULL; + } +} + +ArrowErrorCode ArrowSchemaSetTypeDateTime(struct ArrowSchema* schema, enum ArrowType type, + enum ArrowTimeUnit time_unit, + const char* timezone) { + const char* time_unit_str = ArrowTimeUnitFormatString(time_unit); + if (time_unit_str == NULL) { + return EINVAL; + } + + char buffer[128]; + int n_chars; + switch (type) { + case NANOARROW_TYPE_TIME32: + case NANOARROW_TYPE_TIME64: + if (timezone != NULL) { + return EINVAL; + } + n_chars = snprintf(buffer, sizeof(buffer), "tt%s", time_unit_str); + break; + case NANOARROW_TYPE_TIMESTAMP: + if (timezone == NULL) { + timezone = ""; + } + n_chars = snprintf(buffer, sizeof(buffer), "ts%s:%s", time_unit_str, timezone); + break; + case NANOARROW_TYPE_DURATION: + if (timezone != NULL) { + return EINVAL; + } + n_chars = snprintf(buffer, sizeof(buffer), "tD%s", time_unit_str); + break; + default: + return EINVAL; + } + + if (((size_t)n_chars) >= sizeof(buffer)) { + return ERANGE; + } + + buffer[n_chars] = '\0'; + + return ArrowSchemaSetFormat(schema, buffer); +} + +ArrowErrorCode ArrowSchemaSetTypeUnion(struct ArrowSchema* schema, enum ArrowType type, + int64_t n_children) { + if (n_children < 0 || n_children > 127) { + return EINVAL; + } + + // Max valid size would be +ud:0,1,...126 = 401 characters + null terminator + char format_out[512]; + int64_t format_out_size = 512; + memset(format_out, 0, format_out_size); + int n_chars; + char* format_cursor = format_out; + + switch (type) { + case NANOARROW_TYPE_SPARSE_UNION: + n_chars = snprintf(format_cursor, format_out_size, "+us:"); + format_cursor += n_chars; + format_out_size -= n_chars; + break; + case NANOARROW_TYPE_DENSE_UNION: + n_chars = snprintf(format_cursor, format_out_size, "+ud:"); + format_cursor += n_chars; + format_out_size -= n_chars; + break; + default: + return EINVAL; + } + + if (n_children > 0) { + n_chars = snprintf(format_cursor, format_out_size, "0"); + format_cursor += n_chars; + format_out_size -= n_chars; + + for (int64_t i = 1; i < n_children; i++) { + n_chars = snprintf(format_cursor, format_out_size, ",%d", (int)i); + format_cursor += n_chars; + format_out_size -= n_chars; + } + } + + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, format_out)); + + NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, n_children)); + for (int64_t i = 0; i < n_children; i++) { + ArrowSchemaInit(schema->children[i]); + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaSetFormat(struct ArrowSchema* schema, const char* format) { + if (schema->format != NULL) { + ArrowFree((void*)schema->format); + } + + if (format != NULL) { + size_t format_size = strlen(format) + 1; + schema->format = (const char*)ArrowMalloc(format_size); + if (schema->format == NULL) { + return ENOMEM; + } + + memcpy((void*)schema->format, format, format_size); + } else { + schema->format = NULL; + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaSetName(struct ArrowSchema* schema, const char* name) { + if (schema->name != NULL) { + ArrowFree((void*)schema->name); + } + + if (name != NULL) { + size_t name_size = strlen(name) + 1; + schema->name = (const char*)ArrowMalloc(name_size); + if (schema->name == NULL) { + return ENOMEM; + } + + memcpy((void*)schema->name, name, name_size); + } else { + schema->name = NULL; + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaSetMetadata(struct ArrowSchema* schema, const char* 
metadata) { + if (schema->metadata != NULL) { + ArrowFree((void*)schema->metadata); + } + + if (metadata != NULL) { + size_t metadata_size = ArrowMetadataSizeOf(metadata); + schema->metadata = (const char*)ArrowMalloc(metadata_size); + if (schema->metadata == NULL) { + return ENOMEM; + } + + memcpy((void*)schema->metadata, metadata, metadata_size); + } else { + schema->metadata = NULL; + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaAllocateChildren(struct ArrowSchema* schema, + int64_t n_children) { + if (schema->children != NULL) { + return EEXIST; + } + + if (n_children > 0) { + schema->children = + (struct ArrowSchema**)ArrowMalloc(n_children * sizeof(struct ArrowSchema*)); + + if (schema->children == NULL) { + return ENOMEM; + } + + schema->n_children = n_children; + + memset(schema->children, 0, n_children * sizeof(struct ArrowSchema*)); + + for (int64_t i = 0; i < n_children; i++) { + schema->children[i] = (struct ArrowSchema*)ArrowMalloc(sizeof(struct ArrowSchema)); + + if (schema->children[i] == NULL) { + return ENOMEM; + } + + schema->children[i]->release = NULL; + } + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaAllocateDictionary(struct ArrowSchema* schema) { + if (schema->dictionary != NULL) { + return EEXIST; + } + + schema->dictionary = (struct ArrowSchema*)ArrowMalloc(sizeof(struct ArrowSchema)); + if (schema->dictionary == NULL) { + return ENOMEM; + } + + schema->dictionary->release = NULL; + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaDeepCopy(struct ArrowSchema* schema, + struct ArrowSchema* schema_out) { + ArrowSchemaInit(schema_out); + + int result = ArrowSchemaSetFormat(schema_out, schema->format); + if (result != NANOARROW_OK) { + schema_out->release(schema_out); + return result; + } + + schema_out->flags = schema->flags; + + result = ArrowSchemaSetName(schema_out, schema->name); + if (result != NANOARROW_OK) { + schema_out->release(schema_out); + return result; + } + + result = ArrowSchemaSetMetadata(schema_out, schema->metadata); + if (result != NANOARROW_OK) { + schema_out->release(schema_out); + return result; + } + + result = ArrowSchemaAllocateChildren(schema_out, schema->n_children); + if (result != NANOARROW_OK) { + schema_out->release(schema_out); + return result; + } + + for (int64_t i = 0; i < schema->n_children; i++) { + result = ArrowSchemaDeepCopy(schema->children[i], schema_out->children[i]); + if (result != NANOARROW_OK) { + schema_out->release(schema_out); + return result; + } + } + + if (schema->dictionary != NULL) { + result = ArrowSchemaAllocateDictionary(schema_out); + if (result != NANOARROW_OK) { + schema_out->release(schema_out); + return result; + } + + result = ArrowSchemaDeepCopy(schema->dictionary, schema_out->dictionary); + if (result != NANOARROW_OK) { + schema_out->release(schema_out); + return result; + } + } + + return NANOARROW_OK; +} + +static void ArrowSchemaViewSetPrimitive(struct ArrowSchemaView* schema_view, + enum ArrowType type) { + schema_view->type = type; + schema_view->storage_type = type; +} + +static ArrowErrorCode ArrowSchemaViewParse(struct ArrowSchemaView* schema_view, + const char* format, + const char** format_end_out, + struct ArrowError* error) { + *format_end_out = format; + + // needed for decimal parsing + const char* parse_start; + char* parse_end; + + switch (format[0]) { + case 'n': + schema_view->type = NANOARROW_TYPE_NA; + schema_view->storage_type = NANOARROW_TYPE_NA; + *format_end_out = format + 1; + return NANOARROW_OK; + case 'b': + 
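+      // Single-character formats map one-to-one to fixed-size primitive
+      // storage; e.g. "b" is boolean, "c"/"C" are int8/uint8, and "g" is a
+      // 64-bit double.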
+      ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_BOOL);
+      *format_end_out = format + 1;
+      return NANOARROW_OK;
+    case 'c':
+      ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT8);
+      *format_end_out = format + 1;
+      return NANOARROW_OK;
+    case 'C':
+      ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT8);
+      *format_end_out = format + 1;
+      return NANOARROW_OK;
+    case 's':
+      ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT16);
+      *format_end_out = format + 1;
+      return NANOARROW_OK;
+    case 'S':
+      ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT16);
+      *format_end_out = format + 1;
+      return NANOARROW_OK;
+    case 'i':
+      ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32);
+      *format_end_out = format + 1;
+      return NANOARROW_OK;
+    case 'I':
+      ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT32);
+      *format_end_out = format + 1;
+      return NANOARROW_OK;
+    case 'l':
+      ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64);
+      *format_end_out = format + 1;
+      return NANOARROW_OK;
+    case 'L':
+      ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT64);
+      *format_end_out = format + 1;
+      return NANOARROW_OK;
+    case 'e':
+      ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_HALF_FLOAT);
+      *format_end_out = format + 1;
+      return NANOARROW_OK;
+    case 'f':
+      ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_FLOAT);
+      *format_end_out = format + 1;
+      return NANOARROW_OK;
+    case 'g':
+      ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_DOUBLE);
+      *format_end_out = format + 1;
+      return NANOARROW_OK;
+
+    // decimal
+    case 'd':
+      if (format[1] != ':' || format[2] == '\0') {
+        ArrowErrorSet(error, "Expected ':precision,scale[,bitwidth]' following 'd'");
+        return EINVAL;
+      }
+
+      parse_start = format + 2;
+      schema_view->decimal_precision = (int32_t)strtol(parse_start, &parse_end, 10);
+      if (parse_end == parse_start || parse_end[0] != ',') {
+        ArrowErrorSet(error, "Expected 'precision,scale[,bitwidth]' following 'd:'");
+        return EINVAL;
+      }
+
+      parse_start = parse_end + 1;
+      schema_view->decimal_scale = (int32_t)strtol(parse_start, &parse_end, 10);
+      if (parse_end == parse_start) {
+        ArrowErrorSet(error, "Expected 'scale[,bitwidth]' following 'd:precision,'");
+        return EINVAL;
+      } else if (parse_end[0] != ',') {
+        schema_view->decimal_bitwidth = 128;
+      } else {
+        parse_start = parse_end + 1;
+        schema_view->decimal_bitwidth = (int32_t)strtol(parse_start, &parse_end, 10);
+        if (parse_start == parse_end) {
+          ArrowErrorSet(error, "Expected bitwidth following 'd:precision,scale,'");
+          return EINVAL;
+        }
+      }
+
+      *format_end_out = parse_end;
+
+      switch (schema_view->decimal_bitwidth) {
+        case 128:
+          ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_DECIMAL128);
+          return NANOARROW_OK;
+        case 256:
+          ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_DECIMAL256);
+          return NANOARROW_OK;
+        default:
+          ArrowErrorSet(error, "Expected decimal bitwidth of 128 or 256 but found %d",
+                        (int)schema_view->decimal_bitwidth);
+          return EINVAL;
+      }
+
+    // validity + data
+    case 'w':
+      schema_view->type = NANOARROW_TYPE_FIXED_SIZE_BINARY;
+      schema_view->storage_type = NANOARROW_TYPE_FIXED_SIZE_BINARY;
+      if (format[1] != ':' || format[2] == '\0') {
+        ArrowErrorSet(error, "Expected ':' following 'w'");
+        return EINVAL;
+      }
+
+      schema_view->fixed_size = (int32_t)strtol(format + 2, (char**)format_end_out, 10);
+      return NANOARROW_OK;
+
+    // validity + offset + data
+    case 'z':
+      schema_view->type = NANOARROW_TYPE_BINARY;
schema_view->storage_type = NANOARROW_TYPE_BINARY; + *format_end_out = format + 1; + return NANOARROW_OK; + case 'u': + schema_view->type = NANOARROW_TYPE_STRING; + schema_view->storage_type = NANOARROW_TYPE_STRING; + *format_end_out = format + 1; + return NANOARROW_OK; + + // validity + large_offset + data + case 'Z': + schema_view->type = NANOARROW_TYPE_LARGE_BINARY; + schema_view->storage_type = NANOARROW_TYPE_LARGE_BINARY; + *format_end_out = format + 1; + return NANOARROW_OK; + case 'U': + schema_view->type = NANOARROW_TYPE_LARGE_STRING; + schema_view->storage_type = NANOARROW_TYPE_LARGE_STRING; + *format_end_out = format + 1; + return NANOARROW_OK; + + // nested types + case '+': + switch (format[1]) { + // list has validity + offset or offset + case 'l': + schema_view->storage_type = NANOARROW_TYPE_LIST; + schema_view->type = NANOARROW_TYPE_LIST; + *format_end_out = format + 2; + return NANOARROW_OK; + + // large list has validity + large_offset or large_offset + case 'L': + schema_view->storage_type = NANOARROW_TYPE_LARGE_LIST; + schema_view->type = NANOARROW_TYPE_LARGE_LIST; + *format_end_out = format + 2; + return NANOARROW_OK; + + // just validity buffer + case 'w': + if (format[2] != ':' || format[3] == '\0') { + ArrowErrorSet(error, "Expected ':' following '+w'"); + return EINVAL; + } + + schema_view->storage_type = NANOARROW_TYPE_FIXED_SIZE_LIST; + schema_view->type = NANOARROW_TYPE_FIXED_SIZE_LIST; + schema_view->fixed_size = + (int32_t)strtol(format + 3, (char**)format_end_out, 10); + return NANOARROW_OK; + case 's': + schema_view->storage_type = NANOARROW_TYPE_STRUCT; + schema_view->type = NANOARROW_TYPE_STRUCT; + *format_end_out = format + 2; + return NANOARROW_OK; + case 'm': + schema_view->storage_type = NANOARROW_TYPE_MAP; + schema_view->type = NANOARROW_TYPE_MAP; + *format_end_out = format + 2; + return NANOARROW_OK; + + // unions + case 'u': + switch (format[2]) { + case 'd': + schema_view->storage_type = NANOARROW_TYPE_DENSE_UNION; + schema_view->type = NANOARROW_TYPE_DENSE_UNION; + break; + case 's': + schema_view->storage_type = NANOARROW_TYPE_SPARSE_UNION; + schema_view->type = NANOARROW_TYPE_SPARSE_UNION; + break; + default: + ArrowErrorSet(error, + "Expected union format string +us: or " + "+ud: but found '%s'", + format); + return EINVAL; + } + + if (format[3] == ':') { + schema_view->union_type_ids = format + 4; + int64_t n_type_ids = + _ArrowParseUnionTypeIds(schema_view->union_type_ids, NULL); + if (n_type_ids != schema_view->schema->n_children) { + ArrowErrorSet( + error, + "Expected union type_ids parameter to be a comma-separated list of %ld " + "values between 0 and 127 but found '%s'", + (long)schema_view->schema->n_children, schema_view->union_type_ids); + return EINVAL; + } + *format_end_out = format + strlen(format); + return NANOARROW_OK; + } else { + ArrowErrorSet(error, + "Expected union format string +us: or +ud: " + "but found '%s'", + format); + return EINVAL; + } + + default: + ArrowErrorSet(error, "Expected nested type format string but found '%s'", + format); + return EINVAL; + } + + // date/time types + case 't': + switch (format[1]) { + // date + case 'd': + switch (format[2]) { + case 'D': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32); + schema_view->type = NANOARROW_TYPE_DATE32; + *format_end_out = format + 3; + return NANOARROW_OK; + case 'm': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_DATE64; + *format_end_out = format + 3; + return NANOARROW_OK; + default: 
+              ArrowErrorSet(error, "Expected 'D' or 'm' following 'td' but found '%s'",
+                            format + 2);
+              return EINVAL;
+          }
+
+        // time of day
+        case 't':
+          switch (format[2]) {
+            case 's':
+              ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32);
+              schema_view->type = NANOARROW_TYPE_TIME32;
+              schema_view->time_unit = NANOARROW_TIME_UNIT_SECOND;
+              *format_end_out = format + 3;
+              return NANOARROW_OK;
+            case 'm':
+              ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32);
+              schema_view->type = NANOARROW_TYPE_TIME32;
+              schema_view->time_unit = NANOARROW_TIME_UNIT_MILLI;
+              *format_end_out = format + 3;
+              return NANOARROW_OK;
+            case 'u':
+              ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64);
+              schema_view->type = NANOARROW_TYPE_TIME64;
+              schema_view->time_unit = NANOARROW_TIME_UNIT_MICRO;
+              *format_end_out = format + 3;
+              return NANOARROW_OK;
+            case 'n':
+              ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64);
+              schema_view->type = NANOARROW_TYPE_TIME64;
+              schema_view->time_unit = NANOARROW_TIME_UNIT_NANO;
+              *format_end_out = format + 3;
+              return NANOARROW_OK;
+            default:
+              ArrowErrorSet(
+                  error, "Expected 's', 'm', 'u', or 'n' following 'tt' but found '%s'",
+                  format + 2);
+              return EINVAL;
+          }
+
+        // timestamp
+        case 's':
+          switch (format[2]) {
+            case 's':
+              ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64);
+              schema_view->type = NANOARROW_TYPE_TIMESTAMP;
+              schema_view->time_unit = NANOARROW_TIME_UNIT_SECOND;
+              break;
+            case 'm':
+              ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64);
+              schema_view->type = NANOARROW_TYPE_TIMESTAMP;
+              schema_view->time_unit = NANOARROW_TIME_UNIT_MILLI;
+              break;
+            case 'u':
+              ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64);
+              schema_view->type = NANOARROW_TYPE_TIMESTAMP;
+              schema_view->time_unit = NANOARROW_TIME_UNIT_MICRO;
+              break;
+            case 'n':
+              ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64);
+              schema_view->type = NANOARROW_TYPE_TIMESTAMP;
+              schema_view->time_unit = NANOARROW_TIME_UNIT_NANO;
+              break;
+            default:
+              ArrowErrorSet(
+                  error, "Expected 's', 'm', 'u', or 'n' following 'ts' but found '%s'",
+                  format + 2);
+              return EINVAL;
+          }
+
+          if (format[3] != ':') {
+            ArrowErrorSet(error, "Expected ':' following '%.3s' but found '%s'", format,
+                          format + 3);
+            return EINVAL;
+          }
+
+          schema_view->timezone = format + 4;
+          *format_end_out = format + strlen(format);
+          return NANOARROW_OK;
+
+        // duration
+        case 'D':
+          switch (format[2]) {
+            case 's':
+              ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64);
+              schema_view->type = NANOARROW_TYPE_DURATION;
+              schema_view->time_unit = NANOARROW_TIME_UNIT_SECOND;
+              *format_end_out = format + 3;
+              return NANOARROW_OK;
+            case 'm':
+              ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64);
+              schema_view->type = NANOARROW_TYPE_DURATION;
+              schema_view->time_unit = NANOARROW_TIME_UNIT_MILLI;
+              *format_end_out = format + 3;
+              return NANOARROW_OK;
+            case 'u':
+              ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64);
+              schema_view->type = NANOARROW_TYPE_DURATION;
+              schema_view->time_unit = NANOARROW_TIME_UNIT_MICRO;
+              *format_end_out = format + 3;
+              return NANOARROW_OK;
+            case 'n':
+              ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64);
+              schema_view->type = NANOARROW_TYPE_DURATION;
+              schema_view->time_unit = NANOARROW_TIME_UNIT_NANO;
+              *format_end_out = format + 3;
+              return NANOARROW_OK;
+            default:
+              ArrowErrorSet(
+                  error, "Expected 's', 'm', 'u', or 'n' following 'tD' but found '%s'",
+                  format + 2);
+              return EINVAL;
+          }
+
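+        // Temporal formats pair a family prefix with a unit suffix: e.g.
+        // "tdD" is date32, "ttm" is time32 in milliseconds, and
+        // "tsu:America/New_York" is a microsecond timestamp whose timezone
+        // follows the ':'.
+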
// interval + case 'i': + switch (format[2]) { + case 'M': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INTERVAL_MONTHS); + *format_end_out = format + 3; + return NANOARROW_OK; + case 'D': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INTERVAL_DAY_TIME); + *format_end_out = format + 3; + return NANOARROW_OK; + case 'n': + ArrowSchemaViewSetPrimitive(schema_view, + NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO); + *format_end_out = format + 3; + return NANOARROW_OK; + default: + ArrowErrorSet(error, + "Expected 'M', 'D', or 'n' following 'ti' but found '%s'", + format + 2); + return EINVAL; + } + + default: + ArrowErrorSet( + error, "Expected 'd', 't', 's', 'D', or 'i' following 't' but found '%s'", + format + 1); + return EINVAL; + } + + default: + ArrowErrorSet(error, "Unknown format: '%s'", format); + return EINVAL; + } +} + +static ArrowErrorCode ArrowSchemaViewValidateNChildren( + struct ArrowSchemaView* schema_view, int64_t n_children, struct ArrowError* error) { + if (n_children != -1 && schema_view->schema->n_children != n_children) { + ArrowErrorSet(error, "Expected schema with %d children but found %d children", + (int)n_children, (int)schema_view->schema->n_children); + return EINVAL; + } + + // Don't do a full validation of children but do check that they won't + // segfault if inspected + struct ArrowSchema* child; + for (int64_t i = 0; i < schema_view->schema->n_children; i++) { + child = schema_view->schema->children[i]; + if (child == NULL) { + ArrowErrorSet(error, "Expected valid schema at schema->children[%d] but found NULL", + i); + return EINVAL; + } else if (child->release == NULL) { + ArrowErrorSet( + error, + "Expected valid schema at schema->children[%d] but found a released schema", i); + return EINVAL; + } + } + + return NANOARROW_OK; +} + +static ArrowErrorCode ArrowSchemaViewValidateUnion(struct ArrowSchemaView* schema_view, + struct ArrowError* error) { + return ArrowSchemaViewValidateNChildren(schema_view, -1, error); +} + +static ArrowErrorCode ArrowSchemaViewValidateMap(struct ArrowSchemaView* schema_view, + struct ArrowError* error) { + NANOARROW_RETURN_NOT_OK(ArrowSchemaViewValidateNChildren(schema_view, 1, error)); + + if (schema_view->schema->children[0]->n_children != 2) { + ArrowErrorSet(error, "Expected child of map type to have 2 children but found %d", + (int)schema_view->schema->children[0]->n_children); + return EINVAL; + } + + if (strcmp(schema_view->schema->children[0]->format, "+s") != 0) { + ArrowErrorSet(error, "Expected format of child of map type to be '+s' but found '%s'", + schema_view->schema->children[0]->format); + return EINVAL; + } + + if (schema_view->schema->children[0]->flags & ARROW_FLAG_NULLABLE) { + ArrowErrorSet(error, + "Expected child of map type to be non-nullable but was nullable"); + return EINVAL; + } + + if (schema_view->schema->children[0]->children[0]->flags & ARROW_FLAG_NULLABLE) { + ArrowErrorSet(error, "Expected key of map type to be non-nullable but was nullable"); + return EINVAL; + } + + return NANOARROW_OK; +} + +static ArrowErrorCode ArrowSchemaViewValidateDictionary( + struct ArrowSchemaView* schema_view, struct ArrowError* error) { + // check for valid index type + switch (schema_view->storage_type) { + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_INT64: + break; + default: + ArrowErrorSet( + error, + "Expected 
dictionary schema index type to be an integral type but found '%s'", + schema_view->schema->format); + return EINVAL; + } + + struct ArrowSchemaView dictionary_schema_view; + return ArrowSchemaViewInit(&dictionary_schema_view, schema_view->schema->dictionary, + error); +} + +static ArrowErrorCode ArrowSchemaViewValidate(struct ArrowSchemaView* schema_view, + enum ArrowType type, + struct ArrowError* error) { + switch (type) { + case NANOARROW_TYPE_NA: + case NANOARROW_TYPE_BOOL: + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_HALF_FLOAT: + case NANOARROW_TYPE_FLOAT: + case NANOARROW_TYPE_DOUBLE: + case NANOARROW_TYPE_DECIMAL128: + case NANOARROW_TYPE_DECIMAL256: + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_BINARY: + case NANOARROW_TYPE_LARGE_BINARY: + case NANOARROW_TYPE_DATE32: + case NANOARROW_TYPE_DATE64: + case NANOARROW_TYPE_INTERVAL_MONTHS: + case NANOARROW_TYPE_INTERVAL_DAY_TIME: + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: + case NANOARROW_TYPE_TIMESTAMP: + case NANOARROW_TYPE_TIME32: + case NANOARROW_TYPE_TIME64: + case NANOARROW_TYPE_DURATION: + return ArrowSchemaViewValidateNChildren(schema_view, 0, error); + + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + if (schema_view->fixed_size <= 0) { + ArrowErrorSet(error, "Expected size > 0 for fixed size binary but found size %d", + schema_view->fixed_size); + return EINVAL; + } + return ArrowSchemaViewValidateNChildren(schema_view, 0, error); + + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_LARGE_LIST: + case NANOARROW_TYPE_FIXED_SIZE_LIST: + return ArrowSchemaViewValidateNChildren(schema_view, 1, error); + + case NANOARROW_TYPE_STRUCT: + return ArrowSchemaViewValidateNChildren(schema_view, -1, error); + + case NANOARROW_TYPE_SPARSE_UNION: + case NANOARROW_TYPE_DENSE_UNION: + return ArrowSchemaViewValidateUnion(schema_view, error); + + case NANOARROW_TYPE_MAP: + return ArrowSchemaViewValidateMap(schema_view, error); + + case NANOARROW_TYPE_DICTIONARY: + return ArrowSchemaViewValidateDictionary(schema_view, error); + + default: + ArrowErrorSet(error, "Expected a valid enum ArrowType value but found %d", + (int)schema_view->type); + return EINVAL; + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view, + struct ArrowSchema* schema, struct ArrowError* error) { + if (schema == NULL) { + ArrowErrorSet(error, "Expected non-NULL schema"); + return EINVAL; + } + + if (schema->release == NULL) { + ArrowErrorSet(error, "Expected non-released schema"); + return EINVAL; + } + + schema_view->schema = schema; + + const char* format = schema->format; + if (format == NULL) { + ArrowErrorSet( + error, + "Error parsing schema->format: Expected a null-terminated string but found NULL"); + return EINVAL; + } + + size_t format_len = strlen(format); + if (format_len == 0) { + ArrowErrorSet(error, "Error parsing schema->format: Expected a string with size > 0"); + return EINVAL; + } + + const char* format_end_out; + ArrowErrorCode result = + ArrowSchemaViewParse(schema_view, format, &format_end_out, error); + + if (result != NANOARROW_OK) { + if (error != NULL) { + char child_error[1024]; + memcpy(child_error, ArrowErrorMessage(error), 1024); + ArrowErrorSet(error, "Error parsing schema->format: %s", child_error); + } + + return result; + } + + if ((format + 
format_len) != format_end_out) { + ArrowErrorSet(error, "Error parsing schema->format '%s': parsed %d/%d characters", + format, (int)(format_end_out - format), (int)(format_len)); + return EINVAL; + } + + if (schema->dictionary != NULL) { + schema_view->type = NANOARROW_TYPE_DICTIONARY; + } + + result = ArrowSchemaViewValidate(schema_view, schema_view->storage_type, error); + if (result != NANOARROW_OK) { + return result; + } + + if (schema_view->storage_type != schema_view->type) { + result = ArrowSchemaViewValidate(schema_view, schema_view->type, error); + if (result != NANOARROW_OK) { + return result; + } + } + + ArrowLayoutInit(&schema_view->layout, schema_view->storage_type); + if (schema_view->storage_type == NANOARROW_TYPE_FIXED_SIZE_BINARY) { + schema_view->layout.element_size_bits[1] = schema_view->fixed_size * 8; + } else if (schema_view->storage_type == NANOARROW_TYPE_FIXED_SIZE_LIST) { + schema_view->layout.child_size_elements = schema_view->fixed_size; + } + + schema_view->extension_name = ArrowCharView(NULL); + schema_view->extension_metadata = ArrowCharView(NULL); + ArrowMetadataGetValue(schema->metadata, ArrowCharView("ARROW:extension:name"), + &schema_view->extension_name); + ArrowMetadataGetValue(schema->metadata, ArrowCharView("ARROW:extension:metadata"), + &schema_view->extension_metadata); + + return NANOARROW_OK; +} + +static int64_t ArrowSchemaTypeToStringInternal(struct ArrowSchemaView* schema_view, + char* out, int64_t n) { + const char* type_string = ArrowTypeString(schema_view->type); + switch (schema_view->type) { + case NANOARROW_TYPE_DECIMAL128: + case NANOARROW_TYPE_DECIMAL256: + return snprintf(out, n, "%s(%d, %d)", type_string, + (int)schema_view->decimal_precision, + (int)schema_view->decimal_scale); + case NANOARROW_TYPE_TIMESTAMP: + return snprintf(out, n, "%s('%s', '%s')", type_string, + ArrowTimeUnitString(schema_view->time_unit), schema_view->timezone); + case NANOARROW_TYPE_TIME32: + case NANOARROW_TYPE_TIME64: + case NANOARROW_TYPE_DURATION: + return snprintf(out, n, "%s('%s')", type_string, + ArrowTimeUnitString(schema_view->time_unit)); + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + case NANOARROW_TYPE_FIXED_SIZE_LIST: + return snprintf(out, n, "%s(%ld)", type_string, (long)schema_view->fixed_size); + case NANOARROW_TYPE_SPARSE_UNION: + case NANOARROW_TYPE_DENSE_UNION: + return snprintf(out, n, "%s([%s])", type_string, schema_view->union_type_ids); + default: + return snprintf(out, n, "%s", type_string); + } +} + +// Helper for bookkeeping to emulate sprintf()-like behaviour spread +// among multiple sprintf calls. 
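+// For example, if the last snprintf() wrote 5 characters, the output cursor
+// advances by 5 and n_remaining shrinks by 5. Once n_remaining reaches 0,
+// later calls only measure instead of writing, so n_chars still accumulates
+// the length the complete string would require.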
+static inline void ArrowToStringLogChars(char** out, int64_t n_chars_last, + int64_t* n_remaining, int64_t* n_chars) { + *n_chars += n_chars_last; + *n_remaining -= n_chars_last; + + // n_remaining is never less than 0 + if (*n_remaining < 0) { + *n_remaining = 0; + } + + // Can't do math on a NULL pointer + if (*out != NULL) { + *out += n_chars_last; + } +} + +int64_t ArrowSchemaToString(struct ArrowSchema* schema, char* out, int64_t n, + char recursive) { + if (schema == NULL) { + return snprintf(out, n, "[invalid: pointer is null]"); + } + + if (schema->release == NULL) { + return snprintf(out, n, "[invalid: schema is released]"); + } + + struct ArrowSchemaView schema_view; + struct ArrowError error; + + if (ArrowSchemaViewInit(&schema_view, schema, &error) != NANOARROW_OK) { + return snprintf(out, n, "[invalid: %s]", ArrowErrorMessage(&error)); + } + + // Extension type and dictionary should include both the top-level type + // and the storage type. + int is_extension = schema_view.extension_name.size_bytes > 0; + int is_dictionary = schema->dictionary != NULL; + int64_t n_chars = 0; + int64_t n_chars_last = 0; + + // Uncommon but not technically impossible that both are true + if (is_extension && is_dictionary) { + n_chars_last = snprintf( + out, n, "%.*s{dictionary(%s)<", (int)schema_view.extension_name.size_bytes, + schema_view.extension_name.data, ArrowTypeString(schema_view.storage_type)); + } else if (is_extension) { + n_chars_last = snprintf(out, n, "%.*s{", (int)schema_view.extension_name.size_bytes, + schema_view.extension_name.data); + } else if (is_dictionary) { + n_chars_last = + snprintf(out, n, "dictionary(%s)<", ArrowTypeString(schema_view.storage_type)); + } + + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + + if (!is_dictionary) { + n_chars_last = ArrowSchemaTypeToStringInternal(&schema_view, out, n); + } else { + n_chars_last = ArrowSchemaToString(schema->dictionary, out, n, recursive); + } + + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + + if (recursive && schema->format[0] == '+') { + n_chars_last = snprintf(out, n, "<"); + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + + for (int64_t i = 0; i < schema->n_children; i++) { + if (i > 0) { + n_chars_last = snprintf(out, n, ", "); + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + } + + // ArrowSchemaToStringInternal() will validate the child and print the error, + // but we need the name first + if (schema->children[i] != NULL && schema->children[i]->release != NULL && + schema->children[i]->name != NULL) { + n_chars_last = snprintf(out, n, "%s: ", schema->children[i]->name); + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + } + + n_chars_last = ArrowSchemaToString(schema->children[i], out, n, recursive); + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + } + + n_chars_last = snprintf(out, n, ">"); + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + } + + if (is_extension && is_dictionary) { + n_chars += snprintf(out, n, ">}"); + } else if (is_extension) { + n_chars += snprintf(out, n, "}"); + } else if (is_dictionary) { + n_chars += snprintf(out, n, ">"); + } + + return n_chars; +} + +ArrowErrorCode ArrowMetadataReaderInit(struct ArrowMetadataReader* reader, + const char* metadata) { + reader->metadata = metadata; + + if (reader->metadata == NULL) { + reader->offset = 0; + reader->remaining_keys = 0; + } else { + memcpy(&reader->remaining_keys, reader->metadata, sizeof(int32_t)); + reader->offset = sizeof(int32_t); + } + + return 
NANOARROW_OK; +} + +ArrowErrorCode ArrowMetadataReaderRead(struct ArrowMetadataReader* reader, + struct ArrowStringView* key_out, + struct ArrowStringView* value_out) { + if (reader->remaining_keys <= 0) { + return EINVAL; + } + + int64_t pos = 0; + + int32_t key_size; + memcpy(&key_size, reader->metadata + reader->offset + pos, sizeof(int32_t)); + pos += sizeof(int32_t); + + key_out->data = reader->metadata + reader->offset + pos; + key_out->size_bytes = key_size; + pos += key_size; + + int32_t value_size; + memcpy(&value_size, reader->metadata + reader->offset + pos, sizeof(int32_t)); + pos += sizeof(int32_t); + + value_out->data = reader->metadata + reader->offset + pos; + value_out->size_bytes = value_size; + pos += value_size; + + reader->offset += pos; + reader->remaining_keys--; + return NANOARROW_OK; +} + +int64_t ArrowMetadataSizeOf(const char* metadata) { + if (metadata == NULL) { + return 0; + } + + struct ArrowMetadataReader reader; + struct ArrowStringView key; + struct ArrowStringView value; + ArrowMetadataReaderInit(&reader, metadata); + + int64_t size = sizeof(int32_t); + while (ArrowMetadataReaderRead(&reader, &key, &value) == NANOARROW_OK) { + size += sizeof(int32_t) + key.size_bytes + sizeof(int32_t) + value.size_bytes; + } + + return size; +} + +static ArrowErrorCode ArrowMetadataGetValueInternal(const char* metadata, + struct ArrowStringView* key, + struct ArrowStringView* value_out) { + struct ArrowMetadataReader reader; + struct ArrowStringView existing_key; + struct ArrowStringView existing_value; + ArrowMetadataReaderInit(&reader, metadata); + + while (ArrowMetadataReaderRead(&reader, &existing_key, &existing_value) == + NANOARROW_OK) { + int key_equal = key->size_bytes == existing_key.size_bytes && + strncmp(key->data, existing_key.data, existing_key.size_bytes) == 0; + if (key_equal) { + value_out->data = existing_value.data; + value_out->size_bytes = existing_value.size_bytes; + break; + } + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowMetadataGetValue(const char* metadata, struct ArrowStringView key, + struct ArrowStringView* value_out) { + if (value_out == NULL) { + return EINVAL; + } + + return ArrowMetadataGetValueInternal(metadata, &key, value_out); +} + +char ArrowMetadataHasKey(const char* metadata, struct ArrowStringView key) { + struct ArrowStringView value = ArrowCharView(NULL); + ArrowMetadataGetValue(metadata, key, &value); + return value.data != NULL; +} + +ArrowErrorCode ArrowMetadataBuilderInit(struct ArrowBuffer* buffer, + const char* metadata) { + ArrowBufferInit(buffer); + return ArrowBufferAppend(buffer, metadata, ArrowMetadataSizeOf(metadata)); +} + +static ArrowErrorCode ArrowMetadataBuilderAppendInternal(struct ArrowBuffer* buffer, + struct ArrowStringView* key, + struct ArrowStringView* value) { + if (value == NULL) { + return NANOARROW_OK; + } + + if (buffer->capacity_bytes == 0) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(buffer, 0)); + } + + if (((size_t)buffer->capacity_bytes) < sizeof(int32_t)) { + return EINVAL; + } + + int32_t n_keys; + memcpy(&n_keys, buffer->data, sizeof(int32_t)); + + int32_t key_size = (int32_t)key->size_bytes; + int32_t value_size = (int32_t)value->size_bytes; + NANOARROW_RETURN_NOT_OK(ArrowBufferReserve( + buffer, sizeof(int32_t) + key_size + sizeof(int32_t) + value_size)); + + ArrowBufferAppendUnsafe(buffer, &key_size, sizeof(int32_t)); + ArrowBufferAppendUnsafe(buffer, key->data, key_size); + ArrowBufferAppendUnsafe(buffer, &value_size, sizeof(int32_t)); + ArrowBufferAppendUnsafe(buffer, 
value->data, value_size);
+
+  n_keys++;
+  memcpy(buffer->data, &n_keys, sizeof(int32_t));
+
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowMetadataBuilderSetInternal(struct ArrowBuffer* buffer,
+                                                      struct ArrowStringView* key,
+                                                      struct ArrowStringView* value) {
+  // Inspect the current value to see if we can avoid copying the buffer
+  struct ArrowStringView current_value = ArrowCharView(NULL);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowMetadataGetValueInternal((const char*)buffer->data, key, &current_value));
+
+  // The key should be removed but no key exists
+  if (value == NULL && current_value.data == NULL) {
+    return NANOARROW_OK;
+  }
+
+  // The key/value can be appended because no key exists
+  if (value != NULL && current_value.data == NULL) {
+    return ArrowMetadataBuilderAppendInternal(buffer, key, value);
+  }
+
+  struct ArrowMetadataReader reader;
+  struct ArrowStringView existing_key;
+  struct ArrowStringView existing_value;
+  NANOARROW_RETURN_NOT_OK(ArrowMetadataReaderInit(&reader, (const char*)buffer->data));
+
+  struct ArrowBuffer new_buffer;
+  NANOARROW_RETURN_NOT_OK(ArrowMetadataBuilderInit(&new_buffer, NULL));
+
+  while (reader.remaining_keys > 0) {
+    int result = ArrowMetadataReaderRead(&reader, &existing_key, &existing_value);
+    if (result != NANOARROW_OK) {
+      ArrowBufferReset(&new_buffer);
+      return result;
+    }
+
+    if (key->size_bytes == existing_key.size_bytes &&
+        strncmp((const char*)key->data, (const char*)existing_key.data,
+                existing_key.size_bytes) == 0) {
+      result = ArrowMetadataBuilderAppendInternal(&new_buffer, key, value);
+      value = NULL;
+    } else {
+      result = ArrowMetadataBuilderAppendInternal(&new_buffer, &existing_key,
+                                                  &existing_value);
+    }
+
+    if (result != NANOARROW_OK) {
+      ArrowBufferReset(&new_buffer);
+      return result;
+    }
+  }
+
+  ArrowBufferReset(buffer);
+  ArrowBufferMove(&new_buffer, buffer);
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode ArrowMetadataBuilderAppend(struct ArrowBuffer* buffer,
+                                          struct ArrowStringView key,
+                                          struct ArrowStringView value) {
+  return ArrowMetadataBuilderAppendInternal(buffer, &key, &value);
+}
+
+ArrowErrorCode ArrowMetadataBuilderSet(struct ArrowBuffer* buffer,
+                                       struct ArrowStringView key,
+                                       struct ArrowStringView value) {
+  return ArrowMetadataBuilderSetInternal(buffer, &key, &value);
+}
+
+ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer,
+                                          struct ArrowStringView key) {
+  return ArrowMetadataBuilderSetInternal(buffer, &key, NULL);
+}
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "nanoarrow.h"
+
+static void ArrowArrayRelease(struct ArrowArray* array) {
+  // Release buffers held by this array
+  struct ArrowArrayPrivateData* private_data =
+      (struct ArrowArrayPrivateData*)array->private_data;
+  if (private_data != NULL) {
+    ArrowBitmapReset(&private_data->bitmap);
+    ArrowBufferReset(&private_data->buffers[0]);
+    ArrowBufferReset(&private_data->buffers[1]);
+    ArrowFree(private_data);
+  }
+
+  // This object owns the memory for all the children, but those
+  // children may have been generated elsewhere and might have
+  // their own release() callback.
+  if (array->children != NULL) {
+    for (int64_t i = 0; i < array->n_children; i++) {
+      if (array->children[i] != NULL) {
+        if (array->children[i]->release != NULL) {
+          array->children[i]->release(array->children[i]);
+        }
+
+        ArrowFree(array->children[i]);
+      }
+    }
+
+    ArrowFree(array->children);
+  }
+
+  // This object owns the memory for the dictionary but it
+  // may have been generated somewhere else and have its own
+  // release() callback.
+  if (array->dictionary != NULL) {
+    if (array->dictionary->release != NULL) {
+      array->dictionary->release(array->dictionary);
+    }
+
+    ArrowFree(array->dictionary);
+  }
+
+  // Mark released
+  array->release = NULL;
+}
+
+static ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array,
+                                               enum ArrowType storage_type) {
+  switch (storage_type) {
+    case NANOARROW_TYPE_UNINITIALIZED:
+    case NANOARROW_TYPE_NA:
+      array->n_buffers = 0;
+      break;
+
+    case NANOARROW_TYPE_FIXED_SIZE_LIST:
+    case NANOARROW_TYPE_STRUCT:
+    case NANOARROW_TYPE_SPARSE_UNION:
+      array->n_buffers = 1;
+      break;
+
+    case NANOARROW_TYPE_LIST:
+    case NANOARROW_TYPE_LARGE_LIST:
+    case NANOARROW_TYPE_MAP:
+    case NANOARROW_TYPE_BOOL:
+    case NANOARROW_TYPE_UINT8:
+    case NANOARROW_TYPE_INT8:
+    case NANOARROW_TYPE_UINT16:
+    case NANOARROW_TYPE_INT16:
+    case NANOARROW_TYPE_UINT32:
+    case NANOARROW_TYPE_INT32:
+    case NANOARROW_TYPE_UINT64:
+    case NANOARROW_TYPE_INT64:
+    case NANOARROW_TYPE_HALF_FLOAT:
+    case NANOARROW_TYPE_FLOAT:
+    case NANOARROW_TYPE_DOUBLE:
+    case NANOARROW_TYPE_DECIMAL128:
+    case NANOARROW_TYPE_DECIMAL256:
+    case NANOARROW_TYPE_INTERVAL_MONTHS:
+    case NANOARROW_TYPE_INTERVAL_DAY_TIME:
+    case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO:
+    case NANOARROW_TYPE_FIXED_SIZE_BINARY:
+    case NANOARROW_TYPE_DENSE_UNION:
+      array->n_buffers = 2;
+      break;
+
+    case NANOARROW_TYPE_STRING:
+    case NANOARROW_TYPE_LARGE_STRING:
+    case NANOARROW_TYPE_BINARY:
+    case NANOARROW_TYPE_LARGE_BINARY:
+      array->n_buffers = 3;
+      break;
+
+    default:
+      return EINVAL;
+  }
+
+  struct ArrowArrayPrivateData* private_data =
+      (struct ArrowArrayPrivateData*)array->private_data;
+  private_data->storage_type = storage_type;
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array,
+                                      enum ArrowType storage_type) {
+  array->length = 0;
+  array->null_count = 0;
+  array->offset = 0;
+  array->n_buffers = 0;
+  array->n_children = 0;
+  array->buffers = NULL;
+  array->children = NULL;
+  array->dictionary = NULL;
+  array->release = &ArrowArrayRelease;
+  array->private_data = NULL;
+
+  struct ArrowArrayPrivateData* private_data =
+      (struct ArrowArrayPrivateData*)ArrowMalloc(sizeof(struct ArrowArrayPrivateData));
+  if (private_data == NULL) {
+    array->release = NULL;
+    return ENOMEM;
+  }
+
+  ArrowBitmapInit(&private_data->bitmap);
+  ArrowBufferInit(&private_data->buffers[0]);
+  ArrowBufferInit(&private_data->buffers[1]);
+  private_data->buffer_data[0] =
NULL; + private_data->buffer_data[1] = NULL; + private_data->buffer_data[2] = NULL; + + array->private_data = private_data; + array->buffers = (const void**)(&private_data->buffer_data); + + int result = ArrowArraySetStorageType(array, storage_type); + if (result != NANOARROW_OK) { + array->release(array); + return result; + } + + ArrowLayoutInit(&private_data->layout, storage_type); + // We can only know this not to be true when initializing based on a schema + // so assume this to be true. + private_data->union_type_id_is_child_index = 1; + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array, + struct ArrowArrayView* array_view, + struct ArrowError* error) { + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowArrayInitFromType(array, array_view->storage_type), error); + int result; + + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + private_data->layout = array_view->layout; + + if (array_view->n_children > 0) { + result = ArrowArrayAllocateChildren(array, array_view->n_children); + if (result != NANOARROW_OK) { + array->release(array); + return result; + } + + for (int64_t i = 0; i < array_view->n_children; i++) { + result = + ArrowArrayInitFromArrayView(array->children[i], array_view->children[i], error); + if (result != NANOARROW_OK) { + array->release(array); + return result; + } + } + } + + if (array_view->dictionary != NULL) { + result = ArrowArrayAllocateDictionary(array); + if (result != NANOARROW_OK) { + array->release(array); + return result; + } + + result = + ArrowArrayInitFromArrayView(array->dictionary, array_view->dictionary, error); + if (result != NANOARROW_OK) { + array->release(array); + return result; + } + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayInitFromSchema(struct ArrowArray* array, + struct ArrowSchema* schema, + struct ArrowError* error) { + struct ArrowArrayView array_view; + NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromSchema(&array_view, schema, error)); + NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromArrayView(array, &array_view, error)); + if (array_view.storage_type == NANOARROW_TYPE_DENSE_UNION || + array_view.storage_type == NANOARROW_TYPE_SPARSE_UNION) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + // We can still build arrays if this isn't true; however, the append + // functions won't work. Instead, we store this value and error only + // when StartAppending is called. 
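+    // For example, "+ud:0,1" declares type ids equal to the child indices,
+    // while "+ud:5,9" does not and cannot use the index-based fast path.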
+ private_data->union_type_id_is_child_index = + _ArrowUnionTypeIdsWillEqualChildIndices(schema->format + 4, schema->n_children); + } + + ArrowArrayViewReset(&array_view); + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayAllocateChildren(struct ArrowArray* array, int64_t n_children) { + if (array->children != NULL) { + return EINVAL; + } + + if (n_children == 0) { + return NANOARROW_OK; + } + + array->children = + (struct ArrowArray**)ArrowMalloc(n_children * sizeof(struct ArrowArray*)); + if (array->children == NULL) { + return ENOMEM; + } + + memset(array->children, 0, n_children * sizeof(struct ArrowArray*)); + + for (int64_t i = 0; i < n_children; i++) { + array->children[i] = (struct ArrowArray*)ArrowMalloc(sizeof(struct ArrowArray)); + if (array->children[i] == NULL) { + return ENOMEM; + } + array->children[i]->release = NULL; + } + + array->n_children = n_children; + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayAllocateDictionary(struct ArrowArray* array) { + if (array->dictionary != NULL) { + return EINVAL; + } + + array->dictionary = (struct ArrowArray*)ArrowMalloc(sizeof(struct ArrowArray)); + if (array->dictionary == NULL) { + return ENOMEM; + } + + array->dictionary->release = NULL; + return NANOARROW_OK; +} + +void ArrowArraySetValidityBitmap(struct ArrowArray* array, struct ArrowBitmap* bitmap) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + ArrowBufferMove(&bitmap->buffer, &private_data->bitmap.buffer); + private_data->bitmap.size_bits = bitmap->size_bits; + bitmap->size_bits = 0; + private_data->buffer_data[0] = private_data->bitmap.buffer.data; + array->null_count = -1; +} + +ArrowErrorCode ArrowArraySetBuffer(struct ArrowArray* array, int64_t i, + struct ArrowBuffer* buffer) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + switch (i) { + case 0: + ArrowBufferMove(buffer, &private_data->bitmap.buffer); + private_data->buffer_data[i] = private_data->bitmap.buffer.data; + break; + case 1: + case 2: + ArrowBufferMove(buffer, &private_data->buffers[i - 1]); + private_data->buffer_data[i] = private_data->buffers[i - 1].data; + break; + default: + return EINVAL; + } + + return NANOARROW_OK; +} + +static ArrowErrorCode ArrowArrayViewInitFromArray(struct ArrowArrayView* array_view, + struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + ArrowArrayViewInitFromType(array_view, private_data->storage_type); + array_view->layout = private_data->layout; + array_view->array = array; + array_view->length = array->length; + array_view->offset = array->offset; + array_view->null_count = array->null_count; + + array_view->buffer_views[0].data.as_uint8 = private_data->bitmap.buffer.data; + array_view->buffer_views[0].size_bytes = private_data->bitmap.buffer.size_bytes; + array_view->buffer_views[1].data.as_uint8 = private_data->buffers[0].data; + array_view->buffer_views[1].size_bytes = private_data->buffers[0].size_bytes; + array_view->buffer_views[2].data.as_uint8 = private_data->buffers[1].data; + array_view->buffer_views[2].size_bytes = private_data->buffers[1].size_bytes; + + int result = ArrowArrayViewAllocateChildren(array_view, array->n_children); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + + for (int64_t i = 0; i < array->n_children; i++) { + result = ArrowArrayViewInitFromArray(array_view->children[i], array->children[i]); + if (result 
!= NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + } + + if (array->dictionary != NULL) { + result = ArrowArrayViewAllocateDictionary(array_view); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + + result = ArrowArrayViewInitFromArray(array_view->dictionary, array->dictionary); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + } + + return NANOARROW_OK; +} + +static ArrowErrorCode ArrowArrayReserveInternal(struct ArrowArray* array, + struct ArrowArrayView* array_view) { + // Loop through buffers and reserve the extra space that we know about + for (int64_t i = 0; i < array->n_buffers; i++) { + // Don't reserve on a validity buffer that hasn't been allocated yet + if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY && + ArrowArrayBuffer(array, i)->data == NULL) { + continue; + } + + int64_t additional_size_bytes = + array_view->buffer_views[i].size_bytes - ArrowArrayBuffer(array, i)->size_bytes; + + if (additional_size_bytes > 0) { + NANOARROW_RETURN_NOT_OK( + ArrowBufferReserve(ArrowArrayBuffer(array, i), additional_size_bytes)); + } + } + + // Recursively reserve children + for (int64_t i = 0; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK( + ArrowArrayReserveInternal(array->children[i], array_view->children[i])); + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayReserve(struct ArrowArray* array, + int64_t additional_size_elements) { + struct ArrowArrayView array_view; + NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromArray(&array_view, array)); + + // Calculate theoretical buffer sizes (recursively) + ArrowArrayViewSetLength(&array_view, array->length + additional_size_elements); + + // Walk the structure (recursively) + int result = ArrowArrayReserveInternal(array, &array_view); + ArrowArrayViewReset(&array_view); + if (result != NANOARROW_OK) { + return result; + } + + return NANOARROW_OK; +} + +static ArrowErrorCode ArrowArrayFinalizeBuffers(struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + // The only buffer finalizing this currently does is make sure the data + // buffer for (Large)String|Binary is never NULL + switch (private_data->storage_type) { + case NANOARROW_TYPE_BINARY: + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_BINARY: + case NANOARROW_TYPE_LARGE_STRING: + if (ArrowArrayBuffer(array, 2)->data == NULL) { + ArrowBufferAppendUInt8(ArrowArrayBuffer(array, 2), 0); + } + break; + default: + break; + } + + for (int64_t i = 0; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->children[i])); + } + + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->dictionary)); + } + + return NANOARROW_OK; +} + +static void ArrowArrayFlushInternalPointers(struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + for (int64_t i = 0; i < 3; i++) { + private_data->buffer_data[i] = ArrowArrayBuffer(array, i)->data; + } + + for (int64_t i = 0; i < array->n_children; i++) { + ArrowArrayFlushInternalPointers(array->children[i]); + } + + if (array->dictionary != NULL) { + ArrowArrayFlushInternalPointers(array->dictionary); + } +} + +ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, + enum ArrowValidationLevel validation_level, + struct ArrowError* error) { + // Even if the data buffer is size zero, 
the pointer value needed to be non-null + // in some implementations (at least one version of Arrow C++ at the time this + // was added). Only do this fix if we can assume CPU data access. + if (validation_level >= NANOARROW_VALIDATION_LEVEL_DEFAULT) { + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayFinalizeBuffers(array), error); + } + + // Make sure the value we get with array->buffers[i] is set to the actual + // pointer (which may have changed from the original due to reallocation) + ArrowArrayFlushInternalPointers(array); + + if (validation_level == NANOARROW_VALIDATION_LEVEL_NONE) { + return NANOARROW_OK; + } + + // For validation, initialize an ArrowArrayView with our known buffer sizes + struct ArrowArrayView array_view; + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayViewInitFromArray(&array_view, array), + error); + int result = ArrowArrayViewValidate(&array_view, validation_level, error); + ArrowArrayViewReset(&array_view); + return result; +} + +ArrowErrorCode ArrowArrayFinishBuildingDefault(struct ArrowArray* array, + struct ArrowError* error) { + return ArrowArrayFinishBuilding(array, NANOARROW_VALIDATION_LEVEL_DEFAULT, error); +} + +void ArrowArrayViewInitFromType(struct ArrowArrayView* array_view, + enum ArrowType storage_type) { + memset(array_view, 0, sizeof(struct ArrowArrayView)); + array_view->storage_type = storage_type; + ArrowLayoutInit(&array_view->layout, storage_type); +} + +ArrowErrorCode ArrowArrayViewAllocateChildren(struct ArrowArrayView* array_view, + int64_t n_children) { + if (array_view->children != NULL) { + return EINVAL; + } + + array_view->children = + (struct ArrowArrayView**)ArrowMalloc(n_children * sizeof(struct ArrowArrayView*)); + if (array_view->children == NULL) { + return ENOMEM; + } + + for (int64_t i = 0; i < n_children; i++) { + array_view->children[i] = NULL; + } + + array_view->n_children = n_children; + + for (int64_t i = 0; i < n_children; i++) { + array_view->children[i] = + (struct ArrowArrayView*)ArrowMalloc(sizeof(struct ArrowArrayView)); + if (array_view->children[i] == NULL) { + return ENOMEM; + } + ArrowArrayViewInitFromType(array_view->children[i], NANOARROW_TYPE_UNINITIALIZED); + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayViewAllocateDictionary(struct ArrowArrayView* array_view) { + if (array_view->dictionary != NULL) { + return EINVAL; + } + + array_view->dictionary = + (struct ArrowArrayView*)ArrowMalloc(sizeof(struct ArrowArrayView)); + if (array_view->dictionary == NULL) { + return ENOMEM; + } + + ArrowArrayViewInitFromType(array_view->dictionary, NANOARROW_TYPE_UNINITIALIZED); + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view, + struct ArrowSchema* schema, + struct ArrowError* error) { + struct ArrowSchemaView schema_view; + int result = ArrowSchemaViewInit(&schema_view, schema, error); + if (result != NANOARROW_OK) { + return result; + } + + ArrowArrayViewInitFromType(array_view, schema_view.storage_type); + array_view->layout = schema_view.layout; + + result = ArrowArrayViewAllocateChildren(array_view, schema->n_children); + if (result != NANOARROW_OK) { + ArrowErrorSet(error, "ArrowArrayViewAllocateChildren() failed"); + ArrowArrayViewReset(array_view); + return result; + } + + for (int64_t i = 0; i < schema->n_children; i++) { + result = + ArrowArrayViewInitFromSchema(array_view->children[i], schema->children[i], error); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + } + + if (schema->dictionary != NULL) { + 
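+    // A dictionary-encoded view keeps the integer indices in this view and
+    // recurses into the dictionary values below.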
result = ArrowArrayViewAllocateDictionary(array_view); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + + result = + ArrowArrayViewInitFromSchema(array_view->dictionary, schema->dictionary, error); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + } + + if (array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION || + array_view->storage_type == NANOARROW_TYPE_DENSE_UNION) { + array_view->union_type_id_map = (int8_t*)ArrowMalloc(256 * sizeof(int8_t)); + if (array_view->union_type_id_map == NULL) { + return ENOMEM; + } + + memset(array_view->union_type_id_map, -1, 256); + int8_t n_type_ids = _ArrowParseUnionTypeIds(schema_view.union_type_ids, + array_view->union_type_id_map + 128); + for (int8_t child_index = 0; child_index < n_type_ids; child_index++) { + int8_t type_id = array_view->union_type_id_map[128 + child_index]; + array_view->union_type_id_map[type_id] = child_index; + } + } + + return NANOARROW_OK; +} + +void ArrowArrayViewReset(struct ArrowArrayView* array_view) { + if (array_view->children != NULL) { + for (int64_t i = 0; i < array_view->n_children; i++) { + if (array_view->children[i] != NULL) { + ArrowArrayViewReset(array_view->children[i]); + ArrowFree(array_view->children[i]); + } + } + + ArrowFree(array_view->children); + } + + if (array_view->dictionary != NULL) { + ArrowArrayViewReset(array_view->dictionary); + ArrowFree(array_view->dictionary); + } + + if (array_view->union_type_id_map != NULL) { + ArrowFree(array_view->union_type_id_map); + } + + ArrowArrayViewInitFromType(array_view, NANOARROW_TYPE_UNINITIALIZED); +} + +void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length) { + for (int i = 0; i < 3; i++) { + int64_t element_size_bytes = array_view->layout.element_size_bits[i] / 8; + + switch (array_view->layout.buffer_type[i]) { + case NANOARROW_BUFFER_TYPE_VALIDITY: + array_view->buffer_views[i].size_bytes = _ArrowBytesForBits(length); + continue; + case NANOARROW_BUFFER_TYPE_DATA_OFFSET: + // Probably don't want/need to rely on the producer to have allocated an + // offsets buffer of length 1 for a zero-size array + array_view->buffer_views[i].size_bytes = + (length != 0) * element_size_bytes * (length + 1); + continue; + case NANOARROW_BUFFER_TYPE_DATA: + array_view->buffer_views[i].size_bytes = + _ArrowRoundUpToMultipleOf8(array_view->layout.element_size_bits[i] * length) / + 8; + continue; + case NANOARROW_BUFFER_TYPE_TYPE_ID: + case NANOARROW_BUFFER_TYPE_UNION_OFFSET: + array_view->buffer_views[i].size_bytes = element_size_bytes * length; + continue; + case NANOARROW_BUFFER_TYPE_NONE: + array_view->buffer_views[i].size_bytes = 0; + continue; + } + } + + switch (array_view->storage_type) { + case NANOARROW_TYPE_STRUCT: + case NANOARROW_TYPE_SPARSE_UNION: + for (int64_t i = 0; i < array_view->n_children; i++) { + ArrowArrayViewSetLength(array_view->children[i], length); + } + break; + case NANOARROW_TYPE_FIXED_SIZE_LIST: + if (array_view->n_children >= 1) { + ArrowArrayViewSetLength(array_view->children[0], + length * array_view->layout.child_size_elements); + } + default: + break; + } +} + +// This version recursively extracts information from the array and stores it +// in the array view, performing any checks that require the original array. 
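+// Buffer sizes that cannot be computed without reading the data (e.g. the
+// data buffer of a string array) are marked -1 here and resolved later
+// during validation.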
+static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view, + struct ArrowArray* array, + struct ArrowError* error) { + // Check length and offset + if (array->offset < 0) { + ArrowErrorSet(error, "Expected array offset >= 0 but found array offset of %ld", + (long)array->offset); + return EINVAL; + } + + if (array->length < 0) { + ArrowErrorSet(error, "Expected array length >= 0 but found array length of %ld", + (long)array->length); + return EINVAL; + } + + array_view->array = array; + array_view->offset = array->offset; + array_view->length = array->length; + array_view->null_count = array->null_count; + + int64_t buffers_required = 0; + for (int i = 0; i < 3; i++) { + if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) { + break; + } + + buffers_required++; + + // Set buffer pointer + array_view->buffer_views[i].data.data = array->buffers[i]; + + // If non-null, set buffer size to unknown. + if (array->buffers[i] == NULL) { + array_view->buffer_views[i].size_bytes = 0; + } else { + array_view->buffer_views[i].size_bytes = -1; + } + } + + // Check the number of buffers + if (buffers_required != array->n_buffers) { + ArrowErrorSet(error, "Expected array with %d buffer(s) but found %d buffer(s)", + (int)buffers_required, (int)array->n_buffers); + return EINVAL; + } + + // Check number of children + if (array_view->n_children != array->n_children) { + ArrowErrorSet(error, "Expected %ld children but found %ld children", + (long)array_view->n_children, (long)array->n_children); + return EINVAL; + } + + // Recurse for children + for (int64_t i = 0; i < array_view->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view->children[i], + array->children[i], error)); + } + + // Check dictionary + if (array->dictionary == NULL && array_view->dictionary != NULL) { + ArrowErrorSet(error, "Expected dictionary but found NULL"); + return EINVAL; + } + + if (array->dictionary != NULL && array_view->dictionary == NULL) { + ArrowErrorSet(error, "Expected NULL dictionary but found dictionary member"); + return EINVAL; + } + + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK( + ArrowArrayViewSetArrayInternal(array_view->dictionary, array->dictionary, error)); + } + + return NANOARROW_OK; +} + +static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view, + struct ArrowError* error) { + // Calculate buffer sizes that do not require buffer access. If marked as + // unknown, assign the buffer size; otherwise, validate it. + int64_t offset_plus_length = array_view->offset + array_view->length; + + // Only loop over the first two buffers because the size of the third buffer + // is always data dependent for all current Arrow types. 
+ for (int i = 0; i < 2; i++) { + int64_t element_size_bytes = array_view->layout.element_size_bits[i] / 8; + // Initialize with a value that will cause an error if accidentally used uninitialized + int64_t min_buffer_size_bytes = array_view->buffer_views[i].size_bytes + 1; + + switch (array_view->layout.buffer_type[i]) { + case NANOARROW_BUFFER_TYPE_VALIDITY: + if (array_view->null_count == 0 && array_view->buffer_views[i].size_bytes == 0) { + continue; + } + + min_buffer_size_bytes = _ArrowBytesForBits(offset_plus_length); + break; + case NANOARROW_BUFFER_TYPE_DATA_OFFSET: + // Probably don't want/need to rely on the producer to have allocated an + // offsets buffer of length 1 for a zero-size array + min_buffer_size_bytes = + (offset_plus_length != 0) * element_size_bytes * (offset_plus_length + 1); + break; + case NANOARROW_BUFFER_TYPE_DATA: + min_buffer_size_bytes = + _ArrowRoundUpToMultipleOf8(array_view->layout.element_size_bits[i] * + offset_plus_length) / + 8; + break; + case NANOARROW_BUFFER_TYPE_TYPE_ID: + case NANOARROW_BUFFER_TYPE_UNION_OFFSET: + min_buffer_size_bytes = element_size_bytes * offset_plus_length; + break; + case NANOARROW_BUFFER_TYPE_NONE: + continue; + } + + // Assign or validate buffer size + if (array_view->buffer_views[i].size_bytes == -1) { + array_view->buffer_views[i].size_bytes = min_buffer_size_bytes; + } else if (array_view->buffer_views[i].size_bytes < min_buffer_size_bytes) { + ArrowErrorSet(error, + "Expected %s array buffer %d to have size >= %ld bytes but found " + "buffer with %ld bytes", + ArrowTypeString(array_view->storage_type), (int)i, + (long)min_buffer_size_bytes, + (long)array_view->buffer_views[i].size_bytes); + return EINVAL; + } + } + + // For list, fixed-size list and map views, we can validate the number of children + switch (array_view->storage_type) { + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_LARGE_LIST: + case NANOARROW_TYPE_FIXED_SIZE_LIST: + case NANOARROW_TYPE_MAP: + if (array_view->n_children != 1) { + ArrowErrorSet(error, "Expected 1 child of %s array but found %ld child arrays", + ArrowTypeString(array_view->storage_type), + (long)array_view->n_children); + return EINVAL; + } + default: + break; + } + + // For struct, the sparse union, and the fixed-size list views, we can validate child + // lengths. 
+ int64_t child_min_length; + switch (array_view->storage_type) { + case NANOARROW_TYPE_SPARSE_UNION: + case NANOARROW_TYPE_STRUCT: + child_min_length = (array_view->offset + array_view->length); + for (int64_t i = 0; i < array_view->n_children; i++) { + if (array_view->children[i]->length < child_min_length) { + ArrowErrorSet( + error, + "Expected struct child %d to have length >= %ld but found child with " + "length %ld", + (int)(i + 1), (long)(child_min_length), + (long)array_view->children[i]->length); + return EINVAL; + } + } + break; + + case NANOARROW_TYPE_FIXED_SIZE_LIST: + child_min_length = (array_view->offset + array_view->length) * + array_view->layout.child_size_elements; + if (array_view->children[0]->length < child_min_length) { + ArrowErrorSet(error, + "Expected child of fixed_size_list array to have length >= %ld but " + "found array with length %ld", + (long)child_min_length, (long)array_view->children[0]->length); + return EINVAL; + } + break; + default: + break; + } + + // Recurse for children + for (int64_t i = 0; i < array_view->n_children; i++) { + NANOARROW_RETURN_NOT_OK( + ArrowArrayViewValidateMinimal(array_view->children[i], error)); + } + + // Recurse for dictionary + if (array_view->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view->dictionary, error)); + } + + return NANOARROW_OK; +} + +static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view, + struct ArrowError* error) { + // Perform minimal validation. This will validate or assign + // buffer sizes as long as buffer access is not required. + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view, error)); + + // Calculate buffer sizes or child lengths that require accessing the offsets + // buffer. Where appropriate, validate that the first offset is >= 0. + // If a buffer size is marked as unknown, assign it; otherwise, validate it. 
+ int64_t offset_plus_length = array_view->offset + array_view->length; + + int64_t first_offset; + int64_t last_offset; + switch (array_view->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_BINARY: + if (array_view->buffer_views[1].size_bytes != 0) { + first_offset = array_view->buffer_views[1].data.as_int32[0]; + if (first_offset < 0) { + ArrowErrorSet(error, "Expected first offset >= 0 but found %ld", + (long)first_offset); + return EINVAL; + } + + last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length]; + + // If the data buffer size is unknown, assign it; otherwise, check it + if (array_view->buffer_views[2].size_bytes == -1) { + array_view->buffer_views[2].size_bytes = last_offset; + } else if (array_view->buffer_views[2].size_bytes < last_offset) { + ArrowErrorSet(error, + "Expected %s array buffer 2 to have size >= %ld bytes but found " + "buffer with %ld bytes", + ArrowTypeString(array_view->storage_type), (long)last_offset, + (long)array_view->buffer_views[2].size_bytes); + return EINVAL; + } + } + break; + + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_LARGE_BINARY: + if (array_view->buffer_views[1].size_bytes != 0) { + first_offset = array_view->buffer_views[1].data.as_int64[0]; + if (first_offset < 0) { + ArrowErrorSet(error, "Expected first offset >= 0 but found %ld", + (long)first_offset); + return EINVAL; + } + + last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length]; + + // If the data buffer size is unknown, assign it; otherwise, check it + if (array_view->buffer_views[2].size_bytes == -1) { + array_view->buffer_views[2].size_bytes = last_offset; + } else if (array_view->buffer_views[2].size_bytes < last_offset) { + ArrowErrorSet(error, + "Expected %s array buffer 2 to have size >= %ld bytes but found " + "buffer with %ld bytes", + ArrowTypeString(array_view->storage_type), (long)last_offset, + (long)array_view->buffer_views[2].size_bytes); + return EINVAL; + } + } + break; + + case NANOARROW_TYPE_STRUCT: + for (int64_t i = 0; i < array_view->n_children; i++) { + if (array_view->children[i]->length < offset_plus_length) { + ArrowErrorSet( + error, + "Expected struct child %d to have length >= %ld but found child with " + "length %ld", + (int)(i + 1), (long)offset_plus_length, + (long)array_view->children[i]->length); + return EINVAL; + } + } + break; + + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_MAP: + if (array_view->buffer_views[1].size_bytes != 0) { + first_offset = array_view->buffer_views[1].data.as_int32[0]; + if (first_offset < 0) { + ArrowErrorSet(error, "Expected first offset >= 0 but found %ld", + (long)first_offset); + return EINVAL; + } + + last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length]; + if (array_view->children[0]->length < last_offset) { + ArrowErrorSet( + error, + "Expected child of %s array to have length >= %ld but found array with " + "length %ld", + ArrowTypeString(array_view->storage_type), (long)last_offset, + (long)array_view->children[0]->length); + return EINVAL; + } + } + break; + + case NANOARROW_TYPE_LARGE_LIST: + if (array_view->buffer_views[1].size_bytes != 0) { + first_offset = array_view->buffer_views[1].data.as_int64[0]; + if (first_offset < 0) { + ArrowErrorSet(error, "Expected first offset >= 0 but found %ld", + (long)first_offset); + return EINVAL; + } + + last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length]; + if (array_view->children[0]->length < last_offset) { + ArrowErrorSet( + error, + "Expected child 
of large list array to have length >= %ld but found array " + "with length %ld", + (long)last_offset, (long)array_view->children[0]->length); + return EINVAL; + } + } + break; + default: + break; + } + + // Recurse for children + for (int64_t i = 0; i < array_view->n_children; i++) { + NANOARROW_RETURN_NOT_OK( + ArrowArrayViewValidateDefault(array_view->children[i], error)); + } + + // Recurse for dictionary + if (array_view->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view->dictionary, error)); + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, + struct ArrowArray* array, + struct ArrowError* error) { + // Extract information from the array into the array view + NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view, array, error)); + + // Run default validation. Because we've marked all non-NULL buffers as having unknown + // size, validation will also update the buffer sizes as it goes. + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view, error)); + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayViewSetArrayMinimal(struct ArrowArrayView* array_view, + struct ArrowArray* array, + struct ArrowError* error) { + // Extract information from the array into the array view + NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view, array, error)); + + // Run default validation. Because we've marked all non-NULL buffers as having unknown + // size, validation will also update the buffer sizes as it goes. + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view, error)); + + return NANOARROW_OK; +} + +static int ArrowAssertIncreasingInt32(struct ArrowBufferView view, + struct ArrowError* error) { + if (view.size_bytes <= (int64_t)sizeof(int32_t)) { + return NANOARROW_OK; + } + + for (int64_t i = 1; i < view.size_bytes / (int64_t)sizeof(int32_t); i++) { + int32_t diff = view.data.as_int32[i] - view.data.as_int32[i - 1]; + if (diff < 0) { + ArrowErrorSet(error, "[%ld] Expected element size >= 0 but found element size %ld", + (long)i, (long)diff); + return EINVAL; + } + } + + return NANOARROW_OK; +} + +static int ArrowAssertIncreasingInt64(struct ArrowBufferView view, + struct ArrowError* error) { + if (view.size_bytes <= (int64_t)sizeof(int64_t)) { + return NANOARROW_OK; + } + + for (int64_t i = 1; i < view.size_bytes / (int64_t)sizeof(int64_t); i++) { + int64_t diff = view.data.as_int64[i] - view.data.as_int64[i - 1]; + if (diff < 0) { + ArrowErrorSet(error, "[%ld] Expected element size >= 0 but found element size %ld", + (long)i, (long)diff); + return EINVAL; + } + } + + return NANOARROW_OK; +} + +static int ArrowAssertRangeInt8(struct ArrowBufferView view, int8_t min_value, + int8_t max_value, struct ArrowError* error) { + for (int64_t i = 0; i < view.size_bytes; i++) { + if (view.data.as_int8[i] < min_value || view.data.as_int8[i] > max_value) { + ArrowErrorSet(error, + "[%ld] Expected buffer value between %d and %d but found value %d", + (long)i, (int)min_value, (int)max_value, (int)view.data.as_int8[i]); + return EINVAL; + } + } + + return NANOARROW_OK; +} + +static int ArrowAssertInt8In(struct ArrowBufferView view, const int8_t* values, + int64_t n_values, struct ArrowError* error) { + for (int64_t i = 0; i < view.size_bytes; i++) { + int item_found = 0; + for (int64_t j = 0; j < n_values; j++) { + if (view.data.as_int8[i] == values[j]) { + item_found = 1; + break; + } + } + + if (!item_found) { + ArrowErrorSet(error, "[%ld] Unexpected buffer 
value %d", (long)i, + (int)view.data.as_int8[i]); + return EINVAL; + } + } + + return NANOARROW_OK; +} + +static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, + struct ArrowError* error) { + for (int i = 0; i < 3; i++) { + switch (array_view->layout.buffer_type[i]) { + case NANOARROW_BUFFER_TYPE_DATA_OFFSET: + if (array_view->layout.element_size_bits[i] == 32) { + NANOARROW_RETURN_NOT_OK( + ArrowAssertIncreasingInt32(array_view->buffer_views[i], error)); + } else { + NANOARROW_RETURN_NOT_OK( + ArrowAssertIncreasingInt64(array_view->buffer_views[i], error)); + } + break; + default: + break; + } + } + + if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION || + array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION) { + if (array_view->union_type_id_map == NULL) { + // If the union_type_id map is NULL (e.g., when using ArrowArrayInitFromType() + + // ArrowArrayAllocateChildren() + ArrowArrayFinishBuilding()), we don't have enough + // information to validate this buffer. + ArrowErrorSet(error, + "Insufficient information provided for validation of union array"); + return EINVAL; + } else if (_ArrowParsedUnionTypeIdsWillEqualChildIndices( + array_view->union_type_id_map, array_view->n_children, + array_view->n_children)) { + NANOARROW_RETURN_NOT_OK(ArrowAssertRangeInt8( + array_view->buffer_views[0], 0, (int8_t)(array_view->n_children - 1), error)); + } else { + NANOARROW_RETURN_NOT_OK(ArrowAssertInt8In(array_view->buffer_views[0], + array_view->union_type_id_map + 128, + array_view->n_children, error)); + } + } + + if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION && + array_view->union_type_id_map != NULL) { + // Check that offsets refer to child elements that actually exist + for (int64_t i = 0; i < array_view->length; i++) { + int8_t child_id = ArrowArrayViewUnionChildIndex(array_view, i); + int64_t offset = ArrowArrayViewUnionChildOffset(array_view, i); + int64_t child_length = array_view->children[child_id]->length; + if (offset < 0 || offset > child_length) { + ArrowErrorSet( + error, + "[%ld] Expected union offset for child id %d to be between 0 and %ld but " + "found offset value %ld", + (long)i, (int)child_id, (long)child_length, offset); + return EINVAL; + } + } + } + + // Recurse for children + for (int64_t i = 0; i < array_view->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->children[i], error)); + } + + // Dictionary valiation not implemented + if (array_view->dictionary != NULL) { + ArrowErrorSet(error, "Validation for dictionary-encoded arrays is not implemented"); + return ENOTSUP; + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayViewValidate(struct ArrowArrayView* array_view, + enum ArrowValidationLevel validation_level, + struct ArrowError* error) { + switch (validation_level) { + case NANOARROW_VALIDATION_LEVEL_NONE: + return NANOARROW_OK; + case NANOARROW_VALIDATION_LEVEL_MINIMAL: + return ArrowArrayViewValidateMinimal(array_view, error); + case NANOARROW_VALIDATION_LEVEL_DEFAULT: + return ArrowArrayViewValidateDefault(array_view, error); + case NANOARROW_VALIDATION_LEVEL_FULL: + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view, error)); + return ArrowArrayViewValidateFull(array_view, error); + } + + ArrowErrorSet(error, "validation_level not recognized"); + return EINVAL; +} +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+struct BasicArrayStreamPrivate {
+  struct ArrowSchema schema;
+  int64_t n_arrays;
+  struct ArrowArray* arrays;
+  int64_t arrays_i;
+};
+
+static int ArrowBasicArrayStreamGetSchema(struct ArrowArrayStream* array_stream,
+                                          struct ArrowSchema* schema) {
+  if (array_stream == NULL || array_stream->release == NULL) {
+    return EINVAL;
+  }
+
+  struct BasicArrayStreamPrivate* private_data =
+      (struct BasicArrayStreamPrivate*)array_stream->private_data;
+  return ArrowSchemaDeepCopy(&private_data->schema, schema);
+}
+
+static int ArrowBasicArrayStreamGetNext(struct ArrowArrayStream* array_stream,
+                                        struct ArrowArray* array) {
+  if (array_stream == NULL || array_stream->release == NULL) {
+    return EINVAL;
+  }
+
+  struct BasicArrayStreamPrivate* private_data =
+      (struct BasicArrayStreamPrivate*)array_stream->private_data;
+
+  if (private_data->arrays_i == private_data->n_arrays) {
+    array->release = NULL;
+    return NANOARROW_OK;
+  }
+
+  ArrowArrayMove(&private_data->arrays[private_data->arrays_i++], array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowBasicArrayStreamGetLastError(
+    struct ArrowArrayStream* array_stream) {
+  return NULL;
+}
+
+static void ArrowBasicArrayStreamRelease(struct ArrowArrayStream* array_stream) {
+  if (array_stream == NULL || array_stream->release == NULL) {
+    return;
+  }
+
+  struct BasicArrayStreamPrivate* private_data =
+      (struct BasicArrayStreamPrivate*)array_stream->private_data;
+
+  if (private_data->schema.release != NULL) {
+    private_data->schema.release(&private_data->schema);
+  }
+
+  for (int64_t i = 0; i < private_data->n_arrays; i++) {
+    if (private_data->arrays[i].release != NULL) {
+      private_data->arrays[i].release(&private_data->arrays[i]);
+    }
+  }
+
+  if (private_data->arrays != NULL) {
+    ArrowFree(private_data->arrays);
+  }
+
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowBasicArrayStreamInit(struct ArrowArrayStream* array_stream,
+                                         struct ArrowSchema* schema, int64_t n_arrays) {
+  struct BasicArrayStreamPrivate* private_data =
+      (struct BasicArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct BasicArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  ArrowSchemaMove(schema, &private_data->schema);
+
+  private_data->n_arrays = n_arrays;
+  private_data->arrays = NULL;
+  private_data->arrays_i = 0;
+
+  if (n_arrays > 0) {
+    private_data->arrays =
+        (struct ArrowArray*)ArrowMalloc(n_arrays * sizeof(struct ArrowArray));
+    if (private_data->arrays == NULL) {
+      ArrowBasicArrayStreamRelease(array_stream);
+      return ENOMEM;
+    }
+  }
+
+  for (int64_t i = 0; i < private_data->n_arrays; i++) {
+    private_data->arrays[i].release = NULL;
+  }
+
+  array_stream->get_schema = &ArrowBasicArrayStreamGetSchema;
+  array_stream->get_next = &ArrowBasicArrayStreamGetNext;
+
array_stream->get_last_error = ArrowBasicArrayStreamGetLastError; + array_stream->release = ArrowBasicArrayStreamRelease; + array_stream->private_data = private_data; + return NANOARROW_OK; +} + +void ArrowBasicArrayStreamSetArray(struct ArrowArrayStream* array_stream, int64_t i, + struct ArrowArray* array) { + struct BasicArrayStreamPrivate* private_data = + (struct BasicArrayStreamPrivate*)array_stream->private_data; + ArrowArrayMove(array, &private_data->arrays[i]); +} + +ArrowErrorCode ArrowBasicArrayStreamValidate(struct ArrowArrayStream* array_stream, + struct ArrowError* error) { + struct BasicArrayStreamPrivate* private_data = + (struct BasicArrayStreamPrivate*)array_stream->private_data; + + struct ArrowArrayView array_view; + NANOARROW_RETURN_NOT_OK( + ArrowArrayViewInitFromSchema(&array_view, &private_data->schema, error)); + + for (int64_t i = 0; i < private_data->n_arrays; i++) { + if (private_data->arrays[i].release != NULL) { + int result = ArrowArrayViewSetArray(&array_view, &private_data->arrays[i], error); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(&array_view); + return result; + } + } + } + + ArrowArrayViewReset(&array_view); + return NANOARROW_OK; +} diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4ff3de2fc7b2b..68d0c7821a3b0 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -21,6 +21,7 @@ iNaT, lib, ) +from pandas._libs.arrays import BitmaskArray from pandas._typing import ( AnyArrayLike, ArrayLike, @@ -423,7 +424,7 @@ def nunique_ints(values: ArrayLike) -> int: return result -def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None): +def unique_with_mask(values, mask: npt.NDArray[np.bool_] | BitmaskArray | None = None): """See algorithms.unique for docs. Takes a mask for masked arrays.""" values = _ensure_arraylike(values, func_name="unique") @@ -444,7 +445,7 @@ def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None): uniques, mask = table.unique(values, mask=mask) uniques = _reconstruct_data(uniques, original.dtype, original) assert mask is not None # for mypy - return uniques, mask.astype("bool") + return uniques, mask unique1d = unique @@ -549,7 +550,7 @@ def factorize_array( use_na_sentinel: bool = True, size_hint: int | None = None, na_value: object = None, - mask: npt.NDArray[np.bool_] | None = None, + mask: npt.NDArray[np.bool_] | BitmaskArray | None = None, ) -> tuple[npt.NDArray[np.intp], np.ndarray]: """ Factorize a numpy array to codes and uniques. @@ -947,7 +948,9 @@ def value_counts_internal( # Called once from SparseArray, otherwise could be private def value_counts_arraylike( - values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = None + values: np.ndarray, + dropna: bool, + mask: npt.NDArray[np.bool_] | BitmaskArray | None = None, ) -> tuple[ArrayLike, npt.NDArray[np.int64], int]: """ Parameters @@ -971,7 +974,7 @@ def value_counts_arraylike( if dropna: mask = keys != iNaT - keys, counts = keys[mask], counts[mask] + keys, counts = keys[mask], counts[mask] # type: ignore[index] res_keys = _reconstruct_data(keys, original.dtype, original) return res_keys, counts, na_counter @@ -1288,12 +1291,14 @@ def take( ... 
fill_value=-10) array([ 10, 10, -10]) """ - if not isinstance(arr, (np.ndarray, ABCExtensionArray, ABCIndex, ABCSeries)): + if not isinstance( + arr, (np.ndarray, ABCExtensionArray, ABCIndex, ABCSeries, BitmaskArray) + ): # GH#52981 warnings.warn( "pd.api.extensions.take accepting non-standard inputs is deprecated " "and will raise in a future version. Pass either a numpy.ndarray, " - "ExtensionArray, Index, or Series instead.", + "ExtensionArray, Index, Series, or BitmaskArray instead.", FutureWarning, stacklevel=find_stack_level(), ) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 335fa1afc0f4e..2149d168cf898 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -17,16 +17,14 @@ from pandas.core.nanops import check_below_min_count if TYPE_CHECKING: - from pandas._typing import ( - AxisInt, - npt, - ) + from pandas._libs.arrays import BitmaskArray + from pandas._typing import AxisInt def _reductions( func: Callable, values: np.ndarray, - mask: npt.NDArray[np.bool_], + mask: BitmaskArray, *, skipna: bool = True, min_count: int = 0, @@ -67,7 +65,7 @@ def _reductions( def sum( values: np.ndarray, - mask: npt.NDArray[np.bool_], + mask: BitmaskArray, *, skipna: bool = True, min_count: int = 0, @@ -80,7 +78,7 @@ def sum( def prod( values: np.ndarray, - mask: npt.NDArray[np.bool_], + mask: BitmaskArray, *, skipna: bool = True, min_count: int = 0, @@ -94,7 +92,7 @@ def prod( def _minmax( func: Callable, values: np.ndarray, - mask: npt.NDArray[np.bool_], + mask: np.ndarray | BitmaskArray, *, skipna: bool = True, axis: AxisInt | None = None, @@ -121,7 +119,7 @@ def _minmax( else: return func(values, axis=axis) else: - subset = values[~mask] + subset = values[~mask] # type: ignore[index] if subset.size: return func(subset, axis=axis) else: @@ -131,7 +129,7 @@ def _minmax( def min( values: np.ndarray, - mask: npt.NDArray[np.bool_], + mask: np.ndarray | BitmaskArray, *, skipna: bool = True, axis: AxisInt | None = None, @@ -141,7 +139,7 @@ def min( def max( values: np.ndarray, - mask: npt.NDArray[np.bool_], + mask: np.ndarray | BitmaskArray, *, skipna: bool = True, axis: AxisInt | None = None, @@ -151,7 +149,7 @@ def max( def mean( values: np.ndarray, - mask: npt.NDArray[np.bool_], + mask: BitmaskArray, *, skipna: bool = True, axis: AxisInt | None = None, @@ -163,7 +161,7 @@ def mean( def var( values: np.ndarray, - mask: npt.NDArray[np.bool_], + mask: BitmaskArray, *, skipna: bool = True, axis: AxisInt | None = None, @@ -181,7 +179,7 @@ def var( def std( values: np.ndarray, - mask: npt.NDArray[np.bool_], + mask: BitmaskArray, *, skipna: bool = True, axis: AxisInt | None = None, diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index ee6f00b219a15..f6ea0b3fff3d6 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -10,6 +10,7 @@ ) if TYPE_CHECKING: + from pandas._libs.arrays import BitmaskArray from pandas._typing import ( ArrayLike, Scalar, @@ -43,7 +44,7 @@ def quantile_compat( def quantile_with_mask( values: np.ndarray, - mask: npt.NDArray[np.bool_], + mask: npt.NDArray[np.bool_] | BitmaskArray, fill_value, qs: npt.NDArray[np.float64], interpolation: str, @@ -80,7 +81,7 @@ def quantile_with_mask( if values.ndim == 1: # unsqueeze, operate, re-squeeze values = np.atleast_2d(values) - mask = np.atleast_2d(mask) + mask = np.atleast_2d(mask) # type: ignore[arg-type] res_values = quantile_with_mask(values, 
mask, fill_value, qs, interpolation) return res_values[0] @@ -157,7 +158,7 @@ def _nanpercentile( qs: npt.NDArray[np.float64], *, na_value, - mask: npt.NDArray[np.bool_], + mask: npt.NDArray[np.bool_] | BitmaskArray, interpolation: str, ): """ diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index c06bf7366447b..c9e4991d25d03 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -597,7 +597,7 @@ def nbytes(self) -> int: Examples -------- >>> pd.array([1, 2, 3]).nbytes - 27 + 25 """ # If this is expensive to compute, return an approximate lower bound # on the number of bytes needed. diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 04e6f0a0bcdde..ab0c81aa68c34 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -28,6 +28,7 @@ if TYPE_CHECKING: import pyarrow + from pandas._libs.arrays import BitmaskArray from pandas._typing import ( Dtype, DtypeObj, @@ -299,13 +300,15 @@ class BooleanArray(BaseMaskedArray): _FALSE_VALUES = {"False", "FALSE", "false", "0", "0.0"} @classmethod - def _simple_new(cls, values: np.ndarray, mask: npt.NDArray[np.bool_]) -> Self: + def _simple_new( + cls, values: np.ndarray, mask: npt.NDArray[np.bool_] | BitmaskArray + ) -> Self: result = super()._simple_new(values, mask) result._dtype = BooleanDtype() return result def __init__( - self, values: np.ndarray, mask: np.ndarray, copy: bool = False + self, values: np.ndarray, mask: np.ndarray | BitmaskArray, copy: bool = False ) -> None: if not (isinstance(values, np.ndarray) and values.dtype == np.bool_): raise TypeError( diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 56d3711c7d13b..4b332a4df0063 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -15,6 +15,7 @@ lib, missing as libmissing, ) +from pandas._libs.arrays import BitmaskArray from pandas._libs.tslibs import ( get_unit_from_dtype, is_supported_unit, @@ -112,37 +113,46 @@ class BaseMaskedArray(OpsMixin, ExtensionArray): _internal_fill_value: Scalar # our underlying data and mask are each ndarrays _data: np.ndarray - _mask: npt.NDArray[np.bool_] + _mask: BitmaskArray # Fill values used for any/all _truthy_value = Scalar # bool(_truthy_value) = True _falsey_value = Scalar # bool(_falsey_value) = False @classmethod - def _simple_new(cls, values: np.ndarray, mask: npt.NDArray[np.bool_]) -> Self: + def _simple_new( + cls, values: np.ndarray, mask: npt.NDArray[np.bool_] | BitmaskArray + ) -> Self: result = BaseMaskedArray.__new__(cls) result._data = values - result._mask = mask + result._mask = BitmaskArray(mask) return result def __init__( - self, values: np.ndarray, mask: npt.NDArray[np.bool_], copy: bool = False + self, + values: np.ndarray, + mask: npt.NDArray[np.bool_] | BitmaskArray, + copy: bool = False, ) -> None: # values is supposed to already be validated in the subclass - if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_): + if not ( + isinstance(mask, BitmaskArray) + or (isinstance(mask, np.ndarray) and mask.dtype == np.bool_) + ): raise TypeError( - "mask should be boolean numpy array. Use " - "the 'pd.array' function instead" + "mask should be boolean numpy array or BitmaskArray. 
" + "Use the 'pd.array' function instead" ) - if values.shape != mask.shape: - raise ValueError("values.shape must match mask.shape") + if isinstance(mask, np.ndarray): + if values.shape != mask.shape: + raise ValueError("values.shape must match mask.shape") if copy: values = values.copy() mask = mask.copy() self._data = values - self._mask = mask + self._mask = BitmaskArray(mask) @classmethod def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False) -> Self: @@ -188,7 +198,7 @@ def __getitem__(self, item: PositionalIndexer) -> Self | Any: return self.dtype.na_value return self._data[item] - return self._simple_new(self._data[item], newmask) + return self._simple_new(self._data[item], newmask) # type: ignore[arg-type] def _pad_or_backfill( self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True @@ -199,7 +209,7 @@ def _pad_or_backfill( func = missing.get_fill_func(method, ndim=self.ndim) npvalues = self._data.T - new_mask = mask.T + new_mask = mask.to_numpy().T if copy: npvalues = npvalues.copy() new_mask = new_mask.copy() @@ -229,7 +239,7 @@ def fillna( if method is not None: func = missing.get_fill_func(method, ndim=self.ndim) npvalues = self._data.T - new_mask = mask.T + new_mask = mask.to_numpy().T if copy: npvalues = npvalues.copy() new_mask = new_mask.copy() @@ -336,28 +346,28 @@ def ndim(self) -> int: def swapaxes(self, axis1, axis2) -> Self: data = self._data.swapaxes(axis1, axis2) - mask = self._mask.swapaxes(axis1, axis2) + mask = self._mask.to_numpy().swapaxes(axis1, axis2) return self._simple_new(data, mask) def delete(self, loc, axis: AxisInt = 0) -> Self: data = np.delete(self._data, loc, axis=axis) - mask = np.delete(self._mask, loc, axis=axis) + mask = np.delete(self._mask, loc, axis=axis) # type: ignore[call-overload] return self._simple_new(data, mask) def reshape(self, *args, **kwargs) -> Self: data = self._data.reshape(*args, **kwargs) - mask = self._mask.reshape(*args, **kwargs) + mask = self._mask.to_numpy().reshape(*args, **kwargs) return self._simple_new(data, mask) def ravel(self, *args, **kwargs) -> Self: # TODO: need to make sure we have the same order for data/mask data = self._data.ravel(*args, **kwargs) - mask = self._mask.ravel(*args, **kwargs) + mask = self._mask.to_numpy().ravel(*args, **kwargs) return type(self)(data, mask) @property def T(self) -> Self: - return self._simple_new(self._data.T, self._mask.T) + return self._simple_new(self._data.T, self._mask.to_numpy().T) def round(self, decimals: int = 0, *args, **kwargs): """ @@ -659,19 +669,17 @@ def __arrow_array__(self, type=None): """ import pyarrow as pa - return pa.array(self._data, mask=self._mask, type=type) + return pa.array(self._data, mask=self._mask.to_numpy(), type=type) @property def _hasna(self) -> bool: # Note: this is expensive right now! The hope is that we can # make this faster by having an optional mask, but not have to change # source code using it.. 
- - # error: Incompatible return value type (got "bool_", expected "bool") - return self._mask.any() # type: ignore[return-value] + return self._mask.any() def _propagate_mask( - self, mask: npt.NDArray[np.bool_] | None, other + self, mask: npt.NDArray[np.bool_] | BitmaskArray | None, other ) -> npt.NDArray[np.bool_]: if mask is None: mask = self._mask.copy() # TODO: need test for BooleanArray needing a copy @@ -822,7 +830,9 @@ def _cmp_method(self, other, op) -> BooleanArray: return BooleanArray(result, mask, copy=False) def _maybe_mask_result( - self, result: np.ndarray | tuple[np.ndarray, np.ndarray], mask: np.ndarray + self, + result: np.ndarray | tuple[np.ndarray, np.ndarray], + mask: np.ndarray | BitmaskArray, ): """ Parameters @@ -871,7 +881,7 @@ def _maybe_mask_result( return result def isna(self) -> np.ndarray: - return self._mask.copy() + return self._mask.to_numpy() @property def _na_value(self): @@ -888,7 +898,10 @@ def _concat_same_type( axis: AxisInt = 0, ) -> Self: data = np.concatenate([x._data for x in to_concat], axis=axis) - mask = np.concatenate([x._mask for x in to_concat], axis=axis) + try: + mask = BitmaskArray.concatenate([x._mask for x in to_concat], axis=axis) + except NotImplementedError: + mask = np.concatenate([x._mask.to_numpy() for x in to_concat], axis=axis) return cls(data, mask) def take( @@ -910,9 +923,16 @@ def take( axis=axis, ) - mask = take( - self._mask, indexer, fill_value=True, allow_fill=allow_fill, axis=axis - ) + try: + mask = self._mask.take_1d(indexer, axis=axis) + except (TypeError, ValueError, NotImplementedError): + mask = take( + self._mask.to_numpy(), + indexer, + fill_value=True, + allow_fill=allow_fill, + axis=axis, + ) # if we are filling # we only fill where the indexer is null @@ -1010,7 +1030,7 @@ def factorize( size = len(uniques) + 1 uniques_mask = np.zeros(size, dtype=bool) if not use_na_sentinel and has_na: - na_index = mask.argmax() + na_index = mask.to_numpy().argmax() # Insert na with the proper code if na_index == 0: na_code = np.intp(0) @@ -1078,10 +1098,10 @@ def equals(self, other) -> bool: # GH#44382 if e.g. self[1] is np.nan and other[1] is pd.NA, we are NOT # equal. 
- if not np.array_equal(self._mask, other._mask): + if not np.array_equal(self._mask.to_numpy(), other._mask.to_numpy()): return False - left = self._data[~self._mask] + left = self._data[~self._mask] # type: ignore[call-overload] right = other._data[~other._mask] return array_equivalent(left, right, strict_nan=True, dtype_equal=True) @@ -1132,6 +1152,7 @@ def _quantile( def _reduce( self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs ): + mask: BitmaskArray | np.ndarray if name in {"any", "all", "min", "max", "sum", "prod", "mean", "var", "std"}: result = getattr(self, name)(skipna=skipna, **kwargs) else: @@ -1159,9 +1180,9 @@ def _wrap_reduction_result(self, name: str, result, *, skipna, axis): if isinstance(result, np.ndarray): if skipna: # we only retain mask for all-NA rows/columns - mask = self._mask.all(axis=axis) + mask = self._mask.to_numpy().all(axis=axis) else: - mask = self._mask.any(axis=axis) + mask = self._mask.to_numpy().any(axis=axis) return self._maybe_mask_result(result, mask) return result @@ -1355,18 +1376,31 @@ def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): """ nv.validate_any((), kwargs) - values = self._data.copy() - # error: Argument 3 to "putmask" has incompatible type "object"; - # expected "Union[_SupportsArray[dtype[Any]], - # _NestedSequence[_SupportsArray[dtype[Any]]], - # bool, int, float, complex, str, bytes, - # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" - np.putmask(values, self._mask, self._falsey_value) # type: ignore[arg-type] + is_all_na = self._mask.all() + if len(self) == 0 or (skipna and is_all_na): + return np.bool_(False) + + is_any_na = self._mask.any() + if is_any_na: + values = self._data.copy() + # error: Argument 3 to "putmask" has incompatible type "object"; + # expected "Union[_SupportsArray[dtype[Any]], + # _NestedSequence[_SupportsArray[dtype[Any]]], + # bool, int, float, complex, str, bytes, + # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" + np.putmask( + values, + self._mask, # type: ignore[arg-type] + self._falsey_value, # type: ignore[arg-type] + ) + else: + values = self._data + result = values.any() if skipna: return result else: - if result or len(self) == 0 or not self._mask.any(): + if result or not is_any_na: return result else: return self.dtype.na_value @@ -1436,19 +1470,32 @@ def all(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): """ nv.validate_all((), kwargs) - values = self._data.copy() - # error: Argument 3 to "putmask" has incompatible type "object"; - # expected "Union[_SupportsArray[dtype[Any]], - # _NestedSequence[_SupportsArray[dtype[Any]]], - # bool, int, float, complex, str, bytes, - # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" - np.putmask(values, self._mask, self._truthy_value) # type: ignore[arg-type] + is_all_na = self._mask.all() + if len(self) == 0 or (skipna and is_all_na): + return np.bool_(True) + + is_any_na = self._mask.any() + if is_any_na: + values = self._data.copy() + # error: Argument 3 to "putmask" has incompatible type "object"; + # expected "Union[_SupportsArray[dtype[Any]], + # _NestedSequence[_SupportsArray[dtype[Any]]], + # bool, int, float, complex, str, bytes, + # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" + np.putmask( + values, + self._mask, # type: ignore[arg-type] + self._truthy_value, # type: ignore[arg-type] + ) + else: + values = self._data + result = values.all(axis=axis) if skipna: return result else: - if not result or len(self) == 
0 or not self._mask.any():
+            if not result or not self._mask.any():
+                return result
+            else:
+                return self.dtype.na_value
@@ -1485,7 +1532,7 @@ def _groupby_op(
         # libgroupby functions are responsible for NOT altering mask
         mask = self._mask
         if op.kind != "aggregate":
-            result_mask = mask.copy()
+            result_mask = mask.to_numpy()
         else:
             result_mask = np.zeros(ngroups, dtype=bool)
 
@@ -1523,7 +1570,7 @@ def transpose_homogeneous_masked_arrays(
     values = [arr._data.reshape(1, -1) for arr in masked_arrays]
     transposed_values = np.concatenate(values, axis=0)
 
-    masks = [arr._mask.reshape(1, -1) for arr in masked_arrays]
+    masks = [arr._mask.to_numpy().reshape(1, -1) for arr in masked_arrays]
    transposed_masks = np.concatenate(masks, axis=0)
 
     dtype = masked_arrays[0].dtype
diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py
index 0e86c1efba17a..38cb1ccc14b7f 100644
--- a/pandas/core/arrays/numeric.py
+++ b/pandas/core/arrays/numeric.py
@@ -32,6 +32,7 @@
 
     import pyarrow
 
+    from pandas._libs.arrays import BitmaskArray
     from pandas._typing import (
         Dtype,
         DtypeObj,
@@ -230,7 +231,10 @@ class NumericArray(BaseMaskedArray):
     _dtype_cls: type[NumericDtype]
 
     def __init__(
-        self, values: np.ndarray, mask: npt.NDArray[np.bool_], copy: bool = False
+        self,
+        values: np.ndarray,
+        mask: npt.NDArray[np.bool_] | BitmaskArray,
+        copy: bool = False,
     ) -> None:
         checker = self._dtype_cls._checker
         if not (isinstance(values, np.ndarray) and checker(values.dtype)):
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 6262055827428..db6756231b0bc 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -165,7 +165,7 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False)
             na_values = scalars._mask
             result = scalars._data
             result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
-            return cls(pa.array(result, mask=na_values, type=pa.string()))
+            return cls(pa.array(result, mask=na_values.to_numpy(), type=pa.string()))
         elif isinstance(scalars, (pa.Array, pa.ChunkedArray)):
             return cls(pc.cast(scalars, pa.string()))
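
The pattern running through these call sites: the bit-packed BitmaskArray stays the storage format, and .to_numpy() materializes a boolean ndarray only at boundaries that genuinely need ndarray semantics (pyarrow conversion, reshape and transpose, the libgroupby kernels below). A small usage sketch against this branch's pandas._libs.arrays.BitmaskArray (based on the declared stub; treat exact behavior as an assumption):

import numpy as np
from pandas._libs.arrays import BitmaskArray

mask = BitmaskArray(np.array([True, False, True, True]))
assert len(mask) == 4
# Cheap bitmap queries; no boolean ndarray is materialized for these.
assert mask.any() and not mask.all() and mask.sum() == 3
# Materialize a full bool ndarray only where numpy semantics are required,
# as the reshape/pyarrow call sites above do.
np.testing.assert_array_equal(
    mask.to_numpy(), np.array([True, False, True, True])
)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index a022bfd1bd9bc..4e8bcbfca66f6 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -143,6 +143,8 @@ class providing the base-class of operations.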
if TYPE_CHECKING: from typing import Any + from pandas._libs.arrays import BitmaskArray + from pandas.core.resample import Resampler from pandas.core.window import ( ExpandingGroupby, @@ -4443,6 +4445,7 @@ def post_processor( def blk_func(values: ArrayLike) -> ArrayLike: orig_vals = values + mask: np.ndarray | BitmaskArray if isinstance(values, BaseMaskedArray): mask = values._mask result_mask = np.zeros((ngroups, nqs), dtype=np.bool_) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 607059e5183ec..80818df694d27 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -22,6 +22,7 @@ NaT, lib, ) +from pandas._libs.arrays import BitmaskArray import pandas._libs.groupby as libgroupby from pandas._typing import ( ArrayLike, @@ -309,11 +310,13 @@ def _cython_op_ndim_compat( min_count: int, ngroups: int, comp_ids: np.ndarray, - mask: npt.NDArray[np.bool_] | None = None, + mask: npt.NDArray[np.bool_] | BitmaskArray | None = None, result_mask: npt.NDArray[np.bool_] | None = None, **kwargs, ) -> np.ndarray: if values.ndim == 1: + if isinstance(mask, BitmaskArray): + mask = mask.to_numpy() # expand to 2d, dispatch, then squeeze if appropriate values2d = values[None, :] if mask is not None: @@ -353,7 +356,7 @@ def _call_cython_op( min_count: int, ngroups: int, comp_ids: np.ndarray, - mask: npt.NDArray[np.bool_] | None, + mask: npt.NDArray[np.bool_] | BitmaskArray | None, result_mask: npt.NDArray[np.bool_] | None, **kwargs, ) -> np.ndarray: # np.ndarray[ndim=2] @@ -387,6 +390,9 @@ def _call_cython_op( values = values.T if mask is not None: + if isinstance(mask, BitmaskArray): + mask = mask.to_numpy() + mask = mask.T if result_mask is not None: result_mask = result_mask.T diff --git a/pandas/core/missing.py b/pandas/core/missing.py index d275445983b6f..80473b210b63a 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -47,10 +47,12 @@ ) if TYPE_CHECKING: + from pandas._libs.arrays import BitmaskArray + from pandas import Index -def check_value_size(value, mask: npt.NDArray[np.bool_], length: int): +def check_value_size(value, mask: npt.NDArray[np.bool_] | BitmaskArray, length: int): """ Validate the size of the values passed to ExtensionArray.fillna. """ diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index e60c42a20a9af..b27392ba39155 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -3,6 +3,7 @@ import functools import itertools from typing import ( + TYPE_CHECKING, Any, Callable, cast, @@ -49,6 +50,10 @@ notna, ) +if TYPE_CHECKING: + from pandas._libs.arrays import BitmaskArray + + bn = import_optional_dependency("bottleneck", errors="warn") _BOTTLENECK_INSTALLED = bn is not None _USE_BOTTLENECK = False @@ -1537,7 +1542,9 @@ def _maybe_null_out( def check_below_min_count( - shape: tuple[int, ...], mask: npt.NDArray[np.bool_] | None, min_count: int + shape: tuple[int, ...], + mask: npt.NDArray[np.bool_] | BitmaskArray | None, + min_count: int, ) -> bool: """ Check for the `min_count` keyword. 
Returns True if below `min_count` (when @@ -1561,7 +1568,7 @@ def check_below_min_count( # no missing values, only check size non_nulls = np.prod(shape) else: - non_nulls = mask.size - mask.sum() + non_nulls = mask.size - mask.sum() # type: ignore[assignment] if non_nulls < min_count: return True return False diff --git a/pandas/core/ops/mask_ops.py b/pandas/core/ops/mask_ops.py index adc1f63c568bf..dfe64a37bbd30 100644 --- a/pandas/core/ops/mask_ops.py +++ b/pandas/core/ops/mask_ops.py @@ -9,13 +9,14 @@ lib, missing as libmissing, ) +from pandas._libs.arrays import BitmaskArray def kleene_or( left: bool | np.ndarray | libmissing.NAType, right: bool | np.ndarray | libmissing.NAType, - left_mask: np.ndarray | None, - right_mask: np.ndarray | None, + left_mask: np.ndarray | BitmaskArray | None, + right_mask: np.ndarray | BitmaskArray | None, ): """ Boolean ``or`` using Kleene logic. @@ -53,6 +54,10 @@ def kleene_or( result = left | right if right_mask is not None: + if isinstance(left_mask, BitmaskArray): + left_mask = left_mask.to_numpy() + if isinstance(right_mask, BitmaskArray): + right_mask = right_mask.to_numpy() # output is unknown where (False & NA), (NA & False), (NA & NA) left_false = ~(left | left_mask) right_false = ~(right | right_mask) @@ -63,12 +68,14 @@ def kleene_or( ) else: if right is True: - mask = np.zeros_like(left_mask) - elif right is libmissing.NA: - mask = (~left & ~left_mask) | left_mask + mask = np.zeros(left_mask.shape, left_mask.dtype) else: - # False - mask = left_mask.copy() + if isinstance(left_mask, BitmaskArray): + left_mask = left_mask.to_numpy() + if right is libmissing.NA: + mask = (~left & ~left_mask) | left_mask + else: + mask = left_mask return result, mask @@ -76,8 +83,8 @@ def kleene_or( def kleene_xor( left: bool | np.ndarray | libmissing.NAType, right: bool | np.ndarray | libmissing.NAType, - left_mask: np.ndarray | None, - right_mask: np.ndarray | None, + left_mask: np.ndarray | BitmaskArray | None, + right_mask: np.ndarray | BitmaskArray | None, ): """ Boolean ``xor`` using Kleene logic. @@ -117,9 +124,12 @@ def kleene_xor( if right_mask is None: if right is libmissing.NA: - mask = np.ones_like(left_mask) + mask = np.ones(left_mask.shape, left_mask.dtype) else: - mask = left_mask.copy() + if isinstance(left_mask, BitmaskArray): + mask = left_mask.to_numpy() + else: + mask = left_mask.copy() else: mask = left_mask | right_mask @@ -129,8 +139,8 @@ def kleene_xor( def kleene_and( left: bool | libmissing.NAType | np.ndarray, right: bool | libmissing.NAType | np.ndarray, - left_mask: np.ndarray | None, - right_mask: np.ndarray | None, + left_mask: np.ndarray | BitmaskArray | None, + right_mask: np.ndarray | BitmaskArray | None, ): """ Boolean ``and`` using Kleene logic. 
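
An aside on the helpers in this file: they implement Kleene three-valued logic, in which a masked entry behaves as "unknown". True | NA is True and False & NA is False, while False | NA and True & NA stay NA. A toy scalar sketch of the rule, with None standing in for pd.NA (illustrative only, not the pandas implementation):

def or3(a, b):
    # Kleene OR: True dominates even an unknown operand.
    if a is True or b is True:
        return True
    if a is None or b is None:
        return None
    return False

def and3(a, b):
    # Kleene AND: False dominates even an unknown operand.
    if a is False or b is False:
        return False
    if a is None or b is None:
        return None
    return True

assert or3(True, None) is True and or3(False, None) is None
assert and3(False, None) is False and and3(True, None) is None

The mask arrays computed by kleene_or/kleene_xor/kleene_and encode exactly the None cases of this rule elementwise.
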
@@ -168,14 +178,23 @@ def kleene_and( if right_mask is None: # Scalar `right` if right is libmissing.NA: - mask = (left & ~left_mask) | left_mask - + if left_mask.any(): + mask = (left & ~left_mask) | left_mask # type: ignore[operator] + else: + mask = left else: mask = left_mask.copy() if right is False: # unmask everything mask[:] = False else: + # Since we must compare to left / right it helps perf to convert + # to numpy up front, rather than deferring multiple times + if isinstance(left_mask, BitmaskArray): + left_mask = left_mask.to_numpy() + if isinstance(right_mask, BitmaskArray): + right_mask = right_mask.to_numpy() + # unmask where either left or right is False left_false = ~(left | left_mask) right_false = ~(right | right_mask) diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 7a445ad7ac2b2..28612e7aef6ef 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -8,6 +8,7 @@ import numpy as np from pandas._libs import lib +from pandas._libs.arrays import BitmaskArray from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.cast import maybe_downcast_numeric @@ -201,10 +202,10 @@ def to_numeric( # GH33013: for IntegerArray & FloatingArray extract non-null values for casting # save mask to reconstruct the full array after casting - mask: npt.NDArray[np.bool_] | None = None + mask: npt.NDArray[np.bool_] | BitmaskArray | None = None if isinstance(values, BaseMaskedArray): mask = values._mask - values = values._data[~mask] + values = values._data[~mask] # type: ignore[call-overload] values_dtype = getattr(values, "dtype", None) if isinstance(values_dtype, ArrowDtype): @@ -280,7 +281,7 @@ def to_numeric( mask = new_mask else: mask = mask.copy() - assert isinstance(mask, np.ndarray) + assert isinstance(mask, (np.ndarray, BitmaskArray)) data = np.zeros(mask.shape, dtype=values.dtype) data[~mask] = values diff --git a/pandas/tests/arrays/boolean/test_construction.py b/pandas/tests/arrays/boolean/test_construction.py index d26eea19c06e9..12378cf719065 100644 --- a/pandas/tests/arrays/boolean/test_construction.py +++ b/pandas/tests/arrays/boolean/test_construction.py @@ -40,7 +40,6 @@ def test_boolean_array_constructor_copy(): result = BooleanArray(values, mask) assert result._data is values - assert result._mask is mask result = BooleanArray(values, mask, copy=True) assert result._data is not values @@ -159,7 +158,7 @@ def test_coerce_to_array(): expected = BooleanArray(values, mask) tm.assert_extension_array_equal(result, expected) assert result._data is values - assert result._mask is mask + result = BooleanArray(*coerce_to_array(values, mask=mask, copy=True)) expected = BooleanArray(values, mask) tm.assert_extension_array_equal(result, expected) @@ -202,12 +201,12 @@ def test_coerce_to_array_from_boolean_array(): tm.assert_extension_array_equal(result, arr) # no copy assert result._data is arr._data - assert result._mask is arr._mask + assert result._mask.parent is arr._mask result = BooleanArray(*coerce_to_array(arr), copy=True) tm.assert_extension_array_equal(result, arr) assert result._data is not arr._data - assert result._mask is not arr._mask + assert result._mask.parent is not arr._mask with pytest.raises(ValueError, match="cannot pass mask for BooleanArray input"): coerce_to_array(arr, mask=mask) diff --git a/pandas/tests/arrays/boolean/test_function.py b/pandas/tests/arrays/boolean/test_function.py index 2b3f3d3d16ac6..b58ec19dff329 100644 --- a/pandas/tests/arrays/boolean/test_function.py +++ 
b/pandas/tests/arrays/boolean/test_function.py @@ -13,36 +13,36 @@ def test_ufuncs_binary(ufunc): a = pd.array([True, False, None], dtype="boolean") result = ufunc(a, a) expected = pd.array(ufunc(a._data, a._data), dtype="boolean") - expected[a._mask] = np.nan + expected[a._mask.to_numpy()] = np.nan tm.assert_extension_array_equal(result, expected) s = pd.Series(a) result = ufunc(s, a) expected = pd.Series(ufunc(a._data, a._data), dtype="boolean") - expected[a._mask] = np.nan + expected[a._mask.to_numpy()] = np.nan tm.assert_series_equal(result, expected) # Boolean with numpy array arr = np.array([True, True, False]) result = ufunc(a, arr) expected = pd.array(ufunc(a._data, arr), dtype="boolean") - expected[a._mask] = np.nan + expected[a._mask.to_numpy()] = np.nan tm.assert_extension_array_equal(result, expected) result = ufunc(arr, a) expected = pd.array(ufunc(arr, a._data), dtype="boolean") - expected[a._mask] = np.nan + expected[a._mask.to_numpy()] = np.nan tm.assert_extension_array_equal(result, expected) # BooleanArray with scalar result = ufunc(a, True) expected = pd.array(ufunc(a._data, True), dtype="boolean") - expected[a._mask] = np.nan + expected[a._mask.to_numpy()] = np.nan tm.assert_extension_array_equal(result, expected) result = ufunc(True, a) expected = pd.array(ufunc(True, a._data), dtype="boolean") - expected[a._mask] = np.nan + expected[a._mask.to_numpy()] = np.nan tm.assert_extension_array_equal(result, expected) # not handled types @@ -56,13 +56,13 @@ def test_ufuncs_unary(ufunc): a = pd.array([True, False, None], dtype="boolean") result = ufunc(a) expected = pd.array(ufunc(a._data), dtype="boolean") - expected[a._mask] = np.nan + expected[a._mask.to_numpy()] = np.nan tm.assert_extension_array_equal(result, expected) ser = pd.Series(a) result = ufunc(ser) expected = pd.Series(ufunc(a._data), dtype="boolean") - expected[a._mask] = np.nan + expected[a._mask.to_numpy()] = np.nan tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py index d2f9f6dffab49..94b095fc0fa91 100644 --- a/pandas/tests/arrays/categorical/test_astype.py +++ b/pandas/tests/arrays/categorical/test_astype.py @@ -145,11 +145,3 @@ def test_astype_object_timestamp_categories(self): result = cat.astype(object) expected = np.array([Timestamp("2014-01-01 00:00:00")], dtype="object") tm.assert_numpy_array_equal(result, expected) - - def test_astype_category_readonly_mask_values(self): - # GH#53658 - arr = array([0, 1, 2], dtype="Int64") - arr._mask.flags["WRITEABLE"] = False - result = arr.astype("category") - expected = array([0, 1, 2], dtype="Int64").astype("category") - tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/floating/test_astype.py b/pandas/tests/arrays/floating/test_astype.py index ade3dbd2c99da..b38a944238b38 100644 --- a/pandas/tests/arrays/floating/test_astype.py +++ b/pandas/tests/arrays/floating/test_astype.py @@ -88,7 +88,7 @@ def test_astype_copy(): result = arr.astype("Float64", copy=False) assert result is arr assert np.shares_memory(result._data, arr._data) - assert np.shares_memory(result._mask, arr._mask) + assert result._mask is arr._mask result[0] = 10 assert arr[0] == 10 result[0] = pd.NA diff --git a/pandas/tests/arrays/floating/test_comparison.py b/pandas/tests/arrays/floating/test_comparison.py index a429649f1ce1d..19eb02374d476 100644 --- a/pandas/tests/arrays/floating/test_comparison.py +++ b/pandas/tests/arrays/floating/test_comparison.py @@ -61,5 
+61,5 @@ def test_equals_nan_vs_na(): # with mask[1] = True, the only difference is data[1], which should # not matter for equals - mask[1] = True + left._mask[1] = True assert left.equals(right) diff --git a/pandas/tests/arrays/floating/test_construction.py b/pandas/tests/arrays/floating/test_construction.py index 4007ee6b415c9..5a58125a1c126 100644 --- a/pandas/tests/arrays/floating/test_construction.py +++ b/pandas/tests/arrays/floating/test_construction.py @@ -23,9 +23,12 @@ def test_floating_array_constructor(): expected = pd.array([1, 2, 3, np.nan], dtype="Float64") tm.assert_extension_array_equal(result, expected) tm.assert_numpy_array_equal(result._data, values) - tm.assert_numpy_array_equal(result._mask, mask) + tm.assert_numpy_array_equal(result._mask.to_numpy(), mask) - msg = r".* should be .* numpy array. Use the 'pd.array' function instead" + msg = ( + r".* should be .* numpy array( or BitmaskArray)?. " + r"Use the 'pd.array' function instead" + ) with pytest.raises(TypeError, match=msg): FloatingArray(values.tolist(), mask) @@ -62,7 +65,6 @@ def test_floating_array_constructor_copy(): result = FloatingArray(values, mask) assert result._data is values - assert result._mask is mask result = FloatingArray(values, mask, copy=True) assert result._data is not values diff --git a/pandas/tests/arrays/integer/test_construction.py b/pandas/tests/arrays/integer/test_construction.py index 9ecfc51cb2208..d442a26c9c4dc 100644 --- a/pandas/tests/arrays/integer/test_construction.py +++ b/pandas/tests/arrays/integer/test_construction.py @@ -80,7 +80,10 @@ def test_integer_array_constructor(): expected = pd.array([1, 2, 3, np.nan], dtype="Int64") tm.assert_extension_array_equal(result, expected) - msg = r".* should be .* numpy array. Use the 'pd.array' function instead" + msg = ( + r".* should be .* numpy array( or BitmaskArray)?. 
" + r"Use the 'pd.array' function instead" + ) with pytest.raises(TypeError, match=msg): IntegerArray(values.tolist(), mask) @@ -100,7 +103,6 @@ def test_integer_array_constructor_copy(): result = IntegerArray(values, mask) assert result._data is values - assert result._mask is mask result = IntegerArray(values, mask, copy=True) assert result._data is not values diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index f50b4cfd0b520..70c2eeeb852c6 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -163,7 +163,7 @@ def test_astype_copy(): result = arr.astype("Int64", copy=False) assert result is arr assert np.shares_memory(result._data, arr._data) - assert np.shares_memory(result._mask, arr._mask) + assert result._mask is arr._mask result[0] = 10 assert arr[0] == 10 result[0] = pd.NA diff --git a/pandas/tests/arrays/masked/test_arithmetic.py b/pandas/tests/arrays/masked/test_arithmetic.py index f4b571ca627b3..04deac24a9211 100644 --- a/pandas/tests/arrays/masked/test_arithmetic.py +++ b/pandas/tests/arrays/masked/test_arithmetic.py @@ -76,22 +76,22 @@ def test_array_NA(data, all_arithmetic_operators): scalar = pd.NA scalar_array = pd.array([pd.NA] * len(data), dtype=data.dtype) - mask = data._mask.copy() + mask = data._mask.to_numpy() if is_bool_not_implemented(data, all_arithmetic_operators): msg = "operator '.*' not implemented for bool dtypes" with pytest.raises(NotImplementedError, match=msg): op(data, scalar) # GH#45421 check op doesn't alter data._mask inplace - tm.assert_numpy_array_equal(mask, data._mask) + tm.assert_numpy_array_equal(mask, data._mask.to_numpy()) return result = op(data, scalar) # GH#45421 check op doesn't alter data._mask inplace - tm.assert_numpy_array_equal(mask, data._mask) + tm.assert_numpy_array_equal(mask, data._mask.to_numpy()) expected = op(data, scalar_array) - tm.assert_numpy_array_equal(mask, data._mask) + tm.assert_numpy_array_equal(mask, data._mask.to_numpy()) tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/masked/test_bitmask.py b/pandas/tests/arrays/masked/test_bitmask.py new file mode 100644 index 0000000000000..d895618ba3483 --- /dev/null +++ b/pandas/tests/arrays/masked/test_bitmask.py @@ -0,0 +1,591 @@ +import itertools +import pickle + +import numpy as np +import pytest + +from pandas._libs.arrays import BitmaskArray + +import pandas._testing as tm + + +@pytest.mark.parametrize( + "data,expected", + [ + pytest.param(np.array([False, False]), bytes([0x0]), id="all_false"), + pytest.param(np.array([True, False]), bytes([0x1]), id="first_true"), + pytest.param(np.array([False, True]), bytes([0x2]), id="second_true"), + pytest.param(np.array([True, True]), bytes([0x3]), id="all_true"), + pytest.param(np.array([True, False] * 8), bytes([0x55, 0x55]), id="multibyte"), + pytest.param( + np.array([[False, False], [True, True], [False, False]])[:, 0], + bytes([0x2]), + id="non-contiguous", + ), + ], +) +def test_constructor_ndarray(data, expected): + bma = BitmaskArray(data) + assert bma.bytes == expected + assert not bma.parent + + +@pytest.mark.parametrize( + "data,expected", + [ + (np.array([False, False]), bytes([0x0])), + (np.array([True, False]), bytes([0x1])), + (np.array([False, True]), bytes([0x2])), + (np.array([True, True]), bytes([0x3])), + (np.array([True, False] * 8), bytes([0x55, 0x55])), + ], +) +def test_constructor_bitmap(data, expected): + parent = BitmaskArray(data) + bma = BitmaskArray(parent) + 
assert bma.bytes == expected + assert bma.parent is parent + + +def test_len(): + bma = BitmaskArray(np.array([True, False, False])) + assert len(bma) == 3 + + +def test_repr_no_parent(): + bma = BitmaskArray(np.array([True, False, False])) + result = repr(bma) + assert "parent: None" in result + assert "data: b'\\x01'" in result + + +def test_repr_parent(): + parent = BitmaskArray(np.array([False, False, True])) + bma = BitmaskArray(parent) + result = repr(bma) + parent_repr = object.__repr__(parent) + assert parent_repr in result + assert "data: b'\\x04'" in result + + +@pytest.mark.parametrize( + "input_data", + [ + pytest.param([[True]], id="identity_case"), + pytest.param([[True], [True]], id="base_case"), + pytest.param([[True], [False]], id="base_case_2"), + pytest.param([[True], [False] * 7], id="single_byte_boundary_end"), + pytest.param([[True], [False] * 8], id="multi_byte_non_boundary"), + pytest.param( + [[True] * 4, [False] * 4, [True] * 6, [False] * 2], + id="multi_byte_boundary_end", + ), + ], +) +def test_concatenate(input_data): + masks = [BitmaskArray(np.array(x)) for x in input_data] + + result = BitmaskArray.concatenate(masks, axis=0) + expected = BitmaskArray(np.array(list(itertools.chain.from_iterable(input_data)))) + + assert result.bytes == expected.bytes + + +def test_concatenate_raises_not_axis0(): + with pytest.raises(NotImplementedError, match="only implemented for axis=0"): + BitmaskArray.concatenate([], axis=1) + + +@pytest.mark.parametrize( + "indexer,expected", + [ + (0, True), + (1, False), + ], +) +def test_getitem_scalar(indexer, expected): + bma = BitmaskArray(np.array([True, False, True])) + result = bma[indexer] + + assert result == expected + + +def test_getitem_null_slice(): + bma = BitmaskArray(np.array([True, False, True])) + result = bma[:] + + assert not result.parent + assert len(result) == 3 + + assert result.bytes[0] & 1 == 1 + assert (result.bytes[0] >> 1) & 1 == 0 + assert (result.bytes[0] >> 2) & 1 == 1 + + +@pytest.mark.parametrize( + "indexer,mask,expected", + [ + pytest.param(slice(2), bytes([0x3]), bytes([0x1]), id="basic_slice"), + pytest.param( + slice(1000), bytes([0x7]), bytes([0x05]), id="slice_exceeding_bounds" + ), + ], +) +def test_getitem_monotonic_slice(indexer, mask, expected): + bma = BitmaskArray(np.array([True, False, True])) + result = bma[indexer] + + assert not result.parent + + # the bits past the length of result are undefined, so explicitly mask them out + assert (result.bytes[0] & mask[0]) == expected[0] + + +@pytest.mark.parametrize( + "indexer,expected", + [ + ([0, 1], np.array([True, False])), + (np.array([2, 1]), np.array([True, False])), + (slice(1, 2), np.array([False])), + ], +) +def test_getitem_numpy_fallback(indexer, expected): + bma = BitmaskArray(np.array([True, False, True])) + result = bma[indexer] + + tm.assert_numpy_array_equal(result, expected) + + +def test_setitem_scalar(): + bma = BitmaskArray(np.array([True, False, True])) + + bma[0] = False + assert not bma[0] + + bma[:] = True + assert bma[0] and bma[1] and bma[2] + + bma[np.array([False, False, True])] = False + assert bma[0] and bma[1] and not bma[2] + + bma[[False, True, False]] = False + assert bma[0] and not bma[1] and not bma[2] + + +def test_setitem_array(): + bma = BitmaskArray(np.array([True, False, True])) + + bma[:] = [False, True, False] + assert not bma[0] and bma[1] and not bma[2] + + bma[:] = np.array([True, False, True]) + assert bma[0] and not bma[1] and bma[2] + + +def test_invert(): + result1 = ~BitmaskArray(np.array([True, 
False])) + assert (result1.bytes[0] & 0x1) == 0 + assert ((result1.bytes[0] >> 1) & 0x1) == 1 + + result2 = ~BitmaskArray(np.array([False, True])) + assert (result2.bytes[0] & 0x1) == 1 + assert ((result2.bytes[0] >> 1) & 0x1) == 0 + + +@pytest.mark.parametrize( + "lhs,rhs,expected", + [ + ([True], [True], bytes([0x1])), + ([True], [False], bytes([0x0])), + ([False], [False], bytes([0x0])), + ([True] * 10, [True] * 10, bytes([0xFF, 0x3])), + ([False] * 10, [True] * 10, bytes([0x0, 0x0])), + ], +) +def test_and_bitmask(lhs, rhs, expected): + bma1 = BitmaskArray(np.array(lhs)) + result = bma1 & BitmaskArray(np.array(rhs)) + assert result.bytes == expected + + +@pytest.mark.parametrize( + "lhs,rhs,expected", + [ + ([True], [True], [True]), + ([True], [False], [False]), + ([False], [False], [False]), + ([True] * 10, [True] * 10, [True] * 10), + ([False] * 10, [True] * 10, [False] * 10), + ], +) +def test_and_ndarray(lhs, rhs, expected): + bma1 = BitmaskArray(np.array(lhs)) + + result = bma1 & np.array(rhs) + assert (result == np.array(expected)).all() + + +@pytest.mark.parametrize( + "lhs,rhs,expected", + [ + ([True], True, bytes([0x1])), + ([True], False, bytes([0x0])), + ([False], False, bytes([0x0])), + ([True] * 10, True, bytes([0xFF, 0x3])), + ([False] * 10, True, bytes([0x0, 0x0])), + ], +) +def test_and_scalar(lhs, rhs, expected): + bma1 = BitmaskArray(np.array(lhs)) + result = bma1 & rhs + + # We don't really care about the bits that + # exist beyond the length of the bitmask, but + # to make testing easy we assume AND still operates + # on them. Might be better to implement equality + # on bitmaskarray and test instead of looking at bytes + assert result.bytes == expected + + +@pytest.mark.parametrize( + "lhs,rhs,expected", + [ + ([True], [True], bytes([0x1])), + ([True], [False], bytes([0x1])), + ([False], [False], bytes([0x0])), + ([True] * 10, [True] * 10, bytes([0xFF, 0x3])), + ([False] * 10, [True] * 10, bytes([0xFF, 0x3])), + ], +) +def test_or_bitmask(lhs, rhs, expected): + bma1 = BitmaskArray(np.array(lhs)) + result = bma1 | BitmaskArray(np.array(rhs)) + assert result.bytes == expected + + +@pytest.mark.parametrize( + "lhs,rhs,expected", + [ + ([True], [True], [True]), + ([True], [False], [True]), + ([False], [False], [False]), + ([True] * 10, [True] * 10, [True] * 10), + ([False] * 10, [True] * 10, [True] * 10), + ], +) +def test_or_ndarray(lhs, rhs, expected): + bma1 = BitmaskArray(np.array(lhs)) + + result = bma1 | np.array(rhs) + assert (result == np.array(expected)).all() + + +@pytest.mark.parametrize( + "lhs,rhs,expected", + [ + ([True], True, bytes([0xFF])), + ([True], False, bytes([0x1])), + ([False], False, bytes([0x0])), + ([True] * 10, True, bytes([0xFF, 0xFF])), + ([False] * 10, True, bytes([0xFF, 0xFF])), + ], +) +def test_or_scalar(lhs, rhs, expected): + bma1 = BitmaskArray(np.array(lhs)) + result = bma1 | rhs + + # We don't really care about the bits that + # exist beyond the length of the bitmask, but + # to make testing easy we assume OR still operates + # on them. 
Might be better to implement equality + # on bitmaskarray and test instead of looking at bytes + assert result.bytes == expected + + +@pytest.mark.parametrize( + "lhs,rhs,expected", + [ + ([True], [True], bytes([0x0])), + ([True], [False], bytes([0x1])), + ([False], [False], bytes([0x0])), + ([True] * 10, [True] * 10, bytes([0x0, 0x0])), + ([False] * 10, [True] * 10, bytes([0xFF, 0x3])), + ], +) +def test_xor_bitmask(lhs, rhs, expected): + bma1 = BitmaskArray(np.array(lhs)) + other = BitmaskArray(np.array(rhs)) + result = bma1 ^ other + assert result.bytes == expected + + +@pytest.mark.parametrize( + "lhs,rhs,expected", + [ + ([True], [True], [False]), + ([True], [False], [True]), + ([False], [False], [False]), + ([True] * 10, [True] * 10, [False] * 10), + ([False] * 10, [True] * 10, [True] * 10), + ], +) +def test_xor_ndarray(lhs, rhs, expected): + bma1 = BitmaskArray(np.array(lhs)) + other = np.array(rhs) + result = bma1 ^ other + assert (result == np.array(expected)).all() + + +@pytest.mark.parametrize( + "lhs,rhs,expected", + [ + ([True], True, bytes([0xFE])), + ([True], False, bytes([0x1])), + ([False], False, bytes([0x0])), + ([True] * 10, True, bytes([0x0, 0xFC])), + ([False] * 10, True, bytes([0xFF, 0xFF])), + ], +) +def test_xor_scalar(lhs, rhs, expected): + bma1 = BitmaskArray(np.array(lhs)) + other = rhs + result = bma1 ^ other + + # We don't really care about the bits that + # exist beyond the length of the bitmask, but + # to make testing easy we assume XOR still operates + # on them. Might be better to implement equality + # on bitmaskarray and test instead of looking at bytes + assert result.bytes == expected + + +def test_pickle(): + parent = BitmaskArray(np.array([True, False, True])) + child = BitmaskArray(parent) + + result_child = pickle.loads(pickle.dumps(child)) + + assert result_child.shape == child.shape + assert result_child.bytes == child.bytes + + assert result_child.parent.shape == parent.shape + assert result_child.parent.bytes == parent.bytes + assert not result_child.parent.parent + + +def test_iter(): + bma = BitmaskArray(np.array([True, False, True])) + itr = iter(bma) + + assert next(itr) is True + assert next(itr) is False + assert next(itr) is True + + with pytest.raises(StopIteration, match=""): + next(itr) + + +@pytest.mark.parametrize( + "data,expected", + [ + (np.array([], dtype=bool), 0), + (np.array([True, False, True]), 3), + (np.array([True] * 8), 8), + (np.array([True] * 8 + [False]), 9), + ], +) +def test_size(data, expected): + bma = BitmaskArray(data) + result = bma.size + assert result == expected + + +@pytest.mark.parametrize( + "data,expected", + [ + (np.array([], dtype=bool), 0), + (np.array([True, False, True]), 1), + (np.array([True] * 8), 1), + (np.array([True] * 8 + [False]), 2), + ], +) +def test_nbytes(data, expected): + bma = BitmaskArray(data) + result = bma.nbytes + assert result == expected + + +@pytest.mark.parametrize( + "data", + [ + np.array([True, False]), + np.array([True, False]).reshape(2, -1), + np.array([True, False]).reshape(-1, 2), + ], +) +def test_shape(data): + bma = BitmaskArray(data) + assert bma.shape == data.shape + + +@pytest.mark.parametrize( + "data,expected", + [ + (np.array([], dtype=bool), False), + (np.array([True]), True), + (np.array([False]), False), + (np.array([True] * 8 + [False]), True), + ], +) +def test_any(data, expected): + bma = BitmaskArray(data) + assert bma.any() == expected + + +def test_any_sliced_bitmask(): + # Need to ensure any doesn't look beyond bounds of slice + bma = 
BitmaskArray(np.array([False, False, True, True])) + assert bma.any() + + new_bma = bma[:2] + assert not new_bma.any() + + +@pytest.mark.parametrize( + "data,expected", + [ + (np.array([], dtype=bool), True), + (np.array([True]), True), + (np.array([False]), False), + (np.array([True] * 8 + [False]), False), + ], +) +def test_all(data, expected): + bma = BitmaskArray(data) + assert bma.all() == expected + + +def test_all_sliced_bitmask(): + # Need to ensure all doesn't look beyond bounds of slice + bma = BitmaskArray(np.array([True, True, False, False])) + assert not bma.all() + + new_bma = bma[:2] + assert new_bma.all() + + +@pytest.mark.parametrize( + "data,expected", + [ + (np.array([], dtype=bool), 0), + (np.array([True]), 1), + (np.array([False]), 0), + (np.array([True] * 8 + [False]), 8), + ], +) +def test_sum(data, expected): + bma = BitmaskArray(data) + assert bma.sum() == expected + + +def test_take_1d(): + bma = BitmaskArray(np.array([True, False, True, False])) + + result1 = bma.take_1d(np.array([0], dtype=np.int64), axis=0) + assert (result1.bytes[0] & 0x1) == 1 + + result2 = bma.take_1d(np.array([1], dtype=np.int64), axis=0) + assert (result2.bytes[0] & 0x1) == 0 + + result3 = bma.take_1d(np.array([0, 1], dtype=np.int64), axis=0) + assert (result3.bytes[0] & 0x1) == 1 + assert ((result3.bytes[0] >> 1) & 0x1) == 0 + + result4 = bma.take_1d(np.array([0, 0], dtype=np.int64), axis=0) + assert (result4.bytes[0] & 0x1) == 1 + assert ((result4.bytes[0] >> 1) & 0x1) == 1 + + result5 = bma.take_1d(np.array([3, 2, 1, 0], dtype=np.int64), axis=0) + assert (result5.bytes[0] & 0x1) == 0 + assert ((result5.bytes[0] >> 1) & 0x1) == 1 + assert ((result5.bytes[0] >> 2) & 0x1) == 0 + assert ((result5.bytes[0] >> 3) & 0x1) == 1 + + +def test_take_1d_raises_not_axis0(): + bma = BitmaskArray(np.array([True, False, True])) + with pytest.raises(NotImplementedError, match="only implemented for axis=0"): + bma.take_1d(np.array([1], dtype=np.int64), axis=1) + + +def test_take_1d_raises_empty_indices(): + bma = BitmaskArray(np.array([True, False, True])) + with pytest.raises(NotImplementedError, match="does not support empty takes"): + bma.take_1d(np.array([], dtype=np.int64), axis=0) + + +def test_take_1d_raises_negative_indices(): + bma = BitmaskArray(np.array([True, False, True])) + with pytest.raises(ValueError, match="does not support negative indexing"): + bma.take_1d(np.array([-1], dtype=np.int64), axis=0) + + +def test_copy(): + old_bma = BitmaskArray(np.array([True, False, True, False])) + bma = old_bma.copy() + + assert bma.bytes == old_bma.bytes + assert bma.shape == old_bma.shape + assert not bma.parent + + +@pytest.mark.parametrize( + "data", + [ + np.array([], dtype=bool), + np.array([True] * 100, dtype=bool), + np.array([[True, False], [True, False], [True, True], [False, False]]), + np.array([[True, False, True, False], [True, True, False, False]]), + ], +) +def test_to_numpy(data): + bma = BitmaskArray(data) + + result = bma.to_numpy() + tm.assert_numpy_array_equal(result, data) + + +@pytest.mark.parametrize( + "array,expected", + [ + pytest.param(np.array([False, False]), [False, False], id="all_false"), + pytest.param(np.array([True, False]), [True, False], id="first_true"), + pytest.param(np.array([False, True]), [False, True], id="second_true"), + pytest.param(np.array([True, True]), [True, True], id="all_true"), + pytest.param(np.array([True, False] * 8), [True, False] * 8, id="multibyte"), + pytest.param( + np.array([[False, False], [True, True], [False, False]])[:, 0], + [False, 
True, False], + id="non-contiguous", + ), + ], +) +def test_memoryview(array, expected): + bma = BitmaskArray(array) + vw = memoryview(bma) + assert vw.tolist() == expected + + +def test_bitmask_array_shape_from_sliced_bitmask(): + orig_bma = BitmaskArray(np.array([True] * 100)) + bma = BitmaskArray(orig_bma[:10]) + + assert bma.shape == (10,) diff --git a/pandas/tests/arrays/masked_shared.py b/pandas/tests/arrays/masked_shared.py index 3e74402263cf9..2f025d039389d 100644 --- a/pandas/tests/arrays/masked_shared.py +++ b/pandas/tests/arrays/masked_shared.py @@ -16,7 +16,7 @@ def _compare_other(self, data, op, other): expected = pd.Series(op(data._data, other), dtype="boolean") # fill the nan locations - expected[data._mask] = pd.NA + expected[data._mask.to_numpy()] = pd.NA # TODO: have series accept memview tm.assert_series_equal(result, expected) @@ -28,7 +28,7 @@ def _compare_other(self, data, op, other): expected = op(pd.Series(data._data), other).astype("boolean") # fill the nan locations - expected[data._mask] = pd.NA + expected[data._mask.to_numpy()] = pd.NA tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 79dc423f12a85..3f65229e2921a 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -286,9 +286,11 @@ def test_ensure_copied_data(self, index): tm.assert_numpy_array_equal( index._values._data, result._values._data, check_same="same" ) - assert np.shares_memory(index._values._mask, result._values._mask) + assert index._values._mask is result._values._mask tm.assert_numpy_array_equal( - index._values._mask, result._values._mask, check_same="same" + index._values._mask.to_numpy(), + result._values._mask.to_numpy(), + check_same="copy", ) elif index.dtype == "string[python]": assert np.shares_memory(index._values._ndarray, result._values._ndarray)
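Note on the byte-level expectations asserted throughout test_bitmask.py above: BitmaskArray stores its bits in Arrow's LSB-first layout, i.e. element i lands in bit (i % 8) of byte (i // 8), which is why np.array([True, False] * 8) is expected to pack to bytes([0x55, 0x55]). A minimal standalone sketch of that layout, using np.packbits purely for illustration (the class itself packs through the vendored nanoarrow routines):

    import numpy as np

    # Arrow-style bitmap: element i -> bit (i % 8) of byte (i // 8), LSB first
    data = np.array([True, False] * 8)
    packed = np.packbits(data, bitorder="little").tobytes()
    assert packed == bytes([0x55, 0x55])  # matches the "multibyte" test case

    # Reading a bit back out, mirroring the shift-and-mask assertions above
    i = 2
    assert ((packed[i // 8] >> (i % 8)) & 1) == data[i]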