diff --git a/asv_bench/benchmarks/boolean.py b/asv_bench/benchmarks/boolean.py new file mode 100644 index 0000000000000..71c422c641775 --- /dev/null +++ b/asv_bench/benchmarks/boolean.py @@ -0,0 +1,32 @@ +import numpy as np + +import pandas as pd + + +class TimeLogicalOps: + def setup(self): + N = 10_000 + left, right, lmask, rmask = np.random.randint(0, 2, size=(4, N)).astype("bool") + self.left = pd.arrays.BooleanArray(left, lmask) + self.right = pd.arrays.BooleanArray(right, rmask) + + def time_or_scalar(self): + self.left | True + self.left | False + + def time_or_array(self): + self.left | self.right + + def time_and_scalar(self): + self.left & True + self.left & False + + def time_and_array(self): + self.left & self.right + + def time_xor_scalar(self): + self.left ^ True + self.left ^ False + + def time_xor_array(self): + self.left ^ self.right diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index 9ec330c956ff1..9cea68530fbe7 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -73,6 +73,7 @@ See the :ref:`overview` for more detail about what's in the library. * :doc:`user_guide/missing_data` * :doc:`user_guide/categorical` * :doc:`user_guide/integer_na` + * :doc:`user_guide/boolean` * :doc:`user_guide/visualization` * :doc:`user_guide/computation` * :doc:`user_guide/groupby` diff --git a/doc/source/user_guide/boolean.rst b/doc/source/user_guide/boolean.rst new file mode 100644 index 0000000000000..e0f676d3072fc --- /dev/null +++ b/doc/source/user_guide/boolean.rst @@ -0,0 +1,79 @@ +.. currentmodule:: pandas + +.. ipython:: python + :suppress: + + import pandas as pd + import numpy as np + +.. _boolean: + +************************** +Nullable Boolean Data Type +************************** + +.. versionadded:: 1.0.0 + +.. 
_boolean.kleene: + +Kleene Logical Operations +------------------------- + +:class:`arrays.BooleanArray` implements `Kleene Logic`_ (sometimes called three-value logic) for +logical operations like ``&`` (and), ``|`` (or) and ``^`` (exclusive-or). + +This table demonstrates the results for every combination. These operations are symmetrical, +so flipping the left- and right-hand side makes no difference in the result. + +================= ========= +Expression Result +================= ========= +``True & True`` ``True`` +``True & False`` ``False`` +``True & NA`` ``NA`` +``False & False`` ``False`` +``False & NA`` ``False`` +``NA & NA`` ``NA`` +``True | True`` ``True`` +``True | False`` ``True`` +``True | NA`` ``True`` +``False | False`` ``False`` +``False | NA`` ``NA`` +``NA | NA`` ``NA`` +``True ^ True`` ``False`` +``True ^ False`` ``True`` +``True ^ NA`` ``NA`` +``False ^ False`` ``False`` +``False ^ NA`` ``NA`` +``NA ^ NA`` ``NA`` +================= ========= + +When an ``NA`` is present in an operation, the output value is ``NA`` only if +the result cannot be determined solely based on the other input. For example, +``True | NA`` is ``True``, because both ``True | True`` and ``True | False`` +are ``True``. In that case, we don't actually need to consider the value +of the ``NA``. + +On the other hand, ``True & NA`` is ``NA``. The result depends on whether +the ``NA`` really is ``True`` or ``False``, since ``True & True`` is ``True``, +but ``True & False`` is ``False``, so we can't determine the output. + + +This differs from how ``np.nan`` behaves in logical operations. Pandas treats +``np.nan`` as *always false in the output*. + +In ``or`` + +.. ipython:: python + + pd.Series([True, False, np.nan], dtype="object") | True + pd.Series([True, False, np.nan], dtype="boolean") | True + +In ``and`` + +.. ipython:: python + + pd.Series([True, False, np.nan], dtype="object") & True + pd.Series([True, False, np.nan], dtype="boolean") & True + +.. 
_Kleene Logic: https://en.wikipedia.org/wiki/Three-valued_logic#Kleene_and_Priest_logics diff --git a/doc/source/user_guide/index.rst b/doc/source/user_guide/index.rst index b86961a71433b..30b1c0b4eac0d 100644 --- a/doc/source/user_guide/index.rst +++ b/doc/source/user_guide/index.rst @@ -30,6 +30,7 @@ Further information on any specific method can be obtained in the missing_data categorical integer_na + boolean visualization computation groupby diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index aec3397bddd16..31dc656eb4b25 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -184,6 +184,9 @@ class BooleanArray(ExtensionArray, ExtensionOpsMixin): represented by 2 numpy arrays: a boolean array with the data and a boolean array with the mask (True indicating missing). + BooleanArray implements Kleene logic (sometimes called three-value + logic) for logical operations. See :ref:`boolean.kleene` for more. + To construct an BooleanArray from generic array-like input, use :func:`pandas.array` specifying ``dtype="boolean"`` (see examples below). @@ -283,7 +286,7 @@ def __getitem__(self, item): def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA): """ - Coerce to an ndarary of object dtype or bool dtype (if force_bool=True). + Coerce to an ndarray of object dtype or bool dtype (if force_bool=True). Parameters ---------- @@ -565,10 +568,13 @@ def logical_method(self, other): # Rely on pandas to unbox and dispatch to us. 
return NotImplemented + assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"} other = lib.item_from_zerodim(other) + other_is_booleanarray = isinstance(other, BooleanArray) + other_is_scalar = lib.is_scalar(other) mask = None - if isinstance(other, BooleanArray): + if other_is_booleanarray: other, mask = other._data, other._mask elif is_list_like(other): other = np.asarray(other, dtype="bool") @@ -576,22 +582,26 @@ def logical_method(self, other): raise NotImplementedError( "can only perform ops with 1-d structures" ) - if len(self) != len(other): - raise ValueError("Lengths must match to compare") other, mask = coerce_to_array(other, copy=False) + elif isinstance(other, np.bool_): + other = other.item() + + if other_is_scalar and not (other is libmissing.NA or lib.is_bool(other)): + raise TypeError( + "'other' should be pandas.NA or a bool. Got {} instead.".format( + type(other).__name__ + ) + ) - # numpy will show a DeprecationWarning on invalid elementwise - # comparisons, this will raise in the future - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "elementwise", FutureWarning) - with np.errstate(all="ignore"): - result = op(self._data, other) + if not other_is_scalar and len(self) != len(other): + raise ValueError("Lengths must match to compare") - # nans propagate - if mask is None: - mask = self._mask - else: - mask = self._mask | mask + if op.__name__ in {"or_", "ror_"}: + result, mask = ops.kleene_or(self._data, other, self._mask, mask) + elif op.__name__ in {"and_", "rand_"}: + result, mask = ops.kleene_and(self._data, other, self._mask, mask) + elif op.__name__ in {"xor", "rxor"}: + result, mask = ops.kleene_xor(self._data, other, self._mask, mask) return BooleanArray(result, mask) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index d14fb040c4e30..f3c01efed6d43 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -39,6 +39,7 @@ _op_descriptions, ) from 
pandas.core.ops.invalid import invalid_comparison # noqa:F401 +from pandas.core.ops.mask_ops import kleene_and, kleene_or, kleene_xor # noqa: F401 from pandas.core.ops.methods import ( # noqa:F401 add_flex_arithmetic_methods, add_special_arithmetic_methods, diff --git a/pandas/core/ops/dispatch.py b/pandas/core/ops/dispatch.py index c39f4d6d9698d..016a89eb56da3 100644 --- a/pandas/core/ops/dispatch.py +++ b/pandas/core/ops/dispatch.py @@ -189,6 +189,9 @@ def maybe_dispatch_ufunc_to_dunder_op( "ge", "remainder", "matmul", + "or", + "xor", + "and", } aliases = { "subtract": "sub", @@ -204,6 +207,9 @@ def maybe_dispatch_ufunc_to_dunder_op( "less_equal": "le", "greater": "gt", "greater_equal": "ge", + "bitwise_or": "or", + "bitwise_and": "and", + "bitwise_xor": "xor", } # For op(., Array) -> Array.__r{op}__ diff --git a/pandas/core/ops/mask_ops.py b/pandas/core/ops/mask_ops.py new file mode 100644 index 0000000000000..fd91e78451da9 --- /dev/null +++ b/pandas/core/ops/mask_ops.py @@ -0,0 +1,178 @@ +""" +Ops for masked arrays. +""" +from typing import Optional, Union + +import numpy as np + +from pandas._libs import lib, missing as libmissing + + +def kleene_or( + left: Union[bool, np.ndarray], + right: Union[bool, np.ndarray], + left_mask: Optional[np.ndarray], + right_mask: Optional[np.ndarray], +): + """ + Boolean ``or`` using Kleene logic. + + Values are NA where we have ``NA | NA`` or ``NA | False``. + ``NA | True`` is considered True. + + Parameters + ---------- + left, right : ndarray, NA, or bool + The values of the array. + left_mask, right_mask : ndarray, optional + The masks. Only one of these may be None, which implies that + the associated `left` or `right` value is a scalar. + + Returns + ------- + result, mask: ndarray[bool] + The result of the logical or, and the new mask. + """ + # To reduce the number of cases, we ensure that `left` & `left_mask` + # always come from an array, not a scalar. 
This is safe, because + # A | B == B | A + if left_mask is None: + return kleene_or(right, left, right_mask, left_mask) + + assert isinstance(left, np.ndarray) + + raise_for_nan(right, method="or") + + if right is libmissing.NA: + result = left.copy() + else: + result = left | right + + if right_mask is not None: + # output is unknown where (False & NA), (NA & False), (NA & NA) + left_false = ~(left | left_mask) + right_false = ~(right | right_mask) + mask = ( + (left_false & right_mask) + | (right_false & left_mask) + | (left_mask & right_mask) + ) + else: + if right is True: + mask = np.zeros_like(left_mask) + elif right is libmissing.NA: + mask = (~left & ~left_mask) | left_mask + else: + # False + mask = left_mask.copy() + + return result, mask + + +def kleene_xor( + left: Union[bool, np.ndarray], + right: Union[bool, np.ndarray], + left_mask: Optional[np.ndarray], + right_mask: Optional[np.ndarray], +): + """ + Boolean ``xor`` using Kleene logic. + + This is the same as ``or``, with the following adjustments + + * True, True -> False + * True, NA -> NA + + Parameters + ---------- + left, right : ndarray, NA, or bool + The values of the array. + left_mask, right_mask : ndarray, optional + The masks. Only one of these may be None, which implies that + the associated `left` or `right` value is a scalar. + + Returns + ------- + result, mask: ndarray[bool] + The result of the logical xor, and the new mask. 
+ """ + if left_mask is None: + return kleene_xor(right, left, right_mask, left_mask) + + raise_for_nan(right, method="xor") + if right is libmissing.NA: + result = np.zeros_like(left) + else: + result = left ^ right + + if right_mask is None: + if right is libmissing.NA: + mask = np.ones_like(left_mask) + else: + mask = left_mask.copy() + else: + mask = left_mask | right_mask + + return result, mask + + +def kleene_and( + left: Union[bool, libmissing.NAType, np.ndarray], + right: Union[bool, libmissing.NAType, np.ndarray], + left_mask: Optional[np.ndarray], + right_mask: Optional[np.ndarray], +): + """ + Boolean ``and`` using Kleene logic. + + Values are ``NA`` for ``NA & NA`` or ``True & NA``. + + Parameters + ---------- + left, right : ndarray, NA, or bool + The values of the array. + left_mask, right_mask : ndarray, optional + The masks. Only one of these may be None, which implies that + the associated `left` or `right` value is a scalar. + + Returns + ------- + result, mask: ndarray[bool] + The result of the logical and, and the new mask. + """ + # To reduce the number of cases, we ensure that `left` & `left_mask` + # always come from an array, not a scalar. 
This is safe, because + # A & B == B & A + if left_mask is None: + return kleene_and(right, left, right_mask, left_mask) + + assert isinstance(left, np.ndarray) + raise_for_nan(right, method="and") + + if right is libmissing.NA: + result = np.zeros_like(left) + else: + result = left & right + + if right_mask is None: + # Scalar `right` + if right is libmissing.NA: + mask = (left & ~left_mask) | left_mask + + else: + mask = left_mask.copy() + if right is False: + # unmask everything + mask[:] = False + else: + # unmask where either left or right is False + left_false = ~(left | left_mask) + right_false = ~(right | right_mask) + mask = (left_mask & ~right_false) | (right_mask & ~left_false) + + return result, mask + + +def raise_for_nan(value, method): + if lib.is_float(value) and np.isnan(value): + raise ValueError(f"Cannot perform logical '{method}' with floating NaN") diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py index a13bb8edc8e48..d2f1a4a60ada8 100644 --- a/pandas/tests/arrays/test_boolean.py +++ b/pandas/tests/arrays/test_boolean.py @@ -356,6 +356,13 @@ def test_ufunc_reduce_raises(values): class TestLogicalOps(BaseOpsUtil): + def test_numpy_scalars_ok(self, all_logical_operators): + a = pd.array([True, False, None], dtype="boolean") + op = getattr(a, all_logical_operators) + + tm.assert_extension_array_equal(op(True), op(np.bool(True))) + tm.assert_extension_array_equal(op(False), op(np.bool(False))) + def get_op_from_name(self, op_name): short_opname = op_name.strip("_") short_opname = short_opname if "xor" in short_opname else short_opname + "_" @@ -368,43 +375,205 @@ def get_op_from_name(self, op_name): return op - def _compare_other(self, data, op_name, other): - op = self.get_op_from_name(op_name) - - # array - result = pd.Series(op(data, other)) - expected = pd.Series(op(data._data, other), dtype="boolean") + def test_empty_ok(self, all_logical_operators): + a = pd.array([], dtype="boolean") + op_name = 
all_logical_operators + result = getattr(a, op_name)(True) + tm.assert_extension_array_equal(a, result) - # fill the nan locations - expected[data._mask] = np.nan + result = getattr(a, op_name)(False) + tm.assert_extension_array_equal(a, result) - tm.assert_series_equal(result, expected) + # TODO: pd.NA + # result = getattr(a, op_name)(pd.NA) + # tm.assert_extension_array_equal(a, result) - # series - s = pd.Series(data) - result = op(s, other) + def test_logical_length_mismatch_raises(self, all_logical_operators): + op_name = all_logical_operators + a = pd.array([True, False, None], dtype="boolean") + msg = "Lengths must match to compare" - expected = pd.Series(data._data) - expected = op(expected, other) - expected = pd.Series(expected, dtype="boolean") + with pytest.raises(ValueError, match=msg): + getattr(a, op_name)([True, False]) - # fill the nan locations - expected[data._mask] = np.nan + with pytest.raises(ValueError, match=msg): + getattr(a, op_name)(np.array([True, False])) - tm.assert_series_equal(result, expected) + with pytest.raises(ValueError, match=msg): + getattr(a, op_name)(pd.array([True, False], dtype="boolean")) - def test_scalar(self, data, all_logical_operators): + def test_logical_nan_raises(self, all_logical_operators): op_name = all_logical_operators - self._compare_other(data, op_name, True) + a = pd.array([True, False, None], dtype="boolean") + msg = "Got float instead" - def test_array(self, data, all_logical_operators): - op_name = all_logical_operators - other = pd.array([True] * len(data), dtype="boolean") - self._compare_other(data, op_name, other) - other = np.array([True] * len(data)) - self._compare_other(data, op_name, other) - other = pd.Series([True] * len(data), dtype="boolean") - self._compare_other(data, op_name, other) + with pytest.raises(TypeError, match=msg): + getattr(a, op_name)(np.nan) + + @pytest.mark.parametrize("other", ["a", 1]) + def test_non_bool_or_na_other_raises(self, other, all_logical_operators): + a = 
pd.array([True, False], dtype="boolean") + with pytest.raises(TypeError, match=str(type(other).__name__)): + getattr(a, all_logical_operators)(other) + + def test_kleene_or(self): + # A clear test of behavior. + a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + b = pd.array([True, False, None] * 3, dtype="boolean") + result = a | b + expected = pd.array( + [True, True, True, True, False, None, True, None, None], dtype="boolean" + ) + tm.assert_extension_array_equal(result, expected) + + result = b | a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + ) + tm.assert_extension_array_equal( + b, pd.array([True, False, None] * 3, dtype="boolean") + ) + + @pytest.mark.parametrize( + "other, expected", + [ + (pd.NA, [True, None, None]), + (True, [True, True, True]), + (np.bool_(True), [True, True, True]), + (False, [True, False, None]), + (np.bool_(False), [True, False, None]), + ], + ) + def test_kleene_or_scalar(self, other, expected): + # TODO: test True & False + a = pd.array([True, False, None], dtype="boolean") + result = a | other + expected = pd.array(expected, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = other | a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True, False, None], dtype="boolean") + ) + + def test_kleene_and(self): + # A clear test of behavior. 
+ a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + b = pd.array([True, False, None] * 3, dtype="boolean") + result = a & b + expected = pd.array( + [True, False, None, False, False, False, None, False, None], dtype="boolean" + ) + tm.assert_extension_array_equal(result, expected) + + result = b & a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + ) + tm.assert_extension_array_equal( + b, pd.array([True, False, None] * 3, dtype="boolean") + ) + + @pytest.mark.parametrize( + "other, expected", + [ + (pd.NA, [None, False, None]), + (True, [True, False, None]), + (False, [False, False, False]), + (np.bool_(True), [True, False, None]), + (np.bool_(False), [False, False, False]), + ], + ) + def test_kleene_and_scalar(self, other, expected): + a = pd.array([True, False, None], dtype="boolean") + result = a & other + expected = pd.array(expected, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = other & a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True, False, None], dtype="boolean") + ) + + def test_kleene_xor(self): + a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + b = pd.array([True, False, None] * 3, dtype="boolean") + result = a ^ b + expected = pd.array( + [False, True, None, True, False, None, None, None, None], dtype="boolean" + ) + tm.assert_extension_array_equal(result, expected) + + result = b ^ a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + ) + tm.assert_extension_array_equal( + b, pd.array([True, False, None] * 3, dtype="boolean") + ) + + 
@pytest.mark.parametrize( + "other, expected", + [ + (pd.NA, [None, None, None]), + (True, [False, True, None]), + (np.bool_(True), [False, True, None]), + (np.bool_(False), [True, False, None]), + ], + ) + def test_kleene_xor_scalar(self, other, expected): + a = pd.array([True, False, None], dtype="boolean") + result = a ^ other + expected = pd.array(expected, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = other ^ a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True, False, None], dtype="boolean") + ) + + @pytest.mark.parametrize( + "other", [True, False, pd.NA, [True, False, None] * 3], + ) + def test_no_masked_assumptions(self, other, all_logical_operators): + # The logical operations should not assume that masked values are False! + a = pd.arrays.BooleanArray( + np.array([True, True, True, False, False, False, True, False, True]), + np.array([False] * 6 + [True, True, True]), + ) + b = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + if isinstance(other, list): + other = pd.array(other, dtype="boolean") + + result = getattr(a, all_logical_operators)(other) + expected = getattr(b, all_logical_operators)(other) + tm.assert_extension_array_equal(result, expected) + + if isinstance(other, BooleanArray): + other._data[other._mask] = True + a._data[a._mask] = False + + result = getattr(a, all_logical_operators)(other) + expected = getattr(b, all_logical_operators)(other) + tm.assert_extension_array_equal(result, expected) class TestComparisonOps(BaseOpsUtil):