From 03f83bd7d5419c33b139c5a9247ee4372365c62e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 12 Nov 2019 22:33:18 +0100 Subject: [PATCH 01/18] ENH: add NA scalar for missing value indicator --- pandas/__init__.py | 1 + pandas/core/api.py | 1 + pandas/core/na_scalar.py | 113 ++++++++++++++++++++++++++ pandas/tests/scalar/test_na_scalar.py | 108 ++++++++++++++++++++++++ 4 files changed, 223 insertions(+) create mode 100644 pandas/core/na_scalar.py create mode 100644 pandas/tests/scalar/test_na_scalar.py diff --git a/pandas/__init__.py b/pandas/__init__.py index 5d163e411c0ac..3787b5115276b 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -68,6 +68,7 @@ DatetimeTZDtype, StringDtype, # missing + NA, isna, isnull, notna, diff --git a/pandas/core/api.py b/pandas/core/api.py index 04f2f84c92a15..136b8535c9257 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -45,6 +45,7 @@ from pandas.core.indexes.period import Period, period_range from pandas.core.indexes.timedeltas import Timedelta, timedelta_range from pandas.core.indexing import IndexSlice +from pandas.core.na_scalar import NA from pandas.core.reshape.reshape import get_dummies from pandas.core.series import Series from pandas.core.tools.datetimes import to_datetime diff --git a/pandas/core/na_scalar.py b/pandas/core/na_scalar.py new file mode 100644 index 0000000000000..56e8c5763bbd9 --- /dev/null +++ b/pandas/core/na_scalar.py @@ -0,0 +1,113 @@ +import numbers + + +def _create_binary_propagating_op(name): + def method(self, other): + if isinstance(other, numbers.Number) or other is NA or isinstance(other, str): + return NA + + return NotImplemented + + method.__name__ = name + return method + + +def _create_unary_propagating_op(name): + def method(self): + return NA + + method.__name__ = name + return method + + +class NAType: + + _instance = None + + def __new__(cls, *args, **kwargs): + if NAType._instance is None: + NAType._instance = object.__new__(cls, *args, **kwargs) + 
return NAType._instance + + def __repr__(self) -> str: + return "NA" + + def __str__(self) -> str: + return "NA" + + def __bool__(self): + raise TypeError("boolean value of NA is ambiguous") + + def __hash__(self): + # TODO what should we use here to hash? + return 0 + + # Binary arithmetic and comparison ops -> propagate + + __add__ = _create_binary_propagating_op("__add__") + __radd__ = _create_binary_propagating_op("__radd__") + __sub__ = _create_binary_propagating_op("__sub__") + __rsub__ = _create_binary_propagating_op("__rsub__") + __mul__ = _create_binary_propagating_op("__mul__") + __rmul__ = _create_binary_propagating_op("__rmul__") + __matmul__ = _create_binary_propagating_op("__matmul__") + __rmatmul__ = _create_binary_propagating_op("__rmatmul__") + __truediv__ = _create_binary_propagating_op("__truediv__") + __rtruediv__ = _create_binary_propagating_op("__rtruediv__") + __floordiv__ = _create_binary_propagating_op("__floordiv__") + __rfloordiv__ = _create_binary_propagating_op("__rfloordiv__") + __mod__ = _create_binary_propagating_op("__mod__") + __rmod__ = _create_binary_propagating_op("__rmod__") + __divmod__ = _create_binary_propagating_op("__divmod__") + __rdivmod__ = _create_binary_propagating_op("__rdivmod__") + __pow__ = _create_binary_propagating_op("__pow__") + __rpow__ = _create_binary_propagating_op("__rpow__") + + # __lshift__ = _create_binary_propagating_op("__add__") + # __rshift__ = _create_binary_propagating_op("__add__") + + __eq__ = _create_binary_propagating_op("__eq__") + __ne__ = _create_binary_propagating_op("__ne__") + __le__ = _create_binary_propagating_op("__le__") + __lt__ = _create_binary_propagating_op("__lt__") + __gt__ = _create_binary_propagating_op("__gt__") + __ge__ = _create_binary_propagating_op("__ge__") + + # Unary ops + + __neg__ = _create_unary_propagating_op("__neg__") + __pos__ = _create_unary_propagating_op("__pos__") + __abs__ = _create_unary_propagating_op("__abs__") + __invert__ = 
_create_unary_propagating_op("__invert__") + + # Logical ops using Kleene logic + + def __and__(self, other): + if other is False: + return False + elif other is True or other is NA: + return NA + else: + return NotImplemented + + __rand__ = __and__ + + def __or__(self, other): + if other is True: + return True + elif other is False or other is NA: + return NA + else: + return NotImplemented + + __ror__ = __or__ + + def __xor__(self, other): + if other is False or other is True or other is NA: + return NA + return NotImplemented + + __rxor__ = __xor__ + + +NA = NAType() diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py new file mode 100644 index 0000000000000..9a88b74cac20e --- /dev/null +++ b/pandas/tests/scalar/test_na_scalar.py @@ -0,0 +1,108 @@ +import numpy as np +import pytest + +from pandas.core.na_scalar import NA + + +def test_singleton(): + assert NA is NA + new_NA = type(NA)() + assert new_NA is NA + + +def test_repr(): + assert repr(NA) == "NA" + assert str(NA) == "NA" + + +def test_truthiness(): + with pytest.raises(TypeError): + bool(NA) + + with pytest.raises(TypeError): + not NA + + +def test_hashable(): + assert hash(NA) == hash(NA) + d = {NA: "test"} + assert d[NA] == "test" + + +def test_arithmetic_ops(all_arithmetic_functions): + op = all_arithmetic_functions + + for other in [NA, 1, 1.0, "a", np.int64(1)]: + if op.__name__ == "rmod" and isinstance(other, str): + continue + assert op(NA, other) is NA + + +def test_comparison_ops(): + + for other in [NA, 1, 1.0, "a", np.int64(1)]: + assert (NA == other) is NA + assert (NA != other) is NA + assert (NA > other) is NA + assert (NA >= other) is NA + assert (NA < other) is NA + assert (NA <= other) is NA + + if isinstance(other, np.int64): + # for numpy scalars we get a deprecation warning and False as result + # for equality or error for larger/lesser than + continue + + assert (other == NA) is NA + assert (other != NA) is NA + assert (other > NA) is NA + 
assert (other >= NA) is NA + assert (other < NA) is NA + assert (other <= NA) is NA + + +def test_unary_ops(): + assert +NA is NA + assert -NA is NA + assert abs(NA) is NA + assert ~NA is NA + + +def test_logical_and(): + + assert NA & True is NA + assert True & NA is NA + assert NA & False is False + assert False & NA is False + assert NA & NA is NA + + with pytest.raises(TypeError): + NA & 5 + + +def test_logical_or(): + + assert NA | True is True + assert True | NA is True + assert NA | False is NA + assert False | NA is NA + assert NA | NA is NA + + with pytest.raises(TypeError): + NA | 5 + + +def test_logical_xor(): + + assert NA ^ True is NA + assert True ^ NA is NA + assert NA ^ False is NA + assert False ^ NA is NA + assert NA ^ NA is NA + + with pytest.raises(TypeError): + NA ^ 5 + + +def test_logical_not(): + assert ~NA is NA From c1797d54a168780a37e51f77419f12b41c035b6a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 13 Nov 2019 15:40:57 +0100 Subject: [PATCH 02/18] add np.nan to arithmetic/comparison tests --- pandas/tests/scalar/test_na_scalar.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index 9a88b74cac20e..02f6c2386249f 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -32,7 +32,7 @@ def test_hashable(): def test_arithmetic_ops(all_arithmetic_functions): op = all_arithmetic_functions - for other in [NA, 1, 1.0, "a", np.int64(1)]: + for other in [NA, 1, 1.0, "a", np.int64(1), np.nan]: if op.__name__ == "rmod" and isinstance(other, str): continue assert op(NA, other) is NA @@ -40,7 +40,7 @@ def test_arithmetic_ops(all_arithmetic_functions): def test_comparison_ops(): - for other in [NA, 1, 1.0, "a", np.int64(1)]: + for other in [NA, 1, 1.0, "a", np.int64(1), np.nan]: assert (NA == other) is NA assert (NA != other) is NA assert (NA > other) is NA From 
3339eaacc2cdeb87586f94a9f71b27245c40c845 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 13 Nov 2019 15:41:29 +0100 Subject: [PATCH 03/18] use id(self) for hash --- pandas/core/na_scalar.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/na_scalar.py b/pandas/core/na_scalar.py index 56e8c5763bbd9..e60c3f79c8e67 100644 --- a/pandas/core/na_scalar.py +++ b/pandas/core/na_scalar.py @@ -39,8 +39,7 @@ def __bool__(self): raise TypeError("boolean value of NA is ambiguous") def __hash__(self): - # TODO what should we use here to hash? - return 0 + return id(self) # Binary arithmetic and comparison ops -> propagate From e9d4d6aa59cf8b91c05ac4397586f1ca3b0a25a7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 13 Nov 2019 16:24:46 +0100 Subject: [PATCH 04/18] fix api test --- pandas/tests/api/test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 3a8e263ac2a6d..489079e3ab3c7 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -46,7 +46,7 @@ class TestPDApi(Base): deprecated_modules = [] # type: List[str] # misc - misc = ["IndexSlice", "NaT"] + misc = ["IndexSlice", "NaT", "NA"] # top-level classes classes = [ From 4450d2dc9491f51076e14384bc13702d8a7f6bf0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 13 Nov 2019 22:30:38 +0100 Subject: [PATCH 05/18] move to cython --- pandas/_libs/lib.pyx | 4 +- pandas/_libs/missing.pxd | 5 ++ pandas/_libs/missing.pyx | 125 +++++++++++++++++++++++++- pandas/core/api.py | 3 +- pandas/core/na_scalar.py | 112 ----------------------- pandas/tests/scalar/test_na_scalar.py | 22 ++++- 6 files changed, 154 insertions(+), 117 deletions(-) delete mode 100644 pandas/core/na_scalar.py diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 7d65cb52bce1e..1c5680dbca431 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -58,10 +58,9 @@ from 
pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64 from pandas._libs.tslibs.timezones cimport get_timezone, tz_compare from pandas._libs.missing cimport ( - checknull, isnaobj, is_null_datetime64, is_null_timedelta64, is_null_period + checknull, isnaobj, is_null_datetime64, is_null_timedelta64, is_null_period, C_NA ) - # constants that will be compared to potentially arbitrarily large # python int cdef: @@ -161,6 +160,7 @@ def is_scalar(val: object) -> bool: or PyTime_Check(val) # We differ from numpy, which claims that None is not scalar; # see np.isscalar + or val is C_NA or val is None or isinstance(val, (Fraction, Number)) or util.is_period_object(val) diff --git a/pandas/_libs/missing.pxd b/pandas/_libs/missing.pxd index d0dd306680ae8..d4303ac28b9a5 100644 --- a/pandas/_libs/missing.pxd +++ b/pandas/_libs/missing.pxd @@ -9,3 +9,8 @@ cpdef ndarray[uint8_t] isnaobj(ndarray arr) cdef bint is_null_datetime64(v) cdef bint is_null_timedelta64(v) cdef bint is_null_period(v) + +cdef class C_NAType: + pass + +cdef C_NAType C_NA diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 9568ddb7fe53f..c994b6e87cb42 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -44,7 +44,7 @@ cpdef bint checknull(object val): The difference between `checknull` and `checknull_old` is that `checknull` does *not* consider INF or NEGINF to be NA. 
""" - return is_null_datetimelike(val, inat_is_null=False) + return val is C_NA or is_null_datetimelike(val, inat_is_null=False) cpdef bint checknull_old(object val): @@ -278,3 +278,126 @@ cdef inline bint is_null_period(v): # determine if we have a null for a Period (or integer versions), # excluding np.datetime64('nat') and np.timedelta64('nat') return checknull_with_nat(v) + + +# ----------------------------------------------------------------------------- +# Implementation of NA singleton + + +def _create_binary_propagating_op(name): + import numbers + + def method(self, other): + if isinstance(other, numbers.Number) or other is NA or isinstance(other, str): + return NA + + return NotImplemented + + method.__name__ = name + return method + + +def _create_unary_propagating_op(name): + def method(self): + return NA + + method.__name__ = name + return method + + +cdef class C_NAType: + pass + + +class NAType(C_NAType): + """ + NA ("not available") missing value indicator. + """ + + _instance = None + + def __new__(cls, *args, **kwargs): + if NAType._instance is None: + NAType._instance = C_NAType.__new__(cls, *args, **kwargs) + return NAType._instance + + def __repr__(self) -> str: + return "NA" + + def __str__(self) -> str: + return "NA" + + def __bool__(self): + raise TypeError("boolean value of NA is ambiguous") + + def __hash__(self): + return id(self) + + # Binary arithmetic and comparison ops -> propagate + + __add__ = _create_binary_propagating_op("__add__") + __radd__ = _create_binary_propagating_op("__radd__") + __sub__ = _create_binary_propagating_op("__sub__") + __rsub__ = _create_binary_propagating_op("__rsub__") + __mul__ = _create_binary_propagating_op("__mul__") + __rmul__ = _create_binary_propagating_op("__rmul__") + __matmul__ = _create_binary_propagating_op("__matmul__") + __rmatmul__ = _create_binary_propagating_op("__rmatmul__") + __truediv__ = _create_binary_propagating_op("__truediv__") + __rtruediv__ = 
_create_binary_propagating_op("__rtruediv__") + __floordiv__ = _create_binary_propagating_op("__floordiv__") + __rfloordiv__ = _create_binary_propagating_op("__rfloordiv__") + __mod__ = _create_binary_propagating_op("__mod__") + __rmod__ = _create_binary_propagating_op("__rmod__") + __divmod__ = _create_binary_propagating_op("__divmod__") + __rdivmod__ = _create_binary_propagating_op("__rdivmod__") + __pow__ = _create_binary_propagating_op("__pow__") + __rpow__ = _create_binary_propagating_op("__rpow__") + # __lshift__ and __rshift__ are not implemented + + __eq__ = _create_binary_propagating_op("__eq__") + __ne__ = _create_binary_propagating_op("__ne__") + __le__ = _create_binary_propagating_op("__le__") + __lt__ = _create_binary_propagating_op("__lt__") + __gt__ = _create_binary_propagating_op("__gt__") + __ge__ = _create_binary_propagating_op("__ge__") + + # Unary ops + + __neg__ = _create_unary_propagating_op("__neg__") + __pos__ = _create_unary_propagating_op("__pos__") + __abs__ = _create_unary_propagating_op("__abs__") + __invert__ = _create_unary_propagating_op("__invert__") + + # Logical ops using Kleene logic + + def __and__(self, other): + if other is False: + return False + elif other is True or other is NA: + return NA + else: + return NotImplemented + + __rand__ = __and__ + + def __or__(self, other): + if other is True: + return True + elif other is False or other is NA: + return NA + else: + return NotImplemented + + __ror__ = __or__ + + def __xor__(self, other): + if other is False or other is True or other is NA: + return NA + return NotImplemented + + __rxor__ = __xor__ + + +C_NA = NAType() # C-visible +NA = C_NA # Python-visible diff --git a/pandas/core/api.py b/pandas/core/api.py index 136b8535c9257..b6c621a369e19 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -45,7 +45,6 @@ from pandas.core.indexes.period import Period, period_range from pandas.core.indexes.timedeltas import Timedelta, timedelta_range from pandas.core.indexing 
import IndexSlice -from pandas.core.na_scalar import NA from pandas.core.reshape.reshape import get_dummies from pandas.core.series import Series from pandas.core.tools.datetimes import to_datetime @@ -57,3 +56,5 @@ # DataFrame needs to be imported after NamedAgg to avoid a circular import from pandas.core.frame import DataFrame # isort:skip + +from pandas._libs.missing import NA diff --git a/pandas/core/na_scalar.py b/pandas/core/na_scalar.py deleted file mode 100644 index e60c3f79c8e67..0000000000000 --- a/pandas/core/na_scalar.py +++ /dev/null @@ -1,112 +0,0 @@ -import numbers - - -def _create_binary_propagating_op(name): - def method(self, other): - if isinstance(other, numbers.Number) or other is NA or isinstance(other, str): - return NA - - return NotImplemented - - method.__name__ = name - return method - - -def _create_unary_propagating_op(name): - def method(self): - return NA - - method.__name__ = name - return method - - -class NAType: - - _instance = None - - def __new__(cls, *args, **kwargs): - if NAType._instance is None: - NAType._instance = object.__new__(cls, *args, **kwargs) - return NAType._instance - - def __repr__(self) -> str: - return "NA" - - def __str__(self) -> str: - return "NA" - - def __bool__(self): - raise TypeError("boolean value of NA is ambiguous") - - def __hash__(self): - return id(self) - - # Binary arithmetic and comparison ops -> propagate - - __add__ = _create_binary_propagating_op("__add__") - __radd__ = _create_binary_propagating_op("__radd__") - __sub__ = _create_binary_propagating_op("__sub__") - __rsub__ = _create_binary_propagating_op("__rsub__") - __mul__ = _create_binary_propagating_op("__mul__") - __rmul__ = _create_binary_propagating_op("__rmul__") - __matmul__ = _create_binary_propagating_op("__matmul__") - __rmatmul__ = _create_binary_propagating_op("__rmatmul__") - __truediv__ = _create_binary_propagating_op("__truediv__") - __rtruediv__ = _create_binary_propagating_op("__rtruediv__") - __floordiv__ = 
_create_binary_propagating_op("__floordiv__") - __rfloordiv__ = _create_binary_propagating_op("__rfloordiv__") - __mod__ = _create_binary_propagating_op("__mod__") - __rmod__ = _create_binary_propagating_op("__rmod__") - __divmod__ = _create_binary_propagating_op("__divmod__") - __rdivmod__ = _create_binary_propagating_op("__rdivmod__") - __pow__ = _create_binary_propagating_op("__pow__") - __rpow__ = _create_binary_propagating_op("__rpow__") - - # __lshift__ = _create_binary_propagating_op("__add__") - # __rshift__ = _create_binary_propagating_op("__add__") - - __eq__ = _create_binary_propagating_op("__eq__") - __ne__ = _create_binary_propagating_op("__ne__") - __le__ = _create_binary_propagating_op("__le__") - __lt__ = _create_binary_propagating_op("__lt__") - __gt__ = _create_binary_propagating_op("__gt__") - __ge__ = _create_binary_propagating_op("__ge__") - - # Unary ops - - __neg__ = _create_unary_propagating_op("__neg__") - __pos__ = _create_unary_propagating_op("__pos__") - __abs__ = _create_unary_propagating_op("__abs__") - __invert__ = _create_unary_propagating_op("__invert__") - - # Logical ops using Kleene logic - - def __and__(self, other): - if other is False: - return False - elif other is True or other is NA: - return NA - else: - return NotImplemented - - __rand__ = __and__ - - def __or__(self, other): - if other is True: - return True - elif other is False or other is NA: - return NA - else: - return NotImplemented - - __ror__ = __or__ - - def __xor__(self, other): - if other is False or other is True or other is NA: - return NA - return NotImplemented - - __rxor__ = __xor__ - - -NA = NAType() diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index 02f6c2386249f..fdf70495c45ac 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -1,7 +1,12 @@ import numpy as np import pytest -from pandas.core.na_scalar import NA +from pandas._libs.missing import NA + +from 
pandas.core.dtypes.common import is_scalar + +import pandas as pd +import pandas.util.testing as tm def test_singleton(): @@ -106,3 +111,18 @@ def test_logical_xor(): def test_logical_not(): assert ~NA is NA + + +def test_is_scalar(): + assert is_scalar(NA) is True + + +def test_isna(): + assert pd.isna(NA) is True + assert pd.notna(NA) is False + + +def test_series_isna(): + s = pd.Series([1, NA], dtype=object) + expected = pd.Series([False, True]) + tm.assert_series_equal(s.isna(), expected) From 1849a23030c045dc629c6ccca1951c78717f2a97 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 14 Nov 2019 09:20:46 +0100 Subject: [PATCH 06/18] add examples to isna/notna docstring --- pandas/core/dtypes/missing.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 0a8f636b4cb2a..fdb86b619d779 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -79,6 +79,9 @@ def isna(obj): >>> pd.isna('dog') False + >>> pd.isna(pd.NA) + True + >>> pd.isna(np.nan) True @@ -323,6 +326,9 @@ def notna(obj): >>> pd.notna('dog') True + >>> pd.notna(pd.NA) + False + >>> pd.notna(np.nan) False From c72e3eeae2a5ce05d110c087ba9411ea522c37f8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 14 Nov 2019 07:53:03 -0600 Subject: [PATCH 07/18] Use NA scalar in string dtype (#1) --- pandas/_libs/lib.pyx | 2 +- pandas/_libs/testing.pyx | 8 +++-- pandas/core/arrays/numpy_.py | 3 ++ pandas/core/arrays/string_.py | 37 ++++++++++------------ pandas/core/dtypes/missing.py | 5 +++ pandas/core/strings.py | 7 ++++ pandas/tests/arrays/string_/test_string.py | 11 ++++++- pandas/tests/extension/test_string.py | 8 ++--- pandas/tests/test_strings.py | 8 +++++ 9 files changed, 59 insertions(+), 30 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 1c5680dbca431..e2fb4a3635387 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1500,7 +1500,7 @@ cdef class 
Validator: f'must define is_value_typed') cdef bint is_valid_null(self, object value) except -1: - return value is None or util.is_nan(value) + return value is None or value is C_NA or util.is_nan(value) cdef bint is_array_typed(self) except -1: return False diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index f848310d961e1..adbc25b96ea42 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -180,13 +180,15 @@ cpdef assert_almost_equal(a, b, # classes can't be the same, to raise error assert_class_equal(a, b, obj=obj) - if a == b: - # object comparison - return True if isna(a) and isna(b): # TODO: Should require same-dtype NA? # nan / None comparison return True + + if a == b: + # object comparison + return True + if is_comparable_as_number(a) and is_comparable_as_number(b): if array_equivalent(a, b, strict_nan=True): # inf comparison diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 6f2bb095a014d..8ba5cd7565850 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -278,6 +278,9 @@ def fillna(self, value=None, method=None, limit=None): return new_values def take(self, indices, allow_fill=False, fill_value=None): + if fill_value is None: + # Primarily for subclasses + fill_value = self.dtype.na_value result = take( self._ndarray, indices, allow_fill=allow_fill, fill_value=fill_value ) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 7c487b227de20..bb2a9a22e601f 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,9 +1,9 @@ import operator -from typing import TYPE_CHECKING, Type +from typing import Type import numpy as np -from pandas._libs import lib +from pandas._libs import lib, missing as libmissing from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import pandas_dtype @@ -17,9 +17,6 @@ from pandas.core.construction import extract_array from pandas.core.missing import isna -if 
TYPE_CHECKING: - from pandas._typing import Scalar - @register_extension_dtype class StringDtype(ExtensionDtype): @@ -50,16 +47,8 @@ class StringDtype(ExtensionDtype): StringDtype """ - @property - def na_value(self) -> "Scalar": - """ - StringDtype uses :attr:`numpy.nan` as the missing NA value. - - .. warning:: - - `na_value` may change in a future release. - """ - return np.nan + #: StringDtype.na_value uses pandas.NA + na_value = libmissing.NA @property def type(self) -> Type: @@ -172,10 +161,10 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): if dtype: assert dtype == "string" result = super()._from_sequence(scalars, dtype=object, copy=copy) - # convert None to np.nan + # Standardize all missing-like values to NA # TODO: it would be nice to do this in _validate / lib.is_string_array # We are already doing a scan over the values there. - result[result.isna()] = np.nan + result[result.isna()] = StringDtype.na_value return result @classmethod @@ -192,6 +181,12 @@ def __arrow_array__(self, type=None): type = pa.string() return pa.array(self._ndarray, type=type, from_pandas=True) + def _values_for_factorize(self): + arr = self._ndarray.copy() + mask = self.isna() + arr[mask] = -1 + return arr, -1 + def __setitem__(self, key, value): value = extract_array(value, extract_numpy=True) if isinstance(value, type(self)): @@ -205,9 +200,9 @@ def __setitem__(self, key, value): # validate new items if scalar_value: - if scalar_value is None: - value = np.nan - elif not (isinstance(value, str) or np.isnan(value)): + if isna(value): + value = StringDtype.na_value + elif not isinstance(value, str): raise ValueError( "Cannot set non-string value '{}' into a StringArray.".format(value) ) @@ -265,7 +260,7 @@ def method(self, other): other = other[valid] result = np.empty_like(self._ndarray, dtype="object") - result[mask] = np.nan + result[mask] = StringDtype.na_value result[valid] = op(self._ndarray[valid], other) if op.__name__ in {"add", "radd", "mul", "rmul"}: diff 
--git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index fdb86b619d779..2d57997c36ef1 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -446,6 +446,9 @@ def array_equivalent(left, right, strict_nan=False): if left_value is NaT and right_value is not NaT: return False + elif left_value is libmissing.NA and right_value is not libmissing.NA: + return False + elif isinstance(left_value, float) and np.isnan(left_value): if not isinstance(right_value, float) or not np.isnan(right_value): return False @@ -457,6 +460,8 @@ def array_equivalent(left, right, strict_nan=False): if "Cannot compare tz-naive" in str(err): # tzawareness compat failure, see GH#28507 return False + elif "boolean value of NA is ambiguous" in str(err): + return False raise return True diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 7194d1cf08e4a..0a0f1c4d2527e 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -123,6 +123,13 @@ def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object): if na_mask: mask = isna(arr) convert = not np.all(mask) + if convert: + # XXX: This converts pd.NA to np.nan to match the output of + # object-dtype ops that return numeric, like str.count + # We probably want to return Int64Dtype instead. 
+ # NA -> nan + arr[mask] = np.nan + try: result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert) except (TypeError, AttributeError) as e: diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index efe2b4e0b2deb..3029f3007929b 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -12,7 +12,7 @@ def test_none_to_nan(): a = pd.arrays.StringArray._from_sequence(["a", None, "b"]) assert a[1] is not None - assert np.isnan(a[1]) + assert a[1] is pd.NA def test_setitem_validates(): @@ -24,6 +24,15 @@ def test_setitem_validates(): a[:] = np.array([1, 2]) +def test_setitem_with_scalar_string(): + # is_float_dtype considers some strings, like 'd', to be floats + # which can cause issues. + arr = pd.array(["a", "c"], dtype="string") + arr[0] = "d" + expected = pd.array(["d", "c"], dtype="string") + tm.assert_extension_array_equal(arr, expected) + + @pytest.mark.parametrize( "input, method", [ diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 5b872d5b72227..471a1b79d23bc 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -25,7 +25,7 @@ def data(): @pytest.fixture def data_missing(): """Length 2 array with [NA, Valid]""" - return StringArray._from_sequence([np.nan, "A"]) + return StringArray._from_sequence([pd.NA, "A"]) @pytest.fixture @@ -35,17 +35,17 @@ def data_for_sorting(): @pytest.fixture def data_missing_for_sorting(): - return StringArray._from_sequence(["B", np.nan, "A"]) + return StringArray._from_sequence(["B", pd.NA, "A"]) @pytest.fixture def na_value(): - return np.nan + return pd.NA @pytest.fixture def data_for_grouping(): - return StringArray._from_sequence(["B", "B", np.nan, np.nan, "A", "A", "B", "C"]) + return StringArray._from_sequence(["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"]) class TestDtype(base.BaseDtypeTests): diff --git 
a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index f5d28ec82d1d4..9b8ccdf4a7d9e 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3507,3 +3507,11 @@ def test_string_array(any_string_method): assert all(result[columns].dtypes == "string") result[columns] = result[columns].astype(object) tm.assert_equal(result, expected) + + +@pytest.mark.xfail(reason="not implmented yet") +def test_string_dtype_numeric(): + s = Series(["a", "aa", None], dtype="string") + result = s.str.count("a") + expected = Series([1, 2, None], dtype="Int64") + tm.assert_series_equal(result, expected) From 230266176939cf33b42cec3fe5c963b767bc4092 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 14 Nov 2019 15:53:52 +0100 Subject: [PATCH 08/18] fix doctest --- pandas/core/arrays/string_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index bb2a9a22e601f..cf3e427f0ee26 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -120,7 +120,7 @@ class StringArray(PandasArray): -------- >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string") - ['This is', 'some text', nan, 'data.'] + ['This is', 'some text', NA, 'data.'] Length: 4, dtype: string Unlike ``object`` dtype arrays, ``StringArray`` doesn't allow non-string From 2ab592a4b2e25d3bd30573e86c818a5d0015e463 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 14 Nov 2019 19:48:17 +0100 Subject: [PATCH 09/18] small edits --- pandas/_libs/lib.pyx | 1 + pandas/_libs/missing.pyx | 12 +++++++++++- pandas/tests/test_strings.py | 2 +- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 2765b0f17da5d..747eedb3e9629 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -61,6 +61,7 @@ from pandas._libs.missing cimport ( checknull, isnaobj, is_null_datetime64, is_null_timedelta64, is_null_period, 
C_NA ) + # constants that will be compared to potentially arbitrarily large # python int cdef: diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index c994b6e87cb42..37e7fa1520cfd 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -1,6 +1,8 @@ import cython from cython import Py_ssize_t +import numbers + import numpy as np cimport numpy as cnp from numpy cimport ndarray, int64_t, uint8_t, float64_t @@ -285,7 +287,6 @@ cdef inline bint is_null_period(v): def _create_binary_propagating_op(name): - import numbers def method(self, other): if isinstance(other, numbers.Number) or other is NA or isinstance(other, str): @@ -312,6 +313,15 @@ cdef class C_NAType: class NAType(C_NAType): """ NA ("not available") missing value indicator. + + .. warning:: + + Experimental: the behaviour of NA can still change without warning. + + .. versionadded:: 1.0.0 + + The NA singleton is a missing value indicator defined by pandas. It is + used in certain new extension dtypes (currently the "string" dtype). 
""" _instance = None diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 9b8ccdf4a7d9e..e34bb9321a667 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3509,7 +3509,7 @@ def test_string_array(any_string_method): tm.assert_equal(result, expected) -@pytest.mark.xfail(reason="not implmented yet") +@pytest.mark.xfail(reason="not implemented yet") def test_string_dtype_numeric(): s = Series(["a", "aa", None], dtype="string") result = s.str.count("a") From 018399e60c7218011b83fa12ebf79e5c19c7ec64 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 15 Nov 2019 09:22:45 +0100 Subject: [PATCH 10/18] fix NA in repr --- pandas/io/formats/format.py | 3 +++ pandas/tests/arrays/string_/test_string.py | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 17603809c2ea6..5036a96545d4e 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -35,6 +35,7 @@ from pandas._config.config import get_option, set_option from pandas._libs import lib +from pandas._libs.missing import NA from pandas._libs.tslib import format_array_from_datetime from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT from pandas._libs.tslibs.nattype import NaTType @@ -1218,6 +1219,8 @@ def _format(x): # determine na_rep if x is None or NaT-like if x is None: return "None" + elif x is NA: + return "NA" elif x is NaT or np.isnat(x): return "NaT" except (TypeError, ValueError): diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 3029f3007929b..80193ec230790 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -9,6 +9,16 @@ import pandas.util.testing as tm +def test_repr_with_NA(): + a = pd.array(["a", pd.NA, "b"], dtype="string") + for obj in [a, pd.Series(a), pd.DataFrame({"a": a})]: + assert "NA" in repr(obj) and "NaN" not in repr(obj) 
+ assert "NA" in str(obj) and "NaN" not in str(obj) + if hasattr(obj, "_repr_html_"): + html_repr = obj._repr_html_() + assert "NA" in html_repr and "NaN" not in html_repr + + def test_none_to_nan(): a = pd.arrays.StringArray._from_sequence(["a", None, "b"]) assert a[1] is not None From 33fd3e09eb9123e93856aee7ddad1f8d8e5f6c62 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 19 Nov 2019 10:24:49 -0600 Subject: [PATCH 11/18] remove redundant test --- pandas/tests/test_strings.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index c3494f9f4d9a7..1261c3bbc86db 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3527,14 +3527,6 @@ def test_string_array(any_string_method): tm.assert_equal(result, expected) -@pytest.mark.xfail(reason="not implemented yet") -def test_string_dtype_numeric(): - s = Series(["a", "aa", None], dtype="string") - result = s.str.count("a") - expected = Series([1, 2, None], dtype="Int64") - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize( "method,expected", [ From 289c8857b2646b2890fc079f37785adc4dd27463 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 19 Nov 2019 10:26:23 -0600 Subject: [PATCH 12/18] remove dead code --- pandas/core/strings.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index e3143cd9dc8c0..413e7e85eb6fe 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -201,13 +201,6 @@ def _map_object(f, arr, na_mask=False, na_value=np.nan, dtype=object): if na_mask: mask = isna(arr) convert = not np.all(mask) - if convert: - # XXX: This converts pd.NA to np.nan to match the output of - # object-dtype ops that return numeric, like str.count - # We probably want to return Int64Dtype instead. 
- # NA -> nan - arr[mask] = np.nan - try: result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert) except (TypeError, AttributeError) as e: From f8208db7f92a80a1ac8a7a1295d8b07a48306a10 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 21 Nov 2019 16:57:40 +0100 Subject: [PATCH 13/18] fix divmod --- pandas/_libs/missing.pyx | 11 +++++++---- pandas/tests/scalar/test_na_scalar.py | 5 ++++- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 37e7fa1520cfd..5a2874a708e6e 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -286,11 +286,14 @@ cdef inline bint is_null_period(v): # Implementation of NA singleton -def _create_binary_propagating_op(name): +def _create_binary_propagating_op(name, divmod=False): def method(self, other): if isinstance(other, numbers.Number) or other is NA or isinstance(other, str): - return NA + if divmod: + return NA, NA + else: + return NA return NotImplemented @@ -359,8 +362,8 @@ class NAType(C_NAType): __rfloordiv__ = _create_binary_propagating_op("__rfloordiv__") __mod__ = _create_binary_propagating_op("__mod__") __rmod__ = _create_binary_propagating_op("__rmod__") - __divmod__ = _create_binary_propagating_op("__divmod__") - __rdivmod__ = _create_binary_propagating_op("__rdivmod__") + __divmod__ = _create_binary_propagating_op("__divmod__", divmod=True) + __rdivmod__ = _create_binary_propagating_op("__rdivmod__", divmod=True) __pow__ = _create_binary_propagating_op("__pow__") __rpow__ = _create_binary_propagating_op("__rpow__") # __lshift__ and __rshift__ are not implemented diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index fdf70495c45ac..e68e49814245f 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -40,7 +40,10 @@ def test_arithmetic_ops(all_arithmetic_functions): for other in [NA, 1, 1.0, "a", np.int64(1), np.nan]: if op.__name__ == 
"rmod" and isinstance(other, str): continue - assert op(NA, other) is NA + if op.__name__ in ("divmod", "rdivmod"): + assert op(NA, other) is (NA, NA) + else: + assert op(NA, other) is NA def test_comparison_ops(): From 1fcf4b7400a1419ea272140fa480dcad94af338b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 25 Nov 2019 18:55:19 +0100 Subject: [PATCH 14/18] NA -> C_NA --- pandas/_libs/missing.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 5a2874a708e6e..9bf955ad369e7 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -289,7 +289,7 @@ cdef inline bint is_null_period(v): def _create_binary_propagating_op(name, divmod=False): def method(self, other): - if isinstance(other, numbers.Number) or other is NA or isinstance(other, str): + if other is C_NA or isinstance(other, str) or isinstance(other, numbers.Number): if divmod: return NA, NA else: @@ -387,7 +387,7 @@ class NAType(C_NAType): def __and__(self, other): if other is False: return False - elif other is True or other is NA: + elif other is True or other is C_NA: return NA else: return NotImplemented @@ -397,7 +397,7 @@ class NAType(C_NAType): def __or__(self, other): if other is True: return True - elif other is False or other is NA: + elif other is False or other is C_NA: return NA else: return NotImplemented @@ -405,7 +405,7 @@ class NAType(C_NAType): __ror__ = __or__ def __xor__(self, other): - if other is False or other is True or other is NA: + if other is False or other is True or other is C_NA: return NA return NotImplemented From f6798e5a6e92b7581cccd44aaa750c6ecb9abd2c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 26 Nov 2019 19:03:01 +0100 Subject: [PATCH 15/18] start some docs --- doc/source/user_guide/missing_data.rst | 114 ++++++++++++++++++++++++- doc/source/whatsnew/v1.0.0.rst | 40 +++++++++ 2 files changed, 150 insertions(+), 4 deletions(-) diff --git 
a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 6c36a6470f841..9b6851c44c5bc 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -12,10 +12,10 @@ pandas. .. note:: The choice of using ``NaN`` internally to denote missing data was largely - for simplicity and performance reasons. It differs from the MaskedArray - approach of, for example, :mod:`scikits.timeseries`. We are hopeful that - NumPy will soon be able to provide a native NA type solution (similar to R) - performant enough to be used in pandas. + for simplicity and performance reasons. + Starting from pandas 1.0, some optional data types start experimenting + with a native ``NA`` scalar using a mask-based approach. See + :ref:`here <missing_data.NA>` for more. See the :ref:`cookbook` for some advanced strategies. @@ -771,3 +771,109 @@ the ``dtype="Int64"``. s See :ref:`integer_na` for more. + + +.. _missing_data.NA: + +Experimental ``NA`` scalar to denote missing values +--------------------------------------------------- + +.. warning:: + + Experimental: the behaviour of ``pd.NA`` can still change without warning. + +.. versionadded:: 1.0.0 + +Starting from pandas 1.0, an experimental ``pd.NA`` value (singleton) is +available to represent scalar missing values. At this moment, it is used in +the nullable integer and boolean data types and the dedicated string data type +as the missing value indicator. + +The goal of ``pd.NA`` is to provide a "missing" indicator that can be used +consistently across data types (instead of ``np.nan``, ``None`` or ``pd.NaT`` +depending on the data type). + +For example, when having missing values in a Series with the nullable integer +dtype will use ``pd.NA``: + +..
ipython:: python + + s = pd.Series([1, 2, None], dtype="Int64") + s + s[2] + +Propagation in arithmetic and comparison operations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In general, missing values *propagate* in operations involving ``pd.NA``. When +one of the operands is unknown, the outcome of the operation is also unknown. + +For example, ``pd.NA`` propagates in arithmetic operations, similarly to +``np.nan``: + +.. ipython:: python + + pd.NA + 1 + "a" * pd.NA + +In equality and comparison operations, ``pd.NA`` also propagates. This deviates +from the behaviour of ``np.nan``, where comparisons with ``np.nan`` always +return ``False``. + +.. ipython:: python + + pd.NA == 1 + pd.NA == pd.NA + pd.NA < 2.5 + +To check if a value is equal to ``pd.NA``, the :func:`isna` function can be +used: + +.. ipython:: python + + pd.isna(pd.NA) + +Logical operations +~~~~~~~~~~~~~~~~~~ + +For logical operations, ``pd.NA`` follows the rules of the +`three-valued logic `__ (or +*Kleene logic*, similarly to R, SQL and Julia). This logic means to only +propagate missing values when it is logically required. + +For example, for the logical "or" operation (``|``), if one of the operands +is ``True``, we already know the result will be ``True``, regardless of the +other values (so regardless the missing value would be ``True`` or ``False``). +In this case, ``pd.NA`` does not propagate: + +.. ipython:: python + + True | False + True | pd.NA + pd.NA | True + +On the other hand, if one of the operands is ``False``, the result depends +on the value of the other operand. Therefore, in thise case ``pd.NA`` +propagates: + +.. ipython:: python + + False | True + False | True + False | pd.NA + +The behaviour of the logical "and" operation (``&``) can be derived using +similar logic (where now ``pd.NA`` will not propagate if one of the operands +is already ``False``): + +.. ipython:: python + + False & True + False & False + False & pd.NA + +.. 
ipython:: python + + True & True + True & False + True & pd.NA diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 7d11d90eeb670..2f8f58159066a 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -102,6 +102,46 @@ String accessor methods returning integers will return a value with :class:`Int6 We recommend explicitly using the ``string`` data type when working with strings. See :ref:`text.types` for more. +.. _whatsnew_100.NA: + +Experimental ``NA`` scalar to denote missing values +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A new ``pd.NA`` value (singleton) is introduced to represent scalar missing +values. Up to now, ``np.nan`` is used for this for float data, ``np.nan``or +``None`` for object-dtype data and ``pd.NaT`` for datetime-like data. The +goal of ``pd.NA`` is to provide a "missing" indicator that can be used +consistently across data types. For now, the nullable integer and boolean +data types and the new string data type make use of ``pd.NA`` (:issue:`28095`). + +For example, creating a Series using the nullable integer dtype: + +.. ipython:: python + + s = pd.Series([1, 2, None], dtype="Int64") + s + s[2] + +Compared to ``np.nan``, ``pd.NA`` behaves differently in certain operations. +In addition to arithmetic operations, ``pd.NA`` also propagates as "missing" +or "unknown" in comparison operations: + +.. ipython:: python + + np.nan > 1 + pd.NA > 1 + +For logical operations, ``pd.NA`` follows the rules of the +`three-valued logic `__ (or +*Kleene logic*). For example: + +.. ipython:: python + + pd.NA | True + +For more, see :ref:`NA section <missing_data.NA>` in the user guide on missing +data. + ..
_whatsnew_100.boolean: Boolean data type with missing values support From 14c143452625e8be013bbc85031e6b531132e145 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 27 Nov 2019 08:37:58 +0100 Subject: [PATCH 16/18] further doc updates --- doc/source/user_guide/missing_data.rst | 40 ++++++++++++++++++++------ doc/source/whatsnew/v1.0.0.rst | 6 +++- 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 9b6851c44c5bc..56444fcd3cb05 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -110,7 +110,7 @@ pandas objects provide compatibility between ``NaT`` and ``NaN``. .. _missing.inserting: Inserting missing data ----------------------- +~~~~~~~~~~~~~~~~~~~~~~ You can insert missing values by simply assigning to containers. The actual missing value used will be chosen based on the dtype. @@ -135,9 +135,10 @@ For object containers, pandas will use the value given: s.loc[1] = np.nan s +.. _missing_data.calculations: Calculations with missing data ------------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Missing values propagate naturally through arithmetic operations between pandas objects. @@ -776,7 +777,7 @@ See :ref:`integer_na` for more. .. _missing_data.NA: Experimental ``NA`` scalar to denote missing values ---------------------------------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. warning:: @@ -794,16 +795,17 @@ consistently across data types (instead of ``np.nan``, ``None`` or ``pd.NaT`` depending on the data type). For example, when having missing values in a Series with the nullable integer -dtype will use ``pd.NA``: +dtype, it will use ``pd.NA``: ..
ipython:: python s = pd.Series([1, 2, None], dtype="Int64") s s[2] + s[2] is pd.NA Propagation in arithmetic and comparison operations -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +--------------------------------------------------- In general, missing values *propagate* in operations involving ``pd.NA``. When one of the operands is unknown, the outcome of the operation is also unknown. @@ -833,8 +835,12 @@ used: pd.isna(pd.NA) +Exceptions to this basic propagation rule are *reductions* (such as the +mean or the minimum), where pandas defaults to skipping missing values. See +:ref:`above <missing_data.calculations>` for more. + Logical operations -~~~~~~~~~~~~~~~~~~ +------------------ For logical operations, ``pd.NA`` follows the rules of the `three-valued logic `__ (or @@ -843,7 +849,7 @@ propagate missing values when it is logically required. For example, for the logical "or" operation (``|``), if one of the operands is ``True``, we already know the result will be ``True``, regardless of the -other values (so regardless the missing value would be ``True`` or ``False``). +other value (so regardless of whether the missing value would be ``True`` or ``False``). In this case, ``pd.NA`` does not propagate: .. ipython:: python @@ -859,7 +865,7 @@ propagates: .. ipython:: python False | True - False | True + False | False False | pd.NA The behaviour of the logical "and" operation (``&``) can be derived using @@ -877,3 +883,21 @@ is already ``False``): True & True True & False True & pd.NA + + +``NA`` in a boolean context +--------------------------- + +Since the actual value of an NA is unknown, it is ambiguous to convert NA +to a boolean value. The following raises an error: + +.. ipython:: python + :okexception: + + bool(pd.NA) + +This also means that ``pd.NA`` cannot be used in a context where it is +evaluated to a boolean, such as ``if condition: ...`` where ``condition`` can +potentially be ``pd.NA``.
In such cases, :func:`isna` can be used to check +for ``pd.NA`` or it could be prevented that ``condition`` can be ``pd.NA`` +by for example filling missing values beforehand. diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 2f8f58159066a..5c195eda28416 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -108,12 +108,16 @@ Experimental ``NA`` scalar to denote missing values ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ A new ``pd.NA`` value (singleton) is introduced to represent scalar missing -values. Up to now, ``np.nan`` is used for this for float data, ``np.nan``or +values. Up to now, ``np.nan`` is used for this for float data, ``np.nan`` or ``None`` for object-dtype data and ``pd.NaT`` for datetime-like data. The goal of ``pd.NA`` is to provide a "missing" indicator that can be used consistently across data types. For now, the nullable integer and boolean data types and the new string data type make use of ``pd.NA`` (:issue:`28095`). +.. warning:: + + Experimental: the behaviour of ``pd.NA`` can still change without warning. + For example, creating a Series using the nullable integer dtype: .. ipython:: python From 1bcbab2d90f60cdd86af615fa86d3a8c35919916 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 27 Nov 2019 09:56:33 +0100 Subject: [PATCH 17/18] doc fixup --- doc/source/user_guide/missing_data.rst | 4 ++-- doc/source/whatsnew/v1.0.0.rst | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 56444fcd3cb05..d81a62a27507c 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -843,7 +843,7 @@ Logical operations ------------------ For logical operations, ``pd.NA`` follows the rules of the -`three-valued logic `__ (or +`three-valued logic `__ (or *Kleene logic*, similarly to R, SQL and Julia).
This logic means to only propagate missing values when it is logically required. @@ -892,7 +892,7 @@ Since the actual value of an NA is unknown, it is ambiguous to convert NA to a boolean value. The following raises an error: .. ipython:: python - :okexception: + :okexcept: bool(pd.NA) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index e18a8706ef35f..704f240ed32a1 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -136,7 +136,7 @@ or "unknown" in comparison operations: pd.NA > 1 For logical operations, ``pd.NA`` follows the rules of the -`three-valued logic `__ (or +`three-valued logic `__ (or *Kleene logic*). For example: .. ipython:: python From 589a961100f6e2674dd6101a5e4cde8edf31fd03 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 28 Nov 2019 13:30:23 +0100 Subject: [PATCH 18/18] further doc updates --- doc/source/user_guide/missing_data.rst | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index d81a62a27507c..11957cfa265f5 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -787,8 +787,8 @@ Experimental ``NA`` scalar to denote missing values Starting from pandas 1.0, an experimental ``pd.NA`` value (singleton) is available to represent scalar missing values. At this moment, it is used in -the nullable integer and boolean data types and the dedicated string data type -as the missing value indicator. +the nullable :doc:`integer <integer_na>`, boolean and +:ref:`dedicated string <text.types>` data types as the missing value indicator.
The goal of ``pd.NA`` is to provide a "missing" indicator that can be used consistently across data types (instead of ``np.nan``, ``None`` or ``pd.NaT`` @@ -804,6 +804,10 @@ dtype, it will use ``pd.NA``: s[2] s[2] is pd.NA +Currently, pandas does not yet use those data types by default (when creating +a DataFrame or Series, or when reading in data), so you need to specify +the dtype explicitly. + Propagation in arithmetic and comparison operations --------------------------------------------------- @@ -859,7 +863,7 @@ In this case, ``pd.NA`` does not propagate: pd.NA | True On the other hand, if one of the operands is ``False``, the result depends -on the value of the other operand. Therefore, in thise case ``pd.NA`` +on the value of the other operand. Therefore, in this case ``pd.NA`` propagates: .. ipython:: python @@ -899,5 +903,8 @@ to a boolean value. The following raises an error: This also means that ``pd.NA`` cannot be used in a context where it is evaluated to a boolean, such as ``if condition: ...`` where ``condition`` can potentially be ``pd.NA``. In such cases, :func:`isna` can be used to check -for ``pd.NA`` or it could be prevented that ``condition`` can be ``pd.NA`` -by for example filling missing values beforehand. +for ``pd.NA`` or ``condition`` being ``pd.NA`` can be avoided, for example by +filling missing values beforehand. + +A similar situation occurs when using Series or DataFrame objects in ``if`` +statements, see :ref:`gotchas.truth`.