From 03f83bd7d5419c33b139c5a9247ee4372365c62e Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 12 Nov 2019 22:33:18 +0100
Subject: [PATCH 01/18] ENH: add NA scalar for missing value indicator

---
 pandas/__init__.py                    |   1 +
 pandas/core/api.py                    |   1 +
 pandas/core/na_scalar.py              | 113 ++++++++++++++++++++++++++
 pandas/tests/scalar/test_na_scalar.py | 108 ++++++++++++++++++++++++
 4 files changed, 223 insertions(+)
 create mode 100644 pandas/core/na_scalar.py
 create mode 100644 pandas/tests/scalar/test_na_scalar.py

diff --git a/pandas/__init__.py b/pandas/__init__.py
index 5d163e411c0ac..3787b5115276b 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -68,6 +68,7 @@
     DatetimeTZDtype,
     StringDtype,
     # missing
+    NA,
     isna,
     isnull,
     notna,
diff --git a/pandas/core/api.py b/pandas/core/api.py
index 04f2f84c92a15..136b8535c9257 100644
--- a/pandas/core/api.py
+++ b/pandas/core/api.py
@@ -45,6 +45,7 @@
 from pandas.core.indexes.period import Period, period_range
 from pandas.core.indexes.timedeltas import Timedelta, timedelta_range
 from pandas.core.indexing import IndexSlice
+from pandas.core.na_scalar import NA
 from pandas.core.reshape.reshape import get_dummies
 from pandas.core.series import Series
 from pandas.core.tools.datetimes import to_datetime
diff --git a/pandas/core/na_scalar.py b/pandas/core/na_scalar.py
new file mode 100644
index 0000000000000..56e8c5763bbd9
--- /dev/null
+++ b/pandas/core/na_scalar.py
@@ -0,0 +1,113 @@
+import numbers
+
+
+def _create_binary_propagating_op(name):
+    def method(self, other):
+        if isinstance(other, numbers.Number) or other is NA or isinstance(other, str):
+            return NA
+
+        return NotImplemented
+
+    method.__name__ = name
+    return method
+
+
+def _create_unary_propagating_op(name):
+    def method(self):
+        return NA
+
+    method.__name__ = name
+    return method
+
+
+class NAType:
+
+    _instance = None
+
+    def __new__(cls, *args, **kwargs):
+        if NAType._instance is None:
+            NAType._instance = object.__new__(cls, *args, **kwargs)
+        return NAType._instance
+
+    def __repr__(self) -> str:
+        return "NA"
+
+    def __str__(self) -> str:
+        return "NA"
+
+    def __bool__(self):
+        raise TypeError("boolean value of NA is ambiguous")
+
+    def __hash__(self):
+        # TODO what should we use here to hash?
+        return 0
+
+    # Binary arithmetic and comparison ops -> propagate
+
+    __add__ = _create_binary_propagating_op("__add__")
+    __radd__ = _create_binary_propagating_op("__radd__")
+    __sub__ = _create_binary_propagating_op("__sub__")
+    __rsub__ = _create_binary_propagating_op("__rsub__")
+    __mul__ = _create_binary_propagating_op("__mul__")
+    __rmul__ = _create_binary_propagating_op("__rmul__")
+    __matmul__ = _create_binary_propagating_op("__matmul__")
+    __rmatmul__ = _create_binary_propagating_op("__rmatmul__")
+    __truediv__ = _create_binary_propagating_op("__truediv__")
+    __rtruediv__ = _create_binary_propagating_op("__rtruediv__")
+    __floordiv__ = _create_binary_propagating_op("__floordiv__")
+    __rfloordiv__ = _create_binary_propagating_op("__rfloordiv__")
+    __mod__ = _create_binary_propagating_op("__mod__")
+    __rmod__ = _create_binary_propagating_op("__rmod__")
+    __divmod__ = _create_binary_propagating_op("__divmod__")
+    __rdivmod__ = _create_binary_propagating_op("__rdivmod__")
+    __pow__ = _create_binary_propagating_op("__pow__")
+    __rpow__ = _create_binary_propagating_op("__rpow__")
+
+    # __lshift__ = _create_binary_propagating_op("__add__")
+    # __rshift__ = _create_binary_propagating_op("__add__")
+
+    __eq__ = _create_binary_propagating_op("__eq__")
+    __ne__ = _create_binary_propagating_op("__ne__")
+    __le__ = _create_binary_propagating_op("__le__")
+    __lt__ = _create_binary_propagating_op("__lt__")
+    __gt__ = _create_binary_propagating_op("__gt__")
+    __ge__ = _create_binary_propagating_op("__ge__")
+
+    # Unary ops
+
+    __neg__ = _create_unary_propagating_op("__neg__")
+    __pos__ = _create_unary_propagating_op("__pos__")
+    __abs__ = _create_unary_propagating_op("__abs__")
+    __invert__ = _create_unary_propagating_op("__invert__")
+
+    # Logical ops using Kleene logic
+
+    def __and__(self, other):
+        if other is False:
+            return False
+        elif other is True or other is NA:
+            return NA
+        else:
+            return NotImplemented
+
+    __rand__ = __and__
+
+    def __or__(self, other):
+        if other is True:
+            return True
+        elif other is False or other is NA:
+            return NA
+        else:
+            return NotImplemented
+
+    __ror__ = __or__
+
+    def __xor__(self, other):
+        if other is False or other is True or other is NA:
+            return NA
+        return NotImplemented
+
+    __rxor__ = __xor__
+
+
+NA = NAType()
diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py
new file mode 100644
index 0000000000000..9a88b74cac20e
--- /dev/null
+++ b/pandas/tests/scalar/test_na_scalar.py
@@ -0,0 +1,108 @@
+import numpy as np
+import pytest
+
+from pandas.core.na_scalar import NA
+
+
+def test_singleton():
+    assert NA is NA
+    new_NA = type(NA)()
+    assert new_NA is NA
+
+
+def test_repr():
+    assert repr(NA) == "NA"
+    assert str(NA) == "NA"
+
+
+def test_truthiness():
+    with pytest.raises(TypeError):
+        bool(NA)
+
+    with pytest.raises(TypeError):
+        not NA
+
+
+def test_hashable():
+    assert hash(NA) == hash(NA)
+    d = {NA: "test"}
+    assert d[NA] == "test"
+
+
+def test_arithmetic_ops(all_arithmetic_functions):
+    op = all_arithmetic_functions
+
+    for other in [NA, 1, 1.0, "a", np.int64(1)]:
+        if op.__name__ == "rmod" and isinstance(other, str):
+            continue
+        assert op(NA, other) is NA
+
+
+def test_comparison_ops():
+
+    for other in [NA, 1, 1.0, "a", np.int64(1)]:
+        assert (NA == other) is NA
+        assert (NA != other) is NA
+        assert (NA > other) is NA
+        assert (NA >= other) is NA
+        assert (NA < other) is NA
+        assert (NA <= other) is NA
+
+        if isinstance(other, np.int64):
+            # for numpy scalars we get a deprecation warning and False as result
+            # for equality or error for larger/lesser than
+            continue
+
+        assert (other == NA) is NA
+        assert (other != NA) is NA
+        assert (other > NA) is NA
+        assert (other >= NA) is NA
+        assert (other < NA) is NA
+        assert (other <= NA) is NA
+
+
+def test_unary_ops():
+    assert +NA is NA
+    assert -NA is NA
+    assert abs(NA) is NA
+    assert ~NA is NA
+
+
+def test_logical_and():
+
+    assert NA & True is NA
+    assert True & NA is NA
+    assert NA & False is False
+    assert False & NA is False
+    assert NA & NA is NA
+
+    with pytest.raises(TypeError):
+        NA & 5
+
+
+def test_logical_or():
+
+    assert NA | True is True
+    assert True | NA is True
+    assert NA | False is NA
+    assert False | NA is NA
+    assert NA | NA is NA
+
+    with pytest.raises(TypeError):
+        NA | 5
+
+
+def test_logical_xor():
+
+    assert NA ^ True is NA
+    assert True ^ NA is NA
+    assert NA ^ False is NA
+    assert False ^ NA is NA
+    assert NA ^ NA is NA
+
+    with pytest.raises(TypeError):
+        NA ^ 5
+
+
+def test_logical_not():
+    assert ~NA is NA

From c1797d54a168780a37e51f77419f12b41c035b6a Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Wed, 13 Nov 2019 15:40:57 +0100
Subject: [PATCH 02/18] add np.nan to arithmetic/comparison tests

---
 pandas/tests/scalar/test_na_scalar.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py
index 9a88b74cac20e..02f6c2386249f 100644
--- a/pandas/tests/scalar/test_na_scalar.py
+++ b/pandas/tests/scalar/test_na_scalar.py
@@ -32,7 +32,7 @@ def test_hashable():
 def test_arithmetic_ops(all_arithmetic_functions):
     op = all_arithmetic_functions
 
-    for other in [NA, 1, 1.0, "a", np.int64(1)]:
+    for other in [NA, 1, 1.0, "a", np.int64(1), np.nan]:
         if op.__name__ == "rmod" and isinstance(other, str):
             continue
         assert op(NA, other) is NA
@@ -40,7 +40,7 @@ def test_arithmetic_ops(all_arithmetic_functions):
 
 def test_comparison_ops():
 
-    for other in [NA, 1, 1.0, "a", np.int64(1)]:
+    for other in [NA, 1, 1.0, "a", np.int64(1), np.nan]:
         assert (NA == other) is NA
         assert (NA != other) is NA
         assert (NA > other) is NA

From 3339eaacc2cdeb87586f94a9f71b27245c40c845 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Wed, 13 Nov 2019 15:41:29 +0100
Subject: [PATCH 03/18] use id(self) for hash

---
 pandas/core/na_scalar.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pandas/core/na_scalar.py b/pandas/core/na_scalar.py
index 56e8c5763bbd9..e60c3f79c8e67 100644
--- a/pandas/core/na_scalar.py
+++ b/pandas/core/na_scalar.py
@@ -39,8 +39,7 @@ def __bool__(self):
         raise TypeError("boolean value of NA is ambiguous")
 
     def __hash__(self):
-        # TODO what should we use here to hash?
-        return 0
+        return id(self)
 
     # Binary arithmetic and comparison ops -> propagate
 

From e9d4d6aa59cf8b91c05ac4397586f1ca3b0a25a7 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Wed, 13 Nov 2019 16:24:46 +0100
Subject: [PATCH 04/18] fix api test

---
 pandas/tests/api/test_api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py
index 3a8e263ac2a6d..489079e3ab3c7 100644
--- a/pandas/tests/api/test_api.py
+++ b/pandas/tests/api/test_api.py
@@ -46,7 +46,7 @@ class TestPDApi(Base):
     deprecated_modules = []  # type: List[str]
 
     # misc
-    misc = ["IndexSlice", "NaT"]
+    misc = ["IndexSlice", "NaT", "NA"]
 
     # top-level classes
     classes = [

From 4450d2dc9491f51076e14384bc13702d8a7f6bf0 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Wed, 13 Nov 2019 22:30:38 +0100
Subject: [PATCH 05/18] move to cython

---
 pandas/_libs/lib.pyx                  |   4 +-
 pandas/_libs/missing.pxd              |   5 ++
 pandas/_libs/missing.pyx              | 125 +++++++++++++++++++++++++-
 pandas/core/api.py                    |   3 +-
 pandas/core/na_scalar.py              | 112 -----------------------
 pandas/tests/scalar/test_na_scalar.py |  22 ++++-
 6 files changed, 154 insertions(+), 117 deletions(-)
 delete mode 100644 pandas/core/na_scalar.py

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 7d65cb52bce1e..1c5680dbca431 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -58,10 +58,9 @@ from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64
 from pandas._libs.tslibs.timezones cimport get_timezone, tz_compare
 
 from pandas._libs.missing cimport (
-    checknull, isnaobj, is_null_datetime64, is_null_timedelta64, is_null_period
+    checknull, isnaobj, is_null_datetime64, is_null_timedelta64, is_null_period, C_NA
 )
 
-
 # constants that will be compared to potentially arbitrarily large
 # python int
 cdef:
@@ -161,6 +160,7 @@ def is_scalar(val: object) -> bool:
             or PyTime_Check(val)
             # We differ from numpy, which claims that None is not scalar;
             # see np.isscalar
+            or val is C_NA
             or val is None
             or isinstance(val, (Fraction, Number))
             or util.is_period_object(val)
diff --git a/pandas/_libs/missing.pxd b/pandas/_libs/missing.pxd
index d0dd306680ae8..d4303ac28b9a5 100644
--- a/pandas/_libs/missing.pxd
+++ b/pandas/_libs/missing.pxd
@@ -9,3 +9,8 @@ cpdef ndarray[uint8_t] isnaobj(ndarray arr)
 cdef bint is_null_datetime64(v)
 cdef bint is_null_timedelta64(v)
 cdef bint is_null_period(v)
+
+cdef class C_NAType:
+    pass
+
+cdef C_NAType C_NA
diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx
index 9568ddb7fe53f..c994b6e87cb42 100644
--- a/pandas/_libs/missing.pyx
+++ b/pandas/_libs/missing.pyx
@@ -44,7 +44,7 @@ cpdef bint checknull(object val):
     The difference between `checknull` and `checknull_old` is that `checknull`
     does *not* consider INF or NEGINF to be NA.
     """
-    return is_null_datetimelike(val, inat_is_null=False)
+    return val is C_NA or is_null_datetimelike(val, inat_is_null=False)
 
 
 cpdef bint checknull_old(object val):
@@ -278,3 +278,126 @@ cdef inline bint is_null_period(v):
     # determine if we have a null for a Period (or integer versions),
     # excluding np.datetime64('nat') and np.timedelta64('nat')
     return checknull_with_nat(v)
+
+
+# -----------------------------------------------------------------------------
+# Implementation of NA singleton
+
+
+def _create_binary_propagating_op(name):
+    import numbers
+
+    def method(self, other):
+        if isinstance(other, numbers.Number) or other is NA or isinstance(other, str):
+            return NA
+
+        return NotImplemented
+
+    method.__name__ = name
+    return method
+
+
+def _create_unary_propagating_op(name):
+    def method(self):
+        return NA
+
+    method.__name__ = name
+    return method
+
+
+cdef class C_NAType:
+    pass
+
+
+class NAType(C_NAType):
+    """
+    NA ("not available") missing value indicator.
+    """
+
+    _instance = None
+
+    def __new__(cls, *args, **kwargs):
+        if NAType._instance is None:
+            NAType._instance = C_NAType.__new__(cls, *args, **kwargs)
+        return NAType._instance
+
+    def __repr__(self) -> str:
+        return "NA"
+
+    def __str__(self) -> str:
+        return "NA"
+
+    def __bool__(self):
+        raise TypeError("boolean value of NA is ambiguous")
+
+    def __hash__(self):
+        return id(self)
+
+    # Binary arithmetic and comparison ops -> propagate
+
+    __add__ = _create_binary_propagating_op("__add__")
+    __radd__ = _create_binary_propagating_op("__radd__")
+    __sub__ = _create_binary_propagating_op("__sub__")
+    __rsub__ = _create_binary_propagating_op("__rsub__")
+    __mul__ = _create_binary_propagating_op("__mul__")
+    __rmul__ = _create_binary_propagating_op("__rmul__")
+    __matmul__ = _create_binary_propagating_op("__matmul__")
+    __rmatmul__ = _create_binary_propagating_op("__rmatmul__")
+    __truediv__ = _create_binary_propagating_op("__truediv__")
+    __rtruediv__ = _create_binary_propagating_op("__rtruediv__")
+    __floordiv__ = _create_binary_propagating_op("__floordiv__")
+    __rfloordiv__ = _create_binary_propagating_op("__rfloordiv__")
+    __mod__ = _create_binary_propagating_op("__mod__")
+    __rmod__ = _create_binary_propagating_op("__rmod__")
+    __divmod__ = _create_binary_propagating_op("__divmod__")
+    __rdivmod__ = _create_binary_propagating_op("__rdivmod__")
+    __pow__ = _create_binary_propagating_op("__pow__")
+    __rpow__ = _create_binary_propagating_op("__rpow__")
+    # __lshift__ and __rshift__ are not implemented
+
+    __eq__ = _create_binary_propagating_op("__eq__")
+    __ne__ = _create_binary_propagating_op("__ne__")
+    __le__ = _create_binary_propagating_op("__le__")
+    __lt__ = _create_binary_propagating_op("__lt__")
+    __gt__ = _create_binary_propagating_op("__gt__")
+    __ge__ = _create_binary_propagating_op("__ge__")
+
+    # Unary ops
+
+    __neg__ = _create_unary_propagating_op("__neg__")
+    __pos__ = _create_unary_propagating_op("__pos__")
+    __abs__ = _create_unary_propagating_op("__abs__")
+    __invert__ = _create_unary_propagating_op("__invert__")
+
+    # Logical ops using Kleene logic
+
+    def __and__(self, other):
+        if other is False:
+            return False
+        elif other is True or other is NA:
+            return NA
+        else:
+            return NotImplemented
+
+    __rand__ = __and__
+
+    def __or__(self, other):
+        if other is True:
+            return True
+        elif other is False or other is NA:
+            return NA
+        else:
+            return NotImplemented
+
+    __ror__ = __or__
+
+    def __xor__(self, other):
+        if other is False or other is True or other is NA:
+            return NA
+        return NotImplemented
+
+    __rxor__ = __xor__
+
+
+C_NA = NAType()   # C-visible
+NA = C_NA         # Python-visible
diff --git a/pandas/core/api.py b/pandas/core/api.py
index 136b8535c9257..b6c621a369e19 100644
--- a/pandas/core/api.py
+++ b/pandas/core/api.py
@@ -45,7 +45,6 @@
 from pandas.core.indexes.period import Period, period_range
 from pandas.core.indexes.timedeltas import Timedelta, timedelta_range
 from pandas.core.indexing import IndexSlice
-from pandas.core.na_scalar import NA
 from pandas.core.reshape.reshape import get_dummies
 from pandas.core.series import Series
 from pandas.core.tools.datetimes import to_datetime
@@ -57,3 +56,5 @@
 
 # DataFrame needs to be imported after NamedAgg to avoid a circular import
 from pandas.core.frame import DataFrame  # isort:skip
+
+from pandas._libs.missing import NA
diff --git a/pandas/core/na_scalar.py b/pandas/core/na_scalar.py
deleted file mode 100644
index e60c3f79c8e67..0000000000000
--- a/pandas/core/na_scalar.py
+++ /dev/null
@@ -1,112 +0,0 @@
-import numbers
-
-
-def _create_binary_propagating_op(name):
-    def method(self, other):
-        if isinstance(other, numbers.Number) or other is NA or isinstance(other, str):
-            return NA
-
-        return NotImplemented
-
-    method.__name__ = name
-    return method
-
-
-def _create_unary_propagating_op(name):
-    def method(self):
-        return NA
-
-    method.__name__ = name
-    return method
-
-
-class NAType:
-
-    _instance = None
-
-    def __new__(cls, *args, **kwargs):
-        if NAType._instance is None:
-            NAType._instance = object.__new__(cls, *args, **kwargs)
-        return NAType._instance
-
-    def __repr__(self) -> str:
-        return "NA"
-
-    def __str__(self) -> str:
-        return "NA"
-
-    def __bool__(self):
-        raise TypeError("boolean value of NA is ambiguous")
-
-    def __hash__(self):
-        return id(self)
-
-    # Binary arithmetic and comparison ops -> propagate
-
-    __add__ = _create_binary_propagating_op("__add__")
-    __radd__ = _create_binary_propagating_op("__radd__")
-    __sub__ = _create_binary_propagating_op("__sub__")
-    __rsub__ = _create_binary_propagating_op("__rsub__")
-    __mul__ = _create_binary_propagating_op("__mul__")
-    __rmul__ = _create_binary_propagating_op("__rmul__")
-    __matmul__ = _create_binary_propagating_op("__matmul__")
-    __rmatmul__ = _create_binary_propagating_op("__rmatmul__")
-    __truediv__ = _create_binary_propagating_op("__truediv__")
-    __rtruediv__ = _create_binary_propagating_op("__rtruediv__")
-    __floordiv__ = _create_binary_propagating_op("__floordiv__")
-    __rfloordiv__ = _create_binary_propagating_op("__rfloordiv__")
-    __mod__ = _create_binary_propagating_op("__mod__")
-    __rmod__ = _create_binary_propagating_op("__rmod__")
-    __divmod__ = _create_binary_propagating_op("__divmod__")
-    __rdivmod__ = _create_binary_propagating_op("__rdivmod__")
-    __pow__ = _create_binary_propagating_op("__pow__")
-    __rpow__ = _create_binary_propagating_op("__rpow__")
-
-    # __lshift__ = _create_binary_propagating_op("__add__")
-    # __rshift__ = _create_binary_propagating_op("__add__")
-
-    __eq__ = _create_binary_propagating_op("__eq__")
-    __ne__ = _create_binary_propagating_op("__ne__")
-    __le__ = _create_binary_propagating_op("__le__")
-    __lt__ = _create_binary_propagating_op("__lt__")
-    __gt__ = _create_binary_propagating_op("__gt__")
-    __ge__ = _create_binary_propagating_op("__ge__")
-
-    # Unary ops
-
-    __neg__ = _create_unary_propagating_op("__neg__")
-    __pos__ = _create_unary_propagating_op("__pos__")
-    __abs__ = _create_unary_propagating_op("__abs__")
-    __invert__ = _create_unary_propagating_op("__invert__")
-
-    # Logical ops using Kleene logic
-
-    def __and__(self, other):
-        if other is False:
-            return False
-        elif other is True or other is NA:
-            return NA
-        else:
-            return NotImplemented
-
-    __rand__ = __and__
-
-    def __or__(self, other):
-        if other is True:
-            return True
-        elif other is False or other is NA:
-            return NA
-        else:
-            return NotImplemented
-
-    __ror__ = __or__
-
-    def __xor__(self, other):
-        if other is False or other is True or other is NA:
-            return NA
-        return NotImplemented
-
-    __rxor__ = __xor__
-
-
-NA = NAType()
diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py
index 02f6c2386249f..fdf70495c45ac 100644
--- a/pandas/tests/scalar/test_na_scalar.py
+++ b/pandas/tests/scalar/test_na_scalar.py
@@ -1,7 +1,12 @@
 import numpy as np
 import pytest
 
-from pandas.core.na_scalar import NA
+from pandas._libs.missing import NA
+
+from pandas.core.dtypes.common import is_scalar
+
+import pandas as pd
+import pandas.util.testing as tm
 
 
 def test_singleton():
@@ -106,3 +111,18 @@ def test_logical_xor():
 
 def test_logical_not():
     assert ~NA is NA
+
+
+def test_is_scalar():
+    assert is_scalar(NA) is True
+
+
+def test_isna():
+    assert pd.isna(NA) is True
+    assert pd.notna(NA) is False
+
+
+def test_series_isna():
+    s = pd.Series([1, NA], dtype=object)
+    expected = pd.Series([False, True])
+    tm.assert_series_equal(s.isna(), expected)

From 1849a23030c045dc629c6ccca1951c78717f2a97 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Thu, 14 Nov 2019 09:20:46 +0100
Subject: [PATCH 06/18] add examples to isna/notna docstring

---
 pandas/core/dtypes/missing.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
index 0a8f636b4cb2a..fdb86b619d779 100644
--- a/pandas/core/dtypes/missing.py
+++ b/pandas/core/dtypes/missing.py
@@ -79,6 +79,9 @@ def isna(obj):
     >>> pd.isna('dog')
     False
 
+    >>> pd.isna(pd.NA)
+    True
+
     >>> pd.isna(np.nan)
     True
 
@@ -323,6 +326,9 @@ def notna(obj):
     >>> pd.notna('dog')
     True
 
+    >>> pd.notna(pd.NA)
+    False
+
     >>> pd.notna(np.nan)
     False
 

From c72e3eeae2a5ce05d110c087ba9411ea522c37f8 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <TomAugspurger@users.noreply.github.com>
Date: Thu, 14 Nov 2019 07:53:03 -0600
Subject: [PATCH 07/18] Use NA scalar in string dtype (#1)

---
 pandas/_libs/lib.pyx                       |  2 +-
 pandas/_libs/testing.pyx                   |  8 +++--
 pandas/core/arrays/numpy_.py               |  3 ++
 pandas/core/arrays/string_.py              | 37 ++++++++++------------
 pandas/core/dtypes/missing.py              |  5 +++
 pandas/core/strings.py                     |  7 ++++
 pandas/tests/arrays/string_/test_string.py | 11 ++++++-
 pandas/tests/extension/test_string.py      |  8 ++---
 pandas/tests/test_strings.py               |  8 +++++
 9 files changed, 59 insertions(+), 30 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 1c5680dbca431..e2fb4a3635387 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -1500,7 +1500,7 @@ cdef class Validator:
                                   f'must define is_value_typed')
 
     cdef bint is_valid_null(self, object value) except -1:
-        return value is None or util.is_nan(value)
+        return value is None or value is C_NA or util.is_nan(value)
 
     cdef bint is_array_typed(self) except -1:
         return False
diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx
index f848310d961e1..adbc25b96ea42 100644
--- a/pandas/_libs/testing.pyx
+++ b/pandas/_libs/testing.pyx
@@ -180,13 +180,15 @@ cpdef assert_almost_equal(a, b,
         # classes can't be the same, to raise error
         assert_class_equal(a, b, obj=obj)
 
-    if a == b:
-        # object comparison
-        return True
     if isna(a) and isna(b):
         # TODO: Should require same-dtype NA?
         # nan / None comparison
         return True
+
+    if a == b:
+        # object comparison
+        return True
+
     if is_comparable_as_number(a) and is_comparable_as_number(b):
         if array_equivalent(a, b, strict_nan=True):
             # inf comparison
diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py
index 6f2bb095a014d..8ba5cd7565850 100644
--- a/pandas/core/arrays/numpy_.py
+++ b/pandas/core/arrays/numpy_.py
@@ -278,6 +278,9 @@ def fillna(self, value=None, method=None, limit=None):
         return new_values
 
     def take(self, indices, allow_fill=False, fill_value=None):
+        if fill_value is None:
+            # Primarily for subclasses
+            fill_value = self.dtype.na_value
         result = take(
             self._ndarray, indices, allow_fill=allow_fill, fill_value=fill_value
         )
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 7c487b227de20..bb2a9a22e601f 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -1,9 +1,9 @@
 import operator
-from typing import TYPE_CHECKING, Type
+from typing import Type
 
 import numpy as np
 
-from pandas._libs import lib
+from pandas._libs import lib, missing as libmissing
 
 from pandas.core.dtypes.base import ExtensionDtype
 from pandas.core.dtypes.common import pandas_dtype
@@ -17,9 +17,6 @@
 from pandas.core.construction import extract_array
 from pandas.core.missing import isna
 
-if TYPE_CHECKING:
-    from pandas._typing import Scalar
-
 
 @register_extension_dtype
 class StringDtype(ExtensionDtype):
@@ -50,16 +47,8 @@ class StringDtype(ExtensionDtype):
     StringDtype
     """
 
-    @property
-    def na_value(self) -> "Scalar":
-        """
-        StringDtype uses :attr:`numpy.nan` as the missing NA value.
-
-        .. warning::
-
-           `na_value` may change in a future release.
-        """
-        return np.nan
+    #: StringDtype.na_value uses pandas.NA
+    na_value = libmissing.NA
 
     @property
     def type(self) -> Type:
@@ -172,10 +161,10 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
         if dtype:
             assert dtype == "string"
         result = super()._from_sequence(scalars, dtype=object, copy=copy)
-        # convert None to np.nan
+        # Standardize all missing-like values to NA
         # TODO: it would be nice to do this in _validate / lib.is_string_array
         # We are already doing a scan over the values there.
-        result[result.isna()] = np.nan
+        result[result.isna()] = StringDtype.na_value
         return result
 
     @classmethod
@@ -192,6 +181,12 @@ def __arrow_array__(self, type=None):
             type = pa.string()
         return pa.array(self._ndarray, type=type, from_pandas=True)
 
+    def _values_for_factorize(self):
+        arr = self._ndarray.copy()
+        mask = self.isna()
+        arr[mask] = -1
+        return arr, -1
+
     def __setitem__(self, key, value):
         value = extract_array(value, extract_numpy=True)
         if isinstance(value, type(self)):
@@ -205,9 +200,9 @@ def __setitem__(self, key, value):
 
         # validate new items
         if scalar_value:
-            if scalar_value is None:
-                value = np.nan
-            elif not (isinstance(value, str) or np.isnan(value)):
+            if isna(value):
+                value = StringDtype.na_value
+            elif not isinstance(value, str):
                 raise ValueError(
                     "Cannot set non-string value '{}' into a StringArray.".format(value)
                 )
@@ -265,7 +260,7 @@ def method(self, other):
                 other = other[valid]
 
             result = np.empty_like(self._ndarray, dtype="object")
-            result[mask] = np.nan
+            result[mask] = StringDtype.na_value
             result[valid] = op(self._ndarray[valid], other)
 
             if op.__name__ in {"add", "radd", "mul", "rmul"}:
diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
index fdb86b619d779..2d57997c36ef1 100644
--- a/pandas/core/dtypes/missing.py
+++ b/pandas/core/dtypes/missing.py
@@ -446,6 +446,9 @@ def array_equivalent(left, right, strict_nan=False):
             if left_value is NaT and right_value is not NaT:
                 return False
 
+            elif left_value is libmissing.NA and right_value is not libmissing.NA:
+                return False
+
             elif isinstance(left_value, float) and np.isnan(left_value):
                 if not isinstance(right_value, float) or not np.isnan(right_value):
                     return False
@@ -457,6 +460,8 @@ def array_equivalent(left, right, strict_nan=False):
                     if "Cannot compare tz-naive" in str(err):
                         # tzawareness compat failure, see GH#28507
                         return False
+                    elif "boolean value of NA is ambiguous" in str(err):
+                        return False
                     raise
         return True
 
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index 7194d1cf08e4a..0a0f1c4d2527e 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -123,6 +123,13 @@ def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object):
     if na_mask:
         mask = isna(arr)
         convert = not np.all(mask)
+        if convert:
+            # XXX: This converts pd.NA to np.nan to match the output of
+            # object-dtype ops that return numeric, like str.count
+            # We probably want to return Int64Dtype instead.
+            # NA -> nan
+            arr[mask] = np.nan
+
         try:
             result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert)
         except (TypeError, AttributeError) as e:
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index efe2b4e0b2deb..3029f3007929b 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -12,7 +12,7 @@
 def test_none_to_nan():
     a = pd.arrays.StringArray._from_sequence(["a", None, "b"])
     assert a[1] is not None
-    assert np.isnan(a[1])
+    assert a[1] is pd.NA
 
 
 def test_setitem_validates():
@@ -24,6 +24,15 @@ def test_setitem_validates():
         a[:] = np.array([1, 2])
 
 
+def test_setitem_with_scalar_string():
+    # is_float_dtype considers some strings, like 'd', to be floats
+    # which can cause issues.
+    arr = pd.array(["a", "c"], dtype="string")
+    arr[0] = "d"
+    expected = pd.array(["d", "c"], dtype="string")
+    tm.assert_extension_array_equal(arr, expected)
+
+
 @pytest.mark.parametrize(
     "input, method",
     [
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index 5b872d5b72227..471a1b79d23bc 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -25,7 +25,7 @@ def data():
 @pytest.fixture
 def data_missing():
     """Length 2 array with [NA, Valid]"""
-    return StringArray._from_sequence([np.nan, "A"])
+    return StringArray._from_sequence([pd.NA, "A"])
 
 
 @pytest.fixture
@@ -35,17 +35,17 @@ def data_for_sorting():
 
 @pytest.fixture
 def data_missing_for_sorting():
-    return StringArray._from_sequence(["B", np.nan, "A"])
+    return StringArray._from_sequence(["B", pd.NA, "A"])
 
 
 @pytest.fixture
 def na_value():
-    return np.nan
+    return pd.NA
 
 
 @pytest.fixture
 def data_for_grouping():
-    return StringArray._from_sequence(["B", "B", np.nan, np.nan, "A", "A", "B", "C"])
+    return StringArray._from_sequence(["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"])
 
 
 class TestDtype(base.BaseDtypeTests):
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index f5d28ec82d1d4..9b8ccdf4a7d9e 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -3507,3 +3507,11 @@ def test_string_array(any_string_method):
         assert all(result[columns].dtypes == "string")
         result[columns] = result[columns].astype(object)
     tm.assert_equal(result, expected)
+
+
+@pytest.mark.xfail(reason="not implmented yet")
+def test_string_dtype_numeric():
+    s = Series(["a", "aa", None], dtype="string")
+    result = s.str.count("a")
+    expected = Series([1, 2, None], dtype="Int64")
+    tm.assert_series_equal(result, expected)

From 230266176939cf33b42cec3fe5c963b767bc4092 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Thu, 14 Nov 2019 15:53:52 +0100
Subject: [PATCH 08/18] fix doctest

---
 pandas/core/arrays/string_.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index bb2a9a22e601f..cf3e427f0ee26 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -120,7 +120,7 @@ class StringArray(PandasArray):
     --------
     >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
     <StringArray>
-    ['This is', 'some text', nan, 'data.']
+    ['This is', 'some text', NA, 'data.']
     Length: 4, dtype: string
 
     Unlike ``object`` dtype arrays, ``StringArray`` doesn't allow non-string

From 2ab592a4b2e25d3bd30573e86c818a5d0015e463 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Thu, 14 Nov 2019 19:48:17 +0100
Subject: [PATCH 09/18] small edits

---
 pandas/_libs/lib.pyx         |  1 +
 pandas/_libs/missing.pyx     | 12 +++++++++++-
 pandas/tests/test_strings.py |  2 +-
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 2765b0f17da5d..747eedb3e9629 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -61,6 +61,7 @@ from pandas._libs.missing cimport (
     checknull, isnaobj, is_null_datetime64, is_null_timedelta64, is_null_period, C_NA
 )
 
+
 # constants that will be compared to potentially arbitrarily large
 # python int
 cdef:
diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx
index c994b6e87cb42..37e7fa1520cfd 100644
--- a/pandas/_libs/missing.pyx
+++ b/pandas/_libs/missing.pyx
@@ -1,6 +1,8 @@
 import cython
 from cython import Py_ssize_t
 
+import numbers
+
 import numpy as np
 cimport numpy as cnp
 from numpy cimport ndarray, int64_t, uint8_t, float64_t
@@ -285,7 +287,6 @@ cdef inline bint is_null_period(v):
 
 
 def _create_binary_propagating_op(name):
-    import numbers
 
     def method(self, other):
         if isinstance(other, numbers.Number) or other is NA or isinstance(other, str):
@@ -312,6 +313,15 @@ cdef class C_NAType:
 class NAType(C_NAType):
     """
     NA ("not available") missing value indicator.
+
+    .. warning::
+
+       Experimental: the behaviour of NA can still change without warning.
+
+    .. versionadded:: 1.0.0
+
+    The NA singleton is a missing value indicator defined by pandas. It is
+    used in certain new extension dtypes (currently the "string" dtype).
     """
 
     _instance = None
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index 9b8ccdf4a7d9e..e34bb9321a667 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -3509,7 +3509,7 @@ def test_string_array(any_string_method):
     tm.assert_equal(result, expected)
 
 
-@pytest.mark.xfail(reason="not implmented yet")
+@pytest.mark.xfail(reason="not implemented yet")
 def test_string_dtype_numeric():
     s = Series(["a", "aa", None], dtype="string")
     result = s.str.count("a")

From 018399e60c7218011b83fa12ebf79e5c19c7ec64 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Fri, 15 Nov 2019 09:22:45 +0100
Subject: [PATCH 10/18] fix NA in repr

---
 pandas/io/formats/format.py                |  3 +++
 pandas/tests/arrays/string_/test_string.py | 10 ++++++++++
 2 files changed, 13 insertions(+)

diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index 17603809c2ea6..5036a96545d4e 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -35,6 +35,7 @@
 from pandas._config.config import get_option, set_option
 
 from pandas._libs import lib
+from pandas._libs.missing import NA
 from pandas._libs.tslib import format_array_from_datetime
 from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT
 from pandas._libs.tslibs.nattype import NaTType
@@ -1218,6 +1219,8 @@ def _format(x):
                     # determine na_rep if x is None or NaT-like
                     if x is None:
                         return "None"
+                    elif x is NA:
+                        return "NA"
                     elif x is NaT or np.isnat(x):
                         return "NaT"
                 except (TypeError, ValueError):
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 3029f3007929b..80193ec230790 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -9,6 +9,16 @@
 import pandas.util.testing as tm
 
 
+def test_repr_with_NA():
+    a = pd.array(["a", pd.NA, "b"], dtype="string")
+    for obj in [a, pd.Series(a), pd.DataFrame({"a": a})]:
+        assert "NA" in repr(obj) and "NaN" not in repr(obj)
+        assert "NA" in str(obj) and "NaN" not in str(obj)
+        if hasattr(obj, "_repr_html_"):
+            html_repr = obj._repr_html_()
+            assert "NA" in html_repr and "NaN" not in html_repr
+
+
 def test_none_to_nan():
     a = pd.arrays.StringArray._from_sequence(["a", None, "b"])
     assert a[1] is not None

From 33fd3e09eb9123e93856aee7ddad1f8d8e5f6c62 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 19 Nov 2019 10:24:49 -0600
Subject: [PATCH 11/18] remove redundant test

---
 pandas/tests/test_strings.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index c3494f9f4d9a7..1261c3bbc86db 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -3527,14 +3527,6 @@ def test_string_array(any_string_method):
     tm.assert_equal(result, expected)
 
 
-@pytest.mark.xfail(reason="not implemented yet")
-def test_string_dtype_numeric():
-    s = Series(["a", "aa", None], dtype="string")
-    result = s.str.count("a")
-    expected = Series([1, 2, None], dtype="Int64")
-    tm.assert_series_equal(result, expected)
-
-
 @pytest.mark.parametrize(
     "method,expected",
     [

From 289c8857b2646b2890fc079f37785adc4dd27463 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 19 Nov 2019 10:26:23 -0600
Subject: [PATCH 12/18] remove dead code

---
 pandas/core/strings.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index e3143cd9dc8c0..413e7e85eb6fe 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -201,13 +201,6 @@ def _map_object(f, arr, na_mask=False, na_value=np.nan, dtype=object):
     if na_mask:
         mask = isna(arr)
         convert = not np.all(mask)
-        if convert:
-            # XXX: This converts pd.NA to np.nan to match the output of
-            # object-dtype ops that return numeric, like str.count
-            # We probably want to return Int64Dtype instead.
-            # NA -> nan
-            arr[mask] = np.nan
-
         try:
             result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert)
         except (TypeError, AttributeError) as e:

From f8208db7f92a80a1ac8a7a1295d8b07a48306a10 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Thu, 21 Nov 2019 16:57:40 +0100
Subject: [PATCH 13/18] fix divmod

---
 pandas/_libs/missing.pyx              | 11 +++++++----
 pandas/tests/scalar/test_na_scalar.py |  5 ++++-
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx
index 37e7fa1520cfd..5a2874a708e6e 100644
--- a/pandas/_libs/missing.pyx
+++ b/pandas/_libs/missing.pyx
@@ -286,11 +286,14 @@ cdef inline bint is_null_period(v):
 # Implementation of NA singleton
 
 
-def _create_binary_propagating_op(name):
+def _create_binary_propagating_op(name, divmod=False):
 
     def method(self, other):
         if isinstance(other, numbers.Number) or other is NA or isinstance(other, str):
-            return NA
+            if divmod:
+                return NA, NA
+            else:
+                return NA
 
         return NotImplemented
 
@@ -359,8 +362,8 @@ class NAType(C_NAType):
     __rfloordiv__ = _create_binary_propagating_op("__rfloordiv__")
     __mod__ = _create_binary_propagating_op("__mod__")
     __rmod__ = _create_binary_propagating_op("__rmod__")
-    __divmod__ = _create_binary_propagating_op("__divmod__")
-    __rdivmod__ = _create_binary_propagating_op("__rdivmod__")
+    __divmod__ = _create_binary_propagating_op("__divmod__", divmod=True)
+    __rdivmod__ = _create_binary_propagating_op("__rdivmod__", divmod=True)
     __pow__ = _create_binary_propagating_op("__pow__")
     __rpow__ = _create_binary_propagating_op("__rpow__")
     # __lshift__ and __rshift__ are not implemented
diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py
index fdf70495c45ac..e68e49814245f 100644
--- a/pandas/tests/scalar/test_na_scalar.py
+++ b/pandas/tests/scalar/test_na_scalar.py
@@ -40,7 +40,10 @@ def test_arithmetic_ops(all_arithmetic_functions):
     for other in [NA, 1, 1.0, "a", np.int64(1), np.nan]:
         if op.__name__ == "rmod" and isinstance(other, str):
             continue
-        assert op(NA, other) is NA
+        if op.__name__ in ("divmod", "rdivmod"):
+            assert [x is NA for x in op(NA, other)] == [True, True]
+        else:
+            assert op(NA, other) is NA
 
 
 def test_comparison_ops():

From 1fcf4b7400a1419ea272140fa480dcad94af338b Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 25 Nov 2019 18:55:19 +0100
Subject: [PATCH 14/18] NA -> C_NA

---
 pandas/_libs/missing.pyx | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx
index 5a2874a708e6e..9bf955ad369e7 100644
--- a/pandas/_libs/missing.pyx
+++ b/pandas/_libs/missing.pyx
@@ -289,7 +289,7 @@ cdef inline bint is_null_period(v):
 def _create_binary_propagating_op(name, divmod=False):
 
     def method(self, other):
-        if isinstance(other, numbers.Number) or other is NA or isinstance(other, str):
+        if other is C_NA or isinstance(other, str) or isinstance(other, numbers.Number):
             if divmod:
                 return NA, NA
             else:
@@ -387,7 +387,7 @@ class NAType(C_NAType):
     def __and__(self, other):
         if other is False:
             return False
-        elif other is True or other is NA:
+        elif other is True or other is C_NA:
             return NA
         else:
             return NotImplemented
@@ -397,7 +397,7 @@ class NAType(C_NAType):
     def __or__(self, other):
         if other is True:
             return True
-        elif other is False or other is NA:
+        elif other is False or other is C_NA:
             return NA
         else:
             return NotImplemented
@@ -405,7 +405,7 @@ class NAType(C_NAType):
     __ror__ = __or__
 
     def __xor__(self, other):
-        if other is False or other is True or other is NA:
+        if other is False or other is True or other is C_NA:
             return NA
         return NotImplemented
 

From f6798e5a6e92b7581cccd44aaa750c6ecb9abd2c Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 26 Nov 2019 19:03:01 +0100
Subject: [PATCH 15/18] start some docs

---
 doc/source/user_guide/missing_data.rst | 114 ++++++++++++++++++++++++-
 doc/source/whatsnew/v1.0.0.rst         |  40 +++++++++
 2 files changed, 150 insertions(+), 4 deletions(-)

diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst
index 6c36a6470f841..9b6851c44c5bc 100644
--- a/doc/source/user_guide/missing_data.rst
+++ b/doc/source/user_guide/missing_data.rst
@@ -12,10 +12,10 @@ pandas.
 .. note::
 
     The choice of using ``NaN`` internally to denote missing data was largely
-    for simplicity and performance reasons. It differs from the MaskedArray
-    approach of, for example, :mod:`scikits.timeseries`. We are hopeful that
-    NumPy will soon be able to provide a native NA type solution (similar to R)
-    performant enough to be used in pandas.
+    for simplicity and performance reasons.
+    Starting from pandas 1.0, some optional data types start experimenting
+    with a native ``NA`` scalar using a mask-based approach. See
+    :ref:`here <missing_data.NA>` for more.
 
 See the :ref:`cookbook<cookbook.missing_data>` for some advanced strategies.
 
@@ -771,3 +771,109 @@ the ``dtype="Int64"``.
    s
 
 See :ref:`integer_na` for more.
+
+
+.. _missing_data.NA:
+
+Experimental ``NA`` scalar to denote missing values
+---------------------------------------------------
+
+.. warning::
+
+   Experimental: the behaviour of ``pd.NA`` can still change without warning.
+
+.. versionadded:: 1.0.0
+
+Starting from pandas 1.0, an experimental ``pd.NA`` value (singleton) is
+available to represent scalar missing values. At this moment, it is used in
+the nullable integer and boolean data types and the dedicated string data type
+as the missing value indicator.
+
+The goal of ``pd.NA`` is provide a "missing" indicator that can be used
+consistently accross data types (instead of ``np.nan``, ``None`` or ``pd.NaT``
+depending on the data type).
+
+For example, when having missing values in a Series with the nullable integer
+dtype will use ``pd.NA``:
+
+.. ipython:: python
+
+    s = pd.Series([1, 2, None], dtype="Int64")
+    s
+    s[2]
+
+Propagation in arithmetic and comparison operations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In general, missing values *propagate* in operations involving ``pd.NA``. When
+one of the operands is unknown, the outcome of the operation is also unknown.
+
+For example, ``pd.NA`` propagates in arithmetic operations, similarly to
+``np.nan``:
+
+.. ipython:: python
+
+   pd.NA + 1
+   "a" * pd.NA
+
+In equality and comparison operations, ``pd.NA`` also propagates. This deviates
+from the behaviour of ``np.nan``, where comparisons with ``np.nan`` always
+return ``False``.
+
+.. ipython:: python
+
+   pd.NA == 1
+   pd.NA == pd.NA
+   pd.NA < 2.5
+
+To check if a value is equal to ``pd.NA``, the :func:`isna` function can be
+used:
+
+.. ipython:: python
+
+   pd.isna(pd.NA)
+
+Logical operations
+~~~~~~~~~~~~~~~~~~
+
+For logical operations, ``pd.NA`` follows the rules of the
+`three-valued logic <https://en.wikipedia.org/wiki/Three-valued_logic> `__ (or
+*Kleene logic*, similarly to R, SQL and Julia). This logic means to only
+propagate missing values when it is logically required.
+
+For example, for the logical "or" operation (``|``), if one of the operands
+is ``True``, we already know the result will be ``True``, regardless of the
+other values (so regardless the missing value would be ``True`` or ``False``).
+In this case, ``pd.NA`` does not propagate:
+
+.. ipython:: python
+
+   True | False
+   True | pd.NA
+   pd.NA | True
+
+On the other hand, if one of the operands is ``False``, the result depends
+on the value of the other operand. Therefore, in thise case ``pd.NA``
+propagates:
+
+.. ipython:: python
+
+   False | True
+   False | True
+   False | pd.NA
+
+The behaviour of the logical "and" operation (``&``) can be derived using
+similar logic (where now ``pd.NA`` will not propagate if one of the operands
+is already ``False``):
+
+.. ipython:: python
+
+   False & True
+   False & False
+   False & pd.NA
+
+.. ipython:: python
+
+   True & True
+   True & False
+   True & pd.NA
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 7d11d90eeb670..2f8f58159066a 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -102,6 +102,46 @@ String accessor methods returning integers will return a value with :class:`Int6
 We recommend explicitly using the ``string`` data type when working with strings.
 See :ref:`text.types` for more.
 
+.. _whatsnew_100.NA:
+
+Experimental ``NA`` scalar to denote missing values
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A new ``pd.NA`` value (singleton) is introduced to represent scalar missing
+values. Up to now, ``np.nan`` is used for this for float data, ``np.nan``or
+``None`` for object-dtype data and ``pd.NaT`` for datetime-like data. The
+goal of ``pd.NA`` is provide a "missing" indicator that can be used
+consistently accross data types. For now, the nullable integer and boolean
+data types and the new string data type make use of ``pd.NA`` (:issue:`28095`).
+
+For example, creating a Series using the nullable integer dtype:
+
+.. ipython:: python
+
+    s = pd.Series([1, 2, None], dtype="Int64")
+    s
+    s[2]
+
+Compared to ``np.nan``, ``pd.NA`` behaves differently in certain operations.
+In addition to arithmetic operations, ``pd.NA`` also propagates as "missing"
+or "unknown" in comparison operations:
+
+.. ipython:: python
+
+    np.nan > 1
+    pd.NA > 1
+
+For logical operations, ``pd.NA`` follows the rules of the
+`three-valued logic <https://en.wikipedia.org/wiki/Three-valued_logic> `__ (or
+*Kleene logic*). For example:
+
+.. ipython:: python
+
+    pd.NA | True
+
+For more, see the :ref:`NA section <missing_data.NA>` in the user guide on missing
+data.
+
 .. _whatsnew_100.boolean:
 
 Boolean data type with missing values support

From 14c143452625e8be013bbc85031e6b531132e145 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Wed, 27 Nov 2019 08:37:58 +0100
Subject: [PATCH 16/18] further doc updates

---
 doc/source/user_guide/missing_data.rst | 40 ++++++++++++++++++++------
 doc/source/whatsnew/v1.0.0.rst         |  6 +++-
 2 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst
index 9b6851c44c5bc..56444fcd3cb05 100644
--- a/doc/source/user_guide/missing_data.rst
+++ b/doc/source/user_guide/missing_data.rst
@@ -110,7 +110,7 @@ pandas objects provide compatibility between ``NaT`` and ``NaN``.
 .. _missing.inserting:
 
 Inserting missing data
-----------------------
+~~~~~~~~~~~~~~~~~~~~~~
 
 You can insert missing values by simply assigning to containers. The
 actual missing value used will be chosen based on the dtype.
@@ -135,9 +135,10 @@ For object containers, pandas will use the value given:
    s.loc[1] = np.nan
    s
 
+.. _missing_data.calculations:
 
 Calculations with missing data
-------------------------------
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Missing values propagate naturally through arithmetic operations between pandas
 objects.
@@ -776,7 +777,7 @@ See :ref:`integer_na` for more.
 .. _missing_data.NA:
 
 Experimental ``NA`` scalar to denote missing values
----------------------------------------------------
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. warning::
 
@@ -794,16 +795,17 @@ consistently accross data types (instead of ``np.nan``, ``None`` or ``pd.NaT``
 depending on the data type).
 
 For example, when having missing values in a Series with the nullable integer
-dtype will use ``pd.NA``:
+dtype, it will use ``pd.NA``:
 
 .. ipython:: python
 
     s = pd.Series([1, 2, None], dtype="Int64")
     s
     s[2]
+    s[2] is pd.NA
 
 Propagation in arithmetic and comparison operations
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+---------------------------------------------------
 
 In general, missing values *propagate* in operations involving ``pd.NA``. When
 one of the operands is unknown, the outcome of the operation is also unknown.
@@ -833,8 +835,12 @@ used:
 
    pd.isna(pd.NA)
 
+An exception to this basic propagation rule are *reductions* (such as the
+mean or the minimum), where pandas defaults to skipping missing values. See
+:ref:`above <missing_data.calculations>` for more.
+
 Logical operations
-~~~~~~~~~~~~~~~~~~
+------------------
 
 For logical operations, ``pd.NA`` follows the rules of the
 `three-valued logic <https://en.wikipedia.org/wiki/Three-valued_logic> `__ (or
@@ -843,7 +849,7 @@ propagate missing values when it is logically required.
 
 For example, for the logical "or" operation (``|``), if one of the operands
 is ``True``, we already know the result will be ``True``, regardless of the
-other values (so regardless the missing value would be ``True`` or ``False``).
+other value (so regardless of the missing value being ``True`` or ``False``).
 In this case, ``pd.NA`` does not propagate:
 
 .. ipython:: python
@@ -859,7 +865,7 @@ propagates:
 .. ipython:: python
 
    False | True
-   False | True
+   False | False
    False | pd.NA
 
 The behaviour of the logical "and" operation (``&``) can be derived using
@@ -877,3 +883,21 @@ is already ``False``):
    True & True
    True & False
    True & pd.NA
+
+
+``NA`` in a boolean context
+---------------------------
+
+Since the actual value of an NA is unknown, it is ambiguous to convert NA
+to a boolean value. The following raises an error:
+
+.. ipython:: python
+   :okexception:
+
+   bool(pd.NA)
+
+This also means that ``pd.NA`` cannot be used in a context where it is
+evaluated to a boolean, such as ``if condition: ...`` where ``condition`` can
+potentially be ``pd.NA``. In such cases, :func:`isna` can be used to check
+for ``pd.NA`` or it could be prevented that ``condition`` can be ``pd.NA``
+by for example filling missing values beforehand.
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 2f8f58159066a..5c195eda28416 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -108,12 +108,16 @@ Experimental ``NA`` scalar to denote missing values
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 A new ``pd.NA`` value (singleton) is introduced to represent scalar missing
-values. Up to now, ``np.nan`` is used for this for float data, ``np.nan``or
+values. Up to now, ``np.nan`` is used for this for float data, ``np.nan`` or
 ``None`` for object-dtype data and ``pd.NaT`` for datetime-like data. The
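-goal of ``pd.NA`` is provide a "missing" indicator that can be used
-consistently accross data types. For now, the nullable integer and boolean
+goal of ``pd.NA`` is to provide a "missing" indicator that can be used
+consistently across data types. For now, the nullable integer and boolean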
 data types and the new string data type make use of ``pd.NA`` (:issue:`28095`).
 
+.. warning::
+
+   Experimental: the behaviour of ``pd.NA`` can still change without warning.
+
 For example, creating a Series using the nullable integer dtype:
 
 .. ipython:: python

From 1bcbab2d90f60cdd86af615fa86d3a8c35919916 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Wed, 27 Nov 2019 09:56:33 +0100
Subject: [PATCH 17/18] doc fixup

---
 doc/source/user_guide/missing_data.rst | 4 ++--
 doc/source/whatsnew/v1.0.0.rst         | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst
index 56444fcd3cb05..d81a62a27507c 100644
--- a/doc/source/user_guide/missing_data.rst
+++ b/doc/source/user_guide/missing_data.rst
@@ -843,7 +843,7 @@ Logical operations
 ------------------
 
 For logical operations, ``pd.NA`` follows the rules of the
-`three-valued logic <https://en.wikipedia.org/wiki/Three-valued_logic> `__ (or
+`three-valued logic <https://en.wikipedia.org/wiki/Three-valued_logic>`__ (or
 *Kleene logic*, similarly to R, SQL and Julia). This logic means to only
 propagate missing values when it is logically required.
 
@@ -892,7 +892,7 @@ Since the actual value of an NA is unknown, it is ambiguous to convert NA
 to a boolean value. The following raises an error:
 
 .. ipython:: python
-   :okexception:
+   :okexcept:
 
    bool(pd.NA)
 
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index e18a8706ef35f..704f240ed32a1 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -136,7 +136,7 @@ or "unknown" in comparison operations:
     pd.NA > 1
 
 For logical operations, ``pd.NA`` follows the rules of the
-`three-valued logic <https://en.wikipedia.org/wiki/Three-valued_logic> `__ (or
+`three-valued logic <https://en.wikipedia.org/wiki/Three-valued_logic>`__ (or
 *Kleene logic*). For example:
 
 .. ipython:: python

From 589a961100f6e2674dd6101a5e4cde8edf31fd03 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Thu, 28 Nov 2019 13:30:23 +0100
Subject: [PATCH 18/18] further doc updates

---
 doc/source/user_guide/missing_data.rst | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst
index d81a62a27507c..11957cfa265f5 100644
--- a/doc/source/user_guide/missing_data.rst
+++ b/doc/source/user_guide/missing_data.rst
@@ -787,9 +787,9 @@ Experimental ``NA`` scalar to denote missing values
 
 Starting from pandas 1.0, an experimental ``pd.NA`` value (singleton) is
 available to represent scalar missing values. At this moment, it is used in
-the nullable integer and boolean data types and the dedicated string data type
-as the missing value indicator.
+the nullable :doc:`integer <integer_na>`, boolean and
+:ref:`dedicated string <text.types>` data types as the missing value indicator.
 
-The goal of ``pd.NA`` is provide a "missing" indicator that can be used
-consistently accross data types (instead of ``np.nan``, ``None`` or ``pd.NaT``
+The goal of ``pd.NA`` is to provide a "missing" indicator that can be used
+consistently across data types (instead of ``np.nan``, ``None`` or ``pd.NaT``
 depending on the data type).
@@ -804,6 +804,10 @@ dtype, it will use ``pd.NA``:
     s[2]
     s[2] is pd.NA
 
+Currently, pandas does not yet use those data types by default (when creating
+a DataFrame or Series, or when reading in data), so you need to specify
+the dtype explicitly.
+
 Propagation in arithmetic and comparison operations
 ---------------------------------------------------
 
@@ -859,7 +863,7 @@ In this case, ``pd.NA`` does not propagate:
    pd.NA | True
 
 On the other hand, if one of the operands is ``False``, the result depends
-on the value of the other operand. Therefore, in thise case ``pd.NA``
+on the value of the other operand. Therefore, in this case ``pd.NA``
 propagates:
 
 .. ipython:: python
@@ -899,5 +903,8 @@ to a boolean value. The following raises an error:
 This also means that ``pd.NA`` cannot be used in a context where it is
 evaluated to a boolean, such as ``if condition: ...`` where ``condition`` can
 potentially be ``pd.NA``. In such cases, :func:`isna` can be used to check
-for ``pd.NA`` or it could be prevented that ``condition`` can be ``pd.NA``
-by for example filling missing values beforehand.
+for ``pd.NA`` or ``condition`` being ``pd.NA`` can be avoided, for example by
+filling missing values beforehand.
+
+A similar situation occurs when using Series or DataFrame objects in ``if``
+statements, see :ref:`gotchas.truth`.