pandas-dev · jreback · May 5, 2021 · Apr 13, 2021 · Apr 18, 2021 · Apr 18, 2021
diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
@@ -5,6 +5,8 @@ from typing import (
     Any,
     Callable,
     Generator,
+    Literal,
+    overload,
 )
 
 import numpy as np
@@ -70,13 +72,25 @@ def maybe_convert_objects(
     convert_to_nullable_integer: bool = False,
 ) -> ArrayLike: ...
 
+@overload
 def maybe_convert_numeric(
     values: np.ndarray,  # np.ndarray[object]
     na_values: set,
     convert_empty: bool = True,
     coerce_numeric: bool = False,
+    convert_to_masked_nullable: Literal[False] = ...,
 ) -> np.ndarray: ...
 
+@overload
+def maybe_convert_numeric(
+    values: np.ndarray,  # np.ndarray[object]
+    na_values: set,
+    convert_empty: bool = True,
+    coerce_numeric: bool = False,
+    *,
+    convert_to_masked_nullable: Literal[True],
+) -> tuple[np.ndarray, np.ndarray]: ...  # (converted values, boolean mask)
+
 # TODO: restrict `arr`?
 def ensure_string_array(
     arr,

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -2008,7 +2008,8 @@ def maybe_convert_numeric(
     set na_values,
     bint convert_empty=True,
     bint coerce_numeric=False,
-) -> ndarray:
+    bint convert_to_masked_nullable=False,
+) -> np.ndarray:
     """
     Convert object array to a numeric array if possible.
 
@@ -2032,11 +2033,14 @@ def maybe_convert_numeric(
         numeric array has no suitable numerical dtype to return (i.e. uint64,
         int32, uint8). If set to False, the original object array will be
         returned. Otherwise, a ValueError will be raised.
-
+    convert_to_masked_nullable : bool, default False
+        Whether to also return a mask; disables upcasting null ints to float64.
+
     Returns
     -------
-    np.ndarray
+    np.ndarray or tuple of converted values and its mask
         Array of converted object values to numerical ones.
+        Also returns mask if convert_to_masked_nullable is True.
     """
     if len(values) == 0:
         return np.array([], dtype='i8')
@@ -2063,21 +2067,40 @@ def maybe_convert_numeric(
         ndarray[int64_t] ints = np.empty(n, dtype='i8')
         ndarray[uint64_t] uints = np.empty(n, dtype='u8')
         ndarray[uint8_t] bools = np.empty(n, dtype='u1')
+        ndarray[uint8_t] mask = np.zeros(n, dtype="u1")
         float64_t fval
+        bint allow_null_in_int = convert_to_masked_nullable
 
     for i in range(n):
         val = values[i]
+        # We only want to stop nulls from being flagged as float if
+        # a) convert_to_masked_nullable is True, and
+        # b) no floats have been seen so far (assuming an int shows up later).
+        # However, if no ints are present (all-null array), we must return floats.
+        allow_null_in_int = convert_to_masked_nullable and not seen.float_
 
         if val.__hash__ is not None and val in na_values:
-            seen.saw_null()
+            if allow_null_in_int:
+                seen.null_ = True
+                mask[i] = 1
+            else:
+                if convert_to_masked_nullable:
+                    mask[i] = 1
+                seen.saw_null()
             floats[i] = complexes[i] = NaN
         elif util.is_float_object(val):
             fval = val
             if fval != fval:
                 seen.null_ = True
-
+                if allow_null_in_int:
+                    mask[i] = 1
+                else:
+                    if convert_to_masked_nullable:
+                        mask[i] = 1
+                    seen.float_ = True
+            else:
+                seen.float_ = True
             floats[i] = complexes[i] = fval
-            seen.float_ = True
         elif util.is_integer_object(val):
             floats[i] = complexes[i] = val
 
@@ -2100,7 +2123,13 @@ def maybe_convert_numeric(
             floats[i] = uints[i] = ints[i] = bools[i] = val
             seen.bool_ = True
         elif val is None or val is C_NA:
-            seen.saw_null()
+            if allow_null_in_int:
+                seen.null_ = True
+                mask[i] = 1
+            else:
+                if convert_to_masked_nullable:
+                    mask[i] = 1
+                seen.saw_null()
             floats[i] = complexes[i] = NaN
         elif hasattr(val, '__len__') and len(val) == 0:
             if convert_empty or seen.coerce_numeric:
@@ -2121,17 +2150,22 @@ def maybe_convert_numeric(
                 if fval in na_values:
                     seen.saw_null()
                     floats[i] = complexes[i] = NaN
+                    mask[i] = 1
                 else:
                     if fval != fval:
                         seen.null_ = True
+                        mask[i] = 1
 
                     floats[i] = fval
 
                 if maybe_int:
                     as_int = int(val)
 
                     if as_int in na_values:
-                        seen.saw_null()
+                        mask[i] = 1
+                        seen.null_ = True
+                        if not allow_null_in_int:
+                            seen.float_ = True
                     else:
                         seen.saw_int(as_int)
 
@@ -2161,11 +2195,23 @@ def maybe_convert_numeric(
     if seen.check_uint64_conflict():
         return values
 
+    # Nulls were kept from setting the float flag in anticipation of ints that
+    # never showed up, so fall back to returning floats.
+    if allow_null_in_int and seen.null_ and not seen.int_:
+        seen.float_ = True
+
     if seen.complex_:
         return complexes
     elif seen.float_:
+        if seen.null_ and convert_to_masked_nullable:
+            return (floats, mask.view(np.bool_))
         return floats
     elif seen.int_:
+        if seen.null_ and convert_to_masked_nullable:
+            if seen.uint_:
+                return (uints, mask.view(np.bool_))
+            else:
+                return (ints, mask.view(np.bool_))
         if seen.uint_:
             return uints
         else:

diff --git a/pandas/_libs/ops.pyi b/pandas/_libs/ops.pyi
@@ -39,5 +39,6 @@ def vec_binop(
 def maybe_convert_bool(
     arr: np.ndarray,  # np.ndarray[object]
     true_values=...,
-    false_values=...
+    false_values=...,
+    convert_to_masked_nullable: bool = False,  # same default as ops.pyx
 ) -> np.ndarray: ...
diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx
@@ -24,10 +24,7 @@ import_array()
 
 
 from pandas._libs.missing cimport checknull
-from pandas._libs.util cimport (
-    UINT8_MAX,
-    is_nan,
-)
+from pandas._libs.util cimport is_nan
 
 
 @cython.wraparound(False)
@@ -258,17 +255,21 @@ def vec_binop(object[:] left, object[:] right, object op) -> ndarray:
 
 
 def maybe_convert_bool(ndarray[object] arr,
-                       true_values=None, false_values=None) -> ndarray:
+                       true_values=None,
+                       false_values=None,
+                       convert_to_masked_nullable=False
+                       ) -> np.ndarray | tuple[np.ndarray, np.ndarray]:
     cdef:
         Py_ssize_t i, n
         ndarray[uint8_t] result
+        ndarray[uint8_t] mask
         object val
         set true_vals, false_vals
-        int na_count = 0
+        bint has_na = False
 
     n = len(arr)
     result = np.empty(n, dtype=np.uint8)
-
+    mask = np.zeros(n, dtype=np.uint8)
     # the defaults
     true_vals = {'True', 'TRUE', 'true'}
     false_vals = {'False', 'FALSE', 'false'}
@@ -291,16 +292,19 @@ def maybe_convert_bool(ndarray[object] arr,
             result[i] = 1
         elif val in false_vals:
             result[i] = 0
-        elif isinstance(val, float):
-            result[i] = UINT8_MAX
-            na_count += 1
+        elif is_nan(val):
+            mask[i] = 1
+            result[i] = 0  # Value here doesn't matter; it is masked or replaced w/ nan
+            has_na = True
         else:
             return arr
 
-    if na_count > 0:
-        mask = result == UINT8_MAX
-        arr = result.view(np.bool_).astype(object)
-        np.putmask(arr, mask, np.nan)
-        return arr
+    if has_na:
+        if convert_to_masked_nullable:
+            return (result.view(np.bool_), mask.view(np.bool_))
+        else:
+            arr = result.view(np.bool_).astype(object)
+            np.putmask(arr, mask, np.nan)
+            return arr
     else:
         return result.view(np.bool_)