Skip to content

ENH: Nullable integer/boolean/floating support in lib inferencing functions #40914

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
May 5, 2021
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions pandas/_libs/lib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ from typing import (
Any,
Callable,
Generator,
Literal,
overload,
)

import numpy as np
Expand Down Expand Up @@ -70,13 +72,25 @@ def maybe_convert_objects(
convert_to_nullable_integer: bool = False,
) -> ArrayLike: ...

# Overload for the default call: convert_to_masked_nullable is omitted or
# passed as a literal False, and a single array is returned.
@overload
def maybe_convert_numeric(
    values: np.ndarray, # np.ndarray[object]
    na_values: set,
    convert_empty: bool = True,
    coerce_numeric: bool = False,
    convert_to_masked_nullable: Literal[False] = ...,
) -> np.ndarray: ...

# Overload for the masked call: convert_to_masked_nullable is keyword-only
# here and must be a literal True; the declared return is a pair of arrays
# (the implementation returns the converted values together with their mask).
@overload
def maybe_convert_numeric(
    values: np.ndarray, # np.ndarray[object]
    na_values: set,
    convert_empty: bool = True,
    coerce_numeric: bool = False,
    *,
    convert_to_masked_nullable: Literal[True],
) -> tuple[np.ndarray, np.ndarray]: ...

# TODO: restrict `arr`?
def ensure_string_array(
arr,
Expand Down
62 changes: 54 additions & 8 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2008,7 +2008,8 @@ def maybe_convert_numeric(
set na_values,
bint convert_empty=True,
bint coerce_numeric=False,
) -> ndarray:
bint convert_to_masked_nullable=False,
) -> np.ndarray:
"""
Convert object array to a numeric array if possible.

Expand All @@ -2032,11 +2033,14 @@ def maybe_convert_numeric(
numeric array has no suitable numerical dtype to return (i.e. uint64,
int32, uint8). If set to False, the original object array will be
returned. Otherwise, a ValueError will be raised.

convert_to_masked_nullable : bool, default False
Whether to return a mask for the converted values. When True, integer
values containing nulls are kept as integers (with the nulls recorded in
the mask) instead of being upcast to float64.
Returns
-------
np.ndarray
np.ndarray or tuple of converted values and its mask
Array of converted object values to numerical ones.
For float and integer results, the mask is also returned if
convert_to_masked_nullable is True and a null value was seen.
"""
if len(values) == 0:
return np.array([], dtype='i8')
Expand All @@ -2063,21 +2067,40 @@ def maybe_convert_numeric(
ndarray[int64_t] ints = np.empty(n, dtype='i8')
ndarray[uint64_t] uints = np.empty(n, dtype='u8')
ndarray[uint8_t] bools = np.empty(n, dtype='u1')
ndarray[uint8_t] mask = np.zeros(n, dtype="u1")
float64_t fval
bint allow_null_in_int = convert_to_masked_nullable

for i in range(n):
val = values[i]
# Nulls are recorded only in the mask (without forcing a float result) when
# a) convert_to_masked_nullable = True, and
# b) no floats have been seen so far (assuming an int shows up later).
# However, if no ints are present (all-null array), we need to return floats.
allow_null_in_int = convert_to_masked_nullable and not seen.float_

if val.__hash__ is not None and val in na_values:
seen.saw_null()
if allow_null_in_int:
seen.null_ = True
mask[i] = 1
else:
if convert_to_masked_nullable:
mask[i] = 1
seen.saw_null()
floats[i] = complexes[i] = NaN
elif util.is_float_object(val):
fval = val
if fval != fval:
seen.null_ = True

if allow_null_in_int:
mask[i] = 1
else:
if convert_to_masked_nullable:
mask[i] = 1
seen.float_ = True
else:
seen.float_ = True
floats[i] = complexes[i] = fval
seen.float_ = True
elif util.is_integer_object(val):
floats[i] = complexes[i] = val

Expand All @@ -2100,7 +2123,13 @@ def maybe_convert_numeric(
floats[i] = uints[i] = ints[i] = bools[i] = val
seen.bool_ = True
elif val is None or val is C_NA:
seen.saw_null()
if allow_null_in_int:
seen.null_ = True
mask[i] = 1
else:
if convert_to_masked_nullable:
mask[i] = 1
seen.saw_null()
floats[i] = complexes[i] = NaN
elif hasattr(val, '__len__') and len(val) == 0:
if convert_empty or seen.coerce_numeric:
Expand All @@ -2121,17 +2150,22 @@ def maybe_convert_numeric(
if fval in na_values:
seen.saw_null()
floats[i] = complexes[i] = NaN
mask[i] = 1
else:
if fval != fval:
seen.null_ = True
mask[i] = 1

floats[i] = fval

if maybe_int:
as_int = int(val)

if as_int in na_values:
seen.saw_null()
mask[i] = 1
seen.null_ = True
if not allow_null_in_int:
seen.float_ = True
else:
seen.saw_int(as_int)

Expand Down Expand Up @@ -2161,11 +2195,23 @@ def maybe_convert_numeric(
if seen.check_uint64_conflict():
return values

# Nulls were kept from marking the result as float in anticipation of ints
# that never appeared, so fall back to returning floats.
if allow_null_in_int and seen.null_ and not seen.int_:
seen.float_ = True

if seen.complex_:
return complexes
elif seen.float_:
if seen.null_ and convert_to_masked_nullable:
return (floats, mask.view(np.bool_))
return floats
elif seen.int_:
if seen.null_ and convert_to_masked_nullable:
if seen.uint_:
return (uints, mask.view(np.bool_))
else:
return (ints, mask.view(np.bool_))
if seen.uint_:
return uints
else:
Expand Down
3 changes: 2 additions & 1 deletion pandas/_libs/ops.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,6 @@ def vec_binop(
# Stub for ops.maybe_convert_bool.
#
# convert_to_masked_nullable defaults to False, matching the Cython
# implementation's signature. The implementation returns a
# (values, mask) tuple only when the flag is True and a NaN was seen;
# otherwise it returns a single array (or the input array unchanged when a
# value cannot be interpreted as a boolean).
def maybe_convert_bool(
    arr: np.ndarray,  # np.ndarray[object]
    true_values=...,
    false_values=...,
    convert_to_masked_nullable: bool = False,
) -> np.ndarray | tuple[np.ndarray, np.ndarray]: ...
34 changes: 19 additions & 15 deletions pandas/_libs/ops.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,7 @@ import_array()


from pandas._libs.missing cimport checknull
from pandas._libs.util cimport (
UINT8_MAX,
is_nan,
)
from pandas._libs.util cimport is_nan


@cython.wraparound(False)
Expand Down Expand Up @@ -258,17 +255,21 @@ def vec_binop(object[:] left, object[:] right, object op) -> ndarray:


def maybe_convert_bool(ndarray[object] arr,
true_values=None, false_values=None) -> ndarray:
true_values=None,
false_values=None,
convert_to_masked_nullable=False
) -> np.ndarray | tuple[np.ndarray, np.ndarray]:
cdef:
Py_ssize_t i, n
ndarray[uint8_t] result
ndarray[uint8_t] mask
object val
set true_vals, false_vals
int na_count = 0
bint has_na = False

n = len(arr)
result = np.empty(n, dtype=np.uint8)

mask = np.zeros(n, dtype=np.uint8)
# the defaults
true_vals = {'True', 'TRUE', 'true'}
false_vals = {'False', 'FALSE', 'false'}
Expand All @@ -291,16 +292,19 @@ def maybe_convert_bool(ndarray[object] arr,
result[i] = 1
elif val in false_vals:
result[i] = 0
elif isinstance(val, float):
result[i] = UINT8_MAX
na_count += 1
elif is_nan(val):
mask[i] = 1
result[i] = 0 # Value here doesn't matter, will be replaced w/ nan
has_na = True
else:
return arr

if na_count > 0:
mask = result == UINT8_MAX
arr = result.view(np.bool_).astype(object)
np.putmask(arr, mask, np.nan)
return arr
if has_na:
if convert_to_masked_nullable:
return (result.view(np.bool_), mask.view(np.bool_))
else:
arr = result.view(np.bool_).astype(object)
np.putmask(arr, mask, np.nan)
return arr
else:
return result.view(np.bool_)
Loading