diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 096040bb85a10..22965f8d45d4b 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -24,6 +24,8 @@ New features `_ on most readers and writers (:issue:`13823`) - Added ``__fspath__`` method to :class:`~pandas.HDFStore`, :class:`~pandas.ExcelFile`, and :class:`~pandas.ExcelWriter` to work properly with the file system path protocol (:issue:`13823`) +- Added ``skipna`` parameter :func:`~pandas.api.types.infer_dtype` to support + type inference in the presence of missing values (:issue:`17059`). .. _whatsnew_0210.enhancements.infer_objects: diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index 38e95fe6ee652..8026d5097d943 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -222,7 +222,7 @@ cdef _try_infer_map(v): return None -def infer_dtype(object value): +def infer_dtype(object value, bint skipna=False): """ Effeciently infer the type of a passed val, or list-like array of values. Return a string describing the type. @@ -230,6 +230,8 @@ def infer_dtype(object value): Parameters ---------- value : scalar, list, ndarray, or pandas type + skipna : bool, default False + Ignore NaN values when inferring the type. Returns ------- @@ -272,6 +274,9 @@ def infer_dtype(object value): >>> infer_dtype(['foo', 'bar']) 'string' + >>> infer_dtype(['a', np.nan, 'b'], skipna=True) + 'string' + >>> infer_dtype([b'foo', b'bar']) 'bytes' @@ -310,7 +315,6 @@ def infer_dtype(object value): >>> infer_dtype(pd.Series(list('aabc')).astype('category')) 'categorical' - """ cdef: Py_ssize_t i, n @@ -356,7 +360,7 @@ def infer_dtype(object value): values = values.ravel() # try to use a valid value - for i from 0 <= i < n: + for i in range(n): val = util.get_value_1d(values, i) # do not use is_nul_datetimelike to keep @@ -403,11 +407,11 @@ def infer_dtype(object value): return 'datetime' elif is_date(val): - if is_date_array(values): + if is_date_array(values, skipna=skipna): return 'date' elif is_time(val): - if is_time_array(values): + if is_time_array(values, skipna=skipna): return 'time' elif is_decimal(val): @@ -420,19 +424,19 @@ def infer_dtype(object value): return 'mixed-integer-float' elif util.is_bool_object(val): - if is_bool_array(values): + if is_bool_array(values, skipna=skipna): return 'boolean' elif PyString_Check(val): - if is_string_array(values): + if is_string_array(values, skipna=skipna): return 'string' elif PyUnicode_Check(val): - if is_unicode_array(values): + if is_unicode_array(values, skipna=skipna): return 'unicode' elif PyBytes_Check(val): - if is_bytes_array(values): + if is_bytes_array(values, skipna=skipna): return 'bytes' elif is_period(val): @@ -593,190 +597,267 @@ cdef inline bint is_timedelta(object o): return PyDelta_Check(o) or util.is_timedelta64_object(o) -cpdef bint is_bool_array(ndarray values): +cdef class Validator: + cdef: - Py_ssize_t i, n = len(values) - ndarray[object] objbuf + Py_ssize_t n + object dtype + bint skipna - if issubclass(values.dtype.type, np.bool_): - return True - elif values.dtype == np.object_: - objbuf = values + def __cinit__(self, Py_ssize_t n, object dtype=None, bint skipna=False): + self.n = n + self.dtype = dtype if dtype is not None else np.dtype(np.object_) + self.skipna = skipna + + cdef bint validate(self, object[:] values): + if not self.n: + return False - if n == 0: + if self.is_array_typed(): + return True + elif self.dtype.type == np.object_: + if self.skipna: + 
return self._validate_skipna(values) + else: + return self._validate(values) + else: return False + cdef bint _validate(self, object[:] values): + cdef: + Py_ssize_t i + Py_ssize_t n = self.n + + for i in range(n): + if not self.is_valid(values[i]): + return False + return self.finalize() + + cdef bint _validate_skipna(self, object[:] values): + cdef: + Py_ssize_t i + Py_ssize_t n = self.n + for i in range(n): - if not util.is_bool_object(objbuf[i]): + if not self.is_valid_skipna(values[i]): return False + return self.finalize() + + cdef bint is_valid(self, object value) except -1: + return self.is_value_typed(value) + + cdef bint is_valid_skipna(self, object value) except -1: + return self.is_valid(value) or self.is_valid_null(value) + + cdef bint is_value_typed(self, object value) except -1: + raise NotImplementedError( + 'is_value_typed must be implemented in subclasses' + ) + + cdef bint is_valid_null(self, object value) except -1: + return util._checknull(value) + + cdef bint is_array_typed(self) except -1: + raise NotImplementedError( + 'is_array_typed must be implemented in subclasses' + ) + + cdef bint finalize(self): return True - else: - return False + + +cdef class BoolValidator(Validator): + + cdef bint is_value_typed(self, object value) except -1: + return util.is_bool_object(value) + + cdef bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.bool_) + + +cpdef bint is_bool_array(ndarray values, bint skipna=False): + cdef: + BoolValidator validator = BoolValidator( + len(values), + values.dtype, + skipna=skipna + ) + return validator.validate(values) + + +cdef class IntegerValidator(Validator): + + cdef bint is_value_typed(self, object value) except -1: + return util.is_integer_object(value) + + cdef bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.integer) cpdef bint is_integer_array(ndarray values): cdef: - Py_ssize_t i, n = len(values) - ndarray[object] objbuf + IntegerValidator validator = IntegerValidator( + len(values), + values.dtype, + ) + return validator.validate(values) - if issubclass(values.dtype.type, np.integer): - return True - elif values.dtype == np.object_: - objbuf = values - if n == 0: - return False +cdef class IntegerFloatValidator(Validator): - for i in range(n): - if not util.is_integer_object(objbuf[i]): - return False - return True - else: - return False + cdef bint is_value_typed(self, object value) except -1: + return util.is_integer_object(value) or util.is_float_object(value) + + cdef bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.integer) cpdef bint is_integer_float_array(ndarray values): cdef: - Py_ssize_t i, n = len(values) - ndarray[object] objbuf + IntegerFloatValidator validator = IntegerFloatValidator( + len(values), + values.dtype, + ) + return validator.validate(values) - if issubclass(values.dtype.type, np.integer): - return True - elif values.dtype == np.object_: - objbuf = values - if n == 0: - return False +cdef class FloatValidator(Validator): - for i in range(n): - if not (util.is_integer_object(objbuf[i]) or - util.is_float_object(objbuf[i])): + cdef bint is_value_typed(self, object value) except -1: + return util.is_float_object(value) - return False - return True - else: - return False + cdef bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.floating) cpdef bint is_float_array(ndarray values): - cdef: - Py_ssize_t i, n = len(values) - ndarray[object] objbuf + cdef FloatValidator validator = FloatValidator(len(values), 
values.dtype) + return validator.validate(values) - if issubclass(values.dtype.type, np.floating): - return True - elif values.dtype == np.object_: - objbuf = values - if n == 0: - return False +cdef class StringValidator(Validator): - for i in range(n): - if not util.is_float_object(objbuf[i]): - return False - return True - else: - return False + cdef bint is_value_typed(self, object value) except -1: + return PyString_Check(value) + + cdef bint is_array_typed(self) except -1: + return ( + PY2 and issubclass(self.dtype.type, np.string_) + ) or not PY2 and issubclass(self.dtype.type, np.unicode_) -cpdef bint is_string_array(ndarray values): +cpdef bint is_string_array(ndarray values, bint skipna=False): cdef: - Py_ssize_t i, n = len(values) - ndarray[object] objbuf + StringValidator validator = StringValidator( + len(values), + values.dtype, + skipna=skipna, + ) + return validator.validate(values) - if ((PY2 and issubclass(values.dtype.type, np.string_)) or - not PY2 and issubclass(values.dtype.type, np.unicode_)): - return True - elif values.dtype == np.object_: - objbuf = values - if n == 0: - return False +cdef class UnicodeValidator(Validator): - for i in range(n): - if not PyString_Check(objbuf[i]): - return False - return True - else: - return False + cdef bint is_value_typed(self, object value) except -1: + return PyUnicode_Check(value) + + cdef bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.unicode_) -cpdef bint is_unicode_array(ndarray values): +cpdef bint is_unicode_array(ndarray values, bint skipna=False): cdef: - Py_ssize_t i, n = len(values) - ndarray[object] objbuf + UnicodeValidator validator = UnicodeValidator( + len(values), + values.dtype, + skipna=skipna, + ) + return validator.validate(values) - if issubclass(values.dtype.type, np.unicode_): - return True - elif values.dtype == np.object_: - objbuf = values - if n == 0: - return False +cdef class BytesValidator(Validator): - for i in range(n): - if not PyUnicode_Check(objbuf[i]): - return False - return True - else: - return False + cdef bint is_value_typed(self, object value) except -1: + return PyBytes_Check(value) + + cdef bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.bytes_) -cpdef bint is_bytes_array(ndarray values): +cpdef bint is_bytes_array(ndarray values, bint skipna=False): cdef: - Py_ssize_t i, n = len(values) - ndarray[object] objbuf + BytesValidator validator = BytesValidator( + len(values), + values.dtype, + skipna=skipna + ) + return validator.validate(values) - if issubclass(values.dtype.type, np.bytes_): - return True - elif values.dtype == np.object_: - objbuf = values - if n == 0: - return False +cdef class TemporalValidator(Validator): - for i in range(n): - if not PyBytes_Check(objbuf[i]): - return False - return True - else: + cdef Py_ssize_t generic_null_count + + def __cinit__(self, Py_ssize_t n, object dtype=None, bint skipna=False): + self.n = n + self.dtype = dtype if dtype is not None else np.dtype(np.object_) + self.skipna = skipna + self.generic_null_count = 0 + + cdef bint is_valid(self, object value) except -1: + return self.is_value_typed(value) or self.is_valid_null(value) + + cdef bint is_value_typed(self, object value) except -1: + raise NotImplementedError() + + cdef bint is_valid_null(self, object value) except -1: + raise NotImplementedError() + + cdef bint is_valid_skipna(self, object value) except -1: + cdef: + bint is_typed_null = self.is_valid_null(value) + bint is_generic_null = util._checknull(value) + 
self.generic_null_count += is_typed_null and is_generic_null + return self.is_value_typed(value) or is_typed_null or is_generic_null + + cdef bint is_array_typed(self) except -1: return False + cdef bint finalize(self): + return self.generic_null_count != self.n + + +cdef class DatetimeValidator(TemporalValidator): + + cdef bint is_value_typed(self, object value) except -1: + return is_datetime(value) + + cdef bint is_valid_null(self, object value) except -1: + return is_null_datetime64(value) + cpdef bint is_datetime_array(ndarray[object] values): - cdef Py_ssize_t i, null_count = 0, n = len(values) - cdef object v - if n == 0: - return False + cdef: + DatetimeValidator validator = DatetimeValidator( + len(values), + skipna=True, + ) + return validator.validate(values) - # return False for all nulls - for i in range(n): - v = values[i] - if is_null_datetime64(v): - # we are a regular null - if util._checknull(v): - null_count += 1 - elif not is_datetime(v): - return False - return null_count != n +cdef class Datetime64Validator(DatetimeValidator): -cpdef bint is_datetime64_array(ndarray values): - cdef Py_ssize_t i, null_count = 0, n = len(values) - cdef object v - if n == 0: - return False + cdef bint is_value_typed(self, object value) except -1: + return util.is_datetime64_object(value) - # return False for all nulls - for i in range(n): - v = values[i] - if is_null_datetime64(v): - # we are a regular null - if util._checknull(v): - null_count += 1 - elif not util.is_datetime64_object(v): - return False - return null_count != n + +cpdef bint is_datetime64_array(ndarray values): + cdef: + Datetime64Validator validator = Datetime64Validator( + len(values), + skipna=True, + ) + return validator.validate(values) cpdef bint is_datetime_with_singletz_array(ndarray[object] values): @@ -807,108 +888,110 @@ cpdef bint is_datetime_with_singletz_array(ndarray[object] values): return True +cdef class TimedeltaValidator(TemporalValidator): + + cdef bint is_value_typed(self, object value) except -1: + return PyDelta_Check(value) + + cdef bint is_valid_null(self, object value) except -1: + return is_null_timedelta64(value) + + cpdef bint is_timedelta_array(ndarray values): - cdef Py_ssize_t i, null_count = 0, n = len(values) - cdef object v - if n == 0: - return False - for i in range(n): - v = values[i] - if is_null_timedelta64(v): - # we are a regular null - if util._checknull(v): - null_count += 1 - elif not PyDelta_Check(v): - return False - return null_count != n + cdef: + TimedeltaValidator validator = TimedeltaValidator( + len(values), + skipna=True, + ) + return validator.validate(values) + + +cdef class Timedelta64Validator(TimedeltaValidator): + + cdef bint is_value_typed(self, object value) except -1: + return util.is_timedelta64_object(value) cpdef bint is_timedelta64_array(ndarray values): - cdef Py_ssize_t i, null_count = 0, n = len(values) - cdef object v - if n == 0: - return False - for i in range(n): - v = values[i] - if is_null_timedelta64(v): - # we are a regular null - if util._checknull(v): - null_count += 1 - elif not util.is_timedelta64_object(v): - return False - return null_count != n + cdef: + Timedelta64Validator validator = Timedelta64Validator( + len(values), + skipna=True, + ) + return validator.validate(values) + + +cdef class AnyTimedeltaValidator(TimedeltaValidator): + + cdef bint is_value_typed(self, object value) except -1: + return is_timedelta(value) cpdef bint is_timedelta_or_timedelta64_array(ndarray values): """ infer with timedeltas and/or nat/none """ - cdef 
Py_ssize_t i, null_count = 0, n = len(values) - cdef object v - if n == 0: - return False - for i in range(n): - v = values[i] - if is_null_timedelta64(v): - # we are a regular null - if util._checknull(v): - null_count += 1 - elif not is_timedelta(v): - return False - return null_count != n + cdef: + AnyTimedeltaValidator validator = AnyTimedeltaValidator( + len(values), + skipna=True, + ) + return validator.validate(values) -cpdef bint is_date_array(ndarray[object] values): - cdef Py_ssize_t i, n = len(values) - if n == 0: +cdef class DateValidator(Validator): + + cdef bint is_value_typed(self, object value) except -1: + return is_date(value) + + cdef bint is_array_typed(self) except -1: return False - for i in range(n): - if not is_date(values[i]): - return False - return True -cpdef bint is_time_array(ndarray[object] values): - cdef Py_ssize_t i, n = len(values) - if n == 0: +cpdef bint is_date_array(ndarray[object] values, bint skipna=False): + cdef DateValidator validator = DateValidator(len(values), skipna=skipna) + return validator.validate(values) + + +cdef class TimeValidator(Validator): + + cdef bint is_value_typed(self, object value) except -1: + return is_time(value) + + cdef bint is_array_typed(self) except -1: return False - for i in range(n): - if not is_time(values[i]): - return False - return True + + +cpdef bint is_time_array(ndarray[object] values, bint skipna=False): + cdef TimeValidator validator = TimeValidator(len(values), skipna=skipna) + return validator.validate(values) + + +cdef class PeriodValidator(TemporalValidator): + + cdef bint is_value_typed(self, object value) except -1: + return is_period(value) + + cdef bint is_valid_null(self, object value) except -1: + return is_null_period(value) cpdef bint is_period_array(ndarray[object] values): - cdef Py_ssize_t i, null_count = 0, n = len(values) - cdef object v - if n == 0: - return False + cdef PeriodValidator validator = PeriodValidator(len(values), skipna=True) + return validator.validate(values) - # return False for all nulls - for i in range(n): - v = values[i] - if is_null_period(v): - # we are a regular null - if util._checknull(v): - null_count += 1 - elif not is_period(v): - return False - return null_count != n + +cdef class IntervalValidator(Validator): + + cdef bint is_value_typed(self, object value) except -1: + return is_interval(value) cpdef bint is_interval_array(ndarray[object] values): cdef: - Py_ssize_t i, n = len(values), null_count = 0 - object v - - if n == 0: - return False - for i in range(n): - v = values[i] - if util._checknull(v): - null_count += 1 - continue - if not is_interval(v): - return False - return null_count != n + IntervalValidator validator = IntervalValidator( + len(values), + skipna=True, + ) + return validator.validate(values) cdef extern from "parse_helper.h": diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index ec5fe45d7f610..2ed31d4e2f320 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -240,6 +240,9 @@ def test_infer_dtype_bytes(self): arr = arr.astype(object) assert lib.infer_dtype(arr) == compare + # object array of bytes with missing values + assert lib.infer_dtype([b'a', np.nan, b'c'], skipna=True) == compare + def test_isinf_scalar(self): # GH 11352 assert lib.isposinf_scalar(float('inf')) @@ -445,6 +448,10 @@ def test_bools(self): result = lib.infer_dtype(arr) assert result == 'boolean' + arr = np.array([True, np.nan, False], dtype='O') + result = lib.infer_dtype(arr, 
skipna=True) + assert result == 'boolean' + def test_floats(self): arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O') result = lib.infer_dtype(arr) @@ -473,11 +480,26 @@ def test_decimals(self): result = lib.infer_dtype(arr) assert result == 'mixed' + arr = np.array([Decimal(1), Decimal('NaN'), Decimal(3)]) + result = lib.infer_dtype(arr) + assert result == 'decimal' + + arr = np.array([Decimal(1), np.nan, Decimal(3)], dtype='O') + result = lib.infer_dtype(arr) + assert result == 'decimal' + def test_string(self): pass def test_unicode(self): - pass + arr = [u'a', np.nan, u'c'] + result = lib.infer_dtype(arr) + assert result == 'mixed' + + arr = [u'a', np.nan, u'c'] + result = lib.infer_dtype(arr, skipna=True) + expected = 'unicode' if PY2 else 'string' + assert result == expected def test_datetime(self): @@ -715,10 +737,17 @@ def test_is_datetimelike_array_all_nan_nat_like(self): def test_date(self): - dates = [date(2012, 1, x) for x in range(1, 20)] + dates = [date(2012, 1, day) for day in range(1, 20)] index = Index(dates) assert index.inferred_type == 'date' + dates = [date(2012, 1, day) for day in range(1, 20)] + [np.nan] + result = lib.infer_dtype(dates) + assert result == 'mixed' + + result = lib.infer_dtype(dates, skipna=True) + assert result == 'date' + def test_to_object_array_tuples(self): r = (5, 6) values = [r]
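
For reference, a short usage sketch of the new keyword, put together from the docstring examples and test expectations in the diff above (it assumes a pandas build that includes this change; ``infer_dtype`` is reached through its public alias ``pandas.api.types.infer_dtype``):

```python
import numpy as np
from datetime import date
from pandas.api.types import infer_dtype  # public alias of lib.infer_dtype

# Default behaviour is unchanged: a NaN in an object sequence yields 'mixed'.
infer_dtype(['a', np.nan, 'b'])                                       # 'mixed'

# With skipna=True, NaN entries are ignored while inferring the type.
infer_dtype(['a', np.nan, 'b'], skipna=True)                          # 'string'
infer_dtype(np.array([True, np.nan, False], dtype='O'), skipna=True)  # 'boolean'
infer_dtype([date(2012, 1, 1), np.nan], skipna=True)                  # 'date'
```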
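The core of the refactor is the ``Validator`` hierarchy, which replaces the hand-rolled per-type ``is_*_array`` loops with a single dispatch: a dtype fast path, then an element-wise scan of object arrays that either requires every element to be typed or, with ``skipna``, also accepts nulls. The Cython classes themselves are not importable, so the following is only a rough pure-Python sketch of that control flow; ``is_missing`` is an assumed stand-in for ``util._checknull``:

```python
import numpy as np


def is_missing(value):
    # stand-in for util._checknull: None or a float NaN counts as a generic null
    return value is None or (isinstance(value, float) and value != value)


class Validator(object):
    """Check that every element of an object array has a single scalar type."""

    def __init__(self, n, dtype=np.dtype(np.object_), skipna=False):
        self.n = n
        self.dtype = dtype
        self.skipna = skipna

    def validate(self, values):
        if not self.n:
            return False
        if self.is_array_typed():            # dtype alone already proves the type
            return True
        if self.dtype.type is np.object_:    # element-wise scan of object arrays
            checker = self.is_valid_skipna if self.skipna else self.is_valid
            return all(checker(value) for value in values) and self.finalize()
        return False

    def is_valid(self, value):
        return self.is_value_typed(value)

    def is_valid_skipna(self, value):
        # a null is acceptable in addition to a correctly typed value
        return self.is_valid(value) or is_missing(value)

    def is_value_typed(self, value):
        raise NotImplementedError('is_value_typed must be implemented in subclasses')

    def is_array_typed(self):
        raise NotImplementedError('is_array_typed must be implemented in subclasses')

    def finalize(self):
        return True


class BoolValidator(Validator):
    def is_value_typed(self, value):
        return isinstance(value, (bool, np.bool_))

    def is_array_typed(self):
        return issubclass(self.dtype.type, np.bool_)


values = np.array([True, np.nan, False], dtype=object)
BoolValidator(len(values), values.dtype, skipna=False).validate(values)  # False
BoolValidator(len(values), values.dtype, skipna=True).validate(values)   # True
```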
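The temporal validators are always constructed with ``skipna=True`` but additionally count the generic (non-typed) nulls they skip, so that an array consisting only of such nulls is rejected in ``finalize``. Continuing the pure-Python sketch above (it reuses ``Validator`` and ``is_missing`` from the previous block; the ``pd.NaT`` check in ``is_valid_null`` is an assumed approximation of ``is_null_datetime64``):

```python
import numpy as np
import pandas as pd
from datetime import datetime


class TemporalValidator(Validator):
    """Skip nulls while validating, but reject arrays made only of generic nulls."""

    def __init__(self, n, dtype=np.dtype(np.object_), skipna=False):
        super(TemporalValidator, self).__init__(n, dtype, skipna=skipna)
        self.generic_null_count = 0

    def is_valid(self, value):
        return self.is_value_typed(value) or self.is_valid_null(value)

    def is_valid_skipna(self, value):
        is_typed_null = self.is_valid_null(value)
        is_generic_null = is_missing(value)
        if is_typed_null and is_generic_null:
            self.generic_null_count += 1
        return self.is_value_typed(value) or is_typed_null or is_generic_null

    def is_array_typed(self):
        return False

    def finalize(self):
        # if every element was a generic null, the array is not datetime-like
        return self.generic_null_count != self.n


class DatetimeValidator(TemporalValidator):
    def is_value_typed(self, value):
        return isinstance(value, datetime)

    def is_valid_null(self, value):
        # rough stand-in for is_null_datetime64: NaT or a generic null
        return value is pd.NaT or is_missing(value)


mixed = np.array([datetime(2017, 1, 1), np.nan], dtype=object)
DatetimeValidator(len(mixed), mixed.dtype, skipna=True).validate(mixed)  # True
nulls = np.array([np.nan, np.nan], dtype=object)
DatetimeValidator(len(nulls), nulls.dtype, skipna=True).validate(nulls)  # False
```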