From f7387127ba2820f7678df974159b627d8fbef0c2 Mon Sep 17 00:00:00 2001 From: Dan Birken Date: Fri, 20 Sep 2013 14:50:06 -0700 Subject: [PATCH] BUG: Constrain date parsing from strings a little bit more #4601 Currently dateutil will parse almost any string into a datetime. This change adds a filter in front of dateutil that will prevent it from parsing certain strings that don't look like datetimes: 1) Strings that parse to float values that are less than 1000 2) Certain special one-character strings (this check already existed; this change just moves that code) Additionally, this filters out datetimes that are out of range for the datetime64[ns] type. Currently, any out-of-range datetime silently overflows and is mapped to an arbitrary time within the bounds of datetime64[ns]. --- doc/source/release.rst | 1 + pandas/tests/test_tslib.py | 123 +++++++++++++++++++++++++++++++++++++ pandas/tslib.pyx | 55 ++++++++++++----- 3 files changed, 165 insertions(+), 14 deletions(-) create mode 100644 pandas/tests/test_tslib.py diff --git a/doc/source/release.rst b/doc/source/release.rst index ffb792ca98da5..b71285758d53b 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -433,6 +433,7 @@ Bug Fixes - Fix an issue in TextFileReader w/ Python engine (i.e. 
PythonParser) with thousands != "," (:issue:`4596`) - Bug in getitem with a duplicate index when using where (:issue:`4879`) + - Fix type inference code coercing float columns into datetime (:issue:`4601`) pandas 0.12.0 diff --git a/pandas/tests/test_tslib.py b/pandas/tests/test_tslib.py new file mode 100644 index 0000000000000..b9a7356412a10 --- /dev/null +++ b/pandas/tests/test_tslib.py @@ -0,0 +1,123 @@ +import unittest + +import numpy as np + +from pandas import tslib +from datetime import datetime + +class TestDatetimeParsingWrappers(unittest.TestCase): + def test_verify_datetime_bounds(self): + for year in (1, 1000, 1677, 2262, 5000): + dt = datetime(year, 1, 1) + self.assertRaises( + ValueError, + tslib.verify_datetime_bounds, + dt + ) + + for year in (1678, 2000, 2261): + tslib.verify_datetime_bounds(datetime(year, 1, 1)) + + def test_does_not_convert_mixed_integer(self): + bad_date_strings = ( + '-50000', + '999', + '123.1234', + 'm', + 'T' + ) + + for bad_date_string in bad_date_strings: + self.assertFalse( + tslib._does_string_look_like_datetime(bad_date_string) + ) + + good_date_strings = ( + '2012-01-01', + '01/01/2012', + 'Mon Sep 16, 2013', + '01012012', + '0101', + '1-1', + ) + + for good_date_string in good_date_strings: + self.assertTrue( + tslib._does_string_look_like_datetime(good_date_string) + ) + +class TestArrayToDatetime(unittest.TestCase): + def test_parsing_valid_dates(self): + arr = np.array(['01-01-2013', '01-02-2013'], dtype=object) + self.assert_( + np.array_equal( + tslib.array_to_datetime(arr), + np.array( + [ + '2013-01-01T00:00:00.000000000-0000', + '2013-01-02T00:00:00.000000000-0000' + ], + dtype='M8[ns]' + ) + ) + ) + + arr = np.array(['Mon Sep 16 2013', 'Tue Sep 17 2013'], dtype=object) + self.assert_( + np.array_equal( + tslib.array_to_datetime(arr), + np.array( + [ + '2013-09-16T00:00:00.000000000-0000', + '2013-09-17T00:00:00.000000000-0000' + ], + dtype='M8[ns]' + ) + ) + ) + + def 
test_number_looking_strings_not_into_datetime(self): + # #4601 + # These strings don't look like datetimes, so no conversion + # should be attempted on them + arr = np.array(['-352.737091', '183.575577'], dtype=object) + self.assert_(np.array_equal(tslib.array_to_datetime(arr), arr)) + + arr = np.array(['1', '2', '3', '4', '5'], dtype=object) + self.assert_(np.array_equal(tslib.array_to_datetime(arr), arr)) + + def test_dates_outside_of_datetime64_ns_bounds(self): + # These datetimes fall outside the bounds of the + # datetime64[ns] type, so they cannot be converted to + # datetimes + arr = np.array(['1/1/1676', '1/2/1676'], dtype=object) + self.assert_(np.array_equal(tslib.array_to_datetime(arr), arr)) + + arr = np.array(['1/1/2263', '1/2/2263'], dtype=object) + self.assert_(np.array_equal(tslib.array_to_datetime(arr), arr)) + + def test_coerce_of_invalid_datetimes(self): + arr = np.array(['01-01-2013', 'not_a_date', '1'], dtype=object) + + # Without coercing, the presence of any invalid dates prevents + # any values from being converted + self.assert_(np.array_equal(tslib.array_to_datetime(arr), arr)) + + # With coercing, the invalid dates become iNaT + self.assert_( + np.array_equal( + tslib.array_to_datetime(arr, coerce=True), + np.array( + [ + '2013-01-01T00:00:00.000000000-0000', + tslib.iNaT, + tslib.iNaT + ], + dtype='M8[ns]' + ) + ) + ) + +if __name__ == '__main__': + import nose # imported here: only needed when run as a script + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index fd97512b0528b..075102dd63100 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -317,7 +317,6 @@ class Timestamp(_Timestamp): _nat_strings = set(['NaT','nat','NAT','nan','NaN','NAN']) -_not_datelike_strings = set(['a','A','m','M','p','P','t','T']) class NaTType(_NaT): """(N)ot-(A)-(T)ime, the time equivalent of NaN""" @@ -841,6 +840,43 @@ def datetime_to_datetime64(ndarray[object] values): return result, inferred_tz +_not_datelike_strings = 
set(['a','A','m','M','p','P','t','T']) + +def verify_datetime_bounds(dt): + """Raise ValueError unless dt.year is within 1678-2261 (datetime64[ns]).""" + if dt.year <= 1677 or dt.year >= 2262: + raise ValueError( + 'Given datetime not within valid datetime64[ns] bounds' + ) + return dt + +def _does_string_look_like_datetime(date_string): + if date_string.startswith('0'): + # A leading zero (e.g. '0101') is more typical of a date-like + # string than of a number, so skip the numeric filter below + return True + + try: + if float(date_string) < 1000: + return False + except ValueError: + pass # not numeric; fall through to the remaining checks + + if date_string in _not_datelike_strings: + return False + + return True + +def parse_datetime_string(date_string, verify_bounds=True, **kwargs): + if not _does_string_look_like_datetime(date_string): + raise ValueError('Given date string not likely a datetime.') + + dt = parse_date(date_string, **kwargs) + + if verify_bounds: + verify_datetime_bounds(dt) # raises ValueError if out of range + + return dt def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False, format=None, utc=None, coerce=False, unit=None): @@ -908,24 +944,15 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False, &dts) _check_dts_bounds(iresult[i], &dts) except ValueError: - - # for some reason, dateutil parses some single letter len-1 strings into today's date - if len(val) == 1 and val in _not_datelike_strings: - if coerce: - iresult[i] = iNaT - continue - elif raise_: - raise try: - result[i] = parse_date(val, dayfirst=dayfirst) + result[i] = parse_datetime_string( + val, dayfirst=dayfirst + ) except Exception: if coerce: iresult[i] = iNaT continue raise TypeError - pandas_datetime_to_datetimestruct(iresult[i], PANDAS_FR_ns, - &dts) - _check_dts_bounds(iresult[i], &dts) except: if coerce: iresult[i] = iNaT @@ -946,7 +973,7 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False, oresult[i] = 'NaT' continue try: - oresult[i] = parse_date(val, dayfirst=dayfirst) + oresult[i] = parse_datetime_string(val, dayfirst=dayfirst) 
except Exception: if raise_: raise