From 0d49cc6c67e4f2cd72cf947a19a8e3bd545423b9 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Mon, 27 Nov 2017 11:56:19 +0100 Subject: [PATCH 1/3] BUG: fix initialization of Series with dict containing NaN as key closes #18480 closes #18515 --- doc/source/whatsnew/v0.22.0.txt | 1 + pandas/core/base.py | 5 +- pandas/core/series.py | 55 +++++++++++++++++----- pandas/tests/series/test_apply.py | 1 + pandas/tests/series/test_combine_concat.py | 3 +- pandas/tests/series/test_constructors.py | 17 ++++++- 6 files changed, 63 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 4a27bf54de695..ea58eb74b97b4 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -208,5 +208,6 @@ Other - Improved error message when attempting to use a Python keyword as an identifier in a numexpr query (:issue:`18221`) - Fixed a bug where creating a Series from an array that contains both tz-naive and tz-aware values will result in a Series whose dtype is tz-aware instead of object (:issue:`16406`) +- Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as key (:issue:`18480`) - Adding a ``Period`` object to a ``datetime`` or ``Timestamp`` object will now correctly raise a ``TypeError`` (:issue:`17983`) - diff --git a/pandas/core/base.py b/pandas/core/base.py index ae92b62ce1d11..72acd0052202b 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -874,9 +874,8 @@ def _map_values(self, mapper, na_action=None): # convert to an Series for efficiency. # we specify the keys here to handle the # possibility that they are tuples - from pandas import Series, Index - index = Index(mapper, tupleize_cols=False) - mapper = Series(mapper, index=index) + from pandas import Series + mapper = Series(mapper) if isinstance(mapper, ABCSeries): # Since values were input this means we came from either diff --git a/pandas/core/series.py b/pandas/core/series.py index bff7c21ad69b1..5d0e6907a6595 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -42,7 +42,6 @@ _default_index, _asarray_tuplesafe, _values_from_object, - _try_sort, _maybe_match_name, SettingWithCopyError, _maybe_box_datetimelike, @@ -198,18 +197,9 @@ def __init__(self, data=None, index=None, dtype=None, name=None, data = data.reindex(index, copy=copy) data = data._data elif isinstance(data, dict): - if index is None: - if isinstance(data, OrderedDict): - index = Index(data) - else: - index = Index(_try_sort(data)) - - try: - data = index._get_values_from_dict(data) - except TypeError: - data = ([data.get(i, np.nan) for i in index] - if data else np.nan) - + data, index = self._init_dict(data, index, dtype) + dtype = None + copy = False elif isinstance(data, SingleBlockManager): if index is None: index = data.index @@ -257,6 +247,45 @@ def __init__(self, data=None, index=None, dtype=None, name=None, self.name = name self._set_axis(0, index, fastpath=True) + def _init_dict(self, data, index=None, dtype=None): + """ + Derive the "_data" and "index" attributes of a new Series from a + dictionary input. + + Parameters + ---------- + data : dict or dict-like + Data used to populate the new Series + index : Index or index-like, default None + index for the new Series: if None, use dict keys + dtype : dtype, default None + dtype for the new Series: if None, infer from data + + Returns + ------- + _data : BlockManager for the new Series + index : index for the new Series + """ + # Looking for NaN in dict doesn't work ({np.nan : 1}[float('nan')] + # raises KeyError), so we iterate the entire dict, and align + if data: + keys, values = zip(*compat.iteritems(data)) + else: + keys, values = [], [] + + # Input is now list-like, so rely on "standard" construction: + s = Series(values, index=keys, dtype=dtype) + + # Now we just make sure the order is respected, if any + if index is not None: + s = s.reindex(index, copy=False) + elif not isinstance(data, OrderedDict): + try: + s = s.sort_index() + except TypeError: + pass + return s._data, s.index + @classmethod def from_array(cls, arr, index=None, name=None, dtype=None, copy=False, fastpath=False): diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index fe21ba569ae99..cafe6a34720be 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -422,6 +422,7 @@ def test_map_dict_with_tuple_keys(self): converted to a multi-index, preventing tuple values from being mapped properly. """ + # GH 18496 df = pd.DataFrame({'a': [(1, ), (2, ), (3, 4), (5, 6)]}) label_mappings = {(1, ): 'A', (2, ): 'B', (3, 4): 'A', (5, 6): 'B'} diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index 71ac00975af03..6cf60e818c845 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -181,7 +181,8 @@ def test_concat_empty_series_dtypes(self): # categorical assert pd.concat([Series(dtype='category'), Series(dtype='category')]).dtype == 'category' - assert pd.concat([Series(dtype='category'), + # GH 18515 + assert pd.concat([Series(np.array([]), dtype='category'), Series(dtype='float64')]).dtype == 'float64' assert pd.concat([Series(dtype='category'), Series(dtype='object')]).dtype == 'object' diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index ccc04da3299fe..5462464da0e0b 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -625,6 +625,21 @@ def test_constructor_dict(self): expected.iloc[1] = 1 assert_series_equal(result, expected) + @pytest.mark.parametrize("value", [2, np.nan, None, float('nan')]) + def test_constructor_dict_nan_key(self, value): + # GH 18480 + d = {1: 'a', value: 'b', float('nan'): 'c', 4: 'd'} + result = Series(d).sort_values() + expected = Series(['a', 'b', 'c', 'd'], index=[1, value, np.nan, 4]) + assert_series_equal(result, expected) + + # MultiIndex: + d = {(1, 1): 'a', (2, np.nan): 'b', (3, value): 'c'} + result = Series(d).sort_values() + expected = Series(['a', 'b', 'c'], + index=Index([(1, 1), (2, np.nan), (3, value)])) + assert_series_equal(result, expected) + def test_constructor_dict_datetime64_index(self): # GH 9456 @@ -658,8 +673,6 @@ def test_constructor_tuple_of_tuples(self): s = Series(data) assert tuple(s) == data - @pytest.mark.xfail(reason='GH 18480 (Series initialization from dict with ' - 'NaN keys') def test_constructor_dict_of_tuples(self): data = {(1, 2): 3, (None, 5): 6} From dc4c76e162d1e0500ba27da4b61ef04b8d622f6d Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Wed, 29 Nov 2017 16:29:00 +0100 Subject: [PATCH 2/3] CLN: remove now unused _get_values_from_dict --- pandas/core/indexes/base.py | 21 --------------------- pandas/core/indexes/datetimelike.py | 8 -------- pandas/core/indexes/datetimes.py | 11 ----------- 3 files changed, 40 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 10f9022e2666b..2bf3afe47d007 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2822,27 +2822,6 @@ def get_indexer_for(self, target, **kwargs): indexer, _ = self.get_indexer_non_unique(target, **kwargs) return indexer - _index_shared_docs['_get_values_from_dict'] = """ - Return the values of the input dictionary in the order the keys are - in the index. np.nan is returned for index values not in the - dictionary. - - Parameters - ---------- - data : dict - The dictionary from which to extract the values - - Returns - ------- - np.array - - """ - - @Appender(_index_shared_docs['_get_values_from_dict']) - def _get_values_from_dict(self, data): - return lib.fast_multiget(data, self.values, - default=np.nan) - def _maybe_promote(self, other): # A hack, but it works from pandas.core.indexes.datetimes import DatetimeIndex diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 5643d886a4fec..f2bf528d7cc6b 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -698,14 +698,6 @@ def __rsub__(self, other): def _add_delta(self, other): return NotImplemented - @Appender(_index_shared_docs['_get_values_from_dict']) - def _get_values_from_dict(self, data): - if len(data): - return np.array([data.get(i, np.nan) - for i in self.asobject.values]) - - return np.array([np.nan]) - def _add_delta_td(self, other): # add a delta of a timedeltalike # return the i8 result view diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 196c881f97526..ee6263a9f0aad 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1457,17 +1457,6 @@ def get_value_maybe_box(self, series, key): key, tz=self.tz) return _maybe_box(self, values, series, key) - @Appender(_index_shared_docs['_get_values_from_dict']) - def _get_values_from_dict(self, data): - if len(data): - # coerce back to datetime objects for lookup - data = com._dict_compat(data) - return lib.fast_multiget(data, - self.asobject.values, - default=np.nan) - - return np.array([np.nan]) - def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label From 1582c42391ffba923bd58d04da841c8b91d0c078 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Thu, 30 Nov 2017 17:14:59 +0100 Subject: [PATCH 3/3] TST: test and whatsnew for #18515 --- doc/source/whatsnew/v0.22.0.txt | 1 + pandas/tests/series/test_constructors.py | 38 ++++++++++++++++++++---- 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index ea58eb74b97b4..411583404a32d 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -68,6 +68,7 @@ Other API Changes - :func:`Series.astype` and :func:`Index.astype` with an incompatible dtype will now raise a ``TypeError`` rather than a ``ValueError`` (:issue:`18231`) - ``Series`` construction with an ``object`` dtyped tz-aware datetime and ``dtype=object`` specified, will now return an ``object`` dtyped ``Series``, previously this would infer the datetime dtype (:issue:`18231`) +- A :class:`Series` of ``dtype=category`` constructed from an empty ``dict`` will now have categories of ``dtype=object`` rather than ``dtype=float64``, consistently with the case in which an empty list is passed (:issue:`18515`) - ``NaT`` division with :class:`datetime.timedelta` will now return ``NaN`` instead of raising (:issue:`17876`) - All-NaN levels in a ``MultiIndex`` are now assigned ``float`` rather than ``object`` dtype, promoting consistency with ``Index`` (:issue:`17929`). - :class:`Timestamp` will no longer silently ignore unused or invalid ``tz`` or ``tzinfo`` keyword arguments (:issue:`17690`) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 5462464da0e0b..a57385a9cf690 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -4,6 +4,7 @@ import pytest from datetime import datetime, timedelta +from collections import OrderedDict from numpy import nan import numpy as np @@ -79,17 +80,42 @@ def test_constructor(self): m = MultiIndex.from_arrays([[1, 2], [3, 4]]) pytest.raises(NotImplementedError, Series, m) - def test_constructor_empty(self): + @pytest.mark.parametrize('input_class', [list, dict, OrderedDict]) + def test_constructor_empty(self, input_class): empty = Series() - empty2 = Series([]) + empty2 = Series(input_class()) - # the are Index() and RangeIndex() which don't compare type equal + # these are Index() and RangeIndex() which don't compare type equal # but are just .equals assert_series_equal(empty, empty2, check_index_type=False) - empty = Series(index=lrange(10)) - empty2 = Series(np.nan, index=lrange(10)) - assert_series_equal(empty, empty2) + # With explicit dtype: + empty = Series(dtype='float64') + empty2 = Series(input_class(), dtype='float64') + assert_series_equal(empty, empty2, check_index_type=False) + + # GH 18515 : with dtype=category: + empty = Series(dtype='category') + empty2 = Series(input_class(), dtype='category') + assert_series_equal(empty, empty2, check_index_type=False) + + if input_class is not list: + # With index: + empty = Series(index=lrange(10)) + empty2 = Series(input_class(), index=lrange(10)) + assert_series_equal(empty, empty2) + + # With index and dtype float64: + empty = Series(np.nan, index=lrange(10)) + empty2 = Series(input_class(), index=lrange(10), dtype='float64') + assert_series_equal(empty, empty2) + + @pytest.mark.parametrize('input_arg', [np.nan, float('nan')]) + def test_constructor_nan(self, input_arg): + empty = Series(dtype='float64', index=lrange(10)) + empty2 = Series(input_arg, index=lrange(10)) + + assert_series_equal(empty, empty2, check_index_type=False) def test_constructor_series(self): index1 = ['d', 'b', 'a', 'c']