From c6b9f19f43744654111c48aaaf0a73b26b772b4b Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 10 Feb 2013 15:07:58 -0500 Subject: [PATCH 1/3] ENH: provide boolean indexing with dtype preservation if possible --- pandas/core/frame.py | 7 +- pandas/core/internals.py | 133 +++++++++++++++++++++++++++++-------- pandas/tests/test_frame.py | 26 ++++++-- 3 files changed, 129 insertions(+), 37 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ecf2f8ba482f6..b32bb28f512b1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3714,14 +3714,14 @@ def _combine_match_columns(self, other, func, fill_value=None): if fill_value is not None: raise NotImplementedError - new_data = left._data.where(func, right, axes = [left.columns, self.index]) + new_data = left._data.eval(func, right, axes = [left.columns, self.index]) return self._constructor(new_data) def _combine_const(self, other, func, raise_on_error = True): if self.empty: return self - new_data = self._data.where(func, other, raise_on_error=raise_on_error) + new_data = self._data.eval(func, other, raise_on_error=raise_on_error) return self._constructor(new_data) def _compare_frame(self, other, func): @@ -5293,8 +5293,7 @@ def where(self, cond, other=NA, inplace=False, try_cast=False, raise_on_error=Tr self._data = self._data.putmask(cond,other,inplace=True) else: - func = lambda values, others, conds: np.where(conds, values, others) - new_data = self._data.where(func, other, cond, raise_on_error=raise_on_error, try_cast=try_cast) + new_data = self._data.where(other, cond, raise_on_error=raise_on_error, try_cast=try_cast) return self._constructor(new_data) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index ee024ce68b5b4..7267ee04758ed 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -384,17 +384,16 @@ def shift(self, indexer, periods): new_values[:, periods:] = np.nan return make_block(new_values, self.items, self.ref_items) - def where(self, func, other, cond = None, raise_on_error = True, try_cast = False): + def eval(self, func, other, raise_on_error = True, try_cast = False): """ - evaluate the block; return result block(s) from the result + evaluate the block; return result block from the result Parameters ---------- func : how to combine self, other other : a ndarray/object - cond : the condition to respect, optional - raise_on_error : if True, raise when I can't perform the function, - False by default (and just return the data that we had coming in) + raise_on_error : if True, raise when I can't perform the function, False by default (and just return + the data that we had coming in) Returns ------- @@ -414,28 +413,7 @@ def where(self, func, other, cond = None, raise_on_error = True, try_cast = Fals values = values.T is_transposed = True - # see if we can align cond - if cond is not None: - if not hasattr(cond, 'shape'): - raise ValueError('where must have a condition that is ndarray' - ' like') - if hasattr(cond, 'reindex_axis'): - axis = getattr(cond, '_het_axis', 0) - cond = cond.reindex_axis(self.items, axis=axis, - copy=True).values - else: - cond = cond.values - - # may need to undo transpose of values - if hasattr(values, 'ndim'): - if (values.ndim != cond.ndim or - values.shape == cond.shape[::-1]): - values = values.T - is_transposed = not is_transposed - args = [ values, other ] - if cond is not None: - args.append(cond) try: result = func(*args) except: @@ -458,7 +436,105 @@ def where(self, func, other, cond = None, raise_on_error = True, try_cast = Fals if try_cast: result = self._try_cast_result(result) - return [ make_block(result, self.items, self.ref_items) ] + return make_block(result, self.items, self.ref_items) + + def where(self, other, cond, raise_on_error = True, try_cast = False): + """ + evaluate the block; return result block(s) from the result + + Parameters + ---------- + other : a ndarray/object + cond : the condition to respect + raise_on_error : if True, raise when I can't perform the function, False by default (and just return + the data that we had coming in) + + Returns + ------- + a new block(s), the result of the func + """ + + values = self.values + + # see if we can align other + if hasattr(other,'reindex_axis'): + axis = getattr(other,'_het_axis',0) + other = other.reindex_axis(self.items, axis=axis, copy=True).values + + # make sure that we can broadcast + is_transposed = False + if hasattr(other, 'ndim') and hasattr(values, 'ndim'): + if values.ndim != other.ndim or values.shape == other.shape[::-1]: + values = values.T + is_transposed = True + + # see if we can align cond + if not hasattr(cond,'shape'): + raise ValueError("where must have a condition that is ndarray like") + if hasattr(cond,'reindex_axis'): + axis = getattr(cond,'_het_axis',0) + cond = cond.reindex_axis(self.items, axis=axis, copy=True).values + else: + cond = cond.values + + # may need to undo transpose of values + if hasattr(values, 'ndim'): + if values.ndim != cond.ndim or values.shape == cond.shape[::-1]: + values = values.T + is_transposed = not is_transposed + + # our where function + def func(c,v,o): + if c.flatten().all(): + return v + + try: + return np.where(c,v,o) + except: + if raise_on_error: + raise TypeError('Coulnd not operate %s with block values' + % repr(o)) + else: + # return the values + result = np.empty(v.shape,dtype='O') + result.fill(np.nan) + return result + + def create_block(result, items, transpose = True): + if not isinstance(result, np.ndarray): + raise TypeError('Could not compare %s with block values' + % repr(other)) + + if transpose and is_transposed: + result = result.T + + # try to cast if requested + if try_cast: + result = self._try_cast_result(result) + + return make_block(result, items, self.ref_items) + + # see if we can operate on the entire block, or need item-by-item + if cond.all().any(): + result_blocks = [] + for item in self.items: + loc = self.items.get_loc(item) + item = self.items.take([loc]) + v = values.take([loc]) + c = cond.take([loc]) + o = other.take([loc]) if hasattr(other,'shape') else other + + result = func(c,v,o) + if len(result) == 1: + result = np.repeat(result,self.shape[1:]) + + result = result.reshape(((1,) + self.shape[1:])) + result_blocks.append(create_block(result, item, transpose = False)) + + return result_blocks + else: + result = func(cond,values,other) + return create_block(result, self.items) def _mask_missing(array, missing_values): if not isinstance(missing_values, (list, np.ndarray)): @@ -840,6 +916,9 @@ def apply(self, f, *args, **kwargs): def where(self, *args, **kwargs): return self.apply('where', *args, **kwargs) + def eval(self, *args, **kwargs): + return self.apply('eval', *args, **kwargs) + def putmask(self, *args, **kwargs): return self.apply('putmask', *args, **kwargs) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index c628bf3f0df97..0782de4bcecd6 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -244,8 +244,6 @@ def test_getitem_boolean(self): def test_getitem_boolean_casting(self): - #### this currently disabled ### - # don't upcast if we don't need to df = self.tsframe.copy() df['E'] = 1 @@ -254,8 +252,10 @@ def test_getitem_boolean_casting(self): df['F'] = df['F'].astype('int64') casted = df[df>0] result = casted.get_dtype_counts() - #expected = Series({'float64': 4, 'int32' : 1, 'int64' : 1}) - expected = Series({'float64': 6 }) + expected = Series({'float64': 4, 'int32' : 1, 'int64' : 1}) + + ### when we always cast here's the result ### + #expected = Series({'float64': 6 }) assert_series_equal(result, expected) @@ -5997,6 +5997,19 @@ def _check_get(df, cond, check_dtypes = True): cond = df > 0 _check_get(df, cond) + + # upcasting case (GH # 2794) + df = DataFrame(dict([ (c,Series([1]*3,dtype=c)) for c in ['int64','int32','float32','float64'] ])) + df.ix[1,:] = 0 + + result = df.where(df>=0).get_dtype_counts() + + #### when we don't preserver boolean casts #### + #expected = Series({ 'float32' : 1, 'float64' : 3 }) + + expected = Series({ 'float32' : 1, 'float64' : 1, 'int32' : 1, 'int64' : 1 }) + assert_series_equal(result, expected) + # aligning def _check_align(df, cond, other, check_dtypes = True): rs = df.where(cond, other) @@ -6013,8 +6026,9 @@ def _check_align(df, cond, other, check_dtypes = True): else: o = other[k].values - assert_series_equal(v, Series(np.where(c, d, o),index=v.index)) - + new_values = d if c.all() else np.where(c, d, o) + assert_series_equal(v, Series(new_values,index=v.index)) + # dtypes # can't check dtype when other is an ndarray if check_dtypes and not isinstance(other,np.ndarray): From bddfffaa9ff51aca600ea93c027650be5fb553f2 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 13 Feb 2013 21:41:45 -0500 Subject: [PATCH 2/3] ENH: return dtype on invalid function in where is now float64 --- pandas/core/internals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 7267ee04758ed..bfed5dbcd9d32 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -496,7 +496,7 @@ def func(c,v,o): % repr(o)) else: # return the values - result = np.empty(v.shape,dtype='O') + result = np.empty(v.shape,dtype='float64') result.fill(np.nan) return result From 9fc888f359f17d2666d84c2365ca2b45f2e67355 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 13 Feb 2013 23:16:19 -0500 Subject: [PATCH 3/3] BUG: fixed bug in IntBlock splitting bug in internals.Block.putmask ; coercing unecessarily --- pandas/core/internals.py | 18 ++++++++++-------- pandas/tests/test_frame.py | 19 ++++++++++++++----- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index bfed5dbcd9d32..bdcbca7086681 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -304,14 +304,15 @@ def putmask(self, mask, new, inplace=False): if self._can_hold_element(new): new = self._try_cast(new) np.putmask(new_values, mask, new) - # upcast me - else: + + # maybe upcast me + elif mask.any(): # type of the new block if ((isinstance(new, np.ndarray) and issubclass(new.dtype, np.number)) or isinstance(new, float)): - typ = float + typ = np.float64 else: - typ = object + typ = np.object_ # we need to exiplicty astype here to make a copy new_values = new_values.astype(typ) @@ -515,14 +516,15 @@ def create_block(result, items, transpose = True): return make_block(result, items, self.ref_items) # see if we can operate on the entire block, or need item-by-item - if cond.all().any(): + if not self._can_hold_na: + axis = cond.ndim-1 result_blocks = [] for item in self.items: loc = self.items.get_loc(item) item = self.items.take([loc]) - v = values.take([loc]) - c = cond.take([loc]) - o = other.take([loc]) if hasattr(other,'shape') else other + v = values.take([loc],axis=axis) + c = cond.take([loc],axis=axis) + o = other.take([loc],axis=axis) if hasattr(other,'shape') else other result = func(c,v,o) if len(result) == 1: diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 0782de4bcecd6..d249e0f240a82 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -248,16 +248,22 @@ def test_getitem_boolean_casting(self): df = self.tsframe.copy() df['E'] = 1 df['E'] = df['E'].astype('int32') + df['E1'] = df['E'].copy() df['F'] = 1 df['F'] = df['F'].astype('int64') + df['F1'] = df['F'].copy() + casted = df[df>0] result = casted.get_dtype_counts() - expected = Series({'float64': 4, 'int32' : 1, 'int64' : 1}) - - ### when we always cast here's the result ### - #expected = Series({'float64': 6 }) + expected = Series({'float64': 4, 'int32' : 2, 'int64' : 2}) assert_series_equal(result, expected) + # int block splitting + df.ix[1:3,['E1','F1']] = 0 + casted = df[df>0] + result = casted.get_dtype_counts() + expected = Series({'float64': 6, 'int32' : 1, 'int64' : 1}) + assert_series_equal(result, expected) def test_getitem_boolean_list(self): df = DataFrame(np.arange(12).reshape(3, 4)) @@ -6031,6 +6037,7 @@ def _check_align(df, cond, other, check_dtypes = True): # dtypes # can't check dtype when other is an ndarray + if check_dtypes and not isinstance(other,np.ndarray): self.assert_((rs.dtypes == df.dtypes).all() == True) @@ -6066,13 +6073,15 @@ def _check_set(df, cond, check_dtypes = True): dfi = df.copy() econd = cond.reindex_like(df).fillna(True) expected = dfi.mask(~econd) + + #import pdb; pdb.set_trace() dfi.where(cond, np.nan, inplace=True) assert_frame_equal(dfi, expected) # dtypes (and confirm upcasts)x if check_dtypes: for k, v in df.dtypes.iteritems(): - if issubclass(v.type,np.integer): + if issubclass(v.type,np.integer) and not cond[k].all(): v = np.dtype('float64') self.assert_(dfi[k].dtype == v)