From c6b9f19f43744654111c48aaaf0a73b26b772b4b Mon Sep 17 00:00:00 2001
From: jreback <jeff@reback.net>
Date: Sun, 10 Feb 2013 15:07:58 -0500
Subject: [PATCH 1/3] ENH: provide boolean indexing with dtype preservation if
 possible

---
 pandas/core/frame.py       |   7 +-
 pandas/core/internals.py   | 133 +++++++++++++++++++++++++++++--------
 pandas/tests/test_frame.py |  26 ++++++--
 3 files changed, 129 insertions(+), 37 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index ecf2f8ba482f6..b32bb28f512b1 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -3714,14 +3714,14 @@ def _combine_match_columns(self, other, func, fill_value=None):
         if fill_value is not None:
             raise NotImplementedError
 
-        new_data = left._data.where(func, right, axes = [left.columns, self.index])
+        new_data = left._data.eval(func, right, axes = [left.columns, self.index])
         return self._constructor(new_data)
 
     def _combine_const(self, other, func, raise_on_error = True):
         if self.empty:
             return self
 
-        new_data = self._data.where(func, other, raise_on_error=raise_on_error)
+        new_data = self._data.eval(func, other, raise_on_error=raise_on_error)
         return self._constructor(new_data)
 
     def _compare_frame(self, other, func):
@@ -5293,8 +5293,7 @@ def where(self, cond, other=NA, inplace=False, try_cast=False, raise_on_error=Tr
             self._data = self._data.putmask(cond,other,inplace=True)
 
         else:
-            func = lambda values, others, conds: np.where(conds, values, others)
-            new_data = self._data.where(func, other, cond, raise_on_error=raise_on_error, try_cast=try_cast)
+            new_data = self._data.where(other, cond, raise_on_error=raise_on_error, try_cast=try_cast)
 
             return self._constructor(new_data)
 
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index ee024ce68b5b4..7267ee04758ed 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -384,17 +384,16 @@ def shift(self, indexer, periods):
             new_values[:, periods:] = np.nan
         return make_block(new_values, self.items, self.ref_items)
 
-    def where(self, func, other, cond = None, raise_on_error = True, try_cast = False):
+    def eval(self, func, other, raise_on_error = True, try_cast = False):
         """ 
-        evaluate the block; return result block(s) from the result 
+        evaluate the block; return result block from the result 
 
         Parameters
         ----------
         func  : how to combine self, other
         other : a ndarray/object
-        cond  : the condition to respect, optional
-        raise_on_error : if True, raise when I can't perform the function,
-            False by default (and just return the data that we had coming in)
+        raise_on_error : if True (the default), raise when I can't perform the function; if False, just return
+             the data that we had coming in
 
         Returns
         -------
@@ -414,28 +413,7 @@ def where(self, func, other, cond = None, raise_on_error = True, try_cast = Fals
                 values = values.T
                 is_transposed = True
 
-        # see if we can align cond
-        if cond is not None:
-            if not hasattr(cond, 'shape'):
-                raise ValueError('where must have a condition that is ndarray'
-                                 ' like')
-            if hasattr(cond, 'reindex_axis'):
-                axis = getattr(cond, '_het_axis', 0)
-                cond = cond.reindex_axis(self.items, axis=axis,
-                                         copy=True).values
-            else:
-                cond = cond.values
-
-            # may need to undo transpose of values
-            if hasattr(values, 'ndim'):
-                if (values.ndim != cond.ndim or
-                        values.shape == cond.shape[::-1]):
-                    values = values.T
-                    is_transposed =  not is_transposed
-
         args = [ values, other ]
-        if cond is not None:
-            args.append(cond)
         try:
             result = func(*args)
         except:
@@ -458,7 +436,105 @@ def where(self, func, other, cond = None, raise_on_error = True, try_cast = Fals
         if try_cast:
             result = self._try_cast_result(result)
 
-        return [ make_block(result, self.items, self.ref_items) ]
+        return make_block(result, self.items, self.ref_items)
+
+    def where(self, other, cond, raise_on_error = True, try_cast = False):
+        """ 
+        evaluate the block; return result block(s) from the result 
+
+        Parameters
+        ----------
+        other : a ndarray/object
+        cond  : the condition to respect
+        raise_on_error : if True (the default), raise when np.where can't combine the values; if False, return
+             an all-NaN array of the same shape instead
+
+        Returns
+        -------
+        a new block, or a list of new blocks (one per item) when the block is processed item-by-item
+        """
+
+        values = self.values
+
+        # see if we can align other
+        if hasattr(other,'reindex_axis'):
+            axis = getattr(other,'_het_axis',0)
+            other = other.reindex_axis(self.items, axis=axis, copy=True).values
+
+        # make sure that we can broadcast
+        is_transposed = False
+        if hasattr(other, 'ndim') and hasattr(values, 'ndim'):
+            if values.ndim != other.ndim or values.shape == other.shape[::-1]:
+                values = values.T
+                is_transposed = True
+
+        # see if we can align cond
+        if not hasattr(cond,'shape'):
+            raise ValueError("where must have a condition that is ndarray like")
+        if hasattr(cond,'reindex_axis'):
+            axis = getattr(cond,'_het_axis',0)
+            cond = cond.reindex_axis(self.items, axis=axis, copy=True).values
+        else:
+            cond = cond.values
+
+        # may need to undo transpose of values
+        if hasattr(values, 'ndim'):
+            if values.ndim != cond.ndim or values.shape == cond.shape[::-1]:
+                values = values.T
+                is_transposed =  not is_transposed
+
+        # our where function
+        def func(c,v,o):
+            if c.flatten().all():
+                return v
+            
+            try:
+                return np.where(c,v,o)
+            except:
+                if raise_on_error:
+                    raise TypeError('Coulnd not operate %s with block values'
+                                    % repr(o))
+                else:
+                    # return the values
+                    result = np.empty(v.shape,dtype='O')
+                    result.fill(np.nan)
+                    return result
+
+        def create_block(result, items, transpose = True):
+            if not isinstance(result, np.ndarray):
+                raise TypeError('Could not compare %s with block values'
+                                % repr(other))
+
+            if transpose and is_transposed:
+                result = result.T
+
+            # try to cast if requested
+            if try_cast:
+                result = self._try_cast_result(result)
+
+            return make_block(result, items, self.ref_items)
+
+        # see if we can operate on the entire block, or need item-by-item
+        if cond.all().any():
+            result_blocks = []
+            for item in self.items:
+                loc  = self.items.get_loc(item)
+                item = self.items.take([loc])
+                v    = values.take([loc])
+                c    = cond.take([loc])
+                o    = other.take([loc]) if hasattr(other,'shape') else other
+
+                result = func(c,v,o)
+                if len(result) == 1:
+                    result = np.repeat(result,self.shape[1:])
+
+                result = result.reshape(((1,) + self.shape[1:]))
+                result_blocks.append(create_block(result, item, transpose = False))
+
+            return result_blocks
+        else:
+            result = func(cond,values,other)
+            return create_block(result, self.items)
 
 def _mask_missing(array, missing_values):
     if not isinstance(missing_values, (list, np.ndarray)):
@@ -840,6 +916,9 @@ def apply(self, f, *args, **kwargs):
     def where(self, *args, **kwargs):
         return self.apply('where', *args, **kwargs)
 
+    def eval(self, *args, **kwargs):
+        return self.apply('eval', *args, **kwargs)
+
     def putmask(self, *args, **kwargs):
         return self.apply('putmask', *args, **kwargs)
 
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index c628bf3f0df97..0782de4bcecd6 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -244,8 +244,6 @@ def test_getitem_boolean(self):
 
     def test_getitem_boolean_casting(self):
 
-        #### this currently disabled ###
-
         # don't upcast if we don't need to
         df = self.tsframe.copy()
         df['E'] = 1
@@ -254,8 +252,10 @@ def test_getitem_boolean_casting(self):
         df['F'] = df['F'].astype('int64')
         casted = df[df>0]
         result = casted.get_dtype_counts()
-        #expected = Series({'float64': 4, 'int32' : 1, 'int64' : 1})
-        expected = Series({'float64': 6 })
+        expected = Series({'float64': 4, 'int32' : 1, 'int64' : 1})
+
+        ### when we always cast here's the result ###
+        #expected = Series({'float64': 6 })
         assert_series_equal(result, expected)
 
 
@@ -5997,6 +5997,19 @@ def _check_get(df, cond, check_dtypes = True):
             cond = df > 0
             _check_get(df, cond)
 
+        
+        # upcasting case (GH # 2794)
+        df = DataFrame(dict([ (c,Series([1]*3,dtype=c)) for c in ['int64','int32','float32','float64'] ]))
+        df.ix[1,:] = 0
+
+        result = df.where(df>=0).get_dtype_counts()
+
+        #### when we don't preserve boolean casts ####
+        #expected = Series({ 'float32' : 1, 'float64' : 3 })
+
+        expected = Series({ 'float32' : 1, 'float64' : 1, 'int32' : 1, 'int64' : 1 })
+        assert_series_equal(result, expected)
+
         # aligning
         def _check_align(df, cond, other, check_dtypes = True):
             rs = df.where(cond, other)
@@ -6013,8 +6026,9 @@ def _check_align(df, cond, other, check_dtypes = True):
                     else:
                         o = other[k].values
 
-                assert_series_equal(v, Series(np.where(c, d, o),index=v.index))
-
+                new_values = d if c.all() else np.where(c, d, o)
+                assert_series_equal(v, Series(new_values,index=v.index))
+            
             # dtypes
             # can't check dtype when other is an ndarray
             if check_dtypes and not isinstance(other,np.ndarray):

From bddfffaa9ff51aca600ea93c027650be5fb553f2 Mon Sep 17 00:00:00 2001
From: jreback <jeff@reback.net>
Date: Wed, 13 Feb 2013 21:41:45 -0500
Subject: [PATCH 2/3] ENH: return dtype on invalid function in where is now
 float64

---
 pandas/core/internals.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 7267ee04758ed..bfed5dbcd9d32 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -496,7 +496,7 @@ def func(c,v,o):
                                     % repr(o))
                 else:
                     # return the values
-                    result = np.empty(v.shape,dtype='O')
+                    result = np.empty(v.shape,dtype='float64')
                     result.fill(np.nan)
                     return result
 

From 9fc888f359f17d2666d84c2365ca2b45f2e67355 Mon Sep 17 00:00:00 2001
From: jreback <jeff@reback.net>
Date: Wed, 13 Feb 2013 23:16:19 -0500
Subject: [PATCH 3/3] BUG: fixed bug in IntBlock splitting; fixed bug in
 internals.Block.putmask coercing unnecessarily

---
 pandas/core/internals.py   | 18 ++++++++++--------
 pandas/tests/test_frame.py | 19 ++++++++++++++-----
 2 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index bfed5dbcd9d32..bdcbca7086681 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -304,14 +304,15 @@ def putmask(self, mask, new, inplace=False):
         if self._can_hold_element(new):
             new = self._try_cast(new)
             np.putmask(new_values, mask, new)
-        # upcast me
-        else:
+
+        # maybe upcast me
+        elif mask.any():
             # type of the new block
             if ((isinstance(new, np.ndarray) and issubclass(new.dtype, np.number)) or
                     isinstance(new, float)):
-                typ = float
+                typ = np.float64
             else:
-                typ = object
+                typ = np.object_
 
             # we need to exiplicty astype here to make a copy
             new_values = new_values.astype(typ)
@@ -515,14 +516,15 @@ def create_block(result, items, transpose = True):
             return make_block(result, items, self.ref_items)
 
         # see if we can operate on the entire block, or need item-by-item
-        if cond.all().any():
+        if not self._can_hold_na:
+            axis = cond.ndim-1
             result_blocks = []
             for item in self.items:
                 loc  = self.items.get_loc(item)
                 item = self.items.take([loc])
-                v    = values.take([loc])
-                c    = cond.take([loc])
-                o    = other.take([loc]) if hasattr(other,'shape') else other
+                v    = values.take([loc],axis=axis)
+                c    = cond.take([loc],axis=axis)
+                o    = other.take([loc],axis=axis) if hasattr(other,'shape') else other
 
                 result = func(c,v,o)
                 if len(result) == 1:
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index 0782de4bcecd6..d249e0f240a82 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -248,16 +248,22 @@ def test_getitem_boolean_casting(self):
         df = self.tsframe.copy()
         df['E'] = 1
         df['E'] = df['E'].astype('int32')
+        df['E1'] = df['E'].copy()
         df['F'] = 1
         df['F'] = df['F'].astype('int64')
+        df['F1'] = df['F'].copy()
+
         casted = df[df>0]
         result = casted.get_dtype_counts()
-        expected = Series({'float64': 4, 'int32' : 1, 'int64' : 1})
-
-        ### when we always cast here's the result ###
-        #expected = Series({'float64': 6 })
+        expected = Series({'float64': 4, 'int32' : 2, 'int64' : 2})
         assert_series_equal(result, expected)
 
+        # int block splitting
+        df.ix[1:3,['E1','F1']] = 0
+        casted = df[df>0]
+        result = casted.get_dtype_counts()
+        expected = Series({'float64': 6, 'int32' : 1, 'int64' : 1})
+        assert_series_equal(result, expected)
 
     def test_getitem_boolean_list(self):
         df = DataFrame(np.arange(12).reshape(3, 4))
@@ -6031,6 +6037,7 @@ def _check_align(df, cond, other, check_dtypes = True):
             
             # dtypes
             # can't check dtype when other is an ndarray
+
             if check_dtypes and not isinstance(other,np.ndarray):
                 self.assert_((rs.dtypes == df.dtypes).all() == True)
 
@@ -6066,13 +6073,15 @@ def _check_set(df, cond, check_dtypes = True):
             dfi = df.copy()
             econd = cond.reindex_like(df).fillna(True)
             expected = dfi.mask(~econd)
+
+            # the in-place where with NaN must match mask(~econd)
             dfi.where(cond, np.nan, inplace=True)
             assert_frame_equal(dfi, expected)
 
             # dtypes (and confirm upcasts)x
             if check_dtypes:
                 for k, v in df.dtypes.iteritems():
-                    if issubclass(v.type,np.integer):
+                    if issubclass(v.type,np.integer) and not cond[k].all():
                         v = np.dtype('float64')
                     self.assert_(dfi[k].dtype == v)