Skip to content

ENH: implement Block splitting to avoid upcasts where possible (GH #2794) #2871

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 15, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3714,14 +3714,14 @@ def _combine_match_columns(self, other, func, fill_value=None):
if fill_value is not None:
raise NotImplementedError

new_data = left._data.where(func, right, axes = [left.columns, self.index])
new_data = left._data.eval(func, right, axes = [left.columns, self.index])
return self._constructor(new_data)

def _combine_const(self, other, func, raise_on_error = True):
# Combine this frame with ``other`` by handing ``func`` to the internal
# block data (``self._data``) and wrapping the result with
# ``self._constructor``. An empty frame is returned as is.
if self.empty:
return self

# NOTE(review): this listing is a rendered diff, so the ``where`` call
# looks like the removed line and the ``eval`` call its replacement.
# Only the last assignment to ``new_data`` is used -- confirm against
# the real file.
new_data = self._data.where(func, other, raise_on_error=raise_on_error)
new_data = self._data.eval(func, other, raise_on_error=raise_on_error)
return self._constructor(new_data)

def _compare_frame(self, other, func):
Expand Down Expand Up @@ -5293,8 +5293,7 @@ def where(self, cond, other=NA, inplace=False, try_cast=False, raise_on_error=Tr
self._data = self._data.putmask(cond,other,inplace=True)

else:
func = lambda values, others, conds: np.where(conds, values, others)
new_data = self._data.where(func, other, cond, raise_on_error=raise_on_error, try_cast=try_cast)
new_data = self._data.where(other, cond, raise_on_error=raise_on_error, try_cast=try_cast)

return self._constructor(new_data)

Expand Down
143 changes: 112 additions & 31 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,14 +304,15 @@ def putmask(self, mask, new, inplace=False):
if self._can_hold_element(new):
new = self._try_cast(new)
np.putmask(new_values, mask, new)
# upcast me
else:

# maybe upcast me
elif mask.any():
# type of the new block
if ((isinstance(new, np.ndarray) and issubclass(new.dtype, np.number)) or
isinstance(new, float)):
typ = float
typ = np.float64
else:
typ = object
typ = np.object_

# we need to explicitly astype here to make a copy
new_values = new_values.astype(typ)
Expand Down Expand Up @@ -384,17 +385,16 @@ def shift(self, indexer, periods):
new_values[:, periods:] = np.nan
return make_block(new_values, self.items, self.ref_items)

def where(self, func, other, cond = None, raise_on_error = True, try_cast = False):
def eval(self, func, other, raise_on_error = True, try_cast = False):
"""
evaluate the block; return result block(s) from the result
evaluate the block; return result block from the result

Parameters
----------
func : how to combine self, other
other : a ndarray/object
cond : the condition to respect, optional
raise_on_error : if True, raise when I can't perform the function,
False by default (and just return the data that we had coming in)
raise_on_error : if True, raise when I can't perform the function, False by default (and just return
the data that we had coming in)

Returns
-------
Expand All @@ -414,28 +414,7 @@ def where(self, func, other, cond = None, raise_on_error = True, try_cast = Fals
values = values.T
is_transposed = True

# see if we can align cond
if cond is not None:
if not hasattr(cond, 'shape'):
raise ValueError('where must have a condition that is ndarray'
' like')
if hasattr(cond, 'reindex_axis'):
axis = getattr(cond, '_het_axis', 0)
cond = cond.reindex_axis(self.items, axis=axis,
copy=True).values
else:
cond = cond.values

# may need to undo transpose of values
if hasattr(values, 'ndim'):
if (values.ndim != cond.ndim or
values.shape == cond.shape[::-1]):
values = values.T
is_transposed = not is_transposed

args = [ values, other ]
if cond is not None:
args.append(cond)
try:
result = func(*args)
except:
Expand All @@ -458,7 +437,106 @@ def where(self, func, other, cond = None, raise_on_error = True, try_cast = Fals
if try_cast:
result = self._try_cast_result(result)

return [ make_block(result, self.items, self.ref_items) ]
return make_block(result, self.items, self.ref_items)

def where(self, other, cond, raise_on_error = True, try_cast = False):
    """
    keep the block values where cond is True, otherwise take other;
    return result block(s) from the result

    A block that cannot hold NA is processed one item at a time: an item
    whose condition is True everywhere is returned with its values
    unchanged, while the other items are combined with other on their own.

    Parameters
    ----------
    other : a ndarray/object
    cond : the condition to respect; must be ndarray like (have a shape)
    raise_on_error : if True (the default), raise a TypeError when the
        values cannot be combined with other; if False, NaN-filled
        float64 data of the same shape is used instead
    try_cast : if True, pass each result through self._try_cast_result

    Returns
    -------
    a single new block, or a list of one-item blocks when this block
    cannot hold NA

    Raises
    ------
    ValueError : if cond has no shape attribute
    TypeError : if the combination fails and raise_on_error is True, or
        if the combined result is not an ndarray
    """

    values = self.values

    # see if we can align other
    if hasattr(other, 'reindex_axis'):
        axis = getattr(other, '_het_axis', 0)
        other = other.reindex_axis(self.items, axis=axis, copy=True).values

    # make sure that we can broadcast
    is_transposed = False
    if hasattr(other, 'ndim') and hasattr(values, 'ndim'):
        if values.ndim != other.ndim or values.shape == other.shape[::-1]:
            values = values.T
            is_transposed = True

    # see if we can align cond
    if not hasattr(cond, 'shape'):
        raise ValueError("where must have a condition that is ndarray like")
    if hasattr(cond, 'reindex_axis'):
        axis = getattr(cond, '_het_axis', 0)
        cond = cond.reindex_axis(self.items, axis=axis, copy=True).values
    else:
        # a plain ndarray has a shape but no ``.values``; use it directly
        cond = getattr(cond, 'values', cond)

    # may need to undo transpose of values
    if hasattr(values, 'ndim'):
        if values.ndim != cond.ndim or values.shape == cond.shape[::-1]:
            values = values.T
            is_transposed = not is_transposed

    # our where function
    def func(c, v, o):
        # nothing is replaced, so hand the values back untouched
        if c.all():
            return v

        try:
            return np.where(c, v, o)
        except Exception:
            # ``Exception`` rather than a bare except, so that
            # KeyboardInterrupt/SystemExit are never swallowed here
            if raise_on_error:
                raise TypeError('Could not operate %s with block values'
                                % repr(o))

            # could not combine and asked not to raise: all-NaN result
            result = np.empty(v.shape, dtype='float64')
            result.fill(np.nan)
            return result

    def create_block(result, items, transpose = True):
        if not isinstance(result, np.ndarray):
            raise TypeError('Could not compare %s with block values'
                            % repr(other))

        if transpose and is_transposed:
            result = result.T

        # try to cast if requested
        if try_cast:
            result = self._try_cast_result(result)

        return make_block(result, items, self.ref_items)

    # operate on the entire block when it can hold NA
    if self._can_hold_na:
        result = func(cond, values, other)
        return create_block(result, self.items)

    # otherwise go item-by-item so that untouched items are not changed
    axis = cond.ndim - 1
    other_has_shape = hasattr(other, 'shape')
    result_blocks = []

    # iterate by position: a label lookup via get_loc does not return an
    # integer when the items contain duplicates
    for loc, _ in enumerate(self.items):
        item = self.items.take([loc])
        v = values.take([loc], axis=axis)
        c = cond.take([loc], axis=axis)
        o = other.take([loc], axis=axis) if other_has_shape else other

        result = func(c, v, o)
        if len(result) == 1:
            result = np.repeat(result, self.shape[1:])

        # results are already laid out as (1, ...) here, so no transpose
        result = result.reshape(((1,) + self.shape[1:]))
        result_blocks.append(create_block(result, item, transpose = False))

    return result_blocks

def _mask_missing(array, missing_values):
if not isinstance(missing_values, (list, np.ndarray)):
Expand Down Expand Up @@ -840,6 +918,9 @@ def apply(self, f, *args, **kwargs):
def where(self, *args, **kwargs):
# Forward all arguments to ``self.apply`` under the method name 'where'.
return self.apply('where', *args, **kwargs)

def eval(self, *args, **kwargs):
# Forward all arguments to ``self.apply`` under the method name 'eval'.
return self.apply('eval', *args, **kwargs)

def putmask(self, *args, **kwargs):
# Forward all arguments to ``self.apply`` under the method name 'putmask'.
return self.apply('putmask', *args, **kwargs)

Expand Down
37 changes: 30 additions & 7 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,20 +244,26 @@ def test_getitem_boolean(self):

def test_getitem_boolean_casting(self):
# Checks the dtype counts of ``df[df>0]`` on a copy of ``self.tsframe``
# extended with int32 (E, E1) and int64 (F, F1) columns.

#### this currently disabled ###

# don't upcast if we don't need to
df = self.tsframe.copy()
df['E'] = 1
df['E'] = df['E'].astype('int32')
df['E1'] = df['E'].copy()
df['F'] = 1
df['F'] = df['F'].astype('int64')
df['F1'] = df['F'].copy()

casted = df[df>0]
result = casted.get_dtype_counts()
#expected = Series({'float64': 4, 'int32' : 1, 'int64' : 1})
# NOTE(review): this listing is a rendered diff, so the first ``expected``
# below looks like the removed line; only the second assignment is
# compared -- confirm against the real file.
expected = Series({'float64': 6 })
expected = Series({'float64': 4, 'int32' : 2, 'int64' : 2})
assert_series_equal(result, expected)

# int block splitting
# E1 and F1 now contain zeros, so ``df>0`` is no longer True everywhere
# for them; the expected counts have one int32 and one int64 column fewer
# and two more float64 columns.
df.ix[1:3,['E1','F1']] = 0
casted = df[df>0]
result = casted.get_dtype_counts()
expected = Series({'float64': 6, 'int32' : 1, 'int64' : 1})
assert_series_equal(result, expected)

def test_getitem_boolean_list(self):
df = DataFrame(np.arange(12).reshape(3, 4))
Expand Down Expand Up @@ -5997,6 +6003,19 @@ def _check_get(df, cond, check_dtypes = True):
cond = df > 0
_check_get(df, cond)


# upcasting case (GH # 2794)
df = DataFrame(dict([ (c,Series([1]*3,dtype=c)) for c in ['int64','int32','float32','float64'] ]))
df.ix[1,:] = 0

result = df.where(df>=0).get_dtype_counts()

#### when we don't preserve boolean casts ####
#expected = Series({ 'float32' : 1, 'float64' : 3 })

expected = Series({ 'float32' : 1, 'float64' : 1, 'int32' : 1, 'int64' : 1 })
assert_series_equal(result, expected)

# aligning
def _check_align(df, cond, other, check_dtypes = True):
rs = df.where(cond, other)
Expand All @@ -6013,10 +6032,12 @@ def _check_align(df, cond, other, check_dtypes = True):
else:
o = other[k].values

assert_series_equal(v, Series(np.where(c, d, o),index=v.index))

new_values = d if c.all() else np.where(c, d, o)
assert_series_equal(v, Series(new_values,index=v.index))

# dtypes
# can't check dtype when other is an ndarray

if check_dtypes and not isinstance(other,np.ndarray):
self.assert_((rs.dtypes == df.dtypes).all() == True)

Expand Down Expand Up @@ -6052,13 +6073,15 @@ def _check_set(df, cond, check_dtypes = True):
dfi = df.copy()
econd = cond.reindex_like(df).fillna(True)
expected = dfi.mask(~econd)

#import pdb; pdb.set_trace()
dfi.where(cond, np.nan, inplace=True)
assert_frame_equal(dfi, expected)

# dtypes (and confirm upcasts)
if check_dtypes:
for k, v in df.dtypes.iteritems():
if issubclass(v.type,np.integer):
if issubclass(v.type,np.integer) and not cond[k].all():
v = np.dtype('float64')
self.assert_(dfi[k].dtype == v)

Expand Down