diff --git a/pandas/algos.pyx b/pandas/algos.pyx index cac9c5ccc7a6d..829059c56b705 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -1104,8 +1104,11 @@ def roll_skew(ndarray[double_t] input, int win, int minp): R = sqrt(B) - output[i] = ((sqrt(nobs * (nobs - 1.)) * C) / - ((nobs-2) * R * R * R)) + if R != 0 and nobs > 2: + output[i] = ((sqrt(nobs * (nobs - 1.)) * C) / + ((nobs-2) * R * R * R)) + else: + output[i] = NaN else: output[i] = NaN @@ -1173,10 +1176,12 @@ def roll_kurt(ndarray[double_t] input, R = R * A D = xxxx / nobs - R - 6*B*A*A - 4*C*A - K = (nobs * nobs - 1.)*D/(B*B) - 3*((nobs-1.)**2) - K = K / ((nobs - 2.)*(nobs-3.)) - - output[i] = K + if B != 0 and nobs > 3: + K = (nobs * nobs - 1.)*D/(B*B) - 3*((nobs-1.)**2) + K = K / ((nobs - 2.)*(nobs-3.)) + output[i] = K + else: + output[i] = NaN else: output[i] = NaN diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py index e53916f113e1b..55582919309f5 100644 --- a/pandas/stats/moments.py +++ b/pandas/stats/moments.py @@ -42,6 +42,8 @@ freq : None or string alias / date offset object, default=None Frequency to conform to before computing statistic time_rule is a legacy alias for freq +pad_val: Value to use for parts of the window which fall outside the array + bounds. (applies only for center=True, currently) Returns ------- @@ -251,7 +253,7 @@ def rolling_corr_pairwise(df, window, min_periods=None): def _rolling_moment(arg, window, func, minp, axis=0, freq=None, - center=False, time_rule=None, **kwargs): + center=False, time_rule=None, pad_val=np.NaN, **kwargs): """ Rolling statistical measure using supplied function. Designed to be used with passed-in Cython array-based functions. @@ -275,19 +277,105 @@ def _rolling_moment(arg, window, func, minp, axis=0, freq=None, """ arg = _conv_timerule(arg, freq, time_rule) calc = lambda x: func(x, window, minp=minp, **kwargs) + # strips the pandas object into a callback and an ndarray + # the callback takes back an ndarray and reconstitutes the + # pandas object with the new data return_hook, values = _process_data_structure(arg) # actually calculate the moment. Faster way to do this? - if values.ndim > 1: - result = np.apply_along_axis(calc, axis, values) - else: - result = calc(values) + def calc(x,axis): + _calc = lambda x: func(x, window, minp=minp, **kwargs) + if x.ndim > 1: + return np.apply_along_axis(_calc, axis, x) + else: + return _calc(x) + + def fwd(x): + """ + reshapes a 1d/2d ndarray into a 2d array so the processing can + be done on a fixed case. + """ + v = x.view() + if v.ndim > 1: + v = v if axis == 1 else v.T + else: + v = v.reshape((1,len(v))) + return v - rs = return_hook(result) + def bkwd(x): + if arg.ndim > 1: + x = x if axis == 1 else x.T + else: + x = x[0] + return x + + def pad(values,nahead,pos="head",window=window,axis=axis,pad_val=pad_val): + v = values + + tip = np.empty((v.shape[0],2*nahead+1+nahead+1),dtype=v.dtype) + if pos == "head": + tip[:,:nahead+1] = pad_val + tip[:,nahead+1:] = v[:,:(2*nahead+1)] + elif pos == "tail": + tip[:,-(nahead+1):] = pad_val + tip[:,:-(nahead+1)] = v[:,-(2*nahead+1):] + else: + raise NotImplementedError() + + return tip + +# window = 5 /4 +# print pad(mkdf(10,5).values,window=5) +# print pad(mkdf(10,5).values,window=5,axis=1) +# print pad(mkdf(10,5).irow(0).values,window=5) + result = calc(values,axis) if center: - rs = _center_window(rs, window, axis) - return rs - - + # GH2953, fixup edges + if window > 2: + result = _center_window(result, window, axis) + result = fwd(result) + values = fwd(values) + # with the data always in a consistent alignment + # we can always apply the func along the same axis=1 + # and eliminate special cases + + # there's an ambiguity in what constitutes + # the "center" when window is even + # we Just close ranks with numpy, so a window of len 4 [0..3] + # 2 is the "center" slot + + if window % 2 == 0 : + nahead = (window-1)//2 or 1 + else: + nahead = (window)//2 + tip = pad(values,nahead,'head') + head = calc(tip,axis=1)[:,-(nahead+1):][:,:nahead+1] + + tip = pad(values,nahead,'tail') + tail = calc(tip,axis=1)[:,-(nahead+1):][:,:nahead] + + result[:,-(nahead):] = tail + result[:,:nahead+1] = head + + if minp > 0: + ld = minp - nahead-1 + rd = ld-1 if window % 2 == 0 else ld + if ld > 0: + result[:,:ld] = NaN + if rd >0: + result[:,-(rd):] = NaN + + result =bkwd(result) + + # rebuild the correct pandas object using the new data + return return_hook(result) +# TODO: test window=2 +# from pandas.stats import moments as mom +# print list(mom.rolling_mean(Series(np.ones(10)),3,min_periods=1,pad_val=0,center=True).values) +# print list(mom.rolling_mean(Series(np.ones(10)),4,min_periods=1,pad_val=0,center=True).values) +# print list(mom.rolling_mean(Series(np.ones(10)),5,min_periods=1,pad_val=0,center=True).values) +# mom.rolling_mean(DataFrame(np.ones((3,10))),3,axis=0,min_periods=1,pad_val=0,center=True) +# mom.rolling_mean(DataFrame(np.ones((3,10))),3,axis=1,min_periods=1,pad_val=0,center=True) +# mom.rolling_mean(Series(range(25)),3,axis=0,min_periods=1,pad_val=0,center=True) def _center_window(rs, window, axis): if axis > rs.ndim-1: raise ValueError("Requested axis is larger then no. of argument dimensions") @@ -497,14 +585,13 @@ def _rolling_func(func, desc, check_minp=_use_window): @Substitution(desc, _unary_arg, _type_of_input) @Appender(_doc_template) @wraps(func) + def f(arg, window, min_periods=None, freq=None, center=False, - time_rule=None, **kwargs): - def call_cython(arg, window, minp, **kwds): - minp = check_minp(minp, window) - return func(arg, window, minp, **kwds) - return _rolling_moment(arg, window, call_cython, min_periods, + time_rule=None, pad_val=np.NaN, **kwargs): + min_periods = check_minp(min_periods, window) + return _rolling_moment(arg, window, func, min_periods, freq=freq, center=center, - time_rule=time_rule, **kwargs) + time_rule=time_rule,pad_val=pad_val, **kwargs) return f diff --git a/pandas/stats/tests/test_moments.py b/pandas/stats/tests/test_moments.py index 88dfcaf5ce7ae..fde1427644a56 100644 --- a/pandas/stats/tests/test_moments.py +++ b/pandas/stats/tests/test_moments.py @@ -396,23 +396,30 @@ def _check_ndarray(self, func, static_comp, window=50, assert_almost_equal(result[-1], static_comp(arr[10:-10])) if has_center: + win = 20 if has_min_periods: - result = func(arr, 20, min_periods=15, center=True) - expected = func(arr, 20, min_periods=15) + result = func(arr, win, min_periods=15, center=True) + expected = func(arr, win, min_periods=15) else: - result = func(arr, 20, center=True) - expected = func(arr, 20) + result = func(arr, win, center=True) + expected = func(arr, win) assert_almost_equal(result[1], expected[10]) - if fill_value is None: - self.assert_(np.isnan(result[-9:]).all()) - else: - self.assert_((result[-9:] == 0).all()) + # fill_Value only specified by rolling_Count + # which assumes the old 0 append at end + # behavior, no longer true + + # if fill_value is None: + # self.assert_(np.isnan(result[-9:]).all()) + # else: + # self.assert_((result[-9:] == 0).all()) + + if has_min_periods: self.assert_(np.isnan(expected[23])) self.assert_(np.isnan(result[14])) - self.assert_(np.isnan(expected[-5])) self.assert_(np.isnan(result[-14])) + self.assert_(np.isnan(expected[-5])) def _check_structures(self, func, static_comp, has_min_periods=True, has_time_rule=True, @@ -450,29 +457,87 @@ def _check_structures(self, func, static_comp, assert_almost_equal(frame_result.xs(last_date), trunc_frame.apply(static_comp)) - if has_center: - if has_min_periods: - minp = 10 - series_xp = func(self.series, 25, min_periods=minp).shift(-12) - frame_xp = func(self.frame, 25, min_periods=minp).shift(-12) - - series_rs = func(self.series, 25, min_periods=minp, - center=True) - frame_rs = func(self.frame, 25, min_periods=minp, - center=True) - - else: - series_xp = func(self.series, 25).shift(-12) - frame_xp = func(self.frame, 25).shift(-12) - - series_rs = func(self.series, 25, center=True) - frame_rs = func(self.frame, 25, center=True) - if fill_value is not None: - series_xp = series_xp.fillna(fill_value) - frame_xp = frame_xp.fillna(fill_value) - assert_series_equal(series_xp, series_rs) - assert_frame_equal(frame_xp, frame_rs) + def test_centered_rolling_moment(self): + + def participating(i,win): + return [x for x in range(i-win//2,i+(win+1)//2)] + + # validate + self.assertEqual(participating(0,3),[-1,0,1]) + self.assertEqual(participating(1,3),[0,1,2]) + self.assertEqual(participating(0,4),[-2,-1,0,1]) + self.assertEqual(participating(1,4),[-1,0,1,2]) + + def get_v(s,f,i,win,minp,pad_val=np.nan): + _is = np.array(participating(i,win)) + in_range_mask = np.array([ x>=0 and x< len(s) for x in _is ]) + def f_(i,data): + # print( data) + vals = np.array( list(data) ) + if minp: + return f(vals,win,min_periods=minp)[-1] + else: + return f(vals,win)[-1] + + if all(in_range_mask): # middle + return f_(i,s.take(_is)) + + elif minp and sum(in_range_mask) < minp: + return np.NaN + return "minp_nan" + else: + lpad = np.sum([_is<0]) + rpad = np.sum([_is>= len(s)]) + + _in_is = np.ma.array(_is, mask=~in_range_mask).compressed() + vals = np.array([pad_val] * lpad + list(s.take(_in_is)) + [pad_val]* rpad) + return f_(i,vals) + return "edge" + + def build_series(data,func,win,minp,pad_val=np.nan): + return Series(( [get_v(data,func,i,win,minp,pad_val=pad_val) for i in range(len(data))] )) + + N,K = 20,5 + for win in range(3,N-1): + for minp in range(1,win+1): + func = mom.rolling_mean + + # self.series = Series(np.ones(N)) + self.series = Series(randn(N)) + series_xp =build_series(self.series,func,win,minp,pad_val=0) + + # frame_xp = DataFrame(np.ones((N,K)), copy=True) + self.frame= DataFrame(randn(N,K), copy=True) + f = lambda i: func(self.frame.icol(i), win, min_periods=minp, pad_val=0, center=True) + data =[f(i) for i in range(len(self.frame.columns))] + frame_xp = DataFrame(np.array(data).T) + series_rs = func(self.series, win, min_periods=minp, pad_val=0, center=True) + frame_rs = func(self.frame, win, min_periods=minp, pad_val=0,center=True) + + assert_series_equal(series_xp, series_rs) + assert_frame_equal(frame_xp, frame_rs) + + N,K = 20,5 + minp=None + for win in range(3,N-1): + func = mom.rolling_count + # self.series = Series(np.ones(N)) + self.series = Series(randn(N)) + series_xp =build_series(self.series,func,win,minp) + + # frame_xp = DataFrame(np.ones((N,K)), copy=True) + self.frame= DataFrame(randn(N,K), copy=True) + + f = lambda i: func(self.frame.icol(i), win, center=True) + data =[f(i) for i in range(len(self.frame.columns))] + frame_xp = DataFrame(np.array(data).T) + + series_rs = func(self.series, win,center=True) + frame_rs = func(self.frame, win,center=True) + + assert_series_equal(series_xp, series_rs) + assert_frame_equal(frame_xp, frame_rs) def test_legacy_time_rule_arg(self): from StringIO import StringIO @@ -768,6 +833,21 @@ def _check_expanding(self, func, static_comp, has_min_periods=True, preserve_nan=preserve_nan) self._check_expanding_structures(func) + def test_rolling_mean_edges(self): + # GH2803 + # actually, covers edge handling more generally + def movingaverage(interval, window_size): + window = np.ones(int(window_size))/float(window_size) + return np.convolve(interval, window, 'same') + + nitems = 25 + for win in range(1,nitems,1): + ser = Series(range(nitems)) + df = DataFrame(index=range(len(ser))) + df['rm'] = mom.rolling_mean(ser, win, center=True, min_periods=1,pad_val=0) + df['ma'] = movingaverage(ser, win) + tm.assert_almost_equal(df['rm'] , df['ma']) + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],