diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 62ee6ced84882..e7451f7ecdb1d 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -1331,144 +1331,105 @@ def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1): return output +#---------------------------------------------------------------------- +# Rolling skewness and kurtosis -#------------------------------------------------------------------------------- -# Rolling skewness @cython.boundscheck(False) @cython.wraparound(False) -def roll_skew(ndarray[double_t] input, int win, int minp): +def roll_higher_moment(ndarray[double_t] input, int win, int minp, bint kurt): + """ + Numerically stable implementation of skewness and kurtosis using a + Welford-like method. If `kurt` is True, rolling kurtosis is computed, + if False, rolling skewness. + """ cdef double val, prev - cdef double x = 0, xx = 0, xxx = 0 - cdef Py_ssize_t nobs = 0, i - cdef Py_ssize_t N = len(input) + cdef double mean_x = 0, s2dm_x = 0, s3dm_x = 0, s4dm_x = 0, rep = NaN + cdef double delta, delta_n, tmp + cdef Py_ssize_t i, nobs = 0, nrep = 0, N = len(input) cdef ndarray[double_t] output = np.empty(N, dtype=float) - # 3 components of the skewness equation - cdef double A, B, C, R - minp = _check_minp(win, minp, N) - with nogil: - for i from 0 <= i < minp - 1: - val = input[i] + minobs = max(minp, 4 if kurt else 3) - # Not NaN - if val == val: - nobs += 1 - x += val - xx += val * val - xxx += val * val * val - - output[i] = NaN - - for i from minp - 1 <= i < N: - val = input[i] - - if val == val: - nobs += 1 - x += val - xx += val * val - xxx += val * val * val - - if i > win - 1: - prev = input[i - win] - if prev == prev: - x -= prev - xx -= prev * prev - xxx -= prev * prev * prev - - nobs -= 1 - if nobs >= minp: - A = x / nobs - B = xx / nobs - A * A - C = xxx / nobs - A * A * A - 3 * A * B - if B <= 0 or nobs < 3: - output[i] = NaN + for i from 0 <= i < N: + val = input[i] + prev = NaN if i < win else input[i - win] + + if prev == prev: + # prev is not NaN, remove an observation... + nobs -= 1 + if nobs < nrep: + # ...all non-NaN values were identical, remove a repeat + nrep -= 1 + if nobs == nrep: + # We can get here both if all non-NaN were already identical + # or if nobs == 1 after removing the observation + if nrep == 0: + rep = NaN + mean_x = 0 else: - R = sqrt(B) - output[i] = ((sqrt(nobs * (nobs - 1.)) * C) / - ((nobs-2) * R * R * R)) + mean_x = rep + # This is redundant most of the time + s2dm_x = s3dm_x = s4dm_x = 0 else: - output[i] = NaN - - return output - -#------------------------------------------------------------------------------- -# Rolling kurtosis -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_kurt(ndarray[double_t] input, - int win, int minp): - cdef double val, prev - cdef double x = 0, xx = 0, xxx = 0, xxxx = 0 - cdef Py_ssize_t nobs = 0, i - cdef Py_ssize_t N = len(input) - - cdef ndarray[double_t] output = np.empty(N, dtype=float) - - # 5 components of the kurtosis equation - cdef double A, B, C, D, R, K - - minp = _check_minp(win, minp, N) - with nogil: - for i from 0 <= i < minp - 1: - val = input[i] - - # Not NaN - if val == val: - nobs += 1 - - # seriously don't ask me why this is faster - x += val - xx += val * val - xxx += val * val * val - xxxx += val * val * val * val - - output[i] = NaN - - for i from minp - 1 <= i < N: - val = input[i] - - if val == val: - nobs += 1 - x += val - xx += val * val - xxx += val * val * val - xxxx += val * val * val * val - - if i > win - 1: - prev = input[i - win] - if prev == prev: - x -= prev - xx -= prev * prev - xxx -= prev * prev * prev - xxxx -= prev * prev * prev * prev - - nobs -= 1 - - if nobs >= minp: - A = x / nobs - R = A * A - B = xx / nobs - R - R = R * A - C = xxx / nobs - R - 3 * A * B - R = R * A - D = xxxx / nobs - R - 6*B*A*A - 4*C*A - - if B == 0 or nobs < 4: - output[i] = NaN - - else: - K = (nobs * nobs - 1.)*D/(B*B) - 3*((nobs-1.)**2) - K = K / ((nobs - 2.)*(nobs-3.)) - - output[i] = K + # ...update mean and sums of raised differences from mean + delta = prev - mean_x + delta_n = delta / nobs + tmp = delta * delta_n * (nobs + 1) + if kurt: + s4dm_x -= ((tmp * ((nobs + 3) * nobs + 3) - + 6 * s2dm_x) * delta_n - 4 * s3dm_x) * delta_n + s3dm_x -= (tmp * (nobs + 2) - 3 * s2dm_x) * delta_n + s2dm_x -= tmp + mean_x -= delta_n + if val == val: + # val is not NaN, adding an observation... + nobs += 1 + if val == rep: + # ...and adding a repeat + nrep += 1 else: - output[i] = NaN + # ...and resetting repeats + nrep = 1 + rep = val + if nobs == nrep: + # ...all non-NaN values are identical + mean_x = rep + s2dm_x = s3dm_x = s4dm_x = 0 + else: + # ...update mean and sums of raised differences from mean + delta = val - mean_x + delta_n = delta / nobs + tmp = delta * delta_n * (nobs - 1) + if kurt: + s4dm_x += ((tmp * ((nobs - 3) * nobs + 3) + + 6 * s2dm_x) * delta_n - 4 * s3dm_x) * delta_n + s3dm_x += (tmp * (nobs - 2) - 3 * s2dm_x) * delta_n + s2dm_x += tmp + mean_x += delta_n + + # Sums of even powers must be positive + if s2dm_x < 0 or s4dm_x < 0: + s2dm_x = s3dm_x = s4_dm_x = 0 + + if nobs < minobs or s2dm_x == 0: + output[i] = NaN + elif kurt: + # multiplications are cheap, divisions are not + tmp = s2dm_x * s2dm_x + output[i] = (nobs - 1) * (nobs * (nobs + 1) * s4dm_x - + 3 * (nobs - 1) * tmp) + output[i] /= tmp * (nobs - 2) * (nobs - 3) + else: + # multiplications are cheap, divisions and square roots are not + tmp = (nobs - 2) * (nobs - 2) * s2dm_x * s2dm_x * s2dm_x + output[i] = s3dm_x * nobs * sqrt((nobs - 1) / tmp) return output + #------------------------------------------------------------------------------- # Rolling median, min, max diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py index 3cddae45e7516..f383660d1a709 100644 --- a/pandas/stats/moments.py +++ b/pandas/stats/moments.py @@ -355,7 +355,8 @@ def rolling_corr_pairwise(df1, df2=None, window=None, min_periods=None, def _rolling_moment(arg, window, func, minp, axis=0, freq=None, center=False, - how=None, args=(), kwargs={}, **kwds): + how=None, args=(), kwargs={}, center_data=False, + norm_data=False, **kwds): """ Rolling statistical measure using supplied function. Designed to be used with passed-in Cython array-based functions. @@ -378,6 +379,11 @@ def _rolling_moment(arg, window, func, minp, axis=0, freq=None, center=False, Passed on to func kwargs : dict Passed on to func + center_data : bool + If True, subtract the mean of the data from the values + norm_data: bool + If True, subtract the mean of the data from the values, and divide + by their standard deviation. Returns ------- @@ -385,8 +391,9 @@ def _rolling_moment(arg, window, func, minp, axis=0, freq=None, center=False, """ arg = _conv_timerule(arg, freq, how) - return_hook, values = _process_data_structure(arg) - + return_hook, values = _process_data_structure(arg, + center_data=center_data, + norm_data=norm_data) if values.size == 0: result = values.copy() else: @@ -423,7 +430,8 @@ def _center_window(rs, window, axis): return rs -def _process_data_structure(arg, kill_inf=True): +def _process_data_structure(arg, kill_inf=True, center_data=False, + norm_data=False): if isinstance(arg, DataFrame): return_hook = lambda v: type(arg)(v, index=arg.index, columns=arg.columns) @@ -438,9 +446,15 @@ def _process_data_structure(arg, kill_inf=True): if not issubclass(values.dtype.type, float): values = values.astype(float) - if kill_inf: + if kill_inf or center_data or norm_data: values = values.copy() - values[np.isinf(values)] = np.NaN + mask = np.isfinite(values) + if kill_inf: + values[~mask] = np.NaN + if center_data or norm_data: + values -= np.mean(values[mask]) + if norm_data: + values /= np.std(values[mask]) return return_hook, values @@ -629,7 +643,8 @@ def _use_window(minp, window): return minp -def _rolling_func(func, desc, check_minp=_use_window, how=None, additional_kw=''): +def _rolling_func(func, desc, check_minp=_use_window, how=None, + additional_kw='', center_data=False, norm_data=False): if how is None: how_arg_str = 'None' else: @@ -645,7 +660,8 @@ def call_cython(arg, window, minp, args=(), kwargs={}, **kwds): minp = check_minp(minp, window) return func(arg, window, minp, **kwds) return _rolling_moment(arg, window, call_cython, min_periods, freq=freq, - center=center, how=how, **kwargs) + center=center, how=how, center_data=center_data, + norm_data=norm_data, **kwargs) return f @@ -657,16 +673,24 @@ def call_cython(arg, window, minp, args=(), kwargs={}, **kwds): how='median') _ts_std = lambda *a, **kw: _zsqrt(algos.roll_var(*a, **kw)) +def _roll_skew(*args, **kwargs): + kwargs['kurt'] = False + return algos.roll_higher_moment(*args, **kwargs) +def _roll_kurt(*args, **kwargs): + kwargs['kurt'] = True + return algos.roll_higher_moment(*args, **kwargs) rolling_std = _rolling_func(_ts_std, 'Moving standard deviation.', check_minp=_require_min_periods(1), additional_kw=_ddof_kw) rolling_var = _rolling_func(algos.roll_var, 'Moving variance.', check_minp=_require_min_periods(1), additional_kw=_ddof_kw) -rolling_skew = _rolling_func(algos.roll_skew, 'Unbiased moving skewness.', - check_minp=_require_min_periods(3)) -rolling_kurt = _rolling_func(algos.roll_kurt, 'Unbiased moving kurtosis.', - check_minp=_require_min_periods(4)) +rolling_skew = _rolling_func(_roll_skew, 'Unbiased moving skewness.', + check_minp=_require_min_periods(3), + center_data=True, norm_data=False) +rolling_kurt = _rolling_func(_roll_kurt, 'Unbiased moving kurtosis.', + check_minp=_require_min_periods(4), + center_data=True, norm_data=True) def rolling_quantile(arg, window, quantile, min_periods=None, freq=None, @@ -903,9 +927,9 @@ def call_cython(arg, window, minp, args=(), kwargs={}, **kwds): expanding_var = _expanding_func(algos.roll_var, 'Expanding variance.', check_minp=_require_min_periods(1), additional_kw=_ddof_kw) -expanding_skew = _expanding_func(algos.roll_skew, 'Unbiased expanding skewness.', +expanding_skew = _expanding_func(_roll_skew, 'Unbiased expanding skewness.', check_minp=_require_min_periods(3)) -expanding_kurt = _expanding_func(algos.roll_kurt, 'Unbiased expanding kurtosis.', +expanding_kurt = _expanding_func(_roll_kurt, 'Unbiased expanding kurtosis.', check_minp=_require_min_periods(4)) diff --git a/pandas/stats/tests/test_moments.py b/pandas/stats/tests/test_moments.py index e2ed27156d2b5..4826cd7cfccb9 100644 --- a/pandas/stats/tests/test_moments.py +++ b/pandas/stats/tests/test_moments.py @@ -9,14 +9,19 @@ import numpy as np from distutils.version import LooseVersion -from pandas import Series, DataFrame, Panel, bdate_range, isnull, notnull, concat +from pandas import ( + Series, DataFrame, Panel, bdate_range, isnull, notnull, concat +) from pandas.util.testing import ( - assert_almost_equal, assert_series_equal, assert_frame_equal, assert_panel_equal, assert_index_equal + assert_almost_equal, assert_series_equal, assert_frame_equal, + assert_panel_equal, assert_index_equal ) import pandas.core.datetools as datetools import pandas.stats.moments as mom import pandas.util.testing as tm from pandas.compat import range, zip, PY3, StringIO +from pandas.stats.moments import (_roll_skew, _roll_kurt, _rolling_func, + _require_min_periods) N, K = 100, 10 @@ -425,6 +430,16 @@ def test_rolling_skew(self): raise nose.SkipTest('no scipy') self._check_moment_func(mom.rolling_skew, lambda x: skew(x, bias=False)) + # To test the algorithm stability we need the raw function, as + # rolling_skew centers the data + test_roll_skew = _rolling_func(_roll_skew, 'Test rolling_skew', + check_minp=_require_min_periods(3), + center_data=False, norm_data=False) + self._check_moment_func(test_roll_skew, + lambda x: skew(x, bias=False), + has_min_periods=True, has_center=False, + has_time_rule=False, test_stable=True) + def test_rolling_kurt(self): try: @@ -433,6 +448,15 @@ def test_rolling_kurt(self): raise nose.SkipTest('no scipy') self._check_moment_func(mom.rolling_kurt, lambda x: kurtosis(x, bias=False)) + # To test the algorithm stability we need the raw function, as + # rolling_kurt centers and normalizes the data + test_roll_kurt = _rolling_func(_roll_kurt, 'Test rolling_kurt', + check_minp=_require_min_periods(4), + center_data=False, norm_data=False) + self._check_moment_func(test_roll_kurt, + lambda x: kurtosis(x, bias=False), + has_min_periods=True, has_center=False, + has_time_rule=False, test_stable=True) def test_fperr_robustness(self): # TODO: remove this once python 2.5 out of picture @@ -542,8 +566,8 @@ def _check_ndarray(self, func, static_comp, window=50, if test_stable: result = func(self.arr + 1e9, window) - assert_almost_equal(result[-1], - static_comp(self.arr[-50:] + 1e9)) + assert_almost_equal(result[-1], static_comp(self.arr[-50:]), + check_less_precise=True) # Test window larger than array, #7297 if test_window: