Skip to content

ENH: Use Welford's method in stats.moments.rolling_var #6817

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 22, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,7 @@ Improvements to existing features
- Add option to turn off escaping in ``DataFrame.to_latex`` (:issue:`6472`)
- Added ``how`` option to rolling-moment functions to dictate how to handle resampling; :func:``rolling_max`` defaults to max,
:func:``rolling_min`` defaults to min, and all others default to mean (:issue:`6297`)
- ``pd.stats.moments.rolling_var`` now uses Welford's method for increased numerical stability (:issue:`6817`)

.. _release.bug_fixes-0.14.0:

Expand Down
74 changes: 50 additions & 24 deletions pandas/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1122,56 +1122,82 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1):
# Rolling variance

def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1):
cdef double val, prev, sum_x = 0, sum_xx = 0, nobs = 0
"""
Numerically stable implementation using Welford's method.
"""
cdef double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta
cdef Py_ssize_t i
cdef Py_ssize_t N = len(input)

cdef ndarray[double_t] output = np.empty(N, dtype=float)

minp = _check_minp(win, minp, N)

for i from 0 <= i < minp - 1:
for i from 0 <= i < win:
val = input[i]

# Not NaN
if val == val:
nobs += 1
sum_x += val
sum_xx += val * val
delta = (val - mean_x)
mean_x += delta / nobs
ssqdm_x += delta * (val - mean_x)

output[i] = NaN
if nobs >= minp:
#pathological case
if nobs == 1:
val = 0
else:
val = ssqdm_x / (nobs - ddof)
if val < 0:
val = 0
else:
val = NaN

for i from minp - 1 <= i < N:
output[i] = val

for i from win <= i < N:
val = input[i]
prev = input[i - win]

if val == val:
nobs += 1
sum_x += val
sum_xx += val * val

if i > win - 1:
prev = input[i - win]
if prev == prev:
sum_x -= prev
sum_xx -= prev * prev
nobs -= 1
delta = val - prev
prev -= mean_x
mean_x += delta / nobs
val -= mean_x
ssqdm_x += (val + prev) * delta
else:
nobs += 1
delta = (val - mean_x)
mean_x += delta / nobs
ssqdm_x += delta * (val - mean_x)
elif prev == prev:
nobs -= 1
if nobs:
delta = (prev - mean_x)
mean_x -= delta / nobs
ssqdm_x -= delta * (prev - mean_x)
else:
mean_x = 0
ssqdm_x = 0

if nobs >= minp:
# pathological case
#pathological case
if nobs == 1:
output[i] = 0
continue

val = (nobs * sum_xx - sum_x * sum_x) / (nobs * (nobs - ddof))
if val < 0:
val = 0

output[i] = val
else:
val = ssqdm_x / (nobs - ddof)
if val < 0:
val = 0
else:
output[i] = NaN
val = NaN

output[i] = val

return output


#-------------------------------------------------------------------------------
# Rolling skewness

Expand Down
4 changes: 2 additions & 2 deletions pandas/stats/moments.py
Original file line number Diff line number Diff line change
Expand Up @@ -751,7 +751,7 @@ def rolling_window(arg, window=None, win_type=None, min_periods=None,
* ``gaussian`` (needs std)
* ``general_gaussian`` (needs power, width)
* ``slepian`` (needs width).

By default, the result is set to the right edge of the window. This can be
changed to the center of the window by setting ``center=True``.

Expand Down Expand Up @@ -978,7 +978,7 @@ def expanding_apply(arg, func, min_periods=1, freq=None, center=False,
Returns
-------
y : type of input argument

Notes
-----
The `freq` keyword is used to conform time series data to a specified
Expand Down
18 changes: 14 additions & 4 deletions pandas/stats/tests/test_moments.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,8 @@ def test_rolling_std_neg_sqrt(self):

def test_rolling_var(self):
self._check_moment_func(mom.rolling_var,
lambda x: np.var(x, ddof=1))
lambda x: np.var(x, ddof=1),
test_stable=True)
self._check_moment_func(functools.partial(mom.rolling_var, ddof=0),
lambda x: np.var(x, ddof=0))

Expand Down Expand Up @@ -349,13 +350,15 @@ def _check_moment_func(self, func, static_comp, window=50,
has_center=True,
has_time_rule=True,
preserve_nan=True,
fill_value=None):
fill_value=None,
test_stable=False):

self._check_ndarray(func, static_comp, window=window,
has_min_periods=has_min_periods,
preserve_nan=preserve_nan,
has_center=has_center,
fill_value=fill_value)
fill_value=fill_value,
test_stable=test_stable)

self._check_structures(func, static_comp,
has_min_periods=has_min_periods,
Expand All @@ -367,7 +370,8 @@ def _check_ndarray(self, func, static_comp, window=50,
has_min_periods=True,
preserve_nan=True,
has_center=True,
fill_value=None):
fill_value=None,
test_stable=False):

result = func(self.arr, window)
assert_almost_equal(result[-1],
Expand Down Expand Up @@ -425,6 +429,12 @@ def _check_ndarray(self, func, static_comp, window=50,
self.assert_(np.isnan(expected[-5]))
self.assert_(np.isnan(result[-14]))

if test_stable:
result = func(self.arr + 1e9, window)
assert_almost_equal(result[-1],
static_comp(self.arr[-50:] + 1e9))


def _check_structures(self, func, static_comp,
has_min_periods=True, has_time_rule=True,
has_center=True,
Expand Down