pandas-dev · jreback · Apr 22, 2014 · Apr 6, 2014
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -276,6 +276,7 @@ Improvements to existing features
 - Add option to turn off escaping in ``DataFrame.to_latex`` (:issue:`6472`)
 - Added ``how`` option to rolling-moment functions to dictate how to handle resampling; :func:``rolling_max`` defaults to max,
   :func:``rolling_min`` defaults to min, and all others default to mean (:issue:`6297`)
+- ``pd.stats.moments.rolling_var`` now uses Welford's method for increased numerical stability (:issue:`6817`)
 
 .. _release.bug_fixes-0.14.0:
 

diff --git a/pandas/algos.pyx b/pandas/algos.pyx
@@ -1122,56 +1122,82 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1):
 # Rolling variance
 
 def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1):
-    cdef double val, prev, sum_x = 0, sum_xx = 0, nobs = 0
+    """
+    Numerically stable implementation using Welford's method.
+    """
+    cdef double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta
     cdef Py_ssize_t i
     cdef Py_ssize_t N = len(input)
 
     cdef ndarray[double_t] output = np.empty(N, dtype=float)
 
     minp = _check_minp(win, minp, N)
 
-    for i from 0 <= i < minp - 1:
+    for i from 0 <= i < win:
         val = input[i]
 
         # Not NaN
         if val == val:
             nobs += 1
-            sum_x += val
-            sum_xx += val * val
+            delta = (val - mean_x)
+            mean_x += delta / nobs
+            ssqdm_x += delta * (val - mean_x)
 
-        output[i] = NaN
+        if nobs >= minp:
+            #pathological case
+            if nobs == 1:
+                val = 0
+            else:
+                val = ssqdm_x / (nobs - ddof)
+                if val < 0:
+                    val = 0
+        else:
+            val = NaN
 
-    for i from minp - 1 <= i < N:
+        output[i] = val
+
+    for i from win <= i < N:
         val = input[i]
+        prev = input[i - win]
 
         if val == val:
-            nobs += 1
-            sum_x += val
-            sum_xx += val * val
-
-        if i > win - 1:
-            prev = input[i - win]
             if prev == prev:
-                sum_x -= prev
-                sum_xx -= prev * prev
-                nobs -= 1
+                delta = val - prev
+                prev -= mean_x
+                mean_x += delta / nobs
+                val -= mean_x
+                ssqdm_x += (val + prev) * delta
+            else:
+                nobs += 1
+                delta = (val - mean_x)
+                mean_x += delta / nobs
+                ssqdm_x += delta * (val - mean_x)
+        elif prev == prev:
+            nobs -= 1
+            if nobs:
+                delta = (prev - mean_x)
+                mean_x -= delta  / nobs
+                ssqdm_x -= delta * (prev - mean_x)
+            else:
+                mean_x = 0
+                ssqdm_x = 0
 
         if nobs >= minp:
-            # pathological case
+            #pathological case
             if nobs == 1:
-                output[i] = 0
-                continue
-
-            val = (nobs * sum_xx - sum_x * sum_x) / (nobs * (nobs - ddof))
-            if val < 0:
                 val = 0
-
-            output[i] = val
+            else:
+                val = ssqdm_x / (nobs - ddof)
+                if val < 0:
+                    val = 0
         else:
-            output[i] = NaN
+            val = NaN
+
+        output[i] = val
 
     return output
 
+
 #-------------------------------------------------------------------------------
 # Rolling skewness
 

diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py
@@ -751,7 +751,7 @@ def rolling_window(arg, window=None, win_type=None, min_periods=None,
     * ``gaussian`` (needs std)
     * ``general_gaussian`` (needs power, width)
     * ``slepian`` (needs width).
-    
+
     By default, the result is set to the right edge of the window. This can be
     changed to the center of the window by setting ``center=True``.
 
@@ -978,7 +978,7 @@ def expanding_apply(arg, func, min_periods=1, freq=None, center=False,
     Returns
     -------
     y : type of input argument
-    
+
     Notes
     -----
     The `freq` keyword is used to conform time series data to a specified

diff --git a/pandas/stats/tests/test_moments.py b/pandas/stats/tests/test_moments.py
@@ -295,7 +295,8 @@ def test_rolling_std_neg_sqrt(self):
 
     def test_rolling_var(self):
         self._check_moment_func(mom.rolling_var,
-                                lambda x: np.var(x, ddof=1))
+                                lambda x: np.var(x, ddof=1),
+                                test_stable=True)
         self._check_moment_func(functools.partial(mom.rolling_var, ddof=0),
                                 lambda x: np.var(x, ddof=0))
 
@@ -349,13 +350,15 @@ def _check_moment_func(self, func, static_comp, window=50,
                            has_center=True,
                            has_time_rule=True,
                            preserve_nan=True,
-                           fill_value=None):
+                           fill_value=None,
+                           test_stable=False):
 
         self._check_ndarray(func, static_comp, window=window,
                             has_min_periods=has_min_periods,
                             preserve_nan=preserve_nan,
                             has_center=has_center,
-                            fill_value=fill_value)
+                            fill_value=fill_value,
+                            test_stable=test_stable)
 
         self._check_structures(func, static_comp,
                                has_min_periods=has_min_periods,
@@ -367,7 +370,8 @@ def _check_ndarray(self, func, static_comp, window=50,
                        has_min_periods=True,
                        preserve_nan=True,
                        has_center=True,
-                       fill_value=None):
+                       fill_value=None,
+                       test_stable=False):
 
         result = func(self.arr, window)
         assert_almost_equal(result[-1],
@@ -425,6 +429,12 @@ def _check_ndarray(self, func, static_comp, window=50,
                 self.assert_(np.isnan(expected[-5]))
                 self.assert_(np.isnan(result[-14]))
 
+        if test_stable:
+            result = func(self.arr + 1e9, window)
+            assert_almost_equal(result[-1],
+                                static_comp(self.arr[-50:] + 1e9))
+
+
     def _check_structures(self, func, static_comp,
                           has_min_periods=True, has_time_rule=True,
                           has_center=True,