In [1]:
## Backward stable algorithms for variance computations!
# The following algorithm is a tempting one to implement: it computes
# the variance of a list of numbers in a single pass.
function badvar1(x::Vector{Float64})
    ex2 = 0.0   # running sum of squares
    ex = 0.0    # running sum
    n = length(x)
    for i=1:n
      ex2 = ex2 + x[i]^2
      ex = ex + x[i]
    end
    @show ex2, ex^2   # these agree in many leading digits once the mean is large
    return 1.0/(n-1)*(ex2 - (ex)^2/n)   # sample variance via sum of squares minus squared sum
end
Out[1]:
badvar1 (generic function with 1 method)
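In [ ]:
## An illustrative sketch (added for exposition, not part of the original
# run): badvar1 subtracts two nearly equal large numbers, ex2 and ex^2/n.
# Float64 carries roughly 16 significant digits, and near 1e16 adjacent
# floats are 2.0 apart, so small contributions vanish entirely:
@show eps(1.0e16)              # 2.0, the spacing between Float64s near 1e16
@show 1.0e16 + 1.0 == 1.0e16   # true: the added 1.0 is absorbed by rounding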
In [2]:
using Random
Random.seed!(1)
Out[2]:
MersenneTwister(UInt32[0x00000001], Random.DSFMT.DSFMT_state(Int32[1749029653, …]), …)
In [3]:
## It seems to work: compare against a textbook two-pass implementation
using Statistics
x = randn(100)
# two-pass sample variance: squared deviations from the mean, with Bessel's correction
basevar = x -> (length(x)/(length(x)-1))*mean((x .- mean(x)).^2)
@show badvar1(x)
@show basevar(x)
(ex2, ex ^ 2) = (101.05387543499344, 2.9071941253525297)
badvar1(x) = 1.0204525605428276
basevar(x) = 1.0204525605428274
Out[3]:
1.0204525605428274
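In [ ]:
## Sanity check (added sketch): basevar is the textbook two-pass formula
# with Bessel's correction, so it should agree with Statistics.var up to
# roundoff on this well-scaled data.
@show basevar(x) ≈ var(x)   # expected: true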
In [4]:
## Bigger numbers
x = randn(10000).+1e4
@show badvar1(x)
@show basevar(x)
(ex2, ex ^ 2) = (1.0000024378415356e12, 1.000002427689298e16)
badvar1(x) = 1.0153252996198057
basevar(x) = 1.0153260486975786
Out[4]:
1.0153260486975786
In [5]:
## Getting bigger
x = randn(10000).+1e6
@show badvar1(x)
@show basevar(x)
(ex2, ex ^ 2) = (1.000000009796535e16, 1.0000000097955167e20)
badvar1(x) = 1.0185018501850185
basevar(x) = 1.0089159874917228
Out[5]:
1.0089159874917228
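In [ ]:
## Rough digit accounting (added sketch): with mean mu ≈ 1e6, variance ≈ 1,
# and n = 10^4, both ex2 and ex^2/n are about n*mu^2 = 1e16, while their
# difference is only about n = 1e4. The subtraction therefore cancels
# roughly 12 of Float64's ~16 significant digits, leaving only ~4 correct
# digits, consistent with badvar1 and basevar agreeing to just a few
# digits above.
mu = 1.0e6; n = 10_000
@show log10(n*mu^2) - log10(n)   # ≈ 12 digits lost to cancellation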
In [6]:
## And bigger still
x = randn(10000).+1e8
@show badvar1(x)
@show basevar(x)
(ex2, ex ^ 2) = (1.0000000001194625e20, 1.0000000001194626e24)
badvar1(x) = -1.6385638563856386
basevar(x) = 1.0011340684439614
Out[6]:
1.0011340684439614
In [ ]:
## Eek! That's completely wrong: a variance can never be negative.
# This shows the routine is not backward stable.
# But there is a better way to do this that keeps a little extra data
# around as it goes: Welford's one-pass update, shown below.
In [7]:
## This is essentially what Julia does: Welford's one-pass algorithm,
# which tracks a running mean and the running sum of squared deviations.
function goodvar(x::Vector{Float64})
    n = length(x)
    mu = 0.0   # running mean (renamed from `mean` to avoid shadowing Statistics.mean)
    m2 = 0.0   # running sum of squared deviations from the current mean
    N = 0.0    # count of samples processed so far
    for i=1:n
        N = N + 1
        delta = x[i] - mu          # deviation from the old mean
        mu = mu + delta/N          # update the running mean
        m2 = m2 + delta*(x[i]-mu)  # uses old and new means; no cancellation of large terms
    end
    return m2/(n-1)   # sample variance with Bessel's correction
end

x = randn(10000).+1e10
basevar = var   # from here on, use Statistics.var itself as the baseline
@show badvar1(x)
@show basevar(x)
@show goodvar(x)
@show var(x)
(ex2, ex ^ 2) = (9.999999999953065e23, 9.999999999953037e27)
badvar1(x) = 281885.41734173417
basevar(x) = 0.9864987970412976
goodvar(x) = 0.9864985157601345
var(x) = 0.9864987970412976
Out[7]:
0.9864987970412976
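In [ ]:
## A follow-up sketch (added; the helper name `relerr` and the BigFloat
# reference are our choices, not from the original notebook): sweep the
# shift and compare each routine against a high-precision baseline.
relerr(a, b) = abs(a - b)/abs(b)
for shift in (1e0, 1e4, 1e6, 1e8, 1e10)
    y = randn(10_000) .+ shift
    truth = Float64(var(big.(y)))   # two-pass variance computed in BigFloat
    println("shift = $shift: badvar1 relerr = $(relerr(badvar1(y), truth)), ",
            "goodvar relerr = $(relerr(goodvar(y), truth))")
end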
In [ ]: