Skip to content

Commit afd0bb7

Browse files
author
Carlos Parada
authored
Weighted sem (#754)
1 parent 7fca6e8 commit afd0bb7

File tree

2 files changed

+120
-29
lines changed

2 files changed

+120
-29
lines changed

src/scalarstats.jl

Lines changed: 90 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -421,36 +421,101 @@ realXcY(x::Real, y::Real) = x*y
421421
realXcY(x::Complex, y::Complex) = real(x)*real(y) + imag(x)*imag(y)
422422

423423
"""
424-
sem(x)
424+
sem(x; mean=nothing)
425+
sem(x::AbstractArray[, weights::AbstractWeights]; mean=nothing)
425426
426-
Return the standard error of the mean of collection `x`,
427-
i.e. `sqrt(var(x, corrected=true) / length(x))`.
427+
Return the standard error of the mean for a collection `x`.
428+
A pre-computed `mean` may be provided.
429+
430+
When not using weights, this is the (sample) standard deviation
431+
divided by the sample size. If weights are used, the
432+
variance of the sample mean is calculated as follows:
433+
434+
* `AnalyticWeights`: Not implemented.
435+
* `FrequencyWeights`: ``\\frac{\\sum_{i=1}^n w_i (x_i - \\bar{x_i})^2}{(\\sum w_i) (\\sum w_i - 1)}``
436+
* `ProbabilityWeights`: ``\\frac{n}{n-1} \\frac{\\sum_{i=1}^n w_i^2 (x_i - \\bar{x_i})^2}{\\left( \\sum w_i \\right)^2}``
437+
438+
The standard error is then the square root of the above quantities.
439+
440+
# References
441+
442+
Carl-Erik Särndal, Bengt Swensson, Jan Wretman (1992). Model Assisted Survey Sampling.
443+
New York: Springer. pp. 51-53.
428444
"""
429-
function sem(x)
430-
y = iterate(x)
431-
if y === nothing
445+
function sem(x; mean=nothing)
446+
if isempty(x)
447+
# Return the NaN of the type that we would get for a nonempty x
432448
T = eltype(x)
433-
# Return the NaN of the type that we would get, had this collection
434-
# contained any elements (this is consistent with std)
435-
return oftype(sqrt((abs2(zero(T)) + abs2(zero(T)))/2), NaN)
436-
end
437-
count = 1
438-
value, state = y
439-
y = iterate(x, state)
440-
# Use Welford algorithm as seen in (among other places)
441-
# Knuth's TAOCP, Vol 2, page 232, 3rd edition.
442-
M = value / 1
443-
S = real(zero(M))
444-
while y !== nothing
449+
_mean = mean === nothing ? zero(T) / 1 : mean
450+
z = abs2(zero(T) - _mean)
451+
return oftype((z + z) / 2, NaN)
452+
elseif mean === nothing
453+
n = 0
454+
y = iterate(x)
455+
value, state = y
456+
# Use Welford algorithm as seen in (among other places)
457+
# Knuth's TAOCP, Vol 2, page 232, 3rd edition.
458+
_mean = value / 1
459+
sse = real(zero(_mean))
460+
while y !== nothing
461+
value, state = y
462+
y = iterate(x, state)
463+
n += 1
464+
new_mean = _mean + (value - _mean) / n
465+
sse += realXcY(value - _mean, value - new_mean)
466+
_mean = new_mean
467+
end
468+
else
469+
n = 1
470+
y = iterate(x)
445471
value, state = y
446-
y = iterate(x, state)
447-
count += 1
448-
new_M = M + (value - M) / count
449-
S = S + realXcY(value - M, value - new_M)
450-
M = new_M
472+
sse = abs2(value - mean)
473+
while (y = iterate(x, state)) !== nothing
474+
value, state = y
475+
n += 1
476+
sse += abs2(value - mean)
477+
end
478+
end
479+
variance = sse / (n - 1)
480+
return sqrt(variance / n)
481+
end
482+
483+
function sem(x::AbstractArray; mean=nothing)
484+
if isempty(x)
485+
# Return the NaN of the type that we would get for a nonempty x
486+
T = eltype(x)
487+
_mean = mean === nothing ? zero(T) / 1 : mean
488+
z = abs2(zero(T) - _mean)
489+
return oftype((z + z) / 2, NaN)
490+
end
491+
return sqrt(var(x; mean=mean, corrected=true) / length(x))
492+
end
493+
494+
function sem(x::AbstractArray, weights::UnitWeights; mean=nothing)
495+
if length(x) length(weights)
496+
throw(DimensionMismatch("array and weights do not have the same length"))
497+
end
498+
return sem(x; mean=mean)
499+
end
500+
501+
502+
# Weighted methods for the above
503+
sem(x::AbstractArray, weights::FrequencyWeights; mean=nothing) =
504+
sqrt(var(x, weights; mean=mean, corrected=true) / sum(weights))
505+
506+
function sem(x::AbstractArray, weights::ProbabilityWeights; mean=nothing)
507+
if isempty(x)
508+
# Return the NaN of the type that we would get for a nonempty x
509+
return var(x, weights; mean=mean, corrected=true) / 0
510+
else
511+
_mean = mean === nothing ? Statistics.mean(x, weights) : mean
512+
# sum of squared errors = sse
513+
sse = sum(Broadcast.instantiate(Broadcast.broadcasted(x, weights) do x_i, w
514+
return abs2(w * (x_i - _mean))
515+
end))
516+
n = count(!iszero, weights)
517+
return sqrt(sse * n / (n - 1)) / sum(weights)
451518
end
452-
var = S / (count - 1)
453-
return sqrt(var/count)
454519
end
455520

456521
# Median absolute deviation

test/scalarstats.jl

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -164,10 +164,36 @@ z2 = [8. 2. 3. 1.; 24. 10. -1. -1.; 20. 12. 1. -2.]
164164
@test variation([1:5;]) 0.527046276694730
165165
@test variation(skipmissing([missing; 1:5; missing])) 0.527046276694730
166166

167-
@test sem([1:5;]) 0.707106781186548
168-
@test sem(skipmissing([missing; 1:5; missing])) 0.707106781186548
169-
@test sem(Int[]) === NaN
170-
@test sem(skipmissing(Union{Int,Missing}[missing, missing])) === NaN
167+
@test @inferred(sem([1:5;])) 0.707106781186548
168+
@test @inferred(sem(skipmissing([missing; 1:5; missing]))) 0.707106781186548
169+
@test @inferred(sem(skipmissing([missing; 1:5; missing]), mean=3.0)) 0.707106781186548
170+
@test @inferred(sem([1:5;], UnitWeights{Int}(5))) 0.707106781186548
171+
@test @inferred(sem([1:5;], UnitWeights{Int}(5); mean=mean(1:5))) 0.707106781186548
172+
@test_throws DimensionMismatch sem(1:5, UnitWeights{Int}(4))
173+
@test @inferred(sem([1:5;], ProbabilityWeights([1:5;]))) 0.6166 rtol=.001
174+
μ = mean(1:5, ProbabilityWeights([1:5;]))
175+
@test @inferred(sem([1:5;], ProbabilityWeights([1:5;]); mean=μ)) 0.6166 rtol=.001
176+
@test @inferred(sem([10; 1:5;], ProbabilityWeights([0; 1:5;]); mean=μ)) 0.6166 rtol=.001
177+
x = sort!(vcat([5:-1:i for i in 1:5]...))
178+
μ = mean(x)
179+
@test @inferred(sem([1:5;], FrequencyWeights([1:5;]))) sem(x)
180+
@test @inferred(sem([1:5;], FrequencyWeights([1:5;]); mean=μ)) sem(x)
181+
182+
@inferred sem([1:5f0;]; mean=μ) sem(x)
183+
@inferred sem([1:5f0;], ProbabilityWeights([1:5;]); mean=μ)
184+
@inferred sem([1:5f0;], FrequencyWeights([1:5;]); mean=μ)
185+
# Broken: Bug to do with Statistics.jl's implementation of `var`
186+
# @inferred sem([1:5f0;], UnitWeights{Int}(5); mean=μ)
187+
188+
@test @inferred(isnan(sem(Int[])))
189+
@test @inferred(isnan(sem(Int[], FrequencyWeights(Int[]))))
190+
@test @inferred(isnan(sem(Int[], ProbabilityWeights(Int[]))))
191+
192+
@test @inferred(isnan(sem(Int[]; mean=0f0)))
193+
@test @inferred(isnan(sem(Int[], FrequencyWeights(Int[]); mean=0f0)))
194+
@test @inferred(isnan(sem(Int[], ProbabilityWeights(Int[]); mean=0f0)))
195+
196+
@test @inferred(isnan(sem(skipmissing(Union{Int,Missing}[missing, missing]))))
171197
@test_throws MethodError sem(Any[])
172198
@test_throws MethodError sem(skipmissing([missing]))
173199

0 commit comments

Comments
 (0)