From f157e8800a58847d5790f7f5266904c86ba6b925 Mon Sep 17 00:00:00 2001 From: John Myles White Date: Sun, 8 Dec 2013 08:13:48 -0800 Subject: [PATCH] Draft of NA-skipping stats functions --- src/DataArrays.jl | 1 + src/operators.jl | 14 +--- src/stats.jl | 170 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 173 insertions(+), 12 deletions(-) create mode 100644 src/stats.jl diff --git a/src/DataArrays.jl b/src/DataArrays.jl index b150508..33d2c44 100644 --- a/src/DataArrays.jl +++ b/src/DataArrays.jl @@ -74,6 +74,7 @@ module DataArrays include("extras.jl") include("grouping.jl") include("statistics.jl") + include("stats.jl") include("predicates.jl") include("literals.jl") end diff --git a/src/operators.jl b/src/operators.jl index a4e963a..a3335c2 100644 --- a/src/operators.jl +++ b/src/operators.jl @@ -145,18 +145,8 @@ const bit_operators = [:(Base.(:&)), :(Base.(:|)), :(Base.(:$))] -const unary_vector_operators = [:(Base.minimum), - :(Base.maximum), - :(Base.prod), - :(Base.sum), - :(Base.mean), - :(Base.median), - :(Base.std), - :(Base.var), - :(Stats.mad), - :(Base.norm), - :(Stats.skewness), - :(Stats.kurtosis)] +const unary_vector_operators = [:(Stats.mad), + :(Base.norm)] # TODO: dist, iqr diff --git a/src/stats.jl b/src/stats.jl new file mode 100644 index 0000000..72a9e8c --- /dev/null +++ b/src/stats.jl @@ -0,0 +1,170 @@ +function Base.mean{T <: Real}(da::DataArray{T}; + skipna::Bool = false) + s, n = 0.0, 0 + for i in 1:length(da) + if da.na[i] + if !skipna + throw(NAException()) + end + else + s += da.data[i] + n += 1 + end + end + return s / n +end + +function Base.median{T <: Real}(da::DataArray{T}; + skipna::Bool = false) + if !skipna + return median(array(da)) + else + return median(removeNA(da)) + end +end + +function Base.var{T <: Real}(da::DataArray{T}; + skipna::Bool = false) + s, n = 0.0, 0 + m = mean(da, skipna = skipna) + for i in 1:length(da) + if da.na[i] + if !skipna + throw(NAException()) + end + else + z = (da.data[i] - m) + s += z * z + n += 1 + end + end + return s / (n - 1) +end + +function Base.std{T <: Real}(da::DataArray{T}; + skipna::Bool = false) + s, n = 0.0, 0 + m = mean(da, skipna = skipna) + for i in 1:length(da) + if da.na[i] + if !skipna + throw(NAException()) + end + else + z = (da.data[i] - m) + s += z * z + n += 1 + end + end + return sqrt(s / (n - 1)) +end + +function Base.minimum{T <: Real}(da::DataArray{T}; + skipna::Bool = false) + m = typemax(T) + for i in 1:length(da) + if da.na[i] + if !skipna + throw(NAException()) + end + else + m = min(m, da.data[i]) + end + end + return m +end + +function Base.maximum{T <: Real}(da::DataArray{T}; + skipna::Bool = false) + m = typemin(T) + for i in 1:length(da) + if da.na[i] + if !skipna + throw(NAException()) + end + else + m = max(m, da.data[i]) + end + end + return m +end + +function Base.prod{T <: Real}(da::DataArray{T}; + skipna::Bool = false) + r = one(T) + for i in 1:length(da) + if da.na[i] + if !skipna + throw(NAException()) + end + else + r *= da.data[i] + end + end + return r +end + +function Base.sum{T <: Real}(da::DataArray{T}; + skipna::Bool = false) + r = zero(T) + for i in 1:length(da) + if da.na[i] + if !skipna + throw(NAException()) + end + else + r += da.data[i] + end + end + return r +end + +function Stats.skewness{T <: Real}(da::DataArray{T}; + skipna::Bool = false, + m::Real = mean(da, skipna = skipna)) + n = 0 + cm2 = 0.0 # empirical 2nd centered moment (variance) + cm3 = 0.0 # empirical 3rd centered moment + for i in 1:length(da) + if da.na[i] + if !skipna + throw(NAException()) + end + else + x_i = da.data[i] + z = x_i - m + z2 = z * z + cm2 += z2 + cm3 += z2 * z + n += 1 + end + end + cm3 /= n + cm2 /= n + return cm3 / (cm2^1.5) +end + +function Stats.kurtosis{T <: Real}(da::DataArray{T}; + skipna::Bool = false, + m::Real = mean(da, skipna = skipna)) + n = 0 + cm2 = 0.0 # empirical 2nd centered moment (variance) + cm4 = 0.0 # empirical 4th centered moment + for i in 1:length(da) + if da.na[i] + if !skipna + throw(NAException()) + end + else + x_i = da.data[i] + z = x_i - m + z2 = z * z + cm2 += z2 + cm4 += z2 * z2 + n += 1 + end + end + cm4 /= n + cm2 /= n + return (cm4 / (cm2^2)) - 3.0 +end