diff --git a/src/cov.jl b/src/cov.jl index a77cd5086..59de7f25b 100644 --- a/src/cov.jl +++ b/src/cov.jl @@ -2,7 +2,7 @@ # auxiliary functions -function _symmetrize!(a::DenseMatrix) +function _symmetrize!(a::AbstractMatrix) m, n = size(a) m == n || error("a must be a square matrix.") for j = 1:n @@ -15,7 +15,18 @@ function _symmetrize!(a::DenseMatrix) return a end -function _scalevars(x::DenseMatrix, s::AbstractWeights, dims::Int) +function _symmetrize!(a::AbstractSparseMatrix) + m, n = size(a) + m == n || error("a must be a square matrix.") + for (i,j,vl) in zip(findnz(a)...) + i > j || continue + vr = a[j,i] + a[i,j] = a[j,i] = middle(vl, vr) + end + return a +end + +function _scalevars(x::AbstractMatrix, s::AbstractWeights, dims::Int) dims == 1 ? Diagonal(s) * x : dims == 2 ? x * Diagonal(s) : error("dims should be either 1 or 2.") @@ -23,12 +34,12 @@ end ## scatter matrix -_unscaled_covzm(x::DenseMatrix, dims::Colon) = unscaled_covzm(x) -_unscaled_covzm(x::DenseMatrix, dims::Integer) = unscaled_covzm(x, dims) +_unscaled_covzm(x::AbstractMatrix, dims::Colon) = unscaled_covzm(x) +_unscaled_covzm(x::AbstractMatrix, dims::Integer) = unscaled_covzm(x, dims) -_unscaled_covzm(x::DenseMatrix, wv::AbstractWeights, dims::Colon) = +_unscaled_covzm(x::AbstractMatrix, wv::AbstractWeights, dims::Colon) = _symmetrize!(unscaled_covzm(x, _scalevars(x, wv))) -_unscaled_covzm(x::DenseMatrix, wv::AbstractWeights, dims::Integer) = +_unscaled_covzm(x::AbstractMatrix, wv::AbstractWeights, dims::Integer) = _symmetrize!(unscaled_covzm(x, _scalevars(x, wv, dims), dims)) """ @@ -75,30 +86,29 @@ Finally, bias correction is applied to the covariance calculation if """ function mean_and_cov end -scattermat(x::DenseMatrix; mean=nothing, dims::Int=1) = +scattermat(x::AbstractMatrix; mean=nothing, dims::Int=1) = _scattermatm(x, mean, dims) -_scattermatm(x::DenseMatrix, ::Nothing, dims::Int) = +_scattermatm(x::AbstractMatrix, ::Nothing, dims::Int) = _unscaled_covzm(x .- mean(x, dims=dims), dims) -_scattermatm(x::DenseMatrix, mean, dims::Int=1) = +_scattermatm(x::AbstractMatrix, mean, dims::Int=1) = _unscaled_covzm(x .- mean, dims) -scattermat(x::DenseMatrix, wv::AbstractWeights; mean=nothing, dims::Int=1) = +scattermat(x::AbstractMatrix, wv::AbstractWeights; mean=nothing, dims::Int=1) = _scattermatm(x, wv, mean, dims) -_scattermatm(x::DenseMatrix, wv::AbstractWeights, ::Nothing, dims::Int) = +_scattermatm(x::AbstractMatrix, wv::AbstractWeights, ::Nothing, dims::Int) = _unscaled_covzm(x .- mean(x, wv, dims=dims), wv, dims) -_scattermatm(x::DenseMatrix, wv::AbstractWeights, mean, dims::Int) = +_scattermatm(x::AbstractMatrix, wv::AbstractWeights, mean, dims::Int) = _unscaled_covzm(x .- mean, wv, dims) ## weighted cov -covm(x::DenseMatrix, mean, w::AbstractWeights, dims::Int=1; +covm(x::AbstractMatrix, mean, w::AbstractWeights, dims::Int=1; corrected::DepBool=nothing) = rmul!(scattermat(x, w, mean=mean, dims=dims), varcorrection(w, depcheck(:covm, corrected))) - -cov(x::DenseMatrix, w::AbstractWeights, dims::Int=1; corrected::DepBool=nothing) = +cov(x::AbstractMatrix, w::AbstractWeights, dims::Int=1; corrected::DepBool=nothing) = covm(x, mean(x, w, dims=dims), w, dims; corrected=depcheck(:cov, corrected)) -function corm(x::DenseMatrix, mean, w::AbstractWeights, vardim::Int=1) +function corm(x::AbstractMatrix, mean, w::AbstractWeights, vardim::Int=1) c = covm(x, mean, w, vardim; corrected=false) s = stdm(x, w, mean, vardim; corrected=false) cov2cor!(c, s) @@ -110,14 +120,18 @@ end Compute the Pearson correlation matrix of `X` along the dimension `dims` with a weighting `w` . """ -cor(x::DenseMatrix, w::AbstractWeights, dims::Int=1) = +cor(x::AbstractMatrix, w::AbstractWeights, dims::Int=1) = corm(x, mean(x, w, dims=dims), w, dims) -function mean_and_cov(x::DenseMatrix, dims::Int=1; corrected::Bool=true) +function mean_and_cov(x::AbstractVector; corrected::Bool=true) + m = mean(x) + return m, covm(x, m, corrected=corrected) +end +function mean_and_cov(x::AbstractMatrix, dims::Int=1; corrected::Bool=true) m = mean(x, dims=dims) return m, covm(x, m, dims, corrected=corrected) end -function mean_and_cov(x::DenseMatrix, wv::AbstractWeights, dims::Int=1; +function mean_and_cov(x::AbstractMatrix, wv::AbstractWeights, dims::Int=1; corrected::DepBool=nothing) m = mean(x, wv, dims=dims) return m, cov(x, wv, dims; corrected=depcheck(:mean_and_cov, corrected)) @@ -148,8 +162,10 @@ standard deviations `s`. function cor2cov!(C::AbstractMatrix, s::AbstractArray) n = length(s) size(C) == (n, n) || throw(DimensionMismatch("inconsistent dimensions")) - for i in CartesianIndices(size(C)) - @inbounds C[i] *= s[i[1]] * s[i[2]] + @inbounds for i in CartesianIndices(size(C)) + si = s[i[1]] * s[i[2]] + # the covariance is 0 when si==0, although C[i] is NaN in this case + C[i] = iszero(si) ? zero(eltype(C)) : C[i] * si end return C end diff --git a/test/cov.jl b/test/cov.jl index ab310276c..b12f71032 100644 --- a/test/cov.jl +++ b/test/cov.jl @@ -1,13 +1,26 @@ using StatsBase using LinearAlgebra, Random, Test +using SparseArrays struct EmptyCovarianceEstimator <: CovarianceEstimator end +struct WrappedArray{T,N,A} <: AbstractArray{T,N} + a::A + WrappedArray(a::AbstractArray{T,N}) where {T,N} = new{T,N,typeof(a)}(a) +end +Base.size(w::WrappedArray) = size(w.a) +Base.getindex(w::WrappedArray{T,N}, I::Vararg{Int, N}) where {T,N} = getindex(w.a, I...) +Base.setindex!(w::WrappedArray{T,N}, v, I::Vararg{Int, N}) where {T,N} = setindex!(w.a, v, I...) + +≈ₙ(x,y) = isapprox(x, y; nans=true) + @testset "StatsBase.Covariance" begin weight_funcs = (weights, aweights, fweights, pweights) +array = randn(3, 8) +wrapped_array = WrappedArray(array) +sparse_array = sprandn(3, 8, 0.2) -@testset "$f" for f in weight_funcs - X = randn(3, 8) +@testset "$f,$(typeof(X))" for f in weight_funcs, X in (array, wrapped_array, sparse_array) Z1 = X .- mean(X, dims = 1) Z2 = X .- mean(X, dims = 2) @@ -87,27 +100,32 @@ weight_funcs = (weights, aweights, fweights, pweights) @testset "Mean and covariance" begin (m, C) = mean_and_cov(X; corrected=false) @test m == mean(X, dims=1) - @test C == cov(X, dims=1, corrected=false) + @test C ≈ cov(X, dims=1, corrected=false) (m, C) = mean_and_cov(X, 1; corrected=false) @test m == mean(X, dims=1) - @test C == cov(X, dims=1, corrected = false) + @test C ≈ cov(X, dims=1, corrected = false) (m, C) = mean_and_cov(X, 2; corrected=false) @test m == mean(X, dims=2) - @test C == cov(X, dims=2, corrected = false) + @test C ≈ cov(X, dims=2, corrected = false) (m, C) = mean_and_cov(X, wv1; corrected=false) @test m == mean(X, wv1, dims=1) - @test C == cov(X, wv1, 1, corrected=false) + @test C ≈ cov(X, wv1, 1, corrected=false) (m, C) = mean_and_cov(X, wv1, 1; corrected=false) @test m == mean(X, wv1, dims=1) - @test C == cov(X, wv1, 1, corrected=false) + @test C ≈ cov(X, wv1, 1, corrected=false) (m, C) = mean_and_cov(X, wv2, 2; corrected=false) @test m == mean(X, wv2, dims=2) - @test C == cov(X, wv2, 2, corrected=false) + @test C ≈ cov(X, wv2, 2, corrected=false) + + v = [X[:,i] for i in axes(X,2)] + (m, C) = mean_and_cov(v; corrected=false) + @test m == mean(v) + @test C ≈ cov(v, corrected=false) end @testset "Conversions" begin std1 = std(X, wv1, 1; corrected=false) @@ -120,10 +138,10 @@ weight_funcs = (weights, aweights, fweights, pweights) cor2 = cor(X, wv2, 2) @testset "cov2cor" begin - @test cov2cor(cov(X, dims = 1), std(X, dims = 1)) ≈ cor(X, dims = 1) - @test cov2cor(cov(X, dims = 2), std(X, dims = 2)) ≈ cor(X, dims = 2) - @test cov2cor(cov1, std1) ≈ cor1 - @test cov2cor(cov2, std2) ≈ cor2 + @test cov2cor(cov(X, dims = 1), std(X, dims = 1)) ≈ₙ cor(X, dims = 1) + @test cov2cor(cov(X, dims = 2), std(X, dims = 2)) ≈ₙ cor(X, dims = 2) + @test cov2cor(cov1, std1) ≈ₙ cor1 + @test cov2cor(cov2, std2) ≈ₙ cor2 end @testset "cor2cov" begin @test cor2cov(cor(X, dims = 1), std(X, dims = 1)) ≈ cov(X, dims = 1) @@ -158,30 +176,30 @@ weight_funcs = (weights, aweights, fweights, pweights) @testset "Mean and covariance" begin (m, C) = mean_and_cov(X; corrected=true) @test m == mean(X, dims=1) - @test C == cov(X, dims=1, corrected = true) + @test C ≈ cov(X, dims=1, corrected = true) (m, C) = mean_and_cov(X, 1; corrected=true) @test m == mean(X, dims=1) - @test C == cov(X, dims=1, corrected = true) + @test C ≈ cov(X, dims=1, corrected = true) (m, C) = mean_and_cov(X, 2; corrected=true) @test m == mean(X, dims=2) - @test C == cov(X, dims=2, corrected = true) + @test C ≈ cov(X, dims=2, corrected = true) if isa(wv1, Weights) @test_throws ArgumentError mean_and_cov(X, wv1; corrected=true) else (m, C) = mean_and_cov(X, wv1; corrected=true) @test m == mean(X, wv1, dims=1) - @test C == cov(X, wv1, 1; corrected=true) + @test C ≈ cov(X, wv1, 1; corrected=true) (m, C) = mean_and_cov(X, wv1, 1; corrected=true) @test m == mean(X, wv1, dims=1) - @test C == cov(X, wv1, 1; corrected=true) + @test C ≈ cov(X, wv1, 1; corrected=true) (m, C) = mean_and_cov(X, wv2, 2; corrected=true) @test m == mean(X, wv2, dims=2) - @test C == cov(X, wv2, 2; corrected=true) + @test C ≈ cov(X, wv2, 2; corrected=true) end end @testset "Conversions" begin @@ -196,25 +214,26 @@ weight_funcs = (weights, aweights, fweights, pweights) cor2 = cor(X, wv2, 2) @testset "cov2cor" begin - @test cov2cor(cov(X, dims = 1), std(X, dims = 1)) ≈ cor(X, dims = 1) - @test cov2cor(cov(X, dims = 2), std(X, dims = 2)) ≈ cor(X, dims = 2) - @test cov2cor(cov1, std1) ≈ cor1 - @test cov2cor(cov2, std2) ≈ cor2 + @test cov2cor(cov(X, dims = 1), std(X, dims = 1)) ≈ₙ cor(X, dims = 1) + @test cov2cor(cov(X, dims = 2), std(X, dims = 2)) ≈ₙ cor(X, dims = 2) + @test cov2cor(cov1, std1) ≈ₙ cor1 + @test cov2cor(cov2, std2) ≈ₙ cor2 end @testset "cov2cor!" begin tmp_cov1 = copy(cov1) - @test !(tmp_cov1 ≈ cor1) + @test !(tmp_cov1 ≈ₙ cor1) StatsBase.cov2cor!(tmp_cov1, std1) - @test tmp_cov1 ≈ cor1 + @test tmp_cov1 ≈ₙ cor1 tmp_cov2 = copy(cov2) - @test !(tmp_cov2 ≈ cor2) + @test !(tmp_cov2 ≈ₙ cor2) StatsBase.cov2cor!(tmp_cov2, std2) - @test tmp_cov2 ≈ cor2 + @test tmp_cov2 ≈ₙ cor2 end @testset "cor2cov" begin + @test cor2cov(cor(X, dims = 1), std(X, dims = 1)) ≈ cov(X, dims = 1) @test cor2cov(cor(X, dims = 2), std(X, dims = 2)) ≈ cov(X, dims = 2) @test cor2cov(cor1, std1) ≈ cov1 @@ -237,8 +256,8 @@ weight_funcs = (weights, aweights, fweights, pweights) end @testset "Correlation" begin - @test cor(X, f(ones(3)), 1) ≈ cor(X, dims = 1) - @test cor(X, f(ones(8)), 2) ≈ cor(X, dims = 2) + @test cor(X, f(ones(3)), 1) ≈ₙ cor(X, dims = 1) + @test cor(X, f(ones(8)), 2) ≈ₙ cor(X, dims = 2) cov1 = cov(X, wv1, 1; corrected=false) std1 = std(X, wv1, 1; corrected=false) @@ -247,8 +266,8 @@ weight_funcs = (weights, aweights, fweights, pweights) expected_cor1 = StatsBase.cov2cor!(cov1, std1) expected_cor2 = StatsBase.cov2cor!(cov2, std2) - @test cor(X, wv1, 1) ≈ expected_cor1 - @test cor(X, wv2, 2) ≈ expected_cor2 + @test cor(X, wv1, 1) ≈ₙ expected_cor1 + @test cor(X, wv2, 2) ≈ₙ expected_cor2 end @testset "Abstract covariance estimation" begin