diff --git a/.travis.yml b/.travis.yml index 69ca35a9..6c7c7c2c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,7 @@ notifications: # uncomment the following lines to override the default test script script: - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi - - julia -e 'Pkg.clone(pwd()); Pkg.checkout("DataFrames", "dfk/statsmodel-purge"); Pkg.build("StatsModels"); Pkg.test("StatsModels"; coverage=true)' + - julia -e 'Pkg.clone(pwd()); Pkg.clone("https://github.com/JuliaData/DataTables.jl.git"); Pkg.build("StatsModels"); Pkg.test("StatsModels"; coverage=true)' after_success: # build and deploy documentation with Documenter.jl - julia -e 'cd(Pkg.dir("StatsModels")); Pkg.add("Documenter"); include(joinpath("docs", "make.jl"))' diff --git a/docs/src/formula.md b/docs/src/formula.md index 4ffbe7dc..5d060908 100644 --- a/docs/src/formula.md +++ b/docs/src/formula.md @@ -14,7 +14,7 @@ fields with possibly heterogeneous types. One of the primary goals of `StatsModels` is to make it simpler to transform tabular data into matrix format suitable for statistical modeling. -At the moment, "tabular data" means an `AbstractDataFrame`. Ultimately, the +At the moment, "tabular data" means an `AbstractDataTable`. Ultimately, the goal is to support any tabular data format that adheres to a minimal API, **regardless of backend**. @@ -88,7 +88,7 @@ dropterm The main use of `Formula`s is for fitting statistical models based on tabular data. From the user's perspective, this is done by `fit` methods that take a -`Formula` and a `DataFrame` instead of numeric matrices. +`Formula` and a `DataTable` instead of numeric matrices. Internally, this is accomplished in three stages: diff --git a/docs/src/index.md b/docs/src/index.md index 313546e6..df059ca0 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -21,5 +21,5 @@ developers when dealing with statistical models and tabular data. 
* `RegressionModel` Much of this package was formerly part -of [`DataFrames`](https://www.github.com/JuliaStats/DataFrames.jl) +of [`DataTables`](https://www.github.com/JuliaData/DataTables.jl) and [`StatsBase`](https://www.github.com/JuliaStats/StatsBase.jl). diff --git a/src/StatsModels.jl b/src/StatsModels.jl index 7ec09f96..988e2d91 100644 --- a/src/StatsModels.jl +++ b/src/StatsModels.jl @@ -3,7 +3,7 @@ __precompile__(true) module StatsModels using Compat -using DataFrames +using DataTables using StatsBase using NullableArrays using CategoricalArrays diff --git a/src/contrasts.jl b/src/contrasts.jl index f2ba9ef8..d48f7b7d 100644 --- a/src/contrasts.jl +++ b/src/contrasts.jl @@ -169,28 +169,21 @@ function ContrastsMatrix{C <: AbstractContrasts}(contrasts::C, levels::AbstractV ContrastsMatrix(mat, tnames, c_levels, contrasts) end -# Methods for constructing ContrastsMatrix from data. These are called in -# ModelFrame constructor and setcontrasts!. -ContrastsMatrix(C::AbstractContrasts, - v::Union{CategoricalArray, NullableCategoricalArray}) = - ContrastsMatrix(C, levels(v)) -ContrastsMatrix{C <: AbstractContrasts}(c::Type{C}, - col::Union{CategoricalArray, NullableCategoricalArray}) = +ContrastsMatrix{C <: AbstractContrasts}(c::Type{C}, levels::AbstractVector) = throw(ArgumentError("contrast types must be instantiated (use $c() instead of $c)")) -# given an existing ContrastsMatrix, check that all of the levels present in the -# data are present in the contrasts. Note that this behavior is different from the +# given an existing ContrastsMatrix, check that all passed levels are present +# in the contrasts. Note that this behavior is different from the # ContrastsMatrix constructor, which requires that the levels be exactly the same. # This method exists to support things like `predict` that can operate on new data # which may contain only a subset of the original data's levels. 
Checking here # (instead of in `modelmat_cols`) allows an informative error message. -function ContrastsMatrix(c::ContrastsMatrix, - col::Union{CategoricalArray, NullableCategoricalArray}) - if !isempty(setdiff(levels(col), c.levels)) - throw(ArgumentError("there are levels in data that are not in ContrastsMatrix: " * - "$(setdiff(levels(col), c.levels))" * - "\n Data levels: $(levels(col))" * - "\n Contrast levels: $(c.levels)")) +function ContrastsMatrix(c::ContrastsMatrix, levels::AbstractVector) + if !isempty(setdiff(levels, c.levels)) + throw(ArgumentError("there are levels in data that are not in ContrastsMatrix: " * + "$(setdiff(levels, c.levels))" * + "\n Data levels: $(levels)" * + "\n Contrast levels: $(c.levels)")) end return c end diff --git a/src/modelframe.jl b/src/modelframe.jl index d18f5fc2..55bd8c91 100644 --- a/src/modelframe.jl +++ b/src/modelframe.jl @@ -1,5 +1,5 @@ """ -Wrapper which combines Formula (Terms) and an AbstractDataFrame +Wrapper which combines Formula (Terms) and an AbstractDataTable This wrapper encapsulates all the information that's required to transform data of the same structure as the wrapped data frame into a model matrix. This goes @@ -13,19 +13,19 @@ then creates the necessary contrasts matrices and stores the results. 
# Constructors ```julia -ModelFrame(f::Formula, df::AbstractDataFrame; contrasts::Dict = Dict()) -ModelFrame(ex::Expr, d::AbstractDataFrame; contrasts::Dict = Dict()) -ModelFrame(terms::Terms, df::AbstractDataFrame; contrasts::Dict = Dict()) +ModelFrame(f::Formula, df::AbstractDataTable; contrasts::Dict = Dict()) +ModelFrame(ex::Expr, d::AbstractDataTable; contrasts::Dict = Dict()) +ModelFrame(terms::Terms, df::AbstractDataTable; contrasts::Dict = Dict()) # Inner constructors: -ModelFrame(df::AbstractDataFrame, terms::Terms, missing::BitArray) -ModelFrame(df::AbstractDataFrame, terms::Terms, missing::BitArray, contrasts::Dict{Symbol, ContrastsMatrix}) +ModelFrame(df::AbstractDataTable, terms::Terms, missing::BitArray) +ModelFrame(df::AbstractDataTable, terms::Terms, missing::BitArray, contrasts::Dict{Symbol, ContrastsMatrix}) ``` # Arguments * `f::Formula`: Formula whose left hand side is the *response* and right hand side are the *predictors*. -* `df::AbstractDataFrame`: The data being modeled. This is used at this stage +* `df::AbstractDataTable`: The data being modeled. This is used at this stage to determine which variables are categorical, and otherwise held for [`ModelMatrix`](@ref). 
* `contrasts::Dict`: An optional Dict of contrast codings for each categorical @@ -41,21 +41,23 @@ ModelFrame(df::AbstractDataFrame, terms::Terms, missing::BitArray, contrasts::Di # Examples ```julia -julia> df = DataFrame(x = 1:4, y = 5:9) +julia> df = DataTable(x = 1:4, y = 5:8) julia> mf = ModelFrame(y ~ 1 + x, df) ``` """ type ModelFrame - df::AbstractDataFrame + df::AbstractDataTable terms::Terms msng::BitArray ## mapping from df keys to contrasts matrices contrasts::Dict{Symbol, ContrastsMatrix} end -is_categorical(::Union{CategoricalArray, NullableCategoricalArray}) = true -is_categorical(::Any) = false +is_categorical{T<:Real}(::AbstractArray{T}) = false +typealias NullableReal{T<:Real} Nullable{T} +is_categorical{T<:NullableReal}(::AbstractArray{T}) = false +is_categorical(::AbstractArray) = true ## Check for non-redundancy of columns. For instance, if x is a factor with two ## levels, it should be expanded into two columns in y~0+x but only one column @@ -67,7 +69,7 @@ is_categorical(::Any) = false ## ## This modifies the Terms, setting `trms.is_non_redundant = true` for all non- ## redundant evaluation terms. 
-function check_non_redundancy!(trms::Terms, df::AbstractDataFrame) +function check_non_redundancy!(trms::Terms, df::AbstractDataTable) (n_eterms, n_terms) = size(trms.factors) @@ -102,34 +104,49 @@ end const DEFAULT_CONTRASTS = DummyCoding +_unique(x::CategoricalArray) = unique(x) +_unique(x::NullableCategoricalArray) = [get(l) for l in unique(x) if !isnull(l)] + +function _unique{T<:Nullable}(x::AbstractArray{T}) + levs = [get(l) for l in unique(x) if !isnull(l)] + try; sort!(levs); end + return levs +end + +function _unique(x::AbstractArray) + levs = unique(x) + try; sort!(levs); end + return levs +end + ## Set up contrasts: ## Combine actual DF columns and contrast types if necessary to compute the ## actual contrasts matrices, levels, and term names (using DummyCoding ## as the default) -function evalcontrasts(df::AbstractDataFrame, contrasts::Dict = Dict()) +function evalcontrasts(df::AbstractDataTable, contrasts::Dict = Dict()) evaledContrasts = Dict() for (term, col) in eachcol(df) is_categorical(col) || continue evaledContrasts[term] = ContrastsMatrix(haskey(contrasts, term) ? contrasts[term] : DEFAULT_CONTRASTS(), - col) + _unique(col)) end return evaledContrasts end ## Default NULL handler. 
Others can be added as keyword arguments -function null_omit(df::DataFrame) - cc = complete_cases(df) +function null_omit(df::DataTable) + cc = completecases(df) df[cc,:], cc end _droplevels!(x::Any) = x _droplevels!(x::Union{CategoricalArray, NullableCategoricalArray}) = droplevels!(x) -function ModelFrame(trms::Terms, d::AbstractDataFrame; +function ModelFrame(trms::Terms, d::AbstractDataTable; contrasts::Dict = Dict()) - df, msng = null_omit(DataFrame(map(x -> d[x], trms.eterms))) + df, msng = null_omit(DataTable(map(x -> d[x], trms.eterms))) names!(df, convert(Vector{Symbol}, map(string, trms.eterms))) for c in eachcol(df) _droplevels!(c[2]) end @@ -141,9 +158,9 @@ function ModelFrame(trms::Terms, d::AbstractDataFrame; ModelFrame(df, trms, msng, evaledContrasts) end -ModelFrame(df::AbstractDataFrame, term::Terms, msng::BitArray) = ModelFrame(df, term, msng, evalcontrasts(df)) -ModelFrame(f::Formula, d::AbstractDataFrame; kwargs...) = ModelFrame(Terms(f), d; kwargs...) -ModelFrame(ex::Expr, d::AbstractDataFrame; kwargs...) = ModelFrame(Formula(ex), d; kwargs...) +ModelFrame(df::AbstractDataTable, term::Terms, msng::BitArray) = ModelFrame(df, term, msng, evalcontrasts(df)) +ModelFrame(f::Formula, d::AbstractDataTable; kwargs...) = ModelFrame(Terms(f), d; kwargs...) +ModelFrame(ex::Expr, d::AbstractDataTable; kwargs...) = ModelFrame(Formula(ex), d; kwargs...) """ setcontrasts!(mf::ModelFrame, new_contrasts::Dict) @@ -151,7 +168,7 @@ ModelFrame(ex::Expr, d::AbstractDataFrame; kwargs...) = ModelFrame(Formula(ex), Modify the contrast coding system of a ModelFrame in place. 
""" function setcontrasts!(mf::ModelFrame, new_contrasts::Dict) - new_contrasts = Dict([ Pair(col, ContrastsMatrix(contr, mf.df[col])) + new_contrasts = Dict([ Pair(col, ContrastsMatrix(contr, _unique(mf.df[col]))) for (col, contr) in filter((k,v)->haskey(mf.df, k), new_contrasts) ]) mf.contrasts = merge(mf.contrasts, new_contrasts) diff --git a/src/modelmatrix.jl b/src/modelmatrix.jl index 046ff0ed..a0d70efc 100644 --- a/src/modelmatrix.jl +++ b/src/modelmatrix.jl @@ -1,5 +1,7 @@ typealias AbstractFloatMatrix{T<:AbstractFloat} AbstractMatrix{T} +typealias AbstractRealVector{T<:Real} AbstractVector{T} +typealias NullableRealVector{T<:NullableReal} AbstractVector{T} """ Convert a `ModelFrame` into a numeric matrix suitable for modeling @@ -35,18 +37,26 @@ function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, name::Symbol, mf::Mode end end -modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::AbstractVector) = +modelmat_cols{T<:AbstractFloatMatrix, V<:AbstractRealVector}(::Type{T}, v::V) = convert(T, reshape(v, length(v), 1)) # FIXME: this inefficient method should not be needed, cf. JuliaLang/julia#18264 -modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::NullableVector) = +modelmat_cols{T<:AbstractFloatMatrix, V<:NullableRealVector}(::Type{T}, v::V) = convert(T, Matrix(reshape(v, length(v), 1))) +modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::Union{CategoricalVector, NullableCategoricalVector}) = + modelmat_cols(T, reshape(v, length(v), 1)) +# All non-real columns are considered as categorical +# Could be made more efficient by directly storing the result into the model matrix """ - modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::PooledDataVector, contrast::ContrastsMatrix) + modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::AbstractVector, contrast::ContrastsMatrix) Construct `ModelMatrix` columns of type `T` based on specified contrasts, ensuring that levels align properly. 
""" +modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::AbstractVector, contrast::ContrastsMatrix) = + modelmat_cols(T, categorical(v), contrast) + + function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::Union{CategoricalVector, NullableCategoricalVector}, contrast::ContrastsMatrix) diff --git a/src/statsmodel.jl b/src/statsmodel.jl index 3dd5768d..8e4ab660 100644 --- a/src/statsmodel.jl +++ b/src/statsmodel.jl @@ -31,23 +31,23 @@ macro delegate(source, targets) return result end -# Wrappers for DataFrameStatisticalModel and DataFrameRegressionModel -immutable DataFrameStatisticalModel{M,T} <: StatisticalModel +# Wrappers for DataTableStatisticalModel and DataTableRegressionModel +immutable DataTableStatisticalModel{M,T} <: StatisticalModel model::M mf::ModelFrame mm::ModelMatrix{T} end -immutable DataFrameRegressionModel{M,T} <: RegressionModel +immutable DataTableRegressionModel{M,T} <: RegressionModel model::M mf::ModelFrame mm::ModelMatrix{T} end -for (modeltype, dfmodeltype) in ((:StatisticalModel, DataFrameStatisticalModel), - (:RegressionModel, DataFrameRegressionModel)) +for (modeltype, dfmodeltype) in ((:StatisticalModel, DataTableStatisticalModel), + (:RegressionModel, DataTableRegressionModel)) @eval begin - function StatsBase.fit{T<:$modeltype}(::Type{T}, f::Formula, df::AbstractDataFrame, + function StatsBase.fit{T<:$modeltype}(::Type{T}, f::Formula, df::AbstractDataTable, args...; contrasts::Dict = Dict(), kwargs...) 
mf = ModelFrame(f, df, contrasts=contrasts) mm = ModelMatrix(mf) @@ -58,24 +58,24 @@ for (modeltype, dfmodeltype) in ((:StatisticalModel, DataFrameStatisticalModel), end # Delegate functions from StatsBase that use our new types -typealias DataFrameModels @compat(Union{DataFrameStatisticalModel, DataFrameRegressionModel}) -@delegate DataFrameModels.model [StatsBase.coef, StatsBase.confint, +typealias DataTableModels @compat(Union{DataTableStatisticalModel, DataTableRegressionModel}) +@delegate DataTableModels.model [StatsBase.coef, StatsBase.confint, StatsBase.deviance, StatsBase.nulldeviance, StatsBase.loglikelihood, StatsBase.nullloglikelihood, StatsBase.dof, StatsBase.dof_residual, StatsBase.nobs, StatsBase.stderr, StatsBase.vcov] -@delegate DataFrameRegressionModel.model [StatsBase.residuals, StatsBase.model_response, +@delegate DataTableRegressionModel.model [StatsBase.residuals, StatsBase.model_response, StatsBase.predict, StatsBase.predict!] # Need to define these manually because of ambiguity using @delegate -StatsBase.r2(mm::DataFrameRegressionModel) = r2(mm.model) -StatsBase.adjr2(mm::DataFrameRegressionModel) = adjr2(mm.model) -StatsBase.r2(mm::DataFrameRegressionModel, variant::Symbol) = r2(mm.model, variant) -StatsBase.adjr2(mm::DataFrameRegressionModel, variant::Symbol) = adjr2(mm.model, variant) +StatsBase.r2(mm::DataTableRegressionModel) = r2(mm.model) +StatsBase.adjr2(mm::DataTableRegressionModel) = adjr2(mm.model) +StatsBase.r2(mm::DataTableRegressionModel, variant::Symbol) = r2(mm.model, variant) +StatsBase.adjr2(mm::DataTableRegressionModel, variant::Symbol) = adjr2(mm.model, variant) # Predict function that takes data frame as predictor instead of matrix -function StatsBase.predict(mm::DataFrameRegressionModel, df::AbstractDataFrame; kwargs...) +function StatsBase.predict(mm::DataTableRegressionModel, df::AbstractDataTable; kwargs...) 
# copy terms, removing outcome if present (ModelFrame will complain if a - # term is not found in the DataFrame and we don't want to remove elements with missing y) + # term is not found in the DataTable and we don't want to remove elements with missing y) newTerms = dropresponse!(mm.mf.terms) # create new model frame/matrix mf = ModelFrame(newTerms, df; contrasts = mm.mf.contrasts) @@ -89,7 +89,7 @@ end # coeftable implementation -function StatsBase.coeftable(model::DataFrameModels) +function StatsBase.coeftable(model::DataTableModels) ct = coeftable(model.model) cfnames = coefnames(model.mf) if length(ct.rownms) == length(cfnames) @@ -99,7 +99,7 @@ function StatsBase.coeftable(model::DataFrameModels) end # show function that delegates to coeftable -function Base.show(io::IO, model::DataFrameModels) +function Base.show(io::IO, model::DataTableModels) try ct = coeftable(model) println(io, "$(typeof(model))") diff --git a/test/contrasts.jl b/test/contrasts.jl index bad15996..598cd795 100644 --- a/test/contrasts.jl +++ b/test/contrasts.jl @@ -1,11 +1,12 @@ module TestContrasts using Base.Test -using DataFrames +using DataTables +using CategoricalArrays using StatsModels -d = DataFrame(x = CategoricalVector([:a, :b, :c, :a, :a, :b])) +d = DataTable(x = CategoricalVector([:a, :b, :c, :a, :a, :b])) mf = ModelFrame(Formula(nothing, :x), d) diff --git a/test/formula.jl b/test/formula.jl index f52e7af6..a17cacfa 100644 --- a/test/formula.jl +++ b/test/formula.jl @@ -6,7 +6,7 @@ using Compat # TODO: # - grouped variables in formulas with interactions -# - is it fast? Can expand() handle DataFrames? +# - is it fast? Can expand() handle DataTables? # - deal with intercepts # - implement ^2 for datavector's # - support more transformations with I()? 
diff --git a/test/modelmatrix.jl b/test/modelmatrix.jl index e59b5ee2..1ed5f689 100644 --- a/test/modelmatrix.jl +++ b/test/modelmatrix.jl @@ -2,10 +2,11 @@ module TestModelMatrix using Base.Test using StatsModels -using DataFrames +using DataTables using Compat +using CategoricalArrays -# for testing while DataFrames still exports these: +# for testing while DataTables still exports these: import StatsModels: @formula, Formula, ModelMatrix, ModelFrame, DummyCoding, EffectsCoding, HelmertCoding, ContrastsCoding, setcontrasts!, coefnames @@ -13,7 +14,7 @@ import StatsModels: @formula, Formula, ModelMatrix, ModelFrame, DummyCoding, Eff sparsetype = SparseMatrixCSC{Float64,Int} -d = DataFrame() +d = DataTable() d[:y] = [1:4;] d[:x1] = [5:8;] d[:x2] = [9:12;] @@ -50,59 +51,59 @@ mm = ModelMatrix(mf) @test coefnames(mf)[2:end] == ["x1p: 6", "x1p: 7", "x1p: 8"] @test mm.m == ModelMatrix{sparsetype}(mf).m -#test_group("create a design matrix from interactions from two DataFrames") +#test_group("create a design matrix from interactions from two DataTables") ## this was removed in commit dead4562506badd7e84a2367086f5753fa49bb6a -## b = DataFrame() +## b = DataTable() ## b["x2"] = DataVector(x2) ## df = interaction_design_matrix(a,b) ## @test df[:,1] == DataVector([0, 10., 0, 0]) ## @test df[:,2] == DataVector([0, 0, 11., 0]) ## @test df[:,3] == DataVector([0, 0, 0, 12.]) -#test_group("expanding an singleton expression/symbol into a DataFrame") +#test_group("expanding an singleton expression/symbol into a DataTable") ## generalized expand was dropped, too ## df = deepcopy(d) ## r = expand(:x2, df) -## @test isa(r, DataFrame) +## @test isa(r, DataTable) ## @test r[:,1] == DataVector([9,10,11,12]) # TODO: test float vs int return ## df = deepcopy(d) ## ex = :(log(x2)) ## r = expand(ex, df) -## @test isa(r, DataFrame) +## @test isa(r, DataTable) ## @test r[:,1] == DataVector(log([9,10,11,12])) # ex = :(x1 & x2) # r = expand(ex, df) -# @test isa(r, DataFrame) +# @test isa(r, 
DataTable) # @test ncol(r) == 1 # @test r[:,1] == DataArray([45, 60, 77, 96]) ## r = expand(:(x1 + x2), df) -## @test isa(r, DataFrame) +## @test isa(r, DataTable) ## @test ncol(r) == 2 ## @test r[:,1] == DataVector(df["x1"]) ## @test r[:,2] == DataVector(df["x2"]) ## df["x1"] = CategoricalArray(x1) ## r = expand(:x1, df) -## @test isa(r, DataFrame) +## @test isa(r, DataTable) ## @test ncol(r) == 3 -## @test r == expand(CategoricalArray(x1), "x1", DataFrame()) +## @test r == expand(CategoricalArray(x1), "x1", DataTable()) ## r = expand(:(x1 + x2), df) -## @test isa(r, DataFrame) +## @test isa(r, DataTable) ## @test ncol(r) == 4 -## @test r[:,1:3] == expand(CategoricalArray(x1), "x1", DataFrame()) +## @test r[:,1:3] == expand(CategoricalArray(x1), "x1", DataTable()) ## @test r[:,4] == DataVector(df["x2"]) ## df["x2"] = CategoricalArray(x2) ## r = expand(:(x1 + x2), df) -## @test isa(r, DataFrame) +## @test isa(r, DataTable) ## @test ncol(r) == 6 -## @test r[:,1:3] == expand(CategoricalArray(x1), "x1", DataFrame()) -## @test r[:,4:6] == expand(CategoricalArray(x2), "x2", DataFrame()) +## @test r[:,1:3] == expand(CategoricalArray(x1), "x1", DataTable()) +## @test r[:,4:6] == expand(CategoricalArray(x2), "x2", DataTable()) #test_group("Creating a model matrix using full formulas: y => x1 + x2, etc") @@ -236,7 +237,7 @@ mm = ModelMatrix(mf) ## ## FAILS: behavior is wrong when no lower-order terms (1+x1+x2+x1&x2...) 
## -## df = DataFrame(y=1:27, +## df = DataTable(y=1:27, ## x1 = CategoricalArray(vec([x for x in 1:3, y in 4:6, z in 7:9])), ## x2 = CategoricalArray(vec([y for x in 1:3, y in 4:6, z in 7:9])), ## x3 = CategoricalArray(vec([z for x in 1:3, y in 4:6, z in 7:9]))) @@ -286,7 +287,7 @@ mm_sub = ModelMatrix(mf_sub) d[:x1m] = NullableArray(Nullable{Int}[5, 6, Nullable(), 7]) mf = ModelFrame(@formula(y ~ x1m), d) mm = ModelMatrix(mf) -@test isequal(NullableArray(mm.m[:, 2]), d[complete_cases(d), :x1m]) +@test isequal(NullableArray(mm.m[:, 2]), d[completecases(d), :x1m]) @test mm.m == ModelMatrix{sparsetype}(mf).m ## Same variable on left and right side @@ -296,7 +297,7 @@ mm.m == float(model_response(mf)) ## Promote non-redundant categorical terms to full rank -d = DataFrame(x = Compat.repeat([:a, :b], outer = 4), +d = DataTable(x = Compat.repeat([:a, :b], outer = 4), y = Compat.repeat([:c, :d], inner = 2, outer = 2), z = Compat.repeat([:e, :f], inner = 4)) [categorical!(d, name) for name in names(d)] @@ -434,7 +435,7 @@ mm = ModelMatrix(mf) # Ensure that random effects terms are dropped from coefnames -df = DataFrame(x = [1,2,3], y = [4,5,6]) +df = DataTable(x = [1,2,3], y = [4,5,6]) mf = ModelFrame(@formula(y ~ 1 + (1 | x)), df) @test coefnames(mf) == ["(Intercept)"] @@ -444,11 +445,21 @@ mf = ModelFrame(@formula(y ~ 0 + (1 | x)), df) # Ensure X is not a view on df column -df = DataFrame(x = [1.0,2.0,3.0], y = [4.0,5.0,6.0]) +df = DataTable(x = [1.0,2.0,3.0], y = [4.0,5.0,6.0]) mf = ModelFrame(@formula(y ~ 0 + x), df) X = ModelMatrix(mf).m X[1] = 0.0 @test mf.df[1, :x] === Nullable(1.0) +# Ensure string columns are supported +df1 = DataTable(A = 1:4, B = categorical(["M", "F", "F", "M"])) +df2 = DataTable(A = 1:4, B = ["M", "F", "F", "M"]) +df3 = DataTable(Any[1:4, ["M", "F", "F", "M"]], [:A, :B]) + +M1 = ModelMatrix(ModelFrame(@formula(A ~ B), df1)) +M2 = ModelMatrix(ModelFrame(@formula(A ~ B), df2)) +M3 = ModelMatrix(ModelFrame(@formula(A ~ B), df3)) + +@test (M1.m, 
M1.assign) == (M2.m, M2.assign) == (M3.m, M3.assign) end diff --git a/test/statsmodel.jl b/test/statsmodel.jl index f1825104..e645cddc 100644 --- a/test/statsmodel.jl +++ b/test/statsmodel.jl @@ -1,7 +1,7 @@ module TestStatsModels using StatsModels -using DataFrames +using DataTables using Base.Test using Compat @@ -26,7 +26,7 @@ StatsBase.coeftable(mod::DummyMod) = 0) ## Test fitting -d = DataFrame() +d = DataTable() d[:y] = [1:4;] d[:x1] = [5:8;] d[:x2] = [9:12;] @@ -47,7 +47,7 @@ StatsBase.predict(mod::DummyMod, newX::Matrix) = newX * mod.beta mm = ModelMatrix(ModelFrame(f, d)) @test predict(m, mm.m) == mm.m * collect(1:4) -## new data from DataFrame (via ModelMatrix) +## new data from DataTable (via ModelMatrix) @test isequal(predict(m, d), NullableArray(predict(m, mm.m))) d2 = deepcopy(d)