JuliaStats · kleinschmidt · Feb 20, 2017 · Feb 18, 2017 · Feb 18, 2017 · Feb 18, 2017
diff --git a/.travis.yml b/.travis.yml
@@ -11,7 +11,7 @@ notifications:
 # uncomment the following lines to override the default test script
 script:
   - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
-  - julia -e 'Pkg.clone(pwd()); Pkg.checkout("DataFrames", "dfk/statsmodel-purge"); Pkg.build("StatsModels"); Pkg.test("StatsModels"; coverage=true)'
+  - julia -e 'Pkg.clone(pwd()); Pkg.clone("https://github.com/JuliaData/DataTables.jl.git"); Pkg.build("StatsModels"); Pkg.test("StatsModels"; coverage=true)'
 after_success:
   # build and deploy documentation with Documenter.jl
   - julia -e 'cd(Pkg.dir("StatsModels")); Pkg.add("Documenter"); include(joinpath("docs", "make.jl"))'

diff --git a/docs/src/formula.md b/docs/src/formula.md
@@ -14,7 +14,7 @@ fields with possibly heterogeneous types.  One of the primary goals of
 `StatsModels` is to make it simpler to transform tabular data into matrix format
 suitable for statistical modeling.
 
-At the moment, "tabular data" means an `AbstractDataFrame`.  Ultimately, the
+At the moment, "tabular data" means an `AbstractDataTable`.  Ultimately, the
 goal is to support any tabular data format that adheres to a minimal API,
 **regardless of backend**.
 
@@ -88,7 +88,7 @@ dropterm
 
 The main use of `Formula`s is for fitting statistical models based on tabular
 data.  From the user's perspective, this is done by `fit` methods that take a
-`Formula` and a `DataFrame` instead of numeric matrices.
+`Formula` and a `DataTable` instead of numeric matrices.
 
 Internally, this is accomplished in three stages:
 

diff --git a/docs/src/index.md b/docs/src/index.md
@@ -21,5 +21,5 @@ developers when dealing with statistical models and tabular data.
     * `RegressionModel`
 
 Much of this package was formerly part
-of [`DataFrames`](https://www.github.com/JuliaStats/DataFrames.jl)
+of [`DataTables`](https://www.github.com/JuliaData/DataTables.jl)
 and [`StatsBase`](https://www.github.com/JuliaStats/StatsBase.jl).
diff --git a/src/StatsModels.jl b/src/StatsModels.jl
@@ -3,7 +3,7 @@ __precompile__(true)
 module StatsModels
 
 using Compat
-using DataFrames
+using DataTables
 using StatsBase
 using NullableArrays
 using CategoricalArrays

diff --git a/src/contrasts.jl b/src/contrasts.jl
@@ -169,28 +169,21 @@ function ContrastsMatrix{C <: AbstractContrasts}(contrasts::C, levels::AbstractV
     ContrastsMatrix(mat, tnames, c_levels, contrasts)
 end
 
-# Methods for constructing ContrastsMatrix from data. These are called in
-# ModelFrame constructor and setcontrasts!.
-ContrastsMatrix(C::AbstractContrasts,
-                v::Union{CategoricalArray, NullableCategoricalArray}) =
-    ContrastsMatrix(C, levels(v))
-ContrastsMatrix{C <: AbstractContrasts}(c::Type{C},
-                                        col::Union{CategoricalArray, NullableCategoricalArray}) =
+ContrastsMatrix{C <: AbstractContrasts}(c::Type{C}, levels::AbstractVector) =
     throw(ArgumentError("contrast types must be instantiated (use $c() instead of $c)"))
 
-# given an existing ContrastsMatrix, check that all of the levels present in the
-# data are present in the contrasts. Note that this behavior is different from the
+# given an existing ContrastsMatrix, check that all passed levels are present
+# in the contrasts. Note that this behavior is different from the
 # ContrastsMatrix constructor, which requires that the levels be exactly the same.
 # This method exists to support things like `predict` that can operate on new data
 # which may contain only a subset of the original data's levels. Checking here
 # (instead of in `modelmat_cols`) allows an informative error message.
-function ContrastsMatrix(c::ContrastsMatrix,
-                         col::Union{CategoricalArray, NullableCategoricalArray})
-    if !isempty(setdiff(levels(col), c.levels))
-        throw(ArgumentError("there are levels in data that are not in ContrastsMatrix: " *
-                            "$(setdiff(levels(col), c.levels))" *
-                            "\n  Data levels: $(levels(col))" *
-                            "\n  Contrast levels: $(c.levels)"))
+function ContrastsMatrix(c::ContrastsMatrix, levels::AbstractVector)  # validates `levels` against `c`; returns `c` itself
+    if !isempty(setdiff(levels, c.levels))  # any passed level that `c` does not know about?
+         throw(ArgumentError("there are levels in data that are not in ContrastsMatrix: " *
+                             "$(setdiff(levels, c.levels))" *
+                             "\n  Data levels: $(levels)" *
+                             "\n  Contrast levels: $(c.levels)"))
     end
     return c
 end

diff --git a/src/modelframe.jl b/src/modelframe.jl
@@ -1,5 +1,5 @@
 """
-Wrapper which combines Formula (Terms) and an AbstractDataFrame
+Wrapper which combines Formula (Terms) and an AbstractDataTable
 
 This wrapper encapsulates all the information that's required to transform data
 of the same structure as the wrapped data frame into a model matrix.  This goes
@@ -13,19 +13,19 @@ then creates the necessary contrasts matrices and stores the results.
 # Constructors
 
 ```julia
-ModelFrame(f::Formula, df::AbstractDataFrame; contrasts::Dict = Dict())
-ModelFrame(ex::Expr, d::AbstractDataFrame; contrasts::Dict = Dict())
-ModelFrame(terms::Terms, df::AbstractDataFrame; contrasts::Dict = Dict())
+ModelFrame(f::Formula, df::AbstractDataTable; contrasts::Dict = Dict())
+ModelFrame(ex::Expr, d::AbstractDataTable; contrasts::Dict = Dict())
+ModelFrame(terms::Terms, df::AbstractDataTable; contrasts::Dict = Dict())
 # Inner constructors:
-ModelFrame(df::AbstractDataFrame, terms::Terms, missing::BitArray)
-ModelFrame(df::AbstractDataFrame, terms::Terms, missing::BitArray, contrasts::Dict{Symbol, ContrastsMatrix})
+ModelFrame(df::AbstractDataTable, terms::Terms, missing::BitArray)
+ModelFrame(df::AbstractDataTable, terms::Terms, missing::BitArray, contrasts::Dict{Symbol, ContrastsMatrix})
 ```
 
 # Arguments
 
 * `f::Formula`: Formula whose left hand side is the *response* and right hand
   side are the *predictors*.
-* `df::AbstractDataFrame`: The data being modeled.  This is used at this stage
+* `df::AbstractDataTable`: The data being modeled.  This is used at this stage
   to determine which variables are categorical, and otherwise held for
   [`ModelMatrix`](@ref).
 * `contrasts::Dict`: An optional Dict of contrast codings for each categorical
@@ -41,21 +41,23 @@ ModelFrame(df::AbstractDataFrame, terms::Terms, missing::BitArray, contrasts::Di
 # Examples
 
 ```julia
-julia> df = DataFrame(x = 1:4, y = 5:9)
+julia> df = DataTable(x = 1:4, y = 5:8)
 julia> mf = ModelFrame(y ~ 1 + x, df)
 ```
 
 """
 type ModelFrame
-    df::AbstractDataFrame
+    df::AbstractDataTable
     terms::Terms
     msng::BitArray
     ## mapping from df keys to contrasts matrices
     contrasts::Dict{Symbol, ContrastsMatrix}
 end
 
-is_categorical(::Union{CategoricalArray, NullableCategoricalArray}) = true
-is_categorical(::Any) = false
+is_categorical{T<:Real}(::AbstractArray{T}) = false  # arrays of reals are not categorical
+typealias NullableReal{T<:Real} Nullable{T}  # a Nullable whose payload type is a Real
+is_categorical{T<:NullableReal}(::AbstractArray{T}) = false  # arrays of nullable reals are not categorical either
+is_categorical(::AbstractArray) = true  # fallback: every other array is treated as categorical
 
 ## Check for non-redundancy of columns.  For instance, if x is a factor with two
 ## levels, it should be expanded into two columns in y~0+x but only one column
@@ -67,7 +69,7 @@ is_categorical(::Any) = false
 ##
 ## This modifies the Terms, setting `trms.is_non_redundant = true` for all non-
 ## redundant evaluation terms.
-function check_non_redundancy!(trms::Terms, df::AbstractDataFrame)
+function check_non_redundancy!(trms::Terms, df::AbstractDataTable)
 
     (n_eterms, n_terms) = size(trms.factors)
 
@@ -102,34 +104,49 @@ end
 
 const DEFAULT_CONTRASTS = DummyCoding
 
+_unique(x::CategoricalArray) = unique(x)  # distinct values as returned by `unique`; not re-sorted here
+_unique(x::NullableCategoricalArray) = [get(l) for l in unique(x) if !isnull(l)]  # drop nulls, unwrap the rest; not re-sorted
+
+function _unique{T<:Nullable}(x::AbstractArray{T})  # distinct non-null values of `x`, unwrapped
+    levs = [get(l) for l in unique(x) if !isnull(l)]  # sorted below when the levels can be ordered
+    try; sort!(levs); catch err; isa(err, MethodError) || rethrow(); end  # only a missing `isless` is tolerated
+    return levs
+end
+
+function _unique(x::AbstractArray)  # distinct values of `x`, used as the levels of a non-real column
+    levs = unique(x)  # sorted below when the levels can be ordered
+    try; sort!(levs); catch err; isa(err, MethodError) || rethrow(); end  # only a missing `isless` is tolerated
+    return levs
+end
+
 ## Set up contrasts:
 ## Combine actual DF columns and contrast types if necessary to compute the
 ## actual contrasts matrices, levels, and term names (using DummyCoding
 ## as the default)
-function evalcontrasts(df::AbstractDataFrame, contrasts::Dict = Dict())
+function evalcontrasts(df::AbstractDataTable, contrasts::Dict = Dict())  # Dict: column name => ContrastsMatrix
     evaledContrasts = Dict()
     for (term, col) in eachcol(df)
         is_categorical(col) || continue
         evaledContrasts[term] = ContrastsMatrix(haskey(contrasts, term) ?
                                                 contrasts[term] :
                                                 DEFAULT_CONTRASTS(),
-                                                col)
+                                                _unique(col))  # hand over the column's distinct levels, not the column
     end
     return evaledContrasts
 end
 
 ## Default NULL handler.  Others can be added as keyword arguments
-function null_omit(df::DataFrame)
-    cc = complete_cases(df)
+function null_omit(df::DataTable)  # returns (rows of `df` selected by `cc`, `cc`)
+    cc = completecases(df)  # NOTE(review): presumably a per-row mask of rows without nulls — confirm against DataTables
     df[cc,:], cc
 end
 
 _droplevels!(x::Any) = x
 _droplevels!(x::Union{CategoricalArray, NullableCategoricalArray}) = droplevels!(x)
 
-function ModelFrame(trms::Terms, d::AbstractDataFrame;
+function ModelFrame(trms::Terms, d::AbstractDataTable;
                     contrasts::Dict = Dict())
-    df, msng = null_omit(DataFrame(map(x -> d[x], trms.eterms)))
+    df, msng = null_omit(DataTable(map(x -> d[x], trms.eterms)))
     names!(df, convert(Vector{Symbol}, map(string, trms.eterms)))
     for c in eachcol(df) _droplevels!(c[2]) end
 
@@ -141,17 +158,17 @@ function ModelFrame(trms::Terms, d::AbstractDataFrame;
     ModelFrame(df, trms, msng, evaledContrasts)
 end
 
-ModelFrame(df::AbstractDataFrame, term::Terms, msng::BitArray) = ModelFrame(df, term, msng, evalcontrasts(df))
-ModelFrame(f::Formula, d::AbstractDataFrame; kwargs...) = ModelFrame(Terms(f), d; kwargs...)
-ModelFrame(ex::Expr, d::AbstractDataFrame; kwargs...) = ModelFrame(Formula(ex), d; kwargs...)
+ModelFrame(df::AbstractDataTable, term::Terms, msng::BitArray) = ModelFrame(df, term, msng, evalcontrasts(df))
+ModelFrame(f::Formula, d::AbstractDataTable; kwargs...) = ModelFrame(Terms(f), d; kwargs...)
+ModelFrame(ex::Expr, d::AbstractDataTable; kwargs...) = ModelFrame(Formula(ex), d; kwargs...)
 
 """
     setcontrasts!(mf::ModelFrame, new_contrasts::Dict)
 
 Modify the contrast coding system of a ModelFrame in place.
 """
 function setcontrasts!(mf::ModelFrame, new_contrasts::Dict)
-    new_contrasts = Dict([ Pair(col, ContrastsMatrix(contr, mf.df[col]))
+    new_contrasts = Dict([ Pair(col, ContrastsMatrix(contr, _unique(mf.df[col])))
                       for (col, contr) in filter((k,v)->haskey(mf.df, k), new_contrasts) ])
 
     mf.contrasts = merge(mf.contrasts, new_contrasts)

diff --git a/src/modelmatrix.jl b/src/modelmatrix.jl
@@ -1,5 +1,7 @@
 
 typealias AbstractFloatMatrix{T<:AbstractFloat} AbstractMatrix{T}
+typealias AbstractRealVector{T<:Real} AbstractVector{T}
+typealias NullableRealVector{T<:NullableReal} AbstractVector{T}
 
 """
 Convert a `ModelFrame` into a numeric matrix suitable for modeling
@@ -35,18 +37,26 @@ function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, name::Symbol, mf::Mode
     end
 end
 
-modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::AbstractVector) =
+modelmat_cols{T<:AbstractFloatMatrix, V<:AbstractRealVector}(::Type{T}, v::V) =
     convert(T, reshape(v, length(v), 1))
 # FIXME: this inefficient method should not be needed, cf. JuliaLang/julia#18264
-modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::NullableVector) =
+modelmat_cols{T<:AbstractFloatMatrix, V<:NullableRealVector}(::Type{T}, v::V) =
     convert(T, Matrix(reshape(v, length(v), 1)))
+modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::Union{CategoricalVector, NullableCategoricalVector}) =
+    modelmat_cols(T, reshape(v, length(v), 1))
 
+# All non-real columns are considered as categorical
+# Could be made more efficient by directly storing the result into the model matrix
 """
-    modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::PooledDataVector, contrast::ContrastsMatrix)
+    modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::AbstractVector, contrast::ContrastsMatrix)
 
 Construct `ModelMatrix` columns of type `T` based on specified contrasts, ensuring that
 levels align properly.
 """
+modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::AbstractVector, contrast::ContrastsMatrix) =
+    modelmat_cols(T, categorical(v), contrast)
+
+
 function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T},
                                                v::Union{CategoricalVector, NullableCategoricalVector},
                                                contrast::ContrastsMatrix)

diff --git a/src/statsmodel.jl b/src/statsmodel.jl
@@ -31,23 +31,23 @@ macro delegate(source, targets)
     return result
 end
 
-# Wrappers for DataFrameStatisticalModel and DataFrameRegressionModel
-immutable DataFrameStatisticalModel{M,T} <: StatisticalModel
+# Wrapper types pairing a fitted model with its ModelFrame and ModelMatrix
+immutable DataTableStatisticalModel{M,T} <: StatisticalModel
     model::M
     mf::ModelFrame
     mm::ModelMatrix{T}
 end
 
-immutable DataFrameRegressionModel{M,T} <: RegressionModel
+immutable DataTableRegressionModel{M,T} <: RegressionModel
     model::M
     mf::ModelFrame
     mm::ModelMatrix{T}
 end
 
-for (modeltype, dfmodeltype) in ((:StatisticalModel, DataFrameStatisticalModel),
-                                 (:RegressionModel, DataFrameRegressionModel))
+for (modeltype, dfmodeltype) in ((:StatisticalModel, DataTableStatisticalModel),
+                                 (:RegressionModel, DataTableRegressionModel))
     @eval begin
-        function StatsBase.fit{T<:$modeltype}(::Type{T}, f::Formula, df::AbstractDataFrame,
+        function StatsBase.fit{T<:$modeltype}(::Type{T}, f::Formula, df::AbstractDataTable,
                                               args...; contrasts::Dict = Dict(), kwargs...)
             mf = ModelFrame(f, df, contrasts=contrasts)
             mm = ModelMatrix(mf)
@@ -58,24 +58,24 @@ for (modeltype, dfmodeltype) in ((:StatisticalModel, DataFrameStatisticalModel),
 end
 
 # Delegate functions from StatsBase that use our new types
-typealias DataFrameModels @compat(Union{DataFrameStatisticalModel, DataFrameRegressionModel})
-@delegate DataFrameModels.model [StatsBase.coef, StatsBase.confint,
+typealias DataTableModels @compat(Union{DataTableStatisticalModel, DataTableRegressionModel})
+@delegate DataTableModels.model [StatsBase.coef, StatsBase.confint,
                                  StatsBase.deviance, StatsBase.nulldeviance,
                                  StatsBase.loglikelihood, StatsBase.nullloglikelihood,
                                  StatsBase.dof, StatsBase.dof_residual, StatsBase.nobs,
                                  StatsBase.stderr, StatsBase.vcov]
-@delegate DataFrameRegressionModel.model [StatsBase.residuals, StatsBase.model_response,
+@delegate DataTableRegressionModel.model [StatsBase.residuals, StatsBase.model_response,
                                           StatsBase.predict, StatsBase.predict!]
 # Need to define these manually because of ambiguity using @delegate
-StatsBase.r2(mm::DataFrameRegressionModel) = r2(mm.model)
-StatsBase.adjr2(mm::DataFrameRegressionModel) = adjr2(mm.model)
-StatsBase.r2(mm::DataFrameRegressionModel, variant::Symbol) = r2(mm.model, variant)
-StatsBase.adjr2(mm::DataFrameRegressionModel, variant::Symbol) = adjr2(mm.model, variant)
+StatsBase.r2(mm::DataTableRegressionModel) = r2(mm.model)
+StatsBase.adjr2(mm::DataTableRegressionModel) = adjr2(mm.model)
+StatsBase.r2(mm::DataTableRegressionModel, variant::Symbol) = r2(mm.model, variant)
+StatsBase.adjr2(mm::DataTableRegressionModel, variant::Symbol) = adjr2(mm.model, variant)
 
 # Predict function that takes data frame as predictor instead of matrix
-function StatsBase.predict(mm::DataFrameRegressionModel, df::AbstractDataFrame; kwargs...)
+function StatsBase.predict(mm::DataTableRegressionModel, df::AbstractDataTable; kwargs...)
     # copy terms, removing outcome if present (ModelFrame will complain if a
-    # term is not found in the DataFrame and we don't want to remove elements with missing y)
+    # term is not found in the DataTable and we don't want to remove elements with missing y)
     newTerms = dropresponse!(mm.mf.terms)
     # create new model frame/matrix
     mf = ModelFrame(newTerms, df; contrasts = mm.mf.contrasts)
@@ -89,7 +89,7 @@ end
 
 
 # coeftable implementation
-function StatsBase.coeftable(model::DataFrameModels)
+function StatsBase.coeftable(model::DataTableModels)
     ct = coeftable(model.model)
     cfnames = coefnames(model.mf)
     if length(ct.rownms) == length(cfnames)
@@ -99,7 +99,7 @@ function StatsBase.coeftable(model::DataFrameModels)
 end
 
 # show function that delegates to coeftable
-function Base.show(io::IO, model::DataFrameModels)
+function Base.show(io::IO, model::DataTableModels)
     try
         ct = coeftable(model)
         println(io, "$(typeof(model))")

diff --git a/test/contrasts.jl b/test/contrasts.jl
@@ -1,11 +1,12 @@
 module TestContrasts
 
 using Base.Test
-using DataFrames
+using DataTables
+using CategoricalArrays
 using StatsModels
 
 
-d = DataFrame(x = CategoricalVector([:a, :b, :c, :a, :a, :b]))
+d = DataTable(x = CategoricalVector([:a, :b, :c, :a, :a, :b]))
 
 mf = ModelFrame(Formula(nothing, :x), d)
 

diff --git a/test/formula.jl b/test/formula.jl
@@ -6,7 +6,7 @@ using Compat
 
 # TODO:
 # - grouped variables in formulas with interactions
-# - is it fast?  Can expand() handle DataFrames?
+# - is it fast?  Can expand() handle DataTables?
 # - deal with intercepts
 # - implement ^2 for datavector's
 # - support more transformations with I()?