Skip to content

Commit 1c8a187

Browse files
authored
Merge pull request #13 from JuliaStats/nl/modelmat
Get tests passing again, fix handling of string columns
2 parents cd1c9a5 + 739d478 commit 1c8a187

12 files changed

+123
-91
lines changed

.travis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ notifications:
1111
# uncomment the following lines to override the default test script
1212
script:
1313
- if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
14-
- julia -e 'Pkg.clone(pwd()); Pkg.checkout("DataFrames", "dfk/statsmodel-purge"); Pkg.build("StatsModels"); Pkg.test("StatsModels"; coverage=true)'
14+
- julia -e 'Pkg.clone(pwd()); Pkg.clone("https://github.com/JuliaData/DataTables.jl.git"); Pkg.build("StatsModels"); Pkg.test("StatsModels"; coverage=true)'
1515
after_success:
1616
# build and deploy documentation with Documenter.jl
1717
- julia -e 'cd(Pkg.dir("StatsModels")); Pkg.add("Documenter"); include(joinpath("docs", "make.jl"))'

docs/src/formula.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ fields with possibly heterogeneous types. One of the primary goals of
1414
`StatsModels` is to make it simpler to transform tabular data into matrix format
1515
suitable for statistical modeling.
1616

17-
At the moment, "tabular data" means an `AbstractDataFrame`. Ultimately, the
17+
At the moment, "tabular data" means an `AbstractDataTable`. Ultimately, the
1818
goal is to support any tabular data format that adheres to a minimal API,
1919
**regardless of backend**.
2020

@@ -88,7 +88,7 @@ dropterm
8888

8989
The main use of `Formula`s is for fitting statistical models based on tabular
9090
data. From the user's perspective, this is done by `fit` methods that take a
91-
`Formula` and a `DataFrame` instead of numeric matrices.
91+
`Formula` and a `DataTable` instead of numeric matrices.
9292

9393
Internally, this is accomplished in three stages:
9494

docs/src/index.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,5 +21,5 @@ developers when dealing with statistical models and tabular data.
2121
* `RegressionModel`
2222

2323
Much of this package was formerly part
24-
of [`DataFrames`](https://www.github.com/JuliaStats/DataFrames.jl)
24+
of [`DataTables`](https://www.github.com/JuliaData/DataTables.jl)
2525
and [`StatsBase`](https://www.github.com/JuliaStats/StatsBase.jl).

src/StatsModels.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ __precompile__(true)
33
module StatsModels
44

55
using Compat
6-
using DataFrames
6+
using DataTables
77
using StatsBase
88
using NullableArrays
99
using CategoricalArrays

src/contrasts.jl

Lines changed: 9 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -169,28 +169,21 @@ function ContrastsMatrix{C <: AbstractContrasts}(contrasts::C, levels::AbstractV
169169
ContrastsMatrix(mat, tnames, c_levels, contrasts)
170170
end
171171

172-
# Methods for constructing ContrastsMatrix from data. These are called in
173-
# ModelFrame constructor and setcontrasts!.
174-
ContrastsMatrix(C::AbstractContrasts,
175-
v::Union{CategoricalArray, NullableCategoricalArray}) =
176-
ContrastsMatrix(C, levels(v))
177-
ContrastsMatrix{C <: AbstractContrasts}(c::Type{C},
178-
col::Union{CategoricalArray, NullableCategoricalArray}) =
172+
ContrastsMatrix{C <: AbstractContrasts}(c::Type{C}, levels::AbstractVector) =
179173
throw(ArgumentError("contrast types must be instantiated (use $c() instead of $c)"))
180174

181-
# given an existing ContrastsMatrix, check that all of the levels present in the
182-
# data are present in the contrasts. Note that this behavior is different from the
175+
# given an existing ContrastsMatrix, check that all passed levels are present
176+
# in the contrasts. Note that this behavior is different from the
183177
# ContrastsMatrix constructor, which requires that the levels be exactly the same.
184178
# This method exists to support things like `predict` that can operate on new data
185179
# which may contain only a subset of the original data's levels. Checking here
186180
# (instead of in `modelmat_cols`) allows an informative error message.
187-
function ContrastsMatrix(c::ContrastsMatrix,
188-
col::Union{CategoricalArray, NullableCategoricalArray})
189-
if !isempty(setdiff(levels(col), c.levels))
190-
throw(ArgumentError("there are levels in data that are not in ContrastsMatrix: " *
191-
"$(setdiff(levels(col), c.levels))" *
192-
"\n Data levels: $(levels(col))" *
193-
"\n Contrast levels: $(c.levels)"))
181+
function ContrastsMatrix(c::ContrastsMatrix, levels::AbstractVector)
182+
if !isempty(setdiff(levels, c.levels))
183+
throw(ArgumentError("there are levels in data that are not in ContrastsMatrix: " *
184+
"$(setdiff(levels, c.levels))" *
185+
"\n Data levels: $(levels)" *
186+
"\n Contrast levels: $(c.levels)"))
194187
end
195188
return c
196189
end

src/modelframe.jl

Lines changed: 39 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
Wrapper which combines Formula (Terms) and an AbstractDataFrame
2+
Wrapper which combines Formula (Terms) and an AbstractDataTable
33
44
This wrapper encapsulates all the information that's required to transform data
55
of the same structure as the wrapped data frame into a model matrix. This goes
@@ -13,19 +13,19 @@ then creates the necessary contrasts matrices and stores the results.
1313
# Constructors
1414
1515
```julia
16-
ModelFrame(f::Formula, df::AbstractDataFrame; contrasts::Dict = Dict())
17-
ModelFrame(ex::Expr, d::AbstractDataFrame; contrasts::Dict = Dict())
18-
ModelFrame(terms::Terms, df::AbstractDataFrame; contrasts::Dict = Dict())
16+
ModelFrame(f::Formula, df::AbstractDataTable; contrasts::Dict = Dict())
17+
ModelFrame(ex::Expr, d::AbstractDataTable; contrasts::Dict = Dict())
18+
ModelFrame(terms::Terms, df::AbstractDataTable; contrasts::Dict = Dict())
1919
# Inner constructors:
20-
ModelFrame(df::AbstractDataFrame, terms::Terms, missing::BitArray)
21-
ModelFrame(df::AbstractDataFrame, terms::Terms, missing::BitArray, contrasts::Dict{Symbol, ContrastsMatrix})
20+
ModelFrame(df::AbstractDataTable, terms::Terms, missing::BitArray)
21+
ModelFrame(df::AbstractDataTable, terms::Terms, missing::BitArray, contrasts::Dict{Symbol, ContrastsMatrix})
2222
```
2323
2424
# Arguments
2525
2626
* `f::Formula`: Formula whose left hand side is the *response* and right hand
2727
side are the *predictors*.
28-
* `df::AbstractDataFrame`: The data being modeled. This is used at this stage
28+
* `df::AbstractDataTable`: The data being modeled. This is used at this stage
2929
to determine which variables are categorical, and otherwise held for
3030
[`ModelMatrix`](@ref).
3131
* `contrasts::Dict`: An optional Dict of contrast codings for each categorical
@@ -41,21 +41,23 @@ ModelFrame(df::AbstractDataFrame, terms::Terms, missing::BitArray, contrasts::Di
4141
# Examples
4242
4343
```julia
44-
julia> df = DataFrame(x = 1:4, y = 5:9)
44+
julia> df = DataTable(x = 1:4, y = 5:8)
4545
julia> mf = ModelFrame(y ~ 1 + x, df)
4646
```
4747
4848
"""
4949
type ModelFrame
50-
df::AbstractDataFrame
50+
df::AbstractDataTable
5151
terms::Terms
5252
msng::BitArray
5353
## mapping from df keys to contrasts matrices
5454
contrasts::Dict{Symbol, ContrastsMatrix}
5555
end
5656

57-
is_categorical(::Union{CategoricalArray, NullableCategoricalArray}) = true
58-
is_categorical(::Any) = false
57+
is_categorical{T<:Real}(::AbstractArray{T}) = false
58+
typealias NullableReal{T<:Real} Nullable{T}
59+
is_categorical{T<:NullableReal}(::AbstractArray{T}) = false
60+
is_categorical(::AbstractArray) = true
5961

6062
## Check for non-redundancy of columns. For instance, if x is a factor with two
6163
## levels, it should be expanded into two columns in y~0+x but only one column
@@ -67,7 +69,7 @@ is_categorical(::Any) = false
6769
##
6870
## This modifies the Terms, setting `trms.is_non_redundant = true` for all non-
6971
## redundant evaluation terms.
70-
function check_non_redundancy!(trms::Terms, df::AbstractDataFrame)
72+
function check_non_redundancy!(trms::Terms, df::AbstractDataTable)
7173

7274
(n_eterms, n_terms) = size(trms.factors)
7375

@@ -102,34 +104,49 @@ end
102104

103105
const DEFAULT_CONTRASTS = DummyCoding
104106

107+
_unique(x::CategoricalArray) = unique(x)
108+
_unique(x::NullableCategoricalArray) = [get(l) for l in unique(x) if !isnull(l)]
109+
110+
function _unique{T<:Nullable}(x::AbstractArray{T})
111+
levs = [get(l) for l in unique(x) if !isnull(l)]
112+
try; sort!(levs); end
113+
return levs
114+
end
115+
116+
function _unique(x::AbstractArray)
117+
levs = unique(x)
118+
try; sort!(levs); end
119+
return levs
120+
end
121+
105122
## Set up contrasts:
106123
## Combine actual DF columns and contrast types if necessary to compute the
107124
## actual contrasts matrices, levels, and term names (using DummyCoding
108125
## as the default)
109-
function evalcontrasts(df::AbstractDataFrame, contrasts::Dict = Dict())
126+
function evalcontrasts(df::AbstractDataTable, contrasts::Dict = Dict())
110127
evaledContrasts = Dict()
111128
for (term, col) in eachcol(df)
112129
is_categorical(col) || continue
113130
evaledContrasts[term] = ContrastsMatrix(haskey(contrasts, term) ?
114131
contrasts[term] :
115132
DEFAULT_CONTRASTS(),
116-
col)
133+
_unique(col))
117134
end
118135
return evaledContrasts
119136
end
120137

121138
## Default NULL handler. Others can be added as keyword arguments
122-
function null_omit(df::DataFrame)
123-
cc = complete_cases(df)
139+
function null_omit(df::DataTable)
140+
cc = completecases(df)
124141
df[cc,:], cc
125142
end
126143

127144
_droplevels!(x::Any) = x
128145
_droplevels!(x::Union{CategoricalArray, NullableCategoricalArray}) = droplevels!(x)
129146

130-
function ModelFrame(trms::Terms, d::AbstractDataFrame;
147+
function ModelFrame(trms::Terms, d::AbstractDataTable;
131148
contrasts::Dict = Dict())
132-
df, msng = null_omit(DataFrame(map(x -> d[x], trms.eterms)))
149+
df, msng = null_omit(DataTable(map(x -> d[x], trms.eterms)))
133150
names!(df, convert(Vector{Symbol}, map(string, trms.eterms)))
134151
for c in eachcol(df) _droplevels!(c[2]) end
135152

@@ -141,17 +158,17 @@ function ModelFrame(trms::Terms, d::AbstractDataFrame;
141158
ModelFrame(df, trms, msng, evaledContrasts)
142159
end
143160

144-
ModelFrame(df::AbstractDataFrame, term::Terms, msng::BitArray) = ModelFrame(df, term, msng, evalcontrasts(df))
145-
ModelFrame(f::Formula, d::AbstractDataFrame; kwargs...) = ModelFrame(Terms(f), d; kwargs...)
146-
ModelFrame(ex::Expr, d::AbstractDataFrame; kwargs...) = ModelFrame(Formula(ex), d; kwargs...)
161+
ModelFrame(df::AbstractDataTable, term::Terms, msng::BitArray) = ModelFrame(df, term, msng, evalcontrasts(df))
162+
ModelFrame(f::Formula, d::AbstractDataTable; kwargs...) = ModelFrame(Terms(f), d; kwargs...)
163+
ModelFrame(ex::Expr, d::AbstractDataTable; kwargs...) = ModelFrame(Formula(ex), d; kwargs...)
147164

148165
"""
149166
setcontrasts!(mf::ModelFrame, new_contrasts::Dict)
150167
151168
Modify the contrast coding system of a ModelFrame in place.
152169
"""
153170
function setcontrasts!(mf::ModelFrame, new_contrasts::Dict)
154-
new_contrasts = Dict([ Pair(col, ContrastsMatrix(contr, mf.df[col]))
171+
new_contrasts = Dict([ Pair(col, ContrastsMatrix(contr, _unique(mf.df[col])))
155172
for (col, contr) in filter((k,v)->haskey(mf.df, k), new_contrasts) ])
156173

157174
mf.contrasts = merge(mf.contrasts, new_contrasts)

src/modelmatrix.jl

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11

22
typealias AbstractFloatMatrix{T<:AbstractFloat} AbstractMatrix{T}
3+
typealias AbstractRealVector{T<:Real} AbstractVector{T}
4+
typealias NullableRealVector{T<:NullableReal} AbstractVector{T}
35

46
"""
57
Convert a `ModelFrame` into a numeric matrix suitable for modeling
@@ -35,18 +37,26 @@ function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, name::Symbol, mf::Mode
3537
end
3638
end
3739

38-
modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::AbstractVector) =
40+
modelmat_cols{T<:AbstractFloatMatrix, V<:AbstractRealVector}(::Type{T}, v::V) =
3941
convert(T, reshape(v, length(v), 1))
4042
# FIXME: this inefficient method should not be needed, cf. JuliaLang/julia#18264
41-
modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::NullableVector) =
43+
modelmat_cols{T<:AbstractFloatMatrix, V<:NullableRealVector}(::Type{T}, v::V) =
4244
convert(T, Matrix(reshape(v, length(v), 1)))
45+
modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::Union{CategoricalVector, NullableCategoricalVector}) =
46+
modelmat_cols(T, reshape(v, length(v), 1))
4347

48+
# All non-real columns are considered as categorical
49+
# Could be made more efficient by directly storing the result into the model matrix
4450
"""
45-
modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::PooledDataVector, contrast::ContrastsMatrix)
51+
modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::AbstractVector, contrast::ContrastsMatrix)
4652
4753
Construct `ModelMatrix` columns of type `T` based on specified contrasts, ensuring that
4854
levels align properly.
4955
"""
56+
modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::AbstractVector, contrast::ContrastsMatrix) =
57+
modelmat_cols(T, categorical(v), contrast)
58+
59+
5060
function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T},
5161
v::Union{CategoricalVector, NullableCategoricalVector},
5262
contrast::ContrastsMatrix)

src/statsmodel.jl

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -31,23 +31,23 @@ macro delegate(source, targets)
3131
return result
3232
end
3333

34-
# Wrappers for DataFrameStatisticalModel and DataFrameRegressionModel
35-
immutable DataFrameStatisticalModel{M,T} <: StatisticalModel
34+
# Wrappers for DataTableStatisticalModel and DataTableRegressionModel
35+
immutable DataTableStatisticalModel{M,T} <: StatisticalModel
3636
model::M
3737
mf::ModelFrame
3838
mm::ModelMatrix{T}
3939
end
4040

41-
immutable DataFrameRegressionModel{M,T} <: RegressionModel
41+
immutable DataTableRegressionModel{M,T} <: RegressionModel
4242
model::M
4343
mf::ModelFrame
4444
mm::ModelMatrix{T}
4545
end
4646

47-
for (modeltype, dfmodeltype) in ((:StatisticalModel, DataFrameStatisticalModel),
48-
(:RegressionModel, DataFrameRegressionModel))
47+
for (modeltype, dfmodeltype) in ((:StatisticalModel, DataTableStatisticalModel),
48+
(:RegressionModel, DataTableRegressionModel))
4949
@eval begin
50-
function StatsBase.fit{T<:$modeltype}(::Type{T}, f::Formula, df::AbstractDataFrame,
50+
function StatsBase.fit{T<:$modeltype}(::Type{T}, f::Formula, df::AbstractDataTable,
5151
args...; contrasts::Dict = Dict(), kwargs...)
5252
mf = ModelFrame(f, df, contrasts=contrasts)
5353
mm = ModelMatrix(mf)
@@ -58,24 +58,24 @@ for (modeltype, dfmodeltype) in ((:StatisticalModel, DataFrameStatisticalModel),
5858
end
5959

6060
# Delegate functions from StatsBase that use our new types
61-
typealias DataFrameModels @compat(Union{DataFrameStatisticalModel, DataFrameRegressionModel})
62-
@delegate DataFrameModels.model [StatsBase.coef, StatsBase.confint,
61+
typealias DataTableModels @compat(Union{DataTableStatisticalModel, DataTableRegressionModel})
62+
@delegate DataTableModels.model [StatsBase.coef, StatsBase.confint,
6363
StatsBase.deviance, StatsBase.nulldeviance,
6464
StatsBase.loglikelihood, StatsBase.nullloglikelihood,
6565
StatsBase.dof, StatsBase.dof_residual, StatsBase.nobs,
6666
StatsBase.stderr, StatsBase.vcov]
67-
@delegate DataFrameRegressionModel.model [StatsBase.residuals, StatsBase.model_response,
67+
@delegate DataTableRegressionModel.model [StatsBase.residuals, StatsBase.model_response,
6868
StatsBase.predict, StatsBase.predict!]
6969
# Need to define these manually because of ambiguity using @delegate
70-
StatsBase.r2(mm::DataFrameRegressionModel) = r2(mm.model)
71-
StatsBase.adjr2(mm::DataFrameRegressionModel) = adjr2(mm.model)
72-
StatsBase.r2(mm::DataFrameRegressionModel, variant::Symbol) = r2(mm.model, variant)
73-
StatsBase.adjr2(mm::DataFrameRegressionModel, variant::Symbol) = adjr2(mm.model, variant)
70+
StatsBase.r2(mm::DataTableRegressionModel) = r2(mm.model)
71+
StatsBase.adjr2(mm::DataTableRegressionModel) = adjr2(mm.model)
72+
StatsBase.r2(mm::DataTableRegressionModel, variant::Symbol) = r2(mm.model, variant)
73+
StatsBase.adjr2(mm::DataTableRegressionModel, variant::Symbol) = adjr2(mm.model, variant)
7474

7575
# Predict function that takes data frame as predictor instead of matrix
76-
function StatsBase.predict(mm::DataFrameRegressionModel, df::AbstractDataFrame; kwargs...)
76+
function StatsBase.predict(mm::DataTableRegressionModel, df::AbstractDataTable; kwargs...)
7777
# copy terms, removing outcome if present (ModelFrame will complain if a
78-
# term is not found in the DataFrame and we don't want to remove elements with missing y)
78+
# term is not found in the DataTable and we don't want to remove elements with missing y)
7979
newTerms = dropresponse!(mm.mf.terms)
8080
# create new model frame/matrix
8181
mf = ModelFrame(newTerms, df; contrasts = mm.mf.contrasts)
@@ -89,7 +89,7 @@ end
8989

9090

9191
# coeftable implementation
92-
function StatsBase.coeftable(model::DataFrameModels)
92+
function StatsBase.coeftable(model::DataTableModels)
9393
ct = coeftable(model.model)
9494
cfnames = coefnames(model.mf)
9595
if length(ct.rownms) == length(cfnames)
@@ -99,7 +99,7 @@ function StatsBase.coeftable(model::DataFrameModels)
9999
end
100100

101101
# show function that delegates to coeftable
102-
function Base.show(io::IO, model::DataFrameModels)
102+
function Base.show(io::IO, model::DataTableModels)
103103
try
104104
ct = coeftable(model)
105105
println(io, "$(typeof(model))")

test/contrasts.jl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
module TestContrasts
22

33
using Base.Test
4-
using DataFrames
4+
using DataTables
5+
using CategoricalArrays
56
using StatsModels
67

78

8-
d = DataFrame(x = CategoricalVector([:a, :b, :c, :a, :a, :b]))
9+
d = DataTable(x = CategoricalVector([:a, :b, :c, :a, :a, :b]))
910

1011
mf = ModelFrame(Formula(nothing, :x), d)
1112

test/formula.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ using Compat
66

77
# TODO:
88
# - grouped variables in formulas with interactions
9-
# - is it fast? Can expand() handle DataFrames?
9+
# - is it fast? Can expand() handle DataTables?
1010
# - deal with intercepts
1111
# - implement ^2 for datavector's
1212
# - support more transformations with I()?

0 commit comments

Comments
 (0)