Skip to content

Replace src/scitypes.jl with ScientificTypes.jl package #28

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 41 commits into from
Aug 19, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
1f3e6f1
Revamp scitypes.
ablaom Jun 30, 2019
62a0a09
get tests passing; more tests neeeded
ablaom Jun 30, 2019
0b56bf4
Define ArrayScitype; make VectorScitype, MatrixScitype "aliases"
ablaom Jul 1, 2019
2a971fd
minor
ablaom Jul 1, 2019
29adc74
Define ArrayScitype; make VectorScitype, MatrixScitype "aliases"
ablaom Jul 1, 2019
53987b9
Merge branch 'master' into tablescitype
ablaom Jul 16, 2019
76dff2b
simplify scitypes again; traits need refactor
ablaom Jul 26, 2019
c405b40
Fix traits and tasks and tests
ablaom Jul 27, 2019
93e1353
Fix bug with scitype of arrays
ablaom Jul 27, 2019
d375b0b
update docs
ablaom Jul 27, 2019
737b574
Merge branch 'master' into tablescitype
ablaom Jul 27, 2019
777c004
update docs
ablaom Jul 27, 2019
62ec6a8
remove redundant code
ablaom Jul 27, 2019
eb920c1
remove redundant code
ablaom Jul 29, 2019
c6f0f24
replace scitypes.jl with ScientificTypes pkg; scitypes->schema
ablaom Aug 1, 2019
eb0e312
fix bug with info on Unsupervised models
ablaom Aug 2, 2019
9c19205
fix stray characters
ablaom Aug 2, 2019
d5e480d
Merge branch 'master' into tablescitype
ablaom Aug 4, 2019
ef0d64c
add support for sample weights
ablaom Aug 5, 2019
7f0acee
use LittleDict for info, adding OrderedCollections as dependency
ablaom Aug 5, 2019
1f5a716
use LittleDict for UnivariateFinite
ablaom Aug 5, 2019
489ed52
efficiency and other improvements to UnivariateDistribution
ablaom Aug 12, 2019
9ef0b08
update docs
ablaom Aug 12, 2019
9f17d21
Merge branch 'removepadding' into tablescitype
ablaom Aug 14, 2019
d14a351
remove forgotten merge marking
ablaom Aug 14, 2019
12a9acf
fix flawed merge
ablaom Aug 14, 2019
7eb2f1f
add [compat] ScientficTypes = "0.1.2"
ablaom Aug 14, 2019
08a1b46
remove restriction on CategoricalArrays
ablaom Aug 14, 2019
2eea724
some fixes to support(::UnivariateFinite)
ablaom Aug 14, 2019
36731e3
minor
ablaom Aug 14, 2019
e555863
revert [compat] CategoricalArrays = "<0.5.3"
ablaom Aug 14, 2019
7d47d6c
add extra distribution tests
ablaom Aug 15, 2019
afe775b
add package_license trait
ablaom Aug 15, 2019
53ec02f
add test that int method respects order
ablaom Aug 16, 2019
4484dea
correct typo and supports_sample_weights -> supports_weights
ablaom Aug 16, 2019
7479230
Modify trait defaults
ablaom Aug 16, 2019
dc8c34a
export supports_weights
ablaom Aug 16, 2019
01f2fe0
fix DataFrames column access depreciation message
ablaom Aug 19, 2019
71588d3
remove redundant scitypes from export
ablaom Aug 19, 2019
f0fbcfc
bump to v0.4.0
ablaom Aug 19, 2019
aea2fd7
Merge branch 'tablescitype' of github.com:alan-turing-institute/MLJBa…
ablaom Aug 19, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
name = "MLJBase"
uuid = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
authors = ["Anthony D. Blaom <[email protected]>"]
version = "0.3.0"
version = "0.4.0"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
ColorTypes = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Expand All @@ -19,6 +20,7 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
CSV = "0.5"
CategoricalArrays = "<0.5.3"
Requires = "^0.5.2"
ScientificTypes = "0.1.2"
Tables = "<0.1.19, >= 0.2"
julia = "1"

Expand Down
2,914 changes: 1,457 additions & 1,457 deletions data/ames.csv

Large diffs are not rendered by default.

2,912 changes: 1,456 additions & 1,456 deletions data/reduced_ames.csv

Large diffs are not rendered by default.

75 changes: 39 additions & 36 deletions src/MLJBase.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,37 +9,43 @@ export fit, update, clean!
export predict, predict_mean, predict_mode, fitted_params
export transform, inverse_transform, se, evaluate, best
export load_path, package_url, package_name, package_uuid
export input_scitype_union, input_is_multivariate
export target_scitype_union, target_quantity
export is_pure_julia, is_wrapper
export input_scitype, supports_weights
export target_scitype, target_quantity
export is_pure_julia, is_wrapper

export params # parameters.jl
export reconstruct, int, decoder, classes # data.jl
export selectrows, selectcols, select, nrows, schema # data.jl
export selectrows, selectcols, select, nrows # data.jl
export table, levels_seen, matrix, container_type # data.jl
export partition, @set_defaults # utilities.jl
export Found, Continuous, Finite, Infinite # sgcitypes.jl
export OrderedFactor, Unknown # scitypes.jl
export Count, Multiclass, Binary # scitypes.jl
export scitype, scitype_union, scitypes # scitypes.jl
export HANDLE_GIVEN_ID, @more, @constant # show.jl
export color_on, color_off # show.jl
export UnivariateFinite, average # distributions.jl
export UnivariateFinite, average # distributions.jl
export SupervisedTask, UnsupervisedTask, MLJTask # tasks.jl
export X_and_y, X_, y_, nrows, nfeatures # tasks.jl
export info # info.jl

# methods from other packages to be re-exported:
export pdf, mean, mode

# re-export of ScientificTypes (`Table` not exported):
export trait
export Scientific, Found, Unknown, Finite, Infinite
export OrderedFactor, Multiclass, Count, Continuous
export Binary, ColorImage, GrayImage
export scitype, scitype_union, coerce, schema

import Base.==

using Tables
using OrderedCollections # already a dependency of StatsBase
import Distributions
import Distributions: pdf, mode
using CategoricalArrays
using OrderedCollections
import CategoricalArrays
import ColorTypes
using ScientificTypes
import ScientificTypes: trait

# to be extended:
import StatsBase: fit, predict, fit!
Expand All @@ -60,13 +66,15 @@ const COLUMN_WIDTH = 24
const DEFAULT_SHOW_DEPTH = 0

include("utilities.jl")
include("scitypes.jl")


## ABSTRACT TYPES
## BASE TYPES

# overarching MLJ type:
abstract type MLJType end
include("equality.jl") # equality for MLJType objects


## ABSTRACT MODEL TYPES

# for storing hyperparameters:
abstract type Model <: MLJType end
Expand All @@ -86,16 +94,13 @@ abstract type ProbabilisticNetwork <: Probabilistic end
abstract type DeterministicNetwork <: Deterministic end
abstract type UnsupervisedNetwork <: Unsupervised end

include("equality.jl")


## THE MODEL INTERFACE

# every model interface must implement a `fit` method of the form
# `fit(model, verbosity, X, y) -> fitresult, cache, report` or
# `fit(model, verbosity, X, ys...) -> fitresult, cache, report` (multivariate case)
# `fit(model, verbosity::Integer, training_args...) -> fitresult, cache, report`
# or, one of the simplified versions
# `fit(model, X, y) -> fitresult`
# `fit(model, training_args...) -> fitresult`
# `fit(model, X, ys...) -> fitresult`
fit(model::Model, verbosity::Integer, args...) = fit(model, args...), nothing, nothing

Expand Down Expand Up @@ -129,29 +134,27 @@ function best end
clean!(model::Model) = ""

# fallback trait declarations:
target_scitype_union(::Type{<:Supervised}) =
Union{Found,NTuple{N,Found}} where N # a Tuple type in multivariate case
output_scitype_union(::Type{<:Unsupervised}) =
Union{Missing,Found}
output_is_multivariate(::Type{<:Unsupervised}) = true
input_scitype_union(::Type{<:Model}) = Union{Missing,Found}
input_is_multivariate(::Type{<:Model}) = true
is_pure_julia(::Type{<:Model}) = false
package_name(::Type{<:Model}) = "unknown"
load_path(M::Type{<:Model}) = "unknown"
package_uuid(::Type{<:Model}) = "unknown"
package_url(::Type{<:Model}) = "unknown"
is_wrapper(::Type{<:Model}) = false
is_wrapper(m::Model) = is_wrapper(typeof(m))

target_scitype_union(model::Model) = target_scitype_union(typeof(model))
input_scitype_union(model::Model) = input_scitype_union(typeof(model))
input_is_multivariate(model::Model) = input_is_multivariate(typeof(model))
input_scitype(::Any) = Unknown
output_scitype(::Any) = Unknown
target_scitype(::Any) = Unknown
is_pure_julia(::Any) = false
package_name(::Any) = "unknown"
# fallback for the license trait; spelled "unknown" like the other
# string-valued trait fallbacks declared alongside it:
package_license(::Any) = "unknown"
load_path(::Any) = "unknown"
package_uuid(::Any) = "unknown"
package_url(::Any) = "unknown"
is_wrapper(::Any) = false
supports_weights(::Any) = false

input_scitype(model::Model) = input_scitype(typeof(model))
output_scitype(model::Model) = output_scitype(typeof(model))
target_scitype(model::Model) = target_scitype(typeof(model))
is_pure_julia(model::Model) = is_pure_julia(typeof(model))
package_name(model::Model) = package_name(typeof(model))
load_path(model::Model) = load_path(typeof(model))
package_uuid(model::Model) = package_uuid(typeof(model))
package_url(model::Model) = package_url(typeof(model))
is_wrapper(m::Model) = is_wrapper(typeof(m))

# probabilistic supervised models may also overload one or more of
# `predict_mode`, `predict_median` and `predict_mean` defined below.
Expand Down
117 changes: 16 additions & 101 deletions src/data.jl
Original file line number Diff line number Diff line change
@@ -1,35 +1,4 @@
## CATEGORICAL ARRAY DECODER UTILITY

# """
# reconstruct(A)

# For reconstructing categorical arrays from their elements alone. Here
# `A` is of type `AbstractArray{T}` where `T` is a subtype of
# `CategoricalString` or `CategoricalValue`. The function `reconstruct` has
# the property that `reconstruct(broadcast(identity, C)) == C`, whenever `C`
# is a `CategoricalArray`. In other words, `reconstruct` is a left-inverse
# for the function `C -> broadcast(identity, C)` that strips a
# CategoricalArray of its "categorical wrapper".

# Does not handle missing values.

# """
# function reconstruct(A::AbstractArray{<:CategoricalValue{T},N}) where {T,N}
# firstnonmissing = findfirst(x->!ismissing(x), A)
# isnothing(firstnonmissing) && error("No non-missing values encountered. ")
# pool = A[firstnonmissing].pool
# refs = broadcast(x -> x.level, A)
# return CategoricalArray{T,N}(refs, pool)
# end
# function reconstruct(A::AbstractArray{<:CategoricalString,N}) where {T,N}
# firstnonmissing = findfirst(x->!ismissing(x), A)
# isnothing(firstnonmissing) && error("No non-missing values encountered. ")
# pool = A[firstnonmissing].pool
# refs = broadcast(x -> x.level, A)
# return CategoricalArray{String,N}(refs, pool)
# end

CategoricalElement = Union{CategoricalValue,CategoricalString}
CategoricalElement{U} = Union{CategoricalValue{<:Any,U},CategoricalString{U}}

"""
classes(x)
Expand Down Expand Up @@ -61,7 +30,8 @@ function classes(x::CategoricalElement)
return [p.valindex[p.invindex[v]] for v in p.levels]
end

raw(x::CategoricalElement) = x.pool.index[x.level] # a method just for testing
# a method just for testing:
raw(x::CategoricalElement) = x.pool.index[x.level]

"""
int(x)
Expand Down Expand Up @@ -95,8 +65,11 @@ Broadcasted versions of `int`.
See also: [`decoder`](@ref).
"""
int(x::CategoricalElement) = x.pool.order[x.pool.invindex[x]]
int(X::CategoricalArray) = broadcast(r -> X.pool.order[r], X.refs)
int(V::Array{<:CategoricalElement}) = broadcast(int, V)
int(A::AbstractArray{<:CategoricalElement}) = broadcast(int, A)
# workaround for CategoricalArrays issue
# https://github.com/JuliaData/CategoricalArrays.jl/issues/199:
# function int(X::CategoricalArray)


struct CategoricalDecoder{T,R} # <: MLJType
pool::CategoricalPool{T,R}
Expand Down Expand Up @@ -144,35 +117,10 @@ decoder(element::CategoricalElement) =

## TABULAR DATA

const istable = Tables.istable

# hack for detecting JuliaDB.NDSparse tables without loading as dependency:
isndsparse(X) = isdefined(X, :data_buffer)


"""
    container_type(X)

Classify the container `X`, returning one of the symbols `:table`,
`:sparse` or `:other`:

- `:table` if `istable(X)` is true (a supported table format);
- `:sparse` if, failing that, `isndsparse(X)` is true (a supported
  sparse table format);
- `:other` for anything else.

The first two formats, together with abstract vectors, support the
`MLJBase` accessor methods `selectrows`, `selectcols`, `select`,
`nrows`, `schema`, and `union_scitypes`.

"""
function container_type(X)
    # the table check takes precedence over the sparse-table check:
    istable(X) && return :table
    isndsparse(X) && return :sparse
    return :other
end


## UTILITY FOR CONVERTING BETWEEN TABULAR DATA AND MATRICES

"""
Expand All @@ -186,7 +134,7 @@ returned. The integer relabelling of column names follows the
lexicographic ordering (as indicated by `schema(X).names`).

"""
matrix(X) = matrix(Val(container_type(X)), X)
matrix(X) = matrix(Val(ScientificTypes.trait(X)), X)
# Fallback for containers that are neither tables nor matrices. An
# `ArgumentError` *instance* is thrown; `throw(ArgumentError)` would throw
# the type object itself, for which `e isa ArgumentError` is false.
matrix(::Val{:other}, X) =
    throw(ArgumentError("Cannot convert an object of type $(typeof(X)) to a matrix."))
# An abstract matrix is returned as is (the same object, not a copy):
matrix(::Val{:other}, X::AbstractMatrix) = X

Expand Down Expand Up @@ -229,7 +177,7 @@ named tuple of columns of `X`, with `keys(cols) = names`.

"""
function table(cols::NamedTuple; prototype=cols)
istable(prototype) || error("prototype is not tabular.")
Tables.istable(prototype) || error("prototype is not tabular.")
return Tables.materializer(prototype)(cols)
end
function table(X::AbstractMatrix; names=nothing, prototype=nothing)
Expand All @@ -246,7 +194,6 @@ end

## UNIFIED API FOR ACCESSING TABLES, MATRICES AND VECTORS


"""
selectrows(X, r)

Expand All @@ -256,7 +203,7 @@ table of the preferred sink type of `typeof(X)`, even a single row is
selected.

"""
selectrows(X, r) = selectrows(Val(container_type(X)), X, r)
selectrows(X, r) = selectrows(Val(ScientificTypes.trait(X)), X, r)
# Fallback for unsupported containers. An `ArgumentError` *instance* is
# thrown; `throw(ArgumentError)` would throw the type object itself, for
# which `e isa ArgumentError` is false.
selectrows(::Val{:other}, X, r) =
    throw(ArgumentError("Cannot select rows from an object of type $(typeof(X))."))

"""
Expand All @@ -269,7 +216,7 @@ object returned is a table of the preferred sink type of
or `CategoricalVector` is returned.

"""
selectcols(X, c) = selectcols(Val(container_type(X)), X, c)
selectcols(X, c) = selectcols(Val(ScientificTypes.trait(X)), X, c)
# Fallback for unsupported containers. An `ArgumentError` *instance* is
# thrown; `throw(ArgumentError)` would throw the type object itself, for
# which `e isa ArgumentError` is false.
selectcols(::Val{:other}, X, c) =
    throw(ArgumentError("Cannot select columns from an object of type $(typeof(X))."))

"""
Expand All @@ -282,26 +229,16 @@ Select element of a table or sparse table at row `r` and column
See also: [`selectrows`](@ref), [`selectcols`](@ref).

"""
select(X, r, c) = select(Val(container_type(X)), X, r, c)
select(X, r, c) = select(Val(ScientificTypes.trait(X)), X, r, c)
# Fallback for unsupported containers. An `ArgumentError` *instance* is
# thrown; `throw(ArgumentError)` would throw the type object itself, for
# which `e isa ArgumentError` is false.
select(::Val{:other}, X, r, c) =
    throw(ArgumentError("Cannot select an element from an object of type $(typeof(X))."))

"""
schema(X)

Returns a struct with properties `names`, `types`
with the obvious meanings. Here `X` is any table or sparse table.

"""
schema(X) = schema(Val(container_type(X)), X)
# Fallback for unsupported containers. An `ArgumentError` *instance* is
# thrown; `throw(ArgumentError)` would throw the type object itself, for
# which `e isa ArgumentError` is false.
schema(::Val{:other}, X) =
    throw(ArgumentError("Cannot determine the schema of an object of type $(typeof(X))."))

"""
nrows(X)

Return the number of rows in a table, sparse table, or abstract vector.

"""
nrows(X) = nrows(Val(container_type(X)), X)
nrows(X) = nrows(Val(ScientificTypes.trait(X)), X)
# Fallback for unsupported containers. An `ArgumentError` *instance* is
# thrown; `throw(ArgumentError)` would throw the type object itself, for
# which `e isa ArgumentError` is false.
nrows(::Val{:other}, X) =
    throw(ArgumentError("Cannot count the rows of an object of type $(typeof(X))."))


Expand Down Expand Up @@ -359,24 +296,7 @@ select(::Val{:table}, X, r::Integer, c) = selectcols(selectrows(X, r), c)
select(::Val{:table}, X, r, c::Symbol) = selectcols(X, c)[r]
select(::Val{:table}, X, r, c) = selectcols(selectrows(X, r), c)

# Schema of a table `X`, as reported by `Tables.schema`. Throws an
# `ArgumentError` if `istable(X)` is false.
function schema(::Val{:table}, X)
    # throw an exception *instance*; `throw(ArgumentError)` would throw the
    # type object itself, for which `e isa ArgumentError` is false:
    istable(X) ||
        throw(ArgumentError("Object of type $(typeof(X)) is not a table."))
    # use the column view when `X` supports column access, else the row view:
    source = Tables.columnaccess(X) ? Tables.columns(X) : Tables.rows(X)
    return Tables.schema(source)
end

function nrows(::Val{:table}, X)
if !Tables.columnaccess(X)
return length(collect(X))
else
cols = Tables.columntable(X)
!isempty(cols) || return 0
return length(cols[1])
end
end
nrows(::Val{:table}, X) = schema(X).nrows


## ACCESSORS FOR ABSTRACT VECTORS
Expand All @@ -385,7 +305,7 @@ selectrows(::Val{:other}, v::AbstractVector, r) = v[r]
nrows(::Val{:other}, v::AbstractVector) = length(v)
selectrows(::Val{:other}, v::CategoricalVector, r) = @inbounds v[r]


## to be replaced (not used anywhere):
## ACCESSORS FOR JULIA NDSPARSE ARRAYS (N=2)

nrows(::Val{:sparse}, X) = maximum([r[1] for r in keys(X)])
Expand All @@ -408,8 +328,3 @@ select(::Val{:sparse}, X, r::Integer, c::AbstractVector{Symbol}) = X[r,sort(c)]
select(::Val{:sparse}, X, r::Integer, ::Colon) = X[r,:]
select(::Val{:sparse}, X, r, c) = X[r,sort(c)]

function schema(::Val{:sparse}, X)
names = sort(unique([r[2] for r in keys(X)]))
types = [eltype(selectcols(X, name)) for name in names]
return Tables.Schema(names, types)
end
2 changes: 0 additions & 2 deletions src/datasets.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ having six numerical and six categorical features."""
function load_reduced_ames()
df = CSV.read(joinpath(datadir, "reduced_ames.csv"), copycols=true,
categorical=true)
df[:target] = exp.(df[:target])
# TODO: uncomment following after julia #29501 is resolved
# df.OverallQual = categorical(df.OverallQual, ordered=true)
# df[:GarageCars] = categorical(df[:GarageCars], ordered=true)
Expand All @@ -36,7 +35,6 @@ end
function load_ames()
df = CSV.read(joinpath(datadir, "ames.csv"), copycols=true,
categorical=true)
df[:target] = exp.(df[:target])
return SupervisedTask(verbosity=0, data=df,
target=:target,
ignore=[:Id,],
Expand Down
Loading