Skip to content
This repository was archived by the owner on May 5, 2019. It is now read-only.

WIP: Restore old grouping algorithm and improve it #76

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
204 changes: 181 additions & 23 deletions src/groupeddatatable/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,159 @@ end
# Split
#

function groupsort_indexer(x::AbstractVector, ngroups::Integer, null_last::Bool=false)
# translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx).

# count group sizes, location 0 for NULL
n = length(x)
counts = zeros(Int, ngroups + 1)
@inbounds for v in x
counts[v + 1] += 1
end

# mark the start of each contiguous group of like-indexed data
where = ones(Int, ngroups + 1)
if null_last
@inbounds for i in 3:ngroups+1
where[i] = where[i - 1] + counts[i - 1]
end
where[1] = where[end] + counts[end]
else
@inbounds for i in 2:ngroups+1
where[i] = where[i - 1] + counts[i - 1]
end
end

# this is our indexer
result = zeros(Int, n)
@inbounds for i in 1:n
label = x[i] + 1
result[where[label]] = i
where[label] += 1
end
result, where, counts
end

# Check whether combining x with new column with ngroups levels using cartesian product
# can overflow: if so, compress the codes to the 1:length(unique(x)) range,
# preserving their ordering
function compress_group_indices!(x::AbstractVector, ngroups::Integer, ngroups_new::Integer)
if ngroups_new > typemax(eltype(x)) # Shouldn't happen in practice, but better be safe
throw(OverflowError())
elseif Int128(ngroups) * Int128(ngroups_new) > typemax(eltype(x))
ngroups = group_indices!(x, x, 1, true)
end
ngroups
end

# Assign an integer code ("group label") to each level of x, and combine
# these codes with existing vector to build group indices (cartesian product)
function group_indices!{T}(x::AbstractVector, col::AbstractVector{T},
ngroups::Integer, sort::Bool)
# Factorize levels into integer codes ("labels")
d = Dict{T, UInt}()
# For first (or single) column, we can overwrite x
y = ngroups == 1 ? x : Vector{UInt}(length(x))
n = 0
# Note: using get! instead triggers lots of allocations
@inbounds for i in eachindex(x)
v = col[i]
index = Base.ht_keyindex(d, v)
if index < 0 # new level
@inbounds y[i] = d[v] = n
n += 1
else
y[i] = d.vals[index]
end
end

ngroups = compress_group_indices!(x, ngroups, length(d))

# Compute cartesian product ("group indices")
# Sort groups if possible, else use order of appearance
if sort && method_exists(isless, Tuple{T, T})
# compute mapping from unsorted to sorted codes
tmp = Vector{Int}(n)
try
tmp = sortperm!(tmp, collect(keys(d)))
catch
@goto unsorted
end
perm = ipermute!(collect(0:(n-1)), tmp)
refperm = sortperm!(tmp, collect(values(d)))
permute!(perm, tmp)

if ngroups == 1 # First (or single) column
@inbounds for i in eachindex(x)
x[i] = perm[y[i] + 1] + 1
end
else
@inbounds for i in eachindex(x)
x[i] += perm[y[i] + 1] * ngroups
end
end

return ngroups * n
end

@label unsorted

if ngroups == 1 # First (or single) column
x .= y .+ 1
else
x .+= y .* ngroups
end

return ngroups * n
end

# More efficient method which can use the references directly
# Levels are always sorted
function group_indices!(x::AbstractVector,
col::Union{AbstractCategoricalVector, AbstractNullableCategoricalVector},
ngroups::Integer, sort::Bool)
nlevels = length(levels(col))
compress_group_indices!(x, ngroups, nlevels)

order = CategoricalArrays.order(col.pool)
codes = similar(order, length(order)+1)
codes[1] = nlevels # Sort nulls last, only used if present
codes[2:end] .= order .- 1
anynulls = false
if ngroups == 1
@inbounds for i in eachindex(x)
ref = col.refs[i]
x[i] = codes[ref + 1] * ngroups + 1
if eltype(col) <: Nullable
anynulls |= (ref == 0)
end
end
else
@inbounds for i in eachindex(x)
ref = col.refs[i]
x[i] += codes[ref + 1] * ngroups
if eltype(col) <: Nullable
anynulls |= (ref == 0)
end
end
end
ngroups * (nlevels + anynulls)
end

"""
A view of an AbstractDataTable split into row groups

```julia
groupby(d::AbstractDataTable, cols)
groupby(cols)
groupby(d::AbstractDataTable, cols; sort = true)
groupby(cols; sort = true)
```

### Arguments

* `d` : an AbstractDataTable to split (optional, see [Returns](#returns))
* `cols` : data table columns to group by
* `sort`: whether to sort row groups; if `false`, levels of each
grouping column are sorted in their order of appearance

### Returns

Expand Down Expand Up @@ -79,17 +220,28 @@ dt |> groupby([:a, :b]) |> [sum, length]
```

"""
function groupby{T}(dt::AbstractDataTable, cols::Vector{T}; sort::Bool = false)
sdt = dt[cols]
dt_groups = group_rows(sdt)
# sort the groups
if sort
group_perm = sortperm(view(sdt, dt_groups.rperm[dt_groups.starts]))
permute!(dt_groups.starts, group_perm)
Base.permute!!(dt_groups.stops, group_perm)
function groupby{T}(d::AbstractDataTable, cols::Vector{T}; sort::Bool = true)
## a subset of Wes McKinney's algorithm here:
## http://wesmckinney.com/blog/mastering-high-performance-data-algorithms-i-group-by/
## https://slideshare.net/wesm/a-look-at-pandas-design-and-development

x = Vector{UInt}(nrow(d))
ngroups = 1
for j in length(cols):-1:1
ngroups = group_indices!(x, d[cols[j]], ngroups, sort)
end
# Compress group indices to the 1:ngroups range to limit size of vector to allocate
# (indices are already compressed for single keys)
if length(cols) > 1
ngroups = group_indices!(x, x, 1, true)
end
GroupedDataTable(dt, cols, dt_groups.rperm,
dt_groups.starts, dt_groups.stops)
(idx, starts) = groupsort_indexer(x, ngroups)
# Remove zero-length groupings
starts = _groupedunique!(starts)
ends = starts[2:end]
ends .-= 1
pop!(starts)
GroupedDataTable(d, cols, idx, starts, ends)
end
groupby(d::AbstractDataTable, cols; sort::Bool = false) = groupby(d, [cols], sort = sort)

Expand Down Expand Up @@ -263,8 +415,8 @@ Split-apply-combine in one step; apply `f` to each grouping in `d`
based on columns `col`

```julia
by(d::AbstractDataTable, cols, f::Function; sort::Bool = false)
by(f::Function, d::AbstractDataTable, cols; sort::Bool = false)
by(d::AbstractDataTable, cols, f::Function; sort::Bool = true)
by(f::Function, d::AbstractDataTable, cols; sort::Bool = true)
```

### Arguments
Expand All @@ -273,7 +425,8 @@ by(f::Function, d::AbstractDataTable, cols; sort::Bool = false)
* `cols` : a column indicator (Symbol, Int, Vector{Symbol}, etc.)
* `f` : a function to be applied to groups; expects each argument to
be an AbstractDataTable
* `sort`: sort row groups (no sorting by default)
* `sort`: whether to sort row groups; if `false`, levels of each
grouping column are sorted in their order of appearance

`f` can return a value, a vector, or a DataTable. For a value or
vector, these are merged into a column along with the `cols` keys. For
Expand Down Expand Up @@ -321,8 +474,8 @@ Split-apply-combine that applies a set of functions over columns of an
AbstractDataTable or GroupedDataTable

```julia
aggregate(d::AbstractDataTable, cols, fs)
aggregate(gd::GroupedDataTable, fs)
aggregate(d::AbstractDataTable, cols, fs; sort::Bool=true)
aggregate(gd::GroupedDataTable, fs; sort::Bool=true)
```

### Arguments
Expand All @@ -332,6 +485,8 @@ aggregate(gd::GroupedDataTable, fs)
* `cols` : a column indicator (Symbol, Int, Vector{Symbol}, etc.)
* `fs` : a function or vector of functions to be applied to vectors
within groups; expects each argument to be a column vector
* `sort`: whether to sort row groups; if `false`, levels of each
grouping column are sorted in their order of appearance

Each `fs` should return a value or vector. All returns must be the
same length.
Expand All @@ -353,15 +508,17 @@ dt |> groupby(:a) |> [sum, x->mean(dropnull(x))] # equivalent
```

"""
aggregate(d::AbstractDataTable, fs::Function; sort::Bool=false) = aggregate(d, [fs], sort=sort)
function aggregate{T<:Function}(d::AbstractDataTable, fs::Vector{T}; sort::Bool=false)
aggregate(d::AbstractDataTable, fs::Function; sort::Bool=true) =
aggregate(d, [fs], sort=sort)
function aggregate{T<:Function}(d::AbstractDataTable, fs::Vector{T}; sort::Bool=true)
headers = _makeheaders(fs, _names(d))
_aggregate(d, fs, headers, sort)
end

# Applies aggregate to non-key cols of each SubDataTable of a GroupedDataTable
aggregate(gd::GroupedDataTable, f::Function; sort::Bool=false) = aggregate(gd, [f], sort=sort)
function aggregate{T<:Function}(gd::GroupedDataTable, fs::Vector{T}; sort::Bool=false)
aggregate(gd::GroupedDataTable, f::Function; sort::Bool=true) =
aggregate(gd, [f], sort=sort)
function aggregate{T<:Function}(gd::GroupedDataTable, fs::Vector{T}; sort::Bool=true)
headers = _makeheaders(fs, setdiff(_names(gd), gd.cols))
res = combine(map(x -> _aggregate(without(x, gd.cols), fs, headers), gd))
sort && sort!(res, cols=headers)
Expand All @@ -375,7 +532,7 @@ end
function aggregate{S<:ColumnIndex, T <:Function}(d::AbstractDataTable,
cols::Union{S, AbstractVector{S}},
fs::Union{T, Vector{T}};
sort::Bool=false)
sort::Bool=true)
aggregate(groupby(d, cols, sort=sort), fs)
end

Expand All @@ -384,7 +541,8 @@ function _makeheaders{T<:Function}(fs::Vector{T}, cn::Vector{Symbol})
[Symbol(colname,'_',fname) for fname in fnames for colname in cn]
end

function _aggregate{T<:Function}(d::AbstractDataTable, fs::Vector{T}, headers::Vector{Symbol}, sort::Bool=false)
function _aggregate{T<:Function}(d::AbstractDataTable, fs::Vector{T},
headers::Vector{Symbol}, sort::Bool=true)
res = DataTable(Any[vcat(f(d[i])) for f in fs for i in 1:size(d, 2)], headers)
sort && sort!(res, cols=headers)
res
Expand Down
20 changes: 20 additions & 0 deletions src/other/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,26 @@ function countnull(a::CategoricalArray)
return res
end

if !isdefined(Base, :unique!) # Julia < 0.7
function _groupedunique!(A::AbstractVector)
isempty(A) && return A
idxs = eachindex(A)
y = first(A)
state = start(idxs)
i, state = next(idxs, state)
for x in A
if !isequal(x, y)
i, state = next(idxs, state)
y = A[i] = x
end
end
resize!(A, i - first(idxs) + 1)
end
else
# unique!() includes a fast path for sorted vectors
_groupedunique!(A::AbstractVector) = unique!(A)
end

# Gets the name of a function. Used in groupedatatable/grouping.jl
function _fnames{T<:Function}(fs::Vector{T})
λcounter = 0
Expand Down
51 changes: 45 additions & 6 deletions test/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,12 @@ module TestGrouping
# grouping single row
@test groupby(DataTable(A=Int[1]), :A).starts == Int[1]

# grouping table with non-sortable column:
# check that order of appearance is respected when sort=true
gd = groupby(DataTable(A = ["A", 1, 1, "A"], B = 1:4), :A)
@test isequal(gd[1], DataTable(A = ["A", "A"], B = [1, 4]))
@test isequal(gd[2], DataTable(A = [1, 1], B = [2, 3]))

# issue #960
x = CategoricalArray(collect(1:20))
dt = DataTable(v1=x, v2=x)
Expand All @@ -165,11 +171,44 @@ module TestGrouping
levels!(dt[:Key1], ["Z", "B", "A"])
levels!(dt[:Key2], ["Z", "B", "A"])
gd = groupby(dt, :Key1)
@test isequal(gd[1], DataTable(Key1=["A", "A"], Key2=["A", "B"], Value=1:2))
@test isequal(gd[2], DataTable(Key1=["B", "B"], Key2=["A", "B"], Value=3:4))
@test isequal(gd[1], DataTable(Key1=["B", "B"], Key2=["A", "B"], Value=3:4))
@test isequal(gd[2], DataTable(Key1=["A", "A"], Key2=["A", "B"], Value=1:2))
gd = groupby(dt, [:Key1, :Key2])
@test isequal(gd[1], DataTable(Key1="A", Key2="A", Value=1))
@test isequal(gd[2], DataTable(Key1="A", Key2="B", Value=2))
@test isequal(gd[3], DataTable(Key1="B", Key2="A", Value=3))
@test isequal(gd[4], DataTable(Key1="B", Key2="B", Value=4))
@test isequal(gd[1], DataTable(Key1="B", Key2="B", Value=4))
@test isequal(gd[2], DataTable(Key1="B", Key2="A", Value=3))
@test isequal(gd[3], DataTable(Key1="A", Key2="B", Value=2))
@test isequal(gd[4], DataTable(Key1="A", Key2="A", Value=1))

# test NullableArray and NullableCategoricalArray with nulls
for (S, T) in ((NullableArray, NullableArray),
(NullableCategoricalArray, NullableCategoricalArray),
(NullableArray, NullableCategoricalArray),
(NullableCategoricalArray, NullableArray))
dt = DataTable(Key1 = S(["A", "A", "B", Nullable(), Nullable()]),
Key2 = T(["A", "B", "A", Nullable(), "A"]),
Value = 1:5)
gd = groupby(dt, :Key1)
@test isequal(gd[1], DataTable(Key1=Nullable{String}["A", "A"],
Key2=Nullable{String}["A", "B"], Value=1:2))
@test isequal(gd[2], DataTable(Key1=Nullable{String}["B"],
Key2=Nullable{String}["A"], Value=3))
@test isequal(gd[3], DataTable(Key1=[Nullable(), Nullable()],
Key2=Nullable{String}[Nullable(), "A"], Value=4:5))
gd = groupby(dt, [:Key1, :Key2])
@test isequal(gd[1], DataTable(Key1=Nullable("A"), Key2=Nullable("A"), Value=1))
@test isequal(gd[2], DataTable(Key1=Nullable("A"), Key2=Nullable("B"), Value=2))
@test isequal(gd[3], DataTable(Key1=Nullable("B"), Key2=Nullable("A"), Value=3))
@test isequal(gd[4], DataTable(Key1=Nullable(), Key2=Nullable("A"), Value=5))
@test isequal(gd[5], DataTable(Key1=Nullable(), Key2=Nullable(), Value=4))
end

# check that group indices are compressed when cartesian product would overflow
x = CategoricalArray([1, 2, 2, 1])
levels!(x, collect(1:10_000))
dt = DataTable(A = x, B = x, C = x, D = x, E = x, Value = 1:4)
gd = groupby(dt, [:A, :B, :C, :D, :E])
@test isequal(gd[1], DataTable(A = [1, 1], B = [1, 1], C = [1, 1],
D = [1, 1], E = [1, 1], Value = [1, 4]))
@test isequal(gd[2], DataTable(A = [2, 2], B = [2, 2], C = [2, 2],
D = [2, 2], E = [2, 2], Value = [2, 3]))
end