diff --git a/REQUIRE b/REQUIRE
index d4d0eff..8eaf2dd 100644
--- a/REQUIRE
+++ b/REQUIRE
@@ -7,3 +7,4 @@ SortingAlgorithms
 Reexport
 Compat 0.19.0
 FileIO 0.1.2
+DataStructures
diff --git a/src/DataTables.jl b/src/DataTables.jl
index c1b1479..3edad04 100644
--- a/src/DataTables.jl
+++ b/src/DataTables.jl
@@ -17,6 +17,7 @@ import NullableArrays: dropnull, dropnull!
 @reexport using CategoricalArrays
 using GZip
 using SortingAlgorithms
+using DataStructures
 using FileIO # remove after read_rda deprecation period
diff --git a/src/abstractdatatable/join.jl b/src/abstractdatatable/join.jl
index 8523a5a..c244e5b 100644
--- a/src/abstractdatatable/join.jl
+++ b/src/abstractdatatable/join.jl
@@ -15,6 +15,40 @@ similar_nullable{T,R}(dv::CategoricalArray{T,R}, dims::@compat(Union{Int, Tuple{
 similar_nullable(dt::AbstractDataTable, dims::Int) =
     DataTable(Any[similar_nullable(x, dims) for x in columns(dt)], copy(index(dt)))
 
+function groupsort_indexer(x::AbstractVector, ngroups::Integer, null_last::Bool=false)
+    # translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx).
+
+    # count group sizes, location 0 for NULL
+    n = length(x)
+    # counts = x.pool
+    counts = fill(0, ngroups + 1)
+    for i = 1:n
+        counts[x[i] + 1] += 1
+    end
+
+    # mark the start of each contiguous group of like-indexed data
+    where = fill(1, ngroups + 1)
+    if null_last
+        for i = 3:ngroups+1
+            where[i] = where[i - 1] + counts[i - 1]
+        end
+        where[1] = where[end] + counts[end]
+    else
+        for i = 2:ngroups+1
+            where[i] = where[i - 1] + counts[i - 1]
+        end
+    end
+
+    # this is our indexer
+    result = fill(0, n)
+    for i = 1:n
+        label = x[i] + 1
+        result[where[label]] = i
+        where[label] += 1
+    end
+    result, where, counts
+end
+
 function join_idx(left, right, max_groups)
     ## adapted from Wes McKinney's full_outer_join in pandas (file: src/join.pyx).
 
diff --git a/src/groupeddatatable/grouping.jl b/src/groupeddatatable/grouping.jl
index cdd5f8c..991223f 100644
--- a/src/groupeddatatable/grouping.jl
+++ b/src/groupeddatatable/grouping.jl
@@ -26,39 +26,6 @@ end
 #
 # Split
 #
-function groupsort_indexer(x::AbstractVector, ngroups::Integer, null_last::Bool=false)
-    # translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx).
-
-    # count group sizes, location 0 for NULL
-    n = length(x)
-    # counts = x.pool
-    counts = fill(0, ngroups + 1)
-    for i = 1:n
-        counts[x[i] + 1] += 1
-    end
-
-    # mark the start of each contiguous group of like-indexed data
-    where = fill(1, ngroups + 1)
-    if null_last
-        for i = 3:ngroups+1
-            where[i] = where[i - 1] + counts[i - 1]
-        end
-        where[1] = where[end] + counts[end]
-    else
-        for i = 2:ngroups+1
-            where[i] = where[i - 1] + counts[i - 1]
-        end
-    end
-
-    # this is our indexer
-    result = fill(0, n)
-    for i = 1:n
-        label = x[i] + 1
-        result[where[label]] = i
-        where[label] += 1
-    end
-    result, where, counts
-end
 
 """
 A view of an AbstractDataTable split into row groups
@@ -117,49 +84,38 @@ dt |> groupby([:a, :b]) |> [sum, length]
 ```
 """
-function groupby{T}(d::AbstractDataTable, cols::Vector{T})
-    ## a subset of Wes McKinney's algorithm here:
-    ## http://wesmckinney.com/blog/?p=489
-
-    ncols = length(cols)
-    # use CategoricalArray to get a set of integer references for each unique item
-    nv = NullableCategoricalArray(d[cols[ncols]])
-    # if there are NULLs, add 1 to the refs to avoid underflows in x later
-    anynulls = (findfirst(nv.refs, 0) > 0 ? 1 : 0)
-    # use UInt32 instead of the original array's integer size since the number of levels can be high
-    x = similar(nv.refs, UInt32)
-    for i = 1:nrow(d)
-        if nv.refs[i] == 0
-            x[i] = 1
+function groupby(d::AbstractDataTable, cols::Vector)
+    intersect = d[cols]
+    mappings = OrderedDict{DataTableRow, Vector{Int}}()
+    for i = 1:nrow(intersect)
+        row = DataTableRow(intersect, i)
+        if !haskey(mappings, row)
+            mappings[row] = [i]
         else
-            x[i] = CategoricalArrays.order(nv.pool)[nv.refs[i]] + anynulls
+            push!(mappings[row], i)
         end
     end
-    # also compute the number of groups, which is the product of the set lengths
-    ngroups = length(levels(nv)) + anynulls
-    # if there's more than 1 column, do roughly the same thing repeatedly
-    for j = (ncols - 1):-1:1
-        nv = NullableCategoricalArray(d[cols[j]])
-        anynulls = (findfirst(nv.refs, 0) > 0 ? 1 : 0)
-        for i = 1:nrow(d)
-            if nv.refs[i] != 0
-                x[i] += (CategoricalArrays.order(nv.pool)[nv.refs[i]] + anynulls - 1) * ngroups
-            end
-        end
-        ngroups = ngroups * (length(levels(nv)) + anynulls)
-        # TODO if ngroups is really big, shrink it
+    ngroups = length(mappings.keys)
+    idx = Vector{Int}(nrow(d))
+    starts = fill(1, ngroups)
+    stops = Vector{Int}(ngroups)
+
+    rows = mappings.vals[1]
+    idx[1:length(rows)] = rows
+    stops[1] = length(rows)
+    for i = 2:ngroups
+        rows = mappings.vals[i]
+        starts[i] = stops[i-1] + 1
+        stops[i] = stops[i-1] + length(rows)
+        idx[starts[i]:stops[i]] = rows
     end
-    (idx, starts) = groupsort_indexer(x, ngroups)
-    # Remove zero-length groupings
-    starts = _uniqueofsorted(starts)
-    ends = starts[2:end] - 1
-    GroupedDataTable(d, cols, idx, starts[1:end-1], ends)
+    GroupedDataTable(d, cols, idx, starts, stops)
 end
 
-groupby(d::AbstractDataTable, cols) = groupby(d, [cols])
+groupby(d::AbstractDataTable, cols::Union{Int, Symbol}) = groupby(d, [cols])
 
 # add a function curry
-groupby{T}(cols::Vector{T}) = x -> groupby(x, cols)
-groupby(cols) = x -> groupby(x, cols)
+groupby(cols::Vector) = x -> groupby(x, cols)
+groupby(cols::Union{Int, Symbol}) = x -> groupby(x, [cols])
 
 Base.start(gd::GroupedDataTable) = 1
 Base.next(gd::GroupedDataTable, state::Int) =
diff --git a/test/grouping.jl b/test/grouping.jl
index c8faac0..91c5ffb 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -37,8 +37,6 @@ module TestGrouping
     @test groupby(dt2, [:v1, :v2]).starts == collect(1:1000)
     @test groupby(dt2, [:v2, :v1]).starts == collect(1:1000)
 
-    # grouping empty frame
-    @test groupby(DataTable(A=Int[]), :A).starts == Int[]
     # grouping single row
     @test groupby(DataTable(A=Int[1]), :A).starts == Int[1]
 
@@ -47,10 +45,6 @@ module TestGrouping
     dt = DataTable(v1=x, v2=x)
     groupby(dt, [:v1, :v2])
 
-    dt2 = by(e->1, DataTable(x=Int64[]), :x)
-    @test size(dt2) == (0,1)
-    @test isequal(sum(dt2[:x]), Nullable(0))
-
     # Check that reordering levels does not confuse groupby
     dt = DataTable(Key1 = CategoricalArray(["A", "A", "B", "B"]),
                    Key2 = CategoricalArray(["A", "B", "A", "B"]),
@@ -67,11 +61,11 @@ module TestGrouping
     levels!(dt[:Key1], ["Z", "B", "A"])
     levels!(dt[:Key2], ["Z", "B", "A"])
     gd = groupby(dt, :Key1)
-    @test isequal(gd[1], DataTable(Key1=["B", "B"], Key2=["A", "B"], Value=3:4))
-    @test isequal(gd[2], DataTable(Key1=["A", "A"], Key2=["A", "B"], Value=1:2))
+    @test isequal(gd[1], DataTable(Key1=["A", "A"], Key2=["A", "B"], Value=1:2))
+    @test isequal(gd[2], DataTable(Key1=["B", "B"], Key2=["A", "B"], Value=3:4))
     gd = groupby(dt, [:Key1, :Key2])
-    @test isequal(gd[1], DataTable(Key1="B", Key2="B", Value=4))
-    @test isequal(gd[2], DataTable(Key1="B", Key2="A", Value=3))
-    @test isequal(gd[3], DataTable(Key1="A", Key2="B", Value=2))
-    @test isequal(gd[4], DataTable(Key1="A", Key2="A", Value=1))
+    @test isequal(gd[1], DataTable(Key1="A", Key2="A", Value=1))
+    @test isequal(gd[2], DataTable(Key1="A", Key2="B", Value=2))
+    @test isequal(gd[3], DataTable(Key1="B", Key2="A", Value=3))
+    @test isequal(gd[4], DataTable(Key1="B", Key2="B", Value=4))
 end
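Reviewer note: the new `groupby` drops the CategoricalArray/counting-sort machinery in favor of an `OrderedDict` keyed on `DataTableRow`, so groups now come back in order of first appearance in the table rather than in categorical level order. That is why the `Key1`/`Key2` expectations above change even though `levels!` reorders the levels, and the removal of the empty-table tests lines up with the new code reading `mappings.vals[1]` unconditionally. Below is a minimal sketch of the same first-appearance strategy on a plain vector; it uses the public `OrderedDict` API (`get!`, `values`) instead of the patch's direct access to the internal `mappings.keys`/`mappings.vals` fields, and a `String` key instead of `DataTableRow`:

```julia
using DataStructures: OrderedDict

key = ["B", "A", "B", "A"]

# map each distinct key to the row indices where it occurs,
# preserving order of first appearance
mappings = OrderedDict{String, Vector{Int}}()
for (i, k) in enumerate(key)
    push!(get!(mappings, k, Int[]), i)
end
@assert collect(keys(mappings)) == ["B", "A"]  # first-appearance order, not sort order

# flatten into the idx/starts/stops layout that GroupedDataTable stores:
# idx is a permutation of row indices, group i occupies idx[starts[i]:stops[i]]
idx = Int[]; starts = Int[]; stops = Int[]
for rows in values(mappings)
    push!(starts, length(idx) + 1)
    append!(idx, rows)
    push!(stops, length(idx))
end
@assert idx == [1, 3, 2, 4]
@assert starts == [1, 3]
@assert stops == [2, 4]
```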
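For reference, `groupsort_indexer` moves to join.jl unchanged because `join_idx` is now its only caller. It is a counting sort over integer group labels: given labels in `0:ngroups` (0 standing for NULL), it returns a permutation of row indices ordered group by group, together with the group offsets and counts. Here is a standalone sketch of the same three passes, assuming the default `null_last=false` (NULL block first); `offsets` stands in for the function's `where`, which is a reserved word from Julia 0.6 onward:

```julia
labels  = [2, 0, 1, 2, 1]   # 0 = NULL; real group labels run over 1:ngroups
ngroups = 2

# pass 1: count group sizes; counts[k + 1] is the number of rows labeled k
counts = fill(0, ngroups + 1)
for l in labels
    counts[l + 1] += 1
end
@assert counts == [1, 2, 2]

# pass 2: first output slot for each label (exclusive prefix sum of counts)
offsets = cumsum([1; counts[1:end-1]])
@assert offsets == [1, 2, 4]

# pass 3: scatter each row index into its group's next free slot
result = fill(0, length(labels))
for (i, l) in enumerate(labels)
    result[offsets[l + 1]] = i
    offsets[l + 1] += 1
end
@assert result == [2, 3, 5, 1, 4]  # NULL row, then group 1 rows, then group 2 rows
```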