JuliaData · nalimilan · Jun 17, 2017 · Jun 22, 2017
diff --git a/src/groupeddatatable/grouping.jl b/src/groupeddatatable/grouping.jl
@@ -26,18 +26,159 @@ end
 # Split
 #
 
+function groupsort_indexer(x::AbstractVector, ngroups::Integer, null_last::Bool=false)
+    # translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx).
+
+    # count group sizes, location 0 for NULL
+    n = length(x)
+    counts = zeros(Int, ngroups + 1)
+    @inbounds for v in x
+        counts[v + 1] += 1
+    end
+
+    # mark the start of each contiguous group of like-indexed data
+    where = ones(Int, ngroups + 1)
+    if null_last
+        @inbounds for i in 3:ngroups+1
+            where[i] = where[i - 1] + counts[i - 1]
+        end
+        where[1] = where[end] + counts[end]
+    else
+        @inbounds for i in 2:ngroups+1
+            where[i] = where[i - 1] + counts[i - 1]
+        end
+    end
+
+    # this is our indexer
+    result = zeros(Int, n)
+    @inbounds for i in 1:n
+        label = x[i] + 1
+        result[where[label]] = i
+        where[label] += 1
+    end
+    result, where, counts
+end
+
+# Check whether combining x with new column with ngroups levels using cartesian product
+# can overflow: if so, compress the codes to the 1:length(unique(x)) range,
+# preserving their ordering
+function compress_group_indices!(x::AbstractVector, ngroups::Integer, ngroups_new::Integer)
+    if ngroups_new > typemax(eltype(x)) # Shouldn't happen in practice, but better be safe
+        throw(OverflowError())
+    elseif Int128(ngroups) * Int128(ngroups_new) > typemax(eltype(x))
+        ngroups = group_indices!(x, x, 1, true)
+    end
+    ngroups
+end
+
+# Assign an integer code ("group label") to each level of x, and combine
+# these codes with existing vector to build group indices (cartesian product)
+function group_indices!{T}(x::AbstractVector, col::AbstractVector{T},
+                           ngroups::Integer, sort::Bool)
+    # Factorize levels into integer codes ("labels")
+    d = Dict{T, UInt}()
+    # For first (or single) column, we can overwrite x
+    y = ngroups == 1 ? x : Vector{UInt}(length(x))
+    n = 0
+    # Note: using get! instead triggers lots of allocations
+    @inbounds for i in eachindex(x)
+        v = col[i]
+        index = Base.ht_keyindex(d, v)
+        if index < 0 # new level
+            @inbounds y[i] = d[v] = n
+            n += 1
+        else
+            y[i] = d.vals[index]
+        end
+    end
+
+    ngroups = compress_group_indices!(x, ngroups, length(d))
+
+    # Compute cartesian product ("group indices")
+    # Sort groups if possible, else use order of appearance
+    if sort && method_exists(isless, Tuple{T, T})
+        # compute mapping from unsorted to sorted codes
+        tmp = Vector{Int}(n)
+        try
+            tmp = sortperm!(tmp, collect(keys(d)))
+        catch
+            @goto unsorted
+        end
+        perm = ipermute!(collect(0:(n-1)), tmp)
+        refperm = sortperm!(tmp, collect(values(d)))
+        permute!(perm, tmp)
+
+        if ngroups == 1 # First (or single) column
+            @inbounds for i in eachindex(x)
+                x[i] = perm[y[i] + 1] + 1
+            end
+        else
+            @inbounds for i in eachindex(x)
+                x[i] += perm[y[i] + 1] * ngroups
+            end
+        end
+
+        return ngroups * n
+    end
+
+    @label unsorted
+
+    if ngroups == 1 # First (or single) column
+        x .= y .+ 1
+    else
+        x .+= y .* ngroups
+    end
+
+    return ngroups * n
+end
+
+# More efficient method which can use the references directly
+# Levels are always sorted
+function group_indices!(x::AbstractVector,
+                        col::Union{AbstractCategoricalVector, AbstractNullableCategoricalVector},
+                        ngroups::Integer, sort::Bool)
+    nlevels = length(levels(col))
+    compress_group_indices!(x, ngroups, nlevels)
+
+    order = CategoricalArrays.order(col.pool)
+    codes = similar(order, length(order)+1)
+    codes[1] = nlevels # Sort nulls last, only used if present
+    codes[2:end] .= order .- 1
+    anynulls = false
+    if ngroups == 1
+        @inbounds for i in eachindex(x)
+            ref = col.refs[i]
+            x[i] = codes[ref + 1] * ngroups + 1
+            if eltype(col) <: Nullable
+                anynulls |= (ref == 0)
+            end
+        end
+    else
+        @inbounds for i in eachindex(x)
+            ref = col.refs[i]
+            x[i] += codes[ref + 1] * ngroups
+            if eltype(col) <: Nullable
+                anynulls |= (ref == 0)
+            end
+        end
+    end
+    ngroups * (nlevels + anynulls)
+end
+
 """
 A view of an AbstractDataTable split into row groups
 
 ```julia
-groupby(d::AbstractDataTable, cols)
-groupby(cols)
+groupby(d::AbstractDataTable, cols; sort = true)
+groupby(cols; sort = true)
 ```
 
 ### Arguments
 
 * `d` : an AbstractDataTable to split (optional, see [Returns](#returns))
 * `cols` : data table columns to group by
+* `sort`: whether to sort row groups; if `false`, levels of each
+  grouping column are sorted in their order of appearance
 
 ### Returns
 
@@ -79,17 +220,28 @@ dt |> groupby([:a, :b]) |> [sum, length]
 ```
 
 """
-function groupby{T}(dt::AbstractDataTable, cols::Vector{T}; sort::Bool = false)
-    sdt = dt[cols]
-    dt_groups = group_rows(sdt)
-    # sort the groups
-    if sort
-        group_perm = sortperm(view(sdt, dt_groups.rperm[dt_groups.starts]))
-        permute!(dt_groups.starts, group_perm)
-        Base.permute!!(dt_groups.stops, group_perm)
+function groupby{T}(d::AbstractDataTable, cols::Vector{T}; sort::Bool = true)
+    ## a subset of Wes McKinney's algorithm here:
+    ##     http://wesmckinney.com/blog/mastering-high-performance-data-algorithms-i-group-by/
+    ##     https://slideshare.net/wesm/a-look-at-pandas-design-and-development
+
+    x = Vector{UInt}(nrow(d))
+    ngroups = 1
+    for j in length(cols):-1:1
+        ngroups = group_indices!(x, d[cols[j]], ngroups, sort)
+    end
+    # Compress group indices to the 1:ngroups range to limit size of vector to allocate
+    # (indices are already compressed for single keys)
+    if length(cols) > 1
+        ngroups = group_indices!(x, x, 1, true)
     end
-    GroupedDataTable(dt, cols, dt_groups.rperm,
-                     dt_groups.starts, dt_groups.stops)
+    (idx, starts) = groupsort_indexer(x, ngroups)
+    # Remove zero-length groupings
+    starts = _groupedunique!(starts)
+    ends = starts[2:end]
+    ends .-= 1
+    pop!(starts)
+    GroupedDataTable(d, cols, idx, starts, ends)
 end
 groupby(d::AbstractDataTable, cols; sort::Bool = false) = groupby(d, [cols], sort = sort)
 
@@ -263,8 +415,8 @@ Split-apply-combine in one step; apply `f` to each grouping in `d`
 based on columns `col`
 
 ```julia
-by(d::AbstractDataTable, cols, f::Function; sort::Bool = false)
-by(f::Function, d::AbstractDataTable, cols; sort::Bool = false)
+by(d::AbstractDataTable, cols, f::Function; sort::Bool = true)
+by(f::Function, d::AbstractDataTable, cols; sort::Bool = true)
 ```
 
 ### Arguments
@@ -273,7 +425,8 @@ by(f::Function, d::AbstractDataTable, cols; sort::Bool = false)
 * `cols` : a column indicator (Symbol, Int, Vector{Symbol}, etc.)
 * `f` : a function to be applied to groups; expects each argument to
   be an AbstractDataTable
-* `sort`: sort row groups (no sorting by default)
+* `sort`: whether to sort row groups; if `false`, levels of each
+  grouping column are sorted in their order of appearance
 
 `f` can return a value, a vector, or a DataTable. For a value or
 vector, these are merged into a column along with the `cols` keys. For
@@ -321,8 +474,8 @@ Split-apply-combine that applies a set of functions over columns of an
 AbstractDataTable or GroupedDataTable
 
 ```julia
-aggregate(d::AbstractDataTable, cols, fs)
-aggregate(gd::GroupedDataTable, fs)
+aggregate(d::AbstractDataTable, cols, fs; sort::Bool=true)
+aggregate(gd::GroupedDataTable, fs; sort::Bool=true)
 ```
 
 ### Arguments
@@ -332,6 +485,8 @@ aggregate(gd::GroupedDataTable, fs)
 * `cols` : a column indicator (Symbol, Int, Vector{Symbol}, etc.)
 * `fs` : a function or vector of functions to be applied to vectors
   within groups; expects each argument to be a column vector
+* `sort`: whether to sort row groups; if `false`, levels of each
+  grouping column are sorted in their order of appearance
 
 Each `fs` should return a value or vector. All returns must be the
 same length.
@@ -353,15 +508,17 @@ dt |> groupby(:a) |> [sum, x->mean(dropnull(x))]   # equivalent
 ```
 
 """
-aggregate(d::AbstractDataTable, fs::Function; sort::Bool=false) = aggregate(d, [fs], sort=sort)
-function aggregate{T<:Function}(d::AbstractDataTable, fs::Vector{T}; sort::Bool=false)
+aggregate(d::AbstractDataTable, fs::Function; sort::Bool=true) =
+    aggregate(d, [fs], sort=sort)
+function aggregate{T<:Function}(d::AbstractDataTable, fs::Vector{T}; sort::Bool=true)
     headers = _makeheaders(fs, _names(d))
     _aggregate(d, fs, headers, sort)
 end
 
 # Applies aggregate to non-key cols of each SubDataTable of a GroupedDataTable
-aggregate(gd::GroupedDataTable, f::Function; sort::Bool=false) = aggregate(gd, [f], sort=sort)
-function aggregate{T<:Function}(gd::GroupedDataTable, fs::Vector{T}; sort::Bool=false)
+aggregate(gd::GroupedDataTable, f::Function; sort::Bool=true) =
+    aggregate(gd, [f], sort=sort)
+function aggregate{T<:Function}(gd::GroupedDataTable, fs::Vector{T}; sort::Bool=true)
     headers = _makeheaders(fs, setdiff(_names(gd), gd.cols))
     res = combine(map(x -> _aggregate(without(x, gd.cols), fs, headers), gd))
     sort && sort!(res, cols=headers)
@@ -375,7 +532,7 @@ end
 function aggregate{S<:ColumnIndex, T <:Function}(d::AbstractDataTable,
                                                  cols::Union{S, AbstractVector{S}},
                                                  fs::Union{T, Vector{T}};
-                                                 sort::Bool=false)
+                                                 sort::Bool=true)
     aggregate(groupby(d, cols, sort=sort), fs)
 end
 
@@ -384,7 +541,8 @@ function _makeheaders{T<:Function}(fs::Vector{T}, cn::Vector{Symbol})
     [Symbol(colname,'_',fname) for fname in fnames for colname in cn]
 end
 
-function _aggregate{T<:Function}(d::AbstractDataTable, fs::Vector{T}, headers::Vector{Symbol}, sort::Bool=false)
+function _aggregate{T<:Function}(d::AbstractDataTable, fs::Vector{T},
+                                 headers::Vector{Symbol}, sort::Bool=true)
     res = DataTable(Any[vcat(f(d[i])) for f in fs for i in 1:size(d, 2)], headers)
     sort && sort!(res, cols=headers)
     res

diff --git a/src/other/utils.jl b/src/other/utils.jl
@@ -155,6 +155,26 @@ function countnull(a::CategoricalArray)
     return res
 end
 
+if !isdefined(Base, :unique!) # Julia < 0.7
+    function _groupedunique!(A::AbstractVector)
+        isempty(A) && return A
+        idxs = eachindex(A)
+        y = first(A)
+        state = start(idxs)
+        i, state = next(idxs, state)
+        for x in A
+            if !isequal(x, y)
+                i, state = next(idxs, state)
+                y = A[i] = x
+            end
+        end
+        resize!(A, i - first(idxs) + 1)
+    end
+else
+    # unique!() includes a fast path for sorted vectors
+    _groupedunique!(A::AbstractVector) = unique!(A)
+end
+
 # Gets the name of a function. Used in groupedatatable/grouping.jl
 function _fnames{T<:Function}(fs::Vector{T})
     λcounter = 0

diff --git a/test/grouping.jl b/test/grouping.jl
@@ -140,6 +140,12 @@ module TestGrouping
     # grouping single row
     @test groupby(DataTable(A=Int[1]), :A).starts == Int[1]
 
+    # grouping table with non-sortable column:
+    # check that order of appearance is respected when sort=true
+    gd = groupby(DataTable(A = ["A", 1, 1, "A"], B = 1:4), :A)
+    @test isequal(gd[1], DataTable(A = ["A", "A"], B = [1, 4]))
+    @test isequal(gd[2], DataTable(A = [1, 1], B = [2, 3]))
+
     # issue #960
     x = CategoricalArray(collect(1:20))
     dt = DataTable(v1=x, v2=x)
@@ -165,11 +171,44 @@ module TestGrouping
     levels!(dt[:Key1], ["Z", "B", "A"])
     levels!(dt[:Key2], ["Z", "B", "A"])
     gd = groupby(dt, :Key1)
-    @test isequal(gd[1], DataTable(Key1=["A", "A"], Key2=["A", "B"], Value=1:2))
-    @test isequal(gd[2], DataTable(Key1=["B", "B"], Key2=["A", "B"], Value=3:4))
+    @test isequal(gd[1], DataTable(Key1=["B", "B"], Key2=["A", "B"], Value=3:4))
+    @test isequal(gd[2], DataTable(Key1=["A", "A"], Key2=["A", "B"], Value=1:2))
     gd = groupby(dt, [:Key1, :Key2])
-    @test isequal(gd[1], DataTable(Key1="A", Key2="A", Value=1))
-    @test isequal(gd[2], DataTable(Key1="A", Key2="B", Value=2))
-    @test isequal(gd[3], DataTable(Key1="B", Key2="A", Value=3))
-    @test isequal(gd[4], DataTable(Key1="B", Key2="B", Value=4))
+    @test isequal(gd[1], DataTable(Key1="B", Key2="B", Value=4))
+    @test isequal(gd[2], DataTable(Key1="B", Key2="A", Value=3))
+    @test isequal(gd[3], DataTable(Key1="A", Key2="B", Value=2))
+    @test isequal(gd[4], DataTable(Key1="A", Key2="A", Value=1))
+
+    # test NullableArray and NullableCategoricalArray with nulls
+    for (S, T) in ((NullableArray, NullableArray),
+                   (NullableCategoricalArray, NullableCategoricalArray),
+                   (NullableArray, NullableCategoricalArray),
+                   (NullableCategoricalArray, NullableArray))
+        dt = DataTable(Key1 = S(["A", "A", "B", Nullable(), Nullable()]),
+                       Key2 = T(["A", "B", "A", Nullable(), "A"]),
+                       Value = 1:5)
+        gd = groupby(dt, :Key1)
+        @test isequal(gd[1], DataTable(Key1=Nullable{String}["A", "A"],
+                                       Key2=Nullable{String}["A", "B"], Value=1:2))
+        @test isequal(gd[2], DataTable(Key1=Nullable{String}["B"],
+                                       Key2=Nullable{String}["A"], Value=3))
+        @test isequal(gd[3], DataTable(Key1=[Nullable(), Nullable()],
+                                       Key2=Nullable{String}[Nullable(), "A"], Value=4:5))
+        gd = groupby(dt, [:Key1, :Key2])
+        @test isequal(gd[1], DataTable(Key1=Nullable("A"), Key2=Nullable("A"), Value=1))
+        @test isequal(gd[2], DataTable(Key1=Nullable("A"), Key2=Nullable("B"), Value=2))
+        @test isequal(gd[3], DataTable(Key1=Nullable("B"), Key2=Nullable("A"), Value=3))
+        @test isequal(gd[4], DataTable(Key1=Nullable(), Key2=Nullable("A"), Value=5))
+        @test isequal(gd[5], DataTable(Key1=Nullable(), Key2=Nullable(), Value=4))
+    end
+
+    # check that group indices are compressed when cartesian product would overflow
+    x = CategoricalArray([1, 2, 2, 1])
+    levels!(x, collect(1:10_000))
+    dt = DataTable(A = x, B = x, C = x, D = x, E = x, Value = 1:4)
+    gd = groupby(dt, [:A, :B, :C, :D, :E])
+    @test isequal(gd[1], DataTable(A = [1, 1], B = [1, 1], C = [1, 1],
+                                   D = [1, 1], E = [1, 1], Value = [1, 4]))
+    @test isequal(gd[2], DataTable(A = [2, 2], B = [2, 2], C = [2, 2],
+                                   D = [2, 2], E = [2, 2], Value = [2, 3]))
 end