Return unique values of a PooledDataArray in the order of `levels`.

`Base.unique(::PooledDataArray)` now flags which refs occur in `pda.refs`,
pops the first flag off as the NA indicator, indexes `levels(pda)` with the
remaining flags, and appends `NA` last when it was seen. The tests are
updated for the new ordering and cover a level that is absent from the data.

NOTE(review): this patch was recovered from a copy whose newlines had been
flattened; leading indentation was reconstructed from 4-space nesting and
should be confirmed against the target files before applying strictly.

diff --git a/src/pooleddataarray.jl b/src/pooleddataarray.jl
index dcdaeed..8ef19c7 100644
--- a/src/pooleddataarray.jl
+++ b/src/pooleddataarray.jl
@@ -271,9 +271,9 @@ end
 #' @description
 #'
 #' Return a DataVector containing the unique values of a `PooledDataArray`,
-#' in the order they appear in the data, including `NA` if any missing entries
+#' in the order of `levels`, including `NA` if any missing entries
 #' are encountered. For `PooledDataArray`s, this function is much less efficient
-#' than `levels`, which does not return the values in the same order.
+#' than `levels`.
 #'
 #' @param da::DataArray{T} `DataArray` whose unique values are desired.
 #'
@@ -286,50 +286,24 @@ end
 #' pdv = @pdata [1, -2, 1, NA, 4]
 #' distinct_values = unique(pdv)
 function Base.unique{T}(pda::PooledDataArray{T})
-    n = length(pda)
     nlevels = length(pda.pool)
-    unique_values = Vector{T}(0)
-    sizehint!(unique_values, nlevels)
-    seen = Set{eltype(pda.refs)}()
-
-    firstna = 0
-    for i in 1:n
-        if isna(pda, i)
-            if firstna == 0
-                firstna = length(unique_values) + 1
-            end
-        elseif !in(pda.refs[i], seen)
-            push!(seen, pda.refs[i])
-            push!(unique_values, pda.pool[pda.refs[i]])
-        else
-            continue
-        end
-
-        if firstna > 0 && length(unique_values) == nlevels
-            break
+    seen = fill(false, nlevels + 1)
+    batch = 0
+    @inbounds for i in pda.refs
+        seen[i + 1] = true
+        # Only do a costly short-circuit check periodically
+        batch += 1
+        if batch > 1000
+            all(seen) && break
+            batch = 0
         end
     end
-
-    if firstna > 0
-        res = DataArray(Vector{T}(nlevels + 1))
-        i = 0
-        for val in unique_values
-            i += 1
-            if i == firstna
-                res.na[i] = true
-                i += 1
-            end
-            res.data[i] = val
-        end
-
-        if firstna == nlevels + 1
-            res.na[nlevels + 1] = true
-        end
-
-        return res
-    else
-        return DataArray(unique_values)
+    seenna = shift!(seen)
+    res = DataArray(levels(pda)[seen])
+    if seenna
+        push!(res, NA)
     end
+    res
 end
 
 #' @description
diff --git a/test/pooleddataarray.jl b/test/pooleddataarray.jl
index ce99b49..610c829 100644
--- a/test/pooleddataarray.jl
+++ b/test/pooleddataarray.jl
@@ -31,16 +31,20 @@ module TestPDA
     @assert levels(setlevels!(@pdata([1.0, 2.0]), [3,4])) == [3.0, 4.0]
 
     y = @pdata [1, NA, -2, 1, NA, 4, NA]
-    @assert isequal(unique(y), @pdata [1, NA, -2, 4])
-    @assert isequal(unique(reverse(y)), @data [NA, 4, 1, -2])
-    @assert isequal(unique(dropna(y)), @data [1, -2, 4])
-    @assert isequal(unique(reverse(dropna(y))), @data [4, 1, -2])
+    @assert isequal(unique(y), @pdata [-2, 1, 4, NA])
+    @assert isequal(unique(reverse(y)), @data [-2, 1, 4, NA])
+    @assert isequal(unique(dropna(y)), @data levels(dropna(y)))
+    @assert isequal(unique(reverse(dropna(y))), @data levels(reverse(dropna(y))))
 
     z = @pdata ["frank", NA, "gertrude", "frank", NA, "herbert", NA]
-    @assert isequal(unique(z), @pdata ["frank", NA, "gertrude", "herbert"])
-    @assert isequal(unique(reverse(z)), @pdata [NA, "herbert", "frank", "gertrude"])
-    @assert isequal(unique(dropna(z)), @pdata ["frank", "gertrude", "herbert"])
-    @assert isequal(unique(reverse(dropna(z))), @pdata ["herbert", "frank", "gertrude"])
+    @assert isequal(unique(z), @pdata ["frank", "gertrude", "herbert", NA])
+    @assert isequal(unique(reverse(z)), @pdata ["frank", "gertrude", "herbert", NA])
+    @assert isequal(unique(dropna(z)), @data levels(dropna(z)))
+    @assert isequal(unique(reverse(dropna(z))), @data levels(reverse(dropna(z))))
+
+    # check case where some levels are not present in data
+    z[3] = "frank"
+    @assert isequal(unique(z), @pdata ["frank", "herbert", NA])
 
     # check case where only NA occurs in final position
     @assert isequal(unique(@pdata [1, 2, 1, NA]), @pdata [1, 2, NA])