Skip to content
This repository was archived by the owner on May 4, 2019. It is now read-only.

Commit 696ae34

Browse files
committed
Merge pull request #106 from JuliaStats/sjk/pdasort
Sort NAs to last position for PooledDataArrays as well
2 parents 1359a53 + b5dee5f commit 696ae34

File tree

4 files changed: 45 additions and 22 deletions.

src/DataArrays.jl

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -38,7 +38,6 @@ module DataArrays
3838
padNA,
3939
pdata,
4040
percent_change,
41-
Perm,
4241
PooledDataArray,
4342
PooledDataMatrix,
4443
PooledDataVecs,

src/grouping.jl

Lines changed: 12 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
function groupsort_indexer(x::AbstractVector, ngroups::Integer)
1+
function groupsort_indexer(x::AbstractVector, ngroups::Integer, nalast::Bool=false)
22
# translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx).
33

44
# count group sizes, location 0 for NA
@@ -11,10 +11,17 @@ function groupsort_indexer(x::AbstractVector, ngroups::Integer)
1111

1212
# mark the start of each contiguous group of like-indexed data
1313
where = fill(1, ngroups + 1)
14-
for i = 2:ngroups+1
15-
where[i] = where[i - 1] + counts[i - 1]
14+
if nalast
15+
for i = 3:ngroups+1
16+
where[i] = where[i - 1] + counts[i - 1]
17+
end
18+
where[1] = where[end] + counts[end]
19+
else
20+
for i = 2:ngroups+1
21+
where[i] = where[i - 1] + counts[i - 1]
22+
end
1623
end
17-
24+
1825
# this is our indexer
1926
result = fill(0, n)
2027
for i = 1:n
@@ -25,4 +32,4 @@ function groupsort_indexer(x::AbstractVector, ngroups::Integer)
2532
result, where, counts
2633
end
2734

28-
groupsort_indexer(pv::PooledDataVector) = groupsort_indexer(pv.refs, length(pv.pool))
35+
groupsort_indexer(pv::PooledDataVector, nalast::Bool=false) = groupsort_indexer(pv.refs, length(pv.pool), nalast)

src/pooleddataarray.jl

Lines changed: 15 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -773,19 +773,24 @@ end
773773
##
774774
##############################################################################
775775

776-
# TODO handle sortperm for non-sorted keys
777-
Base.sortperm(pda::PooledDataArray) = groupsort_indexer(pda)[1]
778-
function Base.sortperm(pda::PooledDataArray)
779-
if issorted(pda.pool)
780-
return groupsort_indexer(pda)[1]
781-
else
782-
return sortperm(reorder!(copy(pda)))
776+
function Base.sortperm(pda::PooledDataArray; alg::Base.Sort.Algorithm=Base.Sort.DEFAULT_UNSTABLE,
777+
lt::Function=isless, by::Function=identity,
778+
rev::Bool=false, order=Base.Sort.Forward)
779+
order = Base.ord(lt, by, rev, order)
780+
781+
# TODO handle custom ordering efficiently
782+
if !isa(order, Base.Order.ForwardOrdering) && !isa(order, Base.Order.ReverseOrdering)
783+
return sort!([1:length(pda)], alg, Base.Order.Perm(order,pda))
783784
end
785+
786+
# TODO handle non-sorted keys without copying
787+
perm = issorted(pda.pool) ? groupsort_indexer(pda, true)[1] : sortperm(reorder(pda))
788+
isa(order, Base.Order.ReverseOrdering) && reverse!(perm)
789+
perm
784790
end
785791

786-
Base.sortperm(pda::PooledDataArray, ::Base.Sort.ReverseOrdering) = reverse(sortperm(pda))
787-
Base.sort(pda::PooledDataArray) = pda[sortperm(pda)]
788-
Base.sort(pda::PooledDataArray, ::Base.Sort.ReverseOrdering) = pda[reverse(sortperm(pda))]
792+
Base.sort(pda::PooledDataArray; kw...) = pda[sortperm(pda; kw...)]
793+
789794
type FastPerm{O<:Base.Sort.Ordering,V<:AbstractVector} <: Base.Sort.Ordering
790795
ord::O
791796
vec::V

test/sort.jl

Lines changed: 18 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,29 @@
11
module TestSort
22
using DataArrays, Base.Test
33

4+
dv1 = @data([9, 1, 8, NA, 3, 3, 7, NA])
5+
dv2 = 1.0 * dv1
6+
dv3 = DataArray([1:8])
7+
pdv1 = convert(PooledDataArray, dv1)
8+
9+
@test sortperm(dv1) == sortperm(dv2)
10+
@test sortperm(dv1) == sortperm(pdv1)
11+
@test isequal(sort(dv1), convert(DataArray, sort(dv1)))
12+
@test isequal(sort(dv1), convert(DataArray, sort(pdv1)))
13+
414
for T in (Float64, BigFloat)
515
n = 1000
616
na = randbool(n)
717
nna = sum(na)
818
a = Array(T, n)
919
ra = randn(n-nna)
1020
a[!na] = ra
11-
da = DataArray(a, na)
12-
@test isequal(sort(da), [DataArray(sort(dropna(da))), DataArray(T, nna)])
13-
@test isequal(da[sortperm(da)], [DataArray(sort(dropna(da))), DataArray(T, nna)])
14-
@test isequal(sort(da, rev=true), [DataArray(T, nna), DataArray(sort(dropna(da), rev=true))])
15-
@test isequal(da[sortperm(da, rev=true)], [DataArray(T, nna), DataArray(sort(dropna(da), rev=true))])
21+
for da in (DataArray(a, na), PooledDataArray(a, na), (pda = PooledDataArray(a, na); setlevels!(pda, shuffle!(pda.pool))))
22+
@test isequal(sort(da), [DataArray(sort(dropna(da))), DataArray(T, nna)])
23+
@test isequal(sort(da; lt=(x,y)->isless(x,y)), [DataArray(sort(dropna(da))), DataArray(T, nna)])
24+
@test isequal(da[sortperm(da)], [DataArray(sort(dropna(da))), DataArray(T, nna)])
25+
@test isequal(sort(da, rev=true), [DataArray(T, nna), DataArray(sort(dropna(da), rev=true))])
26+
@test isequal(da[sortperm(da, rev=true)], [DataArray(T, nna), DataArray(sort(dropna(da), rev=true))])
27+
end
28+
end
1629
end
17-
end

0 commit comments

Comments
 (0)