diff --git a/src/BlitSort.jl b/src/BlitSort.jl
new file mode 100644
index 0000000..639a534
--- /dev/null
+++ b/src/BlitSort.jl
@@ -0,0 +1,470 @@
+export BlitSort
+
+struct BlitSortAlg <: Algorithm end
+
+const BlitSort = maybe_optimize(BlitSortAlg())
+
+const BLIT_AUX = 512 # set to 0 for sqrt(n) cache size
+const BLIT_OUT = 96 # should be smaller than or equal to BLIT_AUX
+
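+# For illustration only — BlitSort plugs into the standard sort keyword interface like
+# the other algorithms exported by this package (hypothetical session, not part of the code):
+#
+#     using SortingAlgorithms
+#     v = rand(Int, 10_000)
+#     sort!(v; alg=BlitSort)              # in place; listed as stable in the test suite below
+#     sort(rand(10_000); alg=BlitSort)
+#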
+function blit_analyze!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, cmp::F) where {F}
+    @inbounds begin
+        half1 = nmemb ÷ 2
+        quad1 = half1 ÷ 2
+        quad2 = half1 - quad1
+        half2 = nmemb - half1
+        quad3 = half2 ÷ 2
+        quad4 = half2 - quad3
+
+        pta = array_index
+        ptb = array_index + asInt(quad1)
+        ptc = array_index + asInt(half1)
+        ptd = array_index + asInt(half1 + quad3)
+
+        astreaks = bstreaks = cstreaks = dstreaks = zero(UInt)
+        abalance::UInt = bbalance::UInt = cbalance::UInt = dbalance::UInt = zero(UInt)
+
+        cnt = nmemb
+        while cnt > 132
+            asum::UInt8 = bsum::UInt8 = csum::UInt8 = dsum::UInt8 = 0
+            for _ in 32:-1:1
+                asum += cmp(array[pta], array[pta+1]) > 0; pta += 1
+                bsum += cmp(array[ptb], array[ptb+1]) > 0; ptb += 1
+                csum += cmp(array[ptc], array[ptc+1]) > 0; ptc += 1
+                dsum += cmp(array[ptd], array[ptd+1]) > 0; ptd += 1
+            end
+            abalance += asum; astreaks += asum = (asum == 0) | (asum == 32)
+            bbalance += bsum; bstreaks += bsum = (bsum == 0) | (bsum == 32)
+            cbalance += csum; cstreaks += csum = (csum == 0) | (csum == 32)
+            dbalance += dsum; dstreaks += dsum = (dsum == 0) | (dsum == 32)
+
+            if cnt > 516 && asum + bsum + csum + dsum == 0
+                abalance += 48; pta += 96
+                bbalance += 48; ptb += 96
+                cbalance += 48; ptc += 96
+                dbalance += 48; ptd += 96
+                cnt -= 384
+            end
+
+            cnt -= 128
+        end
+
+        for _ in cnt:-4:UInt(8)
+            abalance += cmp(array[pta], array[pta+1]) > 0; pta += 1
+            bbalance += cmp(array[ptb], array[ptb+1]) > 0; ptb += 1
+            cbalance += cmp(array[ptc], array[ptc+1]) > 0; ptc += 1
+            dbalance += cmp(array[ptd], array[ptd+1]) > 0; ptd += 1
+        end
+
+        quad1 < quad2 && (bbalance += cmp(array[ptb], array[ptb+1]) > 0; ptb += 1)
+        quad1 < quad3 && (cbalance += cmp(array[ptc], array[ptc+1]) > 0; ptc += 1)
+        quad1 < quad4 && (dbalance += cmp(array[ptd], array[ptd+1]) > 0; ptd += 1)
+
+        cnt = abalance + bbalance + cbalance + dbalance
+
+        cnt == 0 && cmp(array[pta], array[pta+1]) ≤ 0 &&
+            cmp(array[ptb], array[ptb+1]) ≤ 0 && cmp(array[ptc], array[ptc+1]) ≤ 0 && return
+
+        abool = quad1 - abalance == 1
+        bbool = quad2 - bbalance == 1
+        cbool = quad3 - cbalance == 1
+        dbool = quad4 - dbalance == 1
+
+        if abool | bbool | cbool | dbool
+            span1 = (abool && bbool) * (cmp(array[pta], array[pta+1]) > 0)
+            span2 = (bbool && cbool) * (cmp(array[ptb], array[ptb+1]) > 0)
+            span3 = (cbool && dbool) * (cmp(array[ptc], array[ptc+1]) > 0)
+
+            tmp = span1 | span2 * 2 | span3 * 4
+            if tmp == 1
+                quad_reversal!(array, array_index, ptb)
+                abalance = bbalance = 0
+            elseif tmp == 2
+                quad_reversal!(array, pta + 1, ptc)
+                bbalance = cbalance = 0
+            elseif tmp == 3
+                quad_reversal!(array, array_index, ptc)
+                abalance = bbalance = cbalance = 0
+            elseif tmp == 4
+                quad_reversal!(array, ptb + 1, ptd)
+                cbalance = dbalance = 0
+            elseif tmp == 5
+                quad_reversal!(array, array_index, ptb)
+                quad_reversal!(array, ptb + 1, ptd)
+                abalance = bbalance = cbalance = dbalance = 0
+            elseif tmp == 6
+                quad_reversal!(array, pta + 1, ptd)
+                bbalance = cbalance = dbalance = 0
+            elseif tmp == 7
+                quad_reversal!(array, array_index, ptd)
+                return
+            end
+
+            abool && !iszero(abalance) && (quad_reversal!(array, array_index, pta); abalance = 0)
+            bbool && !iszero(bbalance) && (quad_reversal!(array, pta + 1, ptb); bbalance = 0)
+            cbool && !iszero(cbalance) && (quad_reversal!(array, ptb + 1, ptc); cbalance = 0)
+            dbool && !iszero(dbalance) && (quad_reversal!(array, ptc + 1, ptd); dbalance = 0)
+        end
+
+        cnt = nmemb ÷ 256 # more than 50% ordered
+        abool = astreaks > cnt
+        bbool = bstreaks > cnt
+        cbool = cstreaks > cnt
+        dbool = dstreaks > cnt
+
+        tmp = abool + 2bbool + 4cbool + 8dbool
+        if tmp == 0
+            blit_partition!(array, array_index, swap, swap_index, swap_size, nmemb, cmp)
+            return
+        elseif tmp == 1
+            iszero(abalance) || quadsort_swap!(array, array_index, swap, swap_index, swap_size, quad1, cmp)
+            blit_partition!(array, pta + 1, swap, swap_index, swap_size, quad2 + half2, cmp)
+        elseif tmp == 2
+            blit_partition!(array, array_index, swap, swap_index, swap_size, quad1, cmp)
+            iszero(bbalance) || quadsort_swap!(array, pta + 1, swap, swap_index, swap_size, quad2, cmp)
+            blit_partition!(array, ptb + 1, swap, swap_index, swap_size, half2, cmp)
+        elseif tmp == 3
+            iszero(abalance) || quadsort_swap!(array, array_index, swap, swap_index, swap_size, quad1, cmp)
+            iszero(bbalance) || quadsort_swap!(array, pta + 1, swap, swap_index, swap_size, quad2, cmp)
+            blit_partition!(array, ptb + 1, swap, swap_index, swap_size, half2, cmp)
+        elseif tmp == 4
+            blit_partition!(array, array_index, swap, swap_index, swap_size, half1, cmp)
+            iszero(cbalance) || quadsort_swap!(array, ptb + 1, swap, swap_index, swap_size, quad3, cmp)
+            blit_partition!(array, ptc + 1, swap, swap_index, swap_size, quad4, cmp)
+        elseif tmp == 8
+            blit_partition!(array, array_index, swap, swap_index, swap_size, half1 + quad3, cmp)
+            iszero(dbalance) || quadsort_swap!(array, ptc + 1, swap, swap_index, swap_size, quad4, cmp)
+        elseif tmp == 9
+            iszero(abalance) || quadsort_swap!(array, array_index, swap, swap_index, swap_size, quad1, cmp)
+            blit_partition!(array, pta + 1, swap, swap_index, swap_size, quad2 + quad3, cmp)
+            iszero(dbalance) || quadsort_swap!(array, ptc + 1, swap, swap_index, swap_size, quad4, cmp)
+        elseif tmp == 12
+            blit_partition!(array, array_index, swap, swap_index, swap_size, half1, cmp)
+            iszero(cbalance) || quadsort_swap!(array, ptb + 1, swap, swap_index, swap_size, quad3, cmp)
+            iszero(dbalance) || quadsort_swap!(array, ptc + 1, swap, swap_index, swap_size, quad4, cmp)
+        else
+            if abool
+                iszero(abalance) || quadsort_swap!(array, array_index, swap, swap_index, swap_size, quad1, cmp)
+            else
+                blit_partition!(array, array_index, swap, swap_index, swap_size, quad1, cmp)
+            end
+            if bbool
+                iszero(bbalance) || quadsort_swap!(array, pta + 1, swap, swap_index, swap_size, quad2, cmp)
+            else
+                blit_partition!(array, pta + 1, swap, swap_index, swap_size, quad2, cmp)
+            end
+            if cbool
+                iszero(cbalance) || quadsort_swap!(array, ptb + 1, swap, swap_index, swap_size, quad3, cmp)
+            else
+                blit_partition!(array, ptb + 1, swap, swap_index, swap_size, quad3, cmp)
+            end
+            if dbool
+                iszero(dbalance) || quadsort_swap!(array, ptc + 1, swap, swap_index, swap_size, quad4, cmp)
+            else
+                blit_partition!(array, ptc + 1, swap, swap_index, swap_size, quad4, cmp)
+            end
+        end
+
+        if cmp(array[pta], array[pta+1]) ≤ 0
+            if cmp(array[ptc], array[ptc+1]) ≤ 0
+                cmp(array[ptb], array[ptb+1]) ≤ 0 && return
+            else
+                rotate_merge_block!(array, array_index + asInt(half1), swap, swap_index, swap_size, quad3, quad4, cmp)
+            end
+        else
+            rotate_merge_block!(array, array_index, swap, swap_index, swap_size, quad1, quad2, cmp)
+            cmp(array[ptc], array[ptc+1]) > 0 &&
+                rotate_merge_block!(array, array_index + asInt(half1), swap, swap_index, swap_size, quad3, quad4, cmp)
+        end
+        rotate_merge_block!(array, array_index, swap, swap_index, swap_size, half1, half2, cmp)
+    end
+end
+
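+# blit_analyze! above samples each of the four quadrants in 32-element windows: the
+# `*balance` counters tally out-of-order adjacent pairs, while the `*streaks` counters
+# tally windows that are fully ordered or fully reversed. Quadrants that look mostly
+# ordered are handed to quadsort_swap!, the rest to blit_partition!, and the sorted
+# pieces are then stitched together with rotate merges.
+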
+# The next 4 functions are used for pivot selection
+
+function blit_binary_median(arraya, pta::Int, arrayb, ptb::Int, len::UInt, cmp::F) where {F}
+    @inbounds begin
+        while !iszero(len ÷= 2)
+            leni = asInt(len)
+            if cmp(arraya[pta+leni], arrayb[ptb+leni]) ≤ 0
+                pta += leni
+            else
+                ptb += leni
+            end
+        end
+        aa = arraya[pta]
+        bb = arrayb[ptb]
+        return ifelse(cmp(aa, bb) > 0, aa, bb)
+    end
+end
+
+function blit_trim_four!(array, pta::Int, cmp::F) where {F}
+    @inbounds begin
+        x = cmp(array[pta], array[pta+1]) > 0
+        array[pta], array[pta+1] = array[pta+x], array[pta+!x]
+        pta += 2
+
+        x = cmp(array[pta], array[pta+1]) > 0
+        array[pta], array[pta+1] = array[pta+x], array[pta+!x]
+        pta -= 2
+
+        x = 2(cmp(array[pta], array[pta+2]) ≤ 0)
+        array[pta+2] = array[pta+x]
+        pta += 1
+        x = 2(cmp(array[pta], array[pta+2]) > 0)
+        array[pta] = array[pta+x]
+    end
+end
+
+function blit_median_of_nine!(array, array_index::Int, swap, swap_index::Int, nmemb::UInt, cmp::F) where {F}
+    @inbounds begin
+        z = asInt(nmemb ÷ 9)
+
+        pta = array_index
+
+        for x in swap_index:swap_index+8
+            swap[x] = array[pta]
+            pta += z
+        end
+
+        blit_trim_four!(swap, swap_index, cmp)
+        blit_trim_four!(swap, swap_index + 4, cmp)
+
+        swap[swap_index] = swap[swap_index+5]
+        swap[swap_index+3] = swap[swap_index+8]
+
+        blit_trim_four!(swap, swap_index, cmp)
+
+        swap[swap_index] = swap[swap_index+6]
+
+        x = cmp(swap[swap_index], swap[swap_index+1]) > 0
+        y = cmp(swap[swap_index], swap[swap_index+2]) > 0
+        z = cmp(swap[swap_index+1], swap[swap_index+2]) > 0
+
+        return swap[swap_index + (x == y) + (y ⊻ z)]
+    end
+end
+
+Base.@assume_effects :nothrow function blit_median_of_cbrt!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt,
+    nmemb::UInt, cmp::F) where {F}
+    @inbounds begin
+        cbrt = UInt(32) # TODO: figure out how to write this more efficiently using bsr
+        while nmemb > cbrt * cbrt * cbrt && cbrt < swap_size
+            cbrt *= 2
+        end
+
+        div = asInt(nmemb ÷ cbrt)
+
+        pta = array_index
+        pts = swap_index
+
+        for cnt in 0:Core.bitcast(Int, cbrt)-1
+            swap[pts+cnt] = array[pta]
+            pta += div
+        end
+        pta = pts
+        ptb = pts + asInt(cbrt ÷ 2)
+
+        for cnt in cbrt÷8:-1:1
+            blit_trim_four!(swap, pta, cmp)
+            blit_trim_four!(swap, ptb, cmp)
+
+            swap[pta] = swap[ptb+1]
+            swap[pta+3] = swap[ptb+2]
+
+            pta += 4
+            ptb += 4
+        end
+        cbrt ÷= 4
+
+        quadsort_swap!(swap, pts, swap, pts + 2asInt(cbrt), cbrt, cbrt, cmp)
+        quadsort_swap!(swap, pts + asInt(cbrt), swap, pts + 2asInt(cbrt), cbrt, cbrt, cmp)
+
+        return cmp(swap[pts+2asInt(cbrt)-1], swap[pts]) ≤ 0,
+            blit_binary_median(swap, pts, swap, pts + asInt(cbrt), cbrt, cmp)
+    end
+end
+
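+# Pivot selection in a nutshell: small partitions use a pseudomedian of nine evenly
+# spaced samples; larger ones take roughly cbrt(n) samples, sort the two halves of the
+# sample buffer, and return a binary median plus a flag that is true when the samples
+# suggest many equal keys (the `generic` case, which falls back to quadsort). A rough,
+# purely illustrative standalone version of the median-of-nine idea:
+#
+#     samples = v[round.(Int, range(firstindex(v), lastindex(v), length=9))]
+#     pivot = partialsort!(samples, 5)    # the middle of nine samples
+#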
+# As per suggestion by Marshall Lochbaum to improve generic data handling
+function blit_reverse_partition!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, piv, nmemb::UInt, cmp::F) where {F}
+    @inbounds begin
+        if nmemb > swap_size
+            h = nmemb ÷ 2
+
+            l = blit_reverse_partition!(array, array_index, swap, swap_index, swap_size, piv, h, cmp)
+            r = blit_reverse_partition!(array, array_index + asInt(h), swap, swap_index, swap_size, piv, nmemb - h, cmp)
+
+            trinity_rotation!(array, array_index + asInt(l), swap, swap_index, swap_size, h - l + r, h - l)
+
+            return l + r
+        end
+
+        ptx = array_index
+        pta = array_index
+        pts = swap_index
+
+        for _ in nmemb÷4:-1:1
+            @unroll 4 begin
+                if cmp(piv, array[ptx]) > 0
+                    array[pta] = array[ptx]
+                    pta += 1
+                else
+                    swap[pts] = array[ptx]
+                    pts += 1
+                end
+                ptx += 1
+            end
+        end
+
+        for _ in nmemb%4:-1:1
+            if cmp(piv, array[ptx]) > 0
+                array[pta] = array[ptx]
+                pta += 1
+            else
+                swap[pts] = array[ptx]
+                pts += 1
+            end
+            ptx += 1
+        end
+        m = pta - array_index
+
+        _unsafe_copyto!(array, pta, swap, swap_index, nmemb - m)
+
+        return asUInt(m)
+    end
+end
+
+function blit_default_partition!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, piv, nmemb::UInt, cmp::F) where {F}
+    @inbounds begin
+        if nmemb > swap_size
+            h = nmemb ÷ 2
+
+            l = blit_default_partition!(array, array_index, swap, swap_index, swap_size, piv, h, cmp)
+            r = blit_default_partition!(array, array_index + asInt(h), swap, swap_index, swap_size, piv, nmemb - h, cmp)
+
+            trinity_rotation!(array, array_index + asInt(l), swap, swap_index, swap_size, h - l + r, h - l)
+
+            return l + r
+        end
+
+        ptx = array_index
+        pta = array_index
+        pts = swap_index
+
+        for _ in nmemb÷4:-1:one(UInt)
+            @unroll 4 begin
+                if cmp(array[ptx], piv) ≤ 0
+                    array[pta] = array[ptx]
+                    pta += 1
+                else
+                    swap[pts] = array[ptx]
+                    pts += 1
+                end
+                ptx += 1
+            end
+        end
+
+        for _ in nmemb%4:-1:one(UInt)
+            if cmp(array[ptx], piv) ≤ 0
+                array[pta] = array[ptx]
+                pta += 1
+            else
+                swap[pts] = array[ptx]
+                pts += 1
+            end
+            ptx += 1
+        end
+        m = pta - array_index
+
+        _unsafe_copyto!(array, pta, swap, swap_index, nmemb - m)
+
+        return asUInt(m)
+    end
+end
+
+Base.@assume_effects :nothrow function blit_partition!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt,
+    nmemb::UInt, cmp::F) where {F}
+    @inbounds begin
+        a_size = zero(UInt)
+        local max
+
+        while true
+            if nmemb ≤ 2048
+                piv = blit_median_of_nine!(array, array_index, swap, swap_index, nmemb, cmp)
+            else
+                generic, piv = blit_median_of_cbrt!(array, array_index, swap, swap_index, swap_size, nmemb, cmp)
+
+                if generic
+                    quadsort_swap!(array, array_index, swap, swap_index, swap_size, nmemb, cmp)
+                    return
+                end
+            end
+
+            if !iszero(a_size) && cmp(max, piv) ≤ 0
+                a_size = blit_reverse_partition!(array, array_index, swap, swap_index, swap_size, piv, nmemb, cmp)
+                s_size = nmemb - a_size
+
+                if s_size ≤ a_size÷16 || a_size ≤ BLIT_OUT
+                    quadsort_swap!(array, array_index, swap, swap_index, swap_size, a_size, cmp)
+                    return
+                end
+                nmemb = a_size
+                a_size = zero(UInt)
+                continue
+            end
+
+            a_size = blit_default_partition!(array, array_index, swap, swap_index, swap_size, piv, nmemb, cmp)
+            s_size = nmemb - a_size
+
+            if a_size ≤ s_size÷16 || s_size ≤ BLIT_OUT
+                if iszero(s_size)
+                    a_size = blit_reverse_partition!(array, array_index, swap, swap_index, swap_size, piv, a_size, cmp)
+                    s_size = nmemb - a_size
+
+                    if s_size ≤ a_size÷16 || a_size ≤ BLIT_OUT
+                        return quadsort_swap!(array, array_index, swap, swap_index, swap_size, a_size, cmp)
+                    end
+                    nmemb = a_size
+                    a_size = zero(UInt)
+                    continue
+                end
+                quadsort_swap!(array, array_index + asInt(a_size), swap, swap_index, swap_size, s_size, cmp)
+            else
+                blit_partition!(array, array_index + asInt(a_size), swap, swap_index, swap_size, s_size, cmp)
+            end
+
+            if s_size ≤ a_size÷16 || a_size ≤ BLIT_OUT
+                quadsort_swap!(array, array_index, swap, swap_index, swap_size, a_size, cmp)
+                return
+            end
+            nmemb = a_size
+            max = piv
+        end
+    end
+end
+
+function sort!(array::AbstractVector, lo::Int, hi::Int, ::BlitSortAlg, o::Ordering)
+    len = UInt(hi - lo + 1)
+    if len ≤ 132
+        sort!(array, lo, hi, QuadSortAlg(), o)
+    else
+        cmp = let o=o; (x, y) -> lt(o, y, x) end
+
+        if !iszero(BLIT_AUX)
+            swap_size = UInt(BLIT_AUX)
+        else
+            swap_size = one(UInt) << 19
+
+            while len ÷ swap_size < swap_size ÷ 128
+                swap_size ÷= 4
+            end
+        end
+        @with_stackvec swap Int(swap_size) eltype(array) begin
+            blit_analyze!(array, lo, swap, firstindex(swap), swap_size, len, cmp)
+        end
+    end
+    return array
+end
+
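+# The partition routines above are stable: kept elements are compacted towards the
+# front of `array` while the rest stream into `swap` and are copied back behind them.
+# A minimal sketch of that idea, assuming a caller-provided buffer `buf` and this
+# file's Bool-valued cmp convention (illustrative, not the code used here):
+#
+#     function stable_partition!(v, buf, piv, cmp)
+#         j, k = firstindex(v) - 1, firstindex(buf) - 1
+#         for i in eachindex(v)
+#             x = v[i]
+#             cmp(x, piv) ≤ 0 ? (v[j += 1] = x) : (buf[k += 1] = x)
+#         end
+#         copyto!(v, j + 1, buf, firstindex(buf), k - firstindex(buf) + 1)
+#         return j - firstindex(v) + 1   # number of elements ≤ piv
+#     end
+#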
+blitsort_swap!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, cmp::F) where {F} =
+    (nmemb ≤ 132 ? quadsort_swap! : blit_analyze!)(array, array_index, swap, swap_index, swap_size, nmemb, cmp)
\ No newline at end of file
diff --git a/src/QuadSort.jl b/src/QuadSort.jl
new file mode 100644
index 0000000..58804c9
--- /dev/null
+++ b/src/QuadSort.jl
@@ -0,0 +1,951 @@
+export QuadSort
+
+struct QuadSortAlg <: Algorithm end
+
+const QuadSort = maybe_optimize(QuadSortAlg())
+
+# We remove all the branchless stuff. It actually doesn't do any good (we cannot do the clang-branchless optimization without
+# introducing a lot of references, which makes everything much slower; and if we use the gcc-branchless workaround, the
+# benchmarked performance is slightly worse).
+# While branchless avoids branch mispredictions, we still have dependency chains; and Julia might sometimes decide to remove
+# the branch anyway, but mimicking it with ifelse makes it harder for the compiler to do any optimization.
+macro head_branchless_merge(arrayd, ptd, arrayl, ptl, arrayr, ptr, cmp)
+    esc(quote
+        let pl = $arrayl[$ptl], pr = $arrayr[$ptr]
+            $arrayd[$ptd] = if $cmp(pl, pr) ≤ 0
+                $ptl += 1
+                pl
+            else
+                $ptr += 1
+                pr
+            end
+        end
+        $ptd += 1
+    end)
+end
+
+macro tail_branchless_merge(arrayd, tpd, arrayl, tpl, arrayr, tpr, cmp)
+    esc(quote
+        let pl = $arrayl[$tpl], pr = $arrayr[$tpr]
+            $arrayd[$tpd] = if $cmp(pl, pr) > 0
+                $tpl -= 1
+                pl
+            else
+                $tpr -= 1
+                pr
+            end
+        end
+        $tpd -= 1
+    end)
+end
+
+macro getcmp(array, indexp1, fn, indexp2, cmp)
+    esc(:(let a1 = $array[$indexp1], a2 = $array[$indexp2]
+        ifelse($fn($cmp(a1, a2), 0), a1, a2) # here we keep it branchless, giving a minimal advantage over ternary
+    end))
+end
+
+@inline function parity_merge_two!(array, array_index::Int, swap, swap_index::Int, cmp::F) where {F}
+    @inbounds begin
+        ptl = array_index; ptr = array_index + 2; pts = swap_index
+        @head_branchless_merge(swap, pts, array, ptl, array, ptr, cmp)
+        swap[pts] = @getcmp(array, ptl, ≤, ptr, cmp)
+        ptl = array_index + 1; ptr = array_index + 3; pts = swap_index + 3
+        @tail_branchless_merge(swap, pts, array, ptl, array, ptr, cmp)
+        swap[pts] = @getcmp(array, ptl, >, ptr, cmp)
+        nothing
+    end
+end
+
+@inline function parity_merge_four!(array, array_index::Int, swap, swap_index::Int, cmp::F) where {F}
+    @inbounds begin
+        ptl = array_index; ptr = array_index + 4; pts = swap_index
+        @head_branchless_merge(swap, pts, array, ptl, array, ptr, cmp)
+        @head_branchless_merge(swap, pts, array, ptl, array, ptr, cmp)
+        @head_branchless_merge(swap, pts, array, ptl, array, ptr, cmp)
+        swap[pts] = @getcmp(array, ptl, ≤, ptr, cmp)
+        ptl = array_index + 3; ptr = array_index + 7; pts = swap_index + 7
+        @tail_branchless_merge(swap, pts, array, ptl, array, ptr, cmp)
+        @tail_branchless_merge(swap, pts, array, ptl, array, ptr, cmp)
+        @tail_branchless_merge(swap, pts, array, ptl, array, ptr, cmp)
+        swap[pts] = @getcmp(array, ptl, >, ptr, cmp)
+        nothing
+    end
+end
+
+@inline function swap_branchless!(array, array_index::Int, cmp::F,
+    x::Bool=@inbounds(cmp(array[array_index], array[array_index+1]) > 0)) where {F}
+    y = !x
+    @inbounds array[array_index], array[array_index+1] = array[array_index+x], array[array_index+y]
+    nothing
+end
+
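+# What one @head_branchless_merge step does, expanded by hand (illustration only):
+#
+#     pl, pr = arrayl[ptl], arrayr[ptr]
+#     if cmp(pl, pr) ≤ 0
+#         arrayd[ptd] = pl; ptl += 1
+#     else
+#         arrayd[ptd] = pr; ptr += 1
+#     end
+#     ptd += 1
+#
+# i.e. a single guarded merge step; @tail_branchless_merge is the mirror image,
+# working backwards from the ends of both runs.
+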
+# the next seven functions are used for sorting 0 to 31 elements
+function tiny_sort!(array, array_index::Int, nmemb::UInt, cmp::F) where {F}
+    @inbounds if nmemb == 4
+        # This is really just @inline quad_swap_four!(array, array_index, cmp)
+        # However, for some weird reason, the profiler reports lots of GC and dynamic dispatch going on in either tiny_sort! or
+        # quad_swap_four! unless we copy the code verbatim. Very strange and cannot be verified in the assembly code; however,
+        # by inlining manually, it actually runs faster.
+        swap_branchless!(array, array_index, cmp)
+        swap_branchless!(array, array_index + 2, cmp)
+        array_index += 1
+
+        a0 = array[array_index]
+        a1 = array[array_index+1]
+        if cmp(a0, a1) > 0
+            array[array_index] = a1
+            array[array_index+1] = a0
+            array_index -= 1
+
+            swap_branchless!(array, array_index, cmp)
+            swap_branchless!(array, array_index + 2, cmp)
+            swap_branchless!(array, array_index + 1, cmp)
+        end
+    elseif nmemb == 3
+        swap_branchless!(array, array_index, cmp)
+        swap_branchless!(array, array_index + 1, cmp)
+        swap_branchless!(array, array_index, cmp)
+    elseif nmemb == 2
+        swap_branchless!(array, array_index, cmp)
+    end
+end
+
+# this function requires a minimum offset of 2 to work properly
+function twice_unguarded_insert!(array, array_index::Int, offset::UInt, nmemb::UInt, cmp::F) where {F}
+    @inbounds for i in asInt(offset):asInt(nmemb)-1
+        end_ = array_index + i
+        pta = end_
+        cmp(array[pta -= 1], array[end_]) ≤ 0 && continue
+        key = array[end_]
+        if cmp(array[array_index+1], key) > 0
+            top = i - 1
+            while true
+                array[end_] = array[pta]; pta -= 1; end_ -= 1
+                iszero(top -= 1) && break
+            end
+            array[end_] = key
+            end_ -= 1
+        else
+            while true
+                array[end_] = array[pta]; pta -= 1; end_ -= 1
+                array[end_] = array[pta]; pta -= 1; end_ -= 1
+                cmp(array[pta], key) > 0 || break
+            end
+            array[end_] = array[end_+1]
+            array[end_+1] = key
+        end
+        swap_branchless!(array, end_, cmp)
+    end
+end
+
+function quad_swap_four!(array, array_index::Int, cmp::F) where {F}
+    @inbounds begin
+        swap_branchless!(array, array_index, cmp)
+        swap_branchless!(array, array_index + 2, cmp)
+        array_index += 1
+
+        a0 = array[array_index]
+        a1 = array[array_index+1]
+        if cmp(a0, a1) > 0
+            array[array_index] = a1
+            array[array_index+1] = a0
+            array_index -= 1
+
+            swap_branchless!(array, array_index, cmp)
+            swap_branchless!(array, array_index + 2, cmp)
+            swap_branchless!(array, array_index + 1, cmp)
+        end
+        nothing
+    end
+end
+
+function parity_swap_eight!(array, array_index::Int, swap, swap_index::Int, cmp::F) where {F}
+    @inbounds begin
+        swap_branchless!(array, array_index, cmp)
+        swap_branchless!(array, array_index + 2, cmp)
+        swap_branchless!(array, array_index + 4, cmp)
+        swap_branchless!(array, array_index + 6, cmp)
+
+        cmp(array[array_index+1], array[array_index+2]) ≤ 0 &&
+            cmp(array[array_index+3], array[array_index+4]) ≤ 0 &&
+            cmp(array[array_index+5], array[array_index+6]) ≤ 0 && return
+        parity_merge_two!(array, array_index, swap, swap_index, cmp)
+        parity_merge_two!(array, array_index + 4, swap, swap_index + 4, cmp)
+        parity_merge_four!(swap, swap_index, array, array_index, cmp)
+    end
+end
+
+# left must be equal to or one smaller than right
+function parity_merge!(dest, dest_index::Int, from, from_index::Int, left::UInt, right::UInt, cmp::F) where {F}
+    @inbounds begin
+        ptl = from_index
+        ptr = from_index + asInt(left)
+        ptd = dest_index
+        tpl = ptr - 1
+        tpr = tpl + asInt(right)
+        tpd = dest_index + asInt(left + right) - 1
+        left < right && @head_branchless_merge(dest, ptd, from, ptl, from, ptr, cmp)
+        @head_branchless_merge(dest, ptd, from, ptl, from, ptr, cmp)
+        while !iszero(left -= 1)
+            @head_branchless_merge(dest, ptd, from, ptl, from, ptr, cmp)
+            @tail_branchless_merge(dest, tpd, from, tpl, from, tpr, cmp)
+        end
+        dest[tpd] = @getcmp(from, tpl, >, tpr, cmp)
+        nothing
+    end
+end
+
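+# parity_merge! consumes the two runs from both ends simultaneously: with
+# left == right == n, n head steps and n tail steps write every slot of `dest`
+# exactly once, so no bounds checks are needed mid-merge; with left == right - 1,
+# the extra head step before the loop restores the parity.
+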
+function parity_swap_sixteen!(array, array_index::Int, swap, swap_index::Int, cmp::F) where {F}
+    @inbounds begin
+        quad_swap_four!(array, array_index, cmp)
+        quad_swap_four!(array, array_index + 4, cmp)
+        quad_swap_four!(array, array_index + 8, cmp)
+        quad_swap_four!(array, array_index + 12, cmp)
+        cmp(array[array_index+3], array[array_index+4]) ≤ 0 &&
+            cmp(array[array_index+7], array[array_index+8]) ≤ 0 &&
+            cmp(array[array_index+11], array[array_index+12]) ≤ 0 && return
+        parity_merge_four!(array, array_index, swap, swap_index, cmp)
+        parity_merge_four!(array, array_index + 8, swap, swap_index + 8, cmp)
+        parity_merge!(array, array_index, swap, swap_index, UInt(8), UInt(8), cmp)
+    end
+end
+
+function tail_swap!(array, array_index::Int, swap, swap_index::Int, nmemb::UInt, cmp::F) where {F}
+    @inbounds begin
+        if nmemb < 5
+            tiny_sort!(array, array_index, nmemb, cmp)
+            return
+        end
+        if nmemb < 8
+            quad_swap_four!(array, array_index, cmp)
+            twice_unguarded_insert!(array, array_index, UInt(4), nmemb, cmp)
+            return
+        end
+        if nmemb < 12
+            parity_swap_eight!(array, array_index, swap, swap_index, cmp)
+            twice_unguarded_insert!(array, array_index, UInt(8), nmemb, cmp)
+            return
+        end
+        if 16 ≤ nmemb < 24
+            parity_swap_sixteen!(array, array_index, swap, swap_index, cmp)
+            twice_unguarded_insert!(array, array_index, UInt(16), nmemb, cmp)
+            return
+        end
+        half1 = asInt(nmemb ÷ 2)
+        quad1 = half1 ÷ 2
+        quad2 = half1 - quad1
+
+        half2 = asInt(nmemb - half1)
+        quad3 = half2 ÷ 2
+        quad4 = half2 - quad3
+
+        pta = array_index
+        tail_swap!(array, pta, swap, swap_index, asUInt(quad1), cmp); pta += quad1
+        tail_swap!(array, pta, swap, swap_index, asUInt(quad2), cmp); pta += quad2
+        tail_swap!(array, pta, swap, swap_index, asUInt(quad3), cmp); pta += quad3
+        tail_swap!(array, pta, swap, swap_index, asUInt(quad4), cmp)
+
+        cmp(array[array_index+quad1-1], array[array_index+quad1]) ≤ 0 &&
+            cmp(array[array_index+half1-1], array[array_index+half1]) ≤ 0 &&
+            cmp(array[pta-1], array[pta]) ≤ 0 && return
+
+        parity_merge!(swap, swap_index, array, array_index, asUInt(quad1), asUInt(quad2), cmp)
+        parity_merge!(swap, swap_index + half1, array, array_index + half1, asUInt(quad3), asUInt(quad4), cmp)
+        parity_merge!(array, array_index, swap, swap_index, asUInt(half1), asUInt(half2), cmp)
+    end
+end
+
+# the next three functions create sorted blocks of 32 elements
+function quad_reversal!(array, pta::Int, ptz::Int)
+    @inbounds begin
+        loop = (ptz - pta) ÷ 2
+
+        ptb = pta + loop
+        pty = ptz - loop
+        if iseven(loop)
+            array[ptb], array[pty] = array[pty], array[ptb]
+            ptb -= 1; pty += 1
+            loop -= 1
+        end
+        loop ÷= 2
+        while true
+            array[pta], array[ptz] = array[ptz], array[pta]
+            pta += 1; ptz -= 1
+            array[ptb], array[pty] = array[pty], array[ptb]
+            ptb -= 1; pty += 1
+
+            iszero(loop) && break
+            loop -= 1
+        end
+        nothing
+    end
+end
+
+function quad_swap_merge!(array, array_index::Int, swap, swap_index::Int, cmp::F) where {F}
+    parity_merge_two!(array, array_index, swap, swap_index, cmp)
+    parity_merge_two!(array, array_index + 4, swap, swap_index + 4, cmp)
+    parity_merge_four!(swap, swap_index, array, array_index, cmp)
+end
+
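+# quad_swap! below turns the array into sorted 32-element blocks. It walks 8-element
+# chunks through three states — unordered, fully ordered, fully reversed — implemented
+# with @goto, so long ascending or descending runs are detected cheaply (descending
+# runs are flipped in place by quad_reversal!). It returns true iff the whole input
+# was a single descending run and is therefore already sorted after the flip.
+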
+function quad_swap!(array, array_index::Int, nmemb::UInt, cmp::F) where {F}
+    @inbounds(@with_stackvec swap 32 eltype(array) begin
+        pta = array_index
+        count = nmemb ÷ 8
+        while !iszero(count)
+            count -= 1
+            v1 = cmp(array[pta], array[pta+1]) > 0
+            v2 = cmp(array[pta+2], array[pta+3]) > 0
+            v3 = cmp(array[pta+4], array[pta+5]) > 0
+            v4 = cmp(array[pta+6], array[pta+7]) > 0
+
+            tmp = v1 + 2v2 + 4v3 + 8v4
+            if tmp == 0
+                cmp(array[pta+1], array[pta+2]) ≤ 0 &&
+                    cmp(array[pta+3], array[pta+4]) ≤ 0 &&
+                    cmp(array[pta+5], array[pta+6]) ≤ 0 &&
+                    @goto ordered
+                quad_swap_merge!(array, pta, swap, firstindex(swap), cmp)
+            else
+                if tmp == 15 && cmp(array[pta+1], array[pta+2]) > 0 &&
+                    cmp(array[pta+3], array[pta+4]) > 0 &&
+                    cmp(array[pta+5], array[pta+6]) > 0
+                    pts = pta
+                    @goto reversed
+                end
+
+                @label not_ordered
+                swap_branchless!(array, pta, cmp, v1)
+                swap_branchless!(array, pta + 2, cmp, v2)
+                swap_branchless!(array, pta + 4, cmp, v3)
+                swap_branchless!(array, pta + 6, cmp, v4)
+
+                quad_swap_merge!(array, pta, swap, firstindex(swap), cmp)
+            end
+
+            pta += 8
+            continue
+
+            @label ordered
+            pta += 8
+            if !iszero(count)
+                count -= 1
+                if (v1 = cmp(array[pta], array[pta+1]) > 0) | (v2 = cmp(array[pta+2], array[pta+3]) > 0) |
+                    (v3 = cmp(array[pta+4], array[pta+5]) > 0) | (v4 = cmp(array[pta+6], array[pta+7]) > 0)
+                    if v1 + v2 + v3 + v4 == 4 && cmp(array[pta+1], array[pta+2]) > 0 &&
+                        cmp(array[pta+3], array[pta+4]) > 0 && cmp(array[pta+5], array[pta+6]) > 0
+                        pts = pta
+                        @goto reversed
+                    end
+                    @goto not_ordered
+                end
+                cmp(array[pta+1], array[pta+2]) ≤ 0 && cmp(array[pta+3], array[pta+4]) ≤ 0 &&
+                    cmp(array[pta+5], array[pta+6]) ≤ 0 && @goto ordered
+                quad_swap_merge!(array, pta, swap, firstindex(swap), cmp)
+                pta += 8
+                continue
+            else
+                count -= 1
+            end
+            break
+
+            @label reversed
+            pta += 8
+            if !iszero(count)
+                count -= 1
+                if !((v1 = cmp(array[pta], array[pta+1]) ≤ 0) | (v2 = cmp(array[pta+2], array[pta+3]) ≤ 0) |
+                    (v3 = cmp(array[pta+4], array[pta+5]) ≤ 0) | (v4 = cmp(array[pta+6], array[pta+7]) ≤ 0))
+                    cmp(array[pta-1], array[pta]) > 0 && cmp(array[pta+1], array[pta+2]) > 0 &&
+                        cmp(array[pta+3], array[pta+4]) > 0 && cmp(array[pta+5], array[pta+6]) > 0 &&
+                        @goto reversed
+                end
+                quad_reversal!(array, pts, pta - 1)
+                v1 + v2 + v3 + v4 == 4 && cmp(array[pta+1], array[pta+2]) ≤ 0 &&
+                    cmp(array[pta+3], array[pta+4]) ≤ 0 && cmp(array[pta+5], array[pta+6]) ≤ 0 &&
+                    @goto ordered
+                if v1 + v2 + v3 + v4 == 0 && cmp(array[pta+1], array[pta+2]) > 0 &&
+                    cmp(array[pta+3], array[pta+4]) > 0 && cmp(array[pta+5], array[pta+6]) > 0
+                    pts = pta
+                    @goto reversed
+                end
+
+                swap_branchless!(array, pta, cmp, !v1)
+                swap_branchless!(array, pta + 2, cmp, !v2)
+                swap_branchless!(array, pta + 4, cmp, !v3)
+                swap_branchless!(array, pta + 6, cmp, !v4)
+
+                if cmp(array[pta+1], array[pta+2]) > 0 ||
+                    cmp(array[pta+3], array[pta+4]) > 0 ||
+                    cmp(array[pta+5], array[pta+6]) > 0
+                    quad_swap_merge!(array, pta, swap, firstindex(swap), cmp)
+                end
+                pta += 8
+                continue
+            else
+                count -= 1
+            end
+
+            nmembrem = asInt(nmemb % 8) # signed, so that nmembrem - 1 may go negative and the loop below can be empty
+            doit = true
+            for δ in nmembrem-1:-1:0
+                if cmp(array[pta+δ-1], array[pta+δ]) ≤ 0
+                    doit = false
+                    break
+                end
+            end
+            if doit
+                quad_reversal!(array, pts, pta + nmembrem - 1)
+                pts == array_index && return true
+                @goto reverse_end
+            end
+            quad_reversal!(array, pts, pta - 1)
+            break
+        end
+        tail_swap!(array, pta, swap, firstindex(swap), nmemb % 8, cmp)
+
+        @label reverse_end
+
+        pta = array_index
+        count = nmemb ÷ 32
+        while !iszero(count)
+            count -= 1
+            if cmp(array[pta+7], array[pta+8]) ≤ 0 &&
+                cmp(array[pta+15], array[pta+16]) ≤ 0 &&
+                cmp(array[pta+23], array[pta+24]) ≤ 0
+                pta += 32 # block boundaries already in order; skip the merge but still advance
+                continue
+            end
+            parity_merge!(swap, firstindex(swap), array, pta, UInt(8), UInt(8), cmp)
+            parity_merge!(swap, firstindex(swap) + 16, array, pta + 16, UInt(8), UInt(8), cmp)
+            parity_merge!(array, pta, swap, firstindex(swap), UInt(16), UInt(16), cmp)
+
+            pta += 32
+        end
+        nmemb % 32 > 8 && tail_merge!(array, pta, swap, firstindex(swap), UInt(32), nmemb % 32, UInt(8), cmp)
+        return false
+    end)
+end
+
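+# The closing loop of quad_swap! combines the sorted 8-element blocks into 32-element
+# blocks (two 8+8 parity merges into swap, then one 16+16 merge back), advancing 32
+# slots per iteration; a leftover of more than 8 elements is folded in by tail_merge!.
+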
+# quad merge support routines
+function cross_merge!(dest, dest_index::Int, from, from_index::Int, left::UInt, right::UInt, cmp::F) where {F}
+    @inbounds begin
+        ptl = from_index
+        ptr = from_index + asInt(left)
+        tpl = ptr - 1
+        tpr = tpl + asInt(right)
+
+        if left + 1 ≥ right && right + 1 ≥ left && left ≥ 32
+            if cmp(from[ptl+15], from[ptr]) > 0 &&
+                cmp(from[ptl], from[ptr+15]) ≤ 0 &&
+                cmp(from[tpl], from[tpr-15]) > 0 &&
+                cmp(from[tpl-15], from[tpr]) ≤ 0
+                parity_merge!(dest, dest_index, from, from_index, left, right, cmp)
+                return
+            end
+        end
+        ptd = dest_index
+        tpd = dest_index + asInt(left + right) - 1
+
+        while tpl - ptl > 8 && tpr - ptr > 8
+            @label ptl8_ptr
+            if cmp(from[ptl+7], from[ptr]) ≤ 0
+                _unsafe_copyto!(dest, ptd, from, ptl, 8); ptd += 8; ptl += 8
+                tpl - ptl > 8 && @goto ptl8_ptr
+                break
+            end
+            @label ptl_ptr8
+            if cmp(from[ptl], from[ptr+7]) > 0
+                _unsafe_copyto!(dest, ptd, from, ptr, 8); ptd += 8; ptr += 8
+                tpr - ptr > 8 && @goto ptl_ptr8
+                break
+            end
+            @label tpl_tpr8
+            if cmp(from[tpl], from[tpr-7]) ≤ 0
+                tpd -= 8; tpr -= 8; _unsafe_copyto!(dest, tpd + 1, from, tpr + 1, 8)
+                tpr - ptr > 8 && @goto tpl_tpr8
+                break
+            end
+            @label tpl8_tpr
+            if cmp(from[tpl-7], from[tpr]) > 0
+                tpd -= 8; tpl -= 8; _unsafe_copyto!(dest, tpd + 1, from, tpl + 1, 8)
+                tpl - ptl > 8 && @goto tpl8_tpr
+            end
+            loop = 8
+            while true
+                @head_branchless_merge(dest, ptd, from, ptl, from, ptr, cmp)
+                @tail_branchless_merge(dest, tpd, from, tpl, from, tpr, cmp)
+                iszero(loop -= 1) && break
+            end
+        end
+
+        if cmp(from[tpl], from[tpr]) ≤ 0
+            while ptl ≤ tpl
+                @head_branchless_merge(dest, ptd, from, ptl, from, ptr, cmp)
+            end
+            while ptr ≤ tpr
+                dest[ptd] = from[ptr]; ptr += 1; ptd += 1
+            end
+        else
+            while ptr ≤ tpr
+                @head_branchless_merge(dest, ptd, from, ptl, from, ptr, cmp)
+            end
+            while ptl ≤ tpl
+                dest[ptd] = from[ptl]; ptl += 1; ptd += 1
+            end
+        end
+        nothing
+    end
+end
+
+# main memory: [A][B][C][D]
+# swap memory: [A B]         step 1
+# swap memory: [A B][C D]    step 2
+# main memory: [A B C D]     step 3
+function quad_merge_block!(array, array_index::Int, swap, swap_index::Int, block::UInt, cmp::F) where {F}
+    @inbounds begin
+        block_x_2 = 2block
+
+        pt1 = array_index + asInt(block)
+        pt2 = pt1 + asInt(block)
+        pt3 = pt2 + asInt(block)
+
+        tmp = (cmp(array[pt1-1], array[pt1]) ≤ 0) | 2(cmp(array[pt3-1], array[pt3]) ≤ 0)
+        if tmp == 0
+            cross_merge!(swap, swap_index, array, array_index, block, block, cmp)
+            cross_merge!(swap, swap_index + asInt(block_x_2), array, pt2, block, block, cmp)
+        elseif tmp == 1
+            _unsafe_copyto!(swap, swap_index, array, array_index, block_x_2)
+            cross_merge!(swap, swap_index + asInt(block_x_2), array, pt2, block, block, cmp)
+        elseif tmp == 2
+            cross_merge!(swap, swap_index, array, array_index, block, block, cmp)
+            _unsafe_copyto!(swap, swap_index + asInt(block_x_2), array, pt2, block_x_2)
+        elseif tmp == 3
+            cmp(array[pt2-1], array[pt2]) ≤ 0 && return
+            _unsafe_copyto!(swap, swap_index, array, array_index, 2block_x_2)
+        end
+        cross_merge!(array, array_index, swap, swap_index, block_x_2, block_x_2, cmp)
+    end
+end
+
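+# cross_merge! gallops: whenever the next 8 elements of one side all belong before the
+# other side's next element, they are copied en bloc. Each probe is a single comparison,
+# e.g. for the head of the left run:
+#
+#     cmp(from[ptl+7], from[ptr]) ≤ 0   # left run sorted ⇒ its first 8 ≤ right's first ⇒ copy 8 at once
+#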
+function quad_merge!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, block::UInt, cmp::F) where {F}
+    pte = array_index + asInt(nmemb)
+    block *= 4
+    while block ≤ nmemb && block ≤ swap_size
+        pta = array_index
+        while true
+            quad_merge_block!(array, pta, swap, swap_index, block ÷ 4, cmp)
+            pta += asInt(block)
+            pta + asInt(block) ≤ pte || break
+        end
+        tail_merge!(array, pta, swap, swap_index, swap_size, asUInt(pte - pta), block ÷ 4, cmp)
+        block *= 4
+    end
+    tail_merge!(array, array_index, swap, swap_index, swap_size, nmemb, block ÷ 4, cmp)
+    return block ÷ 2
+end
+
+function partial_forward_merge!(array, array_index::Int, swap, swap_index::Int, nmemb::UInt, block::UInt, cmp::F) where {F}
+    @inbounds begin
+        nmemb == block && return
+
+        ptr = array_index + asInt(block)
+        tpr = array_index + asInt(nmemb) - 1
+        cmp(array[ptr-1], array[ptr]) ≤ 0 && return
+
+        _unsafe_copyto!(swap, swap_index, array, array_index, block)
+        ptl = swap_index
+        tpl = swap_index + asInt(block) - 1
+        while ptl < tpl - 1 && ptr < tpr - 1
+            if cmp(swap[ptl], array[ptr+1]) > 0
+                array[array_index] = array[ptr]
+                array[array_index+1] = array[ptr+1]
+                ptr += 2
+                array_index += 2
+            elseif cmp(swap[ptl+1], array[ptr]) ≤ 0
+                array[array_index] = swap[ptl]
+                array[array_index+1] = swap[ptl+1]
+                ptl += 2
+                array_index += 2
+            else
+                x = cmp(swap[ptl], array[ptr]) ≤ 0
+                y = !x
+                array[array_index+x] = array[ptr]
+                ptr += 1
+                array[array_index+y] = swap[ptl]
+                ptl += 1
+                array_index += 2
+                @head_branchless_merge(array, array_index, swap, ptl, array, ptr, cmp)
+            end
+        end
+        while ptl ≤ tpl && ptr ≤ tpr
+            @head_branchless_merge(array, array_index, swap, ptl, array, ptr, cmp)
+        end
+        while ptl ≤ tpl
+            array[array_index] = swap[ptl]
+            ptl += 1
+            array_index += 1
+        end
+        nothing
+    end
+end
+
+function partial_backward_merge!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, block::UInt,
+    cmp::F) where {F}
+    @inbounds begin
+        nmemb == block && return
+
+        tpl = array_index + asInt(block) - 1
+        tpa = array_index + asInt(nmemb) - 1
+        cmp(array[tpl], array[tpl+1]) ≤ 0 && return
+
+        right = nmemb - block
+        if nmemb ≤ swap_size && right ≥ 64
+            cross_merge!(swap, swap_index, array, array_index, block, right, cmp)
+            _unsafe_copyto!(array, array_index, swap, swap_index, nmemb)
+            return
+        end
+
+        _unsafe_copyto!(swap, swap_index, array, array_index + asInt(block), right)
+        tpr = swap_index + asInt(right) - 1
+        while tpl > array_index + 16 && tpr > swap_index + 16
+            @label tpl_tpr16
+            if cmp(array[tpl], swap[tpr-15]) ≤ 0
+                loop = 16
+                while true
+                    array[tpa] = swap[tpr]
+                    tpr -= 1
+                    tpa -= 1
+                    iszero(loop -= 1) && break
+                end
+                tpr > swap_index + 16 && @goto tpl_tpr16
+                break
+            end
+            @label tpl16_tpr
+            if cmp(array[tpl-15], swap[tpr]) > 0
+                loop = 16
+                while true
+                    array[tpa] = array[tpl]
+                    tpl -= 1
+                    tpa -= 1
+                    iszero(loop -= 1) && break
+                end
+                tpl > array_index + 16 && @goto tpl16_tpr
+                break
+            end
+            loop = 8
+            while true
+                if cmp(array[tpl], swap[tpr-1]) ≤ 0
+                    array[tpa] = swap[tpr]
+                    array[tpa-1] = swap[tpr-1]
+                    tpr -= 2
+                    tpa -= 2
+                elseif cmp(array[tpl-1], swap[tpr]) > 0
+                    array[tpa] = array[tpl]
+                    array[tpa-1] = array[tpl-1]
+                    tpl -= 2
+                    tpa -= 2
+                else
+                    x = cmp(array[tpl], swap[tpr]) ≤ 0
+                    y = !x
+                    tpa -= 1
+                    array[tpa+x] = swap[tpr]
+                    tpr -= 1
+                    array[tpa+y] = array[tpl]
+                    tpl -= 1
+                    tpa -= 1
+                    @tail_branchless_merge(array, tpa, array, tpl, swap, tpr, cmp)
+                end
+                iszero(loop -= 1) && break
+            end
+        end
+        while tpr > swap_index + 1 && tpl > array_index + 1
+            if cmp(array[tpl], swap[tpr-1]) ≤ 0
+                array[tpa] = swap[tpr]
+                array[tpa-1] = swap[tpr-1]
+                tpr -= 2
+                tpa -= 2
+            elseif cmp(array[tpl-1], swap[tpr]) > 0
+                array[tpa] = array[tpl]
+                array[tpa-1] = array[tpl-1]
+                tpl -= 2
+                tpa -= 2
+            else
+                x = cmp(array[tpl], swap[tpr]) ≤ 0
+                y = !x
+                tpa -= 1
+                array[tpa+x] = swap[tpr]
+                tpr -= 1
+                array[tpa+y] = array[tpl]
+                tpl -= 1
+                tpa -= 1
+                @tail_branchless_merge(array, tpa, array, tpl, swap, tpr, cmp)
+            end
+        end
+        while tpr ≥ swap_index && tpl ≥ array_index
+            @tail_branchless_merge(array, tpa, array, tpl, swap, tpr, cmp)
+        end
+        while tpr ≥ swap_index
+            array[tpa] = swap[tpr]
+            tpr -= 1
+            tpa -= 1
+        end
+        nothing
+    end
+end
+
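+# The partial merges above handle a [block | nmemb - block] split with limited swap
+# space: the forward variant copies the left block out and merges from the front, the
+# backward variant copies the right remainder out and merges from the back, so only
+# the copied side ever needs to fit into `swap`.
+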
+function tail_merge!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, block::UInt, cmp::F) where {F}
+    pte = array_index + asInt(nmemb)
+    while block < nmemb && block ≤ swap_size
+        pta = array_index
+        while pta + asInt(block) < pte
+            if pta + 2asInt(block) < pte
+                partial_backward_merge!(array, pta, swap, swap_index, swap_size, 2block, block, cmp)
+                pta += 2asInt(block)
+                continue
+            end
+            partial_backward_merge!(array, pta, swap, swap_index, swap_size, asUInt(pte - pta), block, cmp)
+            break
+        end
+        block *= 2
+    end
+    nothing
+end
+
+# the next four functions provide in-place rotate merge support
+function trinity_rotation!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, left::UInt)
+    @inbounds begin
+        right = nmemb - left
+        if swap_size > 65536
+            swap_size = UInt(65536)
+        end
+        if left < right
+            if left ≤ swap_size
+                _unsafe_copyto!(swap, swap_index, array, array_index, left)
+                _unsafe_copyto!(array, array_index, array, array_index + asInt(left), right)
+                _unsafe_copyto!(array, array_index + asInt(right), swap, swap_index, left)
+            else
+                pta = array_index
+                ptb = pta + asInt(left)
+                bridge = right - left
+                if bridge ≤ swap_size && bridge > 3
+                    ptc = pta + asInt(right)
+                    ptd = ptc + asInt(left)
+                    _unsafe_copyto!(swap, swap_index, array, ptb, bridge)
+                    for _ in 1:left
+                        array[ptc -= 1] = array[ptd -= 1]
+                        array[ptd] = array[ptb -= 1]
+                    end
+                    _unsafe_copyto!(array, pta, swap, swap_index, bridge)
+                else
+                    ptc = ptb
+                    ptd = ptc + asInt(right)
+                    for _ in 1:left÷2
+                        ptb -= 1
+                        ptd -= 1
+                        array[pta], array[ptb], array[ptc], array[ptd] = array[ptc], array[pta], array[ptd], array[ptb]
+                        pta += 1
+                        ptc += 1
+                    end
+                    for _ in 1:(ptd - ptc)÷2
+                        ptd -= 1
+                        array[pta], array[ptc], array[ptd] = array[ptc], array[ptd], array[pta]
+                        ptc += 1
+                        pta += 1
+                    end
+                    for _ in 1:(ptd - pta)÷2
+                        ptd -= 1
+                        array[pta], array[ptd] = array[ptd], array[pta]
+                        pta += 1
+                    end
+                end
+            end
+        elseif right < left
+            if right ≤ swap_size
+                _unsafe_copyto!(swap, swap_index, array, array_index + asInt(left), right)
+                _unsafe_copyto_backwards!(array, array_index + asInt(right), array, array_index, left)
+                _unsafe_copyto!(array, array_index, swap, swap_index, right)
+            else
+                pta = array_index
+                ptb = pta + asInt(left)
+                bridge = left - right
+                if bridge ≤ swap_size && bridge > 3
+                    ptc = pta + asInt(right)
+                    ptd = ptc + asInt(left)
+                    _unsafe_copyto!(swap, swap_index, array, ptc, bridge)
+                    for _ in 1:right
+                        array[ptc] = array[pta]
+                        array[pta] = array[ptb]
+                        ptc += 1
+                        pta += 1
+                        ptb += 1
+                    end
+                    _unsafe_copyto!(array, ptd - asInt(bridge), swap, swap_index, bridge)
+                else
+                    ptc = ptb
+                    ptd = ptc + asInt(right)
+                    for _ in 1:right÷2
+                        ptb -= 1
+                        ptd -= 1
+                        array[pta], array[ptb], array[ptc], array[ptd] = array[ptc], array[pta], array[ptd], array[ptb]
+                        pta += 1
+                        ptc += 1
+                    end
+                    for _ in 1:(ptb - pta)÷2
+                        ptb -= 1
+                        ptd -= 1
+                        array[pta], array[ptb], array[ptd] = array[ptd], array[pta], array[ptb]
+                        pta += 1
+                    end
+                    for _ in 1:(ptd - pta)÷2
+                        ptd -= 1
+                        array[pta], array[ptd] = array[ptd], array[pta]
+                        pta += 1
+                    end
+                end
+            end
+        else
+            pta = array_index
+            ptb = pta + asInt(left)
+            for _ in 1:left
+                array[pta], array[ptb] = array[ptb], array[pta]
+                pta += 1
+                ptb += 1
+            end
+        end
+        nothing
+    end
+end
+
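+# trinity_rotation! rotates [left | right] in place, choosing between a copy-based
+# path (when one side fits into swap), a "bridge" path, and swap-chain reversals.
+# The textbook scheme it improves on is the classic three-reversal rotate —
+# illustrative only, not the code above:
+#
+#     rot3!(v, k) = (reverse!(v, 1, k); reverse!(v, k + 1, length(v)); reverse!(v))
+#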
+function monobound_binary_first(array, array_index::Int, value, top::UInt, cmp::F) where {F}
+    @inbounds begin
+        end_ = array_index + asInt(top)
+        while top > 1
+            mid = asInt(top ÷ 2)
+            if cmp(value, array[end_-mid]) ≤ 0
+                end_ -= mid
+            end
+            top -= mid
+        end
+        if cmp(value, array[end_-1]) ≤ 0
+            end_ -= 1
+        end
+        return asUInt(end_ - array_index)
+    end
+end
+
+function rotate_merge_block!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, lblock::UInt, right::UInt, cmp::F) where {F}
+    @inbounds begin
+        cmp(array[array_index+asInt(lblock)-1], array[array_index+asInt(lblock)]) ≤ 0 && return
+
+        rblock = lblock ÷ 2
+        rblocki = asInt(rblock)
+        lblock -= rblock
+        lblocki = asInt(lblock)
+
+        left = monobound_binary_first(array, array_index + lblocki + rblocki, array[array_index+lblocki], right, cmp)
+
+        right -= left
+
+        # [ lblock ] [ rblock ] [ left ] [ right ]
+
+        if !iszero(left)
+            if lblock + left ≤ swap_size
+                _unsafe_copyto!(swap, swap_index, array, array_index, lblock)
+                _unsafe_copyto!(swap, swap_index + lblocki, array, array_index + lblocki + rblocki, left)
+                _unsafe_copyto_backwards!(array, array_index + lblocki + asInt(left), array, array_index + lblocki, rblock)
+
+                cross_merge!(array, array_index, swap, swap_index, lblock, left, cmp)
+            else
+                trinity_rotation!(array, array_index + lblocki, swap, swap_index, swap_size, rblock + left, rblock)
+                unbalanced = (2left < lblock) | (2lblock < left)
+                if unbalanced && left ≤ swap_size
+                    partial_backward_merge!(array, array_index, swap, swap_index, swap_size, lblock + left, lblock, cmp)
+                elseif unbalanced && lblock ≤ swap_size
+                    partial_forward_merge!(array, array_index, swap, swap_index, lblock + left, lblock, cmp)
+                else
+                    rotate_merge_block!(array, array_index, swap, swap_index, swap_size, lblock, left, cmp)
+                end
+            end
+        end
+
+        if !iszero(right)
+            unbalanced = (2right < rblock) | (2rblock < right)
+            if unbalanced && right ≤ swap_size || right + rblock ≤ swap_size
+                partial_backward_merge!(array, array_index + lblocki + asInt(left), swap, swap_index, swap_size,
+                    rblock + right, rblock, cmp)
+            elseif unbalanced && rblock ≤ swap_size
+                partial_forward_merge!(array, array_index + lblocki + asInt(left), swap, swap_index, rblock + right, rblock,
+                    cmp)
+            else
+                rotate_merge_block!(array, array_index + lblocki + asInt(left), swap, swap_index, swap_size, rblock, right,
+                    cmp)
+            end
+        end
+        nothing
+    end
+end
+
+function rotate_merge!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, block::UInt, cmp::F) where {F}
+    if nmemb ≤ 2block && nmemb - block ≤ swap_size # unsigned subtraction, ensures nmemb ≥ block
+        partial_backward_merge!(array, array_index, swap, swap_index, swap_size, nmemb, block, cmp)
+        return
+    end
+
+    pte = array_index + asInt(nmemb)
+    while block < nmemb
+        pta = array_index
+        while pta + asInt(block) < pte
+            if pta + 2asInt(block) < pte
+                rotate_merge_block!(array, pta, swap, swap_index, swap_size, block, block, cmp)
+                pta += 2asInt(block)
+                continue
+            end
+            rotate_merge_block!(array, pta, swap, swap_index, swap_size, block, pte - pta - block, cmp)
+            break
+        end
+        block *= 2
+    end
+end
+
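+# monobound_binary_first is a branch-light binary search; under this file's cmp
+# convention it returns how many leading elements of the searched block compare
+# below `value` — roughly searchsortedfirst(block, value) - 1 for the default order.
+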
+function sort!(array::AbstractVector, lo::Int, hi::Int, ::QuadSortAlg, o::Ordering)
+    len = UInt(hi - lo + 1)
+    cmp = let o=o; (x, y) -> lt(o, y, x) end
+
+    if len < 32
+        @with_stackvec(swap, 32, eltype(array),
+            # we use a fixed size to make sure it is on the stack
+            tail_swap!(array, lo, swap, firstindex(swap), len, cmp)
+        )
+    elseif !quad_swap!(array, lo, len, cmp)
+        swap_size = len
+        if swap_size ≤ 512
+            @with_stackvec swapstack Int(swap_size) eltype(array) begin
+                block = quad_merge!(array, lo, swapstack, firstindex(swapstack), swap_size, len, UInt(32), cmp)
+                rotate_merge!(array, lo, swapstack, firstindex(swapstack), swap_size, len, block, cmp)
+            end
+        else
+            local swap
+            try
+                swap = Vector{eltype(array)}(undef, swap_size)
+            catch e
+                if e isa OutOfMemoryError
+                    @with_stackvec stack 512 eltype(array) begin
+                        tail_merge!(array, lo, stack, firstindex(stack), UInt(32), len, UInt(32), cmp)
+                        rotate_merge!(array, lo, stack, firstindex(stack), UInt(32), len, UInt(64), cmp)
+                    end
+                    return array
+                end
+                rethrow()
+            end
+            block = quad_merge!(array, lo, swap, firstindex(swap), swap_size, len, UInt(32), cmp)
+            rotate_merge!(array, lo, swap, firstindex(swap), swap_size, len, block, cmp)
+        end
+    end
+    return array
+end
+
+function quadsort_swap!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, cmp::F) where {F}
+    if nmemb ≤ 96
+        tail_swap!(array, array_index, swap, swap_index, nmemb, cmp)
+    elseif !quad_swap!(array, array_index, nmemb, cmp)
+        block = quad_merge!(array, array_index, swap, swap_index, swap_size, nmemb, UInt(32), cmp)
+        rotate_merge!(array, array_index, swap, swap_index, swap_size, nmemb, block, cmp)
+    end
+end
\ No newline at end of file
diff --git a/src/SortingAlgorithms.jl b/src/SortingAlgorithms.jl
index f432e2c..782d69e 100644
--- a/src/SortingAlgorithms.jl
+++ b/src/SortingAlgorithms.jl
@@ -701,6 +701,17 @@ function _unsafe_copyto!(dest::Array, doffs, src::Array, soffs, n)
     unsafe_copyto!(dest, doffs, src, soffs, n)
 end
 
+function _unsafe_copyto_backwards!(dest, doffs, src, soffs, n)
+    @inbounds for i in n-1:-1:0
+        dest[doffs + i] = src[soffs + i]
+    end
+    dest
+end
+
+function _unsafe_copyto_backwards!(dest::Array, doffs, src::Array, soffs, n)
+    unsafe_copyto!(dest, doffs, src, soffs, n)
+end
+
 # merge v[lo:m] and v[m+1:hi] ([A;B]) using scratch[1:1+hi-lo]
 # This is faster than merge! but requires twice as much auxiliary memory.
 function twoended_merge!(v::AbstractVector{T}, lo::Integer, m::Integer, hi::Integer, o::Ordering, scratch::AbstractVector{T}) where T
@@ -932,4 +943,41 @@ function sort!(v::AbstractVector, lo::Integer, hi::Integer, ::PagedMergeSortAlg,
     pagedmergesort!(v, lo, hi, o, scratch, pageLocations)
     return v
 end
+
+# QuadSort and BlitSort
+mutable struct StackSpace{N,T}
+    data::NTuple{N,T}
+
+    StackSpace{N,T}() where {T,N} = new{N,T}()
+end
+
+macro with_stackvec(name, len, eltype, body)
+    quote
+        local stackvec = StackSpace{$(esc(len)),$(esc(eltype))}()
+        $(esc(name)) = unsafe_wrap(Array, Ptr{$(esc(eltype))}(pointer_from_objref(stackvec)), $(esc(len)))
+        GC.@preserve stackvec begin
+            $(esc(body))
+            if !isbitstype($(esc(eltype)))
+                for i in eachindex($(esc(name)))
+                    Base._unsetindex!($(esc(name)), i)
+                end
+            end
+        end
+    end
+end
+
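+# @with_stackvec gives the sorting kernels a fixed-size scratch Vector whose storage
+# is the NTuple field of a mutable struct (eligible for stack allocation when it does
+# not escape): unsafe_wrap reinterprets that memory as an Array, GC.@preserve keeps
+# it alive for the duration of `body`, and for non-isbits element types the slots are
+# cleared afterwards so no stale references survive.
+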
+macro unroll(n, expr)
+    result = Expr(:block)
+    for _ in 1:n
+        push!(result.args, expr)
+    end
+    esc(result)
+end
+
+asUInt(i::Int) = Core.bitcast(UInt, i)
+asInt(i::UInt) = Core.bitcast(Int, i)
+
+include("./QuadSort.jl")
+include("./BlitSort.jl")
+
 end # module
diff --git a/test/runtests.jl b/test/runtests.jl
index aab5738..1dee5b0 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -3,7 +3,7 @@ using Test
 using StatsBase
 using Random
 
-stable_algorithms = [TimSort, RadixSort, PagedMergeSort]
+stable_algorithms = [TimSort, RadixSort, PagedMergeSort, QuadSort, BlitSort]
 unstable_algorithms = [HeapSort, CombSort]
 
 a = rand(1:10000, 1000)