Description
As I discussed briefly with @c42f in #702 (comment), using a divide-and-conquer approach in reduce on static arrays may be useful for leveraging instruction-level parallelism (ILP). I cooked up two (somewhat contrived?) examples, and it turned out that the divide-and-conquer approach can be 3x to 4x faster (on my laptop) than the sequential (foldl) approach. This holds for both compute-bound and memory-bound instructions.
I also made two versions of the benchmarks, with and without @simd. I was surprised that adding @simd didn't improve the foldl version.
Anyway, this seems like a pretty good motivation for using this approach in StaticArrays.jl, although it would be even nicer to have more "real-world" benchmarks like this. What do you think? Does it make sense to use this approach in StaticArrays.jl?
Code
I'm using NTuple as a model of SVector here. mapmapreduce! is the main function I benchmarked.
module ILPBenchmarks
# Tree-shaped (divide-and-conquer) mapreduce over a tuple.
@inline tmapreduce(f::F, op::O, init, ::Tuple{}) where {F,O} = init
@inline tmapreduce(f::F, op::O, init, x::Tuple{Any}) where {F,O} = f(x[1])
@inline tmapreduce(f::F, op::O, init, (a, b)::Tuple{Any,Any}) where {F,O} = op(f(a), f(b))
@inline function tmapreduce(f::F, op::O, init, v::Tuple) where {F,O}
    # Split the tuple in half and reduce each half independently.
    left, right = _halve(v)
    return op(tmapreduce(f, op, init, left), tmapreduce(f, op, init, right))
end
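# The tree shape this produces, e.g. for a 4-tuple (assuming op is
# associative, up to floating-point rounding):
#     tmapreduce(f, op, init, (a, b, c, d))
#  == op(op(f(a), f(b)), op(f(c), f(d)))
# The two inner op calls have no mutual dependency, which is what
# exposes instruction-level parallelism.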
# Split an N-tuple into two halves at compile time (no heap allocation).
@generated function _halve(v::NTuple{N,Any}) where {N}
    m = N ÷ 2
    quote
        (($([:(v[$i]) for i in 1:m]...),), ($([:(v[$i]) for i in m+1:N]...),))
    end
end
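# For example, for N = 5 the generated body is
#     ((v[1], v[2]), (v[3], v[4], v[5]))
# so _halve((1, 2, 3, 4, 5)) returns ((1, 2), (3, 4, 5)).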
# Left fold over varargs; acc accumulates from the left.
@inline foldlargs(op::F, acc) where {F} = acc
@inline foldlargs(op::F, acc, x, xs...) where {F} = foldlargs(op, op(acc, x), xs...)

# Sequential (left-to-right) mapfoldl over a tuple, used as the baseline.
@inline tmapfoldl(f::F, op::O, init, xs::Tuple) where {F,O} =
    foldlargs((acc, x) -> op(acc, f(x)), init, xs...)
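# By contrast, the fold is fully sequential, e.g.:
#     tmapfoldl(f, op, init, (a, b, c))
#  == op(op(op(init, f(a)), f(b)), f(c))
# Every op call depends on the previous one: a single dependency chain.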
# Apply mapreducer(f, op, init, xs[i]) to every element of xs and store
# g of the result in ys. This is the function actually benchmarked.
function mapmapreduce!(mapreducer::R, g::G, f::F, op::O, ys, xs, init) where {R,G,F,O}
    for i in eachindex(xs, ys)
        @inbounds ys[i] = g(mapreducer(f, op, init, xs[i]))
    end
end

# Same loop, annotated with @simd.
function mapmapreduce_simd!(mapreducer::R, g::G, f::F, op::O, ys, xs, init) where {R,G,F,O}
    @simd for i in eachindex(xs, ys)
        @inbounds ys[i] = g(mapreducer(f, op, init, xs[i]))
    end
end
using BenchmarkTools
n = 1_000   # number of tuples
m = 32      # tuple length (the "SVector" size)
k = n       # index range for the lookup table zs
# xs is a length-n Vector of NTuple{m,Int}, each holding random indices into zs.
xs = tuple.(ntuple(_ -> rand(1:k, n), m)...)::Vector{<:NTuple{m,Any}}
ys = zeros(n)
const zs = randn(k)
# Memory-bound kernel: each call is dominated by a random load from zs.
@inline function memory_bound_f(x)
    y = @inbounds zs[x]
    (y, y)
end

# Compute-bound kernel: a chain of multiplications, no memory traffic.
@inline function compute_bound_f(x)
    y = x * x * x * x * x * x * x * x
    (y, y)
end

# Combine (min, max) pairs; together with _diff this computes the range
# (max - min) of the kernel's outputs over each tuple.
_minmax((min1, max1), (min2, max2)) = (min(min1, min2), max(max1, max2))
_diff((min, max)) = max - min
const SUITE = BenchmarkGroup()
for (simd, _mapmapreduce!) in [(false, mapmapreduce!), (true, mapmapreduce_simd!)]
    s1 = SUITE["simd=$simd"] = BenchmarkGroup()
    for (label, f) in [("memory-bound", memory_bound_f), ("compute-bound", compute_bound_f)]
        s2 = s1[label] = BenchmarkGroup()
        s2["reduce"] = @benchmarkable($_mapmapreduce!(
            tmapreduce,
            _diff,
            $f,
            _minmax,
            $ys,
            $xs,
            (Inf, -Inf),
        ))
        s2["foldl"] = @benchmarkable($_mapmapreduce!(
            tmapfoldl,
            _diff,
            $f,
            _minmax,
            $ys,
            $xs,
            (Inf, -Inf),
        ))
    end
end
end # module
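For context, here is a minimal sketch of how the tree-shaped reduction could be wired up for an actual SVector (hypothetical glue code, not an existing StaticArrays.jl API; it relies only on Tuple(v) returning the underlying NTuple):

    using StaticArrays

    # Hypothetical helper: reduce an SVector via the tree-shaped
    # tmapreduce. Tuple(v) extracts the underlying NTuple.
    tree_reduce(op, v::SVector; init) =
        ILPBenchmarks.tmapreduce(identity, op, init, Tuple(v))

    v = SVector(1.0, 2.0, 3.0, 4.0)
    tree_reduce(+, v; init = 0.0)  # computes (1.0 + 2.0) + (3.0 + 4.0) == 10.0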
Output
Here is a typical result on my laptop:
julia> run(ILPBenchmarks.SUITE)
2-element BenchmarkTools.BenchmarkGroup:
  tags: []
  "simd=true" => 2-element BenchmarkTools.BenchmarkGroup:
      tags: []
      "compute-bound" => 2-element BenchmarkTools.BenchmarkGroup:
          tags: []
          "foldl" => Trial(101.073 μs)
          "reduce" => Trial(25.643 μs)
      "memory-bound" => 2-element BenchmarkTools.BenchmarkGroup:
          tags: []
          "foldl" => Trial(207.505 μs)
          "reduce" => Trial(64.472 μs)
  "simd=false" => 2-element BenchmarkTools.BenchmarkGroup:
      tags: []
      "compute-bound" => 2-element BenchmarkTools.BenchmarkGroup:
          tags: []
          "foldl" => Trial(99.495 μs)
          "reduce" => Trial(25.633 μs)
      "memory-bound" => 2-element BenchmarkTools.BenchmarkGroup:
          tags: []
          "foldl" => Trial(197.315 μs)
          "reduce" => Trial(64.182 μs)