Description
As I discussed briefly with @c42f in #702 (comment), using a divide-and-conquer approach in reduce on static arrays may be useful for leveraging instruction-level parallelism (ILP). I cooked up two (somewhat contrived?) examples, and it turned out that the divide-and-conquer approach can be 3x to 4x faster (on my laptop) than the sequential (foldl) approach. This holds for both compute-bound and memory-bound instructions.
I also made two versions of the benchmarks, with and without @simd. I was surprised that adding @simd didn't improve the foldl version.
Anyway, this seems like a pretty good motivation for using this approach in StaticArrays.jl, although it would be even nicer to have more "real-world" benchmarks like this. What do you think? Does it make sense to use this approach in StaticArrays.jl?
Code
I'm using NTuple as a model of SVector here. mapmapreduce! is the main function I benchmarked.
module ILPBenchmarks
# Tree-shaped (divide-and-conquer) mapreduce over a tuple.
@inline tmapreduce(f::F, op::O, init, ::Tuple{}) where {F,O} = init
@inline tmapreduce(f::F, op::O, init, x::Tuple{Any}) where {F,O} = f(x[1])
@inline tmapreduce(f::F, op::O, init, (a, b)::Tuple{Any,Any}) where {F,O} = op(f(a), f(b))
@inline function tmapreduce(f::F, op::O, init, v::Tuple) where {F,O}
    # Split the tuple in half and reduce each half independently.
    left, right = _halve(v)
    return op(tmapreduce(f, op, init, left), tmapreduce(f, op, init, right))
end
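# The tree shape this produces, e.g. for a 4-tuple (assuming op is
# associative, up to floating-point rounding):
#     tmapreduce(f, op, init, (a, b, c, d))
#  == op(op(f(a), f(b)), op(f(c), f(d)))
# The two inner op calls have no mutual dependency, which is what
# exposes instruction-level parallelism.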
# Split an N-tuple into two halves at compile time (no heap allocation).
@generated function _halve(v::NTuple{N,Any}) where {N}
    m = N ÷ 2
    quote
        (($([:(v[$i]) for i in 1:m]...),), ($([:(v[$i]) for i in m+1:N]...),))
    end
end
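# For example, for N = 5 the generated body is
#     ((v[1], v[2]), (v[3], v[4], v[5]))
# so _halve((1, 2, 3, 4, 5)) returns ((1, 2), (3, 4, 5)).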
# Left fold over varargs; acc accumulates from the left.
@inline foldlargs(op::F, acc) where {F} = acc
@inline foldlargs(op::F, acc, x, xs...) where {F} = foldlargs(op, op(acc, x), xs...)

# Sequential (left-to-right) mapfoldl over a tuple, used as the baseline.
@inline tmapfoldl(f::F, op::O, init, xs::Tuple) where {F,O} =
    foldlargs((acc, x) -> op(acc, f(x)), init, xs...)
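# By contrast, the fold is fully sequential, e.g.:
#     tmapfoldl(f, op, init, (a, b, c))
#  == op(op(op(init, f(a)), f(b)), f(c))
# Every op call depends on the previous one: a single dependency chain.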
# Apply mapreducer(f, op, init, xs[i]) to every element of xs and store
# g of the result in ys. This is the function actually benchmarked.
function mapmapreduce!(mapreducer::R, g::G, f::F, op::O, ys, xs, init) where {R,G,F,O}
    for i in eachindex(xs, ys)
        @inbounds ys[i] = g(mapreducer(f, op, init, xs[i]))
    end
end

# Same loop, annotated with @simd.
function mapmapreduce_simd!(mapreducer::R, g::G, f::F, op::O, ys, xs, init) where {R,G,F,O}
    @simd for i in eachindex(xs, ys)
        @inbounds ys[i] = g(mapreducer(f, op, init, xs[i]))
    end
end
using BenchmarkTools
n = 1_000   # number of tuples
m = 32      # tuple length (the "SVector" size)
k = n       # index range for the lookup table zs
# xs is a length-n Vector of NTuple{m,Int}, each holding random indices into zs.
xs = tuple.(ntuple(_ -> rand(1:k, n), m)...)::Vector{<:NTuple{m,Any}}
ys = zeros(n)
const zs = randn(k)
# Memory-bound kernel: each call is dominated by a random load from zs.
@inline function memory_bound_f(x)
    y = @inbounds zs[x]
    (y, y)
end

# Compute-bound kernel: a chain of multiplications, no memory traffic.
@inline function compute_bound_f(x)
    y = x * x * x * x * x * x * x * x
    (y, y)
end

# Combine (min, max) pairs; together with _diff this computes the range
# (max - min) of the kernel's outputs over each tuple.
_minmax((min1, max1), (min2, max2)) = (min(min1, min2), max(max1, max2))
_diff((min, max)) = max - min
const SUITE = BenchmarkGroup()
for (simd, _mapmapreduce!) in [(false, mapmapreduce!), (true, mapmapreduce_simd!)]
    s1 = SUITE["simd=$simd"] = BenchmarkGroup()
    for (label, f) in [("memory-bound", memory_bound_f), ("compute-bound", compute_bound_f)]
        s2 = s1[label] = BenchmarkGroup()
        s2["reduce"] = @benchmarkable($_mapmapreduce!(
            tmapreduce,
            _diff,
            $f,
            _minmax,
            $ys,
            $xs,
            (Inf, -Inf),
        ))
        s2["foldl"] = @benchmarkable($_mapmapreduce!(
            tmapfoldl,
            _diff,
            $f,
            _minmax,
            $ys,
            $xs,
            (Inf, -Inf),
        ))
    end
end
end # module
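For context, here is a minimal sketch of how the tree-shaped reduction could be wired up for an actual SVector (hypothetical glue code, not an existing StaticArrays.jl API; it relies only on Tuple(v) returning the underlying NTuple):

    using StaticArrays

    # Hypothetical helper: reduce an SVector via the tree-shaped
    # tmapreduce. Tuple(v) extracts the underlying NTuple.
    tree_reduce(op, v::SVector; init) =
        ILPBenchmarks.tmapreduce(identity, op, init, Tuple(v))

    v = SVector(1.0, 2.0, 3.0, 4.0)
    tree_reduce(+, v; init = 0.0)  # computes (1.0 + 2.0) + (3.0 + 4.0) == 10.0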
Output
Here is a typical result on my laptop:
julia> run(ILPBenchmarks.SUITE)
2-element BenchmarkTools.BenchmarkGroup:
  tags: []
  "simd=true" => 2-element BenchmarkTools.BenchmarkGroup:
      tags: []
      "compute-bound" => 2-element BenchmarkTools.BenchmarkGroup:
          tags: []
          "foldl" => Trial(101.073 μs)
          "reduce" => Trial(25.643 μs)
      "memory-bound" => 2-element BenchmarkTools.BenchmarkGroup:
          tags: []
          "foldl" => Trial(207.505 μs)
          "reduce" => Trial(64.472 μs)
  "simd=false" => 2-element BenchmarkTools.BenchmarkGroup:
      tags: []
      "compute-bound" => 2-element BenchmarkTools.BenchmarkGroup:
          tags: []
          "foldl" => Trial(99.495 μs)
          "reduce" => Trial(25.633 μs)
      "memory-bound" => 2-element BenchmarkTools.BenchmarkGroup:
          tags: []
          "foldl" => Trial(197.315 μs)
          "reduce" => Trial(64.182 μs)