Commit 3da605f ("more")
1 parent 01d7386

2 files changed: +164 -24 lines

src/preallocated.jl  (+104 -22)

@@ -1,6 +1,6 @@
 using Flux, ChainRulesCore
 using LinearAlgebra: mul!
-# using FastBroadcast: @..
+using FastBroadcast: @..
 using Strided

 const NoT = NoTangent()
@@ -108,32 +108,62 @@ function ChainRulesCore.rrule(::typeof(scale!), y, (scale, ds), (x, dx), (bias,
 end

 #####
-##### softmax
+##### Conv
 #####

-function PreLayer(::typeof(softmax))
-    fwd, rev = zeros(Float32, 0), zeros(Float32, 0)  # not ideal, demands `model |> pre |> gpu`
-    PreLayer(softmax, nothing, fwd, rev)
+function PreLayer(c::Conv)
+    grad = _struct_sim(c)
+    fwd, rev = similar(c.weight, 0), similar(c.weight, 0)
+    PreLayer(c, grad, fwd, rev)
 end

-function (p::PreLayer{typeof(softmax)})(x::AbstractArray{<:Real})
-    y, dx = _pre_setup(p, x)  # generic version
-    _softmaxcall!(y, p, x, dx)
+function (p::PreLayer{<:Conv})(x::AbstractArray{<:Real})
+    y, dx = _pre_setup(p, x)
+    _convcall!(y, p, x, dx)
 end

-_softmaxcall!(y, p, x, dx) = softmax!(y, x)
+using Flux: conv_dims, conv_reshape_bias
+using Flux.NNlib: fast_act, conv!, output_size, channels_out

-function ChainRulesCore.rrule(::typeof(_softmaxcall!), y, p, x, dx)
-    y = _softmaxcall!(y, p, x, dx)
-    function back(dy)
-        # TODO: CHECK THIS!
-        dx .= dy .* y
-        dx .= dx .- y .* sum(dx; dims=1)  # could sum! into the end of rev
-        return (NoT, NoT, NoT, dx, NoT)  # last one could be NotImplemented?
+function _pre_setup(p::PreLayer{<:Conv}, x)
+    cdims = conv_dims(p.layer, x)
+    ysize = (output_size(cdims)..., channels_out(cdims), size(x)[end])
+    if prod(ysize) != length(p.fwd)
+        resize!(p.fwd, prod(ysize))
+        resize!(p.rev, length(x))
     end
-    y, back
+    y = _pre_reshape(p.fwd, ysize)
+    dx = _pre_reshape(p.rev, size(x))
+    (; y, dx)
+end
+
+function _convcall!(y, p, x, dx)
+    cdims = conv_dims(p.layer, x)
+    conv!(y, x, p.layer.weight, cdims)
+    if p.layer.bias isa AbstractArray
+        y .+= conv_reshape_bias(p.layer)
+    end
+    act!(y, fast_act(p.layer.σ, x))
 end

+# function ChainRulesCore.rrule(::typeof(_convcall!), y, p, x, dx)
+#     y = _densecall!(y, p, x, dx)
+#     function back(dy)
+#         dy = unthunk(dy)
+#         dy = ∇act!(y, dy, p.layer.σ)
+#         # layer
+#         weight = mul!(p.grad.weight, dy, x')
+#         bias = ∇bias!(p.grad.bias, dy)
+#         tang = Tangent{Dense}(; weight, bias)
+#         # input
+#         dx = mul!(dx, p.layer.weight', dy)
+#         return (NoT, NoT, Tangent{PreLayer}(; layer = tang), dx, NoT)
+#     end
+#     y, back
+# end
+
+
+
 #####
 ##### BatchNorm
 #####
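
Editorial aside, not part of this commit: the commented-out rrule in the hunk above still has the Dense-layer body (it calls _densecall! and builds a Tangent{Dense}). A rough sketch of a Conv-specific backward is below. It assumes NNlib's ∇conv_data! and ∇conv_filter!, 4-d inputs, and that the p.grad object built by _struct_sim carries preallocated weight and bias buffers shaped like the layer's own; treat it as a sketch, not the package's method.

using Flux.NNlib: ∇conv_data!, ∇conv_filter!

function ChainRulesCore.rrule(::typeof(_convcall!), y, p, x, dx)
    y = _convcall!(y, p, x, dx)
    function back(dy)
        dy = unthunk(dy)
        dy = ∇act!(y, dy, p.layer.σ)                 # activation pullback, in place
        cdims = conv_dims(p.layer, x)
        # layer gradients, written into the preallocated buffers from _struct_sim
        weight = ∇conv_filter!(p.grad.weight, x, dy, cdims)
        if p.layer.bias isa AbstractArray
            sum!(reshape(p.grad.bias, 1, 1, :, 1), dy)   # reduce over space and batch (4-d case)
            bias = p.grad.bias
        else
            bias = NoT
        end
        tang = Tangent{Conv}(; weight, bias)
        # input gradient, written into the preallocated dx buffer
        dx = ∇conv_data!(dx, dy, p.layer.weight, cdims)
        return (NoT, NoT, Tangent{PreLayer}(; layer = tang), dx, NoT)
    end
    y, back
end

The point is the same as for Dense: both the layer gradient and the input gradient land in buffers owned by the PreLayer, so the backward pass allocates nothing beyond what NNlib itself needs.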
@@ -201,6 +231,33 @@ function ChainRulesCore.rrule(::typeof(_norm_layer_forward!), y, x, dx, μ, σ²
     y, back
 end

+#####
+##### softmax
+#####
+
+function PreLayer(::typeof(softmax))
+    fwd, rev = zeros(Float32, 0), zeros(Float32, 0)  # not ideal, demands `model |> pre |> gpu`
+    PreLayer(softmax, nothing, fwd, rev)
+end
+
+function (p::PreLayer{typeof(softmax)})(x::AbstractArray{<:Real})
+    y, dx = _pre_setup(p, x)  # generic version
+    _softmaxcall!(y, p, x, dx)
+end
+
+_softmaxcall!(y, p, x, dx) = softmax!(y, x)
+
+function ChainRulesCore.rrule(::typeof(_softmaxcall!), y, p, x, dx)
+    y = _softmaxcall!(y, p, x, dx)
+    function back(dy)
+        # TODO: CHECK THIS!
+        dx .= dy .* y
+        dx .= dx .- y .* sum(dx; dims=1)  # could sum! into the end of rev
+        return (NoT, NoT, NoT, dx, NoT)  # last one could be NotImplemented?
+    end
+    y, back
+end
+

 #####
 ##### activation functions
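
For reference, not in the diff: the `# TODO: CHECK THIS!` can be settled numerically. The two in-place statements amount to dx = y .* (dy .- sum(y .* dy; dims=1)), which is the standard softmax pullback along dims=1. A quick check against Zygote's own pullback (x and dy here are arbitrary test data, not names from the package):

using Flux, Zygote, Test   # Zygote pulled in only for this check

x  = randn(Float32, 5, 3)
dy = randn(Float32, 5, 3)

y, back = Zygote.pullback(softmax, x)
dx_rule = y .* (dy .- sum(y .* dy; dims=1))   # what the in-place rrule computes

@test back(dy)[1] ≈ dx_rule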
@@ -212,8 +269,8 @@ function act!(y, act::F) where F
     # y .= σ.(y)
     # Unfortunately this hits https://github.com/JuliaLang/julia/issues/43153
     # maybe you could patch Strided.jl to avoid it? Or use another package...
-    @strided y .= σ.(y)
-    # FastBroadcast.@.. y = σ(y)
+    # @strided y .= σ.(y)
+    @.. y = σ(y)
 end

 # Piracy, disable @strided on CuArrays:
@@ -223,10 +280,31 @@ Strided.maybestrided(x::Flux.CuArray) = x
 ChainRulesCore.rrule(::typeof(act!), y, f) = act!(y, f), dz -> (NoT, ∇act!(y, dy, f), NoT)

 ∇act!(y, dy, ::typeof(identity)) = dy
-∇act!(y, dy, ::typeof(relu)) = @. y = ifelse(y>0, dy, 0f0)
-∇act!(y, dy, ::typeof(tanh)) = @. y = (1 - y^2)
-∇act!(y, dy, ::typeof(sigmoid)) = @. y = y * (1 - y)
+∇act!(y, dy, ::typeof(relu)) = @.. y = ifelse(y>0, dy, 0f0)
+∇act!(y, dy, ::typeof(tanh)) = @.. y = (1 - y^2)
+∇act!(y, dy, ::typeof(sigmoid)) = @.. y = y * (1 - y)
+
+
+function PreLayer(::typeof(relu))
+    fwd, rev = zeros(Float32, 0), zeros(Float32, 0)  # not ideal
+    PreLayer(relu, nothing, fwd, rev)
+end
+
+function (p::PreLayer{typeof(relu)})(x::AbstractArray{<:Real})
+    y, dx = _pre_setup(p, x)  # generic version
+    _relucall!(y, p, x, dx)
+end

+_relucall!(y, p, x, dx) = y .= relu.(x)
+
+function ChainRulesCore.rrule(::typeof(_relucall!), y, p, x, dx)
+    y = _relucall!(y, p, x, dx)
+    function back(dy)
+        @. dx = ifelse(y>0, dy, 0f0)
+        return (NoT, NoT, NoT, dx, NoT)
+    end
+    y, back
+end

 #####
 ##### PreLayer utils
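
Editorial aside, not in this commit: the relu method of ∇act! scales by the incoming gradient dy, but the tanh and sigmoid methods as written compute only the local derivative and drop dy (and the act! rrule's closure refers to dy where its argument is named dz, though that context line is untouched here). If the intent is the same in-place pullback as for relu, a possible correction, offered only as a sketch, would be:

# Hypothetical fix, assuming these should be full pullbacks like the relu method:
∇act!(y, dy, ::typeof(tanh))    = @.. y = dy * (1 - y^2)
∇act!(y, dy, ::typeof(sigmoid)) = @.. y = dy * y * (1 - y)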
@@ -249,10 +327,14 @@ ChainRulesCore.@non_differentiable _pre_setup(::Any, ::Any)

 # Cannot use reshape(::Array), as that prevents later resize!
 _pre_reshape(x::Array, size::Tuple) = Base.ReshapedArray(x, size, ())
+# _pre_reshape(x::Array, size::Tuple) = Base.__reshape((x, Base.IndexStyle(x)), size)  # what Base does, no better
 # Must use reshape(::CuArray) as mul! rejects ReshapedArray
 _pre_reshape(x::Flux.CuArray, size::Tuple) = reshape(x, size)
 _pre_reshape(x, size::Tuple) = reshape(x, size)

+# Base piracy! to prevent ReshapedArray from going missing
+Base._reshape(R::Base.ReshapedArray, dims::Base.Dims) = Base.ReshapedArray(R.parent, dims, ())
+
 ∇bias!(::Bool, dx) = NoT
 ∇bias!(bias, dx) = sum!(bias, dx)

test/preallocated.jl  (+60 -2)

@@ -16,10 +16,10 @@ g2 = gradient((m,x) -> m(x)[1], m2, x)
 #=

 julia> @btime gradient((m,x) -> m(x)[1], $m1, $x);
-  min 52.167 μs, mean 2.519 ms (58 allocations, 355.41 KiB)
+  min 50.167 μs, mean 88.796 μs (58 allocations, 355.41 KiB)

 julia> @btime gradient((m,x) -> m(x)[1], $m2, $x);
-  min 58.750 μs, mean 190.440 μs (109 allocations, 17.44 KiB)
+  min 57.792 μs, mean 66.050 μs (115 allocations, 17.75 KiB)

@@ -33,9 +33,14 @@ let data = [(x,) for _ in 1:1000]
     nothing
 end

+# Yesterday:
 # min 1.799 s, mean 1.802 s (177001 allocations, 352.94 MiB)
 # min 146.713 ms, mean 251.041 ms (295001 allocations, 25.71 MiB)

+# Today, wtf? Maybe threading changes have hurt.
+# min 244.235 ms, mean 251.582 ms (177001 allocations, 352.94 MiB)
+# min 224.760 ms, mean 227.594 ms (301001 allocations, 26.02 MiB)
+

 m1cu = m1 |> gpu
 m2cu = m2 |> gpu
@@ -78,3 +83,56 @@ julia> @btime $m4($x);

 =#

+x4 = randn(Float32, 28, 28, 1, 13);
+
+m5 = @autosize (size(x4)...,) Chain(
+    Conv((3,3), 1 => 7, relu, stride=2, pad=1),
+    Conv((3,3), _ => 9, relu, stride=2),
+    Conv((3,3), _ => 5, tanh, stride=2, bias=false),
+    Flux.flatten,
+    Dense(_ => 10),
+    )
+m6 = m5 |> pre
+
+@test m5(x4) ≈ m6(x4)
+
+#=
+
+julia> @btime $m5($x4);
+  min 139.125 μs, mean 191.653 μs (179 allocations, 262.73 KiB)
+
+julia> @btime $m6($x4);
+  min 140.125 μs, mean 196.337 μs (160 allocations, 86.39 KiB)
+
+=#
+
+
+using Metalhead
+m50 = Metalhead.ResNet(50)  # 100MB
+m50pre = m50 |> pre  # 200MB
+
+
+# First run
+
+julia> @time m50(randn(Float32, 100,100,3,32)) |> size
+  5.543590 seconds (6.11 M allocations: 1.963 GiB, 14.14% gc time, 96.22% compilation time)
+(1000, 32)
+
+julia> @time m50pre(randn(Float32, 100,100,3,32)) |> size
+ 16.098089 seconds (15.84 M allocations: 2.576 GiB, 62.26% gc time, 69.06% compilation time)
+(1000, 32)
+
+# Later
+
+julia> @time m50(randn(Float32, 100,100,3,32)) |> size
+ 11.541100 seconds (4.40 k allocations: 1.570 GiB, 85.73% gc time)
+(1000, 32)
+
+julia> @time m50pre(randn(Float32, 100,100,3,32)) |> size
+  4.664626 seconds (4.09 k allocations: 381.454 MiB, 61.15% gc time)
+(1000, 32)
+
+
+m50pre  # now 1.340 GiB
+
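
Editorial aside, not part of this commit: the new test only compares the forward passes of m5 and m6. Once a Conv rrule along the lines sketched earlier exists, a gradient check in the spirit of the g2 = gradient(...) line already in this file could follow; the snippet below is hypothetical and uses only names defined above.

# Hypothetical gradient check for the conv chain; m6's backward needs a working
# rrule for _convcall! before this can pass.
g5 = gradient(x -> sum(abs2, m5(x)), x4)[1]
g6 = gradient(x -> sum(abs2, m6(x)), x4)[1]
@test g5 ≈ g6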
