diff --git a/Manifest.toml b/Manifest.toml
index cdcae7803..7649ea370 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -2,7 +2,7 @@
julia_version = "1.11.5"
manifest_format = "2.0"
-project_hash = "93e3f90921e5771e56e7b8d21131b1107faa4765"
+project_hash = "5ea45087374f193fc9b541533d3ffc8025f44bcd"
[[deps.ADTypes]]
git-tree-sha1 = "e2478490447631aedba0823d4d7a80b2cc8cdb32"
@@ -127,17 +127,14 @@ weakdeps = ["Libtask"]
AdvancedPSLibtaskExt = "Libtask"
[[deps.AdvancedVI]]
-deps = ["ADTypes", "Bijectors", "DiffResults", "Distributions", "DistributionsAD", "DocStringExtensions", "ForwardDiff", "LinearAlgebra", "ProgressMeter", "Random", "Requires", "StatsBase", "StatsFuns", "Tracker"]
-git-tree-sha1 = "e913c2794cf9d2db9d36d28f67abd27595101f49"
+deps = ["ADTypes", "Accessors", "DiffResults", "DifferentiationInterface", "Distributions", "DocStringExtensions", "FillArrays", "Functors", "LinearAlgebra", "LogDensityProblems", "Optimisers", "ProgressMeter", "Random", "StatsBase"]
+git-tree-sha1 = "59c9723a71ed815eafec430d4cafa592b5889b96"
uuid = "b5ca4192-6429-45e5-a2d9-87aec30a685c"
-version = "0.2.12"
-weakdeps = ["Enzyme", "Flux", "ReverseDiff", "Zygote"]
+version = "0.4.1"
+weakdeps = ["Bijectors"]
[deps.AdvancedVI.extensions]
- AdvancedVIEnzymeExt = ["Enzyme"]
- AdvancedVIFluxExt = ["Flux"]
- AdvancedVIReverseDiffExt = ["ReverseDiff"]
- AdvancedVIZygoteExt = ["Zygote"]
+ AdvancedVIBijectorsExt = "Bijectors"
[[deps.AliasTables]]
deps = ["PtrArrays", "Random"]
@@ -747,9 +744,9 @@ version = "6.176.0"
[[deps.DiffEqCallbacks]]
deps = ["ConcreteStructs", "DataStructures", "DiffEqBase", "DifferentiationInterface", "Functors", "LinearAlgebra", "Markdown", "RecipesBase", "RecursiveArrayTools", "SciMLBase", "StaticArraysCore"]
-git-tree-sha1 = "b1f970a2873a2cf76ce35fb0ed2b755a11b31052"
+git-tree-sha1 = "76292e889472e810d40a844b714743c0ffb1c53b"
uuid = "459566f4-90b8-5000-8ac3-15dfb0a30def"
-version = "4.1.0"
+version = "4.6.0"
[[deps.DiffEqNoiseProcess]]
deps = ["DiffEqBase", "Distributions", "GPUArraysCore", "LinearAlgebra", "Markdown", "Optim", "PoissonRandom", "QuadGK", "Random", "Random123", "RandomNumbers", "RecipesBase", "RecursiveArrayTools", "ResettableStacks", "SciMLBase", "StaticArraysCore", "Statistics"]
@@ -773,12 +770,6 @@ git-tree-sha1 = "23163d55f885173722d1e4cf0f6110cdbaf7e272"
uuid = "b552c78f-8df3-52c6-915a-8e097449b14b"
version = "1.15.1"
-[[deps.DiffTests]]
-deps = ["LinearAlgebra", "SparseArrays", "Statistics"]
-git-tree-sha1 = "b92beb1933df01bf4915d3a05e54c2a0aad312c7"
-uuid = "de460e47-3fe3-5279-bb4a-814414816d5d"
-version = "0.1.2"
-
[[deps.DifferentialEquations]]
deps = ["BoundaryValueDiffEq", "DelayDiffEq", "DiffEqBase", "DiffEqCallbacks", "DiffEqNoiseProcess", "JumpProcesses", "LinearAlgebra", "LinearSolve", "NonlinearSolve", "OrdinaryDiffEq", "Random", "RecursiveArrayTools", "Reexport", "SciMLBase", "SteadyStateDiffEq", "StochasticDiffEq", "Sundials"]
git-tree-sha1 = "afdc7dfee475828b4f0286d63ffe66b97d7a3fa7"
@@ -1154,10 +1145,10 @@ uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
version = "0.8.5"
[[deps.Flux]]
-deps = ["Adapt", "ChainRulesCore", "Compat", "Functors", "LinearAlgebra", "MLDataDevices", "MLUtils", "MacroTools", "NNlib", "OneHotArrays", "Optimisers", "Preferences", "ProgressLogging", "Random", "Reexport", "Setfield", "SparseArrays", "SpecialFunctions", "Statistics", "Zygote"]
-git-tree-sha1 = "df520a0727f843576801a0294f5be1a94be28e23"
+deps = ["Adapt", "ChainRulesCore", "Compat", "EnzymeCore", "Functors", "LinearAlgebra", "MLCore", "MLDataDevices", "MLUtils", "MacroTools", "NNlib", "OneHotArrays", "Optimisers", "Preferences", "ProgressLogging", "Random", "Reexport", "Setfield", "SparseArrays", "SpecialFunctions", "Statistics", "Zygote"]
+git-tree-sha1 = "2c35003ec8dafabdc48549102208b1b15552cb33"
uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c"
-version = "0.14.25"
+version = "0.16.4"
[deps.Flux.extensions]
FluxAMDGPUExt = "AMDGPU"
@@ -1226,10 +1217,10 @@ uuid = "77dc65aa-8811-40c2-897b-53d922fa7daf"
version = "0.1.3"
[[deps.Functors]]
-deps = ["LinearAlgebra"]
-git-tree-sha1 = "64d8e93700c7a3f28f717d265382d52fac9fa1c1"
+deps = ["Compat", "ConstructionBase", "LinearAlgebra", "Random"]
+git-tree-sha1 = "60a0339f28a233601cb74468032b5c302d5067de"
uuid = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
-version = "0.4.12"
+version = "0.5.2"
[[deps.Future]]
deps = ["Random"]
@@ -1358,16 +1349,6 @@ weakdeps = ["Distributions"]
[deps.HiddenMarkovModels.extensions]
HiddenMarkovModelsDistributionsExt = "Distributions"
-[[deps.Hwloc]]
-deps = ["CEnum", "Hwloc_jll", "Printf"]
-git-tree-sha1 = "6a3d80f31ff87bc94ab22a7b8ec2f263f9a6a583"
-uuid = "0e44f5e4-bd66-52a0-8798-143a42290a1d"
-version = "3.3.0"
-weakdeps = ["AbstractTrees"]
-
- [deps.Hwloc.extensions]
- HwlocTrees = "AbstractTrees"
-
[[deps.Hwloc_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl"]
git-tree-sha1 = "92f65c4d78ce8cdbb6b68daf88889950b0a99d11"
@@ -1513,10 +1494,10 @@ uuid = "b14d175d-62b4-44ba-8fb7-3064adc8c3ec"
version = "0.2.4"
[[deps.JumpProcesses]]
-deps = ["ArrayInterface", "DataStructures", "DiffEqBase", "DocStringExtensions", "FunctionWrappers", "Graphs", "LinearAlgebra", "Markdown", "PoissonRandom", "Random", "RandomNumbers", "RecursiveArrayTools", "Reexport", "SciMLBase", "Setfield", "StaticArrays", "SymbolicIndexingInterface", "UnPack"]
-git-tree-sha1 = "f2bdec5b4580414aee3178c8caa6e46c344c0bbc"
+deps = ["ArrayInterface", "DataStructures", "DiffEqBase", "DiffEqCallbacks", "DocStringExtensions", "FunctionWrappers", "Graphs", "LinearAlgebra", "Markdown", "PoissonRandom", "Random", "RandomNumbers", "RecursiveArrayTools", "Reexport", "SciMLBase", "Setfield", "StaticArrays", "SymbolicIndexingInterface", "UnPack"]
+git-tree-sha1 = "fb7fd516de38db80f50fe15e57d44da2836365e7"
uuid = "ccbc3e58-028d-4f4c-8cd5-9ae44345cda5"
-version = "9.14.3"
+version = "9.16.0"
weakdeps = ["FastBroadcast"]
[[deps.KernelAbstractions]]
@@ -1885,10 +1866,10 @@ uuid = "e6f89c97-d47a-5376-807f-9c37f3926c36"
version = "1.1.0"
[[deps.Lux]]
-deps = ["ADTypes", "Adapt", "ArgCheck", "ArrayInterface", "ChainRulesCore", "Compat", "ConcreteStructs", "DispatchDoctor", "EnzymeCore", "FastClosures", "ForwardDiff", "Functors", "GPUArraysCore", "LinearAlgebra", "LuxCore", "LuxLib", "MLDataDevices", "MacroTools", "Markdown", "NNlib", "Optimisers", "Preferences", "Random", "Reexport", "SIMDTypes", "Setfield", "Static", "StaticArraysCore", "Statistics", "WeightInitializers"]
-git-tree-sha1 = "39cc335d2601d4b0d74b209f718e7e3641771448"
+deps = ["ADTypes", "Adapt", "ArgCheck", "ArrayInterface", "ChainRulesCore", "Compat", "ConcreteStructs", "DiffResults", "DispatchDoctor", "EnzymeCore", "FastClosures", "ForwardDiff", "Functors", "GPUArraysCore", "LinearAlgebra", "LuxCore", "LuxLib", "MLDataDevices", "MacroTools", "Markdown", "NNlib", "Optimisers", "Preferences", "Random", "Reexport", "SIMDTypes", "Setfield", "Static", "StaticArraysCore", "Statistics", "WeightInitializers"]
+git-tree-sha1 = "4bf87816440bec8e64b610de8724c04a58e65928"
uuid = "b2108857-7c20-44ae-9111-449ecde12c47"
-version = "1.2.3"
+version = "1.13.4"
[deps.Lux.extensions]
LuxComponentArraysExt = "ComponentArrays"
@@ -1898,7 +1879,7 @@ version = "1.2.3"
LuxMLUtilsExt = "MLUtils"
LuxMPIExt = "MPI"
LuxMPINCCLExt = ["CUDA", "MPI", "NCCL"]
- LuxReactantExt = ["Enzyme", "Reactant"]
+ LuxReactantExt = ["Enzyme", "Reactant", "ReactantCore"]
LuxReverseDiffExt = ["FunctionWrappers", "ReverseDiff"]
LuxSimpleChainsExt = "SimpleChains"
LuxTrackerExt = "Tracker"
@@ -1915,6 +1896,7 @@ version = "1.2.3"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
NCCL = "3fe64909-d7a1-4096-9b7d-7a0f12cf0f6b"
Reactant = "3c362404-f566-11ee-1572-e11a4b42c853"
+ ReactantCore = "a3311ec8-5e00-46d5-b541-4f83e724a433"
ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
SimpleChains = "de6bee2f-e2f4-4ec7-b6ed-219cc6f6e9e5"
Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
@@ -1922,9 +1904,9 @@ version = "1.2.3"
[[deps.LuxCore]]
deps = ["Compat", "DispatchDoctor", "Random"]
-git-tree-sha1 = "9f71aea19be890e429ee21adfeda1e63d01ec610"
+git-tree-sha1 = "48d45736aae190b1b41c1db5242fad955f0eff1d"
uuid = "bb33d45b-7691-41d6-9220-0943567d0623"
-version = "1.1.0"
+version = "1.2.6"
[deps.LuxCore.extensions]
LuxCoreArrayInterfaceReverseDiffExt = ["ArrayInterface", "ReverseDiff"]
@@ -1932,11 +1914,12 @@ version = "1.1.0"
LuxCoreChainRulesCoreExt = "ChainRulesCore"
LuxCoreEnzymeCoreExt = "EnzymeCore"
LuxCoreFunctorsExt = "Functors"
- LuxCoreMLDataDevicesExt = "MLDataDevices"
+ LuxCoreMLDataDevicesExt = ["Adapt", "MLDataDevices"]
LuxCoreReactantExt = "Reactant"
LuxCoreSetfieldExt = "Setfield"
[deps.LuxCore.weakdeps]
+ Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
@@ -1948,10 +1931,10 @@ version = "1.1.0"
Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
[[deps.LuxLib]]
-deps = ["ArrayInterface", "ChainRulesCore", "Compat", "CpuId", "DispatchDoctor", "EnzymeCore", "FastClosures", "ForwardDiff", "Hwloc", "KernelAbstractions", "LinearAlgebra", "LuxCore", "MLDataDevices", "Markdown", "NNlib", "Polyester", "Preferences", "Random", "Reexport", "Static", "StaticArraysCore", "Statistics"]
-git-tree-sha1 = "a9e0f1477ac91623079f7a230f4de46170dc8817"
+deps = ["ArrayInterface", "CPUSummary", "ChainRulesCore", "Compat", "DispatchDoctor", "EnzymeCore", "FastClosures", "ForwardDiff", "Functors", "KernelAbstractions", "LinearAlgebra", "LuxCore", "MLDataDevices", "Markdown", "NNlib", "Polyester", "Preferences", "Random", "Reexport", "Static", "StaticArraysCore", "Statistics"]
+git-tree-sha1 = "b06b83e48cfbfc2ea25ec673537e7eb7e69879e9"
uuid = "82251201-b29d-42c6-8e01-566dec8acb11"
-version = "1.3.7"
+version = "1.8.0"
[deps.LuxLib.extensions]
LuxLibAppleAccelerateExt = "AppleAccelerate"
@@ -1961,6 +1944,7 @@ version = "1.3.7"
LuxLibLoopVectorizationExt = "LoopVectorization"
LuxLibMKLExt = "MKL"
LuxLibOctavianExt = ["Octavian", "LoopVectorization"]
+ LuxLibReactantExt = "Reactant"
LuxLibReverseDiffExt = "ReverseDiff"
LuxLibSLEEFPiratesExt = "SLEEFPirates"
LuxLibTrackerAMDGPUExt = ["AMDGPU", "Tracker"]
@@ -1976,6 +1960,7 @@ version = "1.3.7"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
MKL = "33e6dc65-8f57-5167-99aa-e5a354878fb2"
Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
+ Reactant = "3c362404-f566-11ee-1572-e11a4b42c853"
ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
SLEEFPirates = "476501e8-09a2-5ece-8869-fb82de89a1fa"
Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
@@ -2007,15 +1992,16 @@ version = "1.0.0"
[[deps.MLDataDevices]]
deps = ["Adapt", "Compat", "Functors", "Preferences", "Random"]
-git-tree-sha1 = "85b47bc5a8bf0c886286638585df3bec7c9f8269"
+git-tree-sha1 = "a47f08b67298dee5778cf279a9a735ca2d11a890"
uuid = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40"
-version = "1.5.3"
+version = "1.10.0"
[deps.MLDataDevices.extensions]
MLDataDevicesAMDGPUExt = "AMDGPU"
MLDataDevicesCUDAExt = "CUDA"
MLDataDevicesChainRulesCoreExt = "ChainRulesCore"
MLDataDevicesChainRulesExt = "ChainRules"
+ MLDataDevicesComponentArraysExt = "ComponentArrays"
MLDataDevicesFillArraysExt = "FillArrays"
MLDataDevicesGPUArraysExt = "GPUArrays"
MLDataDevicesMLUtilsExt = "MLUtils"
@@ -2035,6 +2021,7 @@ version = "1.5.3"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
ChainRules = "082447d4-558c-5d27-93f4-14fc19e9eca2"
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
+ ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
@@ -2204,14 +2191,15 @@ uuid = "78c3b35d-d492-501b-9361-3d52fe80e533"
version = "0.8.1"
[[deps.Mooncake]]
-deps = ["ADTypes", "ChainRules", "ChainRulesCore", "DiffRules", "DiffTests", "ExprTools", "FunctionWrappers", "GPUArraysCore", "Graphs", "InteractiveUtils", "LinearAlgebra", "MistyClosures", "Random", "Setfield", "Test"]
-git-tree-sha1 = "68c31b62f4957a76aaf379092d13475a62f7658b"
+deps = ["ADTypes", "ChainRules", "ChainRulesCore", "DiffRules", "ExprTools", "FunctionWrappers", "GPUArraysCore", "Graphs", "InteractiveUtils", "LinearAlgebra", "MistyClosures", "Random", "Test"]
+git-tree-sha1 = "e4811fc54aa752d4a9a5cc9bed5cd85b8ab75db0"
uuid = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6"
-version = "0.4.104"
+version = "0.4.122"
[deps.Mooncake.extensions]
MooncakeAllocCheckExt = "AllocCheck"
MooncakeCUDAExt = "CUDA"
+ MooncakeFluxExt = "Flux"
MooncakeJETExt = "JET"
MooncakeLuxLibExt = "LuxLib"
MooncakeLuxLibSLEEFPiratesExtension = ["LuxLib", "SLEEFPirates"]
@@ -2221,6 +2209,7 @@ version = "0.4.104"
[deps.Mooncake.weakdeps]
AllocCheck = "9b6a8646-10ed-4001-bbdc-1d2f46dfbb1a"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+ Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b"
LuxLib = "82251201-b29d-42c6-8e01-566dec8acb11"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
@@ -2488,10 +2477,20 @@ version = "1.13.1"
MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee"
[[deps.Optimisers]]
-deps = ["ChainRulesCore", "Functors", "LinearAlgebra", "Random", "Statistics"]
-git-tree-sha1 = "c9ff5c686240c31eb8570b662dd1f66f4b183116"
+deps = ["ChainRulesCore", "ConstructionBase", "Functors", "LinearAlgebra", "Random", "Statistics"]
+git-tree-sha1 = "131dc319e7c58317e8c6d5170440f6bdaee0a959"
uuid = "3bd65402-5787-11e9-1adc-39752487f4e2"
-version = "0.3.4"
+version = "0.4.6"
+
+ [deps.Optimisers.extensions]
+ OptimisersAdaptExt = ["Adapt"]
+ OptimisersEnzymeCoreExt = "EnzymeCore"
+ OptimisersReactantExt = "Reactant"
+
+ [deps.Optimisers.weakdeps]
+ Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
+ EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
+ Reactant = "3c362404-f566-11ee-1572-e11a4b42c853"
[[deps.Optimization]]
deps = ["ADTypes", "ArrayInterface", "ConsoleProgressMonitor", "DocStringExtensions", "LBFGSB", "LinearAlgebra", "Logging", "LoggingExtras", "OptimizationBase", "Printf", "ProgressLogging", "Reexport", "SciMLBase", "SparseArrays", "TerminalLoggers"]
@@ -3670,9 +3669,9 @@ version = "1.6.0"
[[deps.Turing]]
deps = ["ADTypes", "AbstractMCMC", "Accessors", "AdvancedHMC", "AdvancedMH", "AdvancedPS", "AdvancedVI", "BangBang", "Bijectors", "Compat", "DataStructures", "Distributions", "DistributionsAD", "DocStringExtensions", "DynamicPPL", "EllipticalSliceSampling", "ForwardDiff", "Libtask", "LinearAlgebra", "LogDensityProblems", "MCMCChains", "NamedArrays", "Optimization", "OptimizationOptimJL", "OrderedCollections", "Printf", "Random", "Reexport", "SciMLBase", "SpecialFunctions", "Statistics", "StatsAPI", "StatsBase", "StatsFuns"]
-git-tree-sha1 = "e306992e20e271c9834f65fc41972c7f3877c112"
+git-tree-sha1 = "282ca358181f585fbb271eb9301e16b6fe5c80e0"
uuid = "fce5fe82-541a-59a6-adf8-730c64b5f9a0"
-version = "0.38.6"
+version = "0.39.1"
weakdeps = ["DynamicHMC", "Optim"]
[deps.Turing.extensions]
diff --git a/Project.toml b/Project.toml
index 6c1f69ec4..103705a87 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,6 +5,7 @@ AbstractMCMC = "80f14c24-f653-4e6a-9b94-39d6b0f70001"
AbstractPPL = "7a57a42e-76ec-4ea3-a279-07e840d6d9cf"
AdvancedHMC = "0bf59076-c3b1-5ca4-86bd-e02cd72cde3d"
AdvancedMH = "5b7e9947-ddc0-4b3f-9b55-0d8042f74170"
+AdvancedVI = "b5ca4192-6429-45e5-a2d9-87aec30a685c"
Bijectors = "76274a88-744f-5084-9051-94815aaf08c4"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
@@ -13,6 +14,7 @@ DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
DifferentialEquations = "0c46a032-eb83-5123-abaf-570d42b7fbaa"
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
+DistributionsAD = "ced4e74d-a319-5a8a-b0ac-84af2272839c"
DynamicHMC = "bbc10e6e-7c05-544b-b16e-64fede858acb"
DynamicPPL = "366bfd00-2699-11ea-058f-f148b4cae6d8"
FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
@@ -36,6 +38,7 @@ Memoization = "6fafb56a-5788-4b4e-91ca-c0cea6611c73"
MicroCanonicalHMC = "234d2aa0-2291-45f7-9047-6fa6f316b0a8"
Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
+Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
OptimizationNLopt = "4e6fcdb7-1186-4e1f-a706-475e75c168bb"
OptimizationOptimJL = "36348300-93cb-4f02-beb5-3c3902f8871e"
@@ -54,4 +57,4 @@ UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
[compat]
-Turing = "0.38"
+Turing = "0.39"
diff --git a/_quarto.yml b/_quarto.yml
index 90f198ca8..70ee04bce 100644
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -32,7 +32,7 @@ website:
text: Team
right:
# Current version
- - text: "v0.38"
+ - text: "v0.39"
menu:
- text: Changelog
href: https://turinglang.org/docs/changelog.html
diff --git a/developers/compiler/minituring-contexts/index.qmd b/developers/compiler/minituring-contexts/index.qmd
index 6468483ef..4fcdda02b 100755
--- a/developers/compiler/minituring-contexts/index.qmd
+++ b/developers/compiler/minituring-contexts/index.qmd
@@ -294,7 +294,7 @@ Of course, using an MCMC algorithm to sample from the prior is unnecessary and s
The use of contexts also goes far beyond just evaluating log probabilities and sampling. Some examples from Turing are
* `FixedContext`, which fixes some variables to given values and removes them completely from the evaluation of any log probabilities. They power the `Turing.fix` and `Turing.unfix` functions.
-* `ConditionContext` conditions the model on fixed values for some parameters. They are used by `Turing.condition` and `Turing.uncondition`, i.e. the `model | (parameter=value,)` syntax. The difference between `fix` and `condition` is whether the log probability for the corresponding variable is included in the overall log density.
+* `ConditionContext` conditions the model on fixed values for some parameters. They are used by `Turing.condition` and `Turing.decondition`, i.e. the `model | (parameter=value,)` syntax. The difference between `fix` and `condition` is whether the log probability for the corresponding variable is included in the overall log density.
* `PriorExtractorContext` collects information about what the prior distribution of each variable is.
* `PrefixContext` adds prefixes to variable names, allowing models to be used within other models without variable name collisions.
diff --git a/developers/compiler/model-manual/index.qmd b/developers/compiler/model-manual/index.qmd
index cdf3fa829..4571254c8 100755
--- a/developers/compiler/model-manual/index.qmd
+++ b/developers/compiler/model-manual/index.qmd
@@ -36,18 +36,18 @@ using DynamicPPL
function gdemo2(model, varinfo, context, x)
# Assume s² has an InverseGamma distribution.
s², varinfo = DynamicPPL.tilde_assume!!(
- context, InverseGamma(2, 3), Turing.@varname(s²), varinfo
+ context, InverseGamma(2, 3), @varname(s²), varinfo
)
# Assume m has a Normal distribution.
m, varinfo = DynamicPPL.tilde_assume!!(
- context, Normal(0, sqrt(s²)), Turing.@varname(m), varinfo
+ context, Normal(0, sqrt(s²)), @varname(m), varinfo
)
# Observe each value of x[i] according to a Normal distribution.
for i in eachindex(x)
_retval, varinfo = DynamicPPL.tilde_observe!!(
- context, Normal(m, sqrt(s²)), x[i], Turing.@varname(x[i]), varinfo
+ context, Normal(m, sqrt(s²)), x[i], @varname(x[i]), varinfo
)
end
@@ -55,7 +55,7 @@ function gdemo2(model, varinfo, context, x)
# value and the updated varinfo.
return nothing, varinfo
end
-gdemo2(x) = Turing.Model(gdemo2, (; x))
+gdemo2(x) = DynamicPPL.Model(gdemo2, (; x))
# Instantiate a Model object with our data variables.
model2 = gdemo2([1.5, 2.0])
diff --git a/developers/inference/implementing-samplers/index.qmd b/developers/inference/implementing-samplers/index.qmd
index 9d69fbb80..1d8aa3d6c 100644
--- a/developers/inference/implementing-samplers/index.qmd
+++ b/developers/inference/implementing-samplers/index.qmd
@@ -403,11 +403,11 @@ As we promised, all of this hassle of implementing our `MALA` sampler in a way t
It also enables use with Turing.jl through the `externalsampler`, but we need to do one final thing first: we need to tell Turing.jl how to extract a vector of parameters from the "sample" returned in our implementation of `AbstractMCMC.step`. In our case, the "sample" is a `MALASample`, so we just need the following line:
```{julia}
-# Load Turing.jl.
using Turing
+using DynamicPPL
# Overload the `getparams` method for our "sample" type, which is just a vector.
-Turing.Inference.getparams(::Turing.Model, sample::MALASample) = sample.x
+Turing.Inference.getparams(::DynamicPPL.Model, sample::MALASample) = sample.x
```
And with that, we're good to go!
diff --git a/tutorials/bayesian-time-series-analysis/index.qmd b/tutorials/bayesian-time-series-analysis/index.qmd
index 33dcb9962..0cdf201d6 100755
--- a/tutorials/bayesian-time-series-analysis/index.qmd
+++ b/tutorials/bayesian-time-series-analysis/index.qmd
@@ -165,6 +165,8 @@ scatter!(t, yf; color=2, label="Data")
With the model specified and with a reasonable prior we can now let Turing decompose the time series for us!
```{julia}
+using MCMCChains: get_sections
+
function mean_ribbon(samples)
qs = quantile(samples)
low = qs[:, Symbol("2.5%")]
@@ -174,7 +176,7 @@ function mean_ribbon(samples)
end
function get_decomposition(model, x, cyclic_features, chain, op)
- chain_params = Turing.MCMCChains.get_sections(chain, :parameters)
+ chain_params = get_sections(chain, :parameters)
return returned(model(x, cyclic_features, op), chain_params)
end
diff --git a/tutorials/gaussian-mixture-models/index.qmd b/tutorials/gaussian-mixture-models/index.qmd
index b93bb6f3e..36449824c 100755
--- a/tutorials/gaussian-mixture-models/index.qmd
+++ b/tutorials/gaussian-mixture-models/index.qmd
@@ -278,7 +278,7 @@ $$
$$
Where we sum the components with `logsumexp` from the [`LogExpFunctions.jl` package](https://juliastats.org/LogExpFunctions.jl/stable/).
-The manually incremented likelihood can be added to the log-probability with `Turing.@addlogprob!`, giving us the following model:
+The manually incremented likelihood can be added to the log-probability with `@addlogprob!`, giving us the following model:
```{julia}
#| output: false
@@ -295,7 +295,7 @@ using LogExpFunctions
for k in 1:K
lvec[k] = (w[k] + logpdf(dists[k], x[:, i]))
end
- Turing.@addlogprob! logsumexp(lvec)
+ @addlogprob! logsumexp(lvec)
end
end
```
@@ -303,10 +303,10 @@ end
::: {.callout-warning collapse="false"}
-## Manually Incrementing Probablity
+## Manually Incrementing Probability
-When possible, use of `Turing.@addlogprob!` should be avoided, as it exists outside the
+When possible, use of `@addlogprob!` should be avoided, as it exists outside the
usual structure of a Turing model. In most cases, a custom distribution should be used instead.
-Here, the next section demonstrates the perfered method --- using the `MixtureModel` distribution we have seen already to
+The next section demonstrates the preferred method --- using the `MixtureModel` distribution we have seen already to
perform the marginalization automatically.
:::
diff --git a/tutorials/hidden-markov-models/index.qmd b/tutorials/hidden-markov-models/index.qmd
index 7508299d9..9448b5794 100755
--- a/tutorials/hidden-markov-models/index.qmd
+++ b/tutorials/hidden-markov-models/index.qmd
@@ -191,7 +191,7 @@ using LogExpFunctions
T ~ filldist(Dirichlet(fill(1/K, K)), K)
hmm = HMM(softmax(ones(K)), copy(T'), [Normal(m[i], 0.1) for i in 1:K])
- Turing.@addlogprob! logdensityof(hmm, y)
+ @addlogprob! logdensityof(hmm, y)
end
chn2 = sample(BayesHmm2(y, 3), NUTS(), 1000)
@@ -221,7 +221,7 @@ We can use the `viterbi()` algorithm, also from the `HiddenMarkovModels` package
T ~ filldist(Dirichlet(fill(1/K, K)), K)
hmm = HMM(softmax(ones(K)), copy(T'), [Normal(m[i], 0.1) for i in 1:K])
- Turing.@addlogprob! logdensityof(hmm, y)
+ @addlogprob! logdensityof(hmm, y)
# Conditional generation of the hidden states.
if IncludeGenerated
diff --git a/tutorials/variational-inference/index.qmd b/tutorials/variational-inference/index.qmd
index f25808534..1e6da47c1 100755
--- a/tutorials/variational-inference/index.qmd
+++ b/tutorials/variational-inference/index.qmd
@@ -12,260 +12,51 @@ using Pkg;
Pkg.instantiate();
```
-In this post we'll have a look at what's know as **variational inference (VI)**, a family of _approximate_ Bayesian inference methods, and how to use it in Turing.jl as an alternative to other approaches such as MCMC. In particular, we will focus on one of the more standard VI methods called **Automatic Differentation Variational Inference (ADVI)**.
+This post will look at **variational inference (VI)**, an optimization approach to _approximate_ Bayesian inference, and how to use it in Turing.jl as an alternative to other approaches such as MCMC.
+It focuses on the usage of VI in Turing rather than on the principles and theory underlying VI.
+If you are interested in understanding the mathematics, you can check out [our write-up]({{}}) or any of the many other great resources available online.
-Here we will focus on how to use VI in Turing and not much on the theory underlying VI.
-If you are interested in understanding the mathematics you can checkout [our write-up]({{}}) or any other resource online (there a lot of great ones).
-
-Using VI in Turing.jl is very straight forward.
-If `model` denotes a definition of a `Turing.Model`, performing VI is as simple as
+Let's start with a minimal example.
+Consider a `Turing.Model`, which we denote as `model`.
+Approximating the posterior associated with `model` via VI is as simple as
```{julia}
#| eval: false
-m = model(data...) # instantiate model on the data
-q = vi(m, vi_alg) # perform VI on `m` using the VI method `vi_alg`, which returns a `VariationalPosterior`
+m = model(data...) # instantiate model on the data
+q_init = q_fullrank_gaussian(m) # initial variational approximation
+vi(m, q_init, 1000) # perform VI with the default algorithm on `m` for 1000 iterations
```
+Thus, it's no more work than standard MCMC sampling in Turing.
+The default algorithm uses stochastic gradient descent to minimize the (exclusive) KL divergence.
+This is commonly referred to as *automatic differentiation variational inference*[^KTRGB2017], *stochastic gradient VI*[^TL2014], and *black-box variational inference*[^RGB2014] with the reparameterization gradient[^KW2014][^RMW2014][^TL2014].
-Thus it's no more work than standard MCMC sampling in Turing.
-
-To get a bit more into what we can do with `vi`, we'll first have a look at a simple example and then we'll reproduce the [tutorial on Bayesian linear regression]({{}}) using VI instead of MCMC. Finally we'll look at some of the different parameters of `vi` and how you for example can use your own custom variational family.
+To get a bit more into what we can do with VI, let's look at a more concrete example.
+We will reproduce the [tutorial on Bayesian linear regression]({{}}) using VI instead of MCMC.
+After that, we will discuss how to customize the behavior of `vi` for more advanced usage.
-We first import the packages to be used:
+Let's first import the relevant packages:
```{julia}
using Random
using Turing
using Turing: Variational
-using Bijectors: bijector
-using StatsPlots, Measures
+using AdvancedVI
+using Plots
Random.seed!(42);
```
-## Simple example: Normal-Gamma conjugate model
-
-The Normal-(Inverse)Gamma conjugate model is defined by the following generative process
-
-\begin{align}
-s &\sim \mathrm{InverseGamma}(2, 3) \\
-m &\sim \mathcal{N}(0, s) \\
-x_i &\overset{\text{i.i.d.}}{=} \mathcal{N}(m, s), \quad i = 1, \dots, n
-\end{align}
-
-Recall that *conjugate* refers to the fact that we can obtain a closed-form expression for the posterior. Of course one wouldn't use something like variational inference for a conjugate model, but it's useful as a simple demonstration as we can compare the result to the true posterior.
-
-First we generate some synthetic data, define the `Turing.Model` and instantiate the model on the data:
-
-```{julia}
-# generate data
-x = randn(2000);
-```
-
-```{julia}
-@model function model(x)
- s ~ InverseGamma(2, 3)
- m ~ Normal(0.0, sqrt(s))
- for i in 1:length(x)
- x[i] ~ Normal(m, sqrt(s))
- end
-end;
-```
-
-```{julia}
-# Instantiate model
-m = model(x);
-```
-
-Now we'll produce some samples from the posterior using a MCMC method, which in constrast to VI is guaranteed to converge to the *exact* posterior (as the number of samples go to infinity).
-
-We'll produce 10 000 samples with 200 steps used for adaptation and a target acceptance rate of 0.65
-
-If you don't understand what "adaptation" or "target acceptance rate" refers to, all you really need to know is that `NUTS` is known to be one of the most accurate and efficient samplers (when applicable) while requiring little to no hand-tuning to work well.
-
-```{julia}
-#| output: false
-setprogress!(false)
-```
-
-```{julia}
-samples_nuts = sample(m, NUTS(), 10_000);
-```
-
-Now let's try VI. The most important function you need to now about to do VI in Turing is `vi`:
-
-```{julia}
-@doc(Variational.vi)
-```
-
-Additionally, you can pass
-
- - an initial variational posterior `q`, for which we assume there exists a implementation of `update(::typeof(q), θ::AbstractVector)` returning an updated posterior `q` with parameters `θ`.
- - a function mapping $\theta \mapsto q_{\theta}$ (denoted above `getq`) together with initial parameters `θ`. This provides more flexibility in the types of variational families that we can use, and can sometimes be slightly more convenient for quick and rough work.
-
-By default, i.e. when calling `vi(m, advi)`, Turing use a *mean-field* approximation with a multivariate normal as the base-distribution. Mean-field refers to the fact that we assume all the latent variables to be *independent*. This the "standard" ADVI approach; see [Automatic Differentiation Variational Inference (2016)](https://arxiv.org/abs/1603.00788) for more. In Turing, one can obtain such a mean-field approximation by calling `Variational.meanfield(model)` for which there exists an internal implementation for `update`:
-
-```{julia}
-@doc(Variational.meanfield)
-```
-
-Currently the only implementation of `VariationalInference` available is `ADVI`, which is very convenient and applicable as long as your `Model` is differentiable with respect to the *variational parameters*, that is, the parameters of your variational distribution, e.g. mean and variance in the mean-field approximation.
-
-```{julia}
-@doc(Variational.ADVI)
-```
-
-To perform VI on the model `m` using 10 samples for gradient estimation and taking 1000 gradient steps is then as simple as:
-
-```{julia}
-# ADVI
-advi = ADVI(10, 1000)
-q = vi(m, advi);
-```
-
-Unfortunately, for such a small problem Turing's new `NUTS` sampler is *so* efficient now that it's not that much more efficient to use ADVI. So, so very unfortunate...
-
-With that being said, this is not the case in general. For very complex models we'll later find that `ADVI` produces very reasonable results in a much shorter time than `NUTS`.
-
-And one significant advantage of using `vi` is that we can sample from the resulting `q` with ease. In fact, the result of the `vi` call is a `TransformedDistribution` from Bijectors.jl, and it implements the Distributions.jl interface for a `Distribution`:
-
-```{julia}
-q isa MultivariateDistribution
-```
-
-This means that we can call `rand` to sample from the variational posterior `q`
-
-```{julia}
-histogram(rand(q, 1_000)[1, :])
-```
-
-and `logpdf` to compute the log-probability
-
-```{julia}
-logpdf(q, rand(q))
-```
-
-Let's check the first and second moments of the data to see how our approximation compares to the point-estimates form the data:
-
-```{julia}
-var(x), mean(x)
-```
-
-```{julia}
-(mean(rand(q, 1000); dims=2)...,)
-```
-
-```{julia}
-#| echo: false
-let
- v, m = (mean(rand(q, 2000); dims=2)...,)
- @assert isapprox(v, 1.022; atol=0.1) "Mean of s (VI posterior, 1000 samples): $v"
- @assert isapprox(m, -0.027; atol=0.03) "Mean of m (VI posterior, 1000 samples): $m"
-end
-```
-
-That's pretty close! But we're Bayesian so we're not interested in *just* matching the mean.
-Let's instead look the actual density `q`.
-
-For that we need samples:
-
-```{julia}
-samples = rand(q, 10000);
-size(samples)
-```
-
-```{julia}
-p1 = histogram(
- samples[1, :]; bins=100, normed=true, alpha=0.2, color=:blue, label="", ylabel="density"
-)
-density!(samples[1, :]; label="s (ADVI)", color=:blue, linewidth=2)
-density!(samples_nuts, :s; label="s (NUTS)", color=:green, linewidth=2)
-vline!([var(x)]; label="s (data)", color=:black)
-vline!([mean(samples[1, :])]; color=:blue, label="")
-
-p2 = histogram(
- samples[2, :]; bins=100, normed=true, alpha=0.2, color=:blue, label="", ylabel="density"
-)
-density!(samples[2, :]; label="m (ADVI)", color=:blue, linewidth=2)
-density!(samples_nuts, :m; label="m (NUTS)", color=:green, linewidth=2)
-vline!([mean(x)]; color=:black, label="m (data)")
-vline!([mean(samples[2, :])]; color=:blue, label="")
-
-plot(p1, p2; layout=(2, 1), size=(900, 500), legend=true)
-```
-
-For this particular `Model`, we can in fact obtain the posterior of the latent variables in closed form. This allows us to compare both `NUTS` and `ADVI` to the true posterior $p(s, m \mid x_1, \ldots, x_n)$.
-
-*The code below is just work to get the marginals $p(s \mid x_1, \ldots, x_n)$ and $p(m \mid x_1, \ldots, x_n)$. Feel free to skip it.*
-
-```{julia}
-# closed form computation of the Normal-inverse-gamma posterior
-# based on "Conjugate Bayesian analysis of the Gaussian distribution" by Murphy
-function posterior(μ₀::Real, κ₀::Real, α₀::Real, β₀::Real, x::AbstractVector{<:Real})
- # Compute summary statistics
- n = length(x)
- x̄ = mean(x)
- sum_of_squares = sum(xi -> (xi - x̄)^2, x)
-
- # Compute parameters of the posterior
- κₙ = κ₀ + n
- μₙ = (κ₀ * μ₀ + n * x̄) / κₙ
- αₙ = α₀ + n / 2
- βₙ = β₀ + (sum_of_squares + n * κ₀ / κₙ * (x̄ - μ₀)^2) / 2
-
- return μₙ, κₙ, αₙ, βₙ
-end
-μₙ, κₙ, αₙ, βₙ = posterior(0.0, 1.0, 2.0, 3.0, x)
-
-# marginal distribution of σ²
-# cf. Eq. (90) in "Conjugate Bayesian analysis of the Gaussian distribution" by Murphy
-p_σ² = InverseGamma(αₙ, βₙ)
-p_σ²_pdf = z -> pdf(p_σ², z)
-
-# marginal of μ
-# Eq. (91) in "Conjugate Bayesian analysis of the Gaussian distribution" by Murphy
-p_μ = μₙ + sqrt(βₙ / (αₙ * κₙ)) * TDist(2 * αₙ)
-p_μ_pdf = z -> pdf(p_μ, z)
-
-# posterior plots
-p1 = plot()
-histogram!(samples[1, :]; bins=100, normed=true, alpha=0.2, color=:blue, label="")
-density!(samples[1, :]; label="s (ADVI)", color=:blue)
-density!(samples_nuts, :s; label="s (NUTS)", color=:green)
-vline!([mean(samples[1, :])]; linewidth=1.5, color=:blue, label="")
-plot!(range(0.75, 1.35; length=1_001), p_σ²_pdf; label="s (posterior)", color=:red)
-vline!([var(x)]; label="s (data)", linewidth=1.5, color=:black, alpha=0.7)
-xlims!(0.75, 1.35)
-
-p2 = plot()
-histogram!(samples[2, :]; bins=100, normed=true, alpha=0.2, color=:blue, label="")
-density!(samples[2, :]; label="m (ADVI)", color=:blue)
-density!(samples_nuts, :m; label="m (NUTS)", color=:green)
-vline!([mean(samples[2, :])]; linewidth=1.5, color=:blue, label="")
-plot!(range(-0.25, 0.25; length=1_001), p_μ_pdf; label="m (posterior)", color=:red)
-vline!([mean(x)]; label="m (data)", linewidth=1.5, color=:black, alpha=0.7)
-xlims!(-0.25, 0.25)
-
-plot(p1, p2; layout=(2, 1), size=(900, 500))
-```
-
-## Bayesian linear regression example using ADVI
-
-This is simply a duplication of the tutorial on [Bayesian linear regression]({{}}) (much of the code is directly lifted), but now with the addition of an approximate posterior obtained using `ADVI`.
-
+## Bayesian Linear Regression Example
+Let's start by setting up our example.
+We will re-use the [Bayesian linear regression]({{}}) example.
As we'll see, there is really no additional work required to apply variational inference to a more complex `Model`.
-```{julia}
-Random.seed!(1);
-```
-
```{julia}
using FillArrays
using RDatasets
using LinearAlgebra
-```
-```{julia}
# Import the "Default" dataset.
data = RDatasets.dataset("datasets", "mtcars");
@@ -353,167 +144,213 @@ n_obs, n_vars = size(train)
m = linear_regression(train, train_label, n_obs, n_vars);
```
-## Performing VI
+## Basic Usage
+To run VI, we must first choose a *variational family*.
+The most commonly used choice is the mean-field Gaussian family.
+For this, Turing provides functions that automatically construct an initialization corresponding to the model `m`:
+```{julia}
+q_init = q_meanfield_gaussian(m);
+```
-First we define the initial variational distribution, or, equivalently, the family of distributions to consider. We're going to use the same mean-field approximation as Turing will use by default when we call `vi(m, advi)`, which we obtain by calling `Variational.meanfield`. This returns a `TransformedDistribution` with a `TuringDiagMvNormal` as the underlying distribution and the transformation mapping from the reals to the domain of the latent variables.
+`vi` will automatically recognize the variational family through the type of `q_init`.
+Here is the detailed documentation for the constructor:
```{julia}
-q0 = Variational.meanfield(m)
-typeof(q0)
+@doc(Variational.q_meanfield_gaussian)
```
+As we can see, the precise initialization can be customized through the keyword arguments.
+Let's run VI with the default setting:
```{julia}
-advi = ADVI(10, 10_000)
+n_iters = 1000
+q_avg, q_last, info, state = vi(m, q_init, n_iters; show_progress=false);
```
+The default setting uses the `AdvancedVI.RepGradELBO` objective, which corresponds to a variant of what is known as *automatic differentiation VI*[^KTRGB2017] or *stochastic gradient VI*[^TL2014] or *black-box VI*[^RGB2014] with the reparameterization gradient[^KW2014][^RMW2014][^TL2014].
+The default optimizer we use is `AdvancedVI.DoWG`[^KMJ2023] combined with a proximal operator.
+(The use of proximal operators with VI on a location-scale family is discussed in detail by J. Domke[^D2020][^DGG2023] and others[^KOWMG2023].)
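+
+To give a rough sense of what the `RepGradELBO` objective computes at each step, here is a minimal, self-contained sketch of a single reparameterized ELBO estimate for a full-rank Gaussian family. The names `logπ`, `μ`, and `L` are illustrative assumptions, not part of Turing's or AdvancedVI's API, and the actual implementation differs in its details.
+
+```{julia}
+#| eval: false
+using LinearAlgebra, Statistics
+
+# One Monte Carlo estimate of the ELBO for q(z) = N(μ, L L'), using the
+# reparameterization z = μ + L ε with ε ~ N(0, I). Differentiating this
+# estimate with respect to (μ, L) yields the reparameterization gradient.
+function elbo_estimate(logπ, μ::AbstractVector, L::LowerTriangular; n_samples=10)
+    d = length(μ)
+    energy = mean(logπ(μ + L * randn(d)) for _ in 1:n_samples)
+    entropy = logdet(L) + d * (1 + log(2π)) / 2  # closed-form entropy of N(μ, L L')
+    return energy + entropy
+end
+```
+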
+We will take a deeper look into the returned values and the keyword arguments in the following subsections.
+First, here is the full documentation for `vi`:
-Turing also provides a couple of different optimizers:
+```{julia}
+@doc(Variational.vi)
+```
- - `TruncatedADAGrad` (default)
- - `DecayedADAGrad`
- as these are well-suited for problems with high-variance stochastic objectives, which is usually what the ELBO ends up being at different times in our optimization process.
+## Values Returned by `vi`
+The main output of the algorithm is `q_avg`, the average of the parameters generated by the optimization algorithm.
+For computing `q_avg`, the default setting uses what is known as polynomial averaging[^SZ2013].
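+
+As a rough illustration of the idea, here is an exposition-only sketch of one common polynomial-decay averaging rule (the function name, the weight schedule, and the exponent `γ` are assumptions for illustration; AdvancedVI's internal implementation may differ):
+
+```{julia}
+#| eval: false
+# Polynomial-decay averaging in the style of Shamir & Zhang (2013): the weight on
+# the t-th iterate is (γ + 1)/(t + γ), so later iterates count relatively more and
+# early, poorly-optimized iterates are gradually forgotten.
+function polynomial_average(iterates::Vector{<:AbstractVector}; γ=8)
+    θ̄ = zero(first(iterates))
+    for (t, θ) in enumerate(iterates)
+        w = (γ + 1) / (t + γ)
+        θ̄ = (1 - w) .* θ̄ .+ w .* θ
+    end
+    return θ̄
+end
+```
+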
+Usually, `q_avg` will perform better than the last iterate, `q_last`.
+For instance, we can compare the ELBO of the two:
+```{julia}
+@info("Objective of q_avg and q_last",
+ ELBO_q_avg = estimate_objective(AdvancedVI.RepGradELBO(32), q_avg, Turing.Variational.make_logdensity(m)),
+ ELBO_q_last = estimate_objective(AdvancedVI.RepGradELBO(32), q_last, Turing.Variational.make_logdensity(m))
+)
+```
+We can see that `q_avg` attains a slightly higher ELBO.
-With that being said, thanks to Requires.jl, if we add a `using Flux` prior to `using Turing` we can also make use of all the optimizers in `Flux`, e.g. `ADAM`, without any additional changes to your code! For example:
+Now, `info` contains information generated during optimization that could be useful for diagnostics.
+For the default setting, which is `RepGradELBO`, it contains the ELBO estimated at each step, which can be plotted as follows:
```{julia}
-#| eval: false
-using Flux, Turing
-using Turing.Variational
-
-vi(m, advi; optimizer=Flux.ADAM())
+Plots.plot([i.elbo for i in info], xlabel="Iterations", ylabel="ELBO", label="info")
```
+Since the ELBO is estimated with only a small number of samples at each step, the resulting trace appears noisy.
+Furthermore, at each step, the ELBO is evaluated on `q_last`, not `q_avg`, which is the actual output that we care about.
+To obtain more accurate ELBO estimates evaluated on `q_avg`, we have to define a custom callback function.
-just works.
+## Custom Callback Functions
+To inspect the progress of optimization in more detail, one can define a custom callback function.
+For example, the following callback function estimates the ELBO on `q_avg` every 10 steps with a larger number of samples:
-For this problem we'll use the `DecayedADAGrad` from Turing:
+```{julia}
+function callback(; stat, averaged_params, restructure, kwargs...)
+ if mod(stat.iteration, 10) == 1
+ q_avg = restructure(averaged_params)
+ obj = AdvancedVI.RepGradELBO(128)
+ elbo_avg = estimate_objective(obj, q_avg, Turing.Variational.make_logdensity(m))
+ (elbo_avg = elbo_avg,)
+ else
+ nothing
+ end
+end;
+```
+The `NamedTuple` returned by `callback` will be appended to the corresponding entry of `info`, and it will also be displayed on the progress meter if `show_progress` is set to `true`.
+The custom callback can be supplied to `vi` as a keyword argument:
```{julia}
-opt = Variational.DecayedADAGrad(1e-2, 1.1, 0.9)
+q_mf, _, info_mf, _ = vi(m, q_init, n_iters; show_progress=false, callback=callback);
```
+Let's plot the result:
```{julia}
-q = vi(m, advi, q0; optimizer=opt)
-typeof(q)
+iters = 1:10:length(info_mf)
+elbo_mf = [i.elbo_avg for i in info_mf[iters]]
+Plots.plot!(iters, elbo_mf, xlabel="Iterations", ylabel="ELBO", label="callback", ylims=(-200,Inf))
```
+We can see that the ELBO values are less noisy and progress more smoothly due to averaging.
-*Note: as mentioned before, we internally define a `update(q::TransformedDistribution{<:TuringDiagMvNormal}, θ::AbstractVector)` method which takes in the current variational approximation `q` together with new parameters `z` and returns the new variational approximation. This is required so that we can actually update the `Distribution` object after each optimization step.*
+## Using Different Optimisers
+The default optimiser we use is a proximal variant of DoWG[^KMJ2023].
+For Gaussian variational families, this works well as a default option.
+Sometimes, the step size of `AdvancedVI.DoWG` could be too large, resulting in unstable behavior.
+(In this case, we recommend trying `AdvancedVI.DoG`[^IHC2023].)
+Or, for whatever reason, it might be desirable to use a different optimiser.
+Our implementation supports any optimiser that implements the [Optimisers.jl](https://fluxml.ai/Optimisers.jl/stable/) interface.
-*Alternatively, we can instead provide the mapping $\theta \mapsto q_{\theta}$ directly together with initial parameters using the signature `vi(m, advi, getq, θ_init)` as mentioned earlier. We'll see an explicit example of this later on!*
+For instance, let's try using `Optimisers.Adam`[^KB2014], which is a popular choice.
+Since `AdvancedVI` does not implement a proximal operator for `Optimisers.Adam`, we must use the `AdvancedVI.ClipScale()` projection operator, which ensures that the scale matrix of the variational approximation is positive definite.
+(See the paper by J. Domke 2020[^D2020] for more detail about the use of a projection operator.)
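+
+To build some intuition for what this projection does, here is a conceptual sketch (an assumption for illustration, not `AdvancedVI.ClipScale`'s actual implementation): after each optimization step, the diagonal of the lower-triangular scale factor is clamped away from zero so that the implied covariance stays positive definite.
+
+```{julia}
+#| eval: false
+using LinearAlgebra
+
+# Clamp the diagonal entries of a lower-triangular scale factor to be at least ε,
+# so that the implied covariance L L' remains positive definite.
+clip_scale(L::LowerTriangular; ε=1e-5) =
+    LowerTriangular(L - Diagonal(diag(L)) + Diagonal(max.(diag(L), ε)))
+```
+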
+```{julia}
+using Optimisers
-To compute statistics for our approximation we need samples:
+_, _, info_adam, _ = vi(m, q_init, n_iters; show_progress=false, callback=callback, optimizer=Optimisers.Adam(3e-3), operator=ClipScale());
+```
```{julia}
-z = rand(q, 10_000);
+iters = 1:10:length(info_mf)
+elbo_adam = [i.elbo_avg for i in info_adam[iters]]
+Plots.plot(iters, elbo_mf, xlabel="Iterations", ylabel="ELBO", label="DoWG")
+Plots.plot!(iters, elbo_adam, xlabel="Iterations", ylabel="ELBO", label="Adam")
```
+Compared to the default option `AdvancedVI.DoWG()`, we can see that `Optimisers.Adam(3e-3)` converges more slowly.
+With more step-size tuning, `Optimisers.Adam` could perform comparably or even better.
+That is, most common optimisers require some degree of tuning to match `AdvancedVI.DoWG()` or `AdvancedVI.DoG()`, which need little to no tuning at all.
+For this reason, the latter are referred to as parameter-free optimisers.
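+
+If you do want to tune the step size, a simple sweep is often enough. The following sketch (the step-size values are illustrative assumptions, not recommendations) compares the last ELBO estimate logged by the objective for a few choices of `Optimisers.Adam` step size:
+
+```{julia}
+#| eval: false
+# Try a few Adam step sizes and report the last ELBO estimate recorded in `info`.
+for lr in (1e-3, 3e-3, 1e-2)
+    _, _, info_lr, _ = vi(m, q_init, n_iters; show_progress=false,
+                          optimizer=Optimisers.Adam(lr), operator=ClipScale())
+    @info "Adam step size sweep" lr last_elbo=last(info_lr).elbo
+end
+```
+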
-Now we can for example look at the average
-
+## Using Full-Rank Variational Families
+So far, we have only used the mean-field Gaussian family.
+This, however, approximates the posterior covariance with a diagonal matrix.
+To model the full covariance matrix, we can use the *full-rank* Gaussian family[^TL2014][^KTRGB2017]:
```{julia}
-avg = vec(mean(z; dims=2))
+q_init_fr = q_fullrank_gaussian(m);
```
-The vector has the same ordering as the model, e.g. in this case `σ²` has index `1`, `intercept` has index `2` and `coefficients` has indices `3:12`. If you forget or you might want to do something programmatically with the result, you can obtain the `sym → indices` mapping as follows:
-
```{julia}
-_, sym2range = bijector(m, Val(true));
-sym2range
+@doc(Variational.q_fullrank_gaussian)
```
-For example, we can check the sample distribution and mean value of `σ²`:
+The term *full-rank* might seem a bit peculiar, since the covariance matrix of any non-degenerate Gaussian is full-rank.
+This term, however, traditionally comes from the fact that full-rank families use full-rank factors in addition to the diagonal of the covariance.
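+
+To get a rough sense of the difference in size, here is a quick parameter count for this model (assuming, as above, a Gaussian family parameterized by a mean vector together with either a diagonal or a lower-triangular scale factor):
+
+```{julia}
+d = n_vars + 2                            # σ², the intercept, and the coefficients
+n_params_meanfield = 2d                   # d means + d diagonal scales
+n_params_fullrank = d + d * (d + 1) ÷ 2   # d means + a lower-triangular scale factor
+(n_params_meanfield, n_params_fullrank)
+```
+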
+In contrast to the mean-field family, the full-rank family will often result in more computation per optimization step and slower convergence, especially in high dimensions:
```{julia}
-histogram(z[1, :])
-avg[union(sym2range[:σ²]...)]
-```
+q_fr, _, info_fr, _ = vi(m, q_init_fr, n_iters; show_progress=false, callback)
-```{julia}
-avg[union(sym2range[:intercept]...)]
+Plots.plot(elbo_mf, xlabel="Iterations", ylabel="ELBO", label="Mean-Field", ylims=(-200, Inf))
+
+elbo_fr = [i.elbo_avg for i in info_fr[iters]]
+Plots.plot!(elbo_fr, xlabel="Iterations", ylabel="ELBO", label="Full-Rank", ylims=(-200, Inf))
```
+However, we can see that the full-rank family achieves a higher ELBO in the end.
+Due to the relationship between the ELBO and the Kullback-Leibler divergence, this indicates that the full-rank approximation is much more accurate.
+This trade-off between statistical accuracy and optimization speed is often referred to as the *statistical-computational trade-off*.
+The fact that we can control this trade-off through the choice of variational family is a strength, rather than a limitation, of variational inference.
+We can also visualize the covariance matrix.
```{julia}
-avg[union(sym2range[:coefficients]...)]
+heatmap(cov(rand(q_fr, 100_000), dims=2))
```
-*Note: as you can see, this is slightly awkward to work with at the moment. We'll soon add a better way of dealing with this.*
-
-With a bit of work (this will be much easier in the future), we can also visualize the approximate marginals of the different variables, similar to `plot(chain)`:
-
-```{julia}
-function plot_variational_marginals(z, sym2range)
- ps = []
-
- for (i, sym) in enumerate(keys(sym2range))
- indices = union(sym2range[sym]...) # <= array of ranges
- if sum(length.(indices)) > 1
- offset = 1
- for r in indices
- p = density(
- z[r, :];
- title="$(sym)[$offset]",
- titlefontsize=10,
- label="",
- ylabel="Density",
- margin=1.5mm,
- )
- push!(ps, p)
- offset += 1
- end
- else
- p = density(
- z[first(indices), :];
- title="$(sym)",
- titlefontsize=10,
- label="",
- ylabel="Density",
- margin=1.5mm,
- )
- push!(ps, p)
- end
- end
+## Obtaining Summary Statistics
- return plot(ps...; layout=(length(ps), 1), size=(500, 2000), margin=4.0mm)
-end
+Let's inspect the resulting variational approximation in more detail and compare it against MCMC.
+To obtain summary statistics from VI, we can draw samples from the resulting variational approximation:
+
+```{julia}
+z = rand(q_fr, 100_000);
```
+Now, we can, for example, look at expectations:
+
```{julia}
-plot_variational_marginals(z, sym2range)
+avg = vec(mean(z; dims=2))
```
-And let's compare this to using the `NUTS` sampler:
+The vector has the same ordering as the parameters in the model, *e.g.* in this case `σ²` has index `1`, `intercept` has index `2` and `coefficients` has indices `3:12`. If you forget the ordering, or want to work with the result programmatically, you can obtain the `sym → indices` mapping as follows:
```{julia}
-chain = sample(m, NUTS(), 10_000);
+using Bijectors: bijector
+
+_, sym2range = bijector(m, Val(true));
+sym2range
```
+For example, we can check the sample distribution and mean value of `σ²`:
+
```{julia}
-plot(chain; margin=12.00mm)
+histogram(z[1, :])
+avg[union(sym2range[:σ²]...)]
```
```{julia}
-vi_mean = vec(mean(z; dims=2))[[
- union(sym2range[:coefficients]...)...,
- union(sym2range[:intercept]...)...,
- union(sym2range[:σ²]...)...,
-]]
+avg[union(sym2range[:intercept]...)]
```
```{julia}
-mcmc_mean = mean(chain, names(chain, :parameters))[:, 2]
+avg[union(sym2range[:coefficients]...)]
```
+For further convenience, we can wrap the samples into a `Chains` object to summarize the results.
```{julia}
-plot(mcmc_mean; xticks=1:1:length(mcmc_mean), linestyle=:dot, label="NUTS")
-plot!(vi_mean; linestyle=:dot, label="VI")
+varnames = vcat(["σ²", "intercept"], ["coefficients[$i]" for i in 1:n_vars])
+vi_chain = Chains(reshape(z', (size(z,2), size(z,1), 1)), varnames)
```
+(Since we're drawing independent samples, we can simply ignore the ESS and Rhat metrics.)
-One thing we can look at is simply the squared error between the means:
+Let's compare this against samples from `NUTS`:
```{julia}
-sum(abs2, mcmc_mean .- vi_mean)
-```
+mcmc_chain = sample(m, NUTS(), 10_000, drop_warmup=true, progress=false);
+
+vi_mean = mean(vi_chain)[:, 2]
+mcmc_mean = mean(mcmc_chain, names(mcmc_chain, :parameters))[:, 2]
+plot(mcmc_mean; xticks=1:1:length(mcmc_mean), label="mean of NUTS")
+plot!(vi_mean; label="mean of VI")
+```
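+
+If we want a single number summarizing how close the two sets of posterior means are, we can also look at the squared error between them:
+
+```{julia}
+sum(abs2, mcmc_mean .- vi_mean)
+```
+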
-That looks pretty good! But let's see how the predictive distributions looks for the two.
+That looks pretty good! But let's see how the predictive distributions look for the two.
-## Prediction
+## Making Predictions
-Similarily to the linear regression tutorial, we're going to compare to multivariate ordinary linear regression using the `GLM` package:
+Similarly to the linear regression tutorial, we're going to compare to multivariate ordinary linear regression using the `GLM` package:
@@ -533,7 +370,7 @@ test_cut.OLSPrediction = unstandardize(GLM.predict(ols, test_cut), train_unstand
```{julia}
# Make a prediction given an input vector, using mean parameter values from a chain.
-function prediction_chain(chain, x)
+function prediction(chain, x)
p = get_params(chain)
α = mean(p.intercept)
β = collect(mean.(p.coefficients))
@@ -541,21 +378,6 @@ function prediction_chain(chain, x)
end
```
-```{julia}
-# Make a prediction using samples from the variational posterior given an input vector.
-function prediction(samples::AbstractVector, sym2ranges, x)
- α = mean(samples[union(sym2ranges[:intercept]...)])
- β = vec(mean(samples[union(sym2ranges[:coefficients]...)]; dims=2))
- return α .+ x * β
-end
-
-function prediction(samples::AbstractMatrix, sym2ranges, x)
- α = mean(samples[union(sym2ranges[:intercept]...), :])
- β = vec(mean(samples[union(sym2ranges[:coefficients]...), :]; dims=2))
- return α .+ x * β
-end
-```
-
```{julia}
# Unstandardize the dependent variable.
train_cut.MPG = unstandardize(train_cut.MPG, train_unstandardized.MPG)
@@ -568,251 +390,73 @@ first(test_cut, 6)
```
```{julia}
-z = rand(q, 10_000);
+# Construct the Chains from the Variational Approximations
+z_mf = rand(q_mf, 10_000);
+z_fr = rand(q_fr, 10_000);
+
+vi_mf_chain = Chains(reshape(z_mf', (size(z_mf,2), size(z_mf,1), 1)), varnames);
+vi_fr_chain = Chains(reshape(z_fr', (size(z_fr,2), size(z_fr,1), 1)), varnames);
```
```{julia}
-# Calculate the predictions for the training and testing sets using the samples `z` from variational posterior
+# Calculate the predictions for the training and testing sets using samples from the variational approximations
-train_cut.VIPredictions = unstandardize(
- prediction(z, sym2range, train), train_unstandardized.MPG
+train_cut.VIMFPredictions = unstandardize(
+ prediction(vi_mf_chain, train), train_unstandardized.MPG
)
-test_cut.VIPredictions = unstandardize(
- prediction(z, sym2range, test), train_unstandardized.MPG
+test_cut.VIMFPredictions = unstandardize(
+ prediction(vi_mf_chain, test), train_unstandardized.MPG
)
-train_cut.BayesPredictions = unstandardize(
- prediction_chain(chain, train), train_unstandardized.MPG
+train_cut.VIFRPredictions = unstandardize(
+ prediction(vi_fr_chain, train), train_unstandardized.MPG
)
-test_cut.BayesPredictions = unstandardize(
- prediction_chain(chain, test), train_unstandardized.MPG
-);
-```
-
-```{julia}
-vi_loss1 = mean((train_cut.VIPredictions - train_cut.MPG) .^ 2)
-bayes_loss1 = mean((train_cut.BayesPredictions - train_cut.MPG) .^ 2)
-ols_loss1 = mean((train_cut.OLSPrediction - train_cut.MPG) .^ 2)
-
-vi_loss2 = mean((test_cut.VIPredictions - test_cut.MPG) .^ 2)
-bayes_loss2 = mean((test_cut.BayesPredictions - test_cut.MPG) .^ 2)
-ols_loss2 = mean((test_cut.OLSPrediction - test_cut.MPG) .^ 2)
-
-println("Training set:
- VI loss: $vi_loss1
- Bayes loss: $bayes_loss1
- OLS loss: $ols_loss1
-Test set:
- VI loss: $vi_loss2
- Bayes loss: $bayes_loss2
- OLS loss: $ols_loss2")
-```
-
-
-Interestingly the squared difference between true- and mean-prediction on the test-set is actually *better* for the mean-field variational posterior than for the "true" posterior obtained by MCMC sampling using `NUTS`. But, as Bayesians, we know that the mean doesn't tell the entire story. One quick check is to look at the mean predictions ± standard deviation of the two different approaches:
-
-```{julia}
-z = rand(q, 1000);
-preds = mapreduce(hcat, eachcol(z)) do zi
- return unstandardize(prediction(zi, sym2range, test), train_unstandardized.MPG)
-end
-
-scatter(
- 1:size(test, 1),
- mean(preds; dims=2);
- yerr=std(preds; dims=2),
- label="prediction (mean ± std)",
- size=(900, 500),
- markersize=8,
-)
-scatter!(1:size(test, 1), unstandardize(test_label, train_unstandardized.MPG); label="true")
-xaxis!(1:size(test, 1))
-ylims!(10, 40)
-title!("Mean-field ADVI (Normal)")
-```
-
-```{julia}
-preds = mapreduce(hcat, 1:5:size(chain, 1)) do i
- return unstandardize(prediction_chain(chain[i], test), train_unstandardized.MPG)
-end
-
-scatter(
- 1:size(test, 1),
- mean(preds; dims=2);
- yerr=std(preds; dims=2),
- label="prediction (mean ± std)",
- size=(900, 500),
- markersize=8,
+test_cut.VIFRPredictions = unstandardize(
+ prediction(vi_fr_chain, test), train_unstandardized.MPG
)
-scatter!(1:size(test, 1), unstandardize(test_label, train_unstandardized.MPG); label="true")
-xaxis!(1:size(test, 1))
-ylims!(10, 40)
-title!("MCMC (NUTS)")
-```
-
-Indeed we see that the MCMC approach generally provides better uncertainty estimates than the mean-field ADVI approach! Good. So all the work we've done to make MCMC fast isn't for nothing.
-
-## Alternative: provide parameter-to-distribution instead of $q$ with `update` implemented
-
-As mentioned earlier, it's also possible to just provide the mapping $\theta \mapsto q_{\theta}$ rather than the variational family / initial variational posterior `q`, i.e. use the interface `vi(m, advi, getq, θ_init)` where `getq` is the mapping $\theta \mapsto q_{\theta}$
-
-In this section we're going to construct a mean-field approximation to the model by hand using a composition of`Shift` and `Scale` from Bijectors.jl togheter with a standard multivariate Gaussian as the base distribution.
-
-```{julia}
-using Bijectors
-```
-
-```{julia}
-using Bijectors: Scale, Shift
-```
-
-```{julia}
-d = length(q)
-base_dist = Turing.DistributionsAD.TuringDiagMvNormal(zeros(d), ones(d))
-```
-
-`bijector(model::Turing.Model)` is defined by Turing, and will return a `bijector` which takes you from the space of the latent variables to the real space. In this particular case, this is a mapping `((0, ∞) × ℝ × ℝ¹⁰) → ℝ¹²`. We're interested in using a normal distribution as a base-distribution and transform samples to the latent space, thus we need the inverse mapping from the reals to the latent space:
-
-```{julia}
-to_constrained = inverse(bijector(m));
-```
-
-```{julia}
-function getq(θ)
- d = length(θ) ÷ 2
- A = @inbounds θ[1:d]
- b = @inbounds θ[(d + 1):(2 * d)]
-
- b = to_constrained ∘ Shift(b) ∘ Scale(exp.(A))
-
- return transformed(base_dist, b)
-end
-```
-
-```{julia}
-q_mf_normal = vi(m, advi, getq, randn(2 * d));
-```
-
-```{julia}
-p1 = plot_variational_marginals(rand(q_mf_normal, 10_000), sym2range) # MvDiagNormal + Affine transformation + to_constrained
-p2 = plot_variational_marginals(rand(q, 10_000), sym2range) # Turing.meanfield(m)
-plot(p1, p2; layout=(1, 2), size=(800, 2000))
-```
-
-As expected, the fits look pretty much identical.
-
-But using this interface it becomes trivial to go beyond the mean-field assumption we made for the variational posterior, as we'll see in the next section.
-
-### Relaxing the mean-field assumption
-
-Here we'll instead consider the variational family to be a full non-diagonal multivariate Gaussian. As in the previous section we'll implement this by transforming a standard multivariate Gaussian using `Scale` and `Shift`, but now `Scale` will instead be using a lower-triangular matrix (representing the Cholesky of the covariance matrix of a multivariate normal) in contrast to the diagonal matrix we used in for the mean-field approximate posterior.
-
-```{julia}
-# Using `ComponentArrays.jl` together with `UnPack.jl` makes our lives much easier.
-using ComponentArrays, UnPack
-```
-
-```{julia}
-proto_arr = ComponentArray(; L=zeros(d, d), b=zeros(d))
-proto_axes = getaxes(proto_arr)
-num_params = length(proto_arr)
-
-function getq(θ)
- L, b = begin
- @unpack L, b = ComponentArray(θ, proto_axes)
- LowerTriangular(L), b
- end
- # For this to represent a covariance matrix we need to ensure that the diagonal is positive.
- # We can enforce this by zeroing out the diagonal and then adding back the diagonal exponentiated.
- D = Diagonal(diag(L))
- A = L - D + exp(D) # exp for Diagonal is the same as exponentiating only the diagonal entries
-
- b = to_constrained ∘ Shift(b) ∘ Scale(A)
-
- return transformed(base_dist, b)
-end
-```
-
-```{julia}
-advi = ADVI(10, 20_000)
-```
-
-```{julia}
-q_full_normal = vi(
- m, advi, getq, randn(num_params); optimizer=Variational.DecayedADAGrad(1e-2)
-);
-```
-
-Let's have a look at the learned covariance matrix:
-
-```{julia}
-A = q_full_normal.transform.inner.a
-```
-
-```{julia}
-heatmap(cov(A * A'))
-```
-
-```{julia}
-zs = rand(q_full_normal, 10_000);
-```
-
-```{julia}
-p1 = plot_variational_marginals(rand(q_mf_normal, 10_000), sym2range)
-p2 = plot_variational_marginals(rand(q_full_normal, 10_000), sym2range)
-
-plot(p1, p2; layout=(1, 2), size=(800, 2000))
-```
-
-So it seems like the "full" ADVI approach, i.e. no mean-field assumption, obtain the same modes as the mean-field approach but with greater uncertainty for some of the `coefficients`. This
-
-```{julia}
-# Unfortunately, it seems like this has quite a high variance which is likely to be due to numerical instability,
-# so we consider a larger number of samples. If we get a couple of outliers due to numerical issues,
-# these kind affect the mean prediction greatly.
-z = rand(q_full_normal, 10_000);
-```
-
-```{julia}
-train_cut.VIFullPredictions = unstandardize(
- prediction(z, sym2range, train), train_unstandardized.MPG
+train_cut.BayesPredictions = unstandardize(
+ prediction(mcmc_chain, train), train_unstandardized.MPG
)
-test_cut.VIFullPredictions = unstandardize(
- prediction(z, sym2range, test), train_unstandardized.MPG
+test_cut.BayesPredictions = unstandardize(
+ prediction(mcmc_chain, test), train_unstandardized.MPG
);
```
```{julia}
-vi_loss1 = mean((train_cut.VIPredictions - train_cut.MPG) .^ 2)
-vifull_loss1 = mean((train_cut.VIFullPredictions - train_cut.MPG) .^ 2)
+vi_mf_loss1 = mean((train_cut.VIMFPredictions - train_cut.MPG) .^ 2)
+vi_fr_loss1 = mean((train_cut.VIFRPredictions - train_cut.MPG) .^ 2)
bayes_loss1 = mean((train_cut.BayesPredictions - train_cut.MPG) .^ 2)
ols_loss1 = mean((train_cut.OLSPrediction - train_cut.MPG) .^ 2)
-vi_loss2 = mean((test_cut.VIPredictions - test_cut.MPG) .^ 2)
-vifull_loss2 = mean((test_cut.VIFullPredictions - test_cut.MPG) .^ 2)
+vi_mf_loss2 = mean((test_cut.VIMFPredictions - test_cut.MPG) .^ 2)
+vi_fr_loss2 = mean((test_cut.VIFRPredictions - test_cut.MPG) .^ 2)
bayes_loss2 = mean((test_cut.BayesPredictions - test_cut.MPG) .^ 2)
ols_loss2 = mean((test_cut.OLSPrediction - test_cut.MPG) .^ 2)
println("Training set:
- VI loss: $vi_loss1
+ VI Mean-Field loss: $vi_mf_loss1
+ VI Full-Rank loss: $vi_fr_loss1
Bayes loss: $bayes_loss1
OLS loss: $ols_loss1
Test set:
- VI loss: $vi_loss2
+ VI Mean-Field loss: $vi_mf_loss2
+ VI Full-Rank loss: $vi_fr_loss2
Bayes loss: $bayes_loss2
OLS loss: $ols_loss2")
```
+Interestingly, the mean squared error between the true MPG values and the mean predictions on the test set is actually *lower* for the full-rank variational posterior than for the "true" posterior obtained by MCMC sampling with `NUTS`.
+But, as Bayesians, we know that the mean doesn't tell the entire story. One quick check is to look at the mean predictions ± one standard deviation for each of the approaches:
+
```{julia}
-z = rand(q_mf_normal, 1000);
-preds = mapreduce(hcat, eachcol(z)) do zi
- return unstandardize(prediction(zi, sym2range, test), train_unstandardized.MPG)
+preds_vi_mf = mapreduce(hcat, 1:5:size(vi_mf_chain, 1)) do i
+ return unstandardize(prediction(vi_mf_chain[i], test), train_unstandardized.MPG)
end
p1 = scatter(
1:size(test, 1),
- mean(preds; dims=2);
- yerr=std(preds; dims=2),
+ mean(preds_vi_mf; dims=2);
+ yerr=std(preds_vi_mf; dims=2),
label="prediction (mean ± std)",
size=(900, 500),
markersize=8,
@@ -820,19 +464,16 @@ p1 = scatter(
scatter!(1:size(test, 1), unstandardize(test_label, train_unstandardized.MPG); label="true")
xaxis!(1:size(test, 1))
ylims!(10, 40)
-title!("Mean-field ADVI (Normal)")
-```
+title!("VI Mean-Field")
-```{julia}
-z = rand(q_full_normal, 1000);
-preds = mapreduce(hcat, eachcol(z)) do zi
- return unstandardize(prediction(zi, sym2range, test), train_unstandardized.MPG)
+preds_vi_fr = mapreduce(hcat, 1:5:size(vi_fr_chain, 1)) do i
+ return unstandardize(prediction(vi_fr_chain[i], test), train_unstandardized.MPG)
end
p2 = scatter(
1:size(test, 1),
- mean(preds; dims=2);
- yerr=std(preds; dims=2),
+ mean(preds_vi_fr; dims=2);
+ yerr=std(preds_vi_fr; dims=2),
label="prediction (mean ± std)",
size=(900, 500),
markersize=8,
@@ -840,18 +481,16 @@ p2 = scatter(
scatter!(1:size(test, 1), unstandardize(test_label, train_unstandardized.MPG); label="true")
xaxis!(1:size(test, 1))
ylims!(10, 40)
-title!("Full ADVI (Normal)")
-```
+title!("VI Full-Rank")
-```{julia}
-preds = mapreduce(hcat, 1:5:size(chain, 1)) do i
- return unstandardize(prediction_chain(chain[i], test), train_unstandardized.MPG)
+preds_mcmc = mapreduce(hcat, 1:5:size(mcmc_chain, 1)) do i
+ return unstandardize(prediction(mcmc_chain[i], test), train_unstandardized.MPG)
end
p3 = scatter(
1:size(test, 1),
- mean(preds; dims=2);
- yerr=std(preds; dims=2),
+ mean(preds_mcmc; dims=2);
+ yerr=std(preds_mcmc; dims=2),
label="prediction (mean ± std)",
size=(900, 500),
markersize=8,
@@ -860,12 +499,21 @@ scatter!(1:size(test, 1), unstandardize(test_label, train_unstandardized.MPG); l
xaxis!(1:size(test, 1))
ylims!(10, 40)
title!("MCMC (NUTS)")
-```
-```{julia}
plot(p1, p2, p3; layout=(1, 3), size=(900, 250), label="")
```
-
-Here we actually see that indeed both the full ADVI and the MCMC approaches does a much better job of quantifying the uncertainty of predictions for never-before-seen samples, with full ADVI seemingly *underestimating* the variance slightly compared to MCMC.
-
-So now you know how to do perform VI on your Turing.jl model! Great isn't it?
+We can see that the predictions from the full-rank VI approximation are very close to those obtained from the MCMC samples.
+Also, the coverage of full-rank VI and MCMC is much better than that of the crude mean-field approximation.
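+
+To make the claim about coverage a bit more concrete, here is a minimal sketch (not evaluated as part of this tutorial) that reuses the `preds_vi_mf`, `preds_vi_fr`, and `preds_mcmc` matrices and the `test_label` vector from above, and counts how often the true MPG values fall inside a crude mean ± 2 std band; the factor of 2 is an arbitrary choice purely for illustration:
+
+```{julia}
+#| eval: false
+truths = unstandardize(test_label, train_unstandardized.MPG)
+
+# Fraction of test points whose true MPG lies inside the mean ± 2 std band of the predictions.
+function interval_coverage(preds, truths)
+    m = vec(mean(preds; dims=2))
+    s = vec(std(preds; dims=2))
+    return mean((m .- 2 .* s .<= truths) .& (truths .<= m .+ 2 .* s))
+end
+
+(
+    mf=interval_coverage(preds_vi_mf, truths),
+    fr=interval_coverage(preds_vi_fr, truths),
+    mcmc=interval_coverage(preds_mcmc, truths),
+)
+```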
+
+[^KMJ2023]: Khaled, A., Mishchenko, K., & Jin, C. (2023). DoWG unleashed: An efficient universal parameter-free gradient descent method. In *Advances in Neural Information Processing Systems*, 36.
+[^D2020]: Domke, J. (2020). Provable smoothness guarantees for black-box variational inference. In *Proceedings of the International Conference on Machine Learning*. PMLR.
+[^DGG2023]: Domke, J., Gower, R., & Garrigos, G. (2023). Provable convergence guarantees for black-box variational inference. In *Advances in Neural Information Processing Systems*, 36.
+[^IHC2023]: Ivgi, M., Hinder, O., & Carmon, Y. (2023). DoG is SGD’s best friend: A parameter-free dynamic step size schedule. In *Proceedings of the International Conference on Machine Learning*. PMLR.
+[^KB2014]: Kingma, D. P., & Ba, J. (2015). Adam: A method for stochastic optimization. In *Proceedings of the International Conference on Learning Representations*.
+[^KOWMG2023]: Kim, K., Oh, J., Wu, K., Ma, Y., & Gardner, J. (2023). On the convergence of black-box variational inference. In *Advances in Neural Information Processing Systems*, 36.
+[^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. *Journal of Machine Learning Research*, 18(14).
+[^KW2014]: Kingma, D. P., & Welling, M. (2014). Auto-encoding variational Bayes. In *Proceedings of the International Conference on Learning Representations*.
+[^RGB2014]: Ranganath, R., Gerrish, S., & Blei, D. (2014). Black box variational inference. In *Proceedings of the International Conference on Artificial Intelligence and Statistics*. PMLR.
+[^RMW2014]: Rezende, D. J., Mohamed, S., & Wierstra, D. (2014). Stochastic backpropagation and approximate inference in deep generative models. In *Proceedings of the International Conference on Machine Learning*. PMLR.
+[^SZ2013]: Shamir, O., & Zhang, T. (2013). Stochastic gradient descent for non-smooth optimization: Convergence results and optimal averaging schemes. In *Proceedings of the International Conference on Machine Learning*. PMLR.
+[^TL2014]: Titsias, M., & Lázaro-Gredilla, M. (2014). Doubly stochastic variational Bayes for non-conjugate inference. In *Proceedings of the International Conference on Machine Learning*. PMLR.
diff --git a/usage/modifying-logprob/index.qmd b/usage/modifying-logprob/index.qmd
index b44776b51..b3de9e7ee 100755
--- a/usage/modifying-logprob/index.qmd
+++ b/usage/modifying-logprob/index.qmd
@@ -13,7 +13,7 @@ Pkg.instantiate();
```
Turing accumulates log probabilities internally in an internal data structure that is accessible through the internal variable `__varinfo__` inside of the model definition.
-To avoid users having to deal with internal data structures, Turing provides the `Turing.@addlogprob!` macro which increases the accumulated log probability.
+To avoid users having to deal with internal data structures, Turing provides the `@addlogprob!` macro which increases the accumulated log probability.
For instance, this allows you to
[include arbitrary terms in the likelihood](https://github.com/TuringLang/Turing.jl/issues/1332)
@@ -24,7 +24,7 @@ myloglikelihood(x, μ) = loglikelihood(Normal(μ, 1), x)
@model function demo(x)
μ ~ Normal()
- Turing.@addlogprob! myloglikelihood(x, μ)
+ @addlogprob! myloglikelihood(x, μ)
end
```
@@ -37,7 +37,7 @@ using LinearAlgebra
@model function demo(x)
m ~ MvNormal(zero(x), I)
if dot(m, x) < 0
- Turing.@addlogprob! -Inf
+ @addlogprob! -Inf
# Exit the model evaluation early
return nothing
end
@@ -49,11 +49,11 @@ end
Note that `@addlogprob!` always increases the accumulated log probability, regardless of the provided
sampling context.
-For instance, if you do not want to apply `Turing.@addlogprob!` when evaluating the prior of your model but only when computing the log likelihood and the log joint probability, then you should [check the type of the internal variable `__context_`](https://github.com/TuringLang/DynamicPPL.jl/issues/154), as in the following example:
+For instance, if you do not want to apply `@addlogprob!` when evaluating the prior of your model but only when computing the log likelihood and the log joint probability, then you should [check the type of the internal variable `__context_`](https://github.com/TuringLang/DynamicPPL.jl/issues/154), as in the following example:
```{julia}
#| eval: false
if DynamicPPL.leafcontext(__context__) !== Turing.PriorContext()
- Turing.@addlogprob! myloglikelihood(x, μ)
+ @addlogprob! myloglikelihood(x, μ)
end
```
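+
+Putting these pieces together, a minimal sketch of a complete model definition using this check (reusing the `myloglikelihood` helper defined earlier) could look as follows:
+
+```{julia}
+#| eval: false
+@model function demo(x)
+    μ ~ Normal()
+    # Only add the extra likelihood term when we are not merely evaluating the prior.
+    if DynamicPPL.leafcontext(__context__) !== Turing.PriorContext()
+        @addlogprob! myloglikelihood(x, μ)
+    end
+end
+```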