refactor and fix test rng handling

nsiccha · nsiccha · commit f1d1c80175ec · 2025-11-26T10:18:26.000+01:00
diff --git a/src/AdvancedHMC.jl b/src/AdvancedHMC.jl
@@ -72,7 +72,7 @@ export find_good_eps
 include("adaptation/Adaptation.jl")
 using .Adaptation
 import .Adaptation:
-    StepSizeAdaptor, MassMatrixAdaptor, StanHMCAdaptor, NesterovDualAveraging, NoAdaptation
+    StepSizeAdaptor, MassMatrixAdaptor, StanHMCAdaptor, NesterovDualAveraging, NoAdaptation, PositionOrPhasePoint
 
 # Helpers for initializing adaptors via AHMC structs
 
@@ -114,6 +114,7 @@ export StepSizeAdaptor,
     MassMatrixAdaptor,
     UnitMassMatrix,
     WelfordVar,
+    NutpieVar,
     WelfordCov,
     NaiveHMCAdaptor,
     StanHMCAdaptor,
diff --git a/src/adaptation/Adaptation.jl b/src/adaptation/Adaptation.jl
@@ -10,7 +10,7 @@ using DocStringExtensions
 """
 $(TYPEDEF)
 
-Abstract type for HMC adaptors. 
+Abstract type for HMC adaptors.
 """
 abstract type AbstractAdaptor end
 function getM⁻¹ end
@@ -21,12 +21,17 @@ function initialize! end
 function finalize! end
 export AbstractAdaptor, adapt!, initialize!, finalize!, reset!, getϵ, getM⁻¹
 
+get_position(x::PhasePoint) = x.θ
+get_position(x::AbstractVecOrMat{<:AbstractFloat}) = x
+const PositionOrPhasePoint = Union{AbstractVecOrMat{<:AbstractFloat}, PhasePoint}
+
 struct NoAdaptation <: AbstractAdaptor end
 export NoAdaptation
 include("stepsize.jl")
 export StepSizeAdaptor, NesterovDualAveraging
+
 include("massmatrix.jl")
-export MassMatrixAdaptor, UnitMassMatrix, WelfordVar, WelfordCov
+export MassMatrixAdaptor, UnitMassMatrix, WelfordVar, NutpieVar, WelfordCov
 
 ##
 ## Composite adaptors
@@ -47,23 +52,14 @@ getϵ(ca::NaiveHMCAdaptor) = getϵ(ca.ssa)
 # TODO: implement consensus adaptor
 function adapt!(
     nca::NaiveHMCAdaptor,
-    θ::AbstractVecOrMat{<:AbstractFloat},
+    z_or_theta::PositionOrPhasePoint,
     α::AbstractScalarOrVec{<:AbstractFloat},
 )
-    adapt!(nca.ssa, θ, α)
-    adapt!(nca.pc, θ, α)
-    return nothing
-end
-adapt!(
-    nca::NaiveHMCAdaptor,
-    z::PhasePoint,
-    α::AbstractScalarOrVec{<:AbstractFloat},
-) = adapt!(nca, z.θ, α)
-function reset!(aca::NaiveHMCAdaptor)
-    reset!(aca.ssa)
-    reset!(aca.pc)
+    adapt!(nca.ssa, z_or_theta, α)
+    adapt!(nca.pc, z_or_theta, α)
     return nothing
 end
+
 initialize!(adaptor::NaiveHMCAdaptor, n_adapts::Int) = nothing
 finalize!(aca::NaiveHMCAdaptor) = finalize!(aca.ssa)
 
diff --git a/src/adaptation/massmatrix.jl b/src/adaptation/massmatrix.jl
@@ -9,29 +9,17 @@ finalize!(::MassMatrixAdaptor) = nothing
 
 function adapt!(
     adaptor::MassMatrixAdaptor,
-    θ::AbstractVecOrMat{<:AbstractFloat},
-    α::AbstractScalarOrVec{<:AbstractFloat},
-    is_update::Bool=true,
-)
-    resize_adaptor!(adaptor, size(θ))
-    push!(adaptor, θ)
-    is_update && update!(adaptor)
-    return nothing
-end
-
-function adapt!(
-    adaptor::MassMatrixAdaptor,
-    z::PhasePoint,
-    α::AbstractScalarOrVec{<:AbstractFloat},
+    z_or_theta::PositionOrPhasePoint,
+    ::AbstractScalarOrVec{<:AbstractFloat},
     is_update::Bool=true,
 )
-    resize_adaptor!(adaptor, size(z.θ))
-    push!(adaptor, z)
+    resize_adaptor!(adaptor, size(get_position(z_or_theta)))
+    push!(adaptor, z_or_theta)
     is_update && update!(adaptor)
     return nothing
 end
 
-Base.push!(a::MassMatrixAdaptor, z::PhasePoint) = push!(a, z.θ)
+Base.push!(a::MassMatrixAdaptor, z_or_theta::PositionOrPhasePoint) = push!(a, get_position(z_or_theta))
 
 ## Unit mass matrix adaptor
 
@@ -53,24 +41,14 @@ getM⁻¹(::UnitMassMatrix{T}) where {T} = LinearAlgebra.UniformScaling{T}(one(T
 
 function adapt!(
     ::UnitMassMatrix,
-    ::AbstractVecOrMat{<:AbstractFloat},
-    ::AbstractScalarOrVec{<:AbstractFloat},
-    is_update::Bool=true,
-)
-    return nothing
-end
-
-function adapt!(
-    ::UnitMassMatrix,
-    ::PhasePoint,
+    ::PositionOrPhasePoint,
     ::AbstractScalarOrVec{<:AbstractFloat},
     is_update::Bool=true,
 )
     return nothing
 end
 
 ## Diagonal mass matrix adaptor
-
 abstract type DiagMatrixEstimator{T} <: MassMatrixAdaptor end
 
 getM⁻¹(ve::DiagMatrixEstimator) = ve.var
@@ -93,7 +71,7 @@ NaiveVar{T}(sz::Tuple{Int,Int}) where {T<:AbstractFloat} = NaiveVar(Vector{Matri
 
 NaiveVar(sz::Union{Tuple{Int},Tuple{Int,Int}}) = NaiveVar{Float64}(sz)
 
-Base.push!(nv::NaiveVar, s::AbstractVecOrMat) = push!(nv.S, s)
+Base.push!(nv::NaiveVar, s::AbstractVecOrMat{<:AbstractFloat}) = push!(nv.S, s)
 
 reset!(nv::NaiveVar) = resize!(nv.S, 0)
 
@@ -158,7 +136,7 @@ function reset!(wv::WelfordVar{T}) where {T<:AbstractFloat}
     return nothing
 end
 
-function Base.push!(wv::WelfordVar, s::AbstractVecOrMat{T}) where {T}
+function Base.push!(wv::WelfordVar, s::AbstractVecOrMat{T}) where {T<:AbstractFloat}
     wv.n += 1
     (; δ, μ, M, n) = wv
     n = T(n)
@@ -176,8 +154,13 @@ function get_estimation(wv::WelfordVar{T}) where {T<:AbstractFloat}
     return n / ((n + 5) * (n - 1)) * M .+ ϵ * (5 / (n + 5))
 end
 
-## Nutpie-style diagonal mass matrix estimator (using positions and gradients) - not exported yet due to https://github.com/TuringLang/AdvancedHMC.jl/issues/475
+"""
+Nutpie-style diagonal mass matrix estimator (using positions and gradients). See also https://github.com/TuringLang/AdvancedHMC.jl/issues/475
 
+Expected to converge faster and to a better mass matrix than `WelfordVar`.
+
+Can be initialized via `NutpieVar(sz)`, where `sz` is either a `Tuple{Int}` or a `Tuple{Int,Int}`.
+"""
 mutable struct NutpieVar{T<:AbstractFloat,E<:AbstractVecOrMat{T},V<:AbstractVecOrMat{T}} <: DiagMatrixEstimator{T}
     position_estimator::WelfordVar{T,E,V}
     gradient_estimator::WelfordVar{T,E,V}
@@ -232,6 +215,8 @@ function reset!(nv::NutpieVar)
     reset!(nv.gradient_estimator)
 end
 
+Base.push!(::NutpieVar, x::AbstractVecOrMat{<:AbstractFloat}) = error("`NutpieVar` adaptation requires position and gradient information!")
+
 function Base.push!(nv::NutpieVar, z::PhasePoint)
     nv.n += 1
     push!(nv.position_estimator, z.θ)
@@ -266,7 +251,7 @@ end
 
 NaiveCov{T}(sz::Tuple{Int}) where {T<:AbstractFloat} = NaiveCov(Vector{Vector{T}}())
 
-Base.push!(nc::NaiveCov, s::AbstractVector) = push!(nc.S, s)
+Base.push!(nc::NaiveCov, s::AbstractVector{<:AbstractFloat}) = push!(nc.S, s)
 
 reset!(nc::NaiveCov{T}) where {T} = resize!(nc.S, 0)
 
@@ -316,7 +301,7 @@ function reset!(wc::WelfordCov{T}) where {T<:AbstractFloat}
     return nothing
 end
 
-function Base.push!(wc::WelfordCov, s::AbstractVector{T}) where {T}
+function Base.push!(wc::WelfordCov, s::AbstractVector{T}) where {T<:AbstractFloat}
     wc.n += 1
     (; δ, μ, n, M) = wc
     n = T(n)
diff --git a/src/adaptation/stan_adaptor.jl b/src/adaptation/stan_adaptor.jl
@@ -136,45 +136,20 @@ is_window_end(a::StanHMCAdaptor) = a.state.i in a.state.window_splits
 
 function adapt!(
     tp::StanHMCAdaptor,
-    θ::AbstractVecOrMat{<:AbstractFloat},
+    z_or_theta::PositionOrPhasePoint,
     α::AbstractScalarOrVec{<:AbstractFloat},
 )
     tp.state.i += 1
 
-    adapt!(tp.ssa, θ, α)
+    adapt!(tp.ssa, z_or_theta, α)
 
-    resize_adaptor!(tp.pc, size(θ)) # Resize pre-conditioner if necessary.
+    resize_adaptor!(tp.pc, size(get_position(z_or_theta))) # Resize pre-conditioner if necessary.
 
     # Ref: https://github.com/stan-dev/stan/blob/develop/src/stan/mcmc/hmc/nuts/adapt_diag_e_nuts.hpp
     if is_in_window(tp)
         # We accumlate stats from θ online and only trigger the update of M⁻¹ in the end of window.
         is_update_M⁻¹ = is_window_end(tp)
-        adapt!(tp.pc, θ, α, is_update_M⁻¹)
-    end
-
-    if is_window_end(tp)
-        reset!(tp.ssa)
-        reset!(tp.pc)
-    end
-end
-
-
-function adapt!(
-    tp::StanHMCAdaptor,
-    z::PhasePoint,
-    α::AbstractScalarOrVec{<:AbstractFloat},
-)
-    tp.state.i += 1
-
-    adapt!(tp.ssa, z.θ, α)
-
-    resize_adaptor!(tp.pc, size(z.θ)) # Resize pre-conditioner if necessary.
-
-    # Ref: https://github.com/stan-dev/stan/blob/develop/src/stan/mcmc/hmc/nuts/adapt_diag_e_nuts.hpp
-    if is_in_window(tp)
-        # We accumlate stats from θ online and only trigger the update of M⁻¹ in the end of window.
-        is_update_M⁻¹ = is_window_end(tp)
-        adapt!(tp.pc, z, α, is_update_M⁻¹)
+        adapt!(tp.pc, z_or_theta, α, is_update_M⁻¹)
     end
 
     if is_window_end(tp)
diff --git a/src/adaptation/stepsize.jl b/src/adaptation/stepsize.jl
@@ -174,7 +174,7 @@ end
 # Ref: https://github.com/stan-dev/stan/blob/develop/src/stan/mcmc/stepsize_adaptation.hpp
 # Note: This function is not merged with `adapt!` to empahsize the fact that
 #       step size adaptation is not dependent on `θ`.
-# Note 2: `da.state` and `α` support vectorised HMC but should do so together. 
+# Note 2: `da.state` and `α` support vectorised HMC but should do so together.
 function adapt_stepsize!(
     da::NesterovDualAveraging{T}, α::AbstractScalarOrVec{T}
 ) where {T<:AbstractFloat}
@@ -211,7 +211,7 @@ end
 
 function adapt!(
     da::NesterovDualAveraging,
-    θ::AbstractVecOrMat{<:AbstractFloat},
+    ::PositionOrPhasePoint,
     α::AbstractScalarOrVec{<:AbstractFloat},
 )
     adapt_stepsize!(da, α)
diff --git a/src/sampler.jl b/src/sampler.jl
@@ -60,11 +60,11 @@ end
 function Adaptation.adapt!(
     h::Hamiltonian,
     κ::AbstractMCMCKernel,
-    adaptor::Adaptation.NoAdaptation,
-    i::Int,
-    n_adapts::Int,
-    θ::AbstractVecOrMat{<:AbstractFloat},
-    α::AbstractScalarOrVec{<:AbstractFloat},
+    ::Adaptation.NoAdaptation,
+    ::Int,
+    ::Int,
+    ::PositionOrPhasePoint,
+    ::AbstractScalarOrVec{<:AbstractFloat},
 )
     return h, κ, false
 end
@@ -75,40 +75,18 @@ function Adaptation.adapt!(
     adaptor::AbstractAdaptor,
     i::Int,
     n_adapts::Int,
-    θ::AbstractVecOrMat{<:AbstractFloat},
-    α::AbstractScalarOrVec{<:AbstractFloat},
-)
-    isadapted = false
-    if i <= n_adapts
-        i == 1 && Adaptation.initialize!(adaptor, n_adapts)
-        adapt!(adaptor, θ, α)
-        i == n_adapts && finalize!(adaptor)
-        h = update(h, adaptor)
-        κ = update(κ, adaptor)
-        isadapted = true
-    end
-    return h, κ, isadapted
-end
-
-function Adaptation.adapt!(
-    h::Hamiltonian,
-    κ::AbstractMCMCKernel,
-    adaptor::AbstractAdaptor,
-    i::Int,
-    n_adapts::Int,
-    z::PhasePoint,
+    z_or_theta::PositionOrPhasePoint,
     α::AbstractScalarOrVec{<:AbstractFloat},
 )
-    isadapted = false
-    if i <= n_adapts
+    adapt = i <= n_adapts
+    if adapt
         i == 1 && Adaptation.initialize!(adaptor, n_adapts)
-        adapt!(adaptor, z, α)
+        adapt!(adaptor, z_or_theta, α)
         i == n_adapts && finalize!(adaptor)
         h = update(h, adaptor)
         κ = update(κ, adaptor)
-        isadapted = true
     end
-    return h, κ, isadapted
+    return h, κ, adapt
 end
 
 """
@@ -169,7 +147,7 @@ end
         progress::Bool=false
     )
 Sample `n_samples` samples using the proposal `κ` under Hamiltonian `h`.
-- The randomness is controlled by `rng`. 
+- The randomness is controlled by `rng`.
     - If `rng` is not provided, the default random number generator (`Random.default_rng()`) will be used.
 - The initial point is given by `θ`.
 - The adaptor is set by `adaptor`, for which the default is no adaptation.
diff --git a/test/adaptation.jl b/test/adaptation.jl
@@ -34,15 +34,23 @@ function runnuts_nutpie(ℓπ, metric::DiagEuclideanMetric; n_samples=10_000)
     κ = AdvancedHMC.make_kernel(nuts, integrator)
     # Constructing like this until we've settled on a different interface
     adaptor = AdvancedHMC.StanHMCAdaptor(
-        AdvancedHMC.Adaptation.NutpieVar(size(metric); var=copy(metric.M⁻¹)), 
+        AdvancedHMC.Adaptation.NutpieVar(size(metric); var=copy(metric.M⁻¹)),
         AdvancedHMC.StepSizeAdaptor(nuts.δ, integrator)
     )
     samples, stats = sample(h, κ, θ_init, n_samples, adaptor, n_adapts; verbose=false)
     return (samples=samples, stats=stats, adaptor=adaptor)
 end
+"""
+Computes the condition number of a covariance matrix `cov::AbstractMatrix` after preconditioning with the (diagonal) mass matrix estimated in `a::DiagMatrixEstimator`.
+
+This is a simple but serviceable proxy for eventual sampling efficiency; see https://arxiv.org/abs/1905.09813 for a more involved estimate.
+
+(A lower number generally means that the estimated mass matrix is better).
+"""
 preconditioned_cond(a::DiagMatrixEstimator, cov::AbstractMatrix) = cond(sqrt(Diagonal(a.var)) \ cov / sqrt(Diagonal(a.var)))
 
 @testset "Adaptation" begin
+    Random.seed!(1)
     # Check that the estimated variance is approximately correct.
     @testset "Online v.s. naive v.s. true var/cov estimation" begin
         D = 10
@@ -159,9 +167,8 @@ preconditioned_cond(a::DiagMatrixEstimator, cov::AbstractMatrix) = cond(sqrt(Dia
     @testset "Adapted mass v.s. true variance" begin
         D = 10
         n_tests = 5
-        @testset "DiagEuclideanMetric" begin
+        @testset "'Diagonal' MvNormal target" begin
             for _ in 1:n_tests
-                Random.seed!(1)
 
                 # Random variance
                 σ² = 1 .+ abs.(randn(D))
@@ -183,7 +190,7 @@ preconditioned_cond(a::DiagMatrixEstimator, cov::AbstractMatrix) = cond(sqrt(Dia
             end
         end
 
-        @testset "DenseEuclideanMetric" begin
+        @testset "'Dense' MvNormal target" begin
             n_nutpie_superior = 0
             for _ in 1:n_tests
                 # Random covariance
@@ -197,16 +204,16 @@ preconditioned_cond(a::DiagMatrixEstimator, cov::AbstractMatrix) = cond(sqrt(Dia
                 @test res.adaptor.pc.var ≈ diag(Σ) rtol = 0.2
 
                 # For this target, Nutpie will NOT converge towards the true variances, even after infinite draws.
-                # HOWEVER, it will asymptotically (but also generally more quickly than Stan) 
+                # HOWEVER, it will asymptotically (but also generally more quickly than Stan)
                 # find the best preconditioner for the target.
-                # As these are statistical algorithms, superiority is not always guaranteed, hence this way of testing.  
+                # As these are statistical algorithms, superiority is not always guaranteed, hence this way of testing.
                 res_nutpie = runnuts_nutpie(ℓπ, DiagEuclideanMetric(D))
                 n_nutpie_superior += preconditioned_cond(res_nutpie.adaptor.pc, Σ) < preconditioned_cond(res.adaptor.pc, Σ)
 
                 res = runnuts(ℓπ, DenseEuclideanMetric(D))
                 @test res.adaptor.pc.cov ≈ Σ rtol = 0.25
             end
-            @test n_nutpie_superior > n_tests / 2
+            @test n_nutpie_superior > 1 + n_tests / 2
         end
     end