From f82685369638b42eb8d1b51fb1edcbdf5135606c Mon Sep 17 00:00:00 2001 From: Patrick Kofod Mogensen Date: Wed, 26 Mar 2025 07:30:51 +0100 Subject: [PATCH 1/5] Add check_flatness keyword to HZ and format --- src/hagerzhang.jl | 306 +++++++++++++++++++++++++++++----------------- test/issues.jl | 159 ++++++++++++++++++++++-- 2 files changed, 344 insertions(+), 121 deletions(-) diff --git a/src/hagerzhang.jl b/src/hagerzhang.jl index 9d1be56..e8688fa 100644 --- a/src/hagerzhang.jl +++ b/src/hagerzhang.jl @@ -44,19 +44,19 @@ # Display flags are represented as a bitfield # (not exported, but can use via LineSearches.ITER, for example) const one64 = convert(UInt64, 1) -const FINAL = one64 -const ITER = one64 << 1 -const PARAMETERS = one64 << 2 -const GRADIENT = one64 << 3 -const SEARCHDIR = one64 << 4 -const ALPHA = one64 << 5 -const BETA = one64 << 6 +const FINAL = one64 +const ITER = one64 << 1 +const PARAMETERS = one64 << 2 +const GRADIENT = one64 << 3 +const SEARCHDIR = one64 << 4 +const ALPHA = one64 << 5 +const BETA = one64 << 6 # const ALPHAGUESS = one64 << 7 TODO: not needed -const BRACKET = one64 << 8 -const LINESEARCH = one64 << 9 -const UPDATE = one64 << 10 -const SECANT2 = one64 << 11 -const BISECT = one64 << 12 +const BRACKET = one64 << 8 +const LINESEARCH = one64 << 9 +const UPDATE = one64 << 10 +const SECANT2 = one64 << 11 +const BISECT = one64 << 12 const BARRIERCOEF = one64 << 13 display_nextbit = 14 @@ -80,24 +80,32 @@ Conjugate gradient line search implementation from: conjugate gradient method with guaranteed descent. ACM Transactions on Mathematical Software 32: 113–137. """ -@with_kw struct HagerZhang{T, Tm} <: AbstractLineSearch - delta::T = DEFAULTDELTA # c_1 Wolfe sufficient decrease condition - sigma::T = DEFAULTSIGMA # c_2 Wolfe curvature condition (Recommend 0.1 for GradientDescent) - alphamax::T = Inf - rho::T = 5.0 - epsilon::T = 1e-6 - gamma::T = 0.66 - linesearchmax::Int = 50 - psi3::T = 0.1 - display::Int = 0 - mayterminate::Tm = Ref{Bool}(false) - cache::Union{Nothing,LineSearchCache{T}} = nothing +@with_kw struct HagerZhang{T,Tm} <: AbstractLineSearch + delta::T = DEFAULTDELTA # c_1 Wolfe sufficient decrease condition + sigma::T = DEFAULTSIGMA # c_2 Wolfe curvature condition (Recommend 0.1 for GradientDescent) + alphamax::T = Inf + rho::T = 5.0 + epsilon::T = 1e-6 + gamma::T = 0.66 + linesearchmax::Int = 50 + psi3::T = 0.1 + display::Int = 0 + mayterminate::Tm = Ref{Bool}(false) + cache::Union{Nothing,LineSearchCache{T}} = nothing + check_flatness::Bool = false end -HagerZhang{T}(args...; kwargs...) where T = HagerZhang{T, Base.RefValue{Bool}}(args...; kwargs...) - -function (ls::HagerZhang)(df::AbstractObjective, x::AbstractArray{T}, - s::AbstractArray{T}, α::Real, - x_new::AbstractArray{T}, phi_0::Real, dphi_0::Real) where T +HagerZhang{T}(args...; kwargs...) where {T} = + HagerZhang{T,Base.RefValue{Bool}}(args...; kwargs...) + +function (ls::HagerZhang)( + df::AbstractObjective, + x::AbstractArray{T}, + s::AbstractArray{T}, + α::Real, + x_new::AbstractArray{T}, + phi_0::Real, + dphi_0::Real, +) where {T} ϕ, ϕdϕ = make_ϕ_ϕdϕ(df, x_new, x, s) ls(ϕ, ϕdϕ, α::Real, phi_0, dphi_0) end @@ -105,18 +113,26 @@ end (ls::HagerZhang)(ϕ, dϕ, ϕdϕ, c, phi_0, dphi_0) = ls(ϕ, ϕdϕ, c, phi_0, dphi_0) # TODO: Should we deprecate the interface that only uses the ϕ and ϕd\phi arguments? -function (ls::HagerZhang)(ϕ, ϕdϕ, - c::T, - phi_0::Real, - dphi_0::Real) where T # Should c and phi_0 be same type? - @unpack delta, sigma, alphamax, rho, epsilon, gamma, - linesearchmax, psi3, display, mayterminate, cache = ls +function (ls::HagerZhang)(ϕ, ϕdϕ, c::T, phi_0::Real, dphi_0::Real) where {T} # Should c and phi_0 be same type? + @unpack delta, + sigma, + alphamax, + rho, + epsilon, + gamma, + linesearchmax, + psi3, + display, + mayterminate, + cache = ls emptycache!(cache) zeroT = convert(T, 0) pushcache!(cache, zeroT, phi_0, dphi_0) if !(isfinite(phi_0) && isfinite(dphi_0)) - throw(LineSearchException("Value and slope at step length = 0 must be finite.", T(0))) + throw( + LineSearchException("Value and slope at step length = 0 must be finite.", T(0)), + ) end if dphi_0 >= eps(T) * abs(phi_0) throw(LineSearchException("Search direction is not a direction of descent.", T(0))) @@ -163,7 +179,7 @@ function (ls::HagerZhang)(ϕ, ϕdϕ, # If c was generated by quadratic interpolation, check whether it # satisfies the Wolfe conditions if mayterminate[] && - satisfies_wolfe(c, phi_c, dphi_c, phi_0, dphi_0, phi_lim, delta, sigma) + satisfies_wolfe(c, phi_c, dphi_c, phi_0, dphi_0, phi_lim, delta, sigma) if display & LINESEARCH > 0 println("Wolfe condition satisfied on point alpha = ", c) end @@ -179,17 +195,24 @@ function (ls::HagerZhang)(ϕ, ϕdϕ, cold = -one(T) while !isbracketed && iter < linesearchmax if display & BRACKET > 0 - println("bracketing: ia = ", ia, - ", ib = ", ib, - ", c = ", c, - ", phi_c = ", phi_c, - ", dphi_c = ", dphi_c) + println( + "bracketing: ia = ", + ia, + ", ib = ", + ib, + ", c = ", + c, + ", phi_c = ", + phi_c, + ", dphi_c = ", + dphi_c, + ) end if dphi_c >= zeroT # We've reached the upward slope, so we have b; examine # previous values to find a ib = length(alphas) - for i = (ib - 1):-1:1 + for i = (ib-1):-1:1 if values[i] <= phi_lim ia = i break @@ -201,7 +224,7 @@ function (ls::HagerZhang)(ϕ, ϕdϕ, # have crested over the peak. Use bisection. ib = length(alphas) ia = 1 - if c ≉ alphas[ib] || slopes[ib] >= zeroT + if c ≉ alphas[ib] || slopes[ib] >= zeroT error("c = ", c) end # ia, ib = bisect(phi, lsr, ia, ib, phi_lim) # TODO: Pass options @@ -226,13 +249,19 @@ function (ls::HagerZhang)(ϕ, ϕdϕ, if c > alphamax c = alphamax if display & BRACKET > 0 - println("bracket: exceeding alphamax, using c = alphamax = ", alphamax, - ", cold = ", cold) + println( + "bracket: exceeding alphamax, using c = alphamax = ", + alphamax, + ", cold = ", + cold, + ) end end phi_c, dphi_c = ϕdϕ(c) iterfinite = 1 - while !(isfinite(phi_c) && isfinite(dphi_c)) && c > nextfloat(cold) && iterfinite < iterfinitemax + while !(isfinite(phi_c) && isfinite(dphi_c)) && + c > nextfloat(cold) && + iterfinite < iterfinitemax alphamax = c # shrinks alphamax, assumes that steps >= c can never have finite phi_c and dphi_c iterfinite += 1 if display & BRACKET > 0 @@ -243,11 +272,19 @@ function (ls::HagerZhang)(ϕ, ϕdϕ, end if !(isfinite(phi_c) && isfinite(dphi_c)) if display & BRACKET > 0 - println("Warning: failed to expand interval to bracket with finite values. If this happens frequently, check your function and gradient.") - println("c = ", c, - ", alphamax = ", alphamax, - ", phi_c = ", phi_c, - ", dphi_c = ", dphi_c) + println( + "Warning: failed to expand interval to bracket with finite values. If this happens frequently, check your function and gradient.", + ) + println( + "c = ", + c, + ", alphamax = ", + alphamax, + ", phi_c = ", + phi_c, + ", dphi_c = ", + dphi_c, + ) end return cold, phi_cold end @@ -262,18 +299,27 @@ function (ls::HagerZhang)(ϕ, ϕdϕ, b = alphas[ib] @assert b > a if display & LINESEARCH > 0 - println("linesearch: ia = ", ia, - ", ib = ", ib, - ", a = ", a, - ", b = ", b, - ", phi(a) = ", values[ia], - ", phi(b) = ", values[ib]) + println( + "linesearch: ia = ", + ia, + ", ib = ", + ib, + ", a = ", + a, + ", b = ", + b, + ", phi(a) = ", + values[ia], + ", phi(b) = ", + values[ib], + ) end if b - a <= eps(b) mayterminate[] = false # reset in case another initial guess is used next return a, values[ia] # lsr.value[ia] end - iswolfe, iA, iB = secant2!(ϕdϕ, alphas, values, slopes, ia, ib, phi_lim, delta, sigma, display) + iswolfe, iA, iB = + secant2!(ϕdϕ, alphas, values, slopes, ia, ib, phi_lim, delta, sigma, display) if iswolfe mayterminate[] = false # reset in case another initial guess is used next return alphas[iA], values[iA] # lsr.value[iA] @@ -285,12 +331,15 @@ function (ls::HagerZhang)(ϕ, ϕdϕ, if display & LINESEARCH > 0 println("Linesearch: secant succeeded") end - if nextfloat(values[ia]) >= values[ib] && nextfloat(values[iA]) >= values[iB] + if ls.check_flatness && + nextfloat(values[ia]) >= values[ib] && + nextfloat(values[iA]) >= values[iB] # It's so flat, secant didn't do anything useful, time to quit if display & LINESEARCH > 0 println("Linesearch: secant suggests it's flat") end mayterminate[] = false # reset in case another initial guess is used next + return A, values[iA] end ia = iA @@ -308,30 +357,44 @@ function (ls::HagerZhang)(ϕ, ϕdϕ, push!(values, phi_c) push!(slopes, dphi_c) - ia, ib = update!(ϕdϕ, alphas, values, slopes, iA, iB, length(alphas), phi_lim, display) + ia, ib = update!( + ϕdϕ, + alphas, + values, + slopes, + iA, + iB, + length(alphas), + phi_lim, + display, + ) end iter += 1 end - throw(LineSearchException("Linesearch failed to converge, reached maximum iterations $(linesearchmax).", - alphas[ia])) + throw( + LineSearchException( + "Linesearch failed to converge, reached maximum iterations $(linesearchmax).", + alphas[ia], + ), + ) end # Check Wolfe & approximate Wolfe -function satisfies_wolfe(c::T, - phi_c::Real, - dphi_c::Real, - phi_0::Real, - dphi_0::Real, - phi_lim::Real, - delta::Real, - sigma::Real) where T<:Number - wolfe1 = delta * dphi_0 >= (phi_c - phi_0) / c && - dphi_c >= sigma * dphi_0 - wolfe2 = (2 * delta - 1) * dphi_0 >= dphi_c >= sigma * dphi_0 && - phi_c <= phi_lim +function satisfies_wolfe( + c::T, + phi_c::Real, + dphi_c::Real, + phi_0::Real, + dphi_0::Real, + phi_lim::Real, + delta::Real, + sigma::Real, +) where {T<:Number} + wolfe1 = delta * dphi_0 >= (phi_c - phi_0) / c && dphi_c >= sigma * dphi_0 + wolfe2 = (2 * delta - 1) * dphi_0 >= dphi_c >= sigma * dphi_0 && phi_c <= phi_lim return wolfe1 || wolfe2 end @@ -343,16 +406,18 @@ function secant(alphas, values, slopes, ia::Integer, ib::Integer) return secant(alphas[ia], alphas[ib], slopes[ia], slopes[ib]) end # phi -function secant2!(ϕdϕ, - alphas, - values, - slopes, - ia::Integer, - ib::Integer, - phi_lim::Real, - delta::Real = DEFAULTDELTA, - sigma::Real = DEFAULTSIGMA, - display::Integer = 0) +function secant2!( + ϕdϕ, + alphas, + values, + slopes, + ia::Integer, + ib::Integer, + phi_lim::Real, + delta::Real = DEFAULTDELTA, + sigma::Real = DEFAULTSIGMA, + display::Integer = 0, +) phi_0 = values[1] dphi_0 = slopes[1] a = alphas[ia] @@ -362,9 +427,13 @@ function secant2!(ϕdϕ, T = eltype(slopes) zeroT = convert(T, 0) if !(dphi_a < zeroT && dphi_b >= zeroT) - error(string("Search direction is not a direction of descent; ", - "this error may indicate that user-provided derivatives are inaccurate. ", - @sprintf "(dphi_a = %f; dphi_b = %f)" dphi_a dphi_b)) + error( + string( + "Search direction is not a direction of descent; ", + "this error may indicate that user-provided derivatives are inaccurate. ", + @sprintf "(dphi_a = %f; dphi_b = %f)" dphi_a dphi_b + ), + ) end c = secant(a, b, dphi_a, dphi_b) if display & SECANT2 > 0 @@ -433,15 +502,17 @@ end # Given a third point, pick the best two that retain the bracket # around the minimum (as defined by HZ, eq. 29) # b will be the upper bound, and a the lower bound -function update!(ϕdϕ, - alphas, - values, - slopes, - ia::Integer, - ib::Integer, - ic::Integer, - phi_lim::Real, - display::Integer = 0) +function update!( + ϕdϕ, + alphas, + values, + slopes, + ia::Integer, + ib::Integer, + ic::Integer, + phi_lim::Real, + display::Integer = 0, +) a = alphas[ia] b = alphas[ib] T = eltype(slopes) @@ -455,13 +526,22 @@ function update!(ϕdϕ, phi_c = values[ic] dphi_c = slopes[ic] if display & UPDATE > 0 - println("update: ia = ", ia, - ", a = ", a, - ", ib = ", ib, - ", b = ", b, - ", c = ", c, - ", phi_c = ", phi_c, - ", dphi_c = ", dphi_c) + println( + "update: ia = ", + ia, + ", a = ", + a, + ", ib = ", + ib, + ", b = ", + b, + ", c = ", + c, + ", phi_c = ", + phi_c, + ", dphi_c = ", + dphi_c, + ) end if c < a || c > b return ia, ib #, 0, 0 # it's out of the bracketing interval @@ -483,14 +563,16 @@ function update!(ϕdϕ, end # HZ, stage U3 (with theta=0.5) -function bisect!(ϕdϕ, - alphas::AbstractArray{T}, - values, - slopes, - ia::Integer, - ib::Integer, - phi_lim::Real, - display::Integer = 0) where T +function bisect!( + ϕdϕ, + alphas::AbstractArray{T}, + values, + slopes, + ia::Integer, + ib::Integer, + phi_lim::Real, + display::Integer = 0, +) where {T} gphi = convert(T, NaN) a = alphas[ia] b = alphas[ib] diff --git a/test/issues.jl b/test/issues.jl index d0d01f4..7d717fa 100644 --- a/test/issues.jl +++ b/test/issues.jl @@ -3,32 +3,32 @@ using LineSearches, LinearAlgebra, Test A = randn(100, 100) x0 = randn(100) -b = A*x0 +b = A * x0 # Objective function and gradient -f(x) = .5*norm(A*x - b)^2 -g!(gvec, x) = (gvec .= A'*(A*x-b)) +f(x) = 0.5 * norm(A * x - b)^2 +g!(gvec, x) = (gvec .= A' * (A * x - b)) fg!(gvec, x) = (g!(gvec, x); return f(x)) # Init -x = 1f1*randn(100) +x = 1.0f1 * randn(100) gv = similar(x) # Line search -α0 = 1f-3 +α0 = 1.0f-3 ϕ0 = fg!(gv, x) -s = -1*gv +s = -1 * gv dϕ0 = dot(gv, s) println(ϕ0, ", ", dϕ0) # Univariate line search functions -ϕ(α) = f(x .+ α.*s) +ϕ(α) = f(x .+ α .* s) function dϕ(α) - g!(gv, x .+ α.*s) + g!(gv, x .+ α .* s) return dot(gv, s) end function ϕdϕ(α) - phi = fg!(gv, x .+ α.*s) + phi = fg!(gv, x .+ α .* s) dphi = dot(gv, s) return (phi, dphi) end @@ -36,3 +36,144 @@ end res = (StrongWolfe())(ϕ, dϕ, ϕdϕ, α0, ϕ0, dϕ0) @test res[2] > 0 @test res[2] == ϕ(res[1]) + +struct LineSearchTestCase + alphas::Vector{Float64} + values::Vector{Float64} + slopes::Vector{Float64} +end + +@testset "HZ convergence issues" begin + @testset "Flatness check issues" begin + function prepare_test_case(; alphas, values, slopes) + perm = sortperm(alphas) + alphas = alphas[perm] + push!(alphas, alphas[end] + 1) + values = values[perm] + push!(values, values[end]) + slopes = slopes[perm] + push!(slopes, 0.0) + return LineSearchTestCase(alphas, values, slopes) + end + + tc1 = prepare_test_case(; + alphas = [0.0, 1.0, 5.0, 3.541670844449739], + values = [ + 3003.592409634743, + 2962.0378569864743, + 2891.4462095232184, + 3000.9760725116876, + ], + slopes = [ + -22332.321416890798, + -20423.214551925797, + 11718.185026267562, + -22286.821227217057, + ], + ) + + function tc_to_f(tc) + function f(x) + i = findfirst(u -> u > x, tc.alphas) - 1 + xk = tc.alphas[i] + xkp1 = tc.alphas[i+1] + dx = xkp1 - xk + t = (x - xk) / dx + h00t = 2t^3 - 3t^2 + 1 + h10t = t * (1 - t)^2 + h01t = t^2 * (3 - 2t) + h11t = t^2 * (t - 1) + val = + h00t * tc.values[i] + + h10t * dx * tc.slopes[i] + + h01t * tc.values[i+1] + + h11t * dx * tc.slopes[i+1] + + return val + end + end + function tc_to_fdf(tc) + function fdf(x) + i = findfirst(u -> u > x, tc.alphas) - 1 + xk = tc.alphas[i] + xkp1 = tc.alphas[i+1] + dx = xkp1 - xk + t = (x - xk) / dx + h00t = 2t^3 - 3t^2 + 1 + h10t = t * (1 - t)^2 + h01t = t^2 * (3 - 2t) + h11t = t^2 * (t - 1) + val = + h00t * tc.values[i] + + h10t * dx * tc.slopes[i] + + h01t * tc.values[i+1] + + h11t * dx * tc.slopes[i+1] + + h00tp = 6t^2 - 6t + h10tp = 3t^2 - 4t + 1 + h01tp = -6t^2 + 6 * t + h11tp = 3t^2 - 2t + slope = + ( + h00tp * tc.values[i] + + h10tp * dx * tc.slopes[i] + + h01tp * tc.values[i+1] + + h11tp * dx * tc.slopes[i+1] + ) / dx + println(x, " ", val, " ", slope) + return val, slope + end + end + + function test_tc(tc, check_flatness) + cache = LineSearchCache{Float64}() + hz = HagerZhang(; cache, check_flatness) + f = tc_to_f(tc) + fdf = tc_to_fdf(tc) + hz(f, fdf, 1.0, fdf(0.0)...), cache + end + + res, res_cache = test_tc(tc1, true) + @test_broken minimum(res_cache.values) == res[2] + + res2, res_cache2 = test_tc(tc1, false) + @test minimum(res_cache2.values) == res2[2] + #= + using AlgebraOfGraphics, CairoMakie + draw(data((x=0.0:0.05:5.5, y=map(x->tc_to_f(tc1)(x), 0:0.05:5.5)))*mapping(:x,:y)*visual(Scatter)+ + data((alphas=res_cache.alphas, values=res_cache.values))*mapping(:alphas,:values)*visual(Scatter; color=:red)) + =# + end + + # should add as upstream + #= + @testset "from kbarros" begin + # The minimizer is x0=[0, 2πn/100], with f(x0) = 1. Any integer n is fine. + function f(x) + return (x[1]^2 + 1) * (2 - cos(100*x[2])) + end + + using Optim + + function test_converges(method) + for i in 1:100 + r = randn(2) + res = optimize(f, r, method) + if Optim.converged(res) && minimum(res) > f([0,0]) + 1e-8 + println(""" + Incorrectly reported convergence after $(res.iterations) iterations + Reached x = $(Optim.minimizer(res)) with f(x) = $(minimum(res)) + """) + end + end + end + + # Works successfully, no printed output + test_converges(LBFGS(; linesearch=Optim.LineSearches.BackTracking(order=2))) + + # Prints ~10 failures to converge (in 100 tries). Frequently fails after the + # first line search. + test_converges(ConjugateGradient(; linesearch=Optim.LineSearches.HagerZhang(check_flatness=false))) + end + =# +end From a1859b203d3b89f334e9333f59a3490824cb1767 Mon Sep 17 00:00:00 2001 From: Patrick Kofod Mogensen Date: Wed, 26 Mar 2025 07:37:57 +0100 Subject: [PATCH 2/5] undo format --- src/hagerzhang.jl | 306 +++++++++++++++++----------------------------- 1 file changed, 113 insertions(+), 193 deletions(-) diff --git a/src/hagerzhang.jl b/src/hagerzhang.jl index e8688fa..61a0b4d 100644 --- a/src/hagerzhang.jl +++ b/src/hagerzhang.jl @@ -44,19 +44,19 @@ # Display flags are represented as a bitfield # (not exported, but can use via LineSearches.ITER, for example) const one64 = convert(UInt64, 1) -const FINAL = one64 -const ITER = one64 << 1 -const PARAMETERS = one64 << 2 -const GRADIENT = one64 << 3 -const SEARCHDIR = one64 << 4 -const ALPHA = one64 << 5 -const BETA = one64 << 6 +const FINAL = one64 +const ITER = one64 << 1 +const PARAMETERS = one64 << 2 +const GRADIENT = one64 << 3 +const SEARCHDIR = one64 << 4 +const ALPHA = one64 << 5 +const BETA = one64 << 6 # const ALPHAGUESS = one64 << 7 TODO: not needed -const BRACKET = one64 << 8 -const LINESEARCH = one64 << 9 -const UPDATE = one64 << 10 -const SECANT2 = one64 << 11 -const BISECT = one64 << 12 +const BRACKET = one64 << 8 +const LINESEARCH = one64 << 9 +const UPDATE = one64 << 10 +const SECANT2 = one64 << 11 +const BISECT = one64 << 12 const BARRIERCOEF = one64 << 13 display_nextbit = 14 @@ -80,32 +80,25 @@ Conjugate gradient line search implementation from: conjugate gradient method with guaranteed descent. ACM Transactions on Mathematical Software 32: 113–137. """ -@with_kw struct HagerZhang{T,Tm} <: AbstractLineSearch - delta::T = DEFAULTDELTA # c_1 Wolfe sufficient decrease condition - sigma::T = DEFAULTSIGMA # c_2 Wolfe curvature condition (Recommend 0.1 for GradientDescent) - alphamax::T = Inf - rho::T = 5.0 - epsilon::T = 1e-6 - gamma::T = 0.66 - linesearchmax::Int = 50 - psi3::T = 0.1 - display::Int = 0 - mayterminate::Tm = Ref{Bool}(false) - cache::Union{Nothing,LineSearchCache{T}} = nothing - check_flatness::Bool = false +@with_kw struct HagerZhang{T, Tm} <: AbstractLineSearch + delta::T = DEFAULTDELTA # c_1 Wolfe sufficient decrease condition + sigma::T = DEFAULTSIGMA # c_2 Wolfe curvature condition (Recommend 0.1 for GradientDescent) + alphamax::T = Inf + rho::T = 5.0 + epsilon::T = 1e-6 + gamma::T = 0.66 + linesearchmax::Int = 50 + psi3::T = 0.1 + display::Int = 0 + mayterminate::Tm = Ref{Bool}(false) + cache::Union{Nothing,LineSearchCache{T}} = nothing + check_flatness::Bool = false end -HagerZhang{T}(args...; kwargs...) where {T} = - HagerZhang{T,Base.RefValue{Bool}}(args...; kwargs...) - -function (ls::HagerZhang)( - df::AbstractObjective, - x::AbstractArray{T}, - s::AbstractArray{T}, - α::Real, - x_new::AbstractArray{T}, - phi_0::Real, - dphi_0::Real, -) where {T} +HagerZhang{T}(args...; kwargs...) where T = HagerZhang{T, Base.RefValue{Bool}}(args...; kwargs...) + +function (ls::HagerZhang)(df::AbstractObjective, x::AbstractArray{T}, + s::AbstractArray{T}, α::Real, + x_new::AbstractArray{T}, phi_0::Real, dphi_0::Real) where T ϕ, ϕdϕ = make_ϕ_ϕdϕ(df, x_new, x, s) ls(ϕ, ϕdϕ, α::Real, phi_0, dphi_0) end @@ -113,26 +106,18 @@ end (ls::HagerZhang)(ϕ, dϕ, ϕdϕ, c, phi_0, dphi_0) = ls(ϕ, ϕdϕ, c, phi_0, dphi_0) # TODO: Should we deprecate the interface that only uses the ϕ and ϕd\phi arguments? -function (ls::HagerZhang)(ϕ, ϕdϕ, c::T, phi_0::Real, dphi_0::Real) where {T} # Should c and phi_0 be same type? - @unpack delta, - sigma, - alphamax, - rho, - epsilon, - gamma, - linesearchmax, - psi3, - display, - mayterminate, - cache = ls +function (ls::HagerZhang)(ϕ, ϕdϕ, + c::T, + phi_0::Real, + dphi_0::Real) where T # Should c and phi_0 be same type? + @unpack delta, sigma, alphamax, rho, epsilon, gamma, + linesearchmax, psi3, display, mayterminate, cache = ls emptycache!(cache) zeroT = convert(T, 0) pushcache!(cache, zeroT, phi_0, dphi_0) if !(isfinite(phi_0) && isfinite(dphi_0)) - throw( - LineSearchException("Value and slope at step length = 0 must be finite.", T(0)), - ) + throw(LineSearchException("Value and slope at step length = 0 must be finite.", T(0))) end if dphi_0 >= eps(T) * abs(phi_0) throw(LineSearchException("Search direction is not a direction of descent.", T(0))) @@ -179,7 +164,7 @@ function (ls::HagerZhang)(ϕ, ϕdϕ, c::T, phi_0::Real, dphi_0::Real) where {T} # If c was generated by quadratic interpolation, check whether it # satisfies the Wolfe conditions if mayterminate[] && - satisfies_wolfe(c, phi_c, dphi_c, phi_0, dphi_0, phi_lim, delta, sigma) + satisfies_wolfe(c, phi_c, dphi_c, phi_0, dphi_0, phi_lim, delta, sigma) if display & LINESEARCH > 0 println("Wolfe condition satisfied on point alpha = ", c) end @@ -195,24 +180,17 @@ function (ls::HagerZhang)(ϕ, ϕdϕ, c::T, phi_0::Real, dphi_0::Real) where {T} cold = -one(T) while !isbracketed && iter < linesearchmax if display & BRACKET > 0 - println( - "bracketing: ia = ", - ia, - ", ib = ", - ib, - ", c = ", - c, - ", phi_c = ", - phi_c, - ", dphi_c = ", - dphi_c, - ) + println("bracketing: ia = ", ia, + ", ib = ", ib, + ", c = ", c, + ", phi_c = ", phi_c, + ", dphi_c = ", dphi_c) end if dphi_c >= zeroT # We've reached the upward slope, so we have b; examine # previous values to find a ib = length(alphas) - for i = (ib-1):-1:1 + for i = (ib - 1):-1:1 if values[i] <= phi_lim ia = i break @@ -224,7 +202,7 @@ function (ls::HagerZhang)(ϕ, ϕdϕ, c::T, phi_0::Real, dphi_0::Real) where {T} # have crested over the peak. Use bisection. ib = length(alphas) ia = 1 - if c ≉ alphas[ib] || slopes[ib] >= zeroT + if c ≉ alphas[ib] || slopes[ib] >= zeroT error("c = ", c) end # ia, ib = bisect(phi, lsr, ia, ib, phi_lim) # TODO: Pass options @@ -249,19 +227,13 @@ function (ls::HagerZhang)(ϕ, ϕdϕ, c::T, phi_0::Real, dphi_0::Real) where {T} if c > alphamax c = alphamax if display & BRACKET > 0 - println( - "bracket: exceeding alphamax, using c = alphamax = ", - alphamax, - ", cold = ", - cold, - ) + println("bracket: exceeding alphamax, using c = alphamax = ", alphamax, + ", cold = ", cold) end end phi_c, dphi_c = ϕdϕ(c) iterfinite = 1 - while !(isfinite(phi_c) && isfinite(dphi_c)) && - c > nextfloat(cold) && - iterfinite < iterfinitemax + while !(isfinite(phi_c) && isfinite(dphi_c)) && c > nextfloat(cold) && iterfinite < iterfinitemax alphamax = c # shrinks alphamax, assumes that steps >= c can never have finite phi_c and dphi_c iterfinite += 1 if display & BRACKET > 0 @@ -272,19 +244,11 @@ function (ls::HagerZhang)(ϕ, ϕdϕ, c::T, phi_0::Real, dphi_0::Real) where {T} end if !(isfinite(phi_c) && isfinite(dphi_c)) if display & BRACKET > 0 - println( - "Warning: failed to expand interval to bracket with finite values. If this happens frequently, check your function and gradient.", - ) - println( - "c = ", - c, - ", alphamax = ", - alphamax, - ", phi_c = ", - phi_c, - ", dphi_c = ", - dphi_c, - ) + println("Warning: failed to expand interval to bracket with finite values. If this happens frequently, check your function and gradient.") + println("c = ", c, + ", alphamax = ", alphamax, + ", phi_c = ", phi_c, + ", dphi_c = ", dphi_c) end return cold, phi_cold end @@ -299,27 +263,18 @@ function (ls::HagerZhang)(ϕ, ϕdϕ, c::T, phi_0::Real, dphi_0::Real) where {T} b = alphas[ib] @assert b > a if display & LINESEARCH > 0 - println( - "linesearch: ia = ", - ia, - ", ib = ", - ib, - ", a = ", - a, - ", b = ", - b, - ", phi(a) = ", - values[ia], - ", phi(b) = ", - values[ib], - ) + println("linesearch: ia = ", ia, + ", ib = ", ib, + ", a = ", a, + ", b = ", b, + ", phi(a) = ", values[ia], + ", phi(b) = ", values[ib]) end if b - a <= eps(b) mayterminate[] = false # reset in case another initial guess is used next return a, values[ia] # lsr.value[ia] end - iswolfe, iA, iB = - secant2!(ϕdϕ, alphas, values, slopes, ia, ib, phi_lim, delta, sigma, display) + iswolfe, iA, iB = secant2!(ϕdϕ, alphas, values, slopes, ia, ib, phi_lim, delta, sigma, display) if iswolfe mayterminate[] = false # reset in case another initial guess is used next return alphas[iA], values[iA] # lsr.value[iA] @@ -331,9 +286,7 @@ function (ls::HagerZhang)(ϕ, ϕdϕ, c::T, phi_0::Real, dphi_0::Real) where {T} if display & LINESEARCH > 0 println("Linesearch: secant succeeded") end - if ls.check_flatness && - nextfloat(values[ia]) >= values[ib] && - nextfloat(values[iA]) >= values[iB] + if ls.check_flatness && nextfloat(values[ia]) >= values[ib] && nextfloat(values[iA]) >= values[iB] # It's so flat, secant didn't do anything useful, time to quit if display & LINESEARCH > 0 println("Linesearch: secant suggests it's flat") @@ -357,44 +310,30 @@ function (ls::HagerZhang)(ϕ, ϕdϕ, c::T, phi_0::Real, dphi_0::Real) where {T} push!(values, phi_c) push!(slopes, dphi_c) - ia, ib = update!( - ϕdϕ, - alphas, - values, - slopes, - iA, - iB, - length(alphas), - phi_lim, - display, - ) + ia, ib = update!(ϕdϕ, alphas, values, slopes, iA, iB, length(alphas), phi_lim, display) end iter += 1 end - throw( - LineSearchException( - "Linesearch failed to converge, reached maximum iterations $(linesearchmax).", - alphas[ia], - ), - ) + throw(LineSearchException("Linesearch failed to converge, reached maximum iterations $(linesearchmax).", + alphas[ia])) end # Check Wolfe & approximate Wolfe -function satisfies_wolfe( - c::T, - phi_c::Real, - dphi_c::Real, - phi_0::Real, - dphi_0::Real, - phi_lim::Real, - delta::Real, - sigma::Real, -) where {T<:Number} - wolfe1 = delta * dphi_0 >= (phi_c - phi_0) / c && dphi_c >= sigma * dphi_0 - wolfe2 = (2 * delta - 1) * dphi_0 >= dphi_c >= sigma * dphi_0 && phi_c <= phi_lim +function satisfies_wolfe(c::T, + phi_c::Real, + dphi_c::Real, + phi_0::Real, + dphi_0::Real, + phi_lim::Real, + delta::Real, + sigma::Real) where T<:Number + wolfe1 = delta * dphi_0 >= (phi_c - phi_0) / c && + dphi_c >= sigma * dphi_0 + wolfe2 = (2 * delta - 1) * dphi_0 >= dphi_c >= sigma * dphi_0 && + phi_c <= phi_lim return wolfe1 || wolfe2 end @@ -406,18 +345,16 @@ function secant(alphas, values, slopes, ia::Integer, ib::Integer) return secant(alphas[ia], alphas[ib], slopes[ia], slopes[ib]) end # phi -function secant2!( - ϕdϕ, - alphas, - values, - slopes, - ia::Integer, - ib::Integer, - phi_lim::Real, - delta::Real = DEFAULTDELTA, - sigma::Real = DEFAULTSIGMA, - display::Integer = 0, -) +function secant2!(ϕdϕ, + alphas, + values, + slopes, + ia::Integer, + ib::Integer, + phi_lim::Real, + delta::Real = DEFAULTDELTA, + sigma::Real = DEFAULTSIGMA, + display::Integer = 0) phi_0 = values[1] dphi_0 = slopes[1] a = alphas[ia] @@ -427,13 +364,9 @@ function secant2!( T = eltype(slopes) zeroT = convert(T, 0) if !(dphi_a < zeroT && dphi_b >= zeroT) - error( - string( - "Search direction is not a direction of descent; ", - "this error may indicate that user-provided derivatives are inaccurate. ", - @sprintf "(dphi_a = %f; dphi_b = %f)" dphi_a dphi_b - ), - ) + error(string("Search direction is not a direction of descent; ", + "this error may indicate that user-provided derivatives are inaccurate. ", + @sprintf "(dphi_a = %f; dphi_b = %f)" dphi_a dphi_b)) end c = secant(a, b, dphi_a, dphi_b) if display & SECANT2 > 0 @@ -502,17 +435,15 @@ end # Given a third point, pick the best two that retain the bracket # around the minimum (as defined by HZ, eq. 29) # b will be the upper bound, and a the lower bound -function update!( - ϕdϕ, - alphas, - values, - slopes, - ia::Integer, - ib::Integer, - ic::Integer, - phi_lim::Real, - display::Integer = 0, -) +function update!(ϕdϕ, + alphas, + values, + slopes, + ia::Integer, + ib::Integer, + ic::Integer, + phi_lim::Real, + display::Integer = 0) a = alphas[ia] b = alphas[ib] T = eltype(slopes) @@ -526,22 +457,13 @@ function update!( phi_c = values[ic] dphi_c = slopes[ic] if display & UPDATE > 0 - println( - "update: ia = ", - ia, - ", a = ", - a, - ", ib = ", - ib, - ", b = ", - b, - ", c = ", - c, - ", phi_c = ", - phi_c, - ", dphi_c = ", - dphi_c, - ) + println("update: ia = ", ia, + ", a = ", a, + ", ib = ", ib, + ", b = ", b, + ", c = ", c, + ", phi_c = ", phi_c, + ", dphi_c = ", dphi_c) end if c < a || c > b return ia, ib #, 0, 0 # it's out of the bracketing interval @@ -563,16 +485,14 @@ function update!( end # HZ, stage U3 (with theta=0.5) -function bisect!( - ϕdϕ, - alphas::AbstractArray{T}, - values, - slopes, - ia::Integer, - ib::Integer, - phi_lim::Real, - display::Integer = 0, -) where {T} +function bisect!(ϕdϕ, + alphas::AbstractArray{T}, + values, + slopes, + ia::Integer, + ib::Integer, + phi_lim::Real, + display::Integer = 0) where T gphi = convert(T, NaN) a = alphas[ia] b = alphas[ib] From aa8fe5bd16a4b28b1e05ff99ca143db094a9a6b0 Mon Sep 17 00:00:00 2001 From: Patrick Kofod Mogensen Date: Wed, 26 Mar 2025 07:39:05 +0100 Subject: [PATCH 3/5] Update issues.jl --- test/issues.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/issues.jl b/test/issues.jl index 7d717fa..188d5e4 100644 --- a/test/issues.jl +++ b/test/issues.jl @@ -134,6 +134,8 @@ end end res, res_cache = test_tc(tc1, true) + @warn res + @warn res_cache @test_broken minimum(res_cache.values) == res[2] res2, res_cache2 = test_tc(tc1, false) From f862bb8516a6b15ce394166907695843db496118 Mon Sep 17 00:00:00 2001 From: Patrick Kofod Mogensen Date: Wed, 26 Mar 2025 07:44:26 +0100 Subject: [PATCH 4/5] Update issues.jl --- test/issues.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/issues.jl b/test/issues.jl index 188d5e4..65ac142 100644 --- a/test/issues.jl +++ b/test/issues.jl @@ -134,8 +134,8 @@ end end res, res_cache = test_tc(tc1, true) - @warn res - @warn res_cache + @show res + @show res_cache @test_broken minimum(res_cache.values) == res[2] res2, res_cache2 = test_tc(tc1, false) From 8b33bc2d441394787335aca6faa799b4f49543ff Mon Sep 17 00:00:00 2001 From: Patrick Kofod Mogensen Date: Wed, 26 Mar 2025 07:54:53 +0100 Subject: [PATCH 5/5] Fix tests --- test/captured.jl | 2 +- test/issues.jl | 6 ------ test/runtests.jl | 3 ++- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/test/captured.jl b/test/captured.jl index 05b81fa..d6d97f7 100644 --- a/test/captured.jl +++ b/test/captured.jl @@ -31,5 +31,5 @@ end fdf = OnceDifferentiable(tc) hz = HagerZhang() α, val = hz(fdf.f, fdf.fdf, 1.0, fdf.fdf(0.0)...) - @test_broken val <= minimum(tc) + @test val <= minimum(tc) end diff --git a/test/issues.jl b/test/issues.jl index 65ac142..687907e 100644 --- a/test/issues.jl +++ b/test/issues.jl @@ -37,12 +37,6 @@ res = (StrongWolfe())(ϕ, dϕ, ϕdϕ, α0, ϕ0, dϕ0) @test res[2] > 0 @test res[2] == ϕ(res[1]) -struct LineSearchTestCase - alphas::Vector{Float64} - values::Vector{Float64} - slopes::Vector{Float64} -end - @testset "HZ convergence issues" begin @testset "Flatness check issues" begin function prepare_test_case(; alphas, values, slopes) diff --git a/test/runtests.jl b/test/runtests.jl index 4910d53..011bc22 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -15,7 +15,8 @@ my_tests = [ "alphacalc.jl", "arbitrary_precision.jl", "examples.jl", - "captured.jl" + "captured.jl", + "issues.jl", ] mutable struct StateDummy