BPCG with direct solve and Wolfe step
Added Wolfe step
pokutta authored Nov 6, 2024
1 parent 3bcaffd commit 013dc15
Showing 17 changed files with 285 additions and 84 deletions.
138 changes: 138 additions & 0 deletions examples/blended_pairwise_with_direct_solve_wolfe.jl
@@ -0,0 +1,138 @@
#=
This example demonstrates the use of the Blended Pairwise Conditional Gradient algorithm
with direct solve steps for a quadratic optimization problem over a sparse polytope, where the objective is not a standard quadratic.
The example showcases how the algorithm balances between:
- Pairwise steps for efficient optimization
- Periodic direct solves for handling the quadratic objective
- Lazy (approximate) linear minimization steps for improved iteration complexity
It also demonstrates how to set up custom callbacks for tracking algorithm progress.
=#

using FrankWolfe
using LinearAlgebra
using Random

import HiGHS
import MathOptInterface as MOI

include("../examples/plot_utils.jl")

n = Int(1e2)
k = 10000

# s = rand(1:100)
s = 10
@info "Seed $s"
Random.seed!(s)

A = let
A = randn(n, n)
A' * A
end
@assert isposdef(A)

const y = Random.rand(Bool, n) * 0.6 .+ 0.3

function f(x)
d = x - y
return dot(d, A, d)
end

function grad!(storage, x)
mul!(storage, A, x)
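# five-argument mul!: storage ← -2*A*y + 2*storage, i.e. ∇f(x) = 2A(x - y)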
return mul!(storage, A, y, -2, 2)
end


xpi = rand(n);
total = sum(xpi);

xp = xpi ./ total;
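
# NOTE: the definitions below shadow the quadratic objective above; the runs
# use f(x) = ||x - xp||^2 with gradient 2(x - xp), matching the Hessian 2I and
# linear term -2xp passed to the Wolfe active set further down.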

f(x) = norm(x - xp)^2
function grad!(storage, x)
@. storage = 2 * (x - xp)
end

lmo = FrankWolfe.KSparseLMO(5, 500.0)

## other LMOs to try
lmo = FrankWolfe.KSparseLMO(10, big"500.0")
# lmo = FrankWolfe.LpNormLMO{Float64,5}(100.0)
# lmo = FrankWolfe.ProbabilitySimplexOracle(100.0);
# lmo = FrankWolfe.UnitSimplexOracle(10000.0);

x00 = FrankWolfe.compute_extreme_point(lmo, rand(n))


function build_callback(trajectory_arr)
return function callback(state, active_set, args...)
return push!(trajectory_arr, (FrankWolfe.callback_state(state)..., length(active_set)))
end
end
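# The callback appends the algorithm state together with the current
# active-set size at every iteration; `args...` absorbs any extra arguments
# the algorithm passes after the active set.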


trajectoryBPCG_standard = []
x, v, primal, dual_gap, _ = FrankWolfe.blended_pairwise_conditional_gradient(
f,
grad!,
lmo,
copy(x00),
max_iteration=k,
verbose=true,
callback=build_callback(trajectoryBPCG_standard),
);


active_set_quadratic_automatic_standard = FrankWolfe.ActiveSetQuadraticLinearSolve(
FrankWolfe.ActiveSet([(1.0, copy(x00))]),
grad!,
MOI.instantiate(MOI.OptimizerWithAttributes(HiGHS.Optimizer, MOI.Silent() => true)),
)
trajectoryBPCG_quadratic_automatic_standard = []
x, v, primal, dual_gap, _ = FrankWolfe.blended_pairwise_conditional_gradient(
f,
grad!,
lmo,
active_set_quadratic_automatic_standard,
max_iteration=k,
verbose=true,
callback=build_callback(trajectoryBPCG_quadratic_automatic_standard),
);


active_set_quadratic_wolfe = FrankWolfe.ActiveSetQuadraticLinearSolve(
FrankWolfe.ActiveSet([(1.0, copy(x00))]),
2I, -2xp,
MOI.instantiate(MOI.OptimizerWithAttributes(HiGHS.Optimizer, MOI.Silent() => true)),
scheduler=FrankWolfe.LogScheduler(start_time=10, scaling_factor=1),
wolfe_step=true,
)
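# For f(x) = ||x - xp||^2 the gradient is 2x - 2xp, so the Hessian 2I and
# linear term -2xp are passed explicitly instead of being detected from grad!;
# wolfe_step=true switches the periodic direct solve to the affine-hull
# (Wolfe) variant.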
trajectoryBPCG_quadratic_wolfe = []
x, v, primal, dual_gap, _ = FrankWolfe.blended_pairwise_conditional_gradient(
f,
grad!,
lmo,
active_set_quadratic_wolfe,
max_iteration=k,
verbose=true,
callback=build_callback(trajectoryBPCG_quadratic_wolfe),
);

dataSparsity = [
trajectoryBPCG_standard,
trajectoryBPCG_quadratic_automatic_standard,
trajectoryBPCG_quadratic_wolfe,
]
labelSparsity = [
"BPCG (Standard)",
"BPCG (Direct Solve)",
"BPCG (Wolfe Step)",
]

# Plot trajectories
plot_trajectories(dataSparsity, labelSparsity, xscalelog=false)

# plot_sparsity(dataSparsity, labelSparsity, xscalelog=false)
88 changes: 75 additions & 13 deletions src/active_set_quadratic_direct_solve.jl
@@ -1,4 +1,3 @@

"""
ActiveSetQuadraticLinearSolve{AT, R, IT}
@@ -12,6 +11,8 @@ so that the gradient is `∇f(x)=Ax+b`.
This active set stores an inner `active_set` that keeps track of the current set of vertices and convex decomposition.
It therefore delegates all update, deletion, and addition operations to this inner active set.
The `weight`, `atoms`, and `x` fields should only be accessed for reading and are effectively the same objects as those in the inner active set.
The flag `wolfe_step` determines whether to use a Wolfe step from the min-norm point algorithm or the normal direct solve.
The Wolfe step solves the auxiliary subproblem over the affine hull of the current active set (instead of the convex hull).
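Concretely, the Wolfe step drops the `λ ≥ 0` constraints from the auxiliary subproblem; if the resulting affine minimizer has negative weights, a ratio-test line search moves from the current weights toward it until the first weight reaches zero.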
The structure also contains a scheduler struct which is passed to the `should_solve_lp` function.
To change the frequency at which the LP is solved, define another scheduler struct and implement the corresponding method.
@@ -33,6 +34,7 @@ struct ActiveSetQuadraticLinearSolve{
b::BT # linear term
active_set::AS
lp_optimizer::OT
wolfe_step::Bool
scheduler::SF
counter::Base.RefValue{Int}
end
@@ -47,9 +49,10 @@ function ActiveSetQuadraticLinearSolve(
grad!::Function,
lp_optimizer;
scheduler=LogScheduler(),
wolfe_step=false,
) where {AT,R}
A, b = detect_quadratic_function(grad!, tuple_values[1][2])
return ActiveSetQuadraticLinearSolve(tuple_values, A, b, lp_optimizer, scheduler=scheduler)
return ActiveSetQuadraticLinearSolve(tuple_values, A, b, lp_optimizer, scheduler=scheduler, wolfe_step=wolfe_step)
end

"""
@@ -63,6 +66,7 @@ function ActiveSetQuadraticLinearSolve(
b,
lp_optimizer;
scheduler=LogScheduler(),
wolfe_step=false,
) where {AT,R,H}
inner_as = ActiveSetQuadraticProductCaching(tuple_values, A, b)
return ActiveSetQuadraticLinearSolve(
@@ -73,6 +77,7 @@
inner_as.b,
inner_as,
lp_optimizer,
wolfe_step,
scheduler,
Ref(0),
)
@@ -84,6 +89,7 @@
b,
lp_optimizer;
scheduler=LogScheduler(),
wolfe_step=false,
)
as = ActiveSetQuadraticLinearSolve(
inner_as.weights,
@@ -93,6 +99,7 @@
b,
inner_as,
lp_optimizer,
wolfe_step,
scheduler,
Ref(0),
)
@@ -106,6 +113,7 @@
b,
lp_optimizer;
scheduler=LogScheduler(),
wolfe_step=false,
)
as = ActiveSetQuadraticLinearSolve(
inner_as.weights,
@@ -115,6 +123,7 @@
b,
inner_as,
lp_optimizer,
wolfe_step,
scheduler,
Ref(0),
)
@@ -127,9 +136,10 @@
grad!::Function,
lp_optimizer;
scheduler=LogScheduler(),
wolfe_step=false,
)
A, b = detect_quadratic_function(grad!, inner_as.atoms[1])
return ActiveSetQuadraticLinearSolve(inner_as, A, b, lp_optimizer; scheduler=scheduler)
return ActiveSetQuadraticLinearSolve(inner_as, A, b, lp_optimizer; scheduler=scheduler, wolfe_step=wolfe_step)
end

function ActiveSetQuadraticLinearSolve{AT,R}(
@@ -269,7 +279,9 @@ function solve_quadratic_activeset_lp!(
MOI.empty!(o)
λ = MOI.add_variables(o, nv)
# ∑ λ == 1; λ ≥ 0 is added only for the standard direct solve (dropped for a Wolfe step)
MOI.add_constraint.(o, λ, MOI.GreaterThan(0.0))
if !as.wolfe_step
MOI.add_constraint.(o, λ, MOI.GreaterThan(0.0))
end
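# For a Wolfe step the λ ≥ 0 bounds are dropped, so the system is solved over
# the affine hull of the active atoms; feasibility is restored afterwards by
# the ratio test in `_compute_new_weights_wolfe_step`.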
sum_of_variables = MOI.ScalarAffineFunction(MOI.ScalarAffineTerm.(1.0, λ), 0.0)
MOI.add_constraint(o, sum_of_variables, MOI.EqualTo(1.0))
# Vᵗ A V λ == -Vᵗ b
@@ -284,15 +296,30 @@
)
end
rhs = -dot(atom, as.b)
MOI.add_constraint(o, lhs, MOI.EqualTo(rhs))
MOI.add_constraint(o, lhs, MOI.EqualTo{Float64}(rhs))
end
MOI.set(o, MOI.ObjectiveFunction{typeof(sum_of_variables)}(), sum_of_variables)
MOI.set(o, MOI.ObjectiveSense(), MOI.MIN_SENSE)
MOI.optimize!(o)
if MOI.get(o, MOI.TerminationStatus()) ∉ (MOI.OPTIMAL, MOI.FEASIBLE_POINT, MOI.ALMOST_OPTIMAL)
return as
end
indices_to_remove = Int[]
indices_to_remove, new_weights = if as.wolfe_step
_compute_new_weights_wolfe_step(λ, R, as.weights, o)
else
_compute_new_weights_direct_solve(λ, R, o)
end
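# Shared epilogue for both paths: remove the zeroed atoms, write the solved
# weights back, then purge tiny weights, renormalize, and recompute the iterate.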
deleteat!(as.active_set, indices_to_remove)
@assert length(as) == length(new_weights)
update_weights!(as.active_set, new_weights)
active_set_cleanup!(as)
active_set_renormalize!(as)
compute_active_set_iterate!(as)
return as
end

function _compute_new_weights_direct_solve(λ, ::Type{R}, o::MOI.AbstractOptimizer) where {R}
indices_to_remove = BitSet()
new_weights = R[]
for idx in eachindex(λ)
weight_value = MOI.get(o, MOI.VariablePrimal(), λ[idx])
@@ -302,13 +329,48 @@
push!(new_weights, weight_value)
end
end
deleteat!(as.active_set, indices_to_remove)
@assert length(as) == length(new_weights)
update_weights!(as.active_set, new_weights)
active_set_cleanup!(as)
active_set_renormalize!(as)
compute_active_set_iterate!(as)
return as
return indices_to_remove, new_weights
end

function _compute_new_weights_wolfe_step(λ, ::Type{R}, old_weights, o::MOI.AbstractOptimizer) where {R}
wolfe_weights = MOI.get.(o, MOI.VariablePrimal(), λ)
# all nonnegative -> use non-wolfe procedure
if all(>=(-10eps()), wolfe_weights)
return _compute_new_weights_direct_solve(λ, R, o)
end
# ratio test to know which coordinate would hit zero first
tau_min = 1.0
set_indices_zero = BitSet()
for idx in eachindex(λ)
if wolfe_weights[idx] < old_weights[idx]
tau = old_weights[idx] / (old_weights[idx] - wolfe_weights[idx])
if abs(tau - tau_min) ≤ 2weight_purge_threshold_default(typeof(tau))
push!(set_indices_zero, idx)
elseif tau < tau_min
tau_min = tau
empty!(set_indices_zero)
push!(set_indices_zero, idx)
end
end
end
@assert length(set_indices_zero) >= 1
new_lambdas = [(1 - tau_min) * old_weights[idx] + tau_min * wolfe_weights[idx] for idx in eachindex(λ)]
for idx in set_indices_zero
new_lambdas[idx] = 0
end
@assert all(>=(-2weight_purge_threshold_default(eltype(new_lambdas))), new_lambdas) "All new_lambdas must be nonnegative; minimum is $(minimum(new_lambdas))"
@assert isapprox(sum(new_lambdas), 1.0) "The sum of new_lambdas must be approximately 1"
indices_to_remove = Int[]
new_weights = R[]
for idx in eachindex(λ)
weight_value = new_lambdas[idx] # using new lambdas
if weight_value <= eps()
push!(indices_to_remove, idx)
else
push!(new_weights, weight_value)
end
end
return indices_to_remove, new_weights
end
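# Worked illustration of the ratio test above (assumed toy values, not part of
# this commit): with old weights [0.5, 0.5] and affine-solve weights
# [1.2, -0.2], only the second weight decreases, so τ = 0.5/(0.5 - (-0.2)) = 5/7.
# Blending gives (1 - τ) .* [0.5, 0.5] .+ τ .* [1.2, -0.2] = [1.0, 0.0]:
# the weights still sum to one and the second atom leaves the active set.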

function _compute_quadratic_constraint_term(atom1, A::AbstractMatrix, atom2, λ)
2 changes: 2 additions & 0 deletions src/types.jl
@@ -103,6 +105,8 @@ function Base.copyto!(dst::SparseArrays.SparseVector, src::ScaledHotVector)
return dst
end

SparseArrays.issparse(::ScaledHotVector) = true
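# A ScaledHotVector has a single structural nonzero, so it is treated as
# sparse for dispatch purposes.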

function active_set_update_scale!(x::IT, lambda, atom::ScaledHotVector) where {IT}
x .*= (1 - lambda)
x[atom.val_idx] += lambda * atom.active_val
12 changes: 6 additions & 6 deletions test/active_set.jl
@@ -4,7 +4,7 @@ using FrankWolfe
import FrankWolfe: ActiveSet
using LinearAlgebra: norm

@testset "Active sets " begin
@testset "Active sets" begin

@testset "Constructors and eltypes" begin
active_set = ActiveSet([(0.1, [1, 2, 3]), (0.9, [2, 3, 4]), (0.0, [5, 6, 7])])
@@ -100,7 +100,7 @@
end
end

@testset "Simplex gradient descent " begin
@testset "Simplex gradient descent" begin
# Gradient descent over a 2-D unit simplex
# each atom is a vertex, direction points to [1,1]
# note: integers for atom element types
@@ -175,7 +175,7 @@
end
end

@testset "LP separation oracle " begin
@testset "LP separation oracle" begin
# Gradient descent over a L-inf ball of radius one
# current active set contains 3 vertices
# direction points to [1,1]
@@ -212,7 +212,7 @@
)
end

@testset "Argminmax " begin
@testset "Argminmax" begin
active_set = FrankWolfe.ActiveSet([(0.6, [-1, -1]), (0.2, [0, 1]), (0.2, [1, 0])])
(λ_min, a_min, i_min, val, λ_max, a_max, i_max, valM, progress) =
FrankWolfe.active_set_argminmax(active_set, [1, 1.5])
@@ -221,7 +221,7 @@
@test_throws ErrorException FrankWolfe.active_set_argminmax(active_set, [NaN, NaN])
end

@testset "LPseparationWithScaledHotVector " begin
@testset "LPseparationWithScaledHotVector" begin
v1 = FrankWolfe.ScaledHotVector(1, 1, 2)
v2 = FrankWolfe.ScaledHotVector(1, 2, 2)
v3 = FrankWolfe.ScaledHotVector(0, 2, 2)
@@ -240,7 +240,7 @@
)
end

@testset "ActiveSet for BigFloat " begin
@testset "ActiveSet for BigFloat" begin
n = Int(1e2)
lmo = FrankWolfe.LpNormLMO{BigFloat,1}(rand())
x0 = Vector(FrankWolfe.compute_extreme_point(lmo, zeros(n)))