Commit
DICG (#460)
Implementation of the Decomposition Invariant Conditional Gradient and the Blended Decomposition Invariant Conditional Gradient.
matbesancon authored Nov 7, 2024
1 parent 77cac37 commit 39c76df
Showing 18 changed files with 1,807 additions and 12 deletions.
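The commit's entry points are FrankWolfe.decomposition_invariant_conditional_gradient and its blended variant. A minimal hedged sketch of a call, mirroring the probability-simplex examples updated below (the quadratic objective and dimensions here are illustrative, not from the commit):

using FrankWolfe, LinearAlgebra

n = 100
x_star = rand(n); x_star ./= sum(x_star)   # illustrative target inside the simplex
f(x) = 0.5 * norm(x - x_star)^2
grad!(storage, x) = (storage .= x .- x_star)

lmo = FrankWolfe.ProbabilitySimplexOracle(1.0)
x0 = FrankWolfe.compute_extreme_point(lmo, randn(n))

# returns (x, v, primal, dual_gap, traj_data), as destructured in the examples below
x, v, primal, dual_gap, traj = FrankWolfe.decomposition_invariant_conditional_gradient(
    f, grad!, lmo, x0, verbose=true, trajectory=true,
)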
3 changes: 2 additions & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "FrankWolfe"
uuid = "f55ce6ea-fdc5-4628-88c5-0087fe54bd30"
authors = ["ZIB-IOL"]
version = "0.4.2"
version = "0.4.3"

[deps]
Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97"
@@ -15,6 +15,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
MathOptSetDistances = "3b969827-a86c-476c-9527-bb6f1a8fbad5"

[compat]
Arpack = "0.5"
1 change: 1 addition & 0 deletions docs/Project.toml
@@ -21,6 +21,7 @@ ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Tullio = "bc48ee85-29a4-5162-ae0b-a64e1601d4bc"
ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
MathOptSetDistances = "3b969827-a86c-476c-9527-bb6f1a8fbad5"

[compat]
Documenter = "0.27"
65 changes: 62 additions & 3 deletions examples/optimal_experiment_design.jl
@@ -4,6 +4,10 @@ using Random
using Distributions
using LinearAlgebra
using Statistics
using SCIP
using MathOptInterface
const MOI = MathOptInterface
using SparseArrays
using Test

# The Optimal Experiment Design Problem consists of choosing a subset of experiments
@@ -43,6 +47,29 @@ function build_data(m)
return A
end

"""
Build the MOI version of the LMO.
"""
function build_moi_lmo(m)
o = SCIP.Optimizer()
MOI.empty!(o)
MOI.set(o, MOI.Silent(), true)

x = MOI.add_variables(o, m)

for xi in x
# each var has to be non-negative
MOI.add_constraint(o, xi, MOI.GreaterThan(0.0))
end

# the sum of all variables has to be at most 1.0
MOI.add_constraint(o, sum(x, init=0.0), MOI.LessThan(1.0))

lmo = FrankWolfe.MathOptLMO(o)

return lmo
end
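An illustrative query of the resulting LMO (not part of the commit): like any FrankWolfe LMO, it answers linear minimization through compute_extreme_point.

# hypothetical usage: minimize <direction, x> over {x >= 0, sum(x) <= 1}
lmo_moi = build_moi_lmo(5)
v = FrankWolfe.compute_extreme_point(lmo_moi, randn(5))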

"""
Check whether a given point x is in the domain of f, i.e., whether
X = transpose(A) * diagm(x) * A is positive definite.
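The function body is collapsed in this diff; a hedged sketch of such an oracle (the actual implementation may differ), using isposdef from LinearAlgebra (already imported in this file) and returning a closure of the kind passed as domain_oracle to Secant below:

function build_domain_oracle(A)
    # true iff X = transpose(A) * diagm(x) * A is positive definite
    return x -> isposdef(transpose(A) * diagm(x) * A)
end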
@@ -87,12 +114,11 @@ function build_start_point(A)
V = Vector{Float64}[]

for i in S
-v = zeros(m)
-v[i] = 1.0
+v = FrankWolfe.ScaledHotVector(1.0, i, m)
push!(V, v)
end

-x = sum(V .* 1/n)
+x = SparseArrays.SparseVector(sum(V .* 1/n))
active_set= FrankWolfe.ActiveSet(fill(1/n, n), V, x)

return x, active_set, S
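The change above replaces a dense one-hot vector with FrankWolfe.ScaledHotVector, a structured one-hot vector that avoids an O(m) allocation per vertex. A small illustration (values arbitrary; element access assumed from its AbstractVector interface):

v = FrankWolfe.ScaledHotVector(1.0, 3, 10)   # value 1.0 at index 3, length 10
length(v)   # 10
v[3]        # 1.0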
@@ -228,8 +254,24 @@ m = 300
domain_oracle = build_domain_oracle(A)
x_s, _, primal, dual_gap, traj_data_s, _ = FrankWolfe.blended_pairwise_conditional_gradient(f, grad!, lmo, active_set, verbose=true, line_search=FrankWolfe.Secant(domain_oracle=domain_oracle), trajectory=true)

lmo = FrankWolfe.ProbabilitySimplexOracle(1.0)
f, grad! = build_a_criterion(A, build_safe=false)
x0, active_set = build_start_point(A)
domain_oracle = build_domain_oracle(A)
x_d, _, primal, dual_gap, traj_data_d = FrankWolfe.decomposition_invariant_conditional_gradient(f, grad!, lmo, x0, verbose=true, line_search=FrankWolfe.Secant(domain_oracle=domain_oracle), trajectory=true)

lmo = FrankWolfe.ProbabilitySimplexOracle(1.0)
f, grad! = build_a_criterion(A, build_safe=false)
x0, active_set = build_start_point(A)
domain_oracle = build_domain_oracle(A)
x_b, _, primal, dual_gap, traj_data_b, _ = FrankWolfe.blended_conditional_gradient(f, grad!, lmo, x0, verbose=true, trajectory=true, line_search=FrankWolfe.Secant(domain_oracle=domain_oracle))

@test traj_data_s[end][1] < traj_data[end][1]
@test traj_data_d[end][1] <= traj_data_s[end][1]
@test traj_data_b[end][1] <= traj_data_s[end][1]
@test isapprox(f(x_s), f(x))
@test isapprox(f(x_s), f(x_d))
@test isapprox(f(x_s), f(x_b))
end

@testset "D-Optimal Design" begin
@@ -246,8 +288,25 @@ m = 300
domain_oracle = build_domain_oracle(A)
x_s, _, primal, dual_gap, traj_data_s, _ = FrankWolfe.blended_pairwise_conditional_gradient(f, grad!, lmo, active_set, verbose=true, line_search=FrankWolfe.Secant(domain_oracle=domain_oracle), trajectory=true)

lmo = FrankWolfe.ProbabilitySimplexOracle(1.0)
f, grad! = build_d_criterion(A, build_safe=false)
x0, active_set = build_start_point(A)
domain_oracle = build_domain_oracle(A)
x_d, _, primal, dual_gap, traj_data_d = FrankWolfe.decomposition_invariant_conditional_gradient(f, grad!, lmo, x0, verbose=true, line_search=FrankWolfe.Secant(domain_oracle=domain_oracle), trajectory=true)

lmo = FrankWolfe.ProbabilitySimplexOracle(1.0)
f, grad! = build_d_criterion(A, build_safe=false)
x0, active_set = build_start_point(A)
domain_oracle = build_domain_oracle(A)
x_b, _, primal, dual_gap, traj_data_b, _ = FrankWolfe.blended_conditional_gradient(f, grad!, lmo, x0, verbose=true, trajectory=true, line_search=FrankWolfe.Secant(domain_oracle=domain_oracle))

@test traj_data_s[end][1] < traj_data[end][1]
@test traj_data_d[end][1] <= traj_data_s[end][1]
@test traj_data_b[end][1] <= traj_data_s[end][1]
@test isapprox(f(x_s), f(x))
@test isapprox(f(x_s), f(x_d))
@test isapprox(f(x_s), f(x_b))
end
end

3 changes: 3 additions & 0 deletions src/FrankWolfe.jl
@@ -14,6 +14,8 @@ import MathOptInterface
const MOI = MathOptInterface
const MOIU = MOI.Utilities

import MathOptSetDistances as MOD

# for Birkhoff polytope LMO
import Hungarian

@@ -45,6 +47,7 @@ include("block_coordinate_algorithms.jl")
include("alternating_methods.jl")
include("blended_pairwise.jl")
include("pairwise.jl")
include("dicg.jl")
include("tracking.jl")
include("callback.jl")

17 changes: 17 additions & 0 deletions src/afw.jl
@@ -372,6 +372,23 @@ function away_frank_wolfe(
return (x=x, v=v, primal=primal, dual_gap=dual_gap, traj_data=traj_data, active_set=active_set)
end

# JUSTIFICATION for using the standard FW gap in the dual update `phi = min(dual_gap, phi / 2.0)` below.
# Note: usually we would use the strong FW gap for phi to scale over; however, it suffices to use the _standard_ FW gap instead.
# To this end, observe that we take a "lazy step", i.e., one using already stored vertices, whenever the following holds:
# grad_dot_x - grad_dot_lazy_fw_vertex + grad_dot_a - grad_dot_x >= phi / lazy_tolerance
# <=> grad_dot_a - grad_dot_lazy_fw_vertex >= phi / lazy_tolerance
# Now phi is at least dual_gap / 2, where dual_gap = grad_dot_x - grad_dot_fw_vertex, until we cannot find a vertex from the "lazy" (already seen) set, so:
# => 2 * lazy_tolerance * (grad_dot_a - grad_dot_lazy_fw_vertex) >= grad_dot_x - grad_dot_fw_vertex
# Via https://hackmd.io/@spokutta/B14MTMsLF (see also https://arxiv.org/pdf/2110.12650.pdf, Lemma 3.7 and (3.30))
# we have that:
# (2 * lazy_tolerance + 1.0) * < nabla f(x_t), d_t > >= grad_dot_a - grad_dot_fw_vertex (the strong FW gap),
# where d_t = away_vertex - x_t (away step) or d_t = x_t - lazy_fw_vertex (lazy FW step).
# As such, < nabla f(x_t), d_t > >= strong_FW_gap / (2 * lazy_tolerance + 1.0), which is the primal progress required by the original proof.
# Usually we have lazy_tolerance = 1.0, and hence:
# < nabla f(x_t), d_t > >= strong_FW_gap / 3.0
#
# A more complete derivation can be found at https://hackmd.io/@spokutta/B14MTMsLF.
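In display form (writing K for lazy_tolerance, a_t for the away vertex, v_t for the FW vertex, and v_t^lazy for the lazy FW vertex), the key implication in the comment above is:

\[
\langle \nabla f(x_t),\, a_t - v_t^{\text{lazy}} \rangle \ge \frac{\phi}{K}
\quad \text{and} \quad
\phi \ge \frac{1}{2} \langle \nabla f(x_t),\, x_t - v_t \rangle
\;\Longrightarrow\;
2K \, \langle \nabla f(x_t),\, a_t - v_t^{\text{lazy}} \rangle \ge \langle \nabla f(x_t),\, x_t - v_t \rangle .
\]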

function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_vertex_storage=false, extra_vertex_storage=nothing, lazy_tolerance=2.0, memory_mode::MemoryEmphasis=InplaceEmphasis())
_, v, v_loc, _, a_lambda, a, a_loc, _, _ = active_set_argminmax(active_set, gradient)
#Do lazy FW step
2 changes: 1 addition & 1 deletion src/blended_cg.jl
@@ -163,7 +163,7 @@ function blended_conditional_gradient(
end

if verbose
println("\nBlended Conditional Gradients Algorithm.")
println("\nBlended Conditional Gradient Algorithm.")
NumType = eltype(x)
println(
"MEMORY_MODE: $memory_mode STEPSIZE: $line_search EPSILON: $epsilon MAXITERATION: $max_iteration TYPE: $NumType",
37 changes: 37 additions & 0 deletions src/block_coordinate_algorithms.jl
@@ -93,6 +93,34 @@ function LazyUpdate(lazy_block::Int,refresh_rate::Int)
return LazyUpdate(lazy_block, refresh_rate, 1)
end

"""
The Lazy update order is discussed in "Flexible block-iterative
analysis for the Frank-Wolfe algorithm," by Braun, Pokutta, &
Woodstock (2024).
'lazy_block' is an index of a computationally expensive block to
update;
'refresh_rate' describes the frequency at which we perform
a full activation; and
'block_size' describes the number of "faster" blocks
(i.e., those excluding 'lazy_block') activated (chosen
uniformly at random) during each
of the "faster" iterations; for more detail, see the
article. If 'block_size' is unspecified, this defaults to
1.
Note: This methodology is currently only proven to work
with 'FrankWolfe.Shortstep' linesearches and a (not-yet
implemented) adaptive method; see the article for details.
"""
struct LazyUpdate <: BlockCoordinateUpdateOrder
lazy_block::Int
refresh_rate::Int
block_size::Int
end

function LazyUpdate(lazy_block::Int,refresh_rate::Int)
return LazyUpdate(lazy_block, refresh_rate, 1)
end
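A hedged usage sketch (assuming an objective f, its in-place gradient grad!, a FrankWolfe.ProductLMO lmo, a start point x0, and a Lipschitz estimate L are already defined):

# block 2 is the expensive one; a full activation every 10 rounds, one cheap block otherwise
order = FrankWolfe.LazyUpdate(2, 10)
x, _ = FrankWolfe.block_coordinate_frank_wolfe(
    f, grad!, lmo, x0,
    update_order=order,
    line_search=FrankWolfe.Shortstep(L),   # the step rule covered by the docstring's guarantee
)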

function select_update_indices(::FullUpdate, s::CallbackState, _)
return [1:length(s.lmo.lmos)]
end
@@ -204,6 +232,15 @@ function select_update_indices(update::LazyUpdate, s::CallbackState, dual_gaps)
return push!([[rand(range(1,l)[1:l .!= update.lazy_block]) for _ in range(1,update.block_size)] for _ in 1:(update.refresh_rate -1)], range(1,l))
end

function select_update_indices(update::LazyUpdate, s::CallbackState, dual_gaps)
#Returns a sequence of randomized cheap indices by excluding
#update.lazy_block until "refresh_rate" updates occur, then adds an
#update of everything while maintaining the randomized order.
l = length(s.lmo.lmos)
return push!([[rand(range(1,l)[1:l .!= update.lazy_block]) for _ in range(1,update.block_size)] for _ in 1:(update.refresh_rate -1)], range(1,l))
end
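For intuition, a hypothetical draw of the returned schedule (the cheap indices are random, so the exact values vary):

# l = 3 blocks, lazy_block = 1, refresh_rate = 3, block_size = 2 might yield
# [[3, 2], [2, 3], 1:3]
# i.e., two rounds over the cheap blocks {2, 3}, then a full sweep 1:3.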

"""
Update step for block-coordinate Frank-Wolfe.
These are implementations of different FW-algorithms to be used in a blockwise manner.

2 comments on commit 39c76df

@dhendryc (Collaborator)
@JuliaRegistrator register()

@JuliaRegistrator
Registration pull request created: JuliaRegistries/General/118911

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via:

git tag -a v0.4.3 -m "<description of version>" 39c76df43d41e68f221c941cb25dcfedd65f18af
git push origin v0.4.3
