Commit
DICG (#460)
Implementation of the Decomposition Invariant Conditional Gradient and the Blended Decomposition Invariant Conditional Gradient.
matbesancon authored Nov 7, 2024
1 parent 77cac37 commit 39c76df
Showing 18 changed files with 1,807 additions and 12 deletions.
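The commit's entry points are FrankWolfe.decomposition_invariant_conditional_gradient and its blended variant. A minimal hedged sketch of a call, mirroring the probability-simplex examples updated below (the quadratic objective and dimensions here are illustrative, not from the commit):

using FrankWolfe, LinearAlgebra

n = 100
x_star = rand(n); x_star ./= sum(x_star)   # illustrative target inside the simplex
f(x) = 0.5 * norm(x - x_star)^2
grad!(storage, x) = (storage .= x .- x_star)

lmo = FrankWolfe.ProbabilitySimplexOracle(1.0)
x0 = FrankWolfe.compute_extreme_point(lmo, randn(n))

# returns (x, v, primal, dual_gap, traj_data), as destructured in the examples below
x, v, primal, dual_gap, traj = FrankWolfe.decomposition_invariant_conditional_gradient(
    f, grad!, lmo, x0, verbose=true, trajectory=true,
)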
3 changes: 2 additions & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "FrankWolfe"
uuid = "f55ce6ea-fdc5-4628-88c5-0087fe54bd30"
authors = ["ZIB-IOL"]
version = "0.4.2"
version = "0.4.3"

[deps]
Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97"
@@ -15,6 +15,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
MathOptSetDistances = "3b969827-a86c-476c-9527-bb6f1a8fbad5"

[compat]
Arpack = "0.5"
1 change: 1 addition & 0 deletions docs/Project.toml
@@ -21,6 +21,7 @@ ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Tullio = "bc48ee85-29a4-5162-ae0b-a64e1601d4bc"
ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
MathOptSetDistances = "3b969827-a86c-476c-9527-bb6f1a8fbad5"

[compat]
Documenter = "0.27"
65 changes: 62 additions & 3 deletions examples/optimal_experiment_design.jl
@@ -4,6 +4,10 @@ using Random
using Distributions
using LinearAlgebra
using Statistics
using SCIP
using MathOptInterface
const MOI = MathOptInterface
using SparseArrays
using Test

# The Optimal Experiment Design Problem consists of choosing a subset of experiments
@@ -43,6 +47,29 @@ function build_data(m)
return A
end

"""
Build the MOI version of the LMO.
"""
function build_moi_lmo(m)
o = SCIP.Optimizer()
MOI.empty!(o)
MOI.set(o, MOI.Silent(), true)

x = MOI.add_variables(o, m)

for xi in x
# each var has to be non-negative
MOI.add_constraint(o, xi, MOI.GreaterThan(0.0))
end

# the sum of all variables has to be at most 1.0
MOI.add_constraint(o, sum(x, init=0.0), MOI.LessThan(1.0))

lmo = FrankWolfe.MathOptLMO(o)

return lmo
end
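An illustrative query of the resulting LMO (not part of the commit): like any FrankWolfe LMO, it answers linear minimization through compute_extreme_point.

# hypothetical usage: minimize <direction, x> over {x >= 0, sum(x) <= 1}
lmo_moi = build_moi_lmo(5)
v = FrankWolfe.compute_extreme_point(lmo_moi, randn(5))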

"""
Check whether a given point x is in the domain of f, i.e., whether
X = transpose(A) * diagm(x) * A is positive definite.
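The function body is collapsed in this diff; a hedged sketch of such an oracle (the actual implementation may differ), using isposdef from LinearAlgebra (already imported in this file) and returning a closure of the kind passed as domain_oracle to Secant below:

function build_domain_oracle(A)
    # true iff X = transpose(A) * diagm(x) * A is positive definite
    return x -> isposdef(transpose(A) * diagm(x) * A)
end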
@@ -87,12 +114,11 @@ function build_start_point(A)
V = Vector{Float64}[]

for i in S
-v = zeros(m)
-v[i] = 1.0
+v = FrankWolfe.ScaledHotVector(1.0, i, m)
push!(V, v)
end

-x = sum(V .* 1/n)
+x = SparseArrays.SparseVector(sum(V .* 1/n))
active_set= FrankWolfe.ActiveSet(fill(1/n, n), V, x)

return x, active_set, S
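The change above replaces a dense one-hot vector with FrankWolfe.ScaledHotVector, a structured one-hot vector that avoids an O(m) allocation per vertex. A small illustration (values arbitrary; element access assumed from its AbstractVector interface):

v = FrankWolfe.ScaledHotVector(1.0, 3, 10)   # value 1.0 at index 3, length 10
length(v)   # 10
v[3]        # 1.0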
@@ -228,8 +254,24 @@ m = 300
domain_oracle = build_domain_oracle(A)
x_s, _, primal, dual_gap, traj_data_s, _ = FrankWolfe.blended_pairwise_conditional_gradient(f, grad!, lmo, active_set, verbose=true, line_search=FrankWolfe.Secant(domain_oracle=domain_oracle), trajectory=true)

lmo = FrankWolfe.ProbabilitySimplexOracle(1.0)
f, grad! = build_a_criterion(A, build_safe=false)
x0, active_set = build_start_point(A)
domain_oracle = build_domain_oracle(A)
x_d, _, primal, dual_gap, traj_data_d = FrankWolfe.decomposition_invariant_conditional_gradient(f, grad!, lmo, x0, verbose=true, line_search=FrankWolfe.Secant(domain_oracle=domain_oracle), trajectory=true)

lmo = FrankWolfe.ProbabilitySimplexOracle(1.0)
f, grad! = build_a_criterion(A, build_safe=false)
x0, active_set = build_start_point(A)
domain_oracle = build_domain_oracle(A)
x_b, _, primal, dual_gap, traj_data_b, _ = FrankWolfe.blended_conditional_gradient(f, grad!, lmo, x0, verbose=true, trajectory=true, line_search=FrankWolfe.Secant(domain_oracle=domain_oracle))

@test traj_data_s[end][1] < traj_data[end][1]
@test traj_data_d[end][1] <= traj_data_s[end][1]
@test traj_data_b[end][1] <= traj_data_s[end][1]
@test isapprox(f(x_s), f(x))
@test isapprox(f(x_s), f(x_d))
@test isapprox(f(x_s), f(x_b))
end

@testset "D-Optimal Design" begin
@@ -246,8 +288,25 @@ m = 300
domain_oracle = build_domain_oracle(A)
x_s, _, primal, dual_gap, traj_data_s, _ = FrankWolfe.blended_pairwise_conditional_gradient(f, grad!, lmo, active_set, verbose=true, line_search=FrankWolfe.Secant(domain_oracle=domain_oracle), trajectory=true)

lmo = FrankWolfe.ProbabilitySimplexOracle(1.0)
f, grad! = build_d_criterion(A, build_safe=false)
x0, active_set = build_start_point(A)
domain_oracle = build_domain_oracle(A)
x_d, _, primal, dual_gap, traj_data_d = FrankWolfe.decomposition_invariant_conditional_gradient(f, grad!, lmo, x0, verbose=true, line_search=FrankWolfe.Secant(domain_oracle=domain_oracle), trajectory=true)

lmo = FrankWolfe.ProbabilitySimplexOracle(1.0)
f, grad! = build_d_criterion(A, build_safe=false)
x0, active_set = build_start_point(A)
domain_oracle = build_domain_oracle(A)
x_b, _, primal, dual_gap, traj_data_b, _ = FrankWolfe.blended_conditional_gradient(f, grad!, lmo, x0, verbose=true, trajectory=true, line_search=FrankWolfe.Secant(domain_oracle=domain_oracle))

@test traj_data_s[end][1] < traj_data[end][1]
@test traj_data_d[end][1] <= traj_data_s[end][1]
@test traj_data_b[end][1] <= traj_data_s[end][1]
@test isapprox(f(x_s), f(x))
@test isapprox(f(x_s), f(x_d))
@test isapprox(f(x_s), f(x_b))
end
end

3 changes: 3 additions & 0 deletions src/FrankWolfe.jl
@@ -14,6 +14,8 @@ import MathOptInterface
const MOI = MathOptInterface
const MOIU = MOI.Utilities

import MathOptSetDistances as MOD

# for Birkhoff polytope LMO
import Hungarian

@@ -45,6 +47,7 @@ include("block_coordinate_algorithms.jl")
include("alternating_methods.jl")
include("blended_pairwise.jl")
include("pairwise.jl")
include("dicg.jl")
include("tracking.jl")
include("callback.jl")

17 changes: 17 additions & 0 deletions src/afw.jl
@@ -372,6 +372,23 @@ function away_frank_wolfe(
return (x=x, v=v, primal=primal, dual_gap=dual_gap, traj_data=traj_data, active_set=active_set)
end

# JUSTIFICATION for using the standard FW gap in the dual update `phi = min(dual_gap, phi / 2.0)` below.
# Note: usually we would use the strong FW gap for phi to scale over; however, it suffices to use the _standard_ FW gap instead.
# To this end, observe that we take a "lazy step", i.e., one using already stored vertices, whenever the following holds:
# grad_dot_x - grad_dot_lazy_fw_vertex + grad_dot_a - grad_dot_x >= phi / lazy_tolerance
# <=> grad_dot_a - grad_dot_lazy_fw_vertex >= phi / lazy_tolerance
# Now phi is at least dual_gap / 2, where dual_gap = grad_dot_x - grad_dot_fw_vertex, until we cannot find a vertex from the "lazy" (already seen) set, so:
# => 2 * lazy_tolerance * (grad_dot_a - grad_dot_lazy_fw_vertex) >= grad_dot_x - grad_dot_fw_vertex
# Via https://hackmd.io/@spokutta/B14MTMsLF (see also https://arxiv.org/pdf/2110.12650.pdf, Lemma 3.7 and (3.30))
# we have that:
# (2 * lazy_tolerance + 1.0) * < nabla f(x_t), d_t > >= grad_dot_a - grad_dot_fw_vertex (the strong FW gap),
# where d_t = away_vertex - x_t (away step) or d_t = x_t - lazy_fw_vertex (lazy FW step).
# As such, < nabla f(x_t), d_t > >= strong_FW_gap / (2 * lazy_tolerance + 1.0), which is the primal progress required by the original proof.
# Usually we have lazy_tolerance = 1.0, and hence:
# < nabla f(x_t), d_t > >= strong_FW_gap / 3.0
#
# A more complete derivation can be found at https://hackmd.io/@spokutta/B14MTMsLF.
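In display form (writing K for lazy_tolerance, a_t for the away vertex, v_t for the FW vertex, and v_t^lazy for the lazy FW vertex), the key implication in the comment above is:

\[
\langle \nabla f(x_t),\, a_t - v_t^{\text{lazy}} \rangle \ge \frac{\phi}{K}
\quad \text{and} \quad
\phi \ge \frac{1}{2} \langle \nabla f(x_t),\, x_t - v_t \rangle
\;\Longrightarrow\;
2K \, \langle \nabla f(x_t),\, a_t - v_t^{\text{lazy}} \rangle \ge \langle \nabla f(x_t),\, x_t - v_t \rangle .
\]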

function lazy_afw_step(x, gradient, lmo, active_set, phi, epsilon, d; use_extra_vertex_storage=false, extra_vertex_storage=nothing, lazy_tolerance=2.0, memory_mode::MemoryEmphasis=InplaceEmphasis())
_, v, v_loc, _, a_lambda, a, a_loc, _, _ = active_set_argminmax(active_set, gradient)
#Do lazy FW step
2 changes: 1 addition & 1 deletion src/blended_cg.jl
@@ -163,7 +163,7 @@ function blended_conditional_gradient(
end

if verbose
println("\nBlended Conditional Gradients Algorithm.")
println("\nBlended Conditional Gradient Algorithm.")
NumType = eltype(x)
println(
"MEMORY_MODE: $memory_mode STEPSIZE: $line_search EPSILON: $epsilon MAXITERATION: $max_iteration TYPE: $NumType",
37 changes: 37 additions & 0 deletions src/block_coordinate_algorithms.jl
@@ -93,6 +93,34 @@ function LazyUpdate(lazy_block::Int,refresh_rate::Int)
return LazyUpdate(lazy_block, refresh_rate, 1)
end

"""
The Lazy update order is discussed in "Flexible block-iterative
analysis for the Frank-Wolfe algorithm," by Braun, Pokutta, &
Woodstock (2024).
'lazy_block' is an index of a computationally expensive block to
update;
'refresh_rate' describes the frequency at which we perform
a full activation; and
'block_size' describes the number of "faster" blocks
(i.e., those excluding 'lazy_block') activated (chosen
uniformly at random) during each
of the "faster" iterations; for more detail, see the
article. If 'block_size' is unspecified, this defaults to
1.
Note: This methodology is currently only proven to work
with 'FrankWolfe.Shortstep' linesearches and a (not-yet
implemented) adaptive method; see the article for details.
"""
struct LazyUpdate <: BlockCoordinateUpdateOrder
lazy_block::Int
refresh_rate::Int
block_size::Int
end

function LazyUpdate(lazy_block::Int,refresh_rate::Int)
return LazyUpdate(lazy_block, refresh_rate, 1)
end
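A hedged usage sketch (assuming an objective f, its in-place gradient grad!, a FrankWolfe.ProductLMO lmo, a start point x0, and a Lipschitz estimate L are already defined):

# block 2 is the expensive one; a full activation every 10 rounds, one cheap block otherwise
order = FrankWolfe.LazyUpdate(2, 10)
x, _ = FrankWolfe.block_coordinate_frank_wolfe(
    f, grad!, lmo, x0,
    update_order=order,
    line_search=FrankWolfe.Shortstep(L),   # the step rule covered by the docstring's guarantee
)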

function select_update_indices(::FullUpdate, s::CallbackState, _)
return [1:length(s.lmo.lmos)]
end
@@ -204,6 +232,15 @@ function select_update_indices(update::LazyUpdate, s::CallbackState, dual_gaps)
return push!([[rand(range(1,l)[1:l .!= update.lazy_block]) for _ in range(1,update.block_size)] for _ in 1:(update.refresh_rate -1)], range(1,l))
end

function select_update_indices(update::LazyUpdate, s::CallbackState, dual_gaps)
#Returns a sequence of randomized cheap indices by excluding
#update.lazy_block until "refresh_rate" updates occur, then adds an
#update of everything while maintaining the randomized order.
l = length(s.lmo.lmos)
return push!([[rand(range(1,l)[1:l .!= update.lazy_block]) for _ in range(1,update.block_size)] for _ in 1:(update.refresh_rate -1)], range(1,l))
end
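For intuition, a hypothetical draw of the returned schedule (the cheap indices are random, so the exact values vary):

# l = 3 blocks, lazy_block = 1, refresh_rate = 3, block_size = 2 might yield
# [[3, 2], [2, 3], 1:3]
# i.e., two rounds over the cheap blocks {2, 3}, then a full sweep 1:3.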

"""
Update step for block-coordinate Frank-Wolfe.
These are implementations of different FW-algorithms to be used in a blockwise manner.

2 comments on commit 39c76df

@dhendryc (Collaborator)
@JuliaRegistrator register()

@JuliaRegistrator
Registration pull request created: JuliaRegistries/General/118911

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via:

git tag -a v0.4.3 -m "<description of version>" 39c76df43d41e68f221c941cb25dcfedd65f18af
git push origin v0.4.3
