Skip to content

Commit

Permalink
Adding support for vectorized fmm calls
Browse files Browse the repository at this point in the history
Adding support for vectorized calls to `hfmm3d`. As mentioned by Manas Rachh the speedup is limited around batches of sizes 16-32 so the multiplication is done in batches.
  • Loading branch information
mipals committed Sep 21, 2023
1 parent 8cb7dcc commit 5161375
Show file tree
Hide file tree
Showing 2 changed files with 99 additions and 5 deletions.
101 changes: 96 additions & 5 deletions src/3d/fast_multipole_method.jl
Original file line number Diff line number Diff line change
Expand Up @@ -143,15 +143,18 @@ Saves the ``i``th column of `normals` scaled by the ``i``th value of `scalings`
Used when creating the "dipole-vectors" required for using the FMM.
"""
function scale_columns!(dipvecs, normals, scalings)
@inbounds for i in eachindex(scalings)
dipvecs[1, i] = normals[1, i] * scalings[i]
dipvecs[2, i] = normals[2, i] * scalings[i]
dipvecs[3, i] = normals[3, i] * scalings[i]
function scale_columns!(dipvecs, normals, scalings, nd=1)
@inbounds for i = 1:size(normals,2)
@inbounds for j = 1:nd
dipvecs[j + 0nd, i] = normals[1, i] * scalings[i,j]
dipvecs[j + 1nd, i] = normals[2, i] * scalings[i,j]
dipvecs[j + 2nd, i] = normals[3, i] * scalings[i,j]
end
end
return dipvecs
end


"""
setup_fast_operator(mesh,zk,n_gauss,nearfield,offset,depth;single_layer=true)
Expand Down Expand Up @@ -245,6 +248,47 @@ function LinearAlgebra.mul!(y::AbstractVecOrMat{T},
# The FMM3D library does not divide by 4π so we do it manually
return y .= vals.pottarg / 4π + A.nearfield_correction * x
end
function LinearAlgebra.mul!(Y::AbstractMatrix{T},
A::FMMGOperator{T},
X::AbstractMatrix) where {T<:ComplexF64}
# Checking dimensions
LinearMaps.check_dim_mul(Y, A, X)
# Set between 16-32 as discussed with Manas Rachh
batch_size = 32
# Get input, batch_size and number of sources
n_input = size(X,2)
n_whole = div(n_input,batch_size)
n_sources = size(A.sources,2)
# Preallocation
coefficients = zeros(eltype(X), batch_size, n_sources)
for i = 1:n_whole
# Getting X-batch
Xview = @view X[:, (batch_size*(i-1) + 1):(batch_size*i) ]
# Mapping collocation point values to Gaussian points
mul!(coefficients, Transpose(Xview), A.C')
# Computing the FMM sum
vals = hfmm3d(A.tol, A.k, A.sources; targets=A.targets, charges=coefficients, pgt=1, nd=batch_size)
# Ouput equal to the FMM contributions + near field corrections
Y[:, (batch_size*(i-1) + 1):(batch_size*i) ] .=
Transpose(vals.pottarg) / 4π + A.nearfield_correction * Xview
end
# Checking if batch_size was equal to input - Returning if true
if mod(n_input,batch_size) == 0
return Y
end
# Getting X-batch
Xview = @view X[:, (batch_size*n_whole + 1):(batch_size*n_whole + mod(n_input,batch_size)) ]
# Pre-allocating
coefficients = zeros(eltype(X), mod(n_input,batch_size), n_sources)
# Mapping values of X-batch at the collocation points to the Gaussian points
mul!(coefficients, Transpose(Xview), A.C')
# Computing the FMM sum
vals = hfmm3d(A.tol, A.k, A.sources; targets=A.targets, charges=coefficients, pgt=1, nd=mod(n_input,batch_size))
# Ouput equal to the FMM contributions + near field corrections
Y[:, (batch_size*n_whole + 1):(batch_size*n_whole + mod(n_input,batch_size)) ] .=
Transpose(vals.pottarg) / 4π + A.nearfield_correction * Xview
return Y
end

function FMMGOperator(mesh, k; tol=1e-6, n_gauss=3, nearfield=true, offset=0.2, depth=1)
# Making sure the wave number is complex
Expand Down Expand Up @@ -347,6 +391,53 @@ function LinearAlgebra.mul!(y::AbstractVecOrMat{T},
# The FMM3D library does not divide by 4π so we do it manually
return y .= vals.pottarg / 4π + A.nearfield_correction * x
end
function LinearAlgebra.mul!(Y::AbstractMatrix{T},
A::FMMFOperator{T},
X::AbstractMatrix) where {T<:ComplexF64}
# Checking dimensions
LinearMaps.check_dim_mul(Y, A, X)
# Set between 16-32 as discussed with Manas Rachh
batch_size = 32
# Get input and number of sources
n_input = size(X,2)
n_whole = div(n_input,batch_size)
n_sources = size(A.sources,2)
# Preallocation
coefficients = zeros(eltype(X), n_sources, batch_size)
dipvecs = zeros(eltype(coefficients), 3batch_size, n_sources)
for i = 1:n_whole
# Getting X-batch
Xview = @view X[:, (batch_size*(i-1) + 1):(batch_size*i) ]
# Mapping collocation point values to Gaussian points
mul!(coefficients, A.C, Xview)
# Scale "dipoles"
scale_columns!(dipvecs, A.normals, coefficients, batch_size)
# Computing the FMM sum
vals = hfmm3d(A.tol, A.k, A.sources; targets=A.targets, dipvecs=dipvecs, pgt=1, nd=batch_size)
# Ouput equal to the FMM contributions + near field corrections
Y[:, (batch_size*(i-1) + 1):(batch_size*i) ] .=
Transpose(vals.pottarg) / 4π + A.nearfield_correction * Xview
end
# Checking if batch_size was equal to input - Returning if true
if mod(n_input,batch_size) == 0
return Y
end
# Getting X-batch
Xview = @view X[:, (batch_size*n_whole + 1):(batch_size*n_whole + mod(n_input,batch_size)) ]
# Pre-allocating
coefficients = zeros(eltype(X), n_sources, mod(n_input,batch_size))
dipvecs = zeros(eltype(coefficients), 3*mod(n_input,batch_size), n_sources)
# Mapping values of X-batch at the collocation points to the Gaussian points
mul!(coefficients, A.C, Xview)
# Scale "dipoles"
scale_columns!(dipvecs, A.normals, coefficients, mod(n_input,batch_size))
# Computing the FMM sum
vals = hfmm3d(A.tol, A.k, A.sources; targets=A.targets, dipvecs=dipvecs, pgt=1, nd=mod(n_input,batch_size))
# Ouput equal to the FMM contributions + near field corrections
Y[:, (batch_size*n_whole + 1):(batch_size*n_whole + mod(n_input,batch_size)) ] .=
Transpose(vals.pottarg) / 4π + A.nearfield_correction * Xview
return Y
end

function FMMFOperator(mesh, k; n_gauss=3, tol=1e-6, nearfield=true, offset=0.2, depth=1)
# Making sure the wave number is complex
Expand Down
3 changes: 3 additions & 0 deletions src/BoundaryIntegralEquations.jl
Original file line number Diff line number Diff line change
Expand Up @@ -129,4 +129,7 @@ export create_simple_mesh, create_bc_simple_mesh, create_vizualization_data
export LossyBlockMatrix,LossyBlockMatrixCompact,LossyOneVariableInner,LossyOneVariableOuter
export compute_lossy_rhs,LossyGlobalOuter

# ROSEBEM
export scattering_krylov_basis, taylor_assemble!, apply_taylor_expansion

end

0 comments on commit 5161375

Please sign in to comment.