From 8ed5e97579a2f5265e85f78cad749e4660d1ca1d Mon Sep 17 00:00:00 2001 From: Liam Adams Date: Fri, 15 Nov 2024 23:28:52 +0000 Subject: [PATCH] Update interpolation to work with hicsparse sparse matrix multiply backend. --- src/atlas/interpolation/method/Method.cc | 180 +++++++++++----- ...nservativeSphericalPolygonInterpolation.cc | 4 + src/tests/interpolation/CMakeLists.txt | 7 + .../test_interpolation_structured2D_gpu.cc | 196 ++++++++++++++++++ 4 files changed, 335 insertions(+), 52 deletions(-) create mode 100644 src/tests/interpolation/test_interpolation_structured2D_gpu.cc diff --git a/src/atlas/interpolation/method/Method.cc b/src/atlas/interpolation/method/Method.cc index 3a2349a1d..ce0211b13 100644 --- a/src/atlas/interpolation/method/Method.cc +++ b/src/atlas/interpolation/method/Method.cc @@ -95,14 +95,78 @@ void set_missing_values(Field& tgt, const std::vector& missing) { } } +enum class MemorySpace { Host, Device }; + +MemorySpace getSparseBackendMemorySpace(const sparse::Backend& backend) { + if (backend.type() == "eckit_linalg") { + return MemorySpace::Host; + } else if (backend.type() == "openmp") { + return MemorySpace::Host; + } else if (backend.type() == "hicsparse") { + return MemorySpace::Device; + } else { + ATLAS_NOTIMPLEMENTED; + } +} + +void fetch(const atlas::Field& f, MemorySpace memorySpace) { + if(memorySpace == MemorySpace::Host) { + if (f.hostNeedsUpdate()) { + f.updateHost(); + } + } + else { + ATLAS_ASSERT(memorySpace == MemorySpace::Device); + if (f.deviceNeedsUpdate()) { + f.updateDevice(); + } + } +} + +void setOtherMemorySpaceNeedsUpdate(const atlas::Field& f, MemorySpace memorySpace) { + if(memorySpace == MemorySpace::Host) { + f.setDeviceNeedsUpdate(true); + } + else { + ATLAS_ASSERT(memorySpace == MemorySpace::Device); + f.setHostNeedsUpdate(true); + } +} + +template +atlas::array::ArrayView make_device_view_fetched(atlas::Field& f) { + fetch(f, MemorySpace::Device); + return atlas::array::make_device_view(f); +} + +template +atlas::array::ArrayView make_device_view_fetched(const atlas::Field& f) { + fetch(f, MemorySpace::Device); + return atlas::array::make_device_view(f); +} + +template +atlas::array::ArrayView make_host_view_fetched(atlas::Field& f) { + fetch(f, MemorySpace::Host); + return atlas::array::make_host_view(f); +} + +template +atlas::array::ArrayView make_host_view_fetched(const atlas::Field& f) { + fetch(f, MemorySpace::Host); + return atlas::array::make_host_view(f); +} + } // anonymous namespace template void Method::interpolate_field_rank1(const Field& src, Field& tgt, const Matrix& W) const { auto backend = std::is_same::value ? sparse::backend::openmp() : sparse::Backend{linalg_backend_}; - auto src_v = array::make_view(src); - auto tgt_v = array::make_view(tgt); + const auto memorySpace = getSparseBackendMemorySpace(backend); + + auto src_v = (memorySpace == MemorySpace::Host) ? make_host_view_fetched(src) : make_device_view_fetched(src); + auto tgt_v = (memorySpace == MemorySpace::Host) ? make_host_view_fetched(tgt) : make_device_view_fetched(tgt); if (nonLinear_(src)) { Matrix W_nl(W); // copy (a big penalty -- copy-on-write would definitely be better) @@ -112,17 +176,25 @@ void Method::interpolate_field_rank1(const Field& src, Field& tgt, const Matrix& else { sparse_matrix_multiply(W, src_v, tgt_v, backend); } + + setOtherMemorySpaceNeedsUpdate(tgt, memorySpace); } template void Method::interpolate_field_rank2(const Field& src, Field& tgt, const Matrix& W) const { - sparse::Backend backend{linalg_backend_}; - auto src_v = array::make_view(src); - auto tgt_v = array::make_view(tgt); + auto backend = std::is_same::value ? sparse::backend::openmp() : sparse::Backend{linalg_backend_}; + + // To match previous logic of only using OpenMP (probably because eckit_linalg backend doesn't support "layout_left" indexing) + if (backend.type() == "eckit_linalg") { + backend = sparse::backend::openmp(); + } if (nonLinear_(src)) { // We cannot apply the same matrix to full columns as e.g. missing values could be present in only certain parts. + + auto src_v = array::make_view(src); + auto tgt_v = array::make_view(tgt); // Allocate temporary rank-1 fields corresponding to one horizontal level auto src_slice = Field("s", array::make_datatype(), {src.shape(0)}); @@ -144,13 +216,19 @@ void Method::interpolate_field_rank2(const Field& src, Field& tgt, const Matrix& interpolate_field_rank1(src_slice, tgt_slice, W); // Copy rank-1 field to this level in the rank-2 field + fetch(tgt_slice, MemorySpace::Host); for (idx_t i = 0; i < tgt.shape(0); ++i) { tgt_v(i, lev) = tgt_slice_v(i); } + setOtherMemorySpaceNeedsUpdate(tgt, MemorySpace::Host); } } else { - sparse_matrix_multiply(W, src_v, tgt_v, sparse::backend::openmp()); + const auto memorySpace = getSparseBackendMemorySpace(backend); + auto src_dv = (memorySpace == MemorySpace::Host) ? make_host_view_fetched(src) : make_device_view_fetched(src); + auto tgt_dv = (memorySpace == MemorySpace::Host) ? make_host_view_fetched(tgt) : make_device_view_fetched(tgt); + sparse_matrix_multiply(W, src_dv, tgt_dv, backend); + setOtherMemorySpaceNeedsUpdate(tgt, memorySpace); } } @@ -158,75 +236,57 @@ void Method::interpolate_field_rank2(const Field& src, Field& tgt, const Matrix& template void Method::interpolate_field_rank3(const Field& src, Field& tgt, const Matrix& W) const { sparse::Backend backend{linalg_backend_}; - auto src_v = array::make_view(src); - auto tgt_v = array::make_view(tgt); + auto src_v = make_host_view_fetched(src); + auto tgt_v = make_host_view_fetched(tgt); if (not W.empty() && nonLinear_(src)) { ATLAS_ASSERT(false, "nonLinear interpolation not supported for rank-3 fields."); } sparse_matrix_multiply(W, src_v, tgt_v, sparse::backend::openmp()); + setOtherMemorySpaceNeedsUpdate(tgt, MemorySpace::Host); } template void Method::adjoint_interpolate_field_rank1(Field& src, const Field& tgt, const Matrix& W) const { - array::ArrayT tmp(src.shape()); + auto backend = std::is_same::value ? sparse::backend::openmp() : sparse::Backend{linalg_backend_}; + const auto memorySpace = getSparseBackendMemorySpace(backend); - auto tmp_v = array::make_view(tmp); - auto src_v = array::make_view(src); - auto tgt_v = array::make_view(tgt); + auto src_v = (memorySpace == MemorySpace::Host) ? make_host_view_fetched(src) : make_device_view_fetched(src); + auto tgt_v = (memorySpace == MemorySpace::Host) ? make_host_view_fetched(tgt) : make_device_view_fetched(tgt); - tmp_v.assign(0.); + sparse_matrix_multiply_add(W, tgt_v, src_v, backend); - if (std::is_same::value) { - sparse_matrix_multiply(W, tgt_v, tmp_v, sparse::backend::openmp()); - } - else { - sparse_matrix_multiply(W, tgt_v, tmp_v, sparse::Backend{linalg_backend_}); - } - - - for (idx_t t = 0; t < tmp.shape(0); ++t) { - src_v(t) += tmp_v(t); - } + setOtherMemorySpaceNeedsUpdate(src, memorySpace); } template void Method::adjoint_interpolate_field_rank2(Field& src, const Field& tgt, const Matrix& W) const { - array::ArrayT tmp(src.shape()); + auto backend = std::is_same::value ? sparse::backend::openmp() : sparse::Backend{linalg_backend_}; + + // To match previous logic of only using OpenMP (probably because eckit_linalg backend doesn't support "layout_left" indexing) + if (backend.type() == "eckit_linalg") { + backend = sparse::backend::openmp(); + } - auto tmp_v = array::make_view(tmp); - auto src_v = array::make_view(src); - auto tgt_v = array::make_view(tgt); + const auto memorySpace = getSparseBackendMemorySpace(backend); - tmp_v.assign(0.); + auto src_v = (memorySpace == MemorySpace::Host) ? make_host_view_fetched(src) : make_device_view_fetched(src); + auto tgt_v = (memorySpace == MemorySpace::Host) ? make_host_view_fetched(tgt) : make_device_view_fetched(tgt); - sparse_matrix_multiply(W, tgt_v, tmp_v, sparse::backend::openmp()); + sparse_matrix_multiply_add(W, tgt_v, src_v, backend); - for (idx_t t = 0; t < tmp.shape(0); ++t) { - for (idx_t k = 0; k < tmp.shape(1); ++k) { - src_v(t, k) += tmp_v(t, k); - } - } + setOtherMemorySpaceNeedsUpdate(src, memorySpace); } template void Method::adjoint_interpolate_field_rank3(Field& src, const Field& tgt, const Matrix& W) const { - array::ArrayT tmp(src.shape()); - - auto tmp_v = array::make_view(tmp); - auto src_v = array::make_view(src); - auto tgt_v = array::make_view(tgt); + sparse::Backend backend{linalg_backend_}; - tmp_v.assign(0.); + auto src_v = make_host_view_fetched(src); + auto tgt_v = make_host_view_fetched(tgt); - sparse_matrix_multiply(W, tgt_v, tmp_v, sparse::backend::openmp()); + sparse_matrix_multiply_add(W, tgt_v, src_v, sparse::backend::openmp()); - for (idx_t t = 0; t < tmp.shape(0); ++t) { - for (idx_t j = 0; j < tmp.shape(1); ++j) { - for (idx_t k = 0; k < tmp.shape(2); ++k) { - src_v(t, j, k) += tmp_v(t, j, k); - } - } - } + setOtherMemorySpaceNeedsUpdate(src, MemorySpace::Host); } void Method::check_compatibility(const Field& src, const Field& tgt, const Matrix& W) const { @@ -381,8 +441,13 @@ void Method::do_execute(const FieldSet& fieldsSource, FieldSet& fieldsTarget, Me void Method::do_execute(const Field& src, Field& tgt, Metadata&) const { ATLAS_TRACE("atlas::interpolation::method::Method::do_execute()"); + // todo: dispatch to gpu-aware mpi if available + if (src.hostNeedsUpdate()) { + src.updateHost(); + } haloExchange(src); - + src.setDeviceNeedsUpdate(true); + if( matrix_ ) { // (matrix == nullptr) when a partition is empty if (src.datatype().kind() == array::DataType::KIND_REAL64) { interpolate_field(src, tgt, *matrix_); @@ -411,7 +476,13 @@ void Method::do_execute(const Field& src, Field& tgt, Metadata&) const { } // set missing values - set_missing_values(tgt, missing_); + if (not missing_.empty()) { + if (tgt.hostNeedsUpdate()) { + tgt.updateHost(); + } + set_missing_values(tgt, missing_); + tgt.setDeviceNeedsUpdate(true); + } tgt.set_dirty(); } @@ -454,7 +525,12 @@ void Method::do_execute_adjoint(Field& src, const Field& tgt, Metadata&) const { src.set_dirty(); + // todo: dispatch to gpu-aware mpi if available + if (src.hostNeedsUpdate()) { + src.updateHost(); + } adjointHaloExchange(src); + src.setDeviceNeedsUpdate(true); } @@ -501,4 +577,4 @@ interpolation::Cache Method::createCache() const { } // namespace interpolation -} // namespace atlas +} // namespace atlas \ No newline at end of file diff --git a/src/atlas/interpolation/method/unstructured/ConservativeSphericalPolygonInterpolation.cc b/src/atlas/interpolation/method/unstructured/ConservativeSphericalPolygonInterpolation.cc index 334b65422..de06a04c3 100644 --- a/src/atlas/interpolation/method/unstructured/ConservativeSphericalPolygonInterpolation.cc +++ b/src/atlas/interpolation/method/unstructured/ConservativeSphericalPolygonInterpolation.cc @@ -1683,7 +1683,11 @@ void ConservativeSphericalPolygonInterpolation::do_execute(const Field& src_fiel stopwatch.stop(); { ATLAS_TRACE("halo exchange target"); + if (tgt_field.hostNeedsUpdate()) { + tgt_field.updateHost(); + } tgt_field.haloExchange(); + tgt_field.setDeviceNeedsUpdate(true); } auto remap_stat = remap_stat_; diff --git a/src/tests/interpolation/CMakeLists.txt b/src/tests/interpolation/CMakeLists.txt index a5ac0e86a..f6be0ae85 100644 --- a/src/tests/interpolation/CMakeLists.txt +++ b/src/tests/interpolation/CMakeLists.txt @@ -104,6 +104,13 @@ ecbuild_add_test( TARGET atlas_test_interpolation_biquasicubic ENVIRONMENT ${ATLAS_TEST_ENVIRONMENT} ) +ecbuild_add_test( TARGET atlas_test_interpolation_bicubic_gpu + SOURCES test_interpolation_structured2D_gpu.cc + LIBS atlas + CONDITION atlas_HAVE_CUDA OR atlas_HAVE_HIP + ENVIRONMENT ${ATLAS_TEST_ENVIRONMENT} +) + ecbuild_add_test( TARGET atlas_test_interpolation_structured2D_to_unstructured SOURCES test_interpolation_structured2D_to_unstructured.cc LIBS atlas diff --git a/src/tests/interpolation/test_interpolation_structured2D_gpu.cc b/src/tests/interpolation/test_interpolation_structured2D_gpu.cc new file mode 100644 index 000000000..1e42dc152 --- /dev/null +++ b/src/tests/interpolation/test_interpolation_structured2D_gpu.cc @@ -0,0 +1,196 @@ +/* + * (C) Copyright 2024 ECMWF. + * + * This software is licensed under the terms of the Apache Licence Version 2.0 + * which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. + * In applying this licence, ECMWF does not waive the privileges and immunities + * granted to it by virtue of its status as an intergovernmental organisation + * nor does it submit to any jurisdiction. + */ + +#include "atlas/array.h" +#include "atlas/field/Field.h" +#include "atlas/field/FieldSet.h" +#include "atlas/functionspace/NodeColumns.h" +#include "atlas/functionspace/StructuredColumns.h" +#include "atlas/grid/Grid.h" +#include "atlas/grid/Iterator.h" +#include "atlas/interpolation.h" +#include "atlas/mesh/Mesh.h" +#include "atlas/meshgenerator.h" +#include "atlas/output/Gmsh.h" +#include "atlas/util/CoordinateEnums.h" +#include "atlas/util/function/VortexRollup.h" + +#include "tests/AtlasTestEnvironment.h" + +using atlas::functionspace::NodeColumns; +using atlas::functionspace::StructuredColumns; +using atlas::util::Config; + +namespace atlas { +namespace test { + +//----------------------------------------------------------------------------- + +static Config scheme() { + Config scheme; + scheme.set("type", "structured-cubic2D"); + scheme.set("halo", 2); + scheme.set("name", "cubic"); + scheme.set("verbose",eckit::Resource("--verbose",false)); + scheme.set("sparse_matrix_multiply", "hicsparse"); + return scheme; +} + +std::string input_gridname(const std::string& default_grid) { + return eckit::Resource("--input-grid", default_grid); +} + +std::string output_gridname(const std::string& default_grid) { + return eckit::Resource("--output-grid", default_grid); +} + +CASE("which scheme?") { + Log::info() << scheme().getString("type") << std::endl; +} + +template +struct AdjointTolerance { + static const Value value; +}; +template <> +const double AdjointTolerance::value = 2.e-14; +template <> +const float AdjointTolerance::value = 2.e-5; + + +void test_interpolation_structured_using_fs_API_for_fieldset_w_hicsparse_backend() { + Grid input_grid(input_gridname("O32")); + Grid output_grid(output_gridname("O64")); + + // Cubic interpolation requires a StructuredColumns functionspace with 2 halos + StructuredColumns input_fs(input_grid, scheme() | option::levels(3)); + + MeshGenerator meshgen("structured"); + Mesh output_mesh = meshgen.generate(output_grid); + FunctionSpace output_fs = NodeColumns{output_mesh, option::levels(3)}; + + auto lonlat = array::make_view(input_fs.xy()); + + FieldSet fields_source; + FieldSet fields_target; + for (idx_t f = 0; f < 3; ++f) { + auto field_source = fields_source.add(input_fs.createField(option::name("field " + std::to_string(f)))); + fields_target.add(output_fs.createField(option::name("field " + std::to_string(f)))); + + auto source = array::make_view(field_source); + for (idx_t n = 0; n < input_fs.size(); ++n) { + for (idx_t k = 0; k < 3; ++k) { + source(n, k) = util::function::vortex_rollup(lonlat(n, LON), lonlat(n, LAT), 0.5 + double(k) / 2); + } + } + } + + ATLAS_TRACE_SCOPE("with matrix") { + Interpolation interpolation(scheme(), input_fs, output_fs); + interpolation.execute(fields_source, fields_target); + for (auto& field : fields_target) { + field.updateHost(); + } + + ATLAS_TRACE_SCOPE("output") { + output::Gmsh gmsh(scheme().getString("name") + "-multilevel-fieldset-output-with-matrix-" + + array::make_datatype().str() + ".msh", + Config("coordinates", "xy")); + gmsh.write(output_mesh); + output_fs.haloExchange(fields_target); + gmsh.write(fields_target); + } + } + + ATLAS_TRACE_SCOPE("with matrix adjoint") { + Interpolation interpolation(scheme() | Config("adjoint", true), input_fs, output_fs); + + std::vector AxAx(fields_source.field_names().size(), 0.); + std::vector xAtAx(fields_source.field_names().size(), 0.); + + FieldSet fields_source_reference; + for (atlas::Field& field : fields_source) { + Field temp_field(field.name(), field.datatype().kind(), field.shape()); + temp_field.set_levels(field.levels()); + + auto fieldInView = array::make_view(field); + auto fieldOutView = array::make_view(temp_field); + + for (atlas::idx_t jn = 0; jn < temp_field.shape(0); ++jn) { + for (atlas::idx_t jl = 0; jl < temp_field.levels(); ++jl) { + fieldOutView(jn, jl) = fieldInView(jn, jl); + } + } + fields_source_reference.add(temp_field); + } + + interpolation.execute(fields_source, fields_target); + for (auto& field : fields_target) { + field.updateHost(); + } + + std::size_t fIndx(0); + auto source_names = fields_source.field_names(); + for (const std::string& s : fields_target.field_names()) { + auto target = array::make_view(fields_target[s]); + auto source = array::make_view(fields_source[source_names[fIndx]]); + + for (idx_t n = 0; n < input_fs.size(); ++n) { + for (idx_t k = 0; k < 3; ++k) { + AxAx[fIndx] += source(n, k) * source(n, k); + } + } + + for (idx_t n = 0; n < output_fs.size(); ++n) { + for (idx_t k = 0; k < 3; ++k) { + AxAx[fIndx] += target(n, k) * target(n, k); + } + } + fIndx += 1; + } + + interpolation.execute_adjoint(fields_source, fields_target); + for (auto& field : fields_source) { + field.updateHost(); + } + + fIndx = 0; + for (const std::string& s : fields_source.field_names()) { + auto source_reference = array::make_view(fields_source_reference[s]); + auto source = array::make_view(fields_source[s]); + + for (idx_t n = 0; n < input_fs.size(); ++n) { + for (idx_t k = 0; k < 3; ++k) { + xAtAx[fIndx] += source(n, k) * source_reference(n, k); + } + } + fIndx += 1; + } + + for (std::size_t t = 0; t < AxAx.size(); ++t) { + Log::debug() << " Adjoint test t = " << t << " (Ax).(Ax) = " << AxAx[t] << " x.(AtAx) = " << xAtAx[t] + << " std::abs( 1.0 - xAtAx[t]/AxAx[t] ) " << std::abs(1.0 - xAtAx[t] / AxAx[t]) + << " AdjointTolerance::value " << AdjointTolerance::value << std::endl; + + EXPECT(std::abs(1.0 - xAtAx[t] / AxAx[t]) < AdjointTolerance::value); + } + } +} + +CASE("test_interpolation_structured using fs API for fieldset with hicsparse backend") { + test_interpolation_structured_using_fs_API_for_fieldset_w_hicsparse_backend(); +} + +} // namespace test +} // namespace atlas + +int main(int argc, char** argv) { + return atlas::test::run(argc, argv); +}