From f285a3c2766866043a8b33ba2ec434f3f0865170 Mon Sep 17 00:00:00 2001 From: Alexander Shlemov Date: Fri, 19 Oct 2018 18:25:42 +0300 Subject: [PATCH 001/102] CMAKE: Hook in MPI --- src/cmake/deps.cmake | 5 +++++ src/cmake/includes.cmake | 5 +++++ src/projects/spades/CMakeLists.txt | 20 ++++++++++++++++---- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/src/cmake/deps.cmake b/src/cmake/deps.cmake index bb27036aeb..8e2193c1f9 100644 --- a/src/cmake/deps.cmake +++ b/src/cmake/deps.cmake @@ -19,6 +19,11 @@ find_package(Readline QUIET) set(CURSES_NEED_NCURSES TRUE) find_package(Curses QUIET) +find_package(MPI) +if (MPI_FOUND) + message(STATUS "MPI found") +endif() + # Use included boost unless explicitly specified if (NOT SPADES_BOOST_ROOT) set(BOOST_ROOT "${EXT_DIR}/include") diff --git a/src/cmake/includes.cmake b/src/cmake/includes.cmake index d3b2d9baf8..0ea44afd71 100644 --- a/src/cmake/includes.cmake +++ b/src/cmake/includes.cmake @@ -9,6 +9,11 @@ include_directories(SYSTEM "${Boost_INCLUDE_DIRS}") if (SPADES_USE_TCMALLOC) include_directories("${GOOGLE_PERFTOOLS_INCLUDE_DIR}") endif() + if (SPADES_USE_JEMALLOC) include_directories("$/../include") endif() + +if (MPI_FOUND) + include_directories("${MPI_INCLUDE_PATH}") +endif() diff --git a/src/projects/spades/CMakeLists.txt b/src/projects/spades/CMakeLists.txt index 5575a1d199..d96eab01f6 100644 --- a/src/projects/spades/CMakeLists.txt +++ b/src/projects/spades/CMakeLists.txt @@ -28,11 +28,9 @@ add_library(spades-stages STATIC target_link_libraries(spades-stages hmmercpp spoa easel) +set(SPADES_SRC pipeline.cpp main.cpp series_analysis.cpp ../mts/contig_abundance.cpp) add_executable(spades-core - pipeline.cpp - main.cpp - series_analysis.cpp - ../mts/contig_abundance.cpp) + ${SPADES_SRC}) target_link_libraries(spades-core spades-stages graphio common_modules ${COMMON_LIBRARIES}) @@ -40,9 +38,23 @@ if (SPADES_STATIC_BUILD) set_target_properties(spades-core PROPERTIES LINK_SEARCH_END_STATIC 1) endif() + install(TARGETS spades-core DESTINATION bin COMPONENT spades) + +if (MPI_FOUND) + add_executable(spades-hpc ${SPADES_SRC}) + + target_link_libraries(spades-hpc spades-stages graphio common_modules ${COMMON_LIBRARIES} ${MPI_LIBRARIES}) + set_target_properties(spades-hpc PROPERTIES LINK_FLAGS "${MPI_LINK_FLAGS}") + + if (SPADES_STATIC_BUILD) + set_target_properties(spades-hpc PROPERTIES LINK_SEARCH_END_STATIC 1) + endif() +endif() + + # Configs install(DIRECTORY "configs/" # Trailing / is important DESTINATION share/spades/configs/debruijn From 673dd41223ecbe5b505fdb5ba9acbc8e8e0beace Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Mon, 17 Dec 2018 19:20:48 +0300 Subject: [PATCH 002/102] Add MPI runtime detection --- src/cmake/deps.cmake | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/src/cmake/deps.cmake b/src/cmake/deps.cmake index 8e2193c1f9..ad1f82778e 100644 --- a/src/cmake/deps.cmake +++ b/src/cmake/deps.cmake @@ -19,9 +19,33 @@ find_package(Readline QUIET) set(CURSES_NEED_NCURSES TRUE) find_package(Curses QUIET) +set(MPI_DETERMINE_LIBRARY_VERSION TRUE) find_package(MPI) if (MPI_FOUND) - message(STATUS "MPI found") + # Determine MPI vendor and MPI runtime version + # configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/MPIVendorName.c.in" + # "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/MPIVendorName.c" + # IMMEDIATE @ONLY) + # try_run(MPI_VENDOR_NAME_RUN MPI_HAVE_VENDOR_NAME + # ${CMAKE_BINARY_DIR} + # 
"${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/MPIVendorName.c" + # RUN_OUTPUT_VARIABLE MPI_RUNTIME_NAME) + # configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/MPIVendorVersion.c.in" + # "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/MPIVendorVersion.c" + # IMMEDIATE @ONLY) + # try_run(MPI_VENDOR_VERSION_RUN MPI_HAVE_VENDOR_VERSION + # ${CMAKE_BINARY_DIR} + # "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/MPIVendorVersion.c" + # RUN_OUTPUT_VARIABLE MPI_RUNTIME_VERSION) + message(STATUS "Detected MPI runtime: ${MPI_C_LIBRARY_VERSION_STRING}") + + if ("${MPI_C_LIBRARY_VERSION_STRING}" MATCHES "^Open MPI") + string(REGEX REPLACE "Open MPI v([0-9]+).*" "\\1" OPENMPI_MAJOR_VERSION "${MPI_C_LIBRARY_VERSION_STRING}") + message(STATUS "Open MPI runtime detected, major version: ${OPENMPI_MAJOR_VERSION}") + if (OPENMPI_MAJOR_VERSION STREQUAL 3) + message(FATAL_ERROR "Open MPI version ${OPENMPI_MAJOR_VERSION}.x is known to be buggy") + endif() + endif() endif() # Use included boost unless explicitly specified From 0219be98ec4f315ca3624fec5768fbec656e1cd5 Mon Sep 17 00:00:00 2001 From: Alexander Shlemov Date: Tue, 14 Aug 2018 17:36:59 +0300 Subject: [PATCH 003/102] CMAKE: Install spades-hpc --- src/projects/spades/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/projects/spades/CMakeLists.txt b/src/projects/spades/CMakeLists.txt index d96eab01f6..1cb8c7185c 100644 --- a/src/projects/spades/CMakeLists.txt +++ b/src/projects/spades/CMakeLists.txt @@ -52,6 +52,9 @@ if (MPI_FOUND) if (SPADES_STATIC_BUILD) set_target_properties(spades-hpc PROPERTIES LINK_SEARCH_END_STATIC 1) endif() + install(TARGETS spades-hpc + DESTINATION bin + COMPONENT runtime) endif() From 0682203846b9eae448b03ce9f28ca65453acfd6e Mon Sep 17 00:00:00 2001 From: Alexander Shlemov Date: Thu, 27 Dec 2018 12:28:46 +0300 Subject: [PATCH 004/102] Add mpi_console_writer log_writer interface impl --- src/common/utils/logger/mpi_log_writers.hpp | 59 +++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 src/common/utils/logger/mpi_log_writers.hpp diff --git a/src/common/utils/logger/mpi_log_writers.hpp b/src/common/utils/logger/mpi_log_writers.hpp new file mode 100644 index 0000000000..9d8419cba4 --- /dev/null +++ b/src/common/utils/logger/mpi_log_writers.hpp @@ -0,0 +1,59 @@ +//*************************************************************************** +//* Copyright (c) 2018 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. 
+//*************************************************************************** + +#pragma once + +#include "utils/filesystem/path_helper.hpp" +#include "logger.hpp" + +#include + +#include "config.hpp" + +#include +#include +#include + +namespace logging { + +struct mpi_console_writer : public writer { + + void write_msg(double time, size_t cmem, size_t max_rss, level l, const char *file, size_t line_num, + const char *source, const char *msg) { + const std::string node_info = nodeinfo(); + if (cmem != -1ull) + std::cout << fmt::format("NODE {:s} | {:14s} {:>5s} / {:<5s} {:6.6s} {:24.24s} ({:26.26s}:{:4d}) {:s}", + node_info, + utils::human_readable_time(time), utils::human_readable_memory(cmem), + utils::human_readable_memory(max_rss), logging::level_name(l), + source, fs::filename(file), int(line_num), msg) + << std::endl; + else + std::cout << fmt::format("NODE {:s} | {:14s} {:^5s} {:6.6s} {:24.24s} ({:26.26s}:{:4d}) {:s}", + node_info, + utils::human_readable_time(time), utils::human_readable_memory(max_rss), + logging::level_name(l), source, fs::filename(file), int(line_num), msg) + << std::endl; + } + +private: + std::string nodeinfo() const { + int initialized, finalized; + MPI_Initialized(&initialized); + MPI_Finalized(&finalized); + if (initialized && !finalized) { + int world_rank, world_size; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + return fmt::format("{:>2d}/{:<2d}", world_rank, world_size); + } else { + return fmt::format("{:^5}", "N/A"); + } + + } +}; + +} // namespace logging From f4be241583792d2ef4945ec4045499f4e0f689e5 Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Fri, 25 Sep 2020 13:21:49 +0300 Subject: [PATCH 005/102] Factor out mpi log writers as well --- src/common/utils/CMakeLists.txt | 7 ++- src/common/utils/logger/mpi_log_writers.cpp | 48 +++++++++++++++++++++ src/common/utils/logger/mpi_log_writers.hpp | 44 ++----------------- src/projects/spades/CMakeLists.txt | 3 +- 4 files changed, 59 insertions(+), 43 deletions(-) create mode 100644 src/common/utils/logger/mpi_log_writers.cpp diff --git a/src/common/utils/CMakeLists.txt b/src/common/utils/CMakeLists.txt index dd62b84f67..1bd56312ed 100644 --- a/src/common/utils/CMakeLists.txt +++ b/src/common/utils/CMakeLists.txt @@ -15,7 +15,12 @@ set(utils_src filesystem/glob.cpp logger/logger_impl.cpp logger/log_writers.cpp - logger/log_writers_thread.cpp) + logger/log_writers_thread.cpp + ) + +if (MPI_FOUND) + set(utils_src ${utils_src} logger/mpi_log_writers.cpp) +endif() if (READLINE_FOUND) set(utils_src ${utils_src} autocompletion.cpp) diff --git a/src/common/utils/logger/mpi_log_writers.cpp b/src/common/utils/logger/mpi_log_writers.cpp new file mode 100644 index 0000000000..c71579e56f --- /dev/null +++ b/src/common/utils/logger/mpi_log_writers.cpp @@ -0,0 +1,48 @@ +//*************************************************************************** +//* Copyright (c) 2020 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. 
+//*************************************************************************** + +#include "mpi_log_writers.hpp" + +#include "utils/filesystem/path_helper.hpp" + +#include +#include + +namespace logging { + +void mpi_console_writer::write_msg(double time, size_t cmem, size_t max_rss, level l, const std::filesystem::path& file, size_t line_num, + const char *source, const char *msg) { + const std::string node_info = nodeinfo(); + if (cmem != -1ull) + std::cout << fmt::format("NODE {:s} | {:14s} {:>5s} / {:<5s} {:6.6s} {:24.24s} ({:26.26s}:{:4d}) {:s}", + node_info, + utils::human_readable_time(time), utils::human_readable_memory(cmem), + utils::human_readable_memory(max_rss), logging::level_name(l), + source, file.filename().c_str(), int(line_num), msg) + << std::endl; + else + std::cout << fmt::format("NODE {:s} | {:14s} {:^5s} {:6.6s} {:24.24s} ({:26.26s}:{:4d}) {:s}", + node_info, + utils::human_readable_time(time), utils::human_readable_memory(max_rss), + logging::level_name(l), source, file.filename().c_str(), int(line_num), msg) + << std::endl; +} + +std::string mpi_console_writer::nodeinfo() const { + int initialized, finalized; + MPI_Initialized(&initialized); + MPI_Finalized(&finalized); + if (initialized && !finalized) { + int world_rank, world_size; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + return fmt::format("{:>2d}/{:<2d}", world_rank, world_size); + } else { + return fmt::format("{:^5}", "N/A"); + } +} + +} // logging diff --git a/src/common/utils/logger/mpi_log_writers.hpp b/src/common/utils/logger/mpi_log_writers.hpp index 9d8419cba4..86813662a6 100644 --- a/src/common/utils/logger/mpi_log_writers.hpp +++ b/src/common/utils/logger/mpi_log_writers.hpp @@ -6,54 +6,16 @@ #pragma once -#include "utils/filesystem/path_helper.hpp" #include "logger.hpp" -#include - -#include "config.hpp" - -#include -#include -#include - namespace logging { struct mpi_console_writer : public writer { - void write_msg(double time, size_t cmem, size_t max_rss, level l, const char *file, size_t line_num, - const char *source, const char *msg) { - const std::string node_info = nodeinfo(); - if (cmem != -1ull) - std::cout << fmt::format("NODE {:s} | {:14s} {:>5s} / {:<5s} {:6.6s} {:24.24s} ({:26.26s}:{:4d}) {:s}", - node_info, - utils::human_readable_time(time), utils::human_readable_memory(cmem), - utils::human_readable_memory(max_rss), logging::level_name(l), - source, fs::filename(file), int(line_num), msg) - << std::endl; - else - std::cout << fmt::format("NODE {:s} | {:14s} {:^5s} {:6.6s} {:24.24s} ({:26.26s}:{:4d}) {:s}", - node_info, - utils::human_readable_time(time), utils::human_readable_memory(max_rss), - logging::level_name(l), source, fs::filename(file), int(line_num), msg) - << std::endl; - } - + void write_msg(double time, size_t cmem, size_t max_rss, level l, const std::filesystem::path& file, size_t line_num, + const char *source, const char *msg); private: - std::string nodeinfo() const { - int initialized, finalized; - MPI_Initialized(&initialized); - MPI_Finalized(&finalized); - if (initialized && !finalized) { - int world_rank, world_size; - MPI_Comm_size(MPI_COMM_WORLD, &world_size); - MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); - return fmt::format("{:>2d}/{:<2d}", world_rank, world_size); - } else { - return fmt::format("{:^5}", "N/A"); - } - - } + std::string nodeinfo() const; }; } // namespace logging diff --git a/src/projects/spades/CMakeLists.txt b/src/projects/spades/CMakeLists.txt index 1cb8c7185c..137e0d5728 100644 --- 
a/src/projects/spades/CMakeLists.txt +++ b/src/projects/spades/CMakeLists.txt @@ -32,7 +32,8 @@ set(SPADES_SRC pipeline.cpp main.cpp series_analysis.cpp ../mts/contig_abundance add_executable(spades-core ${SPADES_SRC}) -target_link_libraries(spades-core spades-stages graphio common_modules ${COMMON_LIBRARIES}) +# FIXME: Temporary +target_link_libraries(spades-core spades-stages graphio common_modules ${COMMON_LIBRARIES} ${MPI_LIBRARIES}) if (SPADES_STATIC_BUILD) set_target_properties(spades-core PROPERTIES LINK_SEARCH_END_STATIC 1) From c98a4262e2a1e9cbff3cb43ff4c8ef2b42bc0ae5 Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Fri, 25 Sep 2020 13:23:33 +0300 Subject: [PATCH 006/102] Add mpi partask core --- src/common/io/binary/binary.hpp | 28 +- src/common/pipeline/partask_mpi.hpp | 1634 +++++++++++++++++++++++++++ 2 files changed, 1639 insertions(+), 23 deletions(-) create mode 100644 src/common/pipeline/partask_mpi.hpp diff --git a/src/common/io/binary/binary.hpp b/src/common/io/binary/binary.hpp index ff4c433079..0b6c6958ea 100644 --- a/src/common/io/binary/binary.hpp +++ b/src/common/io/binary/binary.hpp @@ -432,42 +432,24 @@ class Serializer, std::enable_if_t::value } }; -// std::tuple -namespace detail { -template -constexpr decltype(auto) apply_impl(F &&f, Tuple &&t, std::index_sequence) { - return std::forward(f)(std::get(std::forward(t))...); -} - -template -constexpr std::size_t tuple_size_v = std::tuple_size::value; -} // namespace detail - -// Just std::apply from C++17 -template -constexpr decltype(auto) apply(F &&f, Tuple &&t) { - return detail::apply_impl(std::forward(f), std::forward(t), - std::make_index_sequence>>{}); -} - template class Serializer, std::enable_if_t>> { public: static void Write(std::ostream &os, const std::tuple &t) { auto binwriter = [&os](auto&& ...v) { BinWrite(os, v...); }; - apply(binwriter, t); + std::apply(binwriter, t); } static void Read(std::istream &is, std::tuple &t) { auto binreader = [&is](auto&& ...v) { BinRead(is, v...); }; - apply(binreader, t); + std::apply(binreader, t); } }; template std::enable_if_t> BinRead(std::istream &is, std::tuple t) { auto binreader = [&is](auto&& ...v) { BinRead(is, v...); }; - apply(binreader, t); + std::apply(binreader, t); } template @@ -475,12 +457,12 @@ class Serializer, std::enable_if_t>> { public: static void Write(std::ostream &os, const std::tuple &t) { auto binwriter = [&os](auto&& ...v) { BinWrite(os, v...); }; - apply(binwriter, t); + std::apply(binwriter, t); } static void Read(std::istream &is, std::tuple t) { auto binreader = [&is](auto&& ...v) { BinRead(is, v...); }; - apply(binreader, t); + std::apply(binreader, t); } }; diff --git a/src/common/pipeline/partask_mpi.hpp b/src/common/pipeline/partask_mpi.hpp new file mode 100644 index 0000000000..751290eca4 --- /dev/null +++ b/src/common/pipeline/partask_mpi.hpp @@ -0,0 +1,1634 @@ +//*************************************************************************** +//* Copyright (c) 2023-2024 SPAdes team +//* Copyright (c) 2019-2022 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. 
+//*************************************************************************** + +#pragma once + +#include "io/binary/binary.hpp" +#include "io/reads/read_stream_vector.hpp" + +#include "utils/parallel/openmp_wrapper.h" +#include "utils/verify.hpp" +#include "utils/logger/logger.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#define ASSERT_MAIN_THREAD VERIFY(omp_get_thread_num() == 0) + +namespace partask { + +// Sasha's own idea, motivated by std::as_const function +template +constexpr std::add_const_t *as_const(T *p) noexcept { + return p; +} + +// Motivated by as_const +template +constexpr T &as_non_const(const T &t) noexcept { + return const_cast(t); +} + +template +constexpr T *as_non_const(const T *p) noexcept { + return const_cast(p); +} + +template +auto wrap_into_ptr(T&& t) { + return new std::remove_reference_t(std::forward(t)); +} + +template +auto wrap_into_shared(T&& t) { + return std::shared_ptr>(wrap_into_ptr(std::forward(t))); +} + +template +auto wrap_into_unique(T&& t) { + return std::unique_ptr>(wrap_into_ptr(std::forward(t))); +} + + +template +std::unique_ptr as_unique(T *p) { + return std::unique_ptr(p); +} + +template +std::shared_ptr as_shared(T *p) { + return std::shared_ptr(p); +} + +// Motivated by declval, the same stuff but returns lvalue reference +template +typename std::add_lvalue_reference::type declref() noexcept; + +inline int world_size() { + int world_size; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + return world_size; +} + +inline int world_rank() { + int world_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + return world_rank; +} + +inline bool master(int rank = 0) { + return world_rank() == rank; +} + +inline bool worker(int rank = 0) { + return world_rank() != rank; +} + +inline bool initialized() { + int flag; + MPI_Initialized(&flag); + return flag; +} + +inline void barrier() { + ASSERT_MAIN_THREAD; + static size_t count = 0; + DEBUG("barrier() called " << count << " times"); + ++count; + int ret = MPI_Barrier(MPI_COMM_WORLD); + VERIFY(ret == MPI_SUCCESS); +} + +const size_t MPI_MAX_COUNT = 1 << 30; // Should be <= MAX_INT + +inline void membroadcast(void *p, size_t count, int root = 0) { + ASSERT_MAIN_THREAD; + static size_t call_count = 0; + DEBUG("membroadcast() called " << call_count << " times"); + ++call_count; + + char *cp = reinterpret_cast(p); + while (count) { + size_t block_size = std::min(count, MPI_MAX_COUNT); + int ret = MPI_Bcast(cp, static_cast(block_size), MPI_BYTE, root, MPI_COMM_WORLD); + VERIFY(ret == MPI_SUCCESS); + cp += block_size; + count -= block_size; + } +} + +inline void memsend(const void *p, size_t count, int rank, int tag = 0) { + ASSERT_MAIN_THREAD; + char *cp = reinterpret_cast(const_cast(p)); + while (count) { + size_t block_size = std::min(count, MPI_MAX_COUNT); + int ret = MPI_Send(cp, static_cast(block_size), MPI_BYTE, rank, tag, MPI_COMM_WORLD); + VERIFY(ret == MPI_SUCCESS); + cp += block_size; + count -= block_size; + } +} + +inline void memrecv(void *p, size_t count, int rank, int tag = MPI_ANY_TAG) { + ASSERT_MAIN_THREAD; + char *cp = reinterpret_cast(p); + while (count) { + size_t block_size = std::min(count, MPI_MAX_COUNT); + int ret = MPI_Recv(cp, static_cast(block_size), MPI_BYTE, rank, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + VERIFY(ret == MPI_SUCCESS); + cp += block_size; + count -= block_size; + } +} + +template +std::enable_if_t::value && !std::is_pointer::value> broadcast(T &v, int root = 0) { + 
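+    // Byte-wise broadcast of a single trivially copyable (non-pointer POD) value from `root` to all ranks.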
membroadcast(&v, sizeof(T), root); +} + +template +std::enable_if_t::value && !std::is_pointer::value> broadcast(std::array &v, int root = 0) { + membroadcast(v.deta(), N * sizeof(T), root); +} + +template +std::enable_if_t::value && !std::is_pointer::value> send(const T &v, int rank, int tag) { + memsend(&v, sizeof(T), rank, tag); +} + +template +std::enable_if_t::value && !std::is_pointer::value> recv(T &v, int rank, int tag = MPI_ANY_TAG) { + memrecv(&v, sizeof(T), rank, tag); +} + +template +std::enable_if_t::value && !std::is_pointer::value> send(const std::array &v, int rank, int tag = 0) { + memsend(v.data(), N * sizeof(T), rank, tag); +} + +template +std::enable_if_t::value && !std::is_pointer::value> recv(std::array &v, int rank, int tag = MPI_ANY_TAG) { + memrecv(v.data(), N * sizeof(T), rank, tag); +} + +// TODO Add send/recv/broadcast overloadings for vectors of PODs + +template +inline MPI_Datatype mpi_datatype(); + +// We use MPI_BYTE for char since it's more safe +// Using MPI_BYTE prevents MPI from performing any kind of representation conversion +template <> +inline MPI_Datatype mpi_datatype() { + return MPI_BYTE; +} + +template <> +inline MPI_Datatype mpi_datatype() { + return MPI_CHAR; +} + +template <> +inline MPI_Datatype mpi_datatype() { + return MPI_SIGNED_CHAR; +} + +template <> +inline MPI_Datatype mpi_datatype() { + return MPI_UNSIGNED_CHAR; +} + +template <> +inline MPI_Datatype mpi_datatype() { + return MPI_SHORT; +} + +template <> +inline MPI_Datatype mpi_datatype() { + return MPI_INT; +} + +template <> +inline MPI_Datatype mpi_datatype() { + return MPI_LONG; +} + +template <> +inline MPI_Datatype mpi_datatype() { + return MPI_LONG_LONG; +} + +template <> +inline MPI_Datatype mpi_datatype() { + return MPI_FLOAT; +} + +template <> +inline MPI_Datatype mpi_datatype() { + return MPI_DOUBLE; +} + +template <> +inline MPI_Datatype mpi_datatype() { + return MPI_LONG_DOUBLE; +} + +template <> +inline MPI_Datatype mpi_datatype() { + return MPI_UNSIGNED_SHORT; +} + +template <> +inline MPI_Datatype mpi_datatype() { + return MPI_UNSIGNED; +} + +template <> +inline MPI_Datatype mpi_datatype() { + return MPI_UNSIGNED_LONG; +} + +template <> +inline MPI_Datatype mpi_datatype() { + return MPI_UNSIGNED_LONG_LONG; +} + +template +void allreduce(T *recvbuf, size_t count, MPI_Op op) { + ASSERT_MAIN_THREAD; + DEBUG("allreduce started for " << count << " objects of type " << typeid(T).name()); + using NoneVoidT = std::conditional_t::value, char, T>; + NoneVoidT *crecvbuf = reinterpret_cast(recvbuf); + while (count) { + size_t block_size = std::min(count, MPI_MAX_COUNT); + int ret = MPI_Allreduce(MPI_IN_PLACE, crecvbuf, static_cast(block_size), mpi_datatype(), op, MPI_COMM_WORLD); + VERIFY(ret == MPI_SUCCESS); + crecvbuf += block_size; + count -= block_size; + } + DEBUG("allreduce finished"); +} + +namespace detail { + +template +struct wrap {}; + +template +struct wrap { + const char *name() const { return typeid(T).name(); } + + T get() { return std::move(t_); } + + template + wrap(TT &&t) : t_{std::forward(t)} {} + + T t_; +}; + +template +struct wrap { + const char *name() const { return typeid(T).name(); } + + T &get() { return t_; } + + wrap(T &t) : t_{t} {} + + T t_; +}; + +template <> +struct wrap { + const char *name() const { return "VOID"; } + + void get() const { return; } +}; + +using wrap_void = wrap; + +template +auto operator,(T &&t, wrap) { + static_assert(!std::is_reference::value, "TODO Add message"); + DEBUG("T&& called"); + return wrap(std::forward(t)); +} + 
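+// These comma-operator overloads (rvalue above, lvalue below) let `(expr, wrap_void())` capture the
+// result of `expr` into a wrap object; when `expr` returns void, the built-in comma applies and the
+// whole expression simply yields the wrap_void itself.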
+template +auto operator,(T &t, wrap) { + static_assert(!std::is_reference::value, "TODO Add message"); + DEBUG("T& called"); + return wrap(t); +} + +template +auto call(F &&f) -> decltype(std::forward(f)()) { + auto result = (std::forward(f)(), wrap_void()); + DEBUG("Function called"); + return result.get(); +} + +} // namespace detail + +inline bool init() { + int provided; + MPI_Init_thread(nullptr, nullptr, MPI_THREAD_FUNNELED, &provided); + return provided >= MPI_THREAD_FUNNELED; +} + +inline bool finalize() { return MPI_Finalize() == MPI_SUCCESS; } + +struct MsgInfo { + size_t count; + bool flag; +}; + +// TODO make partask_mpi.cpp +// buffers should have sizeof(MsgInfo) free bytes before the beginning! +inline void mpi_send_buffer(char *buffer, size_t count, int destination, int tag, bool flag) { + ASSERT_MAIN_THREAD; + DEBUG("mpi_send_buffer() called"); + MsgInfo info{count, flag}; + memcpy(buffer - sizeof(info), &info, sizeof(info)); + size_t all_count = count + sizeof(info); + VERIFY(all_count <= std::numeric_limits::max()); + int rc = MPI_Send(buffer - sizeof(info), static_cast(all_count), MPI_BYTE, destination, tag, MPI_COMM_WORLD); + VERIFY(rc == MPI_SUCCESS); +} + +inline MsgInfo mpi_recv_buffer(char *buffer, size_t buffer_size, int source, int tag) { + DEBUG("mpi_recv_buffer() called"); + size_t all_count = buffer_size + sizeof(MsgInfo); + VERIFY(all_count <= std::numeric_limits::max()); + MPI_Status status; + int rc = MPI_Recv(buffer - sizeof(MsgInfo), static_cast(all_count), MPI_BYTE, source, tag, MPI_COMM_WORLD, &status); + VERIFY(rc == MPI_SUCCESS); + int actual_count; + MPI_Get_count(&status, MPI_BYTE, &actual_count); + size_t count = actual_count - sizeof(MsgInfo); + VERIFY(count <= buffer_size); + MsgInfo info; + memcpy(&info, buffer - sizeof(info), sizeof(info)); + return info; +} + +inline void mpi_send_buffer_bcast(char *buffer, size_t count, size_t buffer_size, int root, bool flag) { + ASSERT_MAIN_THREAD; + DEBUG("mpi_send_buffer_bcast() called"); + MsgInfo info{count, flag}; + VERIFY(info.count || info.flag); + memcpy(buffer - sizeof(info), &info, sizeof(info)); + size_t all_count = buffer_size + sizeof(info); + VERIFY(all_count <= std::numeric_limits::max()); + int rc = MPI_Bcast(buffer - sizeof(info), static_cast(all_count), MPI_BYTE, root, MPI_COMM_WORLD); + VERIFY(rc == MPI_SUCCESS); +} + +inline MsgInfo mpi_recv_buffer_bcast(char *buffer, size_t buffer_size, int root) { + ASSERT_MAIN_THREAD; + DEBUG("mpi_recv_buffer_bcast() called"); + size_t all_count = buffer_size + sizeof(MsgInfo); + int rc = MPI_Bcast(buffer - sizeof(MsgInfo), static_cast(all_count), MPI_BYTE, root, MPI_COMM_WORLD); // count should be the same! 
+ VERIFY(rc == MPI_SUCCESS); + MsgInfo info; + memcpy(&info, buffer - sizeof(MsgInfo), sizeof(info)); + VERIFY(info.count || info.flag); + return info; +} + +// The example was taken from http://www.voidcn.com/article/p-vjnlygmc-gy.html +// Probably we should send/recv async +template +class OutputMPIBuffer : public std::streambuf { + static const size_t KiloByte = 1 << 10; + static const size_t MegaByte = 1 << 20; + static const size_t GigaByte = 1 << 30; + +public: + // the buffer size should be EXACTELY the same as in correspondent InputBuffer + explicit OutputMPIBuffer(int destination, int tag = 0, size_t buffer_size = 100 * MegaByte) + : buffer_size_{buffer_size}, buffer_(buffer_size + sizeof(MsgInfo)), destination_{destination}, tag_{tag} { + VERIFY(buffer_.size() <= std::numeric_limits::max()); + setp(data(), data() + buffer_size_); + } + + ~OutputMPIBuffer() noexcept { + flush(true); + // INFO("Flush called: " << flush_count_); + } + OutputMPIBuffer(OutputMPIBuffer &&) = default; + OutputMPIBuffer &operator=(OutputMPIBuffer &&) = default; + +protected: + int sync() override { + flush(false); + return 0; + } + + int_type overflow(int_type ch) override { + flush(false); + if (ch != traits_type::eof()) { + VERIFY(std::less_equal()(pptr(), epptr())); + *pptr() = traits_type::to_char_type(ch); + pbump(1); + } + return ch; + } + +private: + char *data() { + return buffer_.data() + sizeof(MsgInfo); + } + + void resize_buffer(size_t buffer_size) { + buffer_size_ = buffer_size; + buffer_.resize(buffer_size_ + sizeof(MsgInfo)); + } + + void flush(bool last) { + // ++flush_count_; + size_t count = pptr() - pbase(); + if (!count && !last) { + return; + } + if (BROADCAST) { + mpi_send_buffer_bcast(pbase(), count, buffer_size_, destination_, last); + } else { + mpi_send_buffer(pbase(), count, destination_, tag_, last); + } + setp(data(), data() + buffer_size_); + } + + OutputMPIBuffer(const OutputMPIBuffer &) = delete; + OutputMPIBuffer &operator=(const OutputMPIBuffer &) = delete; + +private: + size_t buffer_size_; + std::vector buffer_; + int destination_, tag_; + // size_t flush_count_ = 0; +}; + +template +class InputMPIBuffer : public std::streambuf { + static const size_t KiloByte = 1 << 10; + static const size_t MegaByte = 1 << 20; + static const size_t GigaByte = 1 << 30; + +public: + // the buffer size should be EXACTELY the same as in correspondent OutputBuffer + explicit InputMPIBuffer(int source, int tag = MPI_ANY_TAG, size_t buffer_size = 100 * MegaByte, size_t put_back = 1 * KiloByte) + : buffer_size_{buffer_size}, put_back_{std::max(put_back, sizeof(MsgInfo))}, buffer_(put_back_ + buffer_size), + source_{source}, tag_{tag}, last_{false} { + VERIFY(buffer_.size() <= std::numeric_limits::max()); + setg(buffer_.data(), data() + buffer_size_, data() + buffer_size_); + } + + ~InputMPIBuffer() noexcept { + while (pull()) {} + } + + InputMPIBuffer(InputMPIBuffer &&) = default; + InputMPIBuffer &operator=(InputMPIBuffer &&) = default; + +private: + char *data() { + return buffer_.data() + put_back_; + } + + void resize_buffer(size_t buffer_size) { + buffer_size_ = buffer_size; + buffer_.resize(buffer_size_ + put_back_); + } + + MsgInfo retrive() { + MsgInfo info; + if (BROADCAST) { + info = mpi_recv_buffer_bcast(data(), buffer_size_, source_); + } else { + info = mpi_recv_buffer(data(), buffer_size_, source_, tag_); + } + return info; + } + + size_t pull() { + if (last_) { + return 0; + } + + MsgInfo info; + do { + info = retrive(); + VERIFY(info.flag || info.count); + } while (!info.flag 
&& info.count == 0); + + last_ = info.flag; + return info.count; + } + + + int_type underflow() override { + if (gptr() < egptr()) { // buffer not exhausted + return traits_type::to_int_type(*gptr()); + } + + size_t n = pull(); + if (n == 0) return traits_type::eof(); + + // Set buffer pointers + setg(buffer_.data(), data(), data() + n); + + return traits_type::to_int_type(*gptr()); + } + + InputMPIBuffer(const InputMPIBuffer &) = delete; + InputMPIBuffer &operator=(const InputMPIBuffer &) = delete; +private: + size_t buffer_size_; + size_t put_back_; + std::vector buffer_; + int source_, tag_; + bool last_; +}; + +inline void mpi_send_buffer_async(char *buffer, size_t count, int destination, int tag, bool flag, MPI_Request &req) { + ASSERT_MAIN_THREAD; + DEBUG("mpi_send_buffer() called"); + MsgInfo info{count, flag}; + memcpy(buffer - sizeof(info), &info, sizeof(info)); + size_t all_count = count + sizeof(info); + VERIFY(all_count <= std::numeric_limits::max()); + int rc = MPI_Isend(buffer - sizeof(info), static_cast(all_count), MPI_BYTE, destination, tag, MPI_COMM_WORLD, &req); + VERIFY(rc == MPI_SUCCESS); +} + +inline void mpi_recv_buffer_async(char *buffer, size_t buffer_size, int source, int tag, MPI_Request &req) { + DEBUG("mpi_recv_buffer() called"); + size_t all_count = buffer_size + sizeof(MsgInfo); + VERIFY(all_count <= std::numeric_limits::max()); + int rc = MPI_Irecv(buffer - sizeof(MsgInfo), static_cast(all_count), MPI_BYTE, source, tag, MPI_COMM_WORLD, &req); + VERIFY(rc == MPI_SUCCESS); +} + +inline MsgInfo mpi_recv_buffer_wait(char *buffer, MPI_Request &req) { + MPI_Status status; + MPI_Wait(&req, &status); + int actual_count; + MPI_Get_count(&status, MPI_BYTE, &actual_count); + size_t count = actual_count - sizeof(MsgInfo); + MsgInfo info; + memcpy(&info, buffer - sizeof(info), sizeof(info)); + VERIFY(count == info.count); + return info; +} + +inline void mpi_send_buffer_bcast_async(char *buffer, size_t count, size_t buffer_size, int root, bool flag, MPI_Request &req) { + ASSERT_MAIN_THREAD; + DEBUG("mpi_send_buffer_bcast_async() called. count = " << count << " flag " << flag); + MsgInfo info{count, flag}; + VERIFY(info.count || info.flag); + memcpy(buffer - sizeof(info), &info, sizeof(info)); + size_t all_count = buffer_size + sizeof(info); + VERIFY(all_count <= std::numeric_limits::max()); + int rc = MPI_Ibcast(buffer - sizeof(info), static_cast(all_count), MPI_BYTE, root, MPI_COMM_WORLD, &req); + VERIFY(rc == MPI_SUCCESS); +} + +inline void mpi_recv_buffer_bcast_async(char *buffer, size_t buffer_size, int root, MPI_Request &req) { + ASSERT_MAIN_THREAD; + DEBUG("mpi_recv_buffer_bcast() called"); + size_t all_count = buffer_size + sizeof(MsgInfo); + int rc = MPI_Ibcast(buffer - sizeof(MsgInfo), static_cast(all_count), MPI_BYTE, root, MPI_COMM_WORLD, &req); // count should be the same! + VERIFY(rc == MPI_SUCCESS); +} + +inline MsgInfo mpi_recv_buffer_bcast_wait(char *buffer, MPI_Request &req) { + MPI_Wait(&req, MPI_STATUS_IGNORE); + MsgInfo info; + memcpy(&info, buffer - sizeof(MsgInfo), sizeof(info)); + DEBUG("mpi_recv_buffer_bcast_wait() called. 
count = " << info.count << " flag " << info.flag); + VERIFY(info.count || info.flag); + return info; +} + +template +class OutputMPIBufferAsync : public std::streambuf { + static const size_t KiloByte = 1 << 10; + static const size_t MegaByte = 1 << 20; + static const size_t GigaByte = 1 << 30; + +public: + // the buffer size should be EXACTELY the same as in correspondent InputBuffer + explicit OutputMPIBufferAsync(size_t destination, int tag = 0, size_t buffer_size = 100 * MegaByte) + : buffer_size_{buffer_size}, buffer1_(buffer_size + sizeof(MsgInfo)), buffer2_(buffer_size + sizeof(MsgInfo)), destination_{int(destination)}, tag_{tag} { + pbuffer_ = &buffer1_; + setp(data(), data() + buffer_size_); + } + + ~OutputMPIBufferAsync() noexcept { + flush(true); + if (wait_) { + MPI_Wait(&req_, MPI_STATUS_IGNORE); + wait_ = false; + } + // INFO("Flush called: " << flush_count_); + } + OutputMPIBufferAsync(OutputMPIBufferAsync &&) = default; + OutputMPIBufferAsync &operator=(OutputMPIBufferAsync &&) = default; + +protected: + int sync() override { + flush(false); + return 0; + } + + int_type overflow(int_type ch) override { + flush(false); + if (ch != traits_type::eof()) { + VERIFY(std::less_equal()(pptr(), epptr())); + *pptr() = traits_type::to_char_type(ch); + pbump(1); + } + return ch; + } + +private: + char *data() { + return pbuffer_->data() + sizeof(MsgInfo); + } + + void flush(bool last) { + // ++flush_count_; + size_t count = pptr() - pbase(); + if (!count && !last) { + return; + } + if (wait_) { + MPI_Wait(&req_, MPI_STATUS_IGNORE); + wait_ = false; + } + if (BROADCAST) { + mpi_send_buffer_bcast_async(pbase(), count, buffer_size_, destination_, last, req_); + } else { + mpi_send_buffer_async(pbase(), count, destination_, tag_, last, req_); + } + wait_ = true; + pbuffer_ = pbuffer_ == &buffer1_ ? &buffer2_ : &buffer1_; + setp(data(), data() + buffer_size_); + } + + OutputMPIBufferAsync(const OutputMPIBufferAsync &) = delete; + OutputMPIBufferAsync &operator=(const OutputMPIBufferAsync &) = delete; + +private: + size_t buffer_size_; + std::vector buffer1_, buffer2_; + int destination_, tag_; + // size_t flush_count_ = 0; + std::vector *pbuffer_; + bool wait_ = false; + MPI_Request req_; +}; + +template +class InputMPIBufferAsync : public std::streambuf { + static const size_t KiloByte = 1 << 10; + static const size_t MegaByte = 1 << 20; + static const size_t GigaByte = 1 << 30; + +public: + // the buffer size should be EXACTELY the same as in correspondent OutputBuffer + explicit InputMPIBufferAsync(size_t source, int tag = MPI_ANY_TAG, size_t buffer_size = 100 * MegaByte, size_t put_back = 1 * KiloByte) + : buffer_size_{buffer_size}, put_back_{std::max(put_back, sizeof(MsgInfo))}, buffer1_(put_back_ + buffer_size), buffer2_(put_back_ + buffer_size), + source_{int(source)}, tag_{tag} { + setg(pbuffer_->data(), data() + buffer_size_, data() + buffer_size_); + } + + ~InputMPIBufferAsync() noexcept { + while (pull()) {} + } + + InputMPIBufferAsync(InputMPIBufferAsync &&) = default; + InputMPIBufferAsync &operator=(InputMPIBufferAsync &&) = default; + +protected: + int_type underflow() override { + if (gptr() < egptr()) { // buffer not exhausted + return traits_type::to_int_type(*gptr()); + } + + size_t n = pull(); + if (n == 0) return traits_type::eof(); + + // Set buffer pointers + setg(pbuffer_->data(), data(), data() + n); + + return traits_type::to_int_type(*gptr()); + } + +private: + char *data() { + return pbuffer_->data() + put_back_; + } + + MsgInfo wait() { + return BROADCAST ? 
mpi_recv_buffer_bcast_wait(data(), req_) : mpi_recv_buffer_wait(data(), req_); + } + + void recv() { + if (BROADCAST) { + mpi_recv_buffer_bcast_async(data(), buffer_size_, source_, req_); + } else { + mpi_recv_buffer_async(data(), buffer_size_, source_, tag_, req_); + } + } + + size_t pull() { + if (last_) { + return 0; + } + + pbuffer_ = pbuffer_ == &buffer1_ ? &buffer2_ : &buffer1_; + if (!wait_) { + recv(); + wait_ = true; + } + + VERIFY(wait_); + MsgInfo info = wait(); + last_ = info.flag; + wait_ = false; + + if (!last_) { + pbuffer_ = pbuffer_ == &buffer1_ ? &buffer2_ : &buffer1_; + recv(); + pbuffer_ = pbuffer_ == &buffer1_ ? &buffer2_ : &buffer1_; + wait_ = true; + } + + return info.count; + } + + InputMPIBufferAsync(const InputMPIBufferAsync &) = delete; + InputMPIBufferAsync &operator=(const InputMPIBufferAsync &) = delete; + +private: + size_t buffer_size_; + size_t put_back_; + std::vector buffer1_, buffer2_; + int source_, tag_; + bool last_ = false; + bool wait_ = false; + std::vector *pbuffer_ = &buffer1_; + MPI_Request req_; +}; + +// TODO add put_back support +// TODO Add mutex +class ChunkedStringBuffer : public std::streambuf { + static const size_t KiloByte = 1 << 10; + static const size_t MegaByte = 1 << 20; + static const size_t GigaByte = 1 << 30; + +public: + explicit ChunkedStringBuffer(size_t buffer_size = 100 * MegaByte) + : buffer_size_{buffer_size}, g_buffer_id_{size_t(-1)}, buffer_(buffer_size_) { + setp(buffer_.data(), buffer_.data() + buffer_size_); + setg(buffer_.data(), buffer_.data() + buffer_size_, buffer_.data() + buffer_size_); + } + + int sync() override { + flush(); + return 0; + } + + size_t size() const { + size_t result = pptr() - pbase(); + for (const auto &buffer : buffers_) { + result += buffer.size(); + } + return result; + } + + int_type underflow() override { + if (gptr() < egptr()) { // buffer not exhausted + return traits_type::to_int_type(*gptr()); + } + + /* DEBUG("underflow(): g_buffer_id_ = " << g_buffer_id_); */ + if (g_buffer_id_ != size_t(-1)) { + // clear current buffer + buffers_[g_buffer_id_].clear(); + buffers_[g_buffer_id_].shrink_to_fit(); + } + + ++g_buffer_id_; + + if (g_buffer_id_ == buffers_.size()) { + flush(); + VERIFY(g_buffer_id_ < buffers_.size()); + if (buffers_[g_buffer_id_].empty()) { + return traits_type::eof(); + } + } + + // Set buffer pointers + auto &b = buffers_[g_buffer_id_]; + setg(b.data(), b.data(), b.data() + b.size()); + + return traits_type::to_int_type(*gptr()); + } + + int_type overflow(int_type ch) override { + flush(); + + if (ch != traits_type::eof()) { + VERIFY(std::less_equal()(pptr(), epptr())); + *pptr() = traits_type::to_char_type(ch); + pbump(1); + } + return ch; + } + + void broadcast(int root = 0) { + if (world_rank() == root) { + flush(); + } + size_t buffers_size = buffers_.size(); + ::partask::broadcast(buffers_size, root); + buffers_.resize(buffers_size); + + std::vector sizes(buffers_size + 1 + 1 + 1 + 6); + sizes[0] = buffer_size_; // Actually already should be equal + sizes[1] = g_buffer_id_; + sizes[2] = buffer_.size(); + + sizes[3] = eback() - buffer_.data(); + sizes[4] = gptr() - buffer_.data(); + sizes[5] = egptr() - buffer_.data(); + sizes[6] = pbase() - buffer_.data(); + sizes[7] = pptr() - buffer_.data(); // Actually, could not be synced + sizes[8] = epptr() - buffer_.data(); + for (size_t i = 0; i < buffers_.size(); ++i) { + sizes[i + 9] = buffers_[i].size(); + } + ::partask::membroadcast(sizes.data(), sizeof(sizes[0]) * sizes.size(), root); + buffer_size_ = sizes[0]; + 
g_buffer_id_ = sizes[1]; + buffer_.resize(sizes[2]); + ::partask::membroadcast(buffer_.data(), sizeof(buffer_[0]) * buffer_.size(), root); + if (world_rank() != root) { + setg(sizes[3] + buffer_.data(), sizes[4] + buffer_.data(), sizes[5] + buffer_.data()); + setp(sizes[6] + buffer_.data(), sizes[8] + buffer_.data()); + } + + for (size_t i = 0; i < buffers_.size(); ++i) { + buffers_[i].resize(sizes[i + 9]); + ::partask::membroadcast(buffers_[i].data(), sizeof(buffers_[i][0]) * buffers_[i].size(), root); + } + } + +private: + void flush() { + size_t count = pptr() - pbase(); + buffer_.resize(count); + buffers_.emplace_back(buffer_size_); + std::swap(buffer_, buffers_.back()); + setp(buffer_.data(), buffer_.data() + buffer_size_); + } + + size_t buffer_size_; + size_t g_buffer_id_; + std::vector> buffers_; + std::vector buffer_; +}; + +template +class MPIStream : public BaseStream { + using This = MPIStream; +public: + template + MPIStream(Args... args) : BaseStream(nullptr), buf_(args...) { // All args are integral + this->init(&buf_); + } + + MPIStream(This&&) = default; + MPIStream& operator=(This&&) = default; + MPIStream(const This&) = delete; + MPIStream& operator=(const This&) = delete; + +private: + Buffer buf_; +}; + +using InputMPIStream = MPIStream>; +using OutputMPIStream = MPIStream>; +class InputMPIStreamBcast : public MPIStream> { +public: + using MPIStream::MPIStream; + InputMPIStreamBcast() : MPIStream(0) {} +}; + +class OutputMPIStreamBcast : public MPIStream> { +public: + using MPIStream::MPIStream; + OutputMPIStreamBcast() : MPIStream(0) {} +}; + +class ChunkedStringStream : public MPIStream { +public: + size_t size() const { return dynamic_cast(this->rdbuf())->size(); } + + void broadcast(int root = 0) { return dynamic_cast(this->rdbuf())->broadcast(root); } +}; + +template +void broadcast(T &data, Serialize &&serialize, Deserialize &&deserialize, int root = 0) { + ASSERT_MAIN_THREAD; + DEBUG("Broadcasting of type " << typeid(T).name()); + + static size_t call_count = 0; + DEBUG("membroadcast() called " << call_count << " times"); + ++call_count; + + if (world_rank() == root) { + OutputMPIStreamBcast os(root); + DEBUG("Broadcast serialization..."); + std::forward(serialize)(os, data); + DEBUG("Broadcast serialization complete"); + } else { + InputMPIStreamBcast is(root); + DEBUG("Broadcast deserialization..."); + std::forward(deserialize)(is, data); + DEBUG("Broadcast deserialization complete"); + } +} + +template +void broadcast2(T1 &data1, Serialize1 &&serialize1, Deserialize1 &&deserialize1, + T2 &data2, Serialize2 &&serialize2, Deserialize2 &&deserialize2, + int root = 0) { + ASSERT_MAIN_THREAD; + DEBUG("Broadcasting of types " << typeid(T1).name() << " " << typeid(T2).name()); + + static size_t call_count = 0; + DEBUG("membroadcast() called " << call_count << " times"); + ++call_count; + + if (world_rank() == root) { + OutputMPIStreamBcast os(root); + DEBUG("Broadcast serialization..."); + std::forward(serialize1)(os, data1); + std::forward(serialize2)(os, data2); + DEBUG("Broadcast serialization complete"); + } else { + InputMPIStreamBcast is(root); + DEBUG("Broadcast deserialization..."); + std::forward(deserialize1)(is, data1); + std::forward(deserialize2)(is, data2); + DEBUG("Broadcast deserialization complete"); + } +} + +template +void broadcast_full_dump(T &data, Serialize &&serialize, Deserialize &&deserialize, int root = 0) { + ASSERT_MAIN_THREAD; + DEBUG("Broadcasting of type " << typeid(T).name()); + + static size_t call_count = 0; + 
DEBUG("broadcast_full_dump() called " << call_count << " times"); + ++call_count; + + ChunkedStringStream css; + if (world_rank() == root) { + DEBUG("Broadcast serialization..."); + std::forward(serialize)(css, data); + DEBUG("Broadcast serialization complete"); + } + css.broadcast(root); + + if (world_rank() != root) { + DEBUG("Broadcast deserialization..."); + std::forward(deserialize)(css, data); + DEBUG("Broadcast deserialization complete"); + } +} + +template +auto broadcast(T &data, int root = 0) -> decltype(std::declval::value, T>>(), + io::binary::BinWrite(declref(), data), + io::binary::BinRead(declref(), data), + void()) { + broadcast(data, [](std::ostream &os, const T &data) { io::binary::BinWrite(os, data); }, + [](std::istream &is, T &data) { io::binary::BinRead(is, data); }, + root); +} + +template +auto send(const T &data, Serialize &&serialize, int destination, int tag = 0) -> decltype(std::forward(serialize)(declref(), data), void()) { + ASSERT_MAIN_THREAD; + OutputMPIStream os(destination, tag); + DEBUG("Serialization..."); + std::forward(serialize)(os, data); +} + +template +auto recv(T &data, Deserialize &&deserialize, int source, int tag = MPI_ANY_TAG) -> decltype(std::forward(deserialize)(declref(), data), void()) { + ASSERT_MAIN_THREAD; + InputMPIStream is(source, tag); + DEBUG("Serialization..."); + std::forward(deserialize)(is, data); +} + +inline std::vector collect_num_threads(int root = 0) { + ASSERT_MAIN_THREAD; + std::vector all_num_threads; + + int num_threads = omp_get_max_threads(); + if (world_rank() == root) { + all_num_threads.resize(world_size()); + MPI_Gather(&num_threads, 1, MPI_INT, all_num_threads.data(), 1, MPI_INT, root, MPI_COMM_WORLD); + } else { + MPI_Gather(&num_threads, 1, MPI_INT, nullptr, 1, MPI_INT, root, MPI_COMM_WORLD); + } + broadcast(all_num_threads, root); + return all_num_threads; +} + +inline int overall_num_threads(int root = 0) { + ASSERT_MAIN_THREAD; + auto threads = collect_num_threads(root); + return std::accumulate(threads.cbegin(), threads.cend(), int(0)); +} + +inline void all_set_num_threads(const std::vector &all_num_threads, int root = 0) { + ASSERT_MAIN_THREAD; + int num_threads; + MPI_Scatter(const_cast(all_num_threads.data()), 1, MPI_INT, &num_threads, 1, MPI_INT, root, MPI_COMM_WORLD); + omp_set_num_threads(num_threads); +} + +inline void all_set_num_threads(int num_threads, int = 0) { omp_set_num_threads(num_threads); } + +#define CREATE_HAS_METHOD_CHECKER(METHOD) \ +struct has_##METHOD##_method { \ + template \ + constexpr static auto detail_test(T &&obj, Tuple &&tuple, std::index_sequence) -> \ + decltype(std::forward(obj).METHOD(std::get(std::forward(tuple))...), \ + std::true_type()); \ +\ + template \ + constexpr static auto test(T &&obj, Tuple &&tuple) -> decltype(detail_test(std::forward(obj), \ + std::forward(tuple),\ + std::make_index_sequence>>{})); \ + constexpr static std::false_type test(...); \ +}; + +class TaskRegistry { + static const int MAP_TAG = 13; + static const int MERGE_TAG = 14; + static const size_t MULT = 1; + static const size_t STOP_LISTENING_TASK = -1; + + class AbstractTask { + public: + virtual void process(std::istream &is, std::ostream &) { + // Skip all characters in stream + DEBUG("Unimplemented (trivial) process method"); + is.ignore(std::numeric_limits::max()); + } + + virtual void sync(void) { + DEBUG("Unimplemented (trivial) sync method"); + // Do nothing + } + + virtual ~AbstractTask() noexcept = default; + }; + + CREATE_HAS_METHOD_CHECKER(sync); + 
CREATE_HAS_METHOD_CHECKER(merge); + CREATE_HAS_METHOD_CHECKER(process); + CREATE_HAS_METHOD_CHECKER(make_splitter); + + class AbstractTaskFactory { + public: + virtual AbstractTask *create(std::istream&) const = 0; + virtual ~AbstractTaskFactory() noexcept = default; + }; + + template + class TaskFactory : public AbstractTaskFactory { + class ConcreteTask : public AbstractTask { + constexpr static const bool has_make_splitter = decltype(has_make_splitter_method::test(std::declval(), + std::tuple_cat(std::tuple(), + std::declval())))::value; + constexpr static const bool has_process = decltype(has_process_method::test(std::declval(), + std::tuple_cat(std::tuple(declref(), + declref()), + std::declval())))::value; + constexpr static const bool has_merge = decltype(has_merge_method::test(std::declval(), + std::tuple_cat(std::tuple>(), + std::declval())))::value; + constexpr static const bool has_sync = decltype(has_sync_method::test(std::declval(), std::declval()))::value; + public: + ConcreteTask(Task &&task, const Locals &locals) : task_{std::move(task)}, locals_{locals} {} + + template + auto make_splitter(std::enable_if_t size) { + auto make_splitter_args = std::tuple_cat(std::make_tuple(size), locals_); + auto splitter = + std::apply( + [&](auto &&... ts) { + return task_.make_splitter(std::forward(ts)...); + }, + make_splitter_args); + return splitter; + } + + template + auto make_splitter(std::enable_if_t) { + WARN("Unimplemented (trivial) make_splitter method"); + auto trivial_splitter = [](std::ostream&, int) { + return false; + }; + return trivial_splitter; + } + + void process(std::istream &is, std::ostream &os) override { + process_impl(is, os); + } + + void sync(void) override { + sync_impl(); + } + + template > + decltype(auto) merge(std::enable_if_t &piss) { + auto merge_args = std::tuple_cat(std::make_tuple(piss), locals_); + auto merge_call = [&](auto &&... ts) { return task_.merge(std::forward(ts)...); }; + return std::apply(merge_call, merge_args); + } + + template > + void merge(std::enable_if_t &piss) { + DEBUG("Unimplemented (trivial) merge method"); + // Do nothing except reading streams + for (auto &pis : piss) { + auto &is = *pis; + // Skip all characters in stream + is.ignore(std::numeric_limits::max()); + } + } + + private: + Task task_; + Locals locals_; + + template + void process_impl(std::enable_if_t &is, std::ostream &os) { + auto args = std::tuple_cat(std::tuple(is, os), locals_); + auto process_call = [this](auto &&... ts) { return task_.process(std::forward(ts)...); }; + std::apply(process_call, args); + } + + template + void process_impl(std::enable_if_t &is, std::ostream &os) { + AbstractTask::process(is, os); + } + + template + std::enable_if_t sync_impl() { + auto sync_call = [this](auto &&... 
ts) { return task_.sync(std::forward(ts)...); }; + std::apply(sync_call, locals_); + } + + template + std::enable_if_t sync_impl() { + AbstractTask::sync(); + } + + }; + + public: + TaskFactory(Locals &&locals) : locals_{std::move(locals)} {} + + ConcreteTask *acquire(Task &&task) const { + return new ConcreteTask(std::move(task), locals_); + } + + ConcreteTask *create(std::istream& is) const override { + Task task(is); + return acquire(std::move(task)); + } + + private: + Locals locals_; + }; + + +public: + TaskRegistry() { + ASSERT_MAIN_THREAD; + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank_); + MPI_Comm_size(MPI_COMM_WORLD, &world_size_); + all_num_threads_ = collect_num_threads(); + } + + template + class Job { + public: + Job(TaskRegistry &task_registry, size_t job_id) + : task_registry_{task_registry}, job_id_{job_id} {} + + template + decltype(auto) operator()(Args &&... args) const { + VERIFY(task_registry_.world_rank_ == 0); + Task task(std::forward(args)...); + + task_registry_.job_broadcast_(job_id_); + { + OutputMPIStreamBcast obs(0); + task.serialize(obs); + } // close obs stream and send the rest of the data + + int world_size; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + + auto pfactory = dynamic_cast*>(task_registry_.factories_[job_id_].get()); + auto ptask = as_unique(pfactory->acquire(std::move(task))); + + auto plocal_map = std::make_shared(); + auto plocal_merge = std::make_shared(); + { + std::vector> oss; + oss.push_back(plocal_map); + for (int rank = 1; rank < world_size; ++rank) { + oss.emplace_back(new OutputMPIStream(rank, MAP_TAG)); + } + + const auto &all_num_threads_ = task_registry_.all_num_threads_; + size_t sum_num_threads = std::accumulate(all_num_threads_.cbegin(), all_num_threads_.cend(), 0); + VERIFY(sum_num_threads > 0); + DEBUG("All threads: " << sum_num_threads << " Multiplicator: " << MULT); + auto splitter = ptask->make_splitter(sum_num_threads * MULT); + + auto mult_splitter = [&splitter](auto &os, int rank, size_t count) { + for (size_t i = 0; i < count; ++i) { + bool result = splitter(os, rank); + if (!result) return false; + } + return true; + }; + + for (int rank = 0; mult_splitter(*oss[rank], rank, all_num_threads_[rank]); + rank = (rank + 1) % world_size) { + } + } // close streams here and send split data + + DEBUG("Process started"); + ptask->process(*plocal_map, *plocal_merge); + DEBUG("Process done"); + + std::vector> iss; + iss.push_back(std::move(plocal_merge)); + for (int rank = 1; rank < world_size; ++rank) { + DEBUG("Getting data from node " << rank); + iss.emplace_back(new InputMPIStream(rank, MERGE_TAG)); + DEBUG("Got data from node " << rank); + } + + std::vector piss; + for (int rank = 0; rank < world_size; ++rank) { + piss.push_back(iss[rank].get()); + } + + DEBUG("Merge started"); + auto wrap = (ptask->merge(piss), detail::wrap_void()); + DEBUG("Merge done, closing streams"); + for (auto is : iss) { + is.reset(); + } + DEBUG("Streams closed"); + DEBUG("sync calling..."); + ptask->sync(); + DEBUG("sync called, returning"); + return wrap.get(); + } + + private: + TaskRegistry &task_registry_; + size_t job_id_; + }; + + template + auto add(LocalArgs &&... 
local_args) { + barrier(); + + auto locals = std::make_tuple(std::forward(local_args)...); // std::make_tuple unwraps ref & cref + size_t job_id = factories_.size(); + factories_.emplace_back(new TaskFactory(std::move(locals))); + return Job(*this, job_id); + } + + void stop_listening() { + DEBUG("Stop listening"); + if (worker()) return; + if (listening_) { + job_broadcast_(STOP_LISTENING_TASK); + listening_ = false; + } + } + + void stop() { + WARN("stop() is depricated and will be removed soon. Use stop_listening() instead"); + return stop_listening(); + } + + void listen() { + if (master()) { + VERIFY(!listening_); + listening_ = true; + return; + } + DEBUG("Listening started"); + while (listen_one_()) { + }; + } + + ~TaskRegistry() { + stop_listening(); + } + + bool master() const { return world_rank() == 0; } + bool worker() const { return !master(); } + int world_size() const { return world_size_; } + int world_rank() const { return world_rank_; } + bool listening() const { return listening_; } + +private: + int world_rank_; + int world_size_; + std::vector all_num_threads_; + + std::vector> factories_; + + bool listening_ = false; + + void job_broadcast_(size_t job_id) { + VERIFY(world_rank_ == 0); + DEBUG("Job sending... " << job_id); + broadcast(job_id); + DEBUG("Job sent"); + } + + bool listen_one_() { + VERIFY(worker()); + size_t job_id; + DEBUG("Awaiting for job..."); + broadcast(job_id); // Get job id + DEBUG("Job got, id: " << job_id); + if (job_id == STOP_LISTENING_TASK) { + return false; + } + + const auto &pfactory = factories_[job_id]; + DEBUG("Task object initializer obtained"); + auto pibs = std::make_unique(0); + auto ptask = as_unique(pfactory->create(*pibs)); + pibs.reset(); // Pull the rest of data and close the pipe + DEBUG("Task object created"); + { + InputMPIStream is(0, MAP_TAG); + DEBUG("Input stream constructed"); + OutputMPIStream os(0, MERGE_TAG); + DEBUG("Remote process output stream constructed"); + ptask->process(is, os); + DEBUG("process done, closing the scope..."); + } // close and destroy streams + ptask->sync(); + DEBUG("sync() done"); + return true; + } + +}; + +#undef CREATE_HAS_METHOD_CHECKER + +inline auto make_trivial_generator() { + auto generator = [](std::ostream&, size_t) { + return false; + }; + return generator; +} + +inline auto make_seq_generator(size_t size) { + auto generator = [size, i = size_t(0)](std::ostream &os, size_t) mutable -> bool { + if (i == size) { + os.put('\0'); + return false; + } + os.put('\1'); + io::binary::BinWrite(os, i); + ++i; + return true; + }; + + return generator; +} + +inline auto make_seq_plus_n_generator(size_t size) { + auto generator = [size, i = size_t(0)](std::ostream &os, size_t) mutable -> bool { + if (i == size) { + return false; + } + io::binary::BinWrite(os, i, size); + ++i; + return true; + }; + + return generator; +} + +inline std::vector get_seq_plus_n(std::istream &is, size_t &size) { + std::vector chunks; + size = 0; + while (is.peek() != EOF) { + size_t i; + io::binary::BinRead(is, i, size); + chunks.push_back(i); + } + + return chunks; +} + +template +auto make_seq_along_generator(const Container &c) { + return make_seq_generator(c.size()); +} + +inline std::vector get_seq(std::istream &is) { + std::vector chunks; + // while (is.peek() != EOF) { + while (is.get() && is) { + size_t i; + io::binary::BinRead(is, i); + chunks.push_back(i); + } + + return chunks; +} + +template +auto make_vector_splitter(size_t n, const std::vector& data) { + size_t N = data.size(); + + auto splitter = [&data, 
n, N, chunk = size_t(0), + idx = size_t(0)](std::ostream& os, size_t /*node*/) mutable -> bool { + if (chunk == n) { + os.put('\0'); + return false; + } + + size_t size = N / n + (chunk < (N % n)); + + os.put('\1'); + io::binary::BinWrite(os, size); + for (size_t i = 0; i < size; ++i, ++idx) { + io::binary::BinWrite(os, data[idx]); + } + + ++chunk; + return true; + }; + + return splitter; +} + +template +auto all_equal(const T &v) -> decltype(broadcast(*new T(v)), T(v) == T(v), bool()) { + T cv(v); + broadcast(cv); + return v == cv; +} + +template +auto critical_ordered(F &&f) -> decltype(std::forward(f)()) { + using wrap_type = decltype(std::forward(f)(), detail::wrap_void()); + std::unique_ptr pwrap; + + int world_size; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + int world_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + for (int rank = 0; rank < world_size; ++rank) { + barrier(); + + if (world_rank == rank) { + auto wrap = (std::forward(f)(), detail::wrap_void()); + pwrap = wrap_into_unique(std::move(wrap)); + } + } + + return pwrap->get(); +} + +template +class FastLocalTransferWrap { +public: + FastLocalTransferWrap(T &v, int root = 0) : p_{&v}, root_{root} {} + + T &ref() { + return *p_; + } + + const T &ref() const { + return *p_; + } + + void BinWrite(std::ostream &os) const { + if (partask::master(root_)) { + T *p = new T(std::move(const_cast(ref()))); + DEBUG("Writing address " << p); + os.put('\1'); + io::binary::BinWrite(os, p); + } else { + os.put('\0'); + io::binary::BinWrite(os, ref()); + } + } + + void BinRead(std::istream &is) { + if (is.get()) { + VERIFY(partask::master(root_)); + T *p = io::binary::BinRead(is); + DEBUG("Readind address " << p); + ref() = std::move(*p); + delete p; + } else { + io::binary::BinRead(is, ref()); + } + } + +private: + T *p_; + int root_; +}; + +template +auto fast_local_transfer(T &v, int root = 0) { + return FastLocalTransferWrap(v, root); +} + +template +void swap_streams(io::ReadStreamList &all_streams, + io::ReadStreamList &streams, + const std::vector &chunks) { + VERIFY(streams.size() == chunks.size()); + for (size_t i = 0; i < chunks.size(); ++i) { + DEBUG("Swapping: " << i << " <-> " << chunks[i]); + std::swap(streams[i], all_streams[chunks[i]]); + } +} + +template +auto create_empty_stream_list(size_t size) { + io::ReadStreamList streams; + for (size_t i = 0; i < size; ++i) { + io::ReadStream empty_stream; + streams.push_back(std::move(empty_stream)); + } + return streams; +} + +} // namespace partask + +namespace io { + +namespace binary { + +// Enables io::binary::BinRead(is, FastLocalTransferWrap(x)); +// It is not a specialization! 
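+// It is an ordinary overload taking the wrapper by rvalue reference, so the temporary returned by
+// partask::fast_local_transfer() can be passed to BinRead directly.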
+template +void BinRead(std::istream &is, partask::FastLocalTransferWrap &&w) { + w.BinRead(is); +} + +} // namespace binary +} // namespace io From b4b9b1f31c7d72dd9ebe7c16b98dcbc3babb15db Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Wed, 27 May 2020 00:19:50 +0300 Subject: [PATCH 007/102] Add MPI test --- src/CMakeListsInternal.txt | 2 + src/test/mpi/CMakeLists.txt | 15 +++ src/test/mpi/mpi_test.cpp | 226 ++++++++++++++++++++++++++++++++++++ 3 files changed, 243 insertions(+) create mode 100644 src/test/mpi/CMakeLists.txt create mode 100644 src/test/mpi/mpi_test.cpp diff --git a/src/CMakeListsInternal.txt b/src/CMakeListsInternal.txt index 7f937094c7..e511e3cf21 100644 --- a/src/CMakeListsInternal.txt +++ b/src/CMakeListsInternal.txt @@ -11,9 +11,11 @@ if (SPADES_BUILD_INTERNAL) add_subdirectory(test/debruijn) add_subdirectory(test/examples) add_subdirectory(test/adt) + add_subdirectory(test/mpi) else() add_subdirectory(test/include_test EXCLUDE_FROM_ALL) add_subdirectory(test/debruijn EXCLUDE_FROM_ALL) + add_subdirectory(test/mpi EXCLUDE_FROM_ALL) add_subdirectory(test/adt EXCLUDE_FROM_ALL) add_subdirectory(test/examples EXCLUDE_FROM_ALL) endif() diff --git a/src/test/mpi/CMakeLists.txt b/src/test/mpi/CMakeLists.txt new file mode 100644 index 0000000000..46f5b538b9 --- /dev/null +++ b/src/test/mpi/CMakeLists.txt @@ -0,0 +1,15 @@ +############################################################################ +# Copyright (c) 2023-2024 SPAdes team +# All Rights Reserved +# See file LICENSE for details. +############################################################################ + +project(mpi_test CXX) + +add_executable(mpi_test mpi_test.cpp) +target_link_libraries(mpi_test utils ${COMMON_LIBRARIES}) +if (MPI_FOUND) + target_link_libraries(mpi_test ${MPI_LIBRARIES}) +endif() + + diff --git a/src/test/mpi/mpi_test.cpp b/src/test/mpi/mpi_test.cpp new file mode 100644 index 0000000000..edf0c2d890 --- /dev/null +++ b/src/test/mpi/mpi_test.cpp @@ -0,0 +1,226 @@ +//*************************************************************************** +//* Copyright (c) 2023-2024 SPAdes team +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include +#include +#include +#include +#include +#include + +#include "pipeline/partask_mpi.hpp" + +#include "utils/logger/mpi_log_writers.hpp" + +void create_console_logger() { + using namespace logging; + + logger *lg = create_logger(""); + lg->add_writer(std::make_shared()); + attach_logger(lg); +} + +class ArraySum { +public: + ArraySum(const std::string &message = "") : message_{message} {}; + ArraySum(const ArraySum&) = delete; + ArraySum(ArraySum&&) = default; + ~ArraySum() noexcept { + std::cout << "~ArraySum() node:" << partask::world_rank() << std::endl; + } + + std::string message_; + ArraySum(std::istream &is) {std::getline(is, message_);} + + std::ostream &serialize(std::ostream &os) const { return os << message_; } + + template + auto make_splitter(size_t n, const Data &data, Args &&...) { + size_t N = data.size(); + auto splitter = [N, n, i = size_t(0)](std::ostream &os, size_t /*node*/) mutable -> bool { + if (i == n) return false; + size_t begin = i * N / n; + size_t end = (i + 1) * N / n; + ++i; + os << begin << " " << end << " "; + return true; + }; + + return splitter; + } + + template + void process(std::istream &is, std::ostream &os, const Data &data, Args &&...) 
{ + std::cout << "process run" << std::endl; + std::cout << "MESSAGE: " << message_ << std::endl; + long long int sum = 0; +#pragma omp parallel reduction(+ : sum) + while (true) { + size_t begin, end; + bool exit = false; +#pragma omp critical + { + if (is.peek() == EOF || !(is >> begin >> end)) exit = true; + if (!exit) std::cout << "Extracted range: " << begin << " " << end << std::endl; + } + if (exit) break; + for (size_t i = begin; i < end; ++i) { + sum += data[i]; + } + } + std::cout << "Computed sum: " << sum << std::endl; + os << sum; + } + + template + auto merge(const std::vector &piss, Args &&...) { + long long int sum = 0; + for (auto &pis : piss) { + long long int local_sum; + *pis >> local_sum; + sum += local_sum; + } + + return sum; + } +}; + +const size_t N = 100000; +std::array data; +int main() { + create_console_logger(); + INFO("Starting mpi test"); + + { + int a = 1, b = 2, c = 3; + + std::stringstream ss; + auto tpl = std::make_tuple(a, b, c, std::string("21312321")); + io::binary::BinWrite(ss, std::make_tuple(int(3), int(4), int(5), std::string("123456"))); + io::binary::BinRead(ss, tpl); + INFO("Read: " << std::get<0>(tpl) << ":" << std::get<1>(tpl) << ":" << std::get<2>(tpl) << ":" << std::get<3>(tpl)); + + io::binary::BinWrite(ss, std::tie(a, b)); + a = 42, b = 32; + io::binary::BinRead(ss, std::tie(a, b)); + INFO("Read: " << a << ":" << b); + + io::binary::BinWrite(ss, tpl); + io::binary::BinWrite(ss, std::vector(10)); + } + + partask::init(); + { + partask::ChunkedStringStream mss; + mss << "mama,papa,pipi" << std::endl; + std::string s; + mss >> s; + std::cout << "Test: " << s; + } + + std::cout << "\n\n\n\n"; + + + std::unordered_map> m, m2; + m["mama"] = {1, 2, 4}; + m["papa"] = {1, 3, 4, 2}; + + std::stringstream ss; + io::binary::BinWrite(ss, m); + io::binary::BinRead(ss, m2); + std::cout << m2["papa"][1] << " <-- should be "<< m["papa"][1] << std::endl;; + + io::binary::BinWrite(std::cout, std::string("Mama"), 123, std::string("Pipi")); + + + io::binary::BinWrite(std::cout, std::string("Mama"), 123, std::string("Pipi")); + + std::iota(data.begin(), data.end(), 1); + size_t sum = std::accumulate(data.cbegin(), data.cend(), size_t(0)); + std::cout << "Actual sum: " << sum << std::endl; + + + + if (partask::world_rank() == 0) { + std::string t = "0123456789"; + std::string s = ""; + for (size_t i = 0; i < 25; ++i) { + s += t; + } + partask::OutputMPIStream os(1); + // os << s; + os << "checkX"; + os << "checkY"; + os << "checkZ"; + os << "checkA"; + os << s; + os << "checkB"; + os << s; + os << "checkC"; + os << "checkD"; + os << "checkE"; + os.flush(); + os.flush(); + } + if (partask::world_rank() == 1) { + partask::InputMPIStream is(0); + std::string s; + is >> s; + std::cout << "Streams test" << std::endl; + std::cout << s << std::endl; + std::cout << "Streams test" << std::endl; + } + + partask::barrier(); + std::cout << "broadcast test" << std::endl; + partask::broadcast(m); + std::cout << "broadcast test done" << std::endl; + partask::barrier(); + + partask::all_set_num_threads(10); + + partask::TaskRegistry reg; + + auto ptr = std::make_unique("Mama"); // Non-copyable + std::mutex mtx; // Non-copyable + auto job = reg.add(std::cref(data), std::cref(ptr), std::ref(mtx)); + reg.listen(); + + if (reg.master()) { + auto res = job("Message1"); + std::cout << "JOB RESULT: " << res << std::endl; + res = job("Message2"); + std::cout << "JOB RESULT: " << res << std::endl; + res = job("Message3"); + std::cout << "JOB RESULT: " << res << std::endl; + res = 
job(); + std::cout << "JOB RESULT: " << res << std::endl; + } + + reg.stop_listening(); + std::cout << "Before the barrier " << __LINE__; + partask::barrier(); + + auto job2 = reg.add(std::cref(data)); + reg.listen(); + + if (reg.master()) { + auto res = job(); + std::cout << "JOB RESULT: " << res << std::endl; + res = job2(); + std::cout << "JOB RESULT: " << res << std::endl; + res = job(); + std::cout << "JOB RESULT: " << res << std::endl; + res = job(); + std::cout << "JOB RESULT: " << res << std::endl; + } + + reg.stop_listening(); + reg.stop_listening(); + + partask::finalize(); + return 0; +} From 10396a48aed5774abb7a328cfc659e4c77afabc1 Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Mon, 28 Sep 2020 13:58:56 +0300 Subject: [PATCH 008/102] User-friendly rank reporting --- src/common/utils/logger/mpi_log_writers.cpp | 2 +- src/test/mpi/mpi_test.cpp | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/common/utils/logger/mpi_log_writers.cpp b/src/common/utils/logger/mpi_log_writers.cpp index c71579e56f..11e244244d 100644 --- a/src/common/utils/logger/mpi_log_writers.cpp +++ b/src/common/utils/logger/mpi_log_writers.cpp @@ -39,7 +39,7 @@ std::string mpi_console_writer::nodeinfo() const { int world_rank, world_size; MPI_Comm_size(MPI_COMM_WORLD, &world_size); MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); - return fmt::format("{:>2d}/{:<2d}", world_rank, world_size); + return fmt::format("{:>2d}/{:<2d}", world_rank + 1, world_size); } else { return fmt::format("{:^5}", "N/A"); } diff --git a/src/test/mpi/mpi_test.cpp b/src/test/mpi/mpi_test.cpp index edf0c2d890..849a61536d 100644 --- a/src/test/mpi/mpi_test.cpp +++ b/src/test/mpi/mpi_test.cpp @@ -112,7 +112,8 @@ int main() { io::binary::BinWrite(ss, std::vector(10)); } - partask::init(); + bool init = partask::init(); + INFO("MPI init: " << (init ? 
"done" : "failed")); { partask::ChunkedStringStream mss; mss << "mama,papa,pipi" << std::endl; From 049271af3088cb34638ee87c2551b2ba4447f4df Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Fri, 25 Sep 2020 16:52:53 +0300 Subject: [PATCH 009/102] Rudimentrary MPI stage manager & MPI stage --- src/common/pipeline/CMakeLists.txt | 22 ++- src/common/pipeline/mpi_stage.cpp | 236 +++++++++++++++++++++++++++++ src/common/pipeline/mpi_stage.hpp | 157 +++++++++++++++++++ src/common/pipeline/stage.cpp | 28 +++- src/common/pipeline/stage.hpp | 29 +++- src/projects/spades/CMakeLists.txt | 3 +- 6 files changed, 456 insertions(+), 19 deletions(-) create mode 100644 src/common/pipeline/mpi_stage.cpp create mode 100644 src/common/pipeline/mpi_stage.hpp diff --git a/src/common/pipeline/CMakeLists.txt b/src/common/pipeline/CMakeLists.txt index 6d4ba25eeb..ac507cada5 100644 --- a/src/common/pipeline/CMakeLists.txt +++ b/src/common/pipeline/CMakeLists.txt @@ -8,10 +8,20 @@ project(pipeline CXX) -add_library(pipeline STATIC - graph_pack.cpp - graph_pack_helpers.cpp - sequence_mapper_gp_api.cpp - stage.cpp) +set(pipeline_src + graph_pack.cpp + graph_pack_helpers.cpp + sequence_mapper_gp_api.cpp + stage.cpp) -target_link_libraries(pipeline binary_io path_extend input llvm-support library configs alignment) \ No newline at end of file +if (MPI_FOUND) + set(pipeline_src ${pipeline_src} mpi_stage.cpp) +endif() + +add_library(pipeline STATIC ${pipeline_src}) + +if (MPI_FOUND) + target_link_libraries(pipeline binary_io path_extend input llvm-support library configs alignment ${MPI_LIBRARIES}) +else() + target_link_libraries(pipeline binary_io path_extend input llvm-support library configs alignment) +endif() diff --git a/src/common/pipeline/mpi_stage.cpp b/src/common/pipeline/mpi_stage.cpp new file mode 100644 index 0000000000..f15e514b34 --- /dev/null +++ b/src/common/pipeline/mpi_stage.cpp @@ -0,0 +1,236 @@ +//*************************************************************************** +//* Copyright (c) 2018 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include "pipeline/stage.hpp" +#include "pipeline/mpi_stage.hpp" + +#include "partask_mpi.hpp" +#include "io/binary/graph_pack.hpp" +#include "io/dataset_support/read_converter.hpp" +#include "utils/logger/log_writers.hpp" + +#include +#include +#include +#include + + +namespace { +class PhaseIdComparator { + public: + PhaseIdComparator(const char* id) { + const char* pos = strstr(id, ":"); + VERIFY(pos != NULL); + id_ = pos + 1; + } + + bool operator()(const std::unique_ptr &phase) const { + return 0 == strcmp(id_, phase->id()); + } + + private: + const char* id_; +}; +} + +namespace spades { + +void MPICompositeStageBase::run(graph_pack::GraphPack& gp, + const char* started_from) { + // The logic here is as follows. By this time StageManager already called + // load() function of the Stage itself. Therefore we only need to do + // storage-related things (if any) and therefore just call the init() + // function. Phases are supposed only to load the differences. 
+ VERIFY(parent_); + init(gp, started_from); + auto start_phase = phases_.begin(); + if (started_from && + strstr(started_from, ":") && + started_from == strstr(started_from, id())) { + start_phase = std::find_if(phases_.begin(), phases_.end(), PhaseIdComparator(started_from)); + if (start_phase == phases_.end()) { + ERROR("Invalid start stage / phase combination specified: " << started_from); + exit(-1); + } + if (start_phase != phases_.begin()) { + PhaseBase * prev_phase = std::prev(start_phase)->get(); + std::string composite_id(id()); + composite_id += ":"; + composite_id += prev_phase->id(); + prev_phase->load(gp, parent_->saves_policy().SavesPath(), composite_id.c_str()); + } + } + + + // Whether the previous phase was parallel. If this is the first phase then + // assume that the previous was parallel for the sake of simplicity of the + // implementation. + bool pparallel = true; + for (auto et = phases_.end(); start_phase != et; ++start_phase) { + PhaseBase *phase = start_phase->get(); + bool cparallel = phase->distributed(); + + if (cparallel) { + if (!pparallel) { + partask::critical_ordered([this] { + if (worker()) { + io::ConvertIfNeeded(cfg::get_writable().ds.reads, cfg::get().max_threads); + } + }); + INFO("Syncing world for MPI parallel section"); + const size_t deadbeef = 0xDEADBEEF; + if (master()) { + partask::OutputMPIStreamBcast s(0); + io::binary::FullPackIO().BinWrite(s, gp); + io::binary::BinWrite(s, deadbeef); + debruijn_graph::config::write_lib_data(s); + io::binary::BinWrite(s, deadbeef); + } else { + partask::InputMPIStreamBcast s(0); + io::binary::FullPackIO().BinRead(s, gp); + size_t db; + io::binary::BinRead(s, db); + VERIFY(db == deadbeef); + debruijn_graph::config::load_lib_data(s); + io::binary::BinRead(s, db); + VERIFY(db == deadbeef); + } + INFO("World synced"); + } + INFO("MPI PROCEDURE == " << phase->name() << (master() ? " (master)" : " (worker)")); + phase->run(gp, started_from); + + // Do saves only on master node + if (parent_->saves_policy().EnabledCheckpoints(id()) && master()) { + std::string composite_id(id()); + composite_id += ":"; + composite_id += phase->id(); + + phase->save(gp, parent_->saves_policy().SavesPath(), composite_id.c_str()); + } + } else { + if (master()) { + INFO("PROCEDURE == " << phase->name()); + phase->run(gp, started_from); + if (parent_->saves_policy().EnabledCheckpoints(id())) { + std::string composite_id(id()); + composite_id += ":"; + composite_id += phase->id(); + + phase->save(gp, parent_->saves_policy().SavesPath(), composite_id.c_str()); + } + } else { + INFO("PROCEDURE == " << phase->name() << " (skipped on worker)"); + } + } + + pparallel = cparallel; + } + + fini(gp); +} + +MPIStageManager::MPIStageManager(SavesPolicy policy) + : StageManager(policy), world_size_(1), rank_(0), first_(false) { + int initialized = 0; + MPI_Initialized(&initialized); + VERIFY(initialized); + if (!initialized) { + int provided; + MPI_Init_thread(nullptr, nullptr, MPI_THREAD_FUNNELED, &provided); + if (provided < MPI_THREAD_FUNNELED) { + FATAL_ERROR("Used MPI implementation failed to provide MPI_THREAD_FUNNELED thread support level"); + } + first_ = true; + } + + MPI_Comm_size(MPI_COMM_WORLD, &world_size_); + MPI_Comm_rank(MPI_COMM_WORLD, &rank_); + + INFO("MPI communications established, world size: " << world_size_ << ", current rank: " << rank_ << (master() ? " (master)" : " (worker)")); +} + +MPIStageManager::~MPIStageManager() { + INFO("MPI communications stopped" << (master() ? 
" (master)" : " (worker)")); + if (first_) + MPI_Finalize(); +} + +void MPIStageManager::run(graph_pack::GraphPack& g, + const char* start_from) { + auto start_stage = prepare_run(g, start_from); + const auto& saves_policy = this->saves_policy(); + + // Whether the previous stage was parallel. If this is the first stage then + // assume that the previous was parallel for the sake of simplicity of the + // implementation. + bool pparallel = true; + for (; start_stage != stages().end(); ++start_stage) { + AssemblyStage *stage = start_stage->get(); + bool cparallel = stage->distributed(); + + if (cparallel) { + if (!pparallel) { + partask::critical_ordered([this] { + if (worker()) { + io::ConvertIfNeeded(cfg::get_writable().ds.reads, cfg::get().max_threads); + } + }); + INFO("Syncing world for MPI parallel section"); + const size_t deadbeef = 0xDEADBEEF; + if (master()) { + partask::OutputMPIStreamBcast s(0); + io::binary::FullPackIO().BinWrite(s, g); + io::binary::BinWrite(s, deadbeef); + debruijn_graph::config::write_lib_data(s); + io::binary::BinWrite(s, deadbeef); + } else { + partask::InputMPIStreamBcast s(0); + io::binary::FullPackIO().BinRead(s, g); + size_t db; + io::binary::BinRead(s, db); + VERIFY_MSG(db == deadbeef, "Values " << db << " " << deadbeef); + debruijn_graph::config::load_lib_data(s); + io::binary::BinRead(s, db); + VERIFY(db == deadbeef); + } + INFO("World synced"); + } + INFO("MPI STAGE == " << stage->name() << (master() ? " (master)" : " (worker)")); + stage->prepare(g, start_from); + stage->run(g, start_from); + + // Do saves only on master node + if (saves_policy.EnabledCheckpoints(stage->id()) && master()) + stage->save(g, saves_policy.SavesPath()); + } else { + if (master()) { + INFO("STAGE == " << stage->name()); + stage->prepare(g, start_from); + stage->run(g, start_from); + if (saves_policy.EnabledCheckpoints(stage->id())) { + auto prev_saves = saves_policy.GetLastCheckpoint(); + stage->save(g, saves_policy.SavesPath()); + saves_policy.UpdateCheckpoint(stage->id()); + if (!prev_saves.empty() && saves_policy.RemovePreviousCheckpoint()) { + remove_all(saves_policy.SavesPath() / prev_saves); + } + } + } else { + INFO("STAGE == " << stage->name() << " (skipped on worker)"); + } + } + + if (cparallel || !stage->constant()) { + pparallel = cparallel; + } + } +} + +bool MPIAssemblyStage::master() const { return static_cast(parent_)->master(); } +bool MPIAssemblyStage::worker() const { return static_cast(parent_)->worker(); } + +} // namespace spades diff --git a/src/common/pipeline/mpi_stage.hpp b/src/common/pipeline/mpi_stage.hpp new file mode 100644 index 0000000000..cffd95cf5d --- /dev/null +++ b/src/common/pipeline/mpi_stage.hpp @@ -0,0 +1,157 @@ +//*************************************************************************** +//* Copyright (c) 2018 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. 
+//*************************************************************************** + +#pragma once + +#include "stage.hpp" + +#include +#include +#include + +namespace spades { + +class MPIAssemblyStage; + +class MPIAssemblyStage : public AssemblyStage { +public: + using AssemblyStage::AssemblyStage; + + bool master() const; + bool worker() const; + bool distributed() const override { return true; } +}; + +class MPICompositeStageBase : public MPIAssemblyStage { +public: + class PhaseBase : public MPIAssemblyStage { + public: + PhaseBase(const char *name, const char *id) + : MPIAssemblyStage(name, id), parent_stage_(nullptr) { } + + bool distributed() const override { return false; } + bool master() const { return parent_stage_->master(); } + bool worker() const { return parent_stage_->worker(); } + protected: + MPICompositeStageBase *parent_stage_; + + friend class MPICompositeStageBase; + }; + + MPICompositeStageBase(const char *name, const char *id) + : MPIAssemblyStage(name, id) { } + + MPICompositeStageBase &add(PhaseBase *phase) { + phases_.push_back(std::unique_ptr(phase)); + phase->parent_stage_ = this; + + return *this; + } + + template + MPICompositeStageBase &add(Args&&... args) { + phases_.push_back(std::unique_ptr(new Phase(std::forward(args)...))); + phases_.back()->parent_stage_ = this; + + return *this; + } + + const std::vector >& phases() const { + return phases_; + } + + virtual void init(graph_pack::GraphPack &, const char * = nullptr) = 0; + virtual void fini(graph_pack::GraphPack &) = 0; + void run(graph_pack::GraphPack &gp, const char * = nullptr); + +private: + std::vector > phases_; +}; + +template +class MPICompositeStageWithStorage : public MPICompositeStageBase { +public: + class Phase : public PhaseBase { + public: + Phase(const char *name, const char *id) + : PhaseBase(name, id) { } + + MPICompositeStageWithStorage *parent() { return static_cast *>(parent_stage_); } + const MPICompositeStageWithStorage *parent() const { return static_cast *>(parent_stage_); } + + Storage &storage() { return parent()->storage(); } + const Storage &storage() const { return parent()->storage(); } + }; + + MPICompositeStageWithStorage(const char *name, const char *id) + : MPICompositeStageBase(name, id) { } + + void init(graph_pack::GraphPack &, const char * = nullptr) override {}; + void fini(graph_pack::GraphPack &) override {}; + + virtual Storage &storage() = 0; + virtual const Storage &storage() const = 0; +}; + +// FIXME: Make storage a policy +template +class MPICompositeStage : public MPICompositeStageWithStorage { +public: + MPICompositeStage(const char *name, const char *id) + : MPICompositeStageWithStorage(name, id) { } + + Storage &storage() override { return storage_; } + const Storage &storage() const override { return storage_; } + +private: + Storage storage_; +}; + +template +class MPICompositeStageDeferred : public MPICompositeStageWithStorage { +public: + MPICompositeStageDeferred(const char *name, const char *id) + : MPICompositeStageWithStorage(name, id) { } + + Storage &storage() override { return *storage_; } + const Storage &storage() const override { return *storage_; } + +protected: + bool has_storage() const { return (bool)storage_; } + + template void init_storage(Args&&... args) { + storage_.reset(new Storage(std::forward(args)...)); + } + void reset_storage() { + storage_.reset(); + } + +private: + // std::optional would be better, but it requires complete Storage type at + // this point. 
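+    // (unique_ptr, unlike optional, does not require Storage to be complete
+    // at the point of this declaration.)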
+ std::unique_ptr storage_; +}; + +class MPIStageManager : public StageManager { +public: + MPIStageManager(SavesPolicy policy = SavesPolicy()); + ~MPIStageManager(); + + void run(graph_pack::GraphPack &g, + const char *start_from = nullptr); + + bool master() const { return rank_ == 0; } + bool worker() const { return rank_ != 0; } + +private: + int world_size_; + int rank_; + bool first_; + + DECL_LOGGER("MPIStageManager"); +}; + +} // namespace spades diff --git a/src/common/pipeline/stage.cpp b/src/common/pipeline/stage.cpp index 0741cba66f..619f1d0589 100644 --- a/src/common/pipeline/stage.cpp +++ b/src/common/pipeline/stage.cpp @@ -9,6 +9,7 @@ #include "stage.hpp" #include "graph_pack_helpers.h" +#include "partask_mpi.hpp" #include "io/binary/graph_pack.hpp" #include "io/dataset_support/read_converter.hpp" @@ -35,8 +36,8 @@ void AssemblyStage::load(graph_pack::GraphPack& gp, io::binary::FullPackIO().Load(p, gp); debruijn_graph::config::load_lib_data(p); - io::ConvertIfNeeded(cfg::get_writable().ds.reads, cfg::get().max_threads); - + // FIXME: Should not be here + partask::critical_ordered([] { io::ConvertIfNeeded(cfg::get_writable().ds.reads, cfg::get().max_threads); }); } @@ -62,7 +63,7 @@ class StageIdComparator { len_ = (pos != NULL ? pos - id : strlen(id)); } - bool operator()(const std::unique_ptr &stage) const { + bool operator()(const std::unique_ptr &stage) const { const char* sid = stage->id(); return (0 == strncmp(id_, sid, len_) && sid[len_] == 0); } @@ -80,13 +81,17 @@ class PhaseIdComparator { id_ = pos + 1; } - bool operator()(const std::unique_ptr &phase) const { + bool operator()(const std::unique_ptr &phase) const { return 0 == strcmp(id_, phase->id()); } private: const char* id_; }; +} + + +namespace spades { void CompositeStageBase::run(graph_pack::GraphPack& gp, const char* started_from) { @@ -144,8 +149,9 @@ void AssemblyStage::prepare(graph_pack::GraphPack& g, PrepareForStage(g, stage); } -void StageManager::run(graph_pack::GraphPack& g, - const char* start_from) { +std::vector>::iterator +StageManager::prepare_run(graph_pack::GraphPack& g, + const char* start_from) { auto start_stage = stages_.begin(); if (start_from) { if (strcmp(start_from, "last") == 0) { @@ -155,7 +161,7 @@ void StageManager::run(graph_pack::GraphPack& g, auto last_stage = std::find_if(stages_.begin(), stages_.end(), StageIdComparator(last_saves.c_str())); if (last_stage == stages_.end()) { WARN("Nothing to continue"); - return; + // return; FIXME: check it out } start_stage = std::next(last_stage); } else { @@ -183,6 +189,13 @@ void StageManager::run(graph_pack::GraphPack& g, } } + return start_stage; +} + +void StageManager::run(graph_pack::GraphPack& g, + const char* start_from) { + auto start_stage = prepare_run(g, start_from); + for (; start_stage != stages_.end(); ++start_stage) { AssemblyStage *stage = start_stage->get(); @@ -206,5 +219,4 @@ void StageManager::run(graph_pack::GraphPack& g, } } } - } diff --git a/src/common/pipeline/stage.hpp b/src/common/pipeline/stage.hpp index 46c5035ee6..cd7f1c9e2e 100644 --- a/src/common/pipeline/stage.hpp +++ b/src/common/pipeline/stage.hpp @@ -42,6 +42,8 @@ class AssemblyStage { const char *prefix = nullptr) const; void prepare(graph_pack::GraphPack &, const char *stage_name, const char *started_from = nullptr); virtual void run(graph_pack::GraphPack &, const char *started_from = nullptr) = 0; + virtual bool distributed() const { return false; } + virtual bool constant() const { return false; } private: const char *name_; @@ -88,6 +90,10 @@ 
class CompositeStageBase : public AssemblyStage { virtual void fini(graph_pack::GraphPack &) = 0; void run(graph_pack::GraphPack &gp, const char * = nullptr); + const std::vector >& phases() const { + return phases_; + } + private: std::vector > phases_; }; @@ -215,15 +221,16 @@ class StageManager { StageManager &add(AssemblyStage *stage) { stages_.push_back(std::unique_ptr(stage)); - stages_.back()->parent_ = this; + acquire(stage); return *this; } template StageManager &add(Args&&... args) { - stages_.push_back(std::unique_ptr(new Stage(std::forward(args)...))); - stages_.back()->parent_ = this; + auto *stage = new Stage(std::forward(args)...); + stages_.push_back(std::unique_ptr(stage)); + acquire(stage); return *this; } @@ -235,6 +242,22 @@ class StageManager { return saves_policy_; } +protected: + void acquire(AssemblyStage *stage) { + stage->parent_ = this; + } + + std::vector > &stages() { + return stages_; + } + + const std::vector > &stages() const { + return stages_; + } + + std::vector>::iterator prepare_run(graph_pack::GraphPack& g, + const char *start_from); + private: using Stages = std::vector >; diff --git a/src/projects/spades/CMakeLists.txt b/src/projects/spades/CMakeLists.txt index 137e0d5728..1cb8c7185c 100644 --- a/src/projects/spades/CMakeLists.txt +++ b/src/projects/spades/CMakeLists.txt @@ -32,8 +32,7 @@ set(SPADES_SRC pipeline.cpp main.cpp series_analysis.cpp ../mts/contig_abundance add_executable(spades-core ${SPADES_SRC}) -# FIXME: Temporary -target_link_libraries(spades-core spades-stages graphio common_modules ${COMMON_LIBRARIES} ${MPI_LIBRARIES}) +target_link_libraries(spades-core spades-stages graphio common_modules ${COMMON_LIBRARIES}) if (SPADES_STATIC_BUILD) set_target_properties(spades-core PROPERTIES LINK_SEARCH_END_STATIC 1) From 22fd17f60ff72133f8060cf626682a0ec62ace04 Mon Sep 17 00:00:00 2001 From: Alexander Shlemov Date: Wed, 7 Nov 2018 16:34:00 +0300 Subject: [PATCH 010/102] Add TestMPI stage --- src/common/stages/CMakeLists.txt | 8 ++- src/common/stages/test_mpi.cpp | 106 +++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 2 deletions(-) create mode 100644 src/common/stages/test_mpi.cpp diff --git a/src/common/stages/CMakeLists.txt b/src/common/stages/CMakeLists.txt index 5f01e0e201..69a7483a59 100644 --- a/src/common/stages/CMakeLists.txt +++ b/src/common/stages/CMakeLists.txt @@ -8,6 +8,10 @@ project(stages CXX) -add_library(stages STATIC - read_conversion.cpp construction.cpp simplification.cpp ss_edge_split.cpp genomic_info_filler.cpp) +set(stages_src read_conversion.cpp construction.cpp simplification.cpp ss_edge_split.cpp genomic_info_filler.cpp) +if (MPI_FOUND) + set(stages_src ${stages_src} test_mpi.cpp) +endif() + +add_library(stages STATIC ${stages_src}) target_link_libraries(stages coverage_model pipeline gqf input) diff --git a/src/common/stages/test_mpi.cpp b/src/common/stages/test_mpi.cpp new file mode 100644 index 0000000000..120b20a98f --- /dev/null +++ b/src/common/stages/test_mpi.cpp @@ -0,0 +1,106 @@ +//*************************************************************************** +//* Copyright (c) 2018 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. 
+//*************************************************************************** + +#include "pipeline/partask_mpi.hpp" +#include "pipeline/mpi_stage.hpp" +#include "pipeline/stage.hpp" + +#include +#include +#include +#include +#include + +namespace debruijn_graph { + +class ArraySum { +public: + ArraySum(const std::string &message = "") : message_{message} {}; + ArraySum(const ArraySum&) = delete; + ArraySum(ArraySum&&) = default; + + std::string message_; + ArraySum(std::istream &is) { std::getline(is, message_); } + + std::ostream &serialize(std::ostream &os) const { return os << message_; } + + template + auto make_splitter(size_t n, const Data &data) { + size_t N = data.size(); + auto splitter = [N, n, i = size_t(0)](std::ostream &os, size_t /*node*/) mutable -> bool { + if (i == n) return false; + size_t begin = i * N / n; + size_t end = (i + 1) * N / n; + ++i; + os << begin << " " << end << " "; + return true; + }; + + return splitter; + }; + + template + void process(std::istream &is, std::ostream &os, const Data &data) { + INFO("MESSAGE: " << message_); + long long int sum = 0; +#pragma omp parallel reduction(+ : sum) + while (true) { + size_t begin, end; + bool exit = false; +#pragma omp critical + { + if (is.peek() == EOF || !(is >> begin >> end)) { + exit = true; + } else { + DEBUG("Extracted range: " << begin << " " << end); + } + } + if (exit) break; + for (size_t i = begin; i < end; ++i) { + sum += data[i]; + } + } + INFO("Computed sum: " << sum); + os << sum; + } + + auto merge(const std::vector &piss, ...) { + long long int sum = 0; + for (auto &pis : piss) { + long long int local_sum; + *pis >> local_sum; + sum += local_sum; + } + + return sum; + }; +}; + +class TestMPI : public spades::MPIAssemblyStage { +public: + TestMPI() : MPIAssemblyStage("Test MPI", "test_mpi") {} + + void run(graph_pack::GraphPack& /*gp*/, const char *) override { + INFO("TestMPI started"); + partask::TaskRegistry treg; + + const size_t N = 100000; + std::array data; + std::iota(data.begin(), data.end(), 1); + + auto job = treg.add(std::cref(data)); + treg.listen(); + + if (treg.master()) { + auto res = job("Message1"); + INFO("JOB RESULT: " << res); + } + + treg.stop_listening(); + } +}; + +} // namespace debruijn_graph From b1485edaf7ab6f1f7c438eb446b2a0d25cc54593 Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Mon, 28 Sep 2020 13:25:57 +0300 Subject: [PATCH 011/102] Separate out SPAdes and hpcSPAdes binaries --- src/projects/spades/CMakeLists.txt | 13 ++- src/projects/spades/main_mpi.cpp | 143 +++++++++++++++++++++++++++++ 2 files changed, 151 insertions(+), 5 deletions(-) create mode 100644 src/projects/spades/main_mpi.cpp diff --git a/src/projects/spades/CMakeLists.txt b/src/projects/spades/CMakeLists.txt index 1cb8c7185c..c31c42b0b0 100644 --- a/src/projects/spades/CMakeLists.txt +++ b/src/projects/spades/CMakeLists.txt @@ -28,11 +28,12 @@ add_library(spades-stages STATIC target_link_libraries(spades-stages hmmercpp spoa easel) -set(SPADES_SRC pipeline.cpp main.cpp series_analysis.cpp ../mts/contig_abundance.cpp) +set(SPADES_SRC pipeline.cpp series_analysis.cpp ../mts/contig_abundance.cpp) +set(SPADES_LIB spades-stages graphio common_modules ${COMMON_LIBRARIES}) add_executable(spades-core + main.cpp ${SPADES_SRC}) - -target_link_libraries(spades-core spades-stages graphio common_modules ${COMMON_LIBRARIES}) +target_link_libraries(spades-core ${SPADES_LIB}) if (SPADES_STATIC_BUILD) set_target_properties(spades-core PROPERTIES LINK_SEARCH_END_STATIC 1) @@ -44,9 +45,11 @@ install(TARGETS 
spades-core COMPONENT spades) if (MPI_FOUND) - add_executable(spades-hpc ${SPADES_SRC}) + add_executable(spades-hpc + main_mpi.cpp + ${SPADES_SRC}) - target_link_libraries(spades-hpc spades-stages graphio common_modules ${COMMON_LIBRARIES} ${MPI_LIBRARIES}) + target_link_libraries(spades-hpc ${SPADES_LIB} ${MPI_LIBRARIES}) set_target_properties(spades-hpc PROPERTIES LINK_FLAGS "${MPI_LINK_FLAGS}") if (SPADES_STATIC_BUILD) diff --git a/src/projects/spades/main_mpi.cpp b/src/projects/spades/main_mpi.cpp new file mode 100644 index 0000000000..8c73d4a2d6 --- /dev/null +++ b/src/projects/spades/main_mpi.cpp @@ -0,0 +1,143 @@ +//*************************************************************************** +//* Copyright (c) 2023-2024 SPAdes team +//* Copyright (c) 2015-2022 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include "configs/config_struct.hpp" +#include "pipeline/partask_mpi.hpp" + +#include "utils/logger/mpi_log_writers.hpp" +#include "utils/memory_limit.hpp" +#include "utils/segfault_handler.hpp" +#include "utils/perf/timetracer.hpp" + +#include "k_range.hpp" +#include "version.hpp" + +namespace spades { +void assemble_genome(); +} + +struct TimeTracerRAII { + TimeTracerRAII(llvm::StringRef program_name, + unsigned granularity = 500, + const std::string &prefix = "", const std::string &suffix = "") { + time_trace_file_ = prefix + "spades_time_trace_" + suffix + ".json"; + llvm::timeTraceProfilerInitialize(granularity, program_name); + } + ~TimeTracerRAII() { + if (auto E = llvm::timeTraceProfilerWrite(time_trace_file_, "spades-core")) { + handleAllErrors(std::move(E), + [&](const llvm::StringError &SE) { + ERROR("" << SE.getMessage() << "\n"); + }); + return; + } else { + INFO("Time trace is written to: " << time_trace_file_); + } + llvm::timeTraceProfilerCleanup(); + } + + std::string time_trace_file_; +}; + +void load_config(const std::vector& cfg_fns) { + for (const auto& s : cfg_fns) { + CHECK_FATAL_ERROR(exists(s), "File " << s << " doesn't exist or can't be read!"); + } + + cfg::create_instance(cfg_fns); + + create_directory(cfg::get().output_dir); + create_directory(cfg::get().tmp_dir); + + create_directory(cfg::get().temp_bin_reads_path); +} + +void create_console_logger(const std::filesystem::path& dir, std::filesystem::path log_prop_fn) { + using namespace logging; + + if (!exists(log_prop_fn)) + log_prop_fn = dir / log_prop_fn; + + logger *lg = create_logger(exists(log_prop_fn) ? log_prop_fn : ""); + lg->add_writer(std::make_shared()); + attach_logger(lg); +} + +int main(int argc, char **argv) { + utils::perf_counter pc; + + const size_t GB = 1 << 30; + + srand(42); + srandom(42); + + bool init = partask::init(); + INFO("MPI init: " << (init ? "done" : "failed")); + + try { + using namespace debruijn_graph; + + std::filesystem::path cfg_dir = std::filesystem::path(argv[1]).parent_path(); + + std::vector cfg_fns; + for (int i = 1; i < argc; ++i) { + cfg_fns.push_back(argv[i]); + } + + // read configuration file (dataset path etc.) + load_config(cfg_fns); + + create_console_logger(cfg_dir, cfg::get().log_filename); + for (const auto& cfg_fn : cfg_fns) + INFO("Loaded config from " << cfg_fn); + + VERIFY(cfg::get().K >= runtime_k::MIN_K && cfg::get().K < runtime_k::MAX_K); + VERIFY(cfg::get().K % 2 != 0); + + utils::limit_memory(cfg::get().max_memory * GB); + + // assemble it! 
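+    // What follows: print the hpcSPAdes banner and configuration summary,
+    // optionally enable time tracing, then run the pipeline via
+    // spades::assemble_genome().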
+ START_BANNER("hpcSPAdes"); + INFO("Maximum k-mer length: " << runtime_k::MAX_K); + INFO("Assembling dataset (" << cfg::get().dataset_file << ") with K=" << cfg::get().K); + INFO("Maximum # of threads to use (adjusted due to OMP capabilities): " << cfg::get().max_threads); + std::unique_ptr traceraii; + if (cfg::get().tt.enable || cfg::get().developer_mode) { + traceraii.reset(new TimeTracerRAII(argv[0], + cfg::get().tt.granularity, + cfg::get().output_dir, std::to_string(cfg::get().K))); + INFO("Time tracing is enabled"); + } + + TIME_TRACE_SCOPE("spades"); + spades::assemble_genome(); + } catch (std::bad_alloc const &e) { + std::cerr << "Not enough memory to run SPAdes. " << e.what() << std::endl; + MPI_Abort(MPI_COMM_WORLD, EINTR); + return EINTR; + } catch (std::exception const &e) { + std::cerr << "Exception caught " << e.what() << std::endl; + MPI_Abort(MPI_COMM_WORLD, EINTR); + return EINTR; + } catch (...) { + std::cerr << "Unknown exception caught " << std::endl; + MPI_Abort(MPI_COMM_WORLD, EINTR); + return EINTR; + } + + unsigned ms = (unsigned) pc.time_ms(); + unsigned secs = (ms / 1000) % 60; + unsigned mins = (ms / 1000 / 60) % 60; + unsigned hours = (ms / 1000 / 60 / 60); + INFO("Assembling time: " << hours << " hours " << mins << " minutes " << secs << " seconds"); + + // OK + int success = partask::finalize(); + VERIFY(success); + return 0; +} From 68e89b406a0ee4826b4d08b23950114fd7677abe Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Mon, 28 Sep 2020 13:37:25 +0300 Subject: [PATCH 012/102] Even better SPAdes / hpcSPAdes separation --- src/common/pipeline/mpi_stage.hpp | 2 +- src/common/pipeline/stage.hpp | 4 +-- src/projects/spades/main.cpp | 2 +- src/projects/spades/main_mpi.cpp | 4 +-- src/projects/spades/pipeline.cpp | 51 ++++++++++++++++++++----------- 5 files changed, 39 insertions(+), 24 deletions(-) diff --git a/src/common/pipeline/mpi_stage.hpp b/src/common/pipeline/mpi_stage.hpp index cffd95cf5d..b758cdc38a 100644 --- a/src/common/pipeline/mpi_stage.hpp +++ b/src/common/pipeline/mpi_stage.hpp @@ -141,7 +141,7 @@ class MPIStageManager : public StageManager { ~MPIStageManager(); void run(graph_pack::GraphPack &g, - const char *start_from = nullptr); + const char *start_from = nullptr) override; bool master() const { return rank_ == 0; } bool worker() const { return rank_ != 0; } diff --git a/src/common/pipeline/stage.hpp b/src/common/pipeline/stage.hpp index cd7f1c9e2e..817f736d25 100644 --- a/src/common/pipeline/stage.hpp +++ b/src/common/pipeline/stage.hpp @@ -235,8 +235,8 @@ class StageManager { return *this; } - void run(graph_pack::GraphPack &g, - const char *start_from = nullptr); + virtual void run(graph_pack::GraphPack &g, + const char *start_from = nullptr); const SavesPolicy &saves_policy() const { return saves_policy_; diff --git a/src/projects/spades/main.cpp b/src/projects/spades/main.cpp index b79ce686d6..027b441f00 100644 --- a/src/projects/spades/main.cpp +++ b/src/projects/spades/main.cpp @@ -17,7 +17,7 @@ #include "version.hpp" namespace spades { -void assemble_genome(); +void assemble_genome(bool mpi = false); } struct TimeTracerRAII { diff --git a/src/projects/spades/main_mpi.cpp b/src/projects/spades/main_mpi.cpp index 8c73d4a2d6..c85379b614 100644 --- a/src/projects/spades/main_mpi.cpp +++ b/src/projects/spades/main_mpi.cpp @@ -18,7 +18,7 @@ #include "version.hpp" namespace spades { -void assemble_genome(); +void assemble_genome(bool mpi); } struct TimeTracerRAII { @@ -115,7 +115,7 @@ int main(int argc, char **argv) { } 
TIME_TRACE_SCOPE("spades"); - spades::assemble_genome(); + spades::assemble_genome(true); } catch (std::bad_alloc const &e) { std::cerr << "Not enough memory to run SPAdes. " << e.what() << std::endl; MPI_Abort(MPI_COMM_WORLD, EINTR); diff --git a/src/projects/spades/pipeline.cpp b/src/projects/spades/pipeline.cpp index 381e6eea46..62d4e87dfa 100644 --- a/src/projects/spades/pipeline.cpp +++ b/src/projects/spades/pipeline.cpp @@ -22,6 +22,7 @@ #include "library/library.hpp" #include "pipeline/graph_pack.hpp" #include "pipeline/stage.hpp" +#include "pipeline/mpi_stage.hpp" #include "alignment/kmer_mapper.hpp" #include "wastewater_disentangle.hpp" @@ -205,7 +206,15 @@ static void AddRepeatResolutionStages(StageManager &SPAdes) { .add(); } -void assemble_genome() { +class FakeStageOnlyforDataSyncDoesNothingElse : public spades::AssemblyStage { +public: + FakeStageOnlyforDataSyncDoesNothingElse() + : AssemblyStage("Fake Stage Only for Data Sync", "fake_stage_sync_data") { } + + void run(graph_pack::GraphPack&, const char *) {} +}; + +void assemble_genome(bool mpi = false) { using namespace debruijn_graph::config; pipeline_type mode = cfg::get().mode; @@ -224,10 +233,16 @@ void assemble_genome() { INFO("Starting from stage: " << cfg::get().entry_point); - StageManager SPAdes(SavesPolicy(cfg::get().checkpoints, - cfg::get().output_saves, cfg::get().load_from)); + std::unique_ptr SPAdes; + SavesPolicy saves_policy(cfg::get().checkpoints, + cfg::get().output_saves, cfg::get().load_from); + if (mpi) { + SPAdes.reset(new MPIStageManager(saves_policy)); + } else { + SPAdes.reset(new StageManager(saves_policy)); + } - if (SPAdes.saves_policy().EnabledAnyCheckpoint()) + if (SPAdes->saves_policy().EnabledAnyCheckpoint()) create_directory(cfg::get().output_saves); bool two_step_rr = cfg::get().two_step_rr && cfg::get().rr_enable; @@ -247,44 +262,44 @@ void assemble_genome() { } // Build the pipeline - SPAdes.add(); + SPAdes->add(); if (!AssemblyGraphPresent()) { - AddConstructionStages(SPAdes); + AddConstructionStages(*SPAdes); if (cfg::get().sewage) - SPAdes.add(); + SPAdes->add(); - AddSimplificationStages(SPAdes); + AddSimplificationStages(*SPAdes); - SPAdes.add(cfg::get().main_iteration ? - GetBeforeRROutput() : GetNonFinalStageOutput()); + SPAdes->add(cfg::get().main_iteration ? + GetBeforeRROutput() : GetNonFinalStageOutput()); } else { - SPAdes.add(); + SPAdes->add(); } if (cfg::get().main_iteration) { // Not metaextrachromosomal! if (mode == pipeline_type::plasmid) - SPAdes.add(); + SPAdes->add(); if (HybridLibrariesPresent()) - SPAdes.add(); + SPAdes->add(); // No graph modification allowed after HybridLibrariesAligning stage! 
if (cfg::get().rr_enable) - AddRepeatResolutionStages(SPAdes); + AddRepeatResolutionStages(*SPAdes); if (mode == pipeline_type::metaextrachromosomal) - AddMetaplasmidStages(SPAdes); + AddMetaplasmidStages(*SPAdes); else - SPAdes.add(GetFinalStageOutput()); + SPAdes->add(GetFinalStageOutput()); if (cfg::get().hm) - SPAdes.add(); + SPAdes->add(); } - SPAdes.run(conj_gp, cfg::get().entry_point.c_str()); + SPAdes->run(conj_gp, cfg::get().entry_point.c_str()); // For informing spades.py about estimated params write_lib_data(cfg::get().output_dir / "final"); From 5b8f287a439e3b600bca629b51444534820bde68 Mon Sep 17 00:00:00 2001 From: Olga Kunyavskaya Date: Wed, 7 Oct 2020 13:17:34 +0300 Subject: [PATCH 013/102] read_converter: changable chunk_num --- .../io/dataset_support/read_converter.cpp | 21 ++++++++++--------- .../io/dataset_support/read_converter.hpp | 9 +++++--- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/common/io/dataset_support/read_converter.cpp b/src/common/io/dataset_support/read_converter.cpp index a29cbf48fb..499e7e75b5 100644 --- a/src/common/io/dataset_support/read_converter.cpp +++ b/src/common/io/dataset_support/read_converter.cpp @@ -174,7 +174,7 @@ void ConvertIfNeeded(DataSet &data, unsigned nthreads, pool = std::make_unique(nthreads); for (auto &lib : data) { - if (!ReadConverter::LoadLibIfExists(lib)) + if (lib.data().binary_reads_info.bin_reads_info_file != "" && !ReadConverter::LoadLibIfExists(lib)) ReadConverter::ConvertToBinary(lib, pool.get(), flags, tagger); } } @@ -182,13 +182,14 @@ void ConvertIfNeeded(DataSet &data, unsigned nthreads, BinaryPairedStreams paired_binary_readers(SequencingLibraryT &lib, bool followed_by_rc, size_t insert_size, - bool include_merged) { + bool include_merged, + size_t chunk_num) { const auto& data = lib.data(); CHECK_FATAL_ERROR(data.binary_reads_info.binary_converted, "Lib was not converted to binary, cannot produce binary stream"); ReadStreamList paired_streams; - const size_t n = data.binary_reads_info.chunk_num; + const size_t n = chunk_num ? chunk_num : data.binary_reads_info.chunk_num; for (size_t i = 0; i < n; ++i) { ReadStream stream{BinaryFilePairedStream(data.binary_reads_info.paired_read_prefix, @@ -212,13 +213,14 @@ BinaryPairedStreams paired_binary_readers(SequencingLibraryT &lib, BinarySingleStreams single_binary_readers(SequencingLibraryT &lib, bool followed_by_rc, - bool including_paired_and_merged) { + bool including_paired_and_merged, + size_t chunk_num) { const auto& data = lib.data(); CHECK_FATAL_ERROR(data.binary_reads_info.binary_converted, "Lib was not converted to binary, cannot produce binary stream"); BinarySingleStreams single_streams; - const size_t n = data.binary_reads_info.chunk_num; + const size_t n = chunk_num ? chunk_num : data.binary_reads_info.chunk_num; for (size_t i = 0; i < n; ++i) single_streams.push_back(BinaryFileSingleStream(data.binary_reads_info.single_read_prefix, @@ -249,16 +251,15 @@ BinarySingleStreams single_binary_readers_for_libs(DataSet& dataset_info, const std::vector& libs, bool followed_by_rc, - bool including_paired_reads) { + bool including_paired_reads, + size_t chunk_num) { VERIFY(!libs.empty()) - size_t chunk_num = dataset_info[libs.front()].data().binary_reads_info.chunk_num; + chunk_num = chunk_num ? 
chunk_num : dataset_info[libs.front()].data().binary_reads_info.chunk_num; std::vector streams(chunk_num); for (size_t i = 0; i < libs.size(); ++i) { - VERIFY_MSG(chunk_num == dataset_info[libs[i]].data().binary_reads_info.chunk_num, - "Cannot create stream for multiple libraries with different chunk_num") BinarySingleStreams lib_streams = single_binary_readers(dataset_info[libs[i]], - followed_by_rc, including_paired_reads); + followed_by_rc, including_paired_reads, chunk_num); for (size_t j = 0; j < chunk_num; ++j) streams[j].push_back(std::move(lib_streams[j])); diff --git a/src/common/io/dataset_support/read_converter.hpp b/src/common/io/dataset_support/read_converter.hpp index cf7e48e7b1..24e55a6ae7 100644 --- a/src/common/io/dataset_support/read_converter.hpp +++ b/src/common/io/dataset_support/read_converter.hpp @@ -55,14 +55,17 @@ void ConvertIfNeeded(DataSet &data, unsigned nthreads = 1, BinaryPairedStreams paired_binary_readers(SequencingLibraryT &lib, bool followed_by_rc, size_t insert_size, - bool include_merged); + bool include_merged, + size_t chunk_num = 0); BinarySingleStreams single_binary_readers(SequencingLibraryT &lib, bool followed_by_rc, - bool including_paired_and_merged); + bool including_paired_and_merged, + size_t chunk_num = 0); BinarySingleStreams single_binary_readers_for_libs(DataSet& dataset_info, const std::vector& libs, bool followed_by_rc = true, - bool including_paired_reads = true); + bool including_paired_reads = true, + size_t chunk_num = 0); } From 903dff431967b2414bd1044d2559ef9558d9d828 Mon Sep 17 00:00:00 2001 From: Olga Kunyavskaya Date: Mon, 2 Nov 2020 15:46:03 +0300 Subject: [PATCH 014/102] MPI: sequence mapper notifier --- .../alignment/sequence_mapper_notifier.hpp | 93 ++++++++++++++++++- 1 file changed, 90 insertions(+), 3 deletions(-) diff --git a/src/common/alignment/sequence_mapper_notifier.hpp b/src/common/alignment/sequence_mapper_notifier.hpp index 7e190c57ad..d98719662f 100644 --- a/src/common/alignment/sequence_mapper_notifier.hpp +++ b/src/common/alignment/sequence_mapper_notifier.hpp @@ -16,6 +16,7 @@ #include "io/reads/paired_read.hpp" #include "io/reads/read_stream_vector.hpp" #include "utils/perf/timetracer.hpp" +#include "pipeline/partask_mpi.hpp" #include #include @@ -36,10 +37,38 @@ class SequenceMapperListener { virtual void ProcessSingleRead(size_t /* thread_index */, const io::SingleReadSeq& /* r */, const omnigraph::MappingPath& /* read */) {} virtual void MergeBuffer(size_t /* thread_index */) {} - + + virtual void Serialize(std::ostream&) const {} + virtual void Deserialize(std::istream&) {} + + virtual void MergeFromStream(std::istream&) {} + virtual ~SequenceMapperListener() {} }; +inline void PyramidMergeMPI(SequenceMapperListener &listener) { + size_t mpi_size = partask::world_size(); + size_t mpi_rank = partask::world_rank(); + const size_t deadbeef = 0xDEADBEEF; + + for (size_t step = 1; step < mpi_size; step *= 2) { + if ((mpi_rank % (2*step) == 0) && (mpi_rank + step < mpi_size)) { + partask::InputMPIStream is(mpi_rank + step); + size_t sz; + io::binary::BinRead(is, sz); + VERIFY_MSG(sz == deadbeef, "Listener type: " << typeid(listener).name()); + listener.MergeFromStream(is); + io::binary::BinRead(is, sz); + VERIFY_MSG(sz == deadbeef, "Listener type: " << typeid(listener).name()); + } else if (mpi_rank % (2*step) == step) { + partask::OutputMPIStream os(mpi_rank - step); + io::binary::BinWrite(os, deadbeef); + listener.Serialize(os); + io::binary::BinWrite(os, deadbeef); + } + } +} + class SequenceMapperNotifier 
{ static constexpr size_t BUFFER_SIZE = 200000; public: @@ -51,6 +80,63 @@ class SequenceMapperNotifier { void Subscribe(SequenceMapperListener* listener, size_t lib_index = 0); + template + void ProcessLibraryMPI(io::ReadStreamList& streams, + size_t lib_index, const SequenceMapperT& mapper, size_t threads_count = 0) { + INFO("ProcessLibraryMPI started"); + // Select streams + std::vector chunks; + size_t mpi_size = partask::world_size(); + size_t mpi_rank = partask::world_rank(); + for (size_t i = 0; i < streams.size(); ++i) { + if (i % mpi_size == mpi_rank) { + chunks.push_back(i); + } + } + INFO("Selected streams: " << chunks); + auto local_streams = partask::create_empty_stream_list(chunks.size()); + partask::swap_streams(streams, local_streams, chunks); + + // Run ProcessLibrary + INFO("Running ProcessLibrary"); + ProcessLibrary(local_streams, lib_index, mapper, threads_count); + INFO("ProcessLibrary done"); + + // Swap streams back + partask::swap_streams(streams, local_streams, chunks); + + INFO("Merging results..."); + for (const auto& listener : listeners_[lib_index]) { + INFO("Merging listener " << typeid(*listener).name()); + PyramidMergeMPI(*listener); + } + INFO("Listeners merged"); + + const size_t deadbeef = 0xDEADBEEF; + if (mpi_size > 1) { + INFO("Syncing listeners..."); + if (mpi_rank == 0) { + partask::OutputMPIStreamBcast os(0); + for (const auto& listener : listeners_[lib_index]) { + io::binary::BinWrite(os, deadbeef); + listener->Serialize(os); + io::binary::BinWrite(os, deadbeef); + } + } else { + partask::InputMPIStreamBcast is(0); + for (const auto& listener : listeners_[lib_index]) { + size_t sz; + io::binary::BinRead(is, sz); + VERIFY(sz == deadbeef); + listener->Deserialize(is); + io::binary::BinRead(is, sz); + VERIFY(sz == deadbeef); + } + } + INFO("Listeners synced"); + } + } + template void ProcessLibrary(io::ReadStreamList& streams, const SequenceMapperT& mapper, size_t threads_count = 0) { @@ -68,7 +154,7 @@ class SequenceMapperNotifier { threads_count = streams.size(); streams.reset(); - NotifyStartProcessLibrary(lib_index, threads_count); + NotifyStartProcessLibrary(lib_index, streams.size()); size_t counter = 0, n = 15; #pragma omp parallel for num_threads(threads_count) shared(counter) @@ -97,9 +183,10 @@ class SequenceMapperNotifier { counter += size; } - for (size_t i = 0; i < threads_count; ++i) + for (size_t i = 0; i < streams.size(); ++i) NotifyMergeBuffer(lib_index, i); + streams.close(); INFO("Total " << counter << " reads processed"); NotifyStopProcessLibrary(lib_index); } From 688fdcd3f2dca7974a8dc0a414d82dc067b48ad5 Mon Sep 17 00:00:00 2001 From: Olga Kunyavskaya Date: Thu, 8 Oct 2020 16:14:11 +0300 Subject: [PATCH 015/102] MPI mismatch_corrector --- src/projects/spades/mismatch_correction.cpp | 73 ++++++++++++++++++++- src/projects/spades/mismatch_correction.hpp | 5 +- 2 files changed, 75 insertions(+), 3 deletions(-) diff --git a/src/projects/spades/mismatch_correction.cpp b/src/projects/spades/mismatch_correction.cpp index 7ee1ca3db9..f72e64c54f 100644 --- a/src/projects/spades/mismatch_correction.cpp +++ b/src/projects/spades/mismatch_correction.cpp @@ -20,6 +20,11 @@ #include #include +#include "io/binary/binary.hpp" +#include "io/binary/types/phmap.hpp" +#include "io/binary/graph_pack.hpp" +#include "pipeline/partask_mpi.hpp" + template std::vector split_iterator(size_t chunks, Iter b, Iter e, size_t n) { std::vector result(chunks + 1, e); @@ -65,6 +70,14 @@ struct NuclCount { counts_[3] += other.counts_[3]; return *this; } + + void 
BinWrite(std::ostream &os) const { + io::binary::BinWrite(os, counts_); + } + + void BinRead(std::istream &is) { + io::binary::BinRead(is, counts_); + } }; struct MismatchEdgeInfo { @@ -94,6 +107,14 @@ struct MismatchEdgeInfo { public: phmap::flat_hash_map info_; + + void BinWrite(std::ostream &os) const { + io::binary::BinWrite(os, info_); + } + + void BinRead(std::istream &is) { + io::binary::BinRead(is, info_); + } }; class MismatchStatistics : public SequenceMapperListener { @@ -272,6 +293,20 @@ class MismatchStatistics : public SequenceMapperListener { Merge(statistics_buffers_[thread_index]); } + void Serialize(std::ostream &os) const override { + io::binary::BinWrite(os, statistics_); + } + + void Deserialize(std::istream &is) override { + io::binary::BinRead(is, statistics_); + } + + void MergeFromStream(std::istream &is) override { + InnerMismatchStatistics other_statistics; + io::binary::BinRead(is, other_statistics); + Merge(other_statistics); + } + const_iterator begin() const { return statistics_.begin(); } @@ -410,6 +445,30 @@ class MismatchShallNotPass { return CorrectAllEdges(statistics); } + size_t ParallelStopMismatchIterationMPI() { + INFO("Collect potential mismatches"); + MismatchStatistics statistics(gp_); + INFO("Potential mismatches collected"); + + SequenceMapperNotifier notifier(cfg::get_writable().ds.reads.lib_count()); + + auto& dataset = cfg::get_writable().ds; + + auto mapper = MapperInstance(gp_); + for (size_t i = 0; i < dataset.reads.lib_count(); ++i) { + if (!dataset.reads[i].is_mismatch_correctable()) + continue; + + notifier.Subscribe(&statistics, i); + auto &reads = dataset.reads[i]; + size_t num_readers = partask::overall_num_threads(); + auto single_streams = single_binary_readers(reads, /*followed by rc */true, /*binary*/true, num_readers); + notifier.ProcessLibraryMPI(single_streams, i, *mapper); + } + + return CorrectAllEdges(statistics); + } + public: MismatchShallNotPass(graph_pack::GraphPack &gp, double relative_threshold = 1.5) : gp_(gp), @@ -431,6 +490,18 @@ class MismatchShallNotPass { } return res; } + + size_t ParallelStopAllMismatchesMPI(size_t max_iterations = 1) { + size_t res = 0; + while (max_iterations > 0) { + size_t last = ParallelStopMismatchIterationMPI(); + res += last; + if (last == 0) + break; + max_iterations--; + } + return res; + } }; } // namespace mismatches @@ -438,7 +509,7 @@ class MismatchShallNotPass { void MismatchCorrection::run(graph_pack::GraphPack &gp, const char*) { EnsureBasicMapping(gp); size_t corrected = mismatches::MismatchShallNotPass(gp, 2). 
- ParallelStopAllMismatches(1); + ParallelStopAllMismatchesMPI(1); INFO("Corrected " << corrected << " nucleotides"); } diff --git a/src/projects/spades/mismatch_correction.hpp b/src/projects/spades/mismatch_correction.hpp index 8fdc946afe..4c75a216f5 100644 --- a/src/projects/spades/mismatch_correction.hpp +++ b/src/projects/spades/mismatch_correction.hpp @@ -9,13 +9,14 @@ #pragma once #include "pipeline/stage.hpp" +#include "pipeline/mpi_stage.hpp" namespace debruijn_graph { -class MismatchCorrection : public spades::AssemblyStage { +class MismatchCorrection : public spades::MPIAssemblyStage { public: MismatchCorrection() - : AssemblyStage("Mismatch Correction", "mismatch_correction") { } + : MPIAssemblyStage("Mismatch Correction", "mismatch_correction") { } void run(graph_pack::GraphPack &gp, const char *) override; }; From b6f8f223de1487cf5cb9cfb789b4b5d3cb609e5a Mon Sep 17 00:00:00 2001 From: Olga Kunyavskaya Date: Mon, 2 Nov 2020 15:47:31 +0300 Subject: [PATCH 016/102] MPI: Gap closer --- src/projects/spades/gap_closer.cpp | 30 +++++++++++++++++++++++++----- src/projects/spades/gap_closer.hpp | 19 ++++--------------- 2 files changed, 29 insertions(+), 20 deletions(-) diff --git a/src/projects/spades/gap_closer.cpp b/src/projects/spades/gap_closer.cpp index 822cf91007..ad9c77d25a 100644 --- a/src/projects/spades/gap_closer.cpp +++ b/src/projects/spades/gap_closer.cpp @@ -14,6 +14,8 @@ #include "modules/simplification/compressor.hpp" #include "paired_info/concurrent_pair_info_buffer.hpp" #include "pipeline/sequence_mapper_gp_api.hpp" +#include "pipeline/partask_mpi.hpp" +#include "io/binary/edge_index.hpp" #include #include @@ -36,6 +38,20 @@ class GapCloserPairedIndexFiller : public SequenceMapperListener { const size_t max_dist_to_tip_; size_t cnt_libs_to_process_ = 0; + virtual void Serialize(std::ostream& os) const override { + io::binary::BinWrite(os, paired_index_); + } + + virtual void Deserialize(std::istream& is) override { + io::binary::BinRead(is, paired_index_); + } + + virtual void MergeFromStream(std::istream& is) override { + omnigraph::de::PairedInfoIndexT remote(graph_); + io::binary::BinRead(is, remote); + paired_index_.Merge(remote); + } + void ProcessPairedRead(const MappingPath &path1, const MappingPath &path2) { for (size_t i = 0; i < path1.size(); ++i) { auto OutTipIter = out_tip_map_.find(path1[i].first); @@ -478,17 +494,21 @@ void GapClosing::run(graph_pack::GraphPack &gp, const char *) { return; } + SequenceMapperNotifier notifier(cfg::get().ds.reads.lib_count()); + size_t num_readers = partask::overall_num_threads(); + auto& dataset = cfg::get_writable().ds; for (size_t i = 0; i < dataset.reads.lib_count(); ++i) { if (dataset.reads[i].type() != io::LibraryType::PairedEnd) continue; - SequenceMapperNotifier notifier; - notifier.Subscribe(&gcpif); - io::BinaryPairedStreams paired_streams = paired_binary_readers(dataset.reads[i], false, 0, false); - notifier.ProcessLibrary(paired_streams, *gcpif.GetMapper()); - + notifier.Subscribe(&gcpif, i); + io::BinaryPairedStreams paired_streams = paired_binary_readers(dataset.reads[i], false, + 0, false, num_readers); + notifier.ProcessLibraryMPI(paired_streams, i, *gcpif.GetMapper()); + INFO("Initializing gap closer"); + g.clear_state(); // FIXME Hack-hack-hack required for uniform id distribution on master and slaves GapCloser gap_closer(g, tips_paired_idx, cfg::get().gc.minimal_intersection, cfg::get().gc.weight_threshold); gap_closer.CloseShortGaps(); diff --git a/src/projects/spades/gap_closer.hpp 
b/src/projects/spades/gap_closer.hpp index 0eb0a8225b..c01151aee7 100644 --- a/src/projects/spades/gap_closer.hpp +++ b/src/projects/spades/gap_closer.hpp @@ -5,30 +5,19 @@ //* All Rights Reserved //* See file LICENSE for details. //*************************************************************************** -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#ifndef GAP_CLOSER_HPP_ -#define GAP_CLOSER_HPP_ +#pragma once +#include "pipeline/mpi_stage.hpp" #include "pipeline/stage.hpp" namespace debruijn_graph { -class GapClosing : public spades::AssemblyStage { +class GapClosing : public spades::MPIAssemblyStage { public: GapClosing(const char* id) - : AssemblyStage("Gap Closer", id) {} + : MPIAssemblyStage("Gap Closer (parmap)", id) {} void run(graph_pack::GraphPack &gp, const char*) override; }; } - - - -#endif /* GAP_CLOSER_HPP_ */ From 0490ca29535bfb99ce9ac8a42a9bfc7ceb454895 Mon Sep 17 00:00:00 2001 From: Olga Kunyavskaya Date: Thu, 5 Nov 2020 18:44:17 +0300 Subject: [PATCH 017/102] MPI: pair_info_count --- src/common/alignment/long_read_mapper.hpp | 14 +++ .../alignment/rna/ss_coverage_filler.hpp | 18 +++ .../alignment/sequence_mapper_notifier.hpp | 13 +- src/common/paired_info/histogram.hpp | 2 + src/common/paired_info/is_counter.hpp | 61 ++++++++-- src/common/paired_info/pair_info_filler.hpp | 14 +++ src/projects/spades/gap_closer.cpp | 2 +- src/projects/spades/pair_info_count.cpp | 115 +++++++++++++++++- src/projects/spades/pair_info_count.hpp | 9 +- 9 files changed, 223 insertions(+), 25 deletions(-) diff --git a/src/common/alignment/long_read_mapper.hpp b/src/common/alignment/long_read_mapper.hpp index d938a2a45a..48aef46fbd 100644 --- a/src/common/alignment/long_read_mapper.hpp +++ b/src/common/alignment/long_read_mapper.hpp @@ -55,6 +55,20 @@ class LongReadMapper: public SequenceMapperListener { return g_; } + void Serialize(std::ostream &os) const override { + storage_.BinWrite(os); + } + + void Deserialize(std::istream &is) override { + storage_.BinRead(is); + } + + void MergeFromStream(std::istream &is) override { + PathStorage remote(g_); + remote.BinRead(is); + storage_.AddStorage(remote); + } + private: void ProcessSingleRead(size_t thread_index, const omnigraph::MappingPath& mapping, const io::SingleRead& r); diff --git a/src/common/alignment/rna/ss_coverage_filler.hpp b/src/common/alignment/rna/ss_coverage_filler.hpp index c23faff935..b97f33769f 100644 --- a/src/common/alignment/rna/ss_coverage_filler.hpp +++ b/src/common/alignment/rna/ss_coverage_filler.hpp @@ -13,6 +13,8 @@ #include "alignment/sequence_mapper_notifier.hpp" #include "assembly_graph/paths/mapping_path.hpp" +#include "io/binary/binary.hpp" + namespace debruijn_graph { class SSCoverageFiller: public SequenceMapperListener { @@ -65,6 +67,22 @@ class SSCoverageFiller: public SequenceMapperListener { storage_.IncreaseKmerCount(it.first, size_t(it.second)); tmp_storages_[thread_index].Clear(); } + + void Serialize(std::ostream &os) const override { + io::binary::BinWrite(os, storage_); + } + + void Deserialize(std::istream &is) override { + io::binary::BinRead(is, storage_); + } + + void MergeFromStream(std::istream &is) override { + SSCoverageStorage remote(g_); + io::binary::BinRead(is, remote); + for (const auto& it : 
remote) { + storage_.IncreaseKmerCount(it.first, size_t(it.second)); + } + } }; diff --git a/src/common/alignment/sequence_mapper_notifier.hpp b/src/common/alignment/sequence_mapper_notifier.hpp index d98719662f..83ae7fce78 100644 --- a/src/common/alignment/sequence_mapper_notifier.hpp +++ b/src/common/alignment/sequence_mapper_notifier.hpp @@ -38,10 +38,17 @@ class SequenceMapperListener { virtual void MergeBuffer(size_t /* thread_index */) {} - virtual void Serialize(std::ostream&) const {} - virtual void Deserialize(std::istream&) {} + virtual void Serialize(std::ostream&) const { + VERIFY_MSG(false, "Method Serialize is not implemented. Using default realization."); + } + + virtual void Deserialize(std::istream&) { + VERIFY_MSG(false, "Method Deserialize is not implemented. Using default realization."); + } - virtual void MergeFromStream(std::istream&) {} + virtual void MergeFromStream(std::istream&) { + VERIFY_MSG(false, "Method MergeFromStream is not implemented. Using default realization."); + } virtual ~SequenceMapperListener() {} }; diff --git a/src/common/paired_info/histogram.hpp b/src/common/paired_info/histogram.hpp index fb8f7e5984..78e6b4d23c 100644 --- a/src/common/paired_info/histogram.hpp +++ b/src/common/paired_info/histogram.hpp @@ -10,6 +10,8 @@ #include "index_point.hpp" +#include "utils/verify.hpp" + #include "adt/flat_set.hpp" #include "adt/small_pod_vector.hpp" diff --git a/src/common/paired_info/is_counter.hpp b/src/common/paired_info/is_counter.hpp index 078487db0d..f5f45033cd 100644 --- a/src/common/paired_info/is_counter.hpp +++ b/src/common/paired_info/is_counter.hpp @@ -115,22 +115,57 @@ class InsertSizeCounter: public SequenceMapperListener { } struct count_data { - size_t total_; - std::vector arr_; - count_data() - : total_(0) {} - - count_data(size_t nthreads) - : total_(0), arr_(nthreads, 0) {} - - void inc(size_t i) { ++arr_[i]; } - void merge() { - for (size_t i = 0; i < arr_.size(); ++i) { - total_ += arr_[i]; + size_t total_; + std::vector arr_; + count_data() + : total_(0) {} + + count_data(size_t nthreads) + : total_(0), arr_(nthreads, 0) {} + + void inc(size_t i) { ++arr_[i]; } + void merge() { + for (size_t i = 0; i < arr_.size(); ++i) { + total_ += arr_[i]; + arr_[i] = 0; + } + } + + void BinWrite(std::ostream &os) const { + io::binary::BinWrite(os, total_); + } + + void BinRead(std::istream &is) { + io::binary::BinRead(is, total_); } - } }; + void Serialize(std::ostream &os) const override { + total_.BinWrite(os); + counted_.BinWrite(os); + negative_.BinWrite(os); + io::binary::BinWrite(os, hist_); + } + + void Deserialize(std::istream &is) override { + total_.BinRead(is); + counted_.BinRead(is); + negative_.BinRead(is); + io::binary::BinRead(is, hist_); + } + + void MergeFromStream(std::istream &is) override { + InsertSizeCounter remote(*this); + remote.Deserialize(is); + total_.total_ += remote.total_.total_; + counted_.total_ += remote.counted_.total_; + negative_.total_ += remote.negative_.total_; + + for (const auto& kv: remote.hist_) { + hist_[kv.first] += kv.second; + } + } + private: const Graph &graph_; diff --git a/src/common/paired_info/pair_info_filler.hpp b/src/common/paired_info/pair_info_filler.hpp index 412f4021d5..472ff8ac13 100644 --- a/src/common/paired_info/pair_info_filler.hpp +++ b/src/common/paired_info/pair_info_filler.hpp @@ -63,6 +63,20 @@ class LatePairedIndexFiller : public SequenceMapperListener { ProcessPairedRead(read1, read2, r.distance()); } + void Serialize(std::ostream &os) const override { + 
io::binary::BinWrite(os, paired_index_); + } + + void Deserialize(std::istream &is) override { + io::binary::BinRead(is, paired_index_); + } + + void MergeFromStream(std::istream &is) override { + omnigraph::de::UnclusteredPairedInfoIndexT remote(paired_index_.graph()); + io::binary::BinRead(is, remote); + paired_index_.Merge(remote); + } + virtual ~LatePairedIndexFiller() {} private: diff --git a/src/projects/spades/gap_closer.cpp b/src/projects/spades/gap_closer.cpp index ad9c77d25a..56174bc816 100644 --- a/src/projects/spades/gap_closer.cpp +++ b/src/projects/spades/gap_closer.cpp @@ -506,7 +506,7 @@ void GapClosing::run(graph_pack::GraphPack &gp, const char *) { io::BinaryPairedStreams paired_streams = paired_binary_readers(dataset.reads[i], false, 0, false, num_readers); notifier.ProcessLibraryMPI(paired_streams, i, *gcpif.GetMapper()); - + INFO("Initializing gap closer"); g.clear_state(); // FIXME Hack-hack-hack required for uniform id distribution on master and slaves GapCloser gap_closer(g, tips_paired_idx, diff --git a/src/projects/spades/pair_info_count.cpp b/src/projects/spades/pair_info_count.cpp index 86f1e3eacb..5fd0fde25a 100644 --- a/src/projects/spades/pair_info_count.cpp +++ b/src/projects/spades/pair_info_count.cpp @@ -42,6 +42,55 @@ std::shared_ptr> ChooseProperMapper(const graph_pack::Grap return MapperInstance(gp); } +class DEFilter : public SequenceMapperListener { + public: + DEFilter(paired_info::PairedInfoFilter &filter, const Graph &g) + : bf_(filter), g_(g) {} + + void ProcessPairedRead(size_t, + const io::PairedRead&, + const MappingPath& read1, + const MappingPath& read2) override { + ProcessPairedRead(read1, read2); + } + void ProcessPairedRead(size_t, + const io::PairedReadSeq&, + const MappingPath& read1, + const MappingPath& read2) override { + ProcessPairedRead(read1, read2); + } + + void Serialize(std::ostream &os) const override { + io::binary::BinWrite(os, bf_); + } + + void Deserialize(std::istream &is) override { + io::binary::BinRead(is, bf_); + } + + void MergeFromStream(std::istream &is) override { + paired_info::PairedInfoFilter remote; + io::binary::BinRead(is, remote); + bf_.merge(remote); + } + + private: + void ProcessPairedRead(const MappingPath& path1, + const MappingPath& path2) { + for (size_t i = 0; i < path1.size(); ++i) { + EdgeId edge1 = path1.edge_at(i); + for (size_t j = 0; j < path2.size(); ++j) { + EdgeId edge2 = path2.edge_at(j); + bf_.add({edge1, edge2}); + bf_.add({g_.conjugate(edge2), g_.conjugate(edge1)}); + } + } + } + + paired_info::PairedInfoFilter &bf_; + const Graph &g_; +}; + bool HasGoodRRLibs() { for (const auto &lib : cfg::get().ds.reads) { if (lib.is_contig_lib()) @@ -129,18 +178,60 @@ size_t ProcessSingleReads(graph_pack::GraphPack &gp, size_t ilib, } auto mapper_ptr = ChooseProperMapper(gp, reads); + size_t num_readers = partask::overall_num_threads(); if (use_binary) { - auto single_streams = single_binary_readers(reads, false, map_paired); - notifier.ProcessLibrary(single_streams, *mapper_ptr); + auto single_streams = single_binary_readers(reads, false, map_paired, num_readers); + notifier.ProcessLibraryMPI(single_streams, ilib, *mapper_ptr); } else { auto single_streams = single_easy_readers(reads, false, map_paired, /*handle Ns*/false); - notifier.ProcessLibrary(single_streams, *mapper_ptr); + notifier.ProcessLibraryMPI(single_streams, ilib, *mapper_ptr); } return single_long_reads.size(); } +void ProcessPairedReads(graph_pack::GraphPack &gp, + std::unique_ptr filter, + unsigned filter_threshold, + size_t 
ilib) { + SequencingLib &reads = cfg::get_writable().ds.reads[ilib]; + const auto &data = reads.data(); + + unsigned round_thr = 0; + // Do not round if filtering is disabled + if (filter) + round_thr = unsigned(std::min(cfg::get().de.max_distance_coeff * data.insert_size_deviation * cfg::get().de.rounding_coeff, + cfg::get().de.rounding_thr)); + + SequenceMapperNotifier notifier(cfg::get_writable().ds.reads.lib_count()); + INFO("Left insert size quantile " << data.insert_size_left_quantile << + ", right insert size quantile " << data.insert_size_right_quantile << + ", filtering threshold " << filter_threshold << + ", rounding threshold " << round_thr); + + LatePairedIndexFiller::WeightF weight; + if (filter) { + weight = [&](const std::pair &ep, + const MappingRange&, const MappingRange&) { + return (filter->lookup(ep) > filter_threshold ? 1. : 0.); + }; + } else { + weight = [&](const std::pair &, + const MappingRange&, const MappingRange&) { + return 1.; + }; + } + + using Indices = omnigraph::de::UnclusteredPairedInfoIndicesT; + LatePairedIndexFiller pif(gp.get(), weight, round_thr, gp.get_mutable()[ilib]); + notifier.Subscribe(&pif, ilib); + + size_t num_readers = partask::overall_num_threads(); + auto paired_streams = paired_binary_readers(reads, /*followed by rc*/false, (size_t) data.mean_insert_size, + /*include merged*/true, num_readers); + notifier.ProcessLibraryMPI(paired_streams, ilib, *ChooseProperMapper(gp, reads)); +} } // namespace void PairInfoCount::run(graph_pack::GraphPack &gp, const char *) { @@ -200,8 +291,24 @@ void PairInfoCount::run(graph_pack::GraphPack &gp, const char *) { // Only filter paired-end libraries if (filter_threshold && lib.type() == io::LibraryType::PairedEnd) { + filter.reset(new paired_info::PairedInfoFilter([](const std::pair &e, uint64_t seed) { + uint64_t h1 = e.first.hash(); + return XXH3_64bits_withSeed(&h1, sizeof(h1), (e.second.hash() * seed) ^ seed); + }, + 12 * edgepairs)); + INFO("Filtering data for library #" << i); - filter = paired_info::FillEdgePairFilter(graph, *ChooseProperMapper(gp, lib), lib, edgepairs); + { + SequenceMapperNotifier notifier(cfg::get_writable().ds.reads.lib_count()); + DEFilter filter_counter(*filter, graph); + notifier.Subscribe(&filter_counter, i); + + VERIFY(lib.data().unmerged_read_length != 0); + size_t num_readers = partask::overall_num_threads(); + auto reads = paired_binary_readers(lib, /*followed by rc*/false, + 0, /*include merged*/true, num_readers); + notifier.ProcessLibraryMPI(reads, i, *ChooseProperMapper(gp, lib)); + } } INFO("Mapping library #" << i); diff --git a/src/projects/spades/pair_info_count.hpp b/src/projects/spades/pair_info_count.hpp index 294829393d..e7a7cd51ac 100644 --- a/src/projects/spades/pair_info_count.hpp +++ b/src/projects/spades/pair_info_count.hpp @@ -9,14 +9,15 @@ #pragma once #include "pipeline/stage.hpp" +#include "pipeline/mpi_stage.hpp" namespace debruijn_graph { -class PairInfoCount : public spades::AssemblyStage { - public: +class PairInfoCount : public spades::MPIAssemblyStage { + public: PairInfoCount(bool preliminary = false) - : AssemblyStage(preliminary ? "Preliminary Paired Information Counting" : "Paired Information Counting", - preliminary ? "late_pair_info_count_preliminary" : "late_pair_info_count") {} + : MPIAssemblyStage(preliminary ? "Preliminary Paired Information Counting" : "Paired Information Counting", + preliminary ? 
"late_pair_info_count_preliminary" : "late_pair_info_count") {} void run(graph_pack::GraphPack &gp, const char*) override; }; From 288cca51e2cf5d5526f43a190c7283c8baa47e76 Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Fri, 26 Feb 2021 12:08:57 +0500 Subject: [PATCH 018/102] Move implementation to .cpp. Small cleanup while there --- .../alignment/sequence_mapper_notifier.cpp | 25 +++++++++++++- .../alignment/sequence_mapper_notifier.hpp | 33 ++++--------------- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/src/common/alignment/sequence_mapper_notifier.cpp b/src/common/alignment/sequence_mapper_notifier.cpp index 36e93283dd..5c85cbc46a 100644 --- a/src/common/alignment/sequence_mapper_notifier.cpp +++ b/src/common/alignment/sequence_mapper_notifier.cpp @@ -14,8 +14,31 @@ namespace debruijn_graph { +void SequenceMapperNotifier::PyramidMergeMPI(SequenceMapperListener &listener) { + size_t mpi_size = partask::world_size(); + size_t mpi_rank = partask::world_rank(); + const size_t deadbeef = 0xDEADBEEF; + + for (size_t step = 1; step < mpi_size; step *= 2) { + if ((mpi_rank % (2*step) == 0) && (mpi_rank + step < mpi_size)) { + partask::InputMPIStream is(mpi_rank + step); + size_t sz; + io::binary::BinRead(is, sz); + VERIFY_MSG(sz == deadbeef, "Listener type: " << typeid(listener).name()); + listener.MergeFromStream(is); + io::binary::BinRead(is, sz); + VERIFY_MSG(sz == deadbeef, "Listener type: " << typeid(listener).name()); + } else if (mpi_rank % (2*step) == step) { + partask::OutputMPIStream os(mpi_rank - step); + io::binary::BinWrite(os, deadbeef); + listener.Serialize(os); + io::binary::BinWrite(os, deadbeef); + } + } +} + SequenceMapperNotifier::SequenceMapperNotifier(size_t lib_count) - : listeners_(lib_count) + : listeners_(lib_count) {} void SequenceMapperNotifier::Subscribe(SequenceMapperListener* listener, size_t lib_index) { diff --git a/src/common/alignment/sequence_mapper_notifier.hpp b/src/common/alignment/sequence_mapper_notifier.hpp index 83ae7fce78..f54d61de9b 100644 --- a/src/common/alignment/sequence_mapper_notifier.hpp +++ b/src/common/alignment/sequence_mapper_notifier.hpp @@ -39,43 +39,20 @@ class SequenceMapperListener { virtual void MergeBuffer(size_t /* thread_index */) {} virtual void Serialize(std::ostream&) const { - VERIFY_MSG(false, "Method Serialize is not implemented. Using default realization."); + VERIFY_MSG(false, "Serialize() is not implemented"); } virtual void Deserialize(std::istream&) { - VERIFY_MSG(false, "Method Deserialize is not implemented. Using default realization."); + VERIFY_MSG(false, "Deserialize() is not implemented"); } virtual void MergeFromStream(std::istream&) { - VERIFY_MSG(false, "Method MergeFromStream is not implemented. 
Using default realization."); + VERIFY_MSG(false, "MergeFromStream() is not implemented"); } virtual ~SequenceMapperListener() {} }; -inline void PyramidMergeMPI(SequenceMapperListener &listener) { - size_t mpi_size = partask::world_size(); - size_t mpi_rank = partask::world_rank(); - const size_t deadbeef = 0xDEADBEEF; - - for (size_t step = 1; step < mpi_size; step *= 2) { - if ((mpi_rank % (2*step) == 0) && (mpi_rank + step < mpi_size)) { - partask::InputMPIStream is(mpi_rank + step); - size_t sz; - io::binary::BinRead(is, sz); - VERIFY_MSG(sz == deadbeef, "Listener type: " << typeid(listener).name()); - listener.MergeFromStream(is); - io::binary::BinRead(is, sz); - VERIFY_MSG(sz == deadbeef, "Listener type: " << typeid(listener).name()); - } else if (mpi_rank % (2*step) == step) { - partask::OutputMPIStream os(mpi_rank - step); - io::binary::BinWrite(os, deadbeef); - listener.Serialize(os); - io::binary::BinWrite(os, deadbeef); - } - } -} - class SequenceMapperNotifier { static constexpr size_t BUFFER_SIZE = 200000; public: @@ -87,6 +64,8 @@ class SequenceMapperNotifier { void Subscribe(SequenceMapperListener* listener, size_t lib_index = 0); + void PyramidMergeMPI(SequenceMapperListener &listener); + template void ProcessLibraryMPI(io::ReadStreamList& streams, size_t lib_index, const SequenceMapperT& mapper, size_t threads_count = 0) { @@ -122,7 +101,7 @@ class SequenceMapperNotifier { const size_t deadbeef = 0xDEADBEEF; if (mpi_size > 1) { INFO("Syncing listeners..."); - if (mpi_rank == 0) { + if (partask::master()) { partask::OutputMPIStreamBcast os(0); for (const auto& listener : listeners_[lib_index]) { io::binary::BinWrite(os, deadbeef); From c8d6fb334707ad029e269e26fbf11bf86caa60b4 Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Fri, 26 Feb 2021 12:14:11 +0500 Subject: [PATCH 019/102] Allow name ot be overriden --- src/common/alignment/sequence_mapper_notifier.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/common/alignment/sequence_mapper_notifier.hpp b/src/common/alignment/sequence_mapper_notifier.hpp index f54d61de9b..42fca3210b 100644 --- a/src/common/alignment/sequence_mapper_notifier.hpp +++ b/src/common/alignment/sequence_mapper_notifier.hpp @@ -50,6 +50,10 @@ class SequenceMapperListener { VERIFY_MSG(false, "MergeFromStream() is not implemented"); } + virtual const char* name() const { + return typeid(*this).name(); + } + virtual ~SequenceMapperListener() {} }; @@ -93,7 +97,7 @@ class SequenceMapperNotifier { INFO("Merging results..."); for (const auto& listener : listeners_[lib_index]) { - INFO("Merging listener " << typeid(*listener).name()); + INFO("Merging listener " << listener->name()); PyramidMergeMPI(*listener); } INFO("Listeners merged"); From f3cb336c5891f9b31acf0826da248ffecf00c0de Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Fri, 26 Feb 2021 12:59:09 +0500 Subject: [PATCH 020/102] Factor out implementation into a separate class --- .../alignment/sequence_mapper_notifier.cpp | 2 +- .../alignment/sequence_mapper_notifier.hpp | 121 +++++++++--------- src/common/pipeline/partask_mpi.hpp | 12 ++ src/projects/spades/gap_closer.cpp | 4 +- src/projects/spades/mismatch_correction.cpp | 4 +- src/projects/spades/pair_info_count.cpp | 14 +- 6 files changed, 84 insertions(+), 73 deletions(-) diff --git a/src/common/alignment/sequence_mapper_notifier.cpp b/src/common/alignment/sequence_mapper_notifier.cpp index 5c85cbc46a..6c33e343fd 100644 --- a/src/common/alignment/sequence_mapper_notifier.cpp +++ 
b/src/common/alignment/sequence_mapper_notifier.cpp @@ -14,7 +14,7 @@ namespace debruijn_graph { -void SequenceMapperNotifier::PyramidMergeMPI(SequenceMapperListener &listener) { +void SequenceMapperNotifierMPI::PyramidMergeMPI(SequenceMapperListener &listener) { size_t mpi_size = partask::world_size(); size_t mpi_rank = partask::world_rank(); const size_t deadbeef = 0xDEADBEEF; diff --git a/src/common/alignment/sequence_mapper_notifier.hpp b/src/common/alignment/sequence_mapper_notifier.hpp index 42fca3210b..d55be56848 100644 --- a/src/common/alignment/sequence_mapper_notifier.hpp +++ b/src/common/alignment/sequence_mapper_notifier.hpp @@ -68,72 +68,12 @@ class SequenceMapperNotifier { void Subscribe(SequenceMapperListener* listener, size_t lib_index = 0); - void PyramidMergeMPI(SequenceMapperListener &listener); - - template - void ProcessLibraryMPI(io::ReadStreamList& streams, - size_t lib_index, const SequenceMapperT& mapper, size_t threads_count = 0) { - INFO("ProcessLibraryMPI started"); - // Select streams - std::vector chunks; - size_t mpi_size = partask::world_size(); - size_t mpi_rank = partask::world_rank(); - for (size_t i = 0; i < streams.size(); ++i) { - if (i % mpi_size == mpi_rank) { - chunks.push_back(i); - } - } - INFO("Selected streams: " << chunks); - auto local_streams = partask::create_empty_stream_list(chunks.size()); - partask::swap_streams(streams, local_streams, chunks); - - // Run ProcessLibrary - INFO("Running ProcessLibrary"); - ProcessLibrary(local_streams, lib_index, mapper, threads_count); - INFO("ProcessLibrary done"); - - // Swap streams back - partask::swap_streams(streams, local_streams, chunks); - - INFO("Merging results..."); - for (const auto& listener : listeners_[lib_index]) { - INFO("Merging listener " << listener->name()); - PyramidMergeMPI(*listener); - } - INFO("Listeners merged"); - - const size_t deadbeef = 0xDEADBEEF; - if (mpi_size > 1) { - INFO("Syncing listeners..."); - if (partask::master()) { - partask::OutputMPIStreamBcast os(0); - for (const auto& listener : listeners_[lib_index]) { - io::binary::BinWrite(os, deadbeef); - listener->Serialize(os); - io::binary::BinWrite(os, deadbeef); - } - } else { - partask::InputMPIStreamBcast is(0); - for (const auto& listener : listeners_[lib_index]) { - size_t sz; - io::binary::BinRead(is, sz); - VERIFY(sz == deadbeef); - listener->Deserialize(is); - io::binary::BinRead(is, sz); - VERIFY(sz == deadbeef); - } - } - INFO("Listeners synced"); - } - } - template void ProcessLibrary(io::ReadStreamList& streams, const SequenceMapperT& mapper, size_t threads_count = 0) { return ProcessLibrary(streams, 0, mapper, threads_count); } - - private: + template void ProcessLibrary(io::ReadStreamList& streams, size_t lib_index, const SequenceMapperT& mapper, size_t threads_count = 0) { @@ -191,9 +131,68 @@ class SequenceMapperNotifier { void NotifyMergeBuffer(size_t ilib, size_t ithread) const; +protected: std::vector > listeners_; //first vector's size = count libs }; + +class SequenceMapperNotifierMPI : public SequenceMapperNotifier { + void PyramidMergeMPI(SequenceMapperListener &listener); + +public: + using SequenceMapperNotifier::SequenceMapperNotifier; + + template + void ProcessLibrary(io::ReadStreamList& streams, + size_t lib_index, const SequenceMapperT& mapper, size_t threads_count = 0) { + INFO("ProcessLibraryMPI started"); + // Select streams + std::vector chunks = partask::chunks_rr(streams.size()); + INFO("Selected streams: " << chunks); + auto local_streams = 
partask::create_empty_stream_list(chunks.size()); + partask::swap_streams(streams, local_streams, chunks); + + // Run ProcessLibrary + INFO("Running ProcessLibrary"); + SequenceMapperNotifier::ProcessLibrary(local_streams, lib_index, mapper, threads_count); + INFO("ProcessLibrary done"); + + // Swap streams back + partask::swap_streams(streams, local_streams, chunks); + + INFO("Merging results..."); + for (const auto& listener : listeners_[lib_index]) { + INFO("Merging listener " << listener->name()); + PyramidMergeMPI(*listener); + } + INFO("Listeners merged"); + + if (partask::world_size() > 1) { + const size_t deadbeef = 0xDEADBEEF; + INFO("Syncing listeners..."); + if (partask::master()) { + partask::OutputMPIStreamBcast os(0); + for (const auto& listener : listeners_[lib_index]) { + io::binary::BinWrite(os, deadbeef); + listener->Serialize(os); + io::binary::BinWrite(os, deadbeef); + } + } else { + partask::InputMPIStreamBcast is(0); + for (const auto& listener : listeners_[lib_index]) { + size_t sz; + io::binary::BinRead(is, sz); + VERIFY(sz == deadbeef); + listener->Deserialize(is); + io::binary::BinRead(is, sz); + VERIFY(sz == deadbeef); + } + } + INFO("Listeners synced"); + } + } +}; + } // namespace debruijn_graph diff --git a/src/common/pipeline/partask_mpi.hpp b/src/common/pipeline/partask_mpi.hpp index 751290eca4..8a89080309 100644 --- a/src/common/pipeline/partask_mpi.hpp +++ b/src/common/pipeline/partask_mpi.hpp @@ -1596,6 +1596,18 @@ auto fast_local_transfer(T &v, int root = 0) { return FastLocalTransferWrap(v, root); } +inline std::vector chunks_rr(size_t sz) { + std::vector chunks; + size_t mpi_size = world_size(); + size_t mpi_rank = world_rank(); + for (size_t i = 0; i < sz; ++i) { + if (i % mpi_size == mpi_rank) + chunks.push_back(i); + } + + return chunks; +} + template void swap_streams(io::ReadStreamList &all_streams, io::ReadStreamList &streams, diff --git a/src/projects/spades/gap_closer.cpp b/src/projects/spades/gap_closer.cpp index 56174bc816..ac4c0bfe45 100644 --- a/src/projects/spades/gap_closer.cpp +++ b/src/projects/spades/gap_closer.cpp @@ -494,7 +494,7 @@ void GapClosing::run(graph_pack::GraphPack &gp, const char *) { return; } - SequenceMapperNotifier notifier(cfg::get().ds.reads.lib_count()); + SequenceMapperNotifierMPI notifier(cfg::get().ds.reads.lib_count()); size_t num_readers = partask::overall_num_threads(); auto& dataset = cfg::get_writable().ds; @@ -505,7 +505,7 @@ void GapClosing::run(graph_pack::GraphPack &gp, const char *) { notifier.Subscribe(&gcpif, i); io::BinaryPairedStreams paired_streams = paired_binary_readers(dataset.reads[i], false, 0, false, num_readers); - notifier.ProcessLibraryMPI(paired_streams, i, *gcpif.GetMapper()); + notifier.ProcessLibrary(paired_streams, i, *gcpif.GetMapper()); INFO("Initializing gap closer"); g.clear_state(); // FIXME Hack-hack-hack required for uniform id distribution on master and slaves diff --git a/src/projects/spades/mismatch_correction.cpp b/src/projects/spades/mismatch_correction.cpp index f72e64c54f..040c5924a7 100644 --- a/src/projects/spades/mismatch_correction.cpp +++ b/src/projects/spades/mismatch_correction.cpp @@ -450,7 +450,7 @@ class MismatchShallNotPass { MismatchStatistics statistics(gp_); INFO("Potential mismatches collected"); - SequenceMapperNotifier notifier(cfg::get_writable().ds.reads.lib_count()); + SequenceMapperNotifierMPI notifier(cfg::get_writable().ds.reads.lib_count()); auto& dataset = cfg::get_writable().ds; @@ -463,7 +463,7 @@ class MismatchShallNotPass { auto &reads = 
dataset.reads[i]; size_t num_readers = partask::overall_num_threads(); auto single_streams = single_binary_readers(reads, /*followed by rc */true, /*binary*/true, num_readers); - notifier.ProcessLibraryMPI(single_streams, i, *mapper); + notifier.ProcessLibrary(single_streams, i, *mapper); } return CorrectAllEdges(statistics); diff --git a/src/projects/spades/pair_info_count.cpp b/src/projects/spades/pair_info_count.cpp index 5fd0fde25a..b7cb24e42f 100644 --- a/src/projects/spades/pair_info_count.cpp +++ b/src/projects/spades/pair_info_count.cpp @@ -157,7 +157,7 @@ size_t ProcessSingleReads(graph_pack::GraphPack &gp, size_t ilib, auto& reads = cfg::get_writable().ds.reads[ilib]; const auto &graph = gp.get(); - SequenceMapperNotifier notifier; + SequenceMapperNotifierMPI notifier(cfg::get_writable().ds.reads.lib_count()); auto &single_long_reads = gp.get_mutable>()[ilib]; auto& trusted_paths = gp.get_mutable()[ilib]; @@ -181,11 +181,11 @@ size_t ProcessSingleReads(graph_pack::GraphPack &gp, size_t ilib, size_t num_readers = partask::overall_num_threads(); if (use_binary) { auto single_streams = single_binary_readers(reads, false, map_paired, num_readers); - notifier.ProcessLibraryMPI(single_streams, ilib, *mapper_ptr); + notifier.ProcessLibrary(single_streams, ilib, *mapper_ptr); } else { auto single_streams = single_easy_readers(reads, false, map_paired, /*handle Ns*/false); - notifier.ProcessLibraryMPI(single_streams, ilib, *mapper_ptr); + notifier.ProcessLibrary(single_streams, ilib, *mapper_ptr); } return single_long_reads.size(); @@ -204,7 +204,7 @@ void ProcessPairedReads(graph_pack::GraphPack &gp, round_thr = unsigned(std::min(cfg::get().de.max_distance_coeff * data.insert_size_deviation * cfg::get().de.rounding_coeff, cfg::get().de.rounding_thr)); - SequenceMapperNotifier notifier(cfg::get_writable().ds.reads.lib_count()); + SequenceMapperNotifierMPI notifier(cfg::get_writable().ds.reads.lib_count()); INFO("Left insert size quantile " << data.insert_size_left_quantile << ", right insert size quantile " << data.insert_size_right_quantile << ", filtering threshold " << filter_threshold << @@ -230,7 +230,7 @@ void ProcessPairedReads(graph_pack::GraphPack &gp, size_t num_readers = partask::overall_num_threads(); auto paired_streams = paired_binary_readers(reads, /*followed by rc*/false, (size_t) data.mean_insert_size, /*include merged*/true, num_readers); - notifier.ProcessLibraryMPI(paired_streams, ilib, *ChooseProperMapper(gp, reads)); + notifier.ProcessLibrary(paired_streams, ilib, *ChooseProperMapper(gp, reads)); } } // namespace @@ -299,7 +299,7 @@ void PairInfoCount::run(graph_pack::GraphPack &gp, const char *) { INFO("Filtering data for library #" << i); { - SequenceMapperNotifier notifier(cfg::get_writable().ds.reads.lib_count()); + SequenceMapperNotifierMPI notifier(cfg::get_writable().ds.reads.lib_count()); DEFilter filter_counter(*filter, graph); notifier.Subscribe(&filter_counter, i); @@ -307,7 +307,7 @@ void PairInfoCount::run(graph_pack::GraphPack &gp, const char *) { size_t num_readers = partask::overall_num_threads(); auto reads = paired_binary_readers(lib, /*followed by rc*/false, 0, /*include merged*/true, num_readers); - notifier.ProcessLibraryMPI(reads, i, *ChooseProperMapper(gp, lib)); + notifier.ProcessLibrary(reads, i, *ChooseProperMapper(gp, lib)); } } From adea3d5819dc2c055ef127d0f24abe5fdec48437 Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Fri, 26 Feb 2021 13:18:35 +0500 Subject: [PATCH 021/102] Simplify parallel processing --- 
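For illustration, the parallel-processing scheme these patches rely on — split the binary read streams across MPI ranks round-robin (partask::chunks_rr), run the ordinary ProcessLibrary on each rank's subset, then fold the per-rank listener state back onto rank 0 with SequenceMapperNotifierMPI::PyramidMergeMPI — can be sketched without MPI as below. This is a minimal, self-contained sketch; the helper names chunks_round_robin and pyramid_schedule are hypothetical stand-ins for the partask helpers, and world_size/world_rank are plain parameters standing in for partask::world_size()/world_rank().

#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

// Round-robin stream ownership: rank r of n ranks processes streams r, r + n, r + 2n, ...
// Each stream is mapped by exactly one rank, so no reads are counted twice.
static std::vector<size_t> chunks_round_robin(size_t num_streams,
                                              size_t world_size, size_t world_rank) {
    std::vector<size_t> chunks;
    for (size_t i = world_rank; i < num_streams; i += world_size)
        chunks.push_back(i);
    return chunks;
}

// Pairwise ("pyramid") merge schedule: at step s, every rank divisible by 2*s receives and
// merges the serialized listener of rank + s, so after ceil(log2(n)) steps rank 0 holds the
// union of all per-rank results. Pairs are listed as {receiver, sender}.
static std::vector<std::vector<std::pair<size_t, size_t>>> pyramid_schedule(size_t n) {
    std::vector<std::vector<std::pair<size_t, size_t>>> steps;
    for (size_t step = 1; step < n; step *= 2) {
        std::vector<std::pair<size_t, size_t>> pairs;
        for (size_t rank = 0; rank < n; ++rank)
            if (rank % (2 * step) == 0 && rank + step < n)
                pairs.emplace_back(rank, rank + step);
        steps.push_back(std::move(pairs));
    }
    return steps;
}

int main() {
    // 10 streams over 4 ranks: rank 1 owns streams 1, 5, 9.
    for (size_t i : chunks_round_robin(10, 4, 1))
        std::cout << i << ' ';
    std::cout << '\n';

    // 6 ranks: step 1 merges 1->0, 3->2, 5->4; step 2 merges 2->0; step 4 merges 4->0.
    for (const auto &step : pyramid_schedule(6)) {
        for (const auto &p : step)
            std::cout << p.second << "->" << p.first << "  ";
        std::cout << '\n';
    }
    return 0;
}

After the pyramid merge, rank 0 re-broadcasts the combined listener state to all ranks (the Serialize/Deserialize exchange guarded by the 0xDEADBEEF sentinels in ProcessLibrary), so every rank continues with identical listener state.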
.../alignment/sequence_mapper_notifier.hpp | 21 +++++++-------- src/common/pipeline/partask_mpi.hpp | 27 ++++++++++++------- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/src/common/alignment/sequence_mapper_notifier.hpp b/src/common/alignment/sequence_mapper_notifier.hpp index d55be56848..4f7e004fc5 100644 --- a/src/common/alignment/sequence_mapper_notifier.hpp +++ b/src/common/alignment/sequence_mapper_notifier.hpp @@ -53,7 +53,7 @@ class SequenceMapperListener { virtual const char* name() const { return typeid(*this).name(); } - + virtual ~SequenceMapperListener() {} }; @@ -138,7 +138,7 @@ class SequenceMapperNotifier { class SequenceMapperNotifierMPI : public SequenceMapperNotifier { void PyramidMergeMPI(SequenceMapperListener &listener); - + public: using SequenceMapperNotifier::SequenceMapperNotifier; @@ -149,16 +149,15 @@ class SequenceMapperNotifierMPI : public SequenceMapperNotifier { // Select streams std::vector chunks = partask::chunks_rr(streams.size()); INFO("Selected streams: " << chunks); - auto local_streams = partask::create_empty_stream_list(chunks.size()); - partask::swap_streams(streams, local_streams, chunks); - - // Run ProcessLibrary - INFO("Running ProcessLibrary"); - SequenceMapperNotifier::ProcessLibrary(local_streams, lib_index, mapper, threads_count); - INFO("ProcessLibrary done"); - // Swap streams back - partask::swap_streams(streams, local_streams, chunks); + partask::execute_on_subset(streams, + chunks, + [&](io::ReadStreamList& local_streams) { + // Run ProcessLibrary + INFO("Running ProcessLibrary"); + SequenceMapperNotifier::ProcessLibrary(local_streams, lib_index, mapper, threads_count); + INFO("ProcessLibrary done"); + }); INFO("Merging results..."); for (const auto& listener : listeners_[lib_index]) { diff --git a/src/common/pipeline/partask_mpi.hpp b/src/common/pipeline/partask_mpi.hpp index 8a89080309..d119a3ad30 100644 --- a/src/common/pipeline/partask_mpi.hpp +++ b/src/common/pipeline/partask_mpi.hpp @@ -1596,7 +1596,7 @@ auto fast_local_transfer(T &v, int root = 0) { return FastLocalTransferWrap(v, root); } -inline std::vector chunks_rr(size_t sz) { +inline auto chunks_rr(size_t sz) { std::vector chunks; size_t mpi_size = world_size(); size_t mpi_rank = world_rank(); @@ -1608,10 +1608,10 @@ inline std::vector chunks_rr(size_t sz) { return chunks; } -template -void swap_streams(io::ReadStreamList &all_streams, - io::ReadStreamList &streams, - const std::vector &chunks) { +template +void swap_streams(StreamListType &all_streams, + StreamListType &streams, + const std::vector &chunks) { VERIFY(streams.size() == chunks.size()); for (size_t i = 0; i < chunks.size(); ++i) { DEBUG("Swapping: " << i << " <-> " << chunks[i]); @@ -1619,16 +1619,25 @@ void swap_streams(io::ReadStreamList &all_streams, } } -template +template auto create_empty_stream_list(size_t size) { - io::ReadStreamList streams; + StreamListType streams; for (size_t i = 0; i < size; ++i) { - io::ReadStream empty_stream; - streams.push_back(std::move(empty_stream)); + streams.push_back({}); } return streams; } +template +void execute_on_subset(StreamListType &all_streams, + const std::vector &chunks, + F f) { + auto local_streams = partask::create_empty_stream_list(chunks.size()); + partask::swap_streams(all_streams, local_streams, chunks); + f(local_streams); + partask::swap_streams(all_streams, local_streams, chunks); +} + } // namespace partask namespace io { From 1fba4566d4d9f4d8920668fd4bb300c6b6999ec3 Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Fri, 26 
Feb 2021 13:35:19 +0500 Subject: [PATCH 022/102] Simplify --- src/common/alignment/sequence_mapper_notifier.hpp | 3 +-- src/common/pipeline/partask_mpi.hpp | 11 ++++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/common/alignment/sequence_mapper_notifier.hpp b/src/common/alignment/sequence_mapper_notifier.hpp index 4f7e004fc5..575d5cebe1 100644 --- a/src/common/alignment/sequence_mapper_notifier.hpp +++ b/src/common/alignment/sequence_mapper_notifier.hpp @@ -150,8 +150,7 @@ class SequenceMapperNotifierMPI : public SequenceMapperNotifier { std::vector chunks = partask::chunks_rr(streams.size()); INFO("Selected streams: " << chunks); - partask::execute_on_subset(streams, - chunks, + partask::execute_on_subset(streams, chunks, [&](io::ReadStreamList& local_streams) { // Run ProcessLibrary INFO("Running ProcessLibrary"); diff --git a/src/common/pipeline/partask_mpi.hpp b/src/common/pipeline/partask_mpi.hpp index d119a3ad30..089535d189 100644 --- a/src/common/pipeline/partask_mpi.hpp +++ b/src/common/pipeline/partask_mpi.hpp @@ -8,11 +8,11 @@ #pragma once #include "io/binary/binary.hpp" -#include "io/reads/read_stream_vector.hpp" #include "utils/parallel/openmp_wrapper.h" #include "utils/verify.hpp" #include "utils/logger/logger.hpp" +#include "utils/stl_utils.hpp" #include #include @@ -1628,6 +1628,15 @@ auto create_empty_stream_list(size_t size) { return streams; } +template +void execute_on_subset(StreamListType &all_streams, + F f) { + // Select streams + std::vector chunks = partask::chunks_rr(all_streams.size()); + INFO("Selected streams: " << chunks); + execute_on_subset(all_streams, chunks, std::move(f)); +} + template void execute_on_subset(StreamListType &all_streams, const std::vector &chunks, From ed7fe74798db1956c547e186c9ddffe9bb953996 Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Fri, 26 Feb 2021 13:53:20 +0500 Subject: [PATCH 023/102] Ensure the streams are fresh --- src/common/pipeline/partask_mpi.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/common/pipeline/partask_mpi.hpp b/src/common/pipeline/partask_mpi.hpp index 089535d189..329001c8ce 100644 --- a/src/common/pipeline/partask_mpi.hpp +++ b/src/common/pipeline/partask_mpi.hpp @@ -1643,7 +1643,9 @@ void execute_on_subset(StreamListType &all_streams, F f) { auto local_streams = partask::create_empty_stream_list(chunks.size()); partask::swap_streams(all_streams, local_streams, chunks); + local_streams.reset(); f(local_streams); + local_streams.close(); partask::swap_streams(all_streams, local_streams, chunks); } From 335c06f8f580d376a600ebe415dfbdd159f59bc7 Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Fri, 26 Feb 2021 16:12:11 +0500 Subject: [PATCH 024/102] Add MPI construction stage --- src/common/stages/CMakeLists.txt | 2 +- src/common/stages/construction.cpp | 35 +- src/common/stages/construction_mpi.cpp | 456 +++++++++++++++++++++++++ src/common/stages/construction_mpi.hpp | 25 ++ src/projects/spades/pipeline.cpp | 10 +- 5 files changed, 493 insertions(+), 35 deletions(-) create mode 100644 src/common/stages/construction_mpi.cpp create mode 100644 src/common/stages/construction_mpi.hpp diff --git a/src/common/stages/CMakeLists.txt b/src/common/stages/CMakeLists.txt index 69a7483a59..c6bf4cd057 100644 --- a/src/common/stages/CMakeLists.txt +++ b/src/common/stages/CMakeLists.txt @@ -10,7 +10,7 @@ project(stages CXX) set(stages_src read_conversion.cpp construction.cpp simplification.cpp ss_edge_split.cpp genomic_info_filler.cpp) if (MPI_FOUND) - set(stages_src 
${stages_src} test_mpi.cpp) + set(stages_src ${stages_src} test_mpi.cpp construction_mpi.cpp) endif() add_library(stages STATIC ${stages_src}) diff --git a/src/common/stages/construction.cpp b/src/common/stages/construction.cpp index 7c261ae653..5e209d1f1d 100644 --- a/src/common/stages/construction.cpp +++ b/src/common/stages/construction.cpp @@ -39,7 +39,7 @@ struct ConstructionStorage { fs::TmpDir workdir; }; -bool add_trusted_contigs(io::DataSet &libraries, +static bool add_trusted_contigs(io::DataSet &libraries, io::ReadStreamList &trusted_list) { std::vector trusted_contigs; for (size_t i = 0; i < libraries.lib_count(); ++i) { @@ -55,7 +55,7 @@ bool add_trusted_contigs(io::DataSet &libraries, return !trusted_contigs.empty(); } -void merge_read_streams(io::ReadStreamList &streams1, +static void merge_read_streams(io::ReadStreamList &streams1, io::ReadStreamList &streams2) { for (size_t i = 0; i < streams2.size(); ++i) { if (i < streams1.size()) { @@ -66,7 +66,7 @@ void merge_read_streams(io::ReadStreamList &streams1, } } -io::ReadStreamList temp_merge_read_streams(io::ReadStreamList &streams1, +static io::ReadStreamList temp_merge_read_streams(io::ReadStreamList &streams1, io::ReadStreamList &streams2) { io::ReadStreamList merge_stream_list; @@ -86,7 +86,7 @@ io::ReadStreamList temp_merge_read_streams(io::ReadStreamList -void add_additional_contigs_to_lib(std::filesystem::path path_to_additional_contigs_dir, size_t max_threads, +static void add_additional_contigs_to_lib(std::filesystem::path path_to_additional_contigs_dir, size_t max_threads, io::ReadStreamList &trusted_list) { io::SequencingLibraryT seq_lib; seq_lib.set_type(io::LibraryType::TrustedContigs); @@ -368,33 +368,6 @@ class GraphCondenser : public Construction::Phase { } }; -//FIXME unused? -class EdgeIndexFiller : public Construction::Phase { -public: - EdgeIndexFiller() - : Construction::Phase("Edge index filling", "initial_edge_index_filling") { } - - virtual ~EdgeIndexFiller() = default; - - void run(graph_pack::GraphPack &gp, const char*) override { - auto &index = gp.get_mutable>(); - index.Refill(); - index.Attach(); - } - - void load(graph_pack::GraphPack&, - const std::filesystem::path &, - const char*) override { - VERIFY_MSG(false, "implement me"); - } - - void save(const graph_pack::GraphPack&, - const std::filesystem::path &, - const char*) const override { - // VERIFY_MSG(false, "implement me"); - } -}; - class PHMCoverageFiller : public Construction::Phase { public: PHMCoverageFiller() diff --git a/src/common/stages/construction_mpi.cpp b/src/common/stages/construction_mpi.cpp new file mode 100644 index 0000000000..832be486a4 --- /dev/null +++ b/src/common/stages/construction_mpi.cpp @@ -0,0 +1,456 @@ +//*************************************************************************** +//* Copyright (c) 2023-2024 SPAdes team +//* Copyright (c) 2015-2022 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. 
+//*************************************************************************** + +#include "construction_mpi.hpp" + +#include "alignment/edge_index.hpp" +#include "assembly_graph/construction/early_simplification.hpp" +#include "io/dataset_support/dataset_readers.hpp" +#include "io/dataset_support/read_converter.hpp" +#include "io/reads/coverage_filtering_read_wrapper.hpp" +#include "io/reads/multifile_reader.hpp" +#include "kmer_index/ph_map/coverage_hash_map_builder.hpp" +#include "modules/graph_construction.hpp" +#include "pipeline/genomic_info.hpp" +#include "pipeline/graph_pack.hpp" +#include "utils/filesystem/temporary.hpp" + +namespace debruijn_graph { + +struct ConstructionStorage { + using CoverageMap = kmers::PerfectHashMap, kmers::DefaultStoring>; + + ConstructionStorage(unsigned k) + : ext_index(k) {} + + kmers::DeBruijnExtensionIndex<> ext_index; + + std::unique_ptr cqf; + std::unique_ptr> kmers; + std::unique_ptr coverage_map; + config::debruijn_config::construction params; + io::ReadStreamList read_streams; + io::ReadStreamList contigs_streams; + fs::TmpDir workdir; +}; + +static bool add_trusted_contigs(io::DataSet &libraries, + io::ReadStreamList &trusted_list) { + std::vector trusted_contigs; + for (size_t i = 0; i < libraries.lib_count(); ++i) { + auto& lib = libraries[i]; + if (lib.type() != io::LibraryType::TrustedContigs) + continue; + trusted_contigs.push_back(i); + } + + if (!trusted_contigs.empty()) { + trusted_list = io::single_binary_readers_for_libs(libraries, trusted_contigs, true, false); + } + return !trusted_contigs.empty(); +} + +static void merge_read_streams(io::ReadStreamList &streams1, + io::ReadStreamList &streams2) { + for (size_t i = 0; i < streams2.size(); ++i) { + if (i < streams1.size()) { + streams1[i] = io::MultifileWrap(std::move(streams1[i]), std::move(streams2[i])); + } else { + streams1.push_back(std::move(streams2[i])); + } + } +} + +static io::ReadStreamList temp_merge_read_streams(io::ReadStreamList &streams1, + io::ReadStreamList &streams2) { + io::ReadStreamList merge_stream_list; + + for (size_t i = 0; i < std::max(streams1.size(), streams2.size()); ++i) { + if (i < streams1.size() && i < streams2.size()) { + merge_stream_list.push_back(io::ScopedMultifileWrap(streams1[i], streams2[i])); + } else if (i < streams1.size()) { + merge_stream_list.push_back(io::ScopedMultifileWrap(streams1[i])); + } else { + merge_stream_list.push_back(io::ScopedMultifileWrap(streams2[i])); + } + } + + return merge_stream_list; +} + + + + +static void add_additional_contigs_to_lib(std::filesystem::path path_to_additional_contigs_dir, size_t max_threads, + io::ReadStreamList &trusted_list) { + io::SequencingLibraryT seq_lib; + seq_lib.set_type(io::LibraryType::TrustedContigs); + seq_lib.set_orientation(io::LibraryOrientation::Undefined); + seq_lib.data().lib_index = size_t(-1); + auto& bin_info = seq_lib.data().binary_reads_info; + bin_info.single_read_prefix = path_to_additional_contigs_dir / "contigs"; + bin_info.bin_reads_info_file = path_to_additional_contigs_dir / "contigs_info"; + bin_info.binary_converted = true; + bin_info.chunk_num = max_threads; + + io::ReadStreamList lib_streams = io::single_binary_readers(seq_lib, true, false); + merge_read_streams(trusted_list, lib_streams); +} + +void ConstructionMPI::init(graph_pack::GraphPack &gp, const char *) { + init_storage(unsigned(gp.k())); + + auto& dataset = cfg::get_writable().ds; + + // Has to be separate stream for not counting it in coverage + if (add_trusted_contigs(dataset.reads, 
storage().contigs_streams)) + INFO("Trusted contigs will be used in graph construction"); + + if (cfg::get().use_additional_contigs) { + INFO("Contigs from previous K will be used: " << cfg::get().additional_contigs); + add_additional_contigs_to_lib(cfg::get().additional_contigs, cfg::get().max_threads, storage().contigs_streams); + } + + // FIXME: indices here are awful + std::vector libs_for_construction; + for (size_t i = 0; i < dataset.reads.lib_count(); ++i) { + if (dataset.reads[i].is_graph_constructable()) { + libs_for_construction.push_back(i); + } + } + + storage().params = cfg::get().con; + storage().workdir = fs::tmp::make_temp_dir(gp.workdir(), "construction"); + //FIXME needs to be changed if we move to hash only filtering + storage().read_streams = io::single_binary_readers_for_libs(dataset.reads, libs_for_construction); + + //Updating dataset stats + VERIFY(dataset.RL == 0 && dataset.aRL == 0.); + size_t merged_max_len = 0; + uint64_t total_nucls = 0; + size_t read_count = 0; + for (size_t lib_id : libs_for_construction) { + auto lib_data = dataset.reads[lib_id].data(); + if (lib_data.unmerged_read_length == 0) { + FATAL_ERROR("Failed to determine read length for library #" << lib_data.lib_index << ". " + "Check that not only merged reads are present."); + } + dataset.no_merge_RL = std::max(dataset.no_merge_RL, lib_data.unmerged_read_length); + merged_max_len = std::max(merged_max_len, lib_data.merged_read_length); + total_nucls += dataset.reads[lib_id].data().total_nucls; + read_count += dataset.reads[lib_id].data().read_count; + } + + dataset.RL = std::max(dataset.no_merge_RL, merged_max_len); + INFO("Max read length " << dataset.RL); + + if (merged_max_len > 0) + INFO("Max read length without merged " << dataset.no_merge_RL); + + dataset.aRL = double(total_nucls) / double(read_count); + INFO("Average read length " << dataset.aRL); +} + +void ConstructionMPI::fini(graph_pack::GraphPack &) { + reset_storage(); +} + +ConstructionMPI::~ConstructionMPI() {} + +namespace { + +class CoverageFilter: public ConstructionMPI::Phase { + public: + CoverageFilter() + : ConstructionMPI::Phase("k-mer multiplicity estimation", "cqf_filter") { } + virtual ~CoverageFilter() = default; + + void run(graph_pack::GraphPack &, const char*) override { + auto &read_streams = storage().read_streams; + const auto &index = storage().ext_index; + using storing_type = decltype(storage().ext_index)::storing_type; + + VERIFY_MSG(read_streams.size(), "No input streams specified"); + + unsigned rthr = storage().params.read_cov_threshold; + + using KmerFilter = kmers::StoringTypeFilter; + + unsigned kplusone = index.k() + 1; + rolling_hash::SymmetricCyclicHash hasher(kplusone); + + INFO("Estimating k-mers cardinality"); + size_t kmers = EstimateCardinalityUpperBound(kplusone, read_streams, hasher, KmerFilter()); + + // Create main CQF using # of slots derived from estimated # of k-mers + storage().cqf.reset(new qf::cqf(kmers)); + + INFO("Building k-mer coverage histogram"); + FillCoverageHistogram(*storage().cqf, kplusone, hasher, read_streams, rthr, KmerFilter()); + + // Replace input streams with wrapper ones + storage().read_streams = io::CovFilteringWrap(std::move(read_streams), kplusone, hasher, *storage().cqf, rthr); + } + + void load(graph_pack::GraphPack&, + const std::filesystem::path &, + const char*) override { + VERIFY_MSG(false, "implement me"); + } + +void save(const graph_pack::GraphPack&, + const std::filesystem::path &, + const char*) const override { + // VERIFY_MSG(false, "implement me"); 
+ } + +}; + + +class KMerCounting : public ConstructionMPI::Phase { + typedef rolling_hash::SymmetricCyclicHash<> SeqHasher; +public: + KMerCounting() + : ConstructionMPI::Phase("k+1-mer counting", "kpomer_counting") { } + + virtual ~KMerCounting() = default; + + void run(graph_pack::GraphPack &, const char*) override { + auto &read_streams = storage().read_streams; + auto &contigs_streams = storage().contigs_streams; + const auto &index = storage().ext_index; + size_t buffer_size = storage().params.read_buffer_size; + using storing_type = decltype(storage().ext_index)::storing_type; + + VERIFY_MSG(read_streams.size(), "No input streams specified"); + + + io::ReadStreamList merge_streams = temp_merge_read_streams(read_streams, contigs_streams); + + unsigned nthreads = (unsigned)merge_streams.size(); + using Splitter = kmers::DeBruijnReadKMerSplitter>; + + kmers::KMerDiskCounter + counter(storage().workdir, + Splitter(storage().workdir, index.k() + 1, merge_streams, buffer_size)); + auto kmers = counter.Count(10 * nthreads, nthreads); + storage().kmers.reset(new kmers::KMerDiskStorage(std::move(kmers))); + } + + void load(graph_pack::GraphPack&, + const std::filesystem::path &, + const char*) override { + VERIFY_MSG(false, "implement me"); + } + + void save(const graph_pack::GraphPack&, + const std::filesystem::path &, + const char*) const override { + // VERIFY_MSG(false, "implement me"); + } +}; + +class ExtensionIndexBuilder : public ConstructionMPI::Phase { +public: + ExtensionIndexBuilder() + : ConstructionMPI::Phase("Extension index construction", "extension_index_construction") { } + + virtual ~ExtensionIndexBuilder() = default; + + void run(graph_pack::GraphPack &, const char*) override { + // FIXME: We just need files here, not the full counter. Implement refererence counting scheme! 
+ kmers::DeBruijnExtensionIndexBuilder().BuildExtensionIndexFromKPOMers(storage().workdir, + storage().ext_index, + *storage().kmers, + unsigned(storage().read_streams.size()), + storage().params.read_buffer_size); + } + + void load(graph_pack::GraphPack&, + const std::filesystem::path &, + const char*) override { + VERIFY_MSG(false, "implement me"); + } + + void save(const graph_pack::GraphPack&, + const std::filesystem::path &, + const char*) const override { + // VERIFY_MSG(false, "implement me"); + } +}; + + +class EarlyTipClipper : public ConstructionMPI::Phase { +public: + EarlyTipClipper() + : ConstructionMPI::Phase("Early tip clipping", "early_tip_clipper") { } + + virtual ~EarlyTipClipper() = default; + + void run(graph_pack::GraphPack &gp, const char*) override { + if (!storage().params.early_tc.length_bound) { + INFO("Early tip clipper length bound set as (RL - K)"); + storage().params.early_tc.length_bound = cfg::get().ds.RL - gp.k(); + } + EarlyTipClipperProcessor(storage().ext_index, *storage().params.early_tc.length_bound).ClipTips(); + } + + void load(graph_pack::GraphPack&, + const std::filesystem::path &, + const char*) override { + VERIFY_MSG(false, "implement me"); + } + + void save(const graph_pack::GraphPack&, + const std::filesystem::path &, + const char*) const override { + // VERIFY_MSG(false, "implement me"); + } +}; + +class EarlyATClipper : public ConstructionMPI::Phase { +public: + EarlyATClipper() + : ConstructionMPI::Phase("Early A/T remover", "early_at_remover") { } + + virtual ~EarlyATClipper() = default; + + void run(graph_pack::GraphPack &, const char*) override { + EarlyLowComplexityClipperProcessor at_processor(storage().ext_index, 0.8, 10, 200); + at_processor.RemoveATEdges(); + at_processor.RemoveATTips(); + } + + void load(graph_pack::GraphPack&, + const std::filesystem::path &, + const char*) override { + VERIFY_MSG(false, "implement me"); + } + + void save(const graph_pack::GraphPack&, + const std::filesystem::path &, + const char*) const override { + // VERIFY_MSG(false, "implement me"); + } +}; + +class GraphCondenser : public ConstructionMPI::Phase { +public: + GraphCondenser() + : ConstructionMPI::Phase("Condensing graph", "graph_condensing") { } + + virtual ~GraphCondenser() = default; + + void run(graph_pack::GraphPack &gp, const char*) override { + auto &index = gp.get_mutable>(); + if (index.IsAttached()) + index.Detach(); + DeBruijnGraphExtentionConstructor(gp.get_mutable(), storage().ext_index).ConstructGraph(storage().params.keep_perfect_loops); + } + + void load(graph_pack::GraphPack&, + const std::filesystem::path &, + const char*) override { + VERIFY_MSG(false, "implement me"); + } + + void save(const graph_pack::GraphPack&, + const std::filesystem::path &, + const char*) const override { + //FIXME why commented here and others + // VERIFY_MSG(false, "implement me"); + } +}; + +class PHMCoverageFiller : public ConstructionMPI::Phase { +public: + PHMCoverageFiller() + : ConstructionMPI::Phase("Filling coverage indices (PHM)", "coverage_filling_phm") {} + virtual ~PHMCoverageFiller() = default; + + void run(graph_pack::GraphPack &gp, const char *) override { + storage().coverage_map.reset(new ConstructionStorage::CoverageMap(storage().kmers->k())); + auto &coverage_map = *storage().coverage_map; + + kmers::CoverageHashMapBuilder().BuildIndex(coverage_map, + *storage().kmers, + storage().read_streams); + /* + INFO("Checking the PHM"); + + auto &index = gp.index.inner_index(); + for (auto I = index.value_cbegin(), E = index.value_cend(); 
+ I != E; ++I) { + const auto& edge_info = *I; + + Sequence sk = gp.g.EdgeNucls(edge_info.edge_id).Subseq(edge_info.offset, edge_info.offset + index.k()); + auto kwh = coverage_map.ConstructKWH(sk.start(index.k())); + + uint32_t cov = coverage_map.get_value(kwh, utils::InvertableStoring::trivial_inverter()); + if (edge_info.count != cov) + INFO("" << kwh << ":" << edge_info.count << ":" << cov); + } */ + + INFO("Filling coverage and flanking coverage from PHM"); + FillCoverageAndFlankingFromPHM(coverage_map, + gp.get_mutable(), gp.get_mutable>()); + + std::vector hist; + size_t maxcov = 0; + size_t kmer_per_record = 1; + if (EdgeIndex::IsInvertable()) + kmer_per_record = 2; + + for (auto I = coverage_map.value_cbegin(), E = coverage_map.value_cend(); I != E; ++I) { + size_t ccov = *I; + if (!ccov) + continue; + maxcov = std::max(ccov, maxcov); + if (maxcov > hist.size()) + hist.resize(maxcov, 0); + hist[ccov - 1] += kmer_per_record; + } + + gp.get_mutable().set_cov_histogram(hist); + } + + void load(graph_pack::GraphPack&, + const std::filesystem::path &, + const char*) override { + VERIFY_MSG(false, "implement me"); + } + + void save(const graph_pack::GraphPack&, + const std::filesystem::path &, + const char*) const override { + // VERIFY_MSG(false, "implement me"); + } + +}; + +} // namespace + +ConstructionMPI::ConstructionMPI() + : spades::MPICompositeStageDeferred("de Bruijn graph construction", "construction") { + if (cfg::get().con.read_cov_threshold) + add(); + + add(); + + add(); + if (config::PipelineHelper::IsRNAPipeline(cfg::get().mode)) + add(); + if (cfg::get().con.early_tc.enable && !cfg::get().gap_closer_enable) + add(); + add(); + add(); +} + + +} //namespace debruijn_graph diff --git a/src/common/stages/construction_mpi.hpp b/src/common/stages/construction_mpi.hpp new file mode 100644 index 0000000000..b3e2d1279c --- /dev/null +++ b/src/common/stages/construction_mpi.hpp @@ -0,0 +1,25 @@ +//*************************************************************************** +//* Copyright (c) 2021 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. 
+//*************************************************************************** + +#pragma once + +#include "pipeline/mpi_stage.hpp" + +namespace debruijn_graph { + +struct ConstructionStorage; + +class ConstructionMPI : public spades::MPICompositeStageDeferred { +public: + ConstructionMPI(); + ~ConstructionMPI(); + + void init(graph_pack::GraphPack &gp, const char *) override; + void fini(graph_pack::GraphPack &gp) override; +}; + +} + diff --git a/src/projects/spades/pipeline.cpp b/src/projects/spades/pipeline.cpp index 62d4e87dfa..e208f39503 100644 --- a/src/projects/spades/pipeline.cpp +++ b/src/projects/spades/pipeline.cpp @@ -29,6 +29,7 @@ #include "stages/genomic_info_filler.hpp" #include "stages/read_conversion.hpp" #include "stages/construction.hpp" +#include "stages/construction_mpi.hpp" #include "stages/simplification.hpp" #include "stages/ss_edge_split.hpp" #include "configs/config_struct.hpp" @@ -186,11 +187,14 @@ static void AddSimplificationStages(StageManager &SPAdes) { SPAdes.add(); } -static void AddConstructionStages(StageManager &SPAdes) { +static void AddConstructionStages(StageManager &SPAdes, bool mpi = false) { using namespace debruijn_graph::config; pipeline_type mode = cfg::get().mode; - SPAdes.add(); + if (mpi) + SPAdes.add(); + else + SPAdes.add(); if (!PipelineHelper::IsMetagenomicPipeline(mode)) SPAdes.add(); } @@ -265,7 +269,7 @@ void assemble_genome(bool mpi = false) { SPAdes->add(); if (!AssemblyGraphPresent()) { - AddConstructionStages(*SPAdes); + AddConstructionStages(*SPAdes, mpi); if (cfg::get().sewage) SPAdes->add(); From dfd6f079ed18c5c41809f27e75216b1d0dc69eb9 Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Sat, 27 Feb 2021 12:50:05 +0500 Subject: [PATCH 025/102] First real MPI construction step: collect k-mer coverage in parallel --- src/common/stages/construction_mpi.cpp | 88 +++++++++++++++++++------- 1 file changed, 64 insertions(+), 24 deletions(-) diff --git a/src/common/stages/construction_mpi.cpp b/src/common/stages/construction_mpi.cpp index 832be486a4..eed59c7bee 100644 --- a/src/common/stages/construction_mpi.cpp +++ b/src/common/stages/construction_mpi.cpp @@ -18,6 +18,8 @@ #include "modules/graph_construction.hpp" #include "pipeline/genomic_info.hpp" #include "pipeline/graph_pack.hpp" +#include "pipeline/mpi_stage.hpp" +#include "pipeline/partask_mpi.hpp" #include "utils/filesystem/temporary.hpp" namespace debruijn_graph { @@ -127,9 +129,11 @@ void ConstructionMPI::init(graph_pack::GraphPack &gp, const char *) { storage().params = cfg::get().con; storage().workdir = fs::tmp::make_temp_dir(gp.workdir(), "construction"); //FIXME needs to be changed if we move to hash only filtering - storage().read_streams = io::single_binary_readers_for_libs(dataset.reads, libs_for_construction); + size_t num_readers = partask::overall_num_threads(); + storage().read_streams = io::single_binary_readers_for_libs(dataset.reads, libs_for_construction, true, true, num_readers); + INFO("Overall number of readers (actual): " << storage().read_streams.size()); - //Updating dataset stats + // Updating dataset stats VERIFY(dataset.RL == 0 && dataset.aRL == 0.); size_t merged_max_len = 0; uint64_t total_nucls = 0; @@ -138,7 +142,7 @@ void ConstructionMPI::init(graph_pack::GraphPack &gp, const char *) { auto lib_data = dataset.reads[lib_id].data(); if (lib_data.unmerged_read_length == 0) { FATAL_ERROR("Failed to determine read length for library #" << lib_data.lib_index << ". 
" - "Check that not only merged reads are present."); + "Check that not only merged reads are present."); } dataset.no_merge_RL = std::max(dataset.no_merge_RL, lib_data.unmerged_read_length); merged_max_len = std::max(merged_max_len, lib_data.merged_read_length); @@ -232,7 +236,7 @@ class KMerCounting : public ConstructionMPI::Phase { io::ReadStreamList merge_streams = temp_merge_read_streams(read_streams, contigs_streams); - unsigned nthreads = (unsigned)merge_streams.size(); + unsigned nthreads = cfg::get().max_threads; using Splitter = kmers::DeBruijnReadKMerSplitter>; @@ -268,7 +272,7 @@ class ExtensionIndexBuilder : public ConstructionMPI::Phase { kmers::DeBruijnExtensionIndexBuilder().BuildExtensionIndexFromKPOMers(storage().workdir, storage().ext_index, *storage().kmers, - unsigned(storage().read_streams.size()), + cfg::get().max_threads, storage().params.read_buffer_size); } @@ -368,34 +372,70 @@ class GraphCondenser : public ConstructionMPI::Phase { } }; +template +class CollectKMerCoverageTask { + using ReadStreams = io::ReadStreamList; + +public: + CollectKMerCoverageTask() = default; + CollectKMerCoverageTask(std::istream &is) { deserialize(is); } + std::ostream &serialize(std::ostream &os) const { return os; } + std::istream &deserialize(std::istream &is) { return is; } + + auto make_splitter(size_t, ReadStreams &read_streams, Index &) { + return partask::make_seq_along_generator(read_streams); + } + + void process(std::istream &is, std::ostream &, ReadStreams &read_streams, Index &index) { + auto chunks = partask::get_seq(is); + if (!chunks.size()) + return; + + INFO("Selected streams: " << chunks); + partask::execute_on_subset(read_streams, chunks, + [&](ReadStreams& local_streams) { + # pragma omp parallel for + for (size_t i = 0; i < local_streams.size(); ++i) + kmers::CoverageHashMapBuilder().FillCoverageFromStream(local_streams[i], index); + }); + } + + void sync(ReadStreams & /*read_streams*/, Index &index) { + auto &values = index.values(); + partask::allreduce(values.data(), values.size(), MPI_SUM); + } +}; + + class PHMCoverageFiller : public ConstructionMPI::Phase { public: PHMCoverageFiller() : ConstructionMPI::Phase("Filling coverage indices (PHM)", "coverage_filling_phm") {} virtual ~PHMCoverageFiller() = default; + bool distributed() const override { return true; } + void run(graph_pack::GraphPack &gp, const char *) override { + if (!storage().kmers) + storage().kmers.reset(new kmers::KMerDiskStorage()); + partask::broadcast(*storage().kmers); + storage().coverage_map.reset(new ConstructionStorage::CoverageMap(storage().kmers->k())); auto &coverage_map = *storage().coverage_map; - - kmers::CoverageHashMapBuilder().BuildIndex(coverage_map, - *storage().kmers, - storage().read_streams); - /* - INFO("Checking the PHM"); - - auto &index = gp.index.inner_index(); - for (auto I = index.value_cbegin(), E = index.value_cend(); - I != E; ++I) { - const auto& edge_info = *I; - - Sequence sk = gp.g.EdgeNucls(edge_info.edge_id).Subseq(edge_info.offset, edge_info.offset + index.k()); - auto kwh = coverage_map.ConstructKWH(sk.start(index.k())); - - uint32_t cov = coverage_map.get_value(kwh, utils::InvertableStoring::trivial_inverter()); - if (edge_info.count != cov) - INFO("" << kwh << ":" << edge_info.count << ":" << cov); - } */ + auto &streams = storage().read_streams; + + unsigned nthreads = cfg::get().max_threads; + kmers::PerfectHashMapBuilder().BuildIndex(coverage_map, *storage().kmers, nthreads); + + INFO("Collecting k-mer coverage information from reads, this takes a 
while."); + { + partask::TaskRegistry treg; + auto fill_kmer_coverage = treg.add>(std::ref(streams), std::ref(coverage_map)); + treg.listen(); + if (partask::master()) + fill_kmer_coverage(); + treg.stop_listening(); + } INFO("Filling coverage and flanking coverage from PHM"); FillCoverageAndFlankingFromPHM(coverage_map, From 7739944d3513534fbba9e71d6869d3ec83918f22 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Wed, 4 Aug 2021 16:03:21 +0300 Subject: [PATCH 026/102] add comment to DistanceEstimation --- src/projects/spades/distance_estimation.cpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/projects/spades/distance_estimation.cpp b/src/projects/spades/distance_estimation.cpp index 4fc55e9b0f..c8f6f20384 100644 --- a/src/projects/spades/distance_estimation.cpp +++ b/src/projects/spades/distance_estimation.cpp @@ -21,13 +21,31 @@ namespace debruijn_graph { +/* + * Input: raw_paired_indices -- the map from pairs of edges to histogram of estimated distance between them. + * Output: clustered_indices -- the map from pairs of edges to histogram of distance, but now clustering + * the initial histogram and pick only potential distance according to raw_paired_indices + * and information from graph + * scaffolding_indices -- the map from pairs of edges to thinned out histogram of distance in which only + * picks in histogram are selected + * + * Need this histogram for edges which occur more then one time or for find out how much time we need to repeat the loop. + */ void DistanceEstimation::run(graph_pack::GraphPack &gp, const char*) { using namespace omnigraph::de; using namespace distance_estimation; const config::debruijn_config& config = cfg::get(); const auto &graph = gp.get(); + + /* paired_indices -- conceptually, a map from a pair of edges to a histogram of distances between them. + * In fact, map from edge to a map from edge to histogram of the distance between them. + * For four pairs of direct and conjugate pairs(e1-e2; conj(e1)-e2; e1-conj(e2); conj(e1)-conj(e2)) store histogram + * only for one of them. Take paired_indices as a input, already filled at that moment. 
+ */ auto &paired_indices = gp.get_mutable>(); + + /* Output of that stage, need to fill clustered_indices and scaffolding_indices */ auto &clustered_indices = gp.get_mutable>("clustered_indices"); auto &scaffolding_indices = gp.get_mutable>("scaffolding_indices"); size_t max_repeat_length = From 60b8dd1068d7e47db113609678c452e7959aa85d Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Thu, 5 Aug 2021 11:52:29 +0300 Subject: [PATCH 027/102] BinRead/Write for PairedIndices --- .../paired_info/distance_estimation.cpp | 24 +++++ .../paired_info/distance_estimation.hpp | 96 ++++++++++++++++++- 2 files changed, 119 insertions(+), 1 deletion(-) diff --git a/src/common/paired_info/distance_estimation.cpp b/src/common/paired_info/distance_estimation.cpp index 6304f9d2a3..89308f09d3 100644 --- a/src/common/paired_info/distance_estimation.cpp +++ b/src/common/paired_info/distance_estimation.cpp @@ -8,6 +8,7 @@ #include "distance_estimation.hpp" #include "assembly_graph/paths/path_processor.hpp" +#include "pipeline/partask_mpi.hpp" namespace omnigraph { namespace de { @@ -181,5 +182,28 @@ void DistanceEstimator::ProcessEdge(EdgeId e1, const InPairedIndex &pi, PairedIn this->AddToResult(res, ep, result); } } + +void DistanceEstimatorMPI::Estimate(PairedInfoIndexT &result, size_t nthreads) const { + this->Init(); + const auto &index = this->index(); + + DEBUG("Collecting edge infos"); + std::vector edges; + for (EdgeId e : this->graph().edges()) + edges.push_back(e); + + DEBUG("Processing"); + PairedInfoBuffersT buffer(this->graph(), nthreads); +# pragma omp parallel for num_threads(nthreads) schedule(guided, 10) + for (size_t i = 0; i < edges.size(); ++i) { + EdgeId edge = edges[i]; + ProcessEdge(edge, index, buffer[omp_get_thread_num()]); + } + + for (size_t i = 0; i < nthreads; ++i) { + result.Merge(buffer[i]); + buffer[i].clear(); + } +} } } diff --git a/src/common/paired_info/distance_estimation.hpp b/src/common/paired_info/distance_estimation.hpp index 9755d25d50..bf74de5000 100644 --- a/src/common/paired_info/distance_estimation.hpp +++ b/src/common/paired_info/distance_estimation.hpp @@ -16,6 +16,7 @@ #include "utils/parallel/openmp_wrapper.h" #include "math/xmath.h" +#include "pipeline/partask_mpi.hpp" namespace omnigraph { @@ -125,11 +126,11 @@ class DistanceEstimator : public AbstractDistanceEstimator { const InHistogram &histogram, const GraphLengths &raw_forward) const; -private: virtual void ProcessEdge(debruijn_graph::EdgeId e1, const InPairedIndex &pi, PairedInfoBuffer &result) const; + private: virtual const std::string Name() const { static const std::string my_name = "SIMPLE"; return my_name; @@ -138,6 +139,99 @@ class DistanceEstimator : public AbstractDistanceEstimator { DECL_LOGGER("DistanceEstimator"); }; +class DistanceEstimatorMPI : public DistanceEstimator { + typedef DistanceEstimator base; + typedef std::vector GraphLengths; + typedef std::vector> EstimHist; + typedef std::pair EdgePair; + + protected: + typedef typename base::InPairedIndex InPairedIndex; + typedef typename base::OutPairedIndex OutPairedIndex; + typedef typename base::InHistogram InHistogram; + typedef typename base::OutHistogram OutHistogram; + + public: + DistanceEstimatorMPI(const debruijn_graph::Graph &graph, + const InPairedIndex &index, + const GraphDistanceFinder &distance_finder, + size_t linkage_distance, size_t max_distance) + : base(graph, index, distance_finder, linkage_distance, max_distance) { } + + virtual ~DistanceEstimatorMPI() = default; + + class DistanceEstimatorTask { + 
DistanceEstimatorTask() = default; + public: + DistanceEstimatorTask(std::vector &edges, + unsigned int nthreads) : edges_(edges), nthreads_(nthreads) {}; + + DistanceEstimatorTask(std::istream &is) { + io::binary::BinRead(is, edges_, nthreads_); + + } + + std::ostream &serialize(std::ostream &os) const { + io::binary::BinWrite(os, edges_, nthreads_); + return os; + } + + auto make_splitter(size_t, const InPairedIndex &, const DistanceEstimatorMPI &, + PairedInfoIndexT & /*result*/) { + return partask::make_seq_along_generator(edges_); + } + + void process(std::istream &is, std::ostream &os, const InPairedIndex &index, + const DistanceEstimatorMPI &self, PairedInfoIndexT & /*result*/) { + DEBUG("Processing"); + auto edges_id = partask::get_seq(is); + + PairedInfoBuffersT buffer(self.graph(), nthreads_); + # pragma omp parallel for num_threads(nthreads_) schedule(guided, 10) + for (size_t i = 0; i < edges_id.size(); ++i) { + debruijn_graph::EdgeId edge = edges_[edges_id[i]]; + self.ProcessEdge(edge, index, buffer[omp_get_thread_num()]); + } + + buffer.BinWrite(os); + buffer.Clear(); + } + + auto merge(const std::vector &piss, + const InPairedIndex &index, + const DistanceEstimatorMPI &self, + PairedInfoIndexT &result) { + for (auto pis : piss) { + PairedInfoBuffersT buffer(self.graph(), nthreads_); + buffer.BinRead(*pis); + for (size_t j = 0; j < nthreads_; ++j) { + result.Merge(buffer[j]); + buffer[j].clear(); + } + } + } + + private: + std::vector edges_; + unsigned nthreads_; + }; + + void Init() const { + INFO("Using " << this->Name() << " distance estimator"); + } + + virtual void Estimate(OutPairedIndex &result, size_t nthreads) const; + + friend DistanceEstimatorTask; + private: + virtual const std::string Name() const { + static const std::string my_name = "SIMPLE_MPI"; + return my_name; + } + + DECL_LOGGER("DistanceEstimatorMPI"); +}; + } } From e00a2a85c4b40bca9fe41fa37c69e900d97ce1b9 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Thu, 5 Aug 2021 12:49:55 +0300 Subject: [PATCH 028/102] Distance Estimator MPI --- .../paired_info/distance_estimation.cpp | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/src/common/paired_info/distance_estimation.cpp b/src/common/paired_info/distance_estimation.cpp index 89308f09d3..399014f97c 100644 --- a/src/common/paired_info/distance_estimation.cpp +++ b/src/common/paired_info/distance_estimation.cpp @@ -27,7 +27,7 @@ std::vector GraphDistanceFinder::GetGraphDistancesLengths(EdgeId e1, Edg void GraphDistanceFinder::FillGraphDistancesLengths(EdgeId e1, LengthMap &second_edges) const { std::vector path_lower_bounds; size_t path_upper_bound = PairInfoPathLengthUpperBound(graph_.k(), insert_size_, delta_); - PathProcessor paths_proc(graph_, graph_.EdgeEnd(e1), path_upper_bound); + PathProcessor paths_proc(graph_, graph_.EdgeEnd(e1), path_upper_bound); for (auto &entry : second_edges) { EdgeId e2 = entry.first; @@ -65,7 +65,7 @@ AbstractDistanceEstimator::OutHistogram AbstractDistanceEstimator::ClusterResult size_t left = i; DEWeight weight = DEWeight(estimated[i].second); while (i + 1 < estimated.size() && - (estimated[i + 1].first - estimated[i].first) <= (int) linkage_distance_) { + (estimated[i + 1].first - estimated[i].first) <= (int) linkage_distance_) { ++i; weight += estimated[i].second; } @@ -77,11 +77,11 @@ AbstractDistanceEstimator::OutHistogram AbstractDistanceEstimator::ClusterResult } void AbstractDistanceEstimator::AddToResult(const OutHistogram &clustered, EdgePair ep, - PairedInfoBuffer 
&result) const { + PairedInfoBuffer &result) const { result.AddMany(ep.first, ep.second, clustered); } -void DistanceEstimator::Estimate(PairedInfoIndexT &result, size_t nthreads) const { +void DistanceEstimator::Estimate(PairedInfoIndexT &result, size_t nthreads) const { this->Init(); const auto &index = this->index(); @@ -139,7 +139,7 @@ DistanceEstimator::EstimHist DistanceEstimator::EstimateEdgePairDistances(EdgePa if (le(abs(forward[cur_dist] - point.d), max_distance_)) weights[cur_dist] += point.weight; } else if (cur_dist + 1 < forward.size() && - eq(forward[cur_dist + 1] - point.d, point.d - forward[cur_dist])) { + eq(forward[cur_dist + 1] - point.d, point.d - forward[cur_dist])) { if (le(abs(forward[cur_dist] - point.d), max_distance_)) weights[cur_dist] += point.weight * 0.5; ++cur_dist; @@ -183,7 +183,7 @@ void DistanceEstimator::ProcessEdge(EdgeId e1, const InPairedIndex &pi, PairedIn } } -void DistanceEstimatorMPI::Estimate(PairedInfoIndexT &result, size_t nthreads) const { +void DistanceEstimatorMPI::Estimate(PairedInfoIndexT &result, size_t nthreads) const { this->Init(); const auto &index = this->index(); @@ -192,18 +192,15 @@ void DistanceEstimatorMPI::Estimate(PairedInfoIndexT &result, size_t nthr for (EdgeId e : this->graph().edges()) edges.push_back(e); - DEBUG("Processing"); - PairedInfoBuffersT buffer(this->graph(), nthreads); -# pragma omp parallel for num_threads(nthreads) schedule(guided, 10) - for (size_t i = 0; i < edges.size(); ++i) { - EdgeId edge = edges[i]; - ProcessEdge(edge, index, buffer[omp_get_thread_num()]); - } + partask::TaskRegistry treg; + auto dist_estimator_mpi = treg.add(std::cref(index), std::cref(*this), std::ref(result)); + treg.listen(); - for (size_t i = 0; i < nthreads; ++i) { - result.Merge(buffer[i]); - buffer[i].clear(); + if (partask::master()) { + dist_estimator_mpi(edges, nthreads); } + treg.stop_listening(); + partask::broadcast(result); } } } From 7d193d986d3d5c6eaf98b5ea4551abd56b75ceab Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Thu, 5 Aug 2021 13:00:36 +0300 Subject: [PATCH 029/102] Distance Estimator MPI stage --- src/common/paired_info/distance_estimation_utils.cpp | 2 +- src/projects/spades/distance_estimation.hpp | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/common/paired_info/distance_estimation_utils.cpp b/src/common/paired_info/distance_estimation_utils.cpp index c67c8b62eb..ce7d9e065f 100644 --- a/src/common/paired_info/distance_estimation_utils.cpp +++ b/src/common/paired_info/distance_estimation_utils.cpp @@ -132,7 +132,7 @@ void EstimatePairedDistances(PairedInfoIndexT &clustered_index, INFO("Weight Filter Done"); - DistanceEstimator estimator(graph, paired_index, dist_finder, linkage_distance, max_distance); + DistanceEstimatorMPI estimator(graph, paired_index, dist_finder, linkage_distance, max_distance); EstimateWithEstimator(clustered_index, estimator, checker); diff --git a/src/projects/spades/distance_estimation.hpp b/src/projects/spades/distance_estimation.hpp index c5540d11b3..66e887c785 100644 --- a/src/projects/spades/distance_estimation.hpp +++ b/src/projects/spades/distance_estimation.hpp @@ -8,18 +8,18 @@ #pragma once +#include #include "pipeline/stage.hpp" namespace debruijn_graph { -class DistanceEstimation : public spades::AssemblyStage { - public: +class DistanceEstimation : public spades::MPIAssemblyStage { + public: DistanceEstimation(bool preliminary = false) - : AssemblyStage(preliminary ? 
"Preliminary Distance Estimation" : "Distance Estimation", - preliminary ? "distance_estimation_preliminary" : "distance_estimation") {} + : MPIAssemblyStage(preliminary ? "Preliminary Distance Estimation" : "Distance Estimation", + preliminary ? "distance_estimation_preliminary" : "distance_estimation") {} void run(graph_pack::GraphPack &gp, const char*) override; }; - } From a08fd05539e191491f5bb2ff41e483fa904f8865 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Thu, 5 Aug 2021 16:49:32 +0300 Subject: [PATCH 030/102] DistanceEstimator MPI wrapper --- .../paired_info/distance_estimation.cpp | 2 +- .../paired_info/distance_estimation.hpp | 53 ++++++++++--------- .../paired_info/distance_estimation_utils.cpp | 7 ++- src/common/paired_info/paired_info.hpp | 4 +- .../smoothing_distance_estimation.hpp | 8 +-- .../weighted_distance_estimation.hpp | 8 +-- 6 files changed, 43 insertions(+), 39 deletions(-) diff --git a/src/common/paired_info/distance_estimation.cpp b/src/common/paired_info/distance_estimation.cpp index 399014f97c..1f8b83a09a 100644 --- a/src/common/paired_info/distance_estimation.cpp +++ b/src/common/paired_info/distance_estimation.cpp @@ -193,7 +193,7 @@ void DistanceEstimatorMPI::Estimate(PairedInfoIndexT &result, size_t nthr edges.push_back(e); partask::TaskRegistry treg; - auto dist_estimator_mpi = treg.add(std::cref(index), std::cref(*this), std::ref(result)); + auto dist_estimator_mpi = treg.add(std::cref(index), std::cref(dist_estimator_), std::ref(result)); treg.listen(); if (partask::master()) { diff --git a/src/common/paired_info/distance_estimation.hpp b/src/common/paired_info/distance_estimation.hpp index bf74de5000..d76798e74b 100644 --- a/src/common/paired_info/distance_estimation.hpp +++ b/src/common/paired_info/distance_estimation.hpp @@ -53,7 +53,7 @@ class AbstractDistanceEstimator { typedef typename InPairedIndex::HistProxy InHistogram; typedef typename OutPairedIndex::Histogram OutHistogram; -public: + public: AbstractDistanceEstimator(const debruijn_graph::Graph &graph, const InPairedIndex &index, const GraphDistanceFinder &distance_finder, @@ -63,6 +63,10 @@ class AbstractDistanceEstimator { virtual void Estimate(PairedInfoIndexT &result, size_t nthreads) const = 0; + virtual const std::string Name() const = 0; + + const debruijn_graph::Graph &graph() const { return graph_; } + virtual ~AbstractDistanceEstimator() { } protected: @@ -71,8 +75,6 @@ class AbstractDistanceEstimator { typedef std::vector GraphLengths; typedef std::map LengthMap; - const debruijn_graph::Graph &graph() const { return graph_; } - const InPairedIndex &index() const { return index_; } void FillGraphDistancesLengths(debruijn_graph::EdgeId e1, LengthMap &second_edges) const; @@ -87,8 +89,6 @@ class AbstractDistanceEstimator { const GraphDistanceFinder &distance_finder_; const size_t linkage_distance_; - virtual const std::string Name() const = 0; - DECL_LOGGER("AbstractDistanceEstimator"); }; @@ -104,7 +104,7 @@ class DistanceEstimator : public AbstractDistanceEstimator { typedef typename base::InHistogram InHistogram; typedef typename base::OutHistogram OutHistogram; -public: + public: DistanceEstimator(const debruijn_graph::Graph &graph, const InPairedIndex &index, const GraphDistanceFinder &distance_finder, @@ -119,6 +119,15 @@ class DistanceEstimator : public AbstractDistanceEstimator { virtual void Estimate(OutPairedIndex &result, size_t nthreads) const; + virtual const std::string Name() const { + static const std::string my_name = "SIMPLE"; + return my_name; + } + + virtual 
void ProcessEdge(debruijn_graph::EdgeId e1, + const InPairedIndex &pi, + PairedInfoBuffer &result) const; + protected: const DEDistance max_distance_; @@ -126,16 +135,6 @@ class DistanceEstimator : public AbstractDistanceEstimator { const InHistogram &histogram, const GraphLengths &raw_forward) const; - virtual void ProcessEdge(debruijn_graph::EdgeId e1, - const InPairedIndex &pi, - PairedInfoBuffer &result) const; - - private: - virtual const std::string Name() const { - static const std::string my_name = "SIMPLE"; - return my_name; - } - DECL_LOGGER("DistanceEstimator"); }; @@ -153,10 +152,11 @@ class DistanceEstimatorMPI : public DistanceEstimator { public: DistanceEstimatorMPI(const debruijn_graph::Graph &graph, - const InPairedIndex &index, - const GraphDistanceFinder &distance_finder, - size_t linkage_distance, size_t max_distance) - : base(graph, index, distance_finder, linkage_distance, max_distance) { } + const InPairedIndex &index, + const GraphDistanceFinder &distance_finder, + size_t linkage_distance, size_t max_distance, + const DistanceEstimator& base_dist_estimator) + : base(graph, index, distance_finder, linkage_distance, max_distance), dist_estimator_(base_dist_estimator) {} virtual ~DistanceEstimatorMPI() = default; @@ -176,16 +176,15 @@ class DistanceEstimatorMPI : public DistanceEstimator { return os; } - auto make_splitter(size_t, const InPairedIndex &, const DistanceEstimatorMPI &, + auto make_splitter(size_t, const InPairedIndex &, const DistanceEstimator&, PairedInfoIndexT & /*result*/) { return partask::make_seq_along_generator(edges_); } void process(std::istream &is, std::ostream &os, const InPairedIndex &index, - const DistanceEstimatorMPI &self, PairedInfoIndexT & /*result*/) { + const DistanceEstimator& self, PairedInfoIndexT & /*result*/) { DEBUG("Processing"); auto edges_id = partask::get_seq(is); - PairedInfoBuffersT buffer(self.graph(), nthreads_); # pragma omp parallel for num_threads(nthreads_) schedule(guided, 10) for (size_t i = 0; i < edges_id.size(); ++i) { @@ -198,8 +197,8 @@ class DistanceEstimatorMPI : public DistanceEstimator { } auto merge(const std::vector &piss, - const InPairedIndex &index, - const DistanceEstimatorMPI &self, + const InPairedIndex&, + const DistanceEstimator& self, PairedInfoIndexT &result) { for (auto pis : piss) { PairedInfoBuffersT buffer(self.graph(), nthreads_); @@ -224,8 +223,10 @@ class DistanceEstimatorMPI : public DistanceEstimator { friend DistanceEstimatorTask; private: + const DistanceEstimator& dist_estimator_; + virtual const std::string Name() const { - static const std::string my_name = "SIMPLE_MPI"; + const std::string my_name = dist_estimator_.Name() + "_MPI"; return my_name; } diff --git a/src/common/paired_info/distance_estimation_utils.cpp b/src/common/paired_info/distance_estimation_utils.cpp index ce7d9e065f..651844501d 100644 --- a/src/common/paired_info/distance_estimation_utils.cpp +++ b/src/common/paired_info/distance_estimation_utils.cpp @@ -106,7 +106,7 @@ void EstimateScaffoldingDistances(PairedInfoIndexT &scaffolding_index, PairInfoWeightChecker checker(graph, 0.); DEBUG("Weight Filter Done"); - SmoothingDistanceEstimator estimator(graph, paired_index, dist_finder, + SmoothingDistanceEstimator estimator_base(graph, paired_index, dist_finder, [&] (int i) {return wrapper.CountWeight(i);}, linkage_distance, max_distance, ade.threshold, ade.range_coeff, @@ -114,6 +114,8 @@ void EstimateScaffoldingDistances(PairedInfoIndexT &scaffolding_index, ade.min_peak_points, ade.percentage, 
ade.derivative_threshold); + DistanceEstimatorMPI estimator(graph, paired_index, dist_finder, linkage_distance, max_distance, estimator_base); + EstimateWithEstimator(scaffolding_index, estimator, checker); } @@ -132,7 +134,8 @@ void EstimatePairedDistances(PairedInfoIndexT &clustered_index, INFO("Weight Filter Done"); - DistanceEstimatorMPI estimator(graph, paired_index, dist_finder, linkage_distance, max_distance); + DistanceEstimator estimator_base(graph, paired_index, dist_finder, linkage_distance, max_distance); + DistanceEstimatorMPI estimator(graph, paired_index, dist_finder, linkage_distance, max_distance, estimator_base); EstimateWithEstimator(clustered_index, estimator, checker); diff --git a/src/common/paired_info/paired_info.hpp b/src/common/paired_info/paired_info.hpp index ad94550340..ae829517cb 100644 --- a/src/common/paired_info/paired_info.hpp +++ b/src/common/paired_info/paired_info.hpp @@ -732,7 +732,7 @@ class PairedIndices { using io::binary::BinWrite; BinWrite(str, data_.size()); - for (int i = 0; i < data_.size(); ++i) { + for (size_t i = 0; i < data_.size(); ++i) { data_[i].BinWrite(str); } } @@ -743,7 +743,7 @@ class PairedIndices { VERIFY(size == data_.size()); - for (int i = 0; i < size; ++i) { + for (size_t i = 0; i < size; ++i) { data_[i].BinRead(str); } } diff --git a/src/common/paired_info/smoothing_distance_estimation.hpp b/src/common/paired_info/smoothing_distance_estimation.hpp index ea80e1d9a0..d8ef4bc5c6 100644 --- a/src/common/paired_info/smoothing_distance_estimation.hpp +++ b/src/common/paired_info/smoothing_distance_estimation.hpp @@ -51,6 +51,10 @@ class SmoothingDistanceEstimator : public WeightedDistanceEstimator { virtual ~SmoothingDistanceEstimator() { } + const std::string Name() const override { + return "SMOOTHING"; + } + protected: typedef std::pair EdgePair; typedef std::vector> EstimHist; @@ -97,10 +101,6 @@ class SmoothingDistanceEstimator : public WeightedDistanceEstimator { void ExtendRightDFS(const debruijn_graph::EdgeId &first, debruijn_graph::EdgeId current, TempHistogram &data, int shift, size_t max_shift) const; - const std::string Name() const override { - return "SMOOTHING"; - } - DECL_LOGGER("SmoothingDistanceEstimator") }; diff --git a/src/common/paired_info/weighted_distance_estimation.hpp b/src/common/paired_info/weighted_distance_estimation.hpp index de4844a202..b62cfc8f3d 100644 --- a/src/common/paired_info/weighted_distance_estimation.hpp +++ b/src/common/paired_info/weighted_distance_estimation.hpp @@ -33,6 +33,10 @@ class WeightedDistanceEstimator : public DistanceEstimator { virtual ~WeightedDistanceEstimator() { } + const std::string Name() const override { + return "WEIGHTED"; + } + protected: typedef std::vector> EstimHist; @@ -45,10 +49,6 @@ class WeightedDistanceEstimator : public DistanceEstimator { const InHistogram &histogram, const GraphLengths &raw_forward) const override; - const std::string Name() const override { - return "WEIGHTED"; - } - private: DECL_LOGGER("WeightedDistanceEstimator"); }; From 8ea28cad0c82d28cfd592ce7791cabde0190bab9 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Sun, 7 Mar 2021 18:59:28 +0300 Subject: [PATCH 031/102] MPI GraphCondensing --- .../debruijn_graph_constructor.hpp | 22 +++-- .../debruijn_graph_constructor_mpi.hpp | 98 +++++++++++++++++++ src/common/stages/construction_mpi.cpp | 28 +++++- 3 files changed, 139 insertions(+), 9 deletions(-) create mode 100644 src/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp diff --git 
a/src/common/assembly_graph/construction/debruijn_graph_constructor.hpp b/src/common/assembly_graph/construction/debruijn_graph_constructor.hpp index 2acd725b37..59462bb6bc 100644 --- a/src/common/assembly_graph/construction/debruijn_graph_constructor.hpp +++ b/src/common/assembly_graph/construction/debruijn_graph_constructor.hpp @@ -370,15 +370,14 @@ class UnbranchingPathExtractor { return result; } -public: - UnbranchingPathExtractor(Index &origin, size_t k) - : origin_(origin), kmer_size_(k) {} - //TODO very large vector is returned. But I hate to make all those artificial changes that can fix it. - const std::vector ExtractUnbranchingPaths(unsigned nchunks) const { - auto its = origin_.kmer_begin(nchunks); - + const std::vector ExtractUnbranchingPaths(std::vector &its) const { INFO("Extracting unbranching paths"); + if (its.size() == 0) { + INFO("No input iterators, returning empty vector"); + return {}; + } + std::vector> sequences(its.size()); # pragma omp parallel for schedule(guided) for (size_t i = 0; i < its.size(); ++i) @@ -400,6 +399,14 @@ class UnbranchingPathExtractor { INFO("Extracting unbranching paths finished. " << sequences[0].size() << " sequences extracted"); return sequences[0]; } +public: + UnbranchingPathExtractor(Index &origin, size_t k) + : origin_(origin), kmer_size_(k) {} + + const std::vector ExtractUnbranchingPaths(unsigned nchunks) const { + auto its = origin_.kmer_begin(nchunks); + return ExtractUnbranchingPaths(its); + } const std::vector ExtractUnbranchingPathsAndLoops(unsigned nchunks) { std::vector result = ExtractUnbranchingPaths(nchunks); @@ -410,6 +417,7 @@ class UnbranchingPathExtractor { return result; } + template friend class DeBruijnGraphExtentionConstructorTask; private: DECL_LOGGER("UnbranchingPathExtractor") }; diff --git a/src/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp b/src/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp new file mode 100644 index 0000000000..e5be3c95c2 --- /dev/null +++ b/src/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp @@ -0,0 +1,98 @@ +#pragma once +//*************************************************************************** +//* Copyright (c) 2021 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include "pipeline/partask_mpi.hpp" +#include "io/binary/graph.hpp" +#include "debruijn_graph_constructor.hpp" + +namespace debruijn_graph { +template +class DeBruijnGraphExtentionConstructorTask { + private: + typedef typename Graph::EdgeId EdgeId; + typedef kmers::DeBruijnExtensionIndex<> Index; + typedef typename Graph::VertexId VertexId; + typedef RtSeq Kmer; + + bool collect_loops_; + + public: + DeBruijnGraphExtentionConstructorTask(std::istream &is) { + io::binary::BinRead(is, collect_loops_); + } + + DeBruijnGraphExtentionConstructorTask(bool collect_loops) : collect_loops_{collect_loops} {} + + std::ostream &serialize(std::ostream &os) const { + io::binary::BinWrite(os, collect_loops_); + return os; + } + + template + auto make_splitter(size_t size, Args &&...) 
{ + return partask::make_seq_plus_n_generator(size); + } + + void process(std::istream &is, std::ostream &os, Graph &g, Index &index) { + size_t n = 0; + std::vector chunks = partask::get_seq_plus_n(is, n); + if (!chunks.size()) { + INFO("Empty job, skipping"); + } + + auto iters = index.kmer_begin(n); + + std::vector local_iters; + for (size_t i : chunks) { + if (i < iters.size()) { + local_iters.push_back(std::move(iters[i])); + } + } + UnbranchingPathExtractor extractor(index, g.k()); + auto seqs = extractor.ExtractUnbranchingPaths(local_iters); + extractor.CleanCondensed(seqs); + partask::allreduce(index.raw_data(), index.raw_size(), MPI_BAND); + io::binary::BinWrite(os, partask::fast_local_transfer(seqs)); + } + + void merge(const std::vector &piss, Graph &g, Index &index) { + std::vector seqs; + for (size_t i = 0; i < piss.size(); ++i) { + auto &is = *piss[i]; + if (is.peek() != EOF) { + std::vector local_seqs; + io::binary::BinRead(is, partask::fast_local_transfer(local_seqs)); + seqs.insert(seqs.end(), + std::make_move_iterator(local_seqs.begin()), std::make_move_iterator(local_seqs.end())); + } + } + + if (collect_loops_) { + UnbranchingPathExtractor extractor(index, g.k()); + std::vector loops = extractor.CollectLoops(omp_get_max_threads()); + seqs.insert(seqs.end(), + std::make_move_iterator(loops.begin()), std::make_move_iterator(loops.end())); + } + + INFO("Sorting edges..."); + parallel::sort(seqs.begin(), seqs.end(), Sequence::RawCompare); + INFO("Sorting edges finished"); + + FastGraphFromSequencesConstructor(g.k(), index).ConstructGraph(g, seqs); + } + + void sync(Graph &g, Index &) { + auto serialize = [](std::ostream &os, const Graph &g) { + io::binary::GraphIO().BinWrite(os, g); + }; + auto deserialize = [](std::istream &is, Graph &g) { + io::binary::GraphIO().BinRead(is, g); + }; + partask::broadcast(g, serialize, deserialize); + } +}; +} //namespace debruijn_graph diff --git a/src/common/stages/construction_mpi.cpp b/src/common/stages/construction_mpi.cpp index eed59c7bee..b3a40868bc 100644 --- a/src/common/stages/construction_mpi.cpp +++ b/src/common/stages/construction_mpi.cpp @@ -10,6 +10,7 @@ #include "alignment/edge_index.hpp" #include "assembly_graph/construction/early_simplification.hpp" +#include "assembly_graph/construction/debruijn_graph_constructor_mpi.hpp" #include "io/dataset_support/dataset_readers.hpp" #include "io/dataset_support/read_converter.hpp" #include "io/reads/coverage_filtering_read_wrapper.hpp" @@ -267,8 +268,14 @@ class ExtensionIndexBuilder : public ConstructionMPI::Phase { virtual ~ExtensionIndexBuilder() = default; + bool distributed() const override { return true; } + void run(graph_pack::GraphPack &, const char*) override { // FIXME: We just need files here, not the full counter. Implement refererence counting scheme! 
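+        // The k-mer storage is produced by the (non-distributed) counting phase, so on
+        // worker ranks it may not exist yet: allocate an empty one here and let the
+        // broadcast below replicate the master's copy to every node before the
+        // extension index is built from it.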
+ if (!storage().kmers) + storage().kmers.reset(new kmers::KMerDiskStorage()); + partask::broadcast(*storage().kmers); + kmers::DeBruijnExtensionIndexBuilder().BuildExtensionIndexFromKPOMers(storage().workdir, storage().ext_index, *storage().kmers, @@ -297,6 +304,8 @@ class EarlyTipClipper : public ConstructionMPI::Phase { virtual ~EarlyTipClipper() = default; + bool distributed() const override { return true; } + void run(graph_pack::GraphPack &gp, const char*) override { if (!storage().params.early_tc.length_bound) { INFO("Early tip clipper length bound set as (RL - K)"); @@ -325,6 +334,8 @@ class EarlyATClipper : public ConstructionMPI::Phase { virtual ~EarlyATClipper() = default; + bool distributed() const override { return true; } + void run(graph_pack::GraphPack &, const char*) override { EarlyLowComplexityClipperProcessor at_processor(storage().ext_index, 0.8, 10, 200); at_processor.RemoveATEdges(); @@ -347,15 +358,28 @@ class EarlyATClipper : public ConstructionMPI::Phase { class GraphCondenser : public ConstructionMPI::Phase { public: GraphCondenser() - : ConstructionMPI::Phase("Condensing graph", "graph_condensing") { } + : ConstructionMPI::Phase("Condensing graph (MPI)", "graph_condensing_mpi") { } virtual ~GraphCondenser() = default; + bool distributed() const override { return true; } + void run(graph_pack::GraphPack &gp, const char*) override { auto &index = gp.get_mutable>(); if (index.IsAttached()) index.Detach(); - DeBruijnGraphExtentionConstructor(gp.get_mutable(), storage().ext_index).ConstructGraph(storage().params.keep_perfect_loops); + + partask::TaskRegistry treg; + using GraphT = std::decay_t())>; + auto condence = treg.add>(std::ref(gp.get_mutable()), std::ref(storage().ext_index)); + treg.listen(); + if (partask::master()) { + condence(storage().params.keep_perfect_loops); + } + treg.stop_listening(); + INFO("Graph synced, edges " << gp.get().e_size() << ", vertices " << gp.get().size()); + VERIFY(partask::all_equal(gp.get().e_size())); + VERIFY(partask::all_equal(gp.get().size())); } void load(graph_pack::GraphPack&, From abf09c4541af176deb6be38019549111f4c49760 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Fri, 14 May 2021 14:19:06 +0300 Subject: [PATCH 032/102] remove Seq from extansion index --- .../debruijn_graph_constructor.hpp | 24 +++---------------- .../debruijn_graph_constructor_mpi.hpp | 3 ++- .../extension_index/kmer_extension_index.hpp | 24 ++++++++++++++++++- 3 files changed, 28 insertions(+), 23 deletions(-) diff --git a/src/common/assembly_graph/construction/debruijn_graph_constructor.hpp b/src/common/assembly_graph/construction/debruijn_graph_constructor.hpp index 59462bb6bc..efe5370372 100644 --- a/src/common/assembly_graph/construction/debruijn_graph_constructor.hpp +++ b/src/common/assembly_graph/construction/debruijn_graph_constructor.hpp @@ -312,24 +312,6 @@ class UnbranchingPathExtractor { } } - void CleanCondensed(const Sequence &sequence) { - Kmer kmer = sequence.start(kmer_size_); - KeyWithHash kwh = origin_.ConstructKWH(kmer); - origin_.IsolateVertex(kwh); - for (size_t pos = kmer_size_; pos < sequence.size(); pos++) { - kwh = kwh << sequence[pos]; - origin_.IsolateVertex(kwh); - } - } - - void CleanCondensed(const std::vector &sequences) { -# pragma omp parallel for schedule(guided) - for (size_t i = 0; i < sequences.size(); ++i) { - CleanCondensed(sequences[i]); - CleanCondensed(!sequences[i]); - } - } - // This methods collects all loops that were not extracted by finding // unbranching paths because there are no junctions on 
loops. const std::vector CollectLoops(unsigned nchunks) { @@ -361,8 +343,8 @@ class UnbranchingPathExtractor { else result.push_back(s); - CleanCondensed(s); - CleanCondensed(s_rc); + origin_.removeSequence(s); + origin_.removeSequence(s_rc); } } } @@ -410,7 +392,7 @@ class UnbranchingPathExtractor { const std::vector ExtractUnbranchingPathsAndLoops(unsigned nchunks) { std::vector result = ExtractUnbranchingPaths(nchunks); - CleanCondensed(result); + origin_.removeSequences(result); std::vector loops = CollectLoops(nchunks); result.insert(result.end(), std::make_move_iterator(loops.begin()), std::make_move_iterator(loops.end())); diff --git a/src/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp b/src/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp index e5be3c95c2..8483c55821 100644 --- a/src/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp +++ b/src/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp @@ -52,9 +52,10 @@ class DeBruijnGraphExtentionConstructorTask { local_iters.push_back(std::move(iters[i])); } } + UnbranchingPathExtractor extractor(index, g.k()); auto seqs = extractor.ExtractUnbranchingPaths(local_iters); - extractor.CleanCondensed(seqs); + index.removeSequences(seqs); partask::allreduce(index.raw_data(), index.raw_size(), MPI_BAND); io::binary::BinWrite(os, partask::fast_local_transfer(seqs)); } diff --git a/src/common/kmer_index/extension_index/kmer_extension_index.hpp b/src/common/kmer_index/extension_index/kmer_extension_index.hpp index 69f98655ea..f2fc19019d 100644 --- a/src/common/kmer_index/extension_index/kmer_extension_index.hpp +++ b/src/common/kmer_index/extension_index/kmer_extension_index.hpp @@ -15,6 +15,7 @@ #include "kmer_index/ph_map/storing_traits.hpp" #include "sequence/rtseq.hpp" +#include "sequence/sequence.hpp" #include "utils/stl_utils.hpp" #include @@ -72,9 +73,12 @@ class DeBruijnExtensionIndex : public KeyIteratingMap DeEdge; using base::ConstructKWH; + unsigned k_size_; DeBruijnExtensionIndex(unsigned K) - : base(K) {} + : base(K) { + k_size_ = K; + } using PerfectHashMap::raw_data; using PerfectHashMap::raw_size; @@ -124,6 +128,24 @@ class DeBruijnExtensionIndex : public KeyIteratingMapget_raw_value_reference(kwh).IsolateVertex(); } + void removeSequence(const Sequence &sequence) { + RtSeq kmer = sequence.start(k_size_); + KeyWithHash kwh = ConstructKWH(kmer); + IsolateVertex(kwh); + for (size_t pos = k_size_; pos < sequence.size(); pos++) { + kwh = kwh << sequence[pos]; + IsolateVertex(kwh); + } + } + + void removeSequences(const std::vector &sequences) { +# pragma omp parallel for schedule(guided) + for (size_t i = 0; i < sequences.size(); ++i) { + removeSequence(sequences[i]); + removeSequence(!sequences[i]); + } + } + bool CheckOutgoing(const KeyWithHash &kwh, char nucl) const { return this->get_value(kwh).CheckOutgoing(nucl); } From cc1577f1caccb42829aff8254b8e23c85fe82f66 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Fri, 14 May 2021 14:26:21 +0300 Subject: [PATCH 033/102] remove friend from UnbranchingPathExtractor --- .../debruijn_graph_constructor.hpp | 78 +++++++++---------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/src/common/assembly_graph/construction/debruijn_graph_constructor.hpp b/src/common/assembly_graph/construction/debruijn_graph_constructor.hpp index efe5370372..124cd917aa 100644 --- a/src/common/assembly_graph/construction/debruijn_graph_constructor.hpp +++ 
b/src/common/assembly_graph/construction/debruijn_graph_constructor.hpp @@ -312,6 +312,45 @@ class UnbranchingPathExtractor { } } +public: + UnbranchingPathExtractor(Index &origin, size_t k) + : origin_(origin), kmer_size_(k) {} + + //TODO very large vector is returned. But I hate to make all those artificial changes that can fix it. + const std::vector ExtractUnbranchingPaths(std::vector &its) const { + INFO("Extracting unbranching paths"); + if (its.size() == 0) { + INFO("No input iterators, returning empty vector"); + return {}; + } + + std::vector> sequences(its.size()); +# pragma omp parallel for schedule(guided) + for (size_t i = 0; i < its.size(); ++i) + CalculateSequences(its[i], sequences[i]); + + size_t snum = std::accumulate(sequences.begin(), sequences.end(), + 0ULL, + [](size_t val, const std::vector &s) { + return val + s.size(); + }); + sequences[0].reserve(snum); + for (size_t i = 1; i < sequences.size(); ++i) { + sequences[0].insert(sequences[0].end(), + std::make_move_iterator(sequences[i].begin()), std::make_move_iterator(sequences[i].end())); + sequences[i].clear(); + sequences[i].shrink_to_fit(); + } + + INFO("Extracting unbranching paths finished. " << sequences[0].size() << " sequences extracted"); + return sequences[0]; + } + + const std::vector ExtractUnbranchingPaths(unsigned nchunks) const { + auto its = origin_.kmer_begin(nchunks); + return ExtractUnbranchingPaths(its); + } + // This methods collects all loops that were not extracted by finding // unbranching paths because there are no junctions on loops. const std::vector CollectLoops(unsigned nchunks) { @@ -352,44 +391,6 @@ class UnbranchingPathExtractor { return result; } - //TODO very large vector is returned. But I hate to make all those artificial changes that can fix it. - const std::vector ExtractUnbranchingPaths(std::vector &its) const { - INFO("Extracting unbranching paths"); - if (its.size() == 0) { - INFO("No input iterators, returning empty vector"); - return {}; - } - - std::vector> sequences(its.size()); -# pragma omp parallel for schedule(guided) - for (size_t i = 0; i < its.size(); ++i) - CalculateSequences(its[i], sequences[i]); - - size_t snum = std::accumulate(sequences.begin(), sequences.end(), - 0ULL, - [](size_t val, const std::vector &s) { - return val + s.size(); - }); - sequences[0].reserve(snum); - for (size_t i = 1; i < sequences.size(); ++i) { - sequences[0].insert(sequences[0].end(), - std::make_move_iterator(sequences[i].begin()), std::make_move_iterator(sequences[i].end())); - sequences[i].clear(); - sequences[i].shrink_to_fit(); - } - - INFO("Extracting unbranching paths finished. 
" << sequences[0].size() << " sequences extracted"); - return sequences[0]; - } -public: - UnbranchingPathExtractor(Index &origin, size_t k) - : origin_(origin), kmer_size_(k) {} - - const std::vector ExtractUnbranchingPaths(unsigned nchunks) const { - auto its = origin_.kmer_begin(nchunks); - return ExtractUnbranchingPaths(its); - } - const std::vector ExtractUnbranchingPathsAndLoops(unsigned nchunks) { std::vector result = ExtractUnbranchingPaths(nchunks); origin_.removeSequences(result); @@ -399,7 +400,6 @@ class UnbranchingPathExtractor { return result; } - template friend class DeBruijnGraphExtentionConstructorTask; private: DECL_LOGGER("UnbranchingPathExtractor") }; From 2daac3f0bd926560f42036612c005fd6877b2790 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Wed, 21 Jul 2021 17:31:54 +0300 Subject: [PATCH 034/102] style fix --- .../construction/debruijn_graph_constructor.hpp | 8 ++++---- .../construction/debruijn_graph_constructor_mpi.hpp | 3 ++- .../kmer_index/extension_index/kmer_extension_index.hpp | 8 ++++---- src/common/stages/construction_mpi.cpp | 4 ++-- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/common/assembly_graph/construction/debruijn_graph_constructor.hpp b/src/common/assembly_graph/construction/debruijn_graph_constructor.hpp index 124cd917aa..8cc868929a 100644 --- a/src/common/assembly_graph/construction/debruijn_graph_constructor.hpp +++ b/src/common/assembly_graph/construction/debruijn_graph_constructor.hpp @@ -346,7 +346,7 @@ class UnbranchingPathExtractor { return sequences[0]; } - const std::vector ExtractUnbranchingPaths(unsigned nchunks) const { + const std::vector ExtractUnbranchingPaths(unsigned nchunks = 1) const { auto its = origin_.kmer_begin(nchunks); return ExtractUnbranchingPaths(its); } @@ -382,8 +382,8 @@ class UnbranchingPathExtractor { else result.push_back(s); - origin_.removeSequence(s); - origin_.removeSequence(s_rc); + origin_.RemoveSequence(s); + origin_.RemoveSequence(s_rc); } } } @@ -393,7 +393,7 @@ class UnbranchingPathExtractor { const std::vector ExtractUnbranchingPathsAndLoops(unsigned nchunks) { std::vector result = ExtractUnbranchingPaths(nchunks); - origin_.removeSequences(result); + origin_.RemoveSequences(result); std::vector loops = CollectLoops(nchunks); result.insert(result.end(), std::make_move_iterator(loops.begin()), std::make_move_iterator(loops.end())); diff --git a/src/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp b/src/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp index 8483c55821..6fa6372670 100644 --- a/src/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp +++ b/src/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp @@ -55,7 +55,7 @@ class DeBruijnGraphExtentionConstructorTask { UnbranchingPathExtractor extractor(index, g.k()); auto seqs = extractor.ExtractUnbranchingPaths(local_iters); - index.removeSequences(seqs); + index.RemoveSequences(seqs); partask::allreduce(index.raw_data(), index.raw_size(), MPI_BAND); io::binary::BinWrite(os, partask::fast_local_transfer(seqs)); } @@ -73,6 +73,7 @@ class DeBruijnGraphExtentionConstructorTask { } if (collect_loops_) { + INFO("Collecting perfect loops"); UnbranchingPathExtractor extractor(index, g.k()); std::vector loops = extractor.CollectLoops(omp_get_max_threads()); seqs.insert(seqs.end(), diff --git a/src/common/kmer_index/extension_index/kmer_extension_index.hpp b/src/common/kmer_index/extension_index/kmer_extension_index.hpp index f2fc19019d..ffed00ae1a 100644 
--- a/src/common/kmer_index/extension_index/kmer_extension_index.hpp +++ b/src/common/kmer_index/extension_index/kmer_extension_index.hpp @@ -128,7 +128,7 @@ class DeBruijnExtensionIndex : public KeyIteratingMapget_raw_value_reference(kwh).IsolateVertex(); } - void removeSequence(const Sequence &sequence) { + void RemoveSequence(const Sequence &sequence) { RtSeq kmer = sequence.start(k_size_); KeyWithHash kwh = ConstructKWH(kmer); IsolateVertex(kwh); @@ -138,11 +138,11 @@ class DeBruijnExtensionIndex : public KeyIteratingMap &sequences) { + void RemoveSequences(const std::vector &sequences) { # pragma omp parallel for schedule(guided) for (size_t i = 0; i < sequences.size(); ++i) { - removeSequence(sequences[i]); - removeSequence(!sequences[i]); + RemoveSequence(sequences[i]); + RemoveSequence(!sequences[i]); } } diff --git a/src/common/stages/construction_mpi.cpp b/src/common/stages/construction_mpi.cpp index b3a40868bc..9e678f3484 100644 --- a/src/common/stages/construction_mpi.cpp +++ b/src/common/stages/construction_mpi.cpp @@ -371,10 +371,10 @@ class GraphCondenser : public ConstructionMPI::Phase { partask::TaskRegistry treg; using GraphT = std::decay_t())>; - auto condence = treg.add>(std::ref(gp.get_mutable()), std::ref(storage().ext_index)); + auto condense = treg.add>(std::ref(gp.get_mutable()), std::ref(storage().ext_index)); treg.listen(); if (partask::master()) { - condence(storage().params.keep_perfect_loops); + condense(storage().params.keep_perfect_loops); } treg.stop_listening(); INFO("Graph synced, edges " << gp.get().e_size() << ", vertices " << gp.get().size()); From 441b1897f23b51e8418e321c8a496c48f5466ae7 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Sat, 12 Jun 2021 22:21:04 +0300 Subject: [PATCH 035/102] MPI EarlyTipClipper --- src/common/stages/construction_mpi.cpp | 75 ++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 4 deletions(-) diff --git a/src/common/stages/construction_mpi.cpp b/src/common/stages/construction_mpi.cpp index 9e678f3484..3585467396 100644 --- a/src/common/stages/construction_mpi.cpp +++ b/src/common/stages/construction_mpi.cpp @@ -297,21 +297,88 @@ class ExtensionIndexBuilder : public ConstructionMPI::Phase { }; +template +class TipClippingTask { + TipClippingTask() = default; + public: + TipClippingTask(size_t length_bound) : length_bound_{length_bound} {} + TipClippingTask(std::istream &is) { deserialize(is); } + std::ostream &serialize(std::ostream &os) const { + io::binary::BinWrite(os, length_bound_); + return os; + } + + std::istream &deserialize(std::istream &is) { + io::binary::BinRead(is, length_bound_); + return is; + } + + auto make_splitter(size_t size, Index &) { + return partask::make_seq_plus_n_generator(size); + } + + void process(std::istream &is, std::ostream &os, Index &index) { + size_t n = 0; + std::vector chunks = partask::get_seq_plus_n(is, n); + + INFO("Job got, " << chunks.size() << "/" << n << "chunks"); + auto iters = index.kmer_begin(n); + std::vector local_iters; + for (size_t i : chunks) { + if (i < iters.size()) { + local_iters.push_back(std::move(iters[i])); + } + } + size_t kpo_mers_removed = EarlyTipClipperProcessor(index, length_bound_).ClipTips(local_iters); // TODO support empty input + + INFO("K+1-mers removed: " << kpo_mers_removed); + partask::allreduce(index.raw_data(), index.raw_size(), MPI_BAND); + io::binary::BinWrite(os, kpo_mers_removed); + } + + size_t merge(const std::vector &piss, Index&) { + size_t kpo_mers_removed = 0; + for (auto &pis : piss) { + kpo_mers_removed += 
io::binary::BinRead(*pis); + } + return kpo_mers_removed; + } + + private: + size_t length_bound_; +}; + + class EarlyTipClipper : public ConstructionMPI::Phase { public: EarlyTipClipper() - : ConstructionMPI::Phase("Early tip clipping", "early_tip_clipper") { } + : ConstructionMPI::Phase("Early tip clipping (MPI)", "early_tip_clipper_mpi") { } virtual ~EarlyTipClipper() = default; bool distributed() const override { return true; } void run(graph_pack::GraphPack &gp, const char*) override { - if (!storage().params.early_tc.length_bound) { - INFO("Early tip clipper length bound set as (RL - K)"); + partask::TaskRegistry treg; + auto &index = storage().ext_index; + using Index = std::remove_reference_t; + VERIFY(partask::all_equal(index.size())); + + auto clip_tips = treg.add>(std::ref(index)); + treg.listen(); + + if (partask::master()) { + if (!storage().params.early_tc.length_bound) { + INFO("Early tip clipper length bound set as (RL - K)"); storage().params.early_tc.length_bound = cfg::get().ds.RL - gp.k(); + } + + size_t length_bound = *storage().params.early_tc.length_bound; + size_t kpo_mers_removed = clip_tips(length_bound); + INFO(kpo_mers_removed << " short edges (" << (index.k() + 1) << "-mers) were removed by early tip clipper"); } - EarlyTipClipperProcessor(storage().ext_index, *storage().params.early_tc.length_bound).ClipTips(); + + treg.stop_listening(); } void load(graph_pack::GraphPack&, From 94ae6f7e8f2152e5f9327760896efc0edc509d12 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Thu, 22 Jul 2021 10:49:39 +0300 Subject: [PATCH 036/102] MPI Build Extension Index --- src/common/assembly_graph/CMakeLists.txt | 4 + .../kmer_extension_index_builder_mpi.hpp | 206 ++++++++++++++ src/common/kmer_index/kmer_mph/kmer_index.hpp | 4 + .../kmer_mph/kmer_index_builder.hpp | 15 + .../kmer_mph/kmer_index_builder_mpi.hpp | 259 ++++++++++++++++++ .../ph_map/perfect_hash_map_builder.hpp | 59 +++- src/common/stages/construction_mpi.cpp | 11 +- 7 files changed, 552 insertions(+), 6 deletions(-) create mode 100644 src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp create mode 100644 src/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp diff --git a/src/common/assembly_graph/CMakeLists.txt b/src/common/assembly_graph/CMakeLists.txt index 09dcbd7ce9..717e2c792c 100644 --- a/src/common/assembly_graph/CMakeLists.txt +++ b/src/common/assembly_graph/CMakeLists.txt @@ -18,3 +18,7 @@ add_library(assembly_graph STATIC ../alignment/edge_index_refiller.cpp) target_link_libraries(assembly_graph utils llvm-support) + +if (MPI_FOUND) + target_link_libraries(assembly_graph ${MPI_LIBRARIES}) +endif() diff --git a/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp b/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp new file mode 100644 index 0000000000..f99149147d --- /dev/null +++ b/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp @@ -0,0 +1,206 @@ +//*************************************************************************** +//* Copyright (c) 2021 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. 
+//*************************************************************************** + +#pragma once + +#include "kmer_extension_index.hpp" +#include "kmer_extension_index_builder.hpp" + +#include "kmer_index/kmer_mph/kmer_index_builder.hpp" +#include "kmer_index/kmer_mph/kmer_index_builder_mpi.hpp" +#include "kmer_index/kmer_mph/kmer_splitters.hpp" +#include "kmer_index/kmer_counting.hpp" +#include "kmer_index/ph_map/perfect_hash_map_builder.hpp" +#include "io/reads/multifile_reader.hpp" + +#include "pipeline/partask_mpi.hpp" + +namespace kmers { +class DeBruijnExtensionIndexBuilderMPI : public DeBruijnExtensionIndexBuilder { + public: + template + void BuildExtensionIndexFromStream(fs::TmpDir workdir, Index &index, + Streams &streams, + size_t read_buffer_size = 0) const { + unsigned nthreads = omp_get_max_threads(); + using KmerFilter = StoringTypeFilter; + + // First, build a k+1-mer index + DeBruijnReadKMerSplitter + splitter(workdir, index.k() + 1, 0xDEADBEEF, streams, read_buffer_size); + kmers::KMerDiskCounter counter(workdir, splitter); + auto storage = counter.CountAll(nthreads, /* merge */ false); + + BuildExtensionIndexFromKPOMers(workdir, index, storage, + nthreads, read_buffer_size); + } + + template + void BuildExtensionIndexFromKPOMersMPI(fs::TmpDir workdir, + Index &index, KMerStorage &kmerstorage, + unsigned nbuckets, size_t read_buffer_size = 0) const; + + private: + DECL_LOGGER("DeBruijnExtensionIndexBuilderMPI"); +}; + +template +class FillIndexTask { + public: + FillIndexTask() = default; + + FillIndexTask(kmers::KMerDiskStorage &storage) : storage_{storage} {}; + FillIndexTask(std::istream &is) { + storage_.BinRead(is); + } + + std::ostream &serialize(std::ostream &os) const { + io::binary::BinWrite(os, storage_); + return os; + } + + auto make_splitter(size_t, Index &) { + return partask::make_seq_generator(storage_.num_buckets()); + } + + void process(std::istream &is, std::ostream & /*os*/, Index &index) { + auto file_ids = partask::get_seq(is); +# pragma omp parallel for + for (size_t i = 0; i < file_ids.size(); ++i) { + size_t idx = file_ids[i]; + builder_.FillExtensionsFromIndex(storage_.bucket_begin(idx), storage_.bucket_end(idx), index); + } + + // Send nothing + } + + void sync(Index &index) { + INFO("FillIndexTask::sync started"); + partask::allreduce(index.raw_data(), index.raw_size(), MPI_BOR); + INFO("FillIndexTask::sync finished"); + } + + private: + kmers::KMerDiskStorage storage_; + DeBruijnExtensionIndexBuilder builder_; +}; + +template +class SplitKPOMersTask { + typedef typename Index::traits_t::SeqType Seq; + SplitKPOMersTask() = default; + public: + SplitKPOMersTask(kmers::KMerDiskStorage &storage, + unsigned k, + unsigned nthreads, + size_t read_buffer_size, + const std::string &dir) + : storage_{storage}, k_{k}, nthreads_{nthreads}, read_buffer_size_{read_buffer_size}, dir_{dir} {}; + SplitKPOMersTask(std::istream &is) { + io::binary::BinRead(is, storage_, k_, nthreads_, read_buffer_size_, dir_); + + } + + std::ostream &serialize(std::ostream &os) const { + io::binary::BinWrite(os, storage_, k_, nthreads_, read_buffer_size_, dir_); + return os; + } + + auto make_splitter(size_t) { + return partask::make_seq_generator(storage_.num_buckets()); + } + + void process(std::istream &is, std::ostream &os) { + auto workdir = fs::tmp::acquire_temp_dir(dir_); + workdir->release(); + DeBruijnKMerKMerSplitter, + typename kmers::KMerDiskStorage::kmer_iterator> splitter( + workdir, k_, k_ + 1, Index::storing_type::IsInvertable(), read_buffer_size_); + + policy_ = 
splitter.bucket_policy(); + auto file_ids = partask::get_seq(is); + for (size_t i : file_ids) { + splitter.AddKMers(storage_.bucket(i)); + } + + kmers::KMerDiskCounter counter2(workdir, splitter); + auto storage2 = counter2.CountAll(storage_.num_buckets(), nthreads_,/* merge */ false); + storage2.BinWrite(os); + storage2.release_all(); + } + + auto merge(const std::vector &piss) { + auto workdir = fs::tmp::acquire_temp_dir(dir_); + workdir->release(); + std::vector> storages; + for (size_t i = 0; i < piss.size(); ++i) { + auto &is = *piss[i]; + kmers::KMerDiskStorage kmerstorage(workdir, k_, policy_); + kmerstorage.BinRead(is); + storages.push_back(std::move(kmerstorage)); + } + + return storages; + } + + private: + kmers::KMerDiskStorage storage_; + typename kmers::KMerDiskStorage::KMerSegmentPolicy policy_; + unsigned k_; + unsigned nthreads_; + size_t read_buffer_size_; + std::string dir_; +}; + +template +inline void DeBruijnExtensionIndexBuilderMPI::BuildExtensionIndexFromKPOMersMPI(fs::TmpDir workdir, + Index &index, + KMerStorage &kmerstorage, + unsigned nthreads, + size_t read_buffer_size) const { + typedef typename Index::traits_t::SeqType Seq; + VERIFY(kmerstorage.k() == index.k() + 1); + + KMerStorage kmerfiles2(workdir, index.k(), kmerstorage.segment_policy()); + + INFO("DeBruijnExtensionIndexBuilder started nthreads = " << nthreads); + partask::TaskRegistry treg; + auto merge_kmer_files = treg.add>(); + auto split_kpo_mers = treg.add>(); + treg.listen(); + DEBUG("Listening started"); + + if (partask::master()) { + std::vector outputfiles; + DEBUG("Split_kpo_mers started"); + auto unmerged_kmerfiles2 = split_kpo_mers(kmerstorage, index.k(), nthreads, read_buffer_size, workdir->dir()); + DEBUG("Split_kpo_mers finished"); + + for (unsigned i = 0; i < kmerfiles2.num_buckets(); ++i) { + outputfiles.push_back(kmerfiles2.create(i)->file()); + } + + merge_kmer_files(std::move(unmerged_kmerfiles2), outputfiles, index.k(), workdir->dir()); + DEBUG("Merge_kmer_files finished"); + } + treg.stop_listening(); + + partask::broadcast(kmerfiles2); + INFO("Total kmers=" << kmerfiles2.total_kmers()); + BuildIndexMPI(index, kmerfiles2, /* save_final */ true); + + auto fill_index = treg.add>(std::ref(index)); + treg.listen(); + DEBUG("Listening started"); + + if (partask::master()) { + fill_index(kmerstorage); + } + treg.stop_listening(); + + INFO("Building k-mer extensions from k+1-mers finished."); +} +} //utils diff --git a/src/common/kmer_index/kmer_mph/kmer_index.hpp b/src/common/kmer_index/kmer_mph/kmer_index.hpp index 6678382c89..6b8e37f3ca 100644 --- a/src/common/kmer_index/kmer_mph/kmer_index.hpp +++ b/src/common/kmer_index/kmer_mph/kmer_index.hpp @@ -22,6 +22,9 @@ namespace kmers { template class KMerIndexBuilder; +template +class KMerIndexBuilderMPI; + template class KMerIndex { public: @@ -144,5 +147,6 @@ class KMerIndex { } friend class KMerIndexBuilder<__self>; + friend class KMerIndexBuilderMPI<__self>; }; } diff --git a/src/common/kmer_index/kmer_mph/kmer_index_builder.hpp b/src/common/kmer_index/kmer_mph/kmer_index_builder.hpp index 09b20a1e6f..ab7a88ef9e 100644 --- a/src/common/kmer_index/kmer_mph/kmer_index_builder.hpp +++ b/src/common/kmer_index/kmer_mph/kmer_index_builder.hpp @@ -104,6 +104,7 @@ class KMerDiskStorage { } KMerDiskStorage(KMerDiskStorage &&) = default; + KMerDiskStorage(const KMerDiskStorage &) = default; KMerDiskStorage &operator=(KMerDiskStorage &&) = default; fs::DependentTmpFile create() { @@ -140,6 +141,16 @@ class KMerDiskStorage { return fsize / 
(Seq::GetDataSize(k_) * sizeof(typename Seq::DataType)); } + void release_all() { + if (all_kmers_) { + all_kmers_->release(); + } else { + for (auto &file : buckets_) { + file->release(); + } + } + } + fs::TmpFile final_kmers() { VERIFY_MSG(all_kmers_, "k-mers were not merged yet"); return all_kmers_; @@ -161,6 +172,10 @@ class KMerDiskStorage { return adt::make_range(bucket_begin(i), bucket_end(i)); } + auto bucket_file(size_t i) const { + return buckets_[i]; + } + size_t num_buckets() const { return buckets_.size(); } KMerSegmentPolicy segment_policy() const { return segment_policy_; } diff --git a/src/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp b/src/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp new file mode 100644 index 0000000000..9f35070840 --- /dev/null +++ b/src/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp @@ -0,0 +1,259 @@ +#pragma once +//*************************************************************************** +//* Copyright (c) 2021 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include "kmer_index_builder.hpp" +#include "kmer_buckets.hpp" + +#include "pipeline/partask_mpi.hpp" + +namespace kmers { +template +class KMerIndexBuilderMPI { + typedef typename Index::KMerSeq Seq; + typedef typename Index::kmer_index_traits kmer_index_traits; + + public: + KMerIndexBuilderMPI() = default; + size_t BuildIndexMPI(Index &out, KMerDiskStorage& kmerstorage, bool save_final = false); + + private: + DECL_LOGGER("K-mer Index Building MPI"); + + class BuildKMerIndexTask { + BuildKMerIndexTask() = default; + public: + BuildKMerIndexTask(KMerDiskStorage kmerstorage, unsigned k) : storage_{std::move(kmerstorage)}, k_{k} {}; + BuildKMerIndexTask(std::istream &is) { deserialize(is); } + + std::ostream &serialize(std::ostream &os) const { + storage_.BinWrite(os); + return os; + } + + std::istream &deserialize(std::istream &is) { + storage_.BinRead(is); + return is; + } + + auto make_splitter(size_t, Index &) { + return partask::make_seq_generator(storage_.num_buckets()); + } + + void process(std::istream &is, std::ostream &os, Index &) { + std::vector residuals; + while (is.get() && is) { + size_t i; + io::binary::BinRead(is, i); + DEBUG("BuildKMerIndexTask: process, i = " << i); + residuals.push_back(i); + } + + size_t num_buckets = storage_.num_buckets(); + + std::vector indices(num_buckets); + std::vector sizes(num_buckets); + DEBUG("NumBuckets: " << num_buckets); +#pragma omp parallel for + for (size_t ii = 0; ii < residuals.size(); ++ii) { + size_t i = residuals[ii]; + const auto &bucket_range = storage_.bucket(i); + sizes[i] = storage_.bucket_size(i); + DEBUG("Bucket size: " << sizes[i]); + indices[i] = new typename Index::KMerDataIndex(sizes[i], Index::KMerDataIndex::ConflictPolicy::Ignore, /* gamma */ 4.0); + indices[i]->build(boomphf::range(storage_.bucket_begin(i), storage_.bucket_end(i))); + DEBUG("Index created"); + } + + for (size_t i = 0; i < indices.size(); ++i) { + if (!indices[i]) continue; + DEBUG("Sending index " << i); + os.put('\1'); + io::binary::BinWrite(os, i, sizes[i]); + if (partask::master()) { + io::binary::BinWrite(os, indices[i]); // Pass the pointer + } else { + indices[i]->save(os); + delete indices[i]; + } + } + os.put('\0'); + } + + void merge(const std::vector &piss, Index &index) { + INFO("Index merge started"); + index.clear(); + + auto segment_policy = storage_.segment_policy(); + size_t segments = 
segment_policy.num_segments(); + index.segment_starts_.resize(segments + 1, 0); + index.index_.resize(storage_.num_buckets(), + typename Index::KMerDataIndex(0, Index::KMerDataIndex::ConflictPolicy::Ignore, /* gamma */ 4.0)); + index.segment_policy_ = segment_policy; + index.num_segments_ = segments; + + DEBUG("Index initialized with empty subindices"); + + for (size_t node = 0; node < piss.size(); ++node) { + DEBUG("Collecting stream " << node); + std::istream &is = *piss[node]; + while (static_cast(is.get())) { + size_t i, size; + io::binary::BinRead(is, i, size); + DEBUG("Load index " << i << " " << size << " from stream " << node); + if (node == 0) { + typename KMerIndex::KMerDataIndex *p; + io::binary::BinRead(is, p); + std::swap(index.index_[i], *p); + delete p; + } else { + index.index_[i].load(is); + } + DEBUG("Index loaded " << i << " " << size); + index.segment_starts_[i + 1] = size; + } + } + + // Finally, record the sizes of buckets. + for (size_t i = 1; i <= segments; ++i) { + index.segment_starts_[i] += index.segment_starts_[i - 1]; + } + + INFO("Index merge done"); + } + + void sync(Index &index) { + INFO("Index broadcasting started"); + partask::broadcast(index, + [](std::ostream &os, const auto &index) { return index.serialize(os); }, + [](std::istream &is, auto &index) { return index.deserialize(is); }); + INFO("Index broadcasting done"); + } + + private: + KMerDiskStorage storage_; + unsigned k_; + }; +}; + +template +size_t KMerIndexBuilderMPI::BuildIndexMPI(Index &index, KMerDiskStorage& storage, bool save_final) { + index.clear(); + + INFO("Building kmer index "); + + size_t buckets = storage.num_buckets(); + unsigned k = storage.k(); + DEBUG("K: " << k); + partask::TaskRegistry treg; + auto build_index = treg.add(std::ref(index)); + + treg.listen(); + if (partask::master()) { + build_index(storage, k); + } + treg.stop_listening(); + + size_t kmers = index.segment_starts_[buckets]; + double bits_per_kmer = 8.0 * (double) index.mem_size() / (double) kmers; + INFO("Index built. 
Total " << kmers << " k-mers, " << index.mem_size() << " bytes occupied (" << bits_per_kmer + << " bits per kmer), " << buckets << " buckets"); + + if (partask::master() && save_final) { + storage.merge(); + } + + return kmers; +} + +// Merge KMerDiskStorage from a few nodes +template +class MergeKMerFilesTask { + MergeKMerFilesTask() = default; + public: + MergeKMerFilesTask(std::vector> storages, std::vector& ofiles, unsigned k, const std::string &dir) : storages_{std::move(storages)}, ofiles_{ofiles}, k_{k}, dir_{dir} {}; + MergeKMerFilesTask(std::istream &is) { + io::binary::BinRead(is, storages_, ofiles_, dir_, k_); + } + + void serialize(std::ostream &os) const { + io::binary::BinWrite(os, storages_, ofiles_, dir_, k_); + } + + auto make_splitter(size_t) { + return partask::make_seq_generator(storages_[0].num_buckets()); + } + + void process(std::istream &is, std::ostream &) { + std::vector residuals; + while (is.get() && is) { + size_t i; + io::binary::BinRead(is, i); + DEBUG("MergeKMerFilesTask: process, i = " << i); + residuals.push_back(i); + } + + /*size_t num_open_files = omp_get_max_threads() * (2 * partask::world_size()); + INFO("Setting open file limit to " << num_open_files); + utils::limit_file(num_open_files);*/ + + int totalsum = 0; +#pragma omp parallel for + for (size_t idx = 0; idx < residuals.size(); ++idx) { + size_t i = residuals[idx]; // TODO rename var i -> residual + + MMappedRecordArrayWriter os(ofiles_[i], Seq::GetDataSize(storages_[0].k())); + auto elcnt = Seq::GetDataSize(storages_[0].k()); + std::vector> ins; + std::vector strs(storages_.size(), 0); + + bool notEmpty = true; + size_t prevId = -1ULL; + unsigned sumsize = 0; + for (size_t sid = 0; sid < storages_.size(); ++sid) { + ins.push_back(MMappedRecordArrayReader(*storages_[sid].bucket_file(i), Seq::GetDataSize(storages_[0].k()), /* unlink */ false)); + sumsize += ins.back().size(); + } + + int total = 0; + while (notEmpty) { + size_t bstpos = -1ULL; + for (size_t sid = 0; sid < storages_.size(); ++sid) { + if (ins[sid].size() == strs[sid]) { + continue; + } + if (bstpos == -1ULL || adt::array_less()(*(ins[sid].begin() + strs[sid]), + *(ins[bstpos].begin() + strs[bstpos]))) { + bstpos = sid; + } + } + if (bstpos != -1ULL) { + if (prevId == -1ULL || adt::array_less()(*(ins[prevId].begin() + (strs[prevId] - 1)), *(ins[bstpos].begin() + (strs[bstpos])))) { + os.resize(1); + os.write(ins[bstpos].data() + strs[bstpos] * elcnt, 1); + total += 1; + } + prevId = bstpos; + strs[bstpos] += 1; + } else { + notEmpty = false; + } + } + totalsum += total; + } + INFO("Total kmers writen" << totalsum); + + } + + void merge(const std::vector &) {} + + private: + std::vector> storages_; + std::vector ofiles_; + unsigned k_; + std::string dir_; +}; +} diff --git a/src/common/kmer_index/ph_map/perfect_hash_map_builder.hpp b/src/common/kmer_index/ph_map/perfect_hash_map_builder.hpp index d29e1b4f46..5b22553bd7 100644 --- a/src/common/kmer_index/ph_map/perfect_hash_map_builder.hpp +++ b/src/common/kmer_index/ph_map/perfect_hash_map_builder.hpp @@ -11,7 +11,7 @@ #include "kmer_maps.hpp" #include "cqf_hash_map.hpp" #include "kmer_index/kmer_mph/kmer_index_builder.hpp" - +#include "kmer_index/kmer_mph/kmer_index_builder_mpi.hpp" #include "utils/perf/timetracer.hpp" namespace kmers { @@ -44,6 +44,16 @@ struct PerfectHashMapBuilder { builder.BuildIndex(*index.index_ptr_, storage); index.resize(storage.total_kmers()); } + + template + void BuildIndexMPI(PerfectHashMap &index, + KMerStorage& storage, bool save_final = true) const 
{ + using KMerIndex = typename PerfectHashMap::KMerIndexT; + + kmers::KMerIndexBuilderMPI builder; + size_t sz = builder.BuildIndexMPI(*index.index_ptr_, storage, save_final); + index.resize(sz); + } }; struct CQFHashMapBuilder { @@ -102,6 +112,17 @@ struct KeyStoringIndexBuilder { index.SortUniqueKMers(); } + template + void BuildIndexMPI(KeyStoringMap &index, + KMerStorage& kmerstorage, bool save_final = true) const { + phm_builder_.BuildIndexMPI(index, kmerstorage, save_final); + if (partask::master()) { + VERIFY(!index.kmers_.get()); + index.kmers_file_ = kmerstorage.final_kmers(); + index.SortUniqueKMers(); + } + } + private: PerfectHashMapBuilder phm_builder_; }; @@ -115,6 +136,24 @@ struct KeyIteratingIndexBuilder { index.kmers_ = res.final_kmers(); } + template + void BuildIndexMPI(KeyIteratingMap &index, + KMerStorage& kmerstorage, bool save_final = true) const { + phm_builder_.BuildIndexMPI(index, kmerstorage, save_final); + std::string final_kmers_file; + if (partask::master()) { + index.kmers_ = kmerstorage.final_kmers(); + final_kmers_file = index.kmers_->file(); + } + // MPI code leaked so far( TODO do smth with this + partask::broadcast(final_kmers_file); + if (partask::worker()) { + index.kmers_ = fs::tmp::acquire_temp_file(final_kmers_file); + index.kmers_->release(); + } + INFO("Final K-mers file: " << final_kmers_file); + } + private: PerfectHashMapBuilder phm_builder_; }; @@ -146,4 +185,22 @@ void BuildIndex(PerfectHashMap &index, PerfectHashMapBuilder().BuildIndex(index, storage, thread_num); } +template +void BuildIndexMPI(PerfectHashMap &index, + KMerStorage &storage, bool save_final = true) { + PerfectHashMapBuilder().BuildIndexMPI(index, storage, save_final); +} + +template +void BuildIndexMPI(KeyStoringMap &index, + KMerStorage &storage, bool save_final = true) { + KeyStoringIndexBuilder().BuildIndexMPI(index, storage, save_final); +} + +template +void BuildIndexMPI(KeyIteratingMap &index, + KMerStorage &storage, bool save_final = true) { + KeyIteratingIndexBuilder().BuildIndexMPI(index, storage, save_final); +} + } diff --git a/src/common/stages/construction_mpi.cpp b/src/common/stages/construction_mpi.cpp index 3585467396..9230319953 100644 --- a/src/common/stages/construction_mpi.cpp +++ b/src/common/stages/construction_mpi.cpp @@ -16,6 +16,7 @@ #include "io/reads/coverage_filtering_read_wrapper.hpp" #include "io/reads/multifile_reader.hpp" #include "kmer_index/ph_map/coverage_hash_map_builder.hpp" +#include "kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp" #include "modules/graph_construction.hpp" #include "pipeline/genomic_info.hpp" #include "pipeline/graph_pack.hpp" @@ -276,11 +277,11 @@ class ExtensionIndexBuilder : public ConstructionMPI::Phase { storage().kmers.reset(new kmers::KMerDiskStorage()); partask::broadcast(*storage().kmers); - kmers::DeBruijnExtensionIndexBuilder().BuildExtensionIndexFromKPOMers(storage().workdir, - storage().ext_index, - *storage().kmers, - cfg::get().max_threads, - storage().params.read_buffer_size); + kmers::DeBruijnExtensionIndexBuilderMPI().BuildExtensionIndexFromKPOMersMPI(storage().workdir, + storage().ext_index, + *storage().kmers, + cfg::get().max_threads, + storage().params.read_buffer_size); } void load(graph_pack::GraphPack&, From 3334aa7a464ddf046d070da11388b902694bc76b Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Thu, 22 Jul 2021 11:41:59 +0300 Subject: [PATCH 037/102] comments for MergeKmerFileTask --- .../kmer_extension_index_builder_mpi.hpp | 105 +++++++++++++++++- 
.../kmer_mph/kmer_index_builder_mpi.hpp | 88 --------------- 2 files changed, 103 insertions(+), 90 deletions(-) diff --git a/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp b/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp index f99149147d..4421d80996 100644 --- a/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp +++ b/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp @@ -155,6 +155,107 @@ class SplitKPOMersTask { std::string dir_; }; +/* Merge KMerDiskStorages from a few nodes + * + * storages -- KMerDiskStorages which were received from different nodes after SplitKPOMersTask. + * Each storage must be valid and unmerged. It is mean that each storage contain kmers splitted on files consist with + * segmentation_policy, in each file all kmers are unique and sorted. However kmer can be present in different storage, + * but in one particular storage only one time. + * + * MergeKMerFiles merge buckets from different Storages into one storage. The policy of splitting kmers should be the + * same for all storages and should stay the same for final storage. In MergeKMerFilesTask all kmers from 0 buckets + * from all storages should be merge into 0 bucket in final storage, 1 buckets should be merge into 1 bucket etc. + * + * Instead of final storage MergeKMerFilesTask take as a input the vector of files name(ofiles) correspondent to final bucket + * in new storage. After the work of MergeKmerFilesTask files from ofiles should contain merged kmers. Kmers + * in each file should be sorted and unique. 0 file should contain all kmers from all 0 buckets from all storages, + * the 1 file should contain all kmers from 1 buckets from all storages and etc. + */ +template +class MergeKMerFilesTask { + MergeKMerFilesTask() = default; + public: + MergeKMerFilesTask(std::vector> storages, std::vector& ofiles) : storages_{std::move(storages)}, ofiles_{ofiles} {}; + MergeKMerFilesTask(std::istream &is) { + io::binary::BinRead(is, storages_, ofiles_); + } + + void serialize(std::ostream &os) const { + io::binary::BinWrite(os, storages_, ofiles_); + } + + auto make_splitter(size_t) { + return partask::make_seq_generator(storages_[0].num_buckets()); + } + + void process(std::istream &is, std::ostream &) { + std::vector residuals; + while (is.get() && is) { + size_t i; + io::binary::BinRead(is, i); + DEBUG("MergeKMerFilesTask: process, i = " << i); + residuals.push_back(i); + } + + /*size_t num_open_files = omp_get_max_threads() * (2 * partask::world_size()); + INFO("Setting open file limit to " << num_open_files); + utils::limit_file(num_open_files);*/ + + int totalsum = 0; +#pragma omp parallel for + for (size_t idx = 0; idx < residuals.size(); ++idx) { + size_t i = residuals[idx]; // TODO rename var i -> residual + + MMappedRecordArrayWriter os(ofiles_[i], Seq::GetDataSize(storages_[0].k())); + auto elcnt = Seq::GetDataSize(storages_[0].k()); + std::vector> ins; + std::vector strs(storages_.size(), 0); + + bool notEmpty = true; + size_t prevId = -1ULL; + unsigned sumsize = 0; + for (size_t sid = 0; sid < storages_.size(); ++sid) { + ins.push_back(MMappedRecordArrayReader(*storages_[sid].bucket_file(i), Seq::GetDataSize(storages_[0].k()), /* unlink */ false)); + sumsize += ins.back().size(); + } + + int total = 0; + while (notEmpty) { + size_t bstpos = -1ULL; + for (size_t sid = 0; sid < storages_.size(); ++sid) { + if (ins[sid].size() == strs[sid]) { + continue; + } + if (bstpos == -1ULL || 
adt::array_less()(*(ins[sid].begin() + strs[sid]), + *(ins[bstpos].begin() + strs[bstpos]))) { + bstpos = sid; + } + } + if (bstpos != -1ULL) { + if (prevId == -1ULL || adt::array_less()(*(ins[prevId].begin() + (strs[prevId] - 1)), *(ins[bstpos].begin() + (strs[bstpos])))) { + os.resize(1); + os.write(ins[bstpos].data() + strs[bstpos] * elcnt, 1); + total += 1; + } + prevId = bstpos; + strs[bstpos] += 1; + } else { + notEmpty = false; + } + } + totalsum += total; + } + INFO("Total kmers writen" << totalsum); + + } + + void merge(const std::vector &) {} + + private: + std::vector> storages_; + std::vector ofiles_; +}; + template inline void DeBruijnExtensionIndexBuilderMPI::BuildExtensionIndexFromKPOMersMPI(fs::TmpDir workdir, Index &index, @@ -168,7 +269,7 @@ inline void DeBruijnExtensionIndexBuilderMPI::BuildExtensionIndexFromKPOMersMPI( INFO("DeBruijnExtensionIndexBuilder started nthreads = " << nthreads); partask::TaskRegistry treg; - auto merge_kmer_files = treg.add>(); + auto merge_kmer_files = treg.add>(); auto split_kpo_mers = treg.add>(); treg.listen(); DEBUG("Listening started"); @@ -183,7 +284,7 @@ inline void DeBruijnExtensionIndexBuilderMPI::BuildExtensionIndexFromKPOMersMPI( outputfiles.push_back(kmerfiles2.create(i)->file()); } - merge_kmer_files(std::move(unmerged_kmerfiles2), outputfiles, index.k(), workdir->dir()); + merge_kmer_files(std::move(unmerged_kmerfiles2), outputfiles); DEBUG("Merge_kmer_files finished"); } treg.stop_listening(); diff --git a/src/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp b/src/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp index 9f35070840..1aee9c04e6 100644 --- a/src/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp +++ b/src/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp @@ -168,92 +168,4 @@ size_t KMerIndexBuilderMPI::BuildIndexMPI(Index &index, KMerDiskStorage -class MergeKMerFilesTask { - MergeKMerFilesTask() = default; - public: - MergeKMerFilesTask(std::vector> storages, std::vector& ofiles, unsigned k, const std::string &dir) : storages_{std::move(storages)}, ofiles_{ofiles}, k_{k}, dir_{dir} {}; - MergeKMerFilesTask(std::istream &is) { - io::binary::BinRead(is, storages_, ofiles_, dir_, k_); - } - - void serialize(std::ostream &os) const { - io::binary::BinWrite(os, storages_, ofiles_, dir_, k_); - } - - auto make_splitter(size_t) { - return partask::make_seq_generator(storages_[0].num_buckets()); - } - - void process(std::istream &is, std::ostream &) { - std::vector residuals; - while (is.get() && is) { - size_t i; - io::binary::BinRead(is, i); - DEBUG("MergeKMerFilesTask: process, i = " << i); - residuals.push_back(i); - } - - /*size_t num_open_files = omp_get_max_threads() * (2 * partask::world_size()); - INFO("Setting open file limit to " << num_open_files); - utils::limit_file(num_open_files);*/ - - int totalsum = 0; -#pragma omp parallel for - for (size_t idx = 0; idx < residuals.size(); ++idx) { - size_t i = residuals[idx]; // TODO rename var i -> residual - - MMappedRecordArrayWriter os(ofiles_[i], Seq::GetDataSize(storages_[0].k())); - auto elcnt = Seq::GetDataSize(storages_[0].k()); - std::vector> ins; - std::vector strs(storages_.size(), 0); - - bool notEmpty = true; - size_t prevId = -1ULL; - unsigned sumsize = 0; - for (size_t sid = 0; sid < storages_.size(); ++sid) { - ins.push_back(MMappedRecordArrayReader(*storages_[sid].bucket_file(i), Seq::GetDataSize(storages_[0].k()), /* unlink */ false)); - sumsize += ins.back().size(); - } - - int total = 0; - while (notEmpty) { - size_t bstpos = -1ULL; 
- for (size_t sid = 0; sid < storages_.size(); ++sid) { - if (ins[sid].size() == strs[sid]) { - continue; - } - if (bstpos == -1ULL || adt::array_less()(*(ins[sid].begin() + strs[sid]), - *(ins[bstpos].begin() + strs[bstpos]))) { - bstpos = sid; - } - } - if (bstpos != -1ULL) { - if (prevId == -1ULL || adt::array_less()(*(ins[prevId].begin() + (strs[prevId] - 1)), *(ins[bstpos].begin() + (strs[bstpos])))) { - os.resize(1); - os.write(ins[bstpos].data() + strs[bstpos] * elcnt, 1); - total += 1; - } - prevId = bstpos; - strs[bstpos] += 1; - } else { - notEmpty = false; - } - } - totalsum += total; - } - INFO("Total kmers writen" << totalsum); - - } - - void merge(const std::vector &) {} - - private: - std::vector> storages_; - std::vector ofiles_; - unsigned k_; - std::string dir_; -}; } From cdb191dedacdb22345905f20403c757780336dfd Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Thu, 22 Jul 2021 11:52:35 +0300 Subject: [PATCH 038/102] verification that after split the #buckets the same in all storages --- .../extension_index/kmer_extension_index_builder_mpi.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp b/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp index 4421d80996..6c30e379c3 100644 --- a/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp +++ b/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp @@ -278,6 +278,12 @@ inline void DeBruijnExtensionIndexBuilderMPI::BuildExtensionIndexFromKPOMersMPI( std::vector outputfiles; DEBUG("Split_kpo_mers started"); auto unmerged_kmerfiles2 = split_kpo_mers(kmerstorage, index.k(), nthreads, read_buffer_size, workdir->dir()); + + //VERIFY that number of buckets in each splitted storage the same + for (size_t i = 0; i < unmerged_kmerfiles2.size(); ++i) { + VERIFY(unmerged_kmerfiles2[i].num_buckets() == kmerstorage.num_buckets()); + } + DEBUG("Split_kpo_mers finished"); for (unsigned i = 0; i < kmerfiles2.num_buckets(); ++i) { From 8bc46ff5789c2fa3fadd044cc9b6424aa2d3334e Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Thu, 22 Jul 2021 12:05:55 +0300 Subject: [PATCH 039/102] make kpostorage const --- .../kmer_extension_index_builder_mpi.hpp | 54 +++++++++++-------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp b/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp index 6c30e379c3..98dad4e7c0 100644 --- a/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp +++ b/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp @@ -38,9 +38,14 @@ class DeBruijnExtensionIndexBuilderMPI : public DeBruijnExtensionIndexBuilder { nthreads, read_buffer_size); } + /* + * Build extension index from k+1-mers. + * + * kpostorage -- storage with k+1 mers. This storage is unmerged. 
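+     *
+     * Sketch of the intended call site (taken from the ExtensionIndexBuilder phase in
+     * construction_mpi.cpp, lightly simplified):
+     *   DeBruijnExtensionIndexBuilderMPI().BuildExtensionIndexFromKPOMersMPI(
+     *       workdir, ext_index, kpostorage, max_threads, read_buffer_size);
+     * where kpostorage is the unmerged k+1-mer storage produced by the preceding k-mer counting phase.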
+ * */ template void BuildExtensionIndexFromKPOMersMPI(fs::TmpDir workdir, - Index &index, KMerStorage &kmerstorage, + Index &index, const KMerStorage &kpostorage, unsigned nbuckets, size_t read_buffer_size = 0) const; private: @@ -52,18 +57,18 @@ class FillIndexTask { public: FillIndexTask() = default; - FillIndexTask(kmers::KMerDiskStorage &storage) : storage_{storage} {}; + FillIndexTask(const kmers::KMerDiskStorage &kpostorage) : kpostorage_{kpostorage} {}; FillIndexTask(std::istream &is) { - storage_.BinRead(is); + kpostorage_.BinRead(is); } std::ostream &serialize(std::ostream &os) const { - io::binary::BinWrite(os, storage_); + io::binary::BinWrite(os, kpostorage_); return os; } auto make_splitter(size_t, Index &) { - return partask::make_seq_generator(storage_.num_buckets()); + return partask::make_seq_generator(kpostorage_.num_buckets()); } void process(std::istream &is, std::ostream & /*os*/, Index &index) { @@ -71,7 +76,7 @@ class FillIndexTask { # pragma omp parallel for for (size_t i = 0; i < file_ids.size(); ++i) { size_t idx = file_ids[i]; - builder_.FillExtensionsFromIndex(storage_.bucket_begin(idx), storage_.bucket_end(idx), index); + builder_.FillExtensionsFromIndex(kpostorage_.bucket_begin(idx), kpostorage_.bucket_end(idx), index); } // Send nothing @@ -84,7 +89,7 @@ class FillIndexTask { } private: - kmers::KMerDiskStorage storage_; + kmers::KMerDiskStorage kpostorage_; DeBruijnExtensionIndexBuilder builder_; }; @@ -93,24 +98,25 @@ class SplitKPOMersTask { typedef typename Index::traits_t::SeqType Seq; SplitKPOMersTask() = default; public: - SplitKPOMersTask(kmers::KMerDiskStorage &storage, + SplitKPOMersTask(const kmers::KMerDiskStorage &kpostorage, unsigned k, unsigned nthreads, size_t read_buffer_size, const std::string &dir) - : storage_{storage}, k_{k}, nthreads_{nthreads}, read_buffer_size_{read_buffer_size}, dir_{dir} {}; + : kpostorage_{kpostorage}, k_{k}, nthreads_{nthreads}, read_buffer_size_{read_buffer_size}, dir_{dir} {}; + SplitKPOMersTask(std::istream &is) { - io::binary::BinRead(is, storage_, k_, nthreads_, read_buffer_size_, dir_); + io::binary::BinRead(is, kpostorage_, k_, nthreads_, read_buffer_size_, dir_); } std::ostream &serialize(std::ostream &os) const { - io::binary::BinWrite(os, storage_, k_, nthreads_, read_buffer_size_, dir_); + io::binary::BinWrite(os, kpostorage_, k_, nthreads_, read_buffer_size_, dir_); return os; } auto make_splitter(size_t) { - return partask::make_seq_generator(storage_.num_buckets()); + return partask::make_seq_generator(kpostorage_.num_buckets()); } void process(std::istream &is, std::ostream &os) { @@ -123,11 +129,11 @@ class SplitKPOMersTask { policy_ = splitter.bucket_policy(); auto file_ids = partask::get_seq(is); for (size_t i : file_ids) { - splitter.AddKMers(storage_.bucket(i)); + splitter.AddKMers(kpostorage_.bucket(i)); } kmers::KMerDiskCounter counter2(workdir, splitter); - auto storage2 = counter2.CountAll(storage_.num_buckets(), nthreads_,/* merge */ false); + auto storage2 = counter2.CountAll(kpostorage_.num_buckets(), nthreads_,/* merge */ false); storage2.BinWrite(os); storage2.release_all(); } @@ -147,7 +153,7 @@ class SplitKPOMersTask { } private: - kmers::KMerDiskStorage storage_; + kmers::KMerDiskStorage kpostorage_; typename kmers::KMerDiskStorage::KMerSegmentPolicy policy_; unsigned k_; unsigned nthreads_; @@ -155,6 +161,7 @@ class SplitKPOMersTask { std::string dir_; }; + /* Merge KMerDiskStorages from a few nodes * * storages -- KMerDiskStorages which were received from different nodes after 
SplitKPOMersTask. @@ -256,16 +263,21 @@ class MergeKMerFilesTask { std::vector ofiles_; }; +/* + * Build extension index from k+1-mers. + * + * kpostorage -- storage with k+1 mers. This storage is unmerged. + */ template inline void DeBruijnExtensionIndexBuilderMPI::BuildExtensionIndexFromKPOMersMPI(fs::TmpDir workdir, Index &index, - KMerStorage &kmerstorage, + const KMerStorage &kpostorage, unsigned nthreads, size_t read_buffer_size) const { typedef typename Index::traits_t::SeqType Seq; - VERIFY(kmerstorage.k() == index.k() + 1); + VERIFY(kpostorage.k() == index.k() + 1); - KMerStorage kmerfiles2(workdir, index.k(), kmerstorage.segment_policy()); + KMerStorage kmerfiles2(workdir, index.k(), kpostorage.segment_policy()); INFO("DeBruijnExtensionIndexBuilder started nthreads = " << nthreads); partask::TaskRegistry treg; @@ -277,11 +289,11 @@ inline void DeBruijnExtensionIndexBuilderMPI::BuildExtensionIndexFromKPOMersMPI( if (partask::master()) { std::vector outputfiles; DEBUG("Split_kpo_mers started"); - auto unmerged_kmerfiles2 = split_kpo_mers(kmerstorage, index.k(), nthreads, read_buffer_size, workdir->dir()); + auto unmerged_kmerfiles2 = split_kpo_mers(kpostorage, index.k(), nthreads, read_buffer_size, workdir->dir()); //VERIFY that number of buckets in each splitted storage the same for (size_t i = 0; i < unmerged_kmerfiles2.size(); ++i) { - VERIFY(unmerged_kmerfiles2[i].num_buckets() == kmerstorage.num_buckets()); + VERIFY(unmerged_kmerfiles2[i].num_buckets() == kpostorage.num_buckets()); } DEBUG("Split_kpo_mers finished"); @@ -304,7 +316,7 @@ inline void DeBruijnExtensionIndexBuilderMPI::BuildExtensionIndexFromKPOMersMPI( DEBUG("Listening started"); if (partask::master()) { - fill_index(kmerstorage); + fill_index(kpostorage); } treg.stop_listening(); From 96c24141cb9c102594902b828b870992e13321cf Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Thu, 22 Jul 2021 12:28:22 +0300 Subject: [PATCH 040/102] comment for release_all --- src/common/kmer_index/kmer_mph/kmer_index_builder.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/common/kmer_index/kmer_mph/kmer_index_builder.hpp b/src/common/kmer_index/kmer_mph/kmer_index_builder.hpp index ab7a88ef9e..78243aaeff 100644 --- a/src/common/kmer_index/kmer_mph/kmer_index_builder.hpp +++ b/src/common/kmer_index/kmer_mph/kmer_index_builder.hpp @@ -141,6 +141,14 @@ class KMerDiskStorage { return fsize / (Seq::GetDataSize(k_) * sizeof(typename Seq::DataType)); } + /* + * Stop owning all files. After this object dies the bucket, + * the files will not be deleted. Files will need to be deleted manually. + * + * TmpFiles contain some counter and it is deleted when counter = 0. In this function + * we don't decrement counter, but don't decrement it even when this object is freed. If we owning file + * we will decrement counter on deletion. 
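+     *
+     * Typical use in this codebase: a task (e.g. SplitKPOMersTask::process) BinWrite()s the
+     * freshly counted storage into its reply stream and then calls release_all(), so the
+     * bucket files outlive the local object and can still be opened and merged on the master node.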
+ */ void release_all() { if (all_kmers_) { all_kmers_->release(); From dde28921427ea793ae7ecf834bc9ebf0869e2d98 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Thu, 22 Jul 2021 14:04:21 +0300 Subject: [PATCH 041/102] add comment for Splitter --- .../extension_index/kmer_extension_index_builder_mpi.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp b/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp index 98dad4e7c0..781cc93447 100644 --- a/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp +++ b/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp @@ -93,6 +93,14 @@ class FillIndexTask { DeBruijnExtensionIndexBuilder builder_; }; + +/* + * Build kmer storages from k+1-mer storage + * + * Return the vector of kmer storages one for each node. Each kmer storage contain unique sorted kmers. + * The segmentation policy for each returned kmer storage the same as policy for k+1-mer storage. + * All returned storages are unmerged. + */ template class SplitKPOMersTask { typedef typename Index::traits_t::SeqType Seq; From a6f36911207563be8df2aacbc965280ee7f875c4 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Thu, 22 Jul 2021 16:37:40 +0300 Subject: [PATCH 042/102] resize one time in MergeKMers --- .../kmer_extension_index_builder_mpi.hpp | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp b/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp index 781cc93447..9e7560714e 100644 --- a/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp +++ b/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp @@ -225,6 +225,7 @@ class MergeKMerFilesTask { auto elcnt = Seq::GetDataSize(storages_[0].k()); std::vector> ins; std::vector strs(storages_.size(), 0); + std::vector> oids; bool notEmpty = true; size_t prevId = -1ULL; @@ -248,8 +249,7 @@ class MergeKMerFilesTask { } if (bstpos != -1ULL) { if (prevId == -1ULL || adt::array_less()(*(ins[prevId].begin() + (strs[prevId] - 1)), *(ins[bstpos].begin() + (strs[bstpos])))) { - os.resize(1); - os.write(ins[bstpos].data() + strs[bstpos] * elcnt, 1); + oids.push_back(std::make_pair(bstpos, strs[bstpos])); total += 1; } prevId = bstpos; @@ -258,9 +258,14 @@ class MergeKMerFilesTask { notEmpty = false; } } + os.resize(total); + for (auto oid : oids) { + os.write(ins[oid.first].data() + oid.second * elcnt, 1); + } + totalsum += total; } - INFO("Total kmers writen" << totalsum); + DEBUG("Total kmers writen= " << totalsum); } @@ -296,7 +301,7 @@ inline void DeBruijnExtensionIndexBuilderMPI::BuildExtensionIndexFromKPOMersMPI( if (partask::master()) { std::vector outputfiles; - DEBUG("Split_kpo_mers started"); + INFO("Split_kpo_mers started"); auto unmerged_kmerfiles2 = split_kpo_mers(kpostorage, index.k(), nthreads, read_buffer_size, workdir->dir()); //VERIFY that number of buckets in each splitted storage the same @@ -304,19 +309,19 @@ inline void DeBruijnExtensionIndexBuilderMPI::BuildExtensionIndexFromKPOMersMPI( VERIFY(unmerged_kmerfiles2[i].num_buckets() == kpostorage.num_buckets()); } - DEBUG("Split_kpo_mers finished"); + INFO("Split_kpo_mers finished"); for (unsigned i = 0; i < kmerfiles2.num_buckets(); ++i) { outputfiles.push_back(kmerfiles2.create(i)->file()); } merge_kmer_files(std::move(unmerged_kmerfiles2), outputfiles); - 
DEBUG("Merge_kmer_files finished"); + INFO("Merge_kmer_files finished"); } treg.stop_listening(); partask::broadcast(kmerfiles2); - INFO("Total kmers=" << kmerfiles2.total_kmers()); + INFO("Total kmers= " << kmerfiles2.total_kmers()); BuildIndexMPI(index, kmerfiles2, /* save_final */ true); auto fill_index = treg.add>(std::ref(index)); From d2ad40be6361a9e962a780e27eb9961147fe512a Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Thu, 22 Jul 2021 19:24:47 +0300 Subject: [PATCH 043/102] VERIFY kmers in kmer storage are sorted and unique --- .../kmer_extension_index_builder_mpi.hpp | 2 ++ .../kmer_mph/kmer_index_builder.hpp | 27 +++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp b/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp index 9e7560714e..f538d39014 100644 --- a/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp +++ b/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp @@ -307,6 +307,7 @@ inline void DeBruijnExtensionIndexBuilderMPI::BuildExtensionIndexFromKPOMersMPI( //VERIFY that number of buckets in each splitted storage the same for (size_t i = 0; i < unmerged_kmerfiles2.size(); ++i) { VERIFY(unmerged_kmerfiles2[i].num_buckets() == kpostorage.num_buckets()); + VERIFY_DEV(unmerged_kmerfiles2[i].is_unique_and_sorted()); } INFO("Split_kpo_mers finished"); @@ -316,6 +317,7 @@ inline void DeBruijnExtensionIndexBuilderMPI::BuildExtensionIndexFromKPOMersMPI( } merge_kmer_files(std::move(unmerged_kmerfiles2), outputfiles); + VERIFY_DEV(kmerfiles2.is_unique_and_sorted()); INFO("Merge_kmer_files finished"); } treg.stop_listening(); diff --git a/src/common/kmer_index/kmer_mph/kmer_index_builder.hpp b/src/common/kmer_index/kmer_mph/kmer_index_builder.hpp index 78243aaeff..41523d9648 100644 --- a/src/common/kmer_index/kmer_mph/kmer_index_builder.hpp +++ b/src/common/kmer_index/kmer_mph/kmer_index_builder.hpp @@ -219,6 +219,33 @@ class KMerDiskStorage { io::binary::BinWrite(os, buckets_); } + //helper function for verification + //all kmers in all buckets are sorted and unique + bool is_unique_and_sorted() const { + if (all_kmers_) { + return true; + } + bool is_good = true; + +#pragma omp parallel for + for (size_t bid = 0; bid < this->num_buckets(); ++bid) { + MMappedRecordArrayReader ins(*buckets_[bid], Seq::GetDataSize(k_), /* unlink */ false); + for (size_t i = 1; i < ins.size(); ++i) { + if (!adt::array_less()(*(ins.begin() + i - 1), *(ins.begin() + i))) { +#pragma omp critical + is_good = false; + } + } + } + + if (is_good) { + INFO("Storage contain only sorted and unique kmers"); + } else { + INFO("Kmers in storage aren't sorted or aren't unique"); + } + return is_good; + } + private: fs::TmpDir work_dir_; fs::TmpFile kmer_prefix_; From 9b2a16b736d10507e12bf4e1df0d76d85a623338 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Thu, 22 Jul 2021 19:49:37 +0300 Subject: [PATCH 044/102] fix warnings --- .../kmer_extension_index_builder_mpi.hpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp b/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp index f538d39014..f63a6112b5 100644 --- a/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp +++ b/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp @@ -141,7 +141,7 @@ class SplitKPOMersTask { } 
kmers::KMerDiskCounter counter2(workdir, splitter); - auto storage2 = counter2.CountAll(kpostorage_.num_buckets(), nthreads_,/* merge */ false); + auto storage2 = counter2.CountAll(unsigned(kpostorage_.num_buckets()), nthreads_,/* merge */ false); storage2.BinWrite(os); storage2.release_all(); } @@ -224,15 +224,13 @@ class MergeKMerFilesTask { MMappedRecordArrayWriter os(ofiles_[i], Seq::GetDataSize(storages_[0].k())); auto elcnt = Seq::GetDataSize(storages_[0].k()); std::vector> ins; - std::vector strs(storages_.size(), 0); - std::vector> oids; + std::vector strs(storages_.size(), 0ul); + std::vector> oids; bool notEmpty = true; size_t prevId = -1ULL; - unsigned sumsize = 0; for (size_t sid = 0; sid < storages_.size(); ++sid) { ins.push_back(MMappedRecordArrayReader(*storages_[sid].bucket_file(i), Seq::GetDataSize(storages_[0].k()), /* unlink */ false)); - sumsize += ins.back().size(); } int total = 0; @@ -312,7 +310,7 @@ inline void DeBruijnExtensionIndexBuilderMPI::BuildExtensionIndexFromKPOMersMPI( INFO("Split_kpo_mers finished"); - for (unsigned i = 0; i < kmerfiles2.num_buckets(); ++i) { + for (size_t i = 0; i < kmerfiles2.num_buckets(); ++i) { outputfiles.push_back(kmerfiles2.create(i)->file()); } From f148be38a741b602fe502b82694237050b653cc0 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Mon, 26 Jul 2021 20:50:32 +0300 Subject: [PATCH 045/102] MPI KMerCounter --- src/common/stages/construction_mpi.cpp | 133 +++++++++++++++++++++++-- 1 file changed, 124 insertions(+), 9 deletions(-) diff --git a/src/common/stages/construction_mpi.cpp b/src/common/stages/construction_mpi.cpp index 9230319953..9858098c3d 100644 --- a/src/common/stages/construction_mpi.cpp +++ b/src/common/stages/construction_mpi.cpp @@ -218,6 +218,87 @@ void save(const graph_pack::GraphPack&, }; +template +class ReadKMerCountingTask { + private: + ReadKMerCountingTask() = default; + using ReadStreams = io::ReadStreamList; + public: + ReadKMerCountingTask(const std::string &dir, unsigned k, size_t buffer_size, + unsigned num_buckets, unsigned num_threads) + : dir_{dir}, k_{k}, buffer_size_{buffer_size}, num_buckets_{num_buckets}, num_threads_{num_threads} {}; + ReadKMerCountingTask(std::istream &is) { deserialize(is); } + std::ostream &serialize(std::ostream &os) const { + io::binary::BinWrite(os, dir_, k_, buffer_size_, num_buckets_, num_threads_); + return os; + } + + std::istream &deserialize(std::istream &is) { + io::binary::BinRead(is, dir_, k_, buffer_size_, num_buckets_, num_threads_); + return is; + } + + template + auto make_splitter(size_t, ReadStreams &read_streams, Args &&...) { + return partask::make_seq_along_generator(read_streams); + } + + void process(std::istream &is, std::ostream &os, ReadStreams &read_streams) { + auto chunks = partask::get_seq(is); + + ReadStreams streams = partask::create_empty_stream_list(chunks.size()); + partask::swap_streams(read_streams, streams, chunks); + streams.reset(); + + if (streams.size() == 0) { + return; + } + + auto workdir = fs::tmp::acquire_temp_dir(dir_); + workdir->release(); + kmers::DeBruijnReadKMerSplitter> splitter( + workdir, k_, streams, buffer_size_); + + kmers::KMerDiskCounter counter(workdir, splitter); + + auto kmerstorage = counter.CountAll(num_buckets_, num_threads_, /* merge */ false); + INFO("k-mers counted successfully"); + + if (kmerstorage.total_kmers() == 0) { + WARN("No kmers were extracted from reads. 
Check the read lengths and k-mer length settings"); + } + + kmerstorage.BinWrite(os); + kmerstorage.release_all(); + + partask::swap_streams(read_streams, streams, chunks); + } + + auto merge(const std::vector &piss, ReadStreams & /*read_streams*/) { + auto workdir = fs::tmp::acquire_temp_dir(dir_); + workdir->release(); + + + std::vector> storages; + for (size_t i = 0; i < piss.size(); ++i) { + auto &is = *piss[i]; + kmers::KMerDiskStorage kmerstorage(workdir, k_, typename kmer::KMerSegmentPolicy(num_buckets_)); + kmerstorage.BinRead(is); + storages.push_back(std::move(kmerstorage)); + } + + return storages; + } + + private: + std::string dir_; + unsigned k_; + size_t buffer_size_; + unsigned num_buckets_; + unsigned num_threads_; +}; + + class KMerCounting : public ConstructionMPI::Phase { typedef rolling_hash::SymmetricCyclicHash<> SeqHasher; public: @@ -226,7 +307,10 @@ class KMerCounting : public ConstructionMPI::Phase { virtual ~KMerCounting() = default; + bool distributed() const override { return true; } + void run(graph_pack::GraphPack &, const char*) override { + sync(); // TODO change for syncfs (it requires opened file descriptor from the fs) auto &read_streams = storage().read_streams; auto &contigs_streams = storage().contigs_streams; const auto &index = storage().ext_index; @@ -235,18 +319,49 @@ class KMerCounting : public ConstructionMPI::Phase { VERIFY_MSG(read_streams.size(), "No input streams specified"); - io::ReadStreamList merge_streams = temp_merge_read_streams(read_streams, contigs_streams); + unsigned k = index.k(); unsigned nthreads = cfg::get().max_threads; - using Splitter = kmers::DeBruijnReadKMerSplitter>; - - kmers::KMerDiskCounter - counter(storage().workdir, - Splitter(storage().workdir, index.k() + 1, merge_streams, buffer_size)); - auto kmers = counter.Count(10 * nthreads, nthreads); - storage().kmers.reset(new kmers::KMerDiskStorage(std::move(kmers))); + unsigned num_buckets = 10 * nthreads; + using Seq = RtSeq; + + kmers::KMerDiskStorage kmerfiles2(storage().workdir, k + 1, typename kmer::KMerSegmentPolicy(num_buckets)); + + + VERIFY(partask::all_equal(num_buckets)); + VERIFY(partask::all_equal(k)); + VERIFY(partask::all_equal(nthreads)); + + partask::TaskRegistry treg; + auto merge_kmer_files = treg.add>(); + auto kmercount = treg.add>(std::ref(merge_streams)); + treg.listen(); + INFO("Listening started " << (partask::master() ? "(master)" : "(worker)")); + + if (partask::master()) { + INFO("Start KMer Counting") + auto unmerged_kmerfiles = kmercount(storage().workdir->dir(), k + 1, buffer_size, num_buckets, nthreads); + + std::vector outputfiles; + for (size_t i = 0; i < kmerfiles2.num_buckets(); ++i) { + outputfiles.push_back(kmerfiles2.create(i)->file()); + } + + INFO("Start Merge results from different nodes") + merge_kmer_files(std::move(unmerged_kmerfiles), outputfiles); + } + treg.stop_listening(); + partask::broadcast(kmerfiles2); + + size_t kmers = kmerfiles2.total_kmers(); + + if (!kmers) { + FATAL_ERROR("No kmers were extracted from reads. 
Check the read lengths and k-mer length settings"); + } + INFO(kmers << " k+1-mers (k=" << k << ") were extracted"); + + storage().kmers.reset(new kmers::KMerDiskStorage(std::move(kmerfiles2))); } void load(graph_pack::GraphPack&, From f4fa3b619f1b59c1869ca9dc8d0e19fc9683a2dd Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Tue, 27 Jul 2021 20:04:52 +0300 Subject: [PATCH 046/102] close stream after use --- src/common/io/reads/binary_streams.hpp | 12 ++++++++---- src/common/io/reads/multifile_reader.hpp | 2 -- src/common/stages/construction_mpi.cpp | 1 + 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/common/io/reads/binary_streams.hpp b/src/common/io/reads/binary_streams.hpp index ac67e6a321..1f82566170 100644 --- a/src/common/io/reads/binary_streams.hpp +++ b/src/common/io/reads/binary_streams.hpp @@ -31,8 +31,13 @@ class BinaryFileStream { private: size_t offset_, count_, current_; + std::filesystem::path filename_; void Init() { + if (!is_open()) { + stream_.open(filename_, std::ios_base::binary | std::ios_base::in); + } + stream_.clear(); stream_.seekg(offset_); VERIFY_MSG(stream_.good(), "Stream is not good(), offset_ " << offset_ << " count_ " << count_); @@ -49,10 +54,11 @@ class BinaryFileStream { BinaryFileStream(const std::string &file_name_prefix, size_t portion_count, size_t portion_num) { DEBUG("Preparing binary stream #" << portion_num << "/" << portion_count); VERIFY(portion_num < portion_count); - const std::filesystem::path fname = file_name_prefix + ".seq"; - stream_.open(fname, std::ios_base::binary | std::ios_base::in); + filename_ = file_name_prefix + ".seq"; + stream_.open(filename_, std::ios_base::binary | std::ios_base::in); ReadStreamStat stat; stat.read(stream_); + stream_.close(); const std::filesystem::path offset_name = file_name_prefix + ".off"; const size_t chunk_count = file_size(offset_name) / sizeof(size_t); @@ -93,8 +99,6 @@ class BinaryFileStream { count_ = 0; DEBUG("Empty BinaryFileStream constructed"); } - - Init(); } /** diff --git a/src/common/io/reads/multifile_reader.hpp b/src/common/io/reads/multifile_reader.hpp index 4f8d5e0685..7f27b01881 100644 --- a/src/common/io/reads/multifile_reader.hpp +++ b/src/common/io/reads/multifile_reader.hpp @@ -29,13 +29,11 @@ class MultifileStream { MultifileStream(ReadStreamT reader_1) : current_reader_index_(0) { - VERIFY(reader_1.is_open()); readers_.push_back(std::move(reader_1)); } MultifileStream(ReadStreamT reader_1, ReadStreamT reader_2) : current_reader_index_(0) { - VERIFY(reader_1.is_open() && reader_2.is_open()); readers_.push_back(std::move(reader_1)); readers_.push_back(std::move(reader_2)); } diff --git a/src/common/stages/construction_mpi.cpp b/src/common/stages/construction_mpi.cpp index 9858098c3d..11310d59bb 100644 --- a/src/common/stages/construction_mpi.cpp +++ b/src/common/stages/construction_mpi.cpp @@ -271,6 +271,7 @@ class ReadKMerCountingTask { kmerstorage.BinWrite(os); kmerstorage.release_all(); + streams.close(); partask::swap_streams(read_streams, streams, chunks); } From 1955316ccbaa65f87191a14e92a178b8379dc1a2 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Tue, 27 Jul 2021 20:14:05 +0300 Subject: [PATCH 047/102] constat for contig output stage --- src/projects/spades/contig_output_stage.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/projects/spades/contig_output_stage.hpp b/src/projects/spades/contig_output_stage.hpp index c245d6d1c2..3da4228dbb 100644 --- a/src/projects/spades/contig_output_stage.hpp +++ 
b/src/projects/spades/contig_output_stage.hpp @@ -33,6 +33,7 @@ class ContigOutput : public spades::AssemblyStage { void save(const graph_pack::GraphPack &, const std::filesystem::path &, const char *) const override { } void run(graph_pack::GraphPack &gp, const char *) override; + bool constant() const override { return true; } private: OutputList outputs_; }; From ff707c095860ad6ebff37ce0da3dd6d58542cee6 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Tue, 27 Jul 2021 20:57:06 +0300 Subject: [PATCH 048/102] add sync for read conversion --- src/common/stages/read_conversion.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/common/stages/read_conversion.cpp b/src/common/stages/read_conversion.cpp index 250f1a88d4..7275ac45d5 100644 --- a/src/common/stages/read_conversion.cpp +++ b/src/common/stages/read_conversion.cpp @@ -13,6 +13,7 @@ namespace spades { void ReadConversion::run(graph_pack::GraphPack &, const char *) { io::ConvertIfNeeded(cfg::get_writable().ds.reads, cfg::get().max_threads); + sync(); // TODO change for syncfs (it requires opened file descriptor from the fs) } void ReadConversion::load(graph_pack::GraphPack &, From b7370935ab61c54ae4bd2992939af5946140b754 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Tue, 31 Aug 2021 19:23:14 +0300 Subject: [PATCH 049/102] MPI ATtipClipper --- src/common/stages/construction_mpi.cpp | 87 +++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 3 deletions(-) diff --git a/src/common/stages/construction_mpi.cpp b/src/common/stages/construction_mpi.cpp index 11310d59bb..dc2ea82d55 100644 --- a/src/common/stages/construction_mpi.cpp +++ b/src/common/stages/construction_mpi.cpp @@ -413,6 +413,76 @@ class ExtensionIndexBuilder : public ConstructionMPI::Phase { } }; +template +class ATEdgesClippingTask { +public: + ATEdgesClippingTask() = default; + ATEdgesClippingTask(std::istream &is) { deserialize(is); } + + std::ostream &serialize(std::ostream &os) const { return os; } + + std::istream &deserialize(std::istream &is) { return is; } + + auto make_splitter(size_t size, Index &) { + return partask::make_seq_plus_n_generator(size); + } + + void process(std::istream &is, std::ostream &os, Index &index) { + size_t n = 0; + std::vector chunks = partask::get_seq_plus_n(is, n); + + INFO("Job got, " << chunks.size() << "/" << n << "chunks"); + auto iters = index.kmer_begin(n); + std::vector local_iters; + for (size_t i : chunks) { + if (i < iters.size()) { + local_iters.push_back(std::move(iters[i])); + } + } + + EarlyLowComplexityClipperProcessor at_processor(index, 0.8, 10, 200); + at_processor.RemoveATEdges(local_iters); + partask::allreduce(index.raw_data(), index.raw_size(), MPI_BAND); + } + + void merge(const std::vector &, Index &) {} +}; + +template +class ATTipClippingTask { +public: + ATTipClippingTask() = default; + + ATTipClippingTask(std::istream &is) { deserialize(is); } + + std::ostream &serialize(std::ostream &os) const { return os; } + + std::istream &deserialize(std::istream &is) { return is; } + + auto make_splitter(size_t size, Index &) { + return partask::make_seq_plus_n_generator(size); + } + + void process(std::istream &is, std::ostream &os, Index &index) { + size_t n = 0; + std::vector chunks = partask::get_seq_plus_n(is, n); + + INFO("Job got, " << chunks.size() << "/" << n << "chunks"); + auto iters = index.kmer_begin(n); + std::vector local_iters; + for (size_t i : chunks) { + if (i < iters.size()) { + local_iters.push_back(std::move(iters[i])); + } + } + + EarlyLowComplexityClipperProcessor 
at_processor(index, 0.8, 10, 200); + at_processor.RemoveATTips(local_iters); + partask::allreduce(index.raw_data(), index.raw_size(), MPI_BAND); + } + + void merge(const std::vector &, Index &index) {} +}; template class TipClippingTask { @@ -521,9 +591,20 @@ class EarlyATClipper : public ConstructionMPI::Phase { bool distributed() const override { return true; } void run(graph_pack::GraphPack &, const char*) override { - EarlyLowComplexityClipperProcessor at_processor(storage().ext_index, 0.8, 10, 200); - at_processor.RemoveATEdges(); - at_processor.RemoveATTips(); + partask::TaskRegistry treg; + auto &index = storage().ext_index; + using Index = std::remove_reference_t; + VERIFY(partask::all_equal(index.size())); + + auto clip_edges = treg.add>(std::ref(index)); + auto clip_tips = treg.add>(std::ref(index)); + treg.listen(); + + if (partask::master()) { + clip_edges(); + clip_tips(); + } + treg.stop_listening(); } void load(graph_pack::GraphPack&, From 556666d3f0ee4006149145c4751948426586a07c Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Thu, 2 Sep 2021 15:40:18 +0300 Subject: [PATCH 050/102] reduce code dublication --- src/common/stages/construction_mpi.cpp | 123 +++++++++++-------------- 1 file changed, 52 insertions(+), 71 deletions(-) diff --git a/src/common/stages/construction_mpi.cpp b/src/common/stages/construction_mpi.cpp index dc2ea82d55..de2c3dadd0 100644 --- a/src/common/stages/construction_mpi.cpp +++ b/src/common/stages/construction_mpi.cpp @@ -413,15 +413,13 @@ class ExtensionIndexBuilder : public ConstructionMPI::Phase { } }; + template -class ATEdgesClippingTask { +class TipClippingTaskBase { public: - ATEdgesClippingTask() = default; - ATEdgesClippingTask(std::istream &is) { deserialize(is); } - - std::ostream &serialize(std::ostream &os) const { return os; } - - std::istream &deserialize(std::istream &is) { return is; } + TipClippingTaskBase() = default; + virtual std::ostream &serialize(std::ostream &os) const = 0; + virtual std::istream &deserialize(std::istream &is) = 0; auto make_splitter(size_t size, Index &) { return partask::make_seq_plus_n_generator(size); @@ -440,98 +438,79 @@ class ATEdgesClippingTask { } } - EarlyLowComplexityClipperProcessor at_processor(index, 0.8, 10, 200); - at_processor.RemoveATEdges(local_iters); + size_t kpo_mers_removed = process_iner(index, local_iters); + + INFO("K+1-mers removed: " << kpo_mers_removed); partask::allreduce(index.raw_data(), index.raw_size(), MPI_BAND); + io::binary::BinWrite(os, kpo_mers_removed); } - void merge(const std::vector &, Index &) {} + size_t merge(const std::vector &piss, Index &) { + size_t kpo_mers_removed = 0; + for (auto &pis : piss) { + kpo_mers_removed += io::binary::BinRead(*pis); + } + return kpo_mers_removed; + } + +private: + virtual size_t process_iner(Index &, std::vector& /*local_iters*/) = 0; }; + template -class ATTipClippingTask { +class ATEdgesClippingTask : public TipClippingTaskBase { public: - ATTipClippingTask() = default; - - ATTipClippingTask(std::istream &is) { deserialize(is); } - - std::ostream &serialize(std::ostream &os) const { return os; } - - std::istream &deserialize(std::istream &is) { return is; } + ATEdgesClippingTask() = default; + ATEdgesClippingTask(std::istream &is) { deserialize(is); } + std::ostream &serialize(std::ostream &os) const override { return os; } + std::istream &deserialize(std::istream &is) override { return is; } - auto make_splitter(size_t size, Index &) { - return partask::make_seq_plus_n_generator(size); +private: + size_t 
process_iner(Index &index, std::vector& local_iters) override { + EarlyLowComplexityClipperProcessor at_processor(index, 0.8, 10, 200); + return at_processor.RemoveATEdges(local_iters); } +}; - void process(std::istream &is, std::ostream &os, Index &index) { - size_t n = 0; - std::vector chunks = partask::get_seq_plus_n(is, n); - - INFO("Job got, " << chunks.size() << "/" << n << "chunks"); - auto iters = index.kmer_begin(n); - std::vector local_iters; - for (size_t i : chunks) { - if (i < iters.size()) { - local_iters.push_back(std::move(iters[i])); - } - } +template +class ATTipClippingTask : public TipClippingTaskBase { +public: + ATTipClippingTask() = default; + ATTipClippingTask(std::istream &is) { deserialize(is); } + std::ostream &serialize(std::ostream &os) const override { return os; } + std::istream &deserialize(std::istream &is) override { return is; } +private: + size_t process_iner(Index &index, std::vector& local_iters) override { EarlyLowComplexityClipperProcessor at_processor(index, 0.8, 10, 200); - at_processor.RemoveATTips(local_iters); - partask::allreduce(index.raw_data(), index.raw_size(), MPI_BAND); + return at_processor.RemoveATTips(local_iters); } - - void merge(const std::vector &, Index &index) {} }; + template -class TipClippingTask { +class TipClippingTask : public TipClippingTaskBase { TipClippingTask() = default; public: TipClippingTask(size_t length_bound) : length_bound_{length_bound} {} TipClippingTask(std::istream &is) { deserialize(is); } - std::ostream &serialize(std::ostream &os) const { + + std::ostream &serialize(std::ostream &os) const override { io::binary::BinWrite(os, length_bound_); return os; } - std::istream &deserialize(std::istream &is) { + std::istream &deserialize(std::istream &is) override { io::binary::BinRead(is, length_bound_); return is; } - auto make_splitter(size_t size, Index &) { - return partask::make_seq_plus_n_generator(size); - } - - void process(std::istream &is, std::ostream &os, Index &index) { - size_t n = 0; - std::vector chunks = partask::get_seq_plus_n(is, n); - - INFO("Job got, " << chunks.size() << "/" << n << "chunks"); - auto iters = index.kmer_begin(n); - std::vector local_iters; - for (size_t i : chunks) { - if (i < iters.size()) { - local_iters.push_back(std::move(iters[i])); - } - } - size_t kpo_mers_removed = EarlyTipClipperProcessor(index, length_bound_).ClipTips(local_iters); // TODO support empty input - - INFO("K+1-mers removed: " << kpo_mers_removed); - partask::allreduce(index.raw_data(), index.raw_size(), MPI_BAND); - io::binary::BinWrite(os, kpo_mers_removed); - } - - size_t merge(const std::vector &piss, Index&) { - size_t kpo_mers_removed = 0; - for (auto &pis : piss) { - kpo_mers_removed += io::binary::BinRead(*pis); - } - return kpo_mers_removed; + private: + size_t process_iner(Index &index, std::vector& local_iters) override { + return EarlyTipClipperProcessor(index, length_bound_).ClipTips(local_iters); } - private: size_t length_bound_; }; @@ -601,8 +580,10 @@ class EarlyATClipper : public ConstructionMPI::Phase { treg.listen(); if (partask::master()) { - clip_edges(); - clip_tips(); + auto edges_removed = clip_edges(); + auto tips_removed = clip_tips(); + + INFO(edges_removed << " A/T edges and " << tips_removed << " A/T tips were removed by early A-/T-tip clipper"); } treg.stop_listening(); } From d689e1f4cb25a5bfbe1ebe842ce7b78f0cbdccbc Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Thu, 14 Oct 2021 14:40:01 +0300 Subject: [PATCH 051/102] fix pair_info_counter --- 
src/projects/spades/pair_info_count.cpp | 42 ------------------------- 1 file changed, 42 deletions(-) diff --git a/src/projects/spades/pair_info_count.cpp b/src/projects/spades/pair_info_count.cpp index b7cb24e42f..f90cf8f1ca 100644 --- a/src/projects/spades/pair_info_count.cpp +++ b/src/projects/spades/pair_info_count.cpp @@ -190,48 +190,6 @@ size_t ProcessSingleReads(graph_pack::GraphPack &gp, size_t ilib, return single_long_reads.size(); } - -void ProcessPairedReads(graph_pack::GraphPack &gp, - std::unique_ptr filter, - unsigned filter_threshold, - size_t ilib) { - SequencingLib &reads = cfg::get_writable().ds.reads[ilib]; - const auto &data = reads.data(); - - unsigned round_thr = 0; - // Do not round if filtering is disabled - if (filter) - round_thr = unsigned(std::min(cfg::get().de.max_distance_coeff * data.insert_size_deviation * cfg::get().de.rounding_coeff, - cfg::get().de.rounding_thr)); - - SequenceMapperNotifierMPI notifier(cfg::get_writable().ds.reads.lib_count()); - INFO("Left insert size quantile " << data.insert_size_left_quantile << - ", right insert size quantile " << data.insert_size_right_quantile << - ", filtering threshold " << filter_threshold << - ", rounding threshold " << round_thr); - - LatePairedIndexFiller::WeightF weight; - if (filter) { - weight = [&](const std::pair &ep, - const MappingRange&, const MappingRange&) { - return (filter->lookup(ep) > filter_threshold ? 1. : 0.); - }; - } else { - weight = [&](const std::pair &, - const MappingRange&, const MappingRange&) { - return 1.; - }; - } - - using Indices = omnigraph::de::UnclusteredPairedInfoIndicesT; - LatePairedIndexFiller pif(gp.get(), weight, round_thr, gp.get_mutable()[ilib]); - notifier.Subscribe(&pif, ilib); - - size_t num_readers = partask::overall_num_threads(); - auto paired_streams = paired_binary_readers(reads, /*followed by rc*/false, (size_t) data.mean_insert_size, - /*include merged*/true, num_readers); - notifier.ProcessLibrary(paired_streams, ilib, *ChooseProperMapper(gp, reads)); -} } // namespace void PairInfoCount::run(graph_pack::GraphPack &gp, const char *) { From fe27f20b87758fc528e87b325931becf0e385efa Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Wed, 27 Oct 2021 21:31:16 +0300 Subject: [PATCH 052/102] detach edge index on load --- src/common/io/binary/graph_pack.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/common/io/binary/graph_pack.cpp b/src/common/io/binary/graph_pack.cpp index 53c6606f47..e83f57e1f8 100644 --- a/src/common/io/binary/graph_pack.cpp +++ b/src/common/io/binary/graph_pack.cpp @@ -91,13 +91,15 @@ class Loader { template void Load() { INFO("Trying to load " << typeid(T).name()); + auto &component = gp.get_mutable(); + if (component.IsAttached()) + component.Detach(); + if (!io::binary::BinRead(infoStream)) { INFO("Not attached, skipping"); return; } - auto &component = gp.get_mutable(); - if (component.IsAttached()) - component.Detach(); + typename IOTraits::Type io; bool loaded = io.Load(basename, component); VERIFY(loaded); @@ -120,13 +122,14 @@ class BinReader { template void Read() { INFO("Trying to read " << typeid(T).name()); + auto &component = gp.get_mutable(); + if (component.IsAttached()) + component.Detach(); + if (!io::binary::BinRead(is)) { INFO("Not attached, skipping"); return; } - auto &component = gp.get_mutable(); - if (component.IsAttached()) - component.Detach(); typename IOTraits::Type io; auto loaded = io.BinRead(is, component); VERIFY(loaded); From 
ed7c152b24a5a3c77ceb8804a30975f07ce1d2cd Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Thu, 28 Oct 2021 17:00:38 +0300 Subject: [PATCH 053/102] detach edge index at the end of pair_info_counter --- src/projects/spades/pair_info_count.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/projects/spades/pair_info_count.cpp b/src/projects/spades/pair_info_count.cpp index f90cf8f1ca..ec1b64902a 100644 --- a/src/projects/spades/pair_info_count.cpp +++ b/src/projects/spades/pair_info_count.cpp @@ -294,6 +294,8 @@ void PairInfoCount::run(graph_pack::GraphPack &gp, const char *) { } } } + + DetachEdgeIndex(gp); } } // namespace debruijn_graph From 61b831854e6a946a4abadb951edb1a7217eb821a Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Tue, 5 Oct 2021 19:53:13 +0300 Subject: [PATCH 054/102] create lib Spades-MPI --- assembler/src/mpi/CMakeLists.txt | 7 +++++++ assembler/src/mpi/projects/CMakeLists.txt | 7 +++++++ src/common/paired_info/paired_info_buffer.hpp | 2 +- src/projects/spades/CMakeLists.txt | 17 ----------------- 4 files changed, 15 insertions(+), 18 deletions(-) create mode 100644 assembler/src/mpi/CMakeLists.txt create mode 100644 assembler/src/mpi/projects/CMakeLists.txt diff --git a/assembler/src/mpi/CMakeLists.txt b/assembler/src/mpi/CMakeLists.txt new file mode 100644 index 0000000000..489b67e2bb --- /dev/null +++ b/assembler/src/mpi/CMakeLists.txt @@ -0,0 +1,7 @@ +############################################################################ +# Copyright (c) 2021 Saint Petersburg State University +# All Rights Reserved +# See file LICENSE for details. +############################################################################ + +add_subdirectory(projects) \ No newline at end of file diff --git a/assembler/src/mpi/projects/CMakeLists.txt b/assembler/src/mpi/projects/CMakeLists.txt new file mode 100644 index 0000000000..ab1ab907e0 --- /dev/null +++ b/assembler/src/mpi/projects/CMakeLists.txt @@ -0,0 +1,7 @@ +############################################################################ +# Copyright (c) 2021 Saint Petersburg State University +# All Rights Reserved +# See file LICENSE for details. 
+############################################################################ + +add_subdirectory(spades) \ No newline at end of file diff --git a/src/common/paired_info/paired_info_buffer.hpp b/src/common/paired_info/paired_info_buffer.hpp index 460b573862..c5fe6b7258 100644 --- a/src/common/paired_info/paired_info_buffer.hpp +++ b/src/common/paired_info/paired_info_buffer.hpp @@ -236,7 +236,7 @@ class PairedBuffer : public PairedBufferBase, } } - private: +private: std::pair InsertOne(EdgeId e1, EdgeId e2, InnerPoint p) { InnerMap& second = storage_[e1]; typename InnerHistPtr::pointer inserted = nullptr; diff --git a/src/projects/spades/CMakeLists.txt b/src/projects/spades/CMakeLists.txt index c31c42b0b0..bd2b6b8423 100644 --- a/src/projects/spades/CMakeLists.txt +++ b/src/projects/spades/CMakeLists.txt @@ -44,23 +44,6 @@ install(TARGETS spades-core DESTINATION bin COMPONENT spades) -if (MPI_FOUND) - add_executable(spades-hpc - main_mpi.cpp - ${SPADES_SRC}) - - target_link_libraries(spades-hpc ${SPADES_LIB} ${MPI_LIBRARIES}) - set_target_properties(spades-hpc PROPERTIES LINK_FLAGS "${MPI_LINK_FLAGS}") - - if (SPADES_STATIC_BUILD) - set_target_properties(spades-hpc PROPERTIES LINK_SEARCH_END_STATIC 1) - endif() - install(TARGETS spades-hpc - DESTINATION bin - COMPONENT runtime) -endif() - - # Configs install(DIRECTORY "configs/" # Trailing / is important DESTINATION share/spades/configs/debruijn From 39b5ecee6428b905ac89bd4148ab9aff37b77830 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Tue, 12 Oct 2021 12:23:08 +0300 Subject: [PATCH 055/102] separate distEst --- .../paired_info/distance_estimation.cpp | 21 -- .../paired_info/distance_estimation.hpp | 95 ------ .../paired_info/distance_estimation_utils.cpp | 250 +++++++------- .../paired_info/distance_estimation_utils.hpp | 130 ++++++-- src/projects/hpcspades/CMakeLists.txt | 27 ++ .../projects/hpcspades/common}/CMakeLists.txt | 5 +- .../common/paired_info}/CMakeLists.txt | 8 +- .../paired_info/distance_estimation.cpp | 33 ++ .../paired_info/distance_estimation.hpp | 108 ++++++ .../paired_info/distance_estimation_utils.cpp | 30 ++ .../paired_info/distance_estimation_utils.hpp | 75 +++++ .../hpcspades/distance_estimation_mpi.cpp | 34 ++ .../hpcspades/distance_estimation_mpi.hpp | 42 +++ src/projects/hpcspades/main_mpi.cpp | 143 ++++++++ src/projects/hpcspades/pipeline.cpp | 315 ++++++++++++++++++ src/projects/spades/distance_estimation.cpp | 29 +- src/projects/spades/distance_estimation.hpp | 26 +- .../spades/distance_estimation_mpi.cpp | 34 ++ .../spades/distance_estimation_mpi.hpp | 43 +++ src/projects/spades/pipeline.cpp | 7 +- 20 files changed, 1187 insertions(+), 268 deletions(-) create mode 100644 src/projects/hpcspades/CMakeLists.txt rename {assembler/src/mpi => src/projects/hpcspades/common}/CMakeLists.txt (67%) rename {assembler/src/mpi/projects => src/projects/hpcspades/common/paired_info}/CMakeLists.txt (57%) create mode 100644 src/projects/hpcspades/common/paired_info/distance_estimation.cpp create mode 100644 src/projects/hpcspades/common/paired_info/distance_estimation.hpp create mode 100644 src/projects/hpcspades/common/paired_info/distance_estimation_utils.cpp create mode 100644 src/projects/hpcspades/common/paired_info/distance_estimation_utils.hpp create mode 100644 src/projects/hpcspades/distance_estimation_mpi.cpp create mode 100644 src/projects/hpcspades/distance_estimation_mpi.hpp create mode 100644 src/projects/hpcspades/main_mpi.cpp create mode 100644 src/projects/hpcspades/pipeline.cpp create mode 100644 
src/projects/spades/distance_estimation_mpi.cpp create mode 100644 src/projects/spades/distance_estimation_mpi.hpp diff --git a/src/common/paired_info/distance_estimation.cpp b/src/common/paired_info/distance_estimation.cpp index 1f8b83a09a..131b18e20c 100644 --- a/src/common/paired_info/distance_estimation.cpp +++ b/src/common/paired_info/distance_estimation.cpp @@ -8,7 +8,6 @@ #include "distance_estimation.hpp" #include "assembly_graph/paths/path_processor.hpp" -#include "pipeline/partask_mpi.hpp" namespace omnigraph { namespace de { @@ -182,25 +181,5 @@ void DistanceEstimator::ProcessEdge(EdgeId e1, const InPairedIndex &pi, PairedIn this->AddToResult(res, ep, result); } } - -void DistanceEstimatorMPI::Estimate(PairedInfoIndexT &result, size_t nthreads) const { - this->Init(); - const auto &index = this->index(); - - DEBUG("Collecting edge infos"); - std::vector edges; - for (EdgeId e : this->graph().edges()) - edges.push_back(e); - - partask::TaskRegistry treg; - auto dist_estimator_mpi = treg.add(std::cref(index), std::cref(dist_estimator_), std::ref(result)); - treg.listen(); - - if (partask::master()) { - dist_estimator_mpi(edges, nthreads); - } - treg.stop_listening(); - partask::broadcast(result); -} } } diff --git a/src/common/paired_info/distance_estimation.hpp b/src/common/paired_info/distance_estimation.hpp index d76798e74b..cdad768839 100644 --- a/src/common/paired_info/distance_estimation.hpp +++ b/src/common/paired_info/distance_estimation.hpp @@ -16,7 +16,6 @@ #include "utils/parallel/openmp_wrapper.h" #include "math/xmath.h" -#include "pipeline/partask_mpi.hpp" namespace omnigraph { @@ -138,100 +137,6 @@ class DistanceEstimator : public AbstractDistanceEstimator { DECL_LOGGER("DistanceEstimator"); }; -class DistanceEstimatorMPI : public DistanceEstimator { - typedef DistanceEstimator base; - typedef std::vector GraphLengths; - typedef std::vector> EstimHist; - typedef std::pair EdgePair; - - protected: - typedef typename base::InPairedIndex InPairedIndex; - typedef typename base::OutPairedIndex OutPairedIndex; - typedef typename base::InHistogram InHistogram; - typedef typename base::OutHistogram OutHistogram; - - public: - DistanceEstimatorMPI(const debruijn_graph::Graph &graph, - const InPairedIndex &index, - const GraphDistanceFinder &distance_finder, - size_t linkage_distance, size_t max_distance, - const DistanceEstimator& base_dist_estimator) - : base(graph, index, distance_finder, linkage_distance, max_distance), dist_estimator_(base_dist_estimator) {} - - virtual ~DistanceEstimatorMPI() = default; - - class DistanceEstimatorTask { - DistanceEstimatorTask() = default; - public: - DistanceEstimatorTask(std::vector &edges, - unsigned int nthreads) : edges_(edges), nthreads_(nthreads) {}; - - DistanceEstimatorTask(std::istream &is) { - io::binary::BinRead(is, edges_, nthreads_); - - } - - std::ostream &serialize(std::ostream &os) const { - io::binary::BinWrite(os, edges_, nthreads_); - return os; - } - - auto make_splitter(size_t, const InPairedIndex &, const DistanceEstimator&, - PairedInfoIndexT & /*result*/) { - return partask::make_seq_along_generator(edges_); - } - - void process(std::istream &is, std::ostream &os, const InPairedIndex &index, - const DistanceEstimator& self, PairedInfoIndexT & /*result*/) { - DEBUG("Processing"); - auto edges_id = partask::get_seq(is); - PairedInfoBuffersT buffer(self.graph(), nthreads_); - # pragma omp parallel for num_threads(nthreads_) schedule(guided, 10) - for (size_t i = 0; i < edges_id.size(); ++i) { - 
debruijn_graph::EdgeId edge = edges_[edges_id[i]]; - self.ProcessEdge(edge, index, buffer[omp_get_thread_num()]); - } - - buffer.BinWrite(os); - buffer.Clear(); - } - - auto merge(const std::vector &piss, - const InPairedIndex&, - const DistanceEstimator& self, - PairedInfoIndexT &result) { - for (auto pis : piss) { - PairedInfoBuffersT buffer(self.graph(), nthreads_); - buffer.BinRead(*pis); - for (size_t j = 0; j < nthreads_; ++j) { - result.Merge(buffer[j]); - buffer[j].clear(); - } - } - } - - private: - std::vector edges_; - unsigned nthreads_; - }; - - void Init() const { - INFO("Using " << this->Name() << " distance estimator"); - } - - virtual void Estimate(OutPairedIndex &result, size_t nthreads) const; - - friend DistanceEstimatorTask; - private: - const DistanceEstimator& dist_estimator_; - - virtual const std::string Name() const { - const std::string my_name = dist_estimator_.Name() + "_MPI"; - return my_name; - } - - DECL_LOGGER("DistanceEstimatorMPI"); -}; } diff --git a/src/common/paired_info/distance_estimation_utils.cpp b/src/common/paired_info/distance_estimation_utils.cpp index 651844501d..7517d15963 100644 --- a/src/common/paired_info/distance_estimation_utils.cpp +++ b/src/common/paired_info/distance_estimation_utils.cpp @@ -10,141 +10,163 @@ #include "assembly_graph/core/graph.hpp" #include "paired_info/pair_info_improver.hpp" -#include "paired_info/smoothing_distance_estimation.hpp" #include "paired_info/weights.hpp" namespace distance_estimation { -using namespace debruijn_graph; -using namespace omnigraph::de; + using namespace debruijn_graph; + using namespace omnigraph::de; -void EstimateWithEstimator(PairedInfoIndexT &clustered_index, - const AbstractDistanceEstimator &estimator, - AbstractPairInfoChecker &checker) { - DEBUG("Estimating distances"); + void EstimateWithEstimator(PairedInfoIndexT &clustered_index, + const AbstractDistanceEstimator &estimator, + AbstractPairInfoChecker &checker) { + DEBUG("Estimating distances"); - estimator.Estimate(clustered_index, omp_get_max_threads()); + estimator.Estimate(clustered_index, omp_get_max_threads()); - INFO("Filtering info"); - PairInfoFilter(checker).Filter(clustered_index); - DEBUG("Info Filtered"); -} + INFO("Filtering info"); + PairInfoFilter(checker).Filter(clustered_index); + DEBUG("Info Filtered"); + } // Postprocessing, checking that clusters do not intersect -void RefinePairedInfo(PairedInfoIndexT& clustered_index, const Graph& graph) { - for (auto iter = pair_begin(clustered_index); iter != pair_end(clustered_index); ++iter) { - EdgeId first_edge = iter.first(); - EdgeId second_edge = iter.second(); - auto infos = iter->Unwrap(); //we need an ordered histogram here - if (infos.empty()) - continue; - - auto prev_it = infos.begin(); - auto it = prev_it; - ++it; - for (auto end_it = infos.end(); it != end_it; ++it) { - if (math::le(std::abs(it->d - prev_it->d), it->var + prev_it->var)) { - WARN("Clusters intersect, edges -- " << graph.int_id(first_edge) - << " " << graph.int_id(second_edge)); - INFO("Trying to handle this case"); - // seeking the symmetric pair info to [i - 1] - bool success = false; - double total_weight = prev_it->weight; - for (auto inner_it = it; inner_it != end_it; ++inner_it) { - total_weight += inner_it->weight; - if (math::eq(inner_it->d + prev_it->d, 0.f)) { - success = true; - DEDistance center = 0.; - DEVariance var = inner_it->d + inner_it->var; - for (auto inner_it_2 = prev_it; inner_it_2 != inner_it; ++inner_it_2) { - TRACE("Removing pair info " << *inner_it_2); - 
clustered_index.Remove(first_edge, second_edge, *inner_it_2); + void RefinePairedInfo(PairedInfoIndexT &clustered_index, const Graph &graph) { + for (auto iter = pair_begin(clustered_index); iter != pair_end(clustered_index); ++iter) { + EdgeId first_edge = iter.first(); + EdgeId second_edge = iter.second(); + auto infos = iter->Unwrap(); //we need an ordered histogram here + if (infos.empty()) + continue; + + auto prev_it = infos.begin(); + auto it = prev_it; + ++it; + for (auto end_it = infos.end(); it != end_it; ++it) { + if (math::le(std::abs(it->d - prev_it->d), it->var + prev_it->var)) { + WARN("Clusters intersect, edges -- " << graph.int_id(first_edge) + << " " << graph.int_id(second_edge)); + INFO("Trying to handle this case"); + // seeking the symmetric pair info to [i - 1] + bool success = false; + double total_weight = prev_it->weight; + for (auto inner_it = it; inner_it != end_it; ++inner_it) { + total_weight += inner_it->weight; + if (math::eq(inner_it->d + prev_it->d, 0.f)) { + success = true; + DEDistance center = 0.; + DEVariance var = inner_it->d + inner_it->var; + for (auto inner_it_2 = prev_it; inner_it_2 != inner_it; ++inner_it_2) { + TRACE("Removing pair info " << *inner_it_2); + clustered_index.Remove(first_edge, second_edge, *inner_it_2); + } + clustered_index.Remove(first_edge, second_edge, *inner_it); + Point new_point(center, total_weight, var); + TRACE("Adding new pair info " << first_edge << " " << second_edge << " " << new_point); + clustered_index.Add(first_edge, second_edge, new_point); + break; } - clustered_index.Remove(first_edge, second_edge, *inner_it); - Point new_point(center, total_weight, var); - TRACE("Adding new pair info " << first_edge << " " << second_edge << " " << new_point); - clustered_index.Add(first_edge, second_edge, new_point); - break; } - } - INFO("Pair information was resolved"); + INFO("Pair information was resolved"); - if (!success) - WARN("This intersection can not be handled in the right way"); + if (!success) + WARN("This intersection can not be handled in the right way"); - break; + break; + } } } } -} -void EstimateScaffoldingDistances(PairedInfoIndexT &scaffolding_index, - const Graph &graph, const io::SequencingLibrary &lib, - const UnclusteredPairedInfoIndexT &paired_index, - const debruijn_graph::config::smoothing_distance_estimator &ade, - const debruijn_graph::config::distance_estimator &de_config) { - INFO("Filling scaffolding index"); - - double is_var = lib.data().insert_size_deviation; - size_t delta = size_t(is_var); - size_t linkage_distance = size_t(de_config.linkage_distance_coeff * is_var); - GraphDistanceFinder dist_finder(graph, - (size_t) math::round(lib.data().mean_insert_size), - lib.data().unmerged_read_length, delta); - size_t max_distance = size_t(de_config.max_distance_coeff_scaff * is_var); - - DEBUG("Retaining insert size distribution for it"); - if (lib.data().insert_size_distribution.size() == 0) { - WARN("The library will not be used for scaffolding"); - return; - } + void EstimateScaffoldingDistancesInner(PairedInfoIndexT &scaffolding_index, + const Graph &graph, const io::SequencingLibrary &lib, + const UnclusteredPairedInfoIndexT &paired_index, + const debruijn_graph::config::smoothing_distance_estimator &ade, + const debruijn_graph::config::distance_estimator &de_config, + const AbstractScaffoldDistanceEstimatorFabric &distance_estimator_fabric) { + INFO("Filling scaffolding index"); + + double is_var = lib.data().insert_size_deviation; + size_t delta = size_t(is_var); + size_t linkage_distance 
= size_t(de_config.linkage_distance_coeff * is_var); + GraphDistanceFinder dist_finder(graph, + (size_t) math::round(lib.data().mean_insert_size), + lib.data().unmerged_read_length, delta); + size_t max_distance = size_t(de_config.max_distance_coeff_scaff * is_var); + + DEBUG("Retaining insert size distribution for it"); + if (lib.data().insert_size_distribution.size() == 0) { + WARN("The library will not be used for scaffolding"); + return; + } - WeightDEWrapper wrapper(lib.data().insert_size_distribution, lib.data().mean_insert_size); - DEBUG("Weight Wrapper Done"); + WeightDEWrapper wrapper(lib.data().insert_size_distribution, lib.data().mean_insert_size); + DEBUG("Weight Wrapper Done"); // PairInfoWeightFilter filter(gp.g, 0.); - PairInfoWeightChecker checker(graph, 0.); - DEBUG("Weight Filter Done"); - - SmoothingDistanceEstimator estimator_base(graph, paired_index, dist_finder, - [&] (int i) {return wrapper.CountWeight(i);}, - linkage_distance, max_distance, - ade.threshold, ade.range_coeff, - ade.delta_coeff, ade.cutoff, - ade.min_peak_points, - ade.percentage, - ade.derivative_threshold); - DistanceEstimatorMPI estimator(graph, paired_index, dist_finder, linkage_distance, max_distance, estimator_base); - - EstimateWithEstimator(scaffolding_index, estimator, checker); -} - -void EstimatePairedDistances(PairedInfoIndexT &clustered_index, - const Graph &graph, - const io::SequencingLibrary &lib, - const UnclusteredPairedInfoIndexT &paired_index, - size_t max_repeat_length, - const debruijn_graph::config::distance_estimator &de_config) { - size_t delta = size_t(lib.data().insert_size_deviation); - size_t linkage_distance = size_t(de_config.linkage_distance_coeff * lib.data().insert_size_deviation); - GraphDistanceFinder dist_finder(graph, (size_t)math::round(lib.data().mean_insert_size), lib.data().unmerged_read_length, delta); - size_t max_distance = size_t(de_config.max_distance_coeff * lib.data().insert_size_deviation); - - PairInfoWeightChecker checker(graph, de_config.clustered_filter_threshold); - - INFO("Weight Filter Done"); - - DistanceEstimator estimator_base(graph, paired_index, dist_finder, linkage_distance, max_distance); - DistanceEstimatorMPI estimator(graph, paired_index, dist_finder, linkage_distance, max_distance, estimator_base); - - EstimateWithEstimator(clustered_index, estimator, checker); + PairInfoWeightChecker checker(graph, 0.); + DEBUG("Weight Filter Done"); + + auto estimator = distance_estimator_fabric.getDistanceEstimator(graph, paired_index, dist_finder, + [&](int i) { + return wrapper.CountWeight(i); + }, + linkage_distance, max_distance, + ade.threshold, ade.range_coeff, + ade.delta_coeff, ade.cutoff, + ade.min_peak_points, + ade.percentage, + ade.derivative_threshold); + EstimateWithEstimator(scaffolding_index, *estimator, checker); + } - INFO("Refining clustered pair information "); // this procedure checks, whether index - RefinePairedInfo(clustered_index, graph); // contains intersecting paired info clusters, - INFO("The refining of clustered pair information has been finished "); // if so, it resolves such conflicts. 
+ void EstimatePairedDistancesInner(PairedInfoIndexT &clustered_index, + const Graph &graph, + const io::SequencingLibrary &lib, + const UnclusteredPairedInfoIndexT &paired_index, + size_t max_repeat_length, + const debruijn_graph::config::distance_estimator &de_config, + const AbstractDistanceEstimatorFabric &distance_estimator_fabric) { + size_t delta = size_t(lib.data().insert_size_deviation); + size_t linkage_distance = size_t(de_config.linkage_distance_coeff * lib.data().insert_size_deviation); + GraphDistanceFinder dist_finder(graph, (size_t) math::round(lib.data().mean_insert_size), + lib.data().unmerged_read_length, delta); + size_t max_distance = size_t(de_config.max_distance_coeff * lib.data().insert_size_deviation); + + PairInfoWeightChecker checker(graph, de_config.clustered_filter_threshold); + + INFO("Weight Filter Done"); + + auto estimator = distance_estimator_fabric.getDistanceEstimator(graph, paired_index, dist_finder, + linkage_distance, max_distance); + EstimateWithEstimator(clustered_index, *estimator, checker); + + INFO("Refining clustered pair information "); // this procedure checks, whether index + RefinePairedInfo(clustered_index, + graph); // contains intersecting paired info clusters, + INFO("The refining of clustered pair information has been finished "); // if so, it resolves such conflicts. + + INFO("Improving paired information"); + PairInfoImprover(graph, clustered_index, lib, max_repeat_length).ImprovePairedInfo( + omp_get_max_threads()); + } - INFO("Improving paired information"); - PairInfoImprover(graph, clustered_index, lib, max_repeat_length).ImprovePairedInfo(omp_get_max_threads()); -} + void EstimateScaffoldingDistances(PairedInfoIndexT &scaffolding_index, + const debruijn_graph::Graph &graph, + const io::SequencingLibrary &lib, + const UnclusteredPairedInfoIndexT &paired_index, + const debruijn_graph::config::smoothing_distance_estimator &ade, + const debruijn_graph::config::distance_estimator &de_config) { + EstimateScaffoldingDistancesInner(scaffolding_index, graph, lib, + paired_index, ade, de_config, ScaffoldDistanceEstimatorFabric()); + } + void EstimatePairedDistances(PairedInfoIndexT &clustered_index, + const debruijn_graph::Graph &graph, + const io::SequencingLibrary &lib, + const UnclusteredPairedInfoIndexT &paired_index, + size_t max_repeat_length, + const debruijn_graph::config::distance_estimator &de_config) { + EstimatePairedDistancesInner(clustered_index, graph, lib, paired_index, + max_repeat_length, de_config, DistanceEstimatorFabric()); + } } diff --git a/src/common/paired_info/distance_estimation_utils.hpp b/src/common/paired_info/distance_estimation_utils.hpp index b142fb055f..a3224d2f1a 100644 --- a/src/common/paired_info/distance_estimation_utils.hpp +++ b/src/common/paired_info/distance_estimation_utils.hpp @@ -10,6 +10,7 @@ #include "distance_estimation.hpp" #include "paired_info.hpp" #include "pair_info_filters.hpp" +#include "smoothing_distance_estimation.hpp" #include "library/library.hpp" #include "library/library_data.hpp" @@ -17,31 +18,106 @@ #include "configs/distance_estimation.hpp" namespace distance_estimation { -using omnigraph::de::AbstractDistanceEstimator; -using omnigraph::de::AbstractPairInfoChecker; -using omnigraph::de::PairedInfoIndexT; -using omnigraph::de::UnclusteredPairedInfoIndexT; - -void EstimateWithEstimator(PairedInfoIndexT &clustered_index, - const AbstractDistanceEstimator &estimator, - AbstractPairInfoChecker &checker); - -void RefinePairedInfo(PairedInfoIndexT& clustered_index, - const 
debruijn_graph::Graph& graph); - -void EstimateScaffoldingDistances(PairedInfoIndexT &scaffolding_index, - const debruijn_graph::Graph &graph, - const io::SequencingLibrary &lib, - const UnclusteredPairedInfoIndexT &paired_index, - const debruijn_graph::config::smoothing_distance_estimator &ade, - const debruijn_graph::config::distance_estimator &de_config = - debruijn_graph::config::distance_estimator()); - -void EstimatePairedDistances(PairedInfoIndexT &clustered_index, - const debruijn_graph::Graph &graph, - const io::SequencingLibrary &lib, - const UnclusteredPairedInfoIndexT &paired_index, - size_t max_repeat_length = std::numeric_limits::max(), - const debruijn_graph::config::distance_estimator &de_config = - debruijn_graph::config::distance_estimator()); + using omnigraph::de::AbstractDistanceEstimator; + using omnigraph::de::AbstractPairInfoChecker; + using omnigraph::de::PairedInfoIndexT; + using omnigraph::de::UnclusteredPairedInfoIndexT; + + class AbstractDistanceEstimatorFabric { + public: + virtual std::unique_ptr getDistanceEstimator(const debruijn_graph::Graph &graph, + const distance_estimation::UnclusteredPairedInfoIndexT &index, + const omnigraph::de::GraphDistanceFinder &distance_finder, + size_t linkage_distance, + size_t max_distance) const = 0; + }; + + class DistanceEstimatorFabric : public AbstractDistanceEstimatorFabric { + public: + std::unique_ptr getDistanceEstimator(const debruijn_graph::Graph &graph, + const distance_estimation::UnclusteredPairedInfoIndexT &index, + const omnigraph::de::GraphDistanceFinder &distance_finder, + size_t linkage_distance, + size_t max_distance) const override { + return std::make_unique(graph, index, distance_finder, linkage_distance, + max_distance); + } + }; + + class AbstractScaffoldDistanceEstimatorFabric { + public: + virtual std::unique_ptr getDistanceEstimator(const debruijn_graph::Graph &graph, + const distance_estimation::UnclusteredPairedInfoIndexT &histogram, + const omnigraph::de::GraphDistanceFinder &dist_finder, + std::function weight_f, + size_t linkage_distance, size_t max_distance, size_t threshold, + double range_coeff, double delta_coeff, + size_t cutoff, + size_t min_peak_points, + double percentage, + double derivative_threshold) const = 0; + }; + + class ScaffoldDistanceEstimatorFabric : public AbstractScaffoldDistanceEstimatorFabric { + public: + std::unique_ptr getDistanceEstimator(const debruijn_graph::Graph &graph, + const distance_estimation::UnclusteredPairedInfoIndexT &histogram, + const omnigraph::de::GraphDistanceFinder &dist_finder, + std::function weight_f, + size_t linkage_distance, size_t max_distance, size_t threshold, + double range_coeff, double delta_coeff, + size_t cutoff, + size_t min_peak_points, + double percentage, + double derivative_threshold) const override { + return std::unique_ptr( + new omnigraph::de::SmoothingDistanceEstimator(graph, histogram, dist_finder, weight_f, + linkage_distance, max_distance, threshold, + range_coeff, delta_coeff, cutoff, min_peak_points, + percentage, derivative_threshold)); + } + }; + + void EstimateWithEstimator(PairedInfoIndexT &clustered_index, + const AbstractDistanceEstimator &estimator, + AbstractPairInfoChecker &checker); + + void RefinePairedInfo(PairedInfoIndexT &clustered_index, + const debruijn_graph::Graph &graph); + + void EstimateScaffoldingDistancesInner(PairedInfoIndexT &scaffolding_index, + const debruijn_graph::Graph &graph, + const io::SequencingLibrary &lib, + const UnclusteredPairedInfoIndexT &paired_index, + const 
debruijn_graph::config::smoothing_distance_estimator &ade, + const debruijn_graph::config::distance_estimator &de_config = + debruijn_graph::config::distance_estimator(), + const AbstractScaffoldDistanceEstimatorFabric& distance_estimator_fabric = + ScaffoldDistanceEstimatorFabric()); + + void EstimatePairedDistancesInner(PairedInfoIndexT &clustered_index, + const debruijn_graph::Graph &graph, + const io::SequencingLibrary &lib, + const UnclusteredPairedInfoIndexT &paired_index, + size_t max_repeat_length = std::numeric_limits::max(), + const debruijn_graph::config::distance_estimator &de_config = + debruijn_graph::config::distance_estimator(), + const AbstractDistanceEstimatorFabric& distance_estimator_fabric = + DistanceEstimatorFabric()); + + void EstimateScaffoldingDistances(PairedInfoIndexT &scaffolding_index, + const debruijn_graph::Graph &graph, + const io::SequencingLibrary &lib, + const UnclusteredPairedInfoIndexT &paired_index, + const debruijn_graph::config::smoothing_distance_estimator &ade, + const debruijn_graph::config::distance_estimator &de_config = + debruijn_graph::config::distance_estimator()); + + void EstimatePairedDistances(PairedInfoIndexT &clustered_index, + const debruijn_graph::Graph &graph, + const io::SequencingLibrary &lib, + const UnclusteredPairedInfoIndexT &paired_index, + size_t max_repeat_length = std::numeric_limits::max(), + const debruijn_graph::config::distance_estimator &de_config = + debruijn_graph::config::distance_estimator()); } diff --git a/src/projects/hpcspades/CMakeLists.txt b/src/projects/hpcspades/CMakeLists.txt new file mode 100644 index 0000000000..dbfe30f0ba --- /dev/null +++ b/src/projects/hpcspades/CMakeLists.txt @@ -0,0 +1,27 @@ +############################################################################ +# Copyright (c) 2023-2024 SPAdes team +# All Rights Reserved +# See file LICENSE for details. +############################################################################ + +project(hpcspades CXX) + +add_subdirectory(common) + +add_library(spades-stages-hpc STATIC distance_estimation_mpi.cpp) + +set(HPCSPADES_SRC pipeline.cpp ../../projects/spades/series_analysis.cpp ../../projects/mts/contig_abundance.cpp) +set(HPCSPADES_LIB spades-stages-hpc spades-stages graphio common_modules paired_info_mpi ${COMMON_LIBRARIES}) + +add_executable(spades-hpc main_mpi.cpp ${HPCSPADES_SRC}) + +target_link_libraries(spades-hpc ${HPCSPADES_LIB} ${MPI_LIBRARIES}) +set_target_properties(spades-hpc PROPERTIES LINK_FLAGS "${MPI_LINK_FLAGS}") + +if (SPADES_STATIC_BUILD) + set_target_properties(spades-hpc PROPERTIES LINK_SEARCH_END_STATIC 1) +endif() + +install(TARGETS spades-hpc + DESTINATION bin + COMPONENT runtime) diff --git a/assembler/src/mpi/CMakeLists.txt b/src/projects/hpcspades/common/CMakeLists.txt similarity index 67% rename from assembler/src/mpi/CMakeLists.txt rename to src/projects/hpcspades/common/CMakeLists.txt index 489b67e2bb..d87e26ab4d 100644 --- a/assembler/src/mpi/CMakeLists.txt +++ b/src/projects/hpcspades/common/CMakeLists.txt @@ -1,7 +1,10 @@ ############################################################################ # Copyright (c) 2021 Saint Petersburg State University +# Copyright (c) 2011-2014 Saint Petersburg Academic University # All Rights Reserved # See file LICENSE for details. 
############################################################################ -add_subdirectory(projects) \ No newline at end of file +project(common_modules_mpi CXX) + +add_subdirectory(paired_info) \ No newline at end of file diff --git a/assembler/src/mpi/projects/CMakeLists.txt b/src/projects/hpcspades/common/paired_info/CMakeLists.txt similarity index 57% rename from assembler/src/mpi/projects/CMakeLists.txt rename to src/projects/hpcspades/common/paired_info/CMakeLists.txt index ab1ab907e0..26174aede6 100644 --- a/assembler/src/mpi/projects/CMakeLists.txt +++ b/src/projects/hpcspades/common/paired_info/CMakeLists.txt @@ -4,4 +4,10 @@ # See file LICENSE for details. ############################################################################ -add_subdirectory(spades) \ No newline at end of file +project(paired_info_mpi CXX) + +add_library(paired_info_mpi STATIC + distance_estimation.cpp + distance_estimation_utils.cpp) + +target_link_libraries(paired_info_mpi modules) diff --git a/src/projects/hpcspades/common/paired_info/distance_estimation.cpp b/src/projects/hpcspades/common/paired_info/distance_estimation.cpp new file mode 100644 index 0000000000..7fdd916dfc --- /dev/null +++ b/src/projects/hpcspades/common/paired_info/distance_estimation.cpp @@ -0,0 +1,33 @@ +//*************************************************************************** +//* Copyright (c) 2021 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include "distance_estimation.hpp" + +namespace omnigraph { + namespace de { + using namespace debruijn_graph; + + void DistanceEstimatorMPI::Estimate(OutPairedIndex &result, size_t nthreads) const { + this->Init(); + const auto &index = this->index(); + + DEBUG("Collecting edge infos"); + std::vector edges; + for (EdgeId e : this->graph().edges()) + edges.push_back(e); + + partask::TaskRegistry treg; + auto dist_estimator_mpi = treg.add(std::cref(index), std::cref(*dist_estimator_), std::ref(result)); + treg.listen(); + + if (partask::master()) { + dist_estimator_mpi(edges, nthreads); + } + treg.stop_listening(); + partask::broadcast(result); + } + } +} diff --git a/src/projects/hpcspades/common/paired_info/distance_estimation.hpp b/src/projects/hpcspades/common/paired_info/distance_estimation.hpp new file mode 100644 index 0000000000..f6c7ba8eef --- /dev/null +++ b/src/projects/hpcspades/common/paired_info/distance_estimation.hpp @@ -0,0 +1,108 @@ +//*************************************************************************** +//* Copyright (c) 2021 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. 
+//*************************************************************************** + +#ifndef MPI_DISTANCE_ESTIMATION_HPP_ +#define MPI_DISTANCE_ESTIMATION_HPP_ + +#include "common/paired_info/distance_estimation.hpp" +#include "pipeline/partask_mpi.hpp" + +namespace omnigraph { + namespace de { + class DistanceEstimatorMPI : public DistanceEstimator { + typedef DistanceEstimator base; + typedef std::vector GraphLengths; + typedef std::vector > EstimHist; + typedef std::pair EdgePair; + + protected: + typedef typename base::InPairedIndex InPairedIndex; + typedef typename base::OutPairedIndex OutPairedIndex; + typedef typename base::InHistogram InHistogram; + typedef typename base::OutHistogram OutHistogram; + + public: + DistanceEstimatorMPI(const debruijn_graph::Graph &graph, + const InPairedIndex &index, + const GraphDistanceFinder &distance_finder, + size_t linkage_distance, size_t max_distance, + std::unique_ptr base_dist_estimator) + : base(graph, index, distance_finder, linkage_distance, max_distance), + dist_estimator_(std::move(base_dist_estimator)) {} + + class DistanceEstimatorTask { + DistanceEstimatorTask() = default; + + public: + DistanceEstimatorTask(std::vector &edges, + unsigned int nthreads) : edges_(edges), nthreads_(nthreads) {}; + + DistanceEstimatorTask(std::istream &is) { + io::binary::BinRead(is, edges_, nthreads_); + + } + + std::ostream &serialize(std::ostream &os) const { + io::binary::BinWrite(os, edges_, nthreads_); + return os; + } + + auto make_splitter(size_t, const InPairedIndex &, const DistanceEstimator &, + PairedInfoIndexT & /*result*/) { + return partask::make_seq_along_generator(edges_); + } + + void process(std::istream &is, std::ostream &os, const InPairedIndex &index, + const DistanceEstimator &self, PairedInfoIndexT & /*result*/) { + DEBUG("Processing"); + auto edges_id = partask::get_seq(is); + PairedInfoBuffersT buffer(self.graph(), nthreads_); +# pragma omp parallel for num_threads(nthreads_) schedule(guided, 10) + for (size_t i = 0; i < edges_id.size(); ++i) { + debruijn_graph::EdgeId edge = edges_[edges_id[i]]; + self.ProcessEdge(edge, index, buffer[omp_get_thread_num()]); + } + + buffer.BinWrite(os); + buffer.Clear(); + } + + auto merge(const std::vector &piss, + const InPairedIndex &, + const DistanceEstimator &self, + PairedInfoIndexT &result) { + for (auto pis: piss) { + PairedInfoBuffersT buffer(self.graph(), nthreads_); + buffer.BinRead(*pis); + for (size_t j = 0; j < nthreads_; ++j) { + result.Merge(buffer[j]); + buffer[j].clear(); + } + } + } + + private: + std::vector edges_; + unsigned nthreads_; + }; + + void Estimate(OutPairedIndex &result, size_t nthreads) const override; + + friend DistanceEstimatorTask; + private: + std::unique_ptr dist_estimator_; + + const std::string Name() const override { + const std::string my_name = dist_estimator_->Name() + "_MPI"; + return my_name; + } + + DECL_LOGGER("DistanceEstimatorMPI"); + }; + } +} + +#endif /* MPI_DISTANCE_ESTIMATION_HPP_ */ diff --git a/src/projects/hpcspades/common/paired_info/distance_estimation_utils.cpp b/src/projects/hpcspades/common/paired_info/distance_estimation_utils.cpp new file mode 100644 index 0000000000..342d863254 --- /dev/null +++ b/src/projects/hpcspades/common/paired_info/distance_estimation_utils.cpp @@ -0,0 +1,30 @@ +//*************************************************************************** +//* Copyright (c) 2021 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. 
+//*************************************************************************** + +#include "distance_estimation_utils.hpp" +#include "distance_estimation.hpp" + +namespace distance_estimation { + void EstimateScaffoldingDistancesMPI(PairedInfoIndexT &scaffolding_index, + const debruijn_graph::Graph &graph, + const io::SequencingLibrary &lib, + const UnclusteredPairedInfoIndexT &paired_index, + const debruijn_graph::config::smoothing_distance_estimator &ade, + const debruijn_graph::config::distance_estimator &de_config) { + EstimateScaffoldingDistancesInner(scaffolding_index, graph, lib, + paired_index, ade, de_config, MPIScaffoldDistanceEstimatorFabric()); + } + + void EstimatePairedDistancesMPI(PairedInfoIndexT &clustered_index, + const debruijn_graph::Graph &graph, + const io::SequencingLibrary &lib, + const UnclusteredPairedInfoIndexT &paired_index, + size_t max_repeat_length, + const debruijn_graph::config::distance_estimator &de_config) { + EstimatePairedDistancesInner(clustered_index, graph, lib, paired_index, + max_repeat_length, de_config, MPIDistanceEstimatorFabric()); + } +} diff --git a/src/projects/hpcspades/common/paired_info/distance_estimation_utils.hpp b/src/projects/hpcspades/common/paired_info/distance_estimation_utils.hpp new file mode 100644 index 0000000000..9bb8d2acbb --- /dev/null +++ b/src/projects/hpcspades/common/paired_info/distance_estimation_utils.hpp @@ -0,0 +1,75 @@ +//*************************************************************************** +//* Copyright (c) 2021 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#pragma once + +#include "common/paired_info/distance_estimation_utils.hpp" +#include "distance_estimation.hpp" + +namespace distance_estimation { + using omnigraph::de::DistanceEstimator; + using omnigraph::de::DistanceEstimatorMPI; + + class MPIDistanceEstimatorFabric : public AbstractDistanceEstimatorFabric { + public: + std::unique_ptr getDistanceEstimator(const debruijn_graph::Graph &graph, + const distance_estimation::UnclusteredPairedInfoIndexT &index, + const omnigraph::de::GraphDistanceFinder &distance_finder, + size_t linkage_distance, + size_t max_distance) const override { + auto estimator_base = std::make_unique(graph, index, distance_finder, + linkage_distance, max_distance); + return std::unique_ptr(new DistanceEstimatorMPI(graph, index, + distance_finder, + linkage_distance, + max_distance, + std::move(estimator_base))); + } + }; + + class MPIScaffoldDistanceEstimatorFabric : public AbstractScaffoldDistanceEstimatorFabric { + public: + std::unique_ptr getDistanceEstimator(const debruijn_graph::Graph &graph, + const distance_estimation::UnclusteredPairedInfoIndexT &histogram, + const omnigraph::de::GraphDistanceFinder &dist_finder, + std::function weight_f, + size_t linkage_distance, + size_t max_distance, size_t threshold, + double range_coeff, double delta_coeff, + size_t cutoff, + size_t min_peak_points, + double percentage, + double derivative_threshold) const override { + auto estimator_base = std::unique_ptr( + new omnigraph::de::SmoothingDistanceEstimator(graph, histogram, dist_finder, weight_f, + linkage_distance, max_distance, threshold, + range_coeff, delta_coeff, cutoff, min_peak_points, + percentage, derivative_threshold)); + + return std::unique_ptr(new DistanceEstimatorMPI(graph, histogram, + dist_finder, + linkage_distance, + max_distance, + std::move(estimator_base))); + } + }; + + void 
EstimateScaffoldingDistancesMPI(PairedInfoIndexT &scaffolding_index, + const debruijn_graph::Graph &graph, + const io::SequencingLibrary &lib, + const UnclusteredPairedInfoIndexT &paired_index, + const debruijn_graph::config::smoothing_distance_estimator &ade, + const debruijn_graph::config::distance_estimator &de_config = + debruijn_graph::config::distance_estimator()); + + void EstimatePairedDistancesMPI(PairedInfoIndexT &clustered_index, + const debruijn_graph::Graph &graph, + const io::SequencingLibrary &lib, + const UnclusteredPairedInfoIndexT &paired_index, + size_t max_repeat_length = std::numeric_limits::max(), + const debruijn_graph::config::distance_estimator &de_config = + debruijn_graph::config::distance_estimator()); +} diff --git a/src/projects/hpcspades/distance_estimation_mpi.cpp b/src/projects/hpcspades/distance_estimation_mpi.cpp new file mode 100644 index 0000000000..c2f2b8bf0a --- /dev/null +++ b/src/projects/hpcspades/distance_estimation_mpi.cpp @@ -0,0 +1,34 @@ +//*************************************************************************** +//* Copyright (c) 2021 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include "distance_estimation_mpi.hpp" + +#include "common/paired_info/distance_estimation_utils.hpp" + +namespace debruijn_graph { + void DistanceEstimationMPI::run(graph_pack::GraphPack &gp, const char* s) { + DistanceEstimationInnerMPI().run(gp, s); + } + + void DistanceEstimationInnerMPI::runEstimatePairedDistances(omnigraph::de::PairedInfoIndexT &clustered_index, + const Graph &graph, + const io::SequencingLibrary &lib, + const omnigraph::de::UnclusteredPairedInfoIndexT &paired_index, + size_t max_repeat_length, + const debruijn_graph::config::distance_estimator &de_config) { + distance_estimation::EstimatePairedDistancesMPI(clustered_index, graph, lib, paired_index, max_repeat_length, de_config); + } + + + void DistanceEstimationInnerMPI::runEstimateScaffoldingDistances( + omnigraph::de::PairedInfoIndexT &scaffolding_index, const Graph &graph, + const io::SequencingLibrary &lib, + const omnigraph::de::UnclusteredPairedInfoIndexT &paired_index, + const debruijn_graph::config::smoothing_distance_estimator &ade, + const debruijn_graph::config::distance_estimator &de_config) { + distance_estimation::EstimateScaffoldingDistancesMPI(scaffolding_index, graph, lib, paired_index, ade, de_config); + } +} diff --git a/src/projects/hpcspades/distance_estimation_mpi.hpp b/src/projects/hpcspades/distance_estimation_mpi.hpp new file mode 100644 index 0000000000..a52c8fc131 --- /dev/null +++ b/src/projects/hpcspades/distance_estimation_mpi.hpp @@ -0,0 +1,42 @@ +//*************************************************************************** +//* Copyright (c) 2015 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. 
+//*************************************************************************** + +#pragma once + +#include "pipeline/mpi_stage.hpp" +#include "assembly_graph/core/graph.hpp" +#include "paired_info/paired_info.hpp" +#include "pipeline/stage.hpp" +#include + +namespace debruijn_graph { + class DistanceEstimationInnerMPI : public DistanceEstimationInner { + protected: + void runEstimatePairedDistances(omnigraph::de::PairedInfoIndexT &clustered_index, + const Graph &graph, + const io::SequencingLibrary &lib, + const omnigraph::de::UnclusteredPairedInfoIndexT &paired_index, + size_t max_repeat_length, + const debruijn_graph::config::distance_estimator &de_config) override; + + void runEstimateScaffoldingDistances(omnigraph::de::PairedInfoIndexT &scaffolding_index, + const Graph &graph, + const io::SequencingLibrary &lib, + const omnigraph::de::UnclusteredPairedInfoIndexT &paired_index, + const debruijn_graph::config::smoothing_distance_estimator &ade, + const debruijn_graph::config::distance_estimator &de_config) override; + }; + + class DistanceEstimationMPI : public spades::MPIAssemblyStage { + public: + DistanceEstimationMPI(bool preliminary = false) + : MPIAssemblyStage(preliminary ? "Preliminary Distance Estimation" : "Distance Estimation", + preliminary ? "distance_estimation_preliminary" : "distance_estimation") {} + + void run(graph_pack::GraphPack &gp, const char *) override; + }; +} diff --git a/src/projects/hpcspades/main_mpi.cpp b/src/projects/hpcspades/main_mpi.cpp new file mode 100644 index 0000000000..c85379b614 --- /dev/null +++ b/src/projects/hpcspades/main_mpi.cpp @@ -0,0 +1,143 @@ +//*************************************************************************** +//* Copyright (c) 2023-2024 SPAdes team +//* Copyright (c) 2015-2022 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. 
+//*************************************************************************** + +#include "configs/config_struct.hpp" +#include "pipeline/partask_mpi.hpp" + +#include "utils/logger/mpi_log_writers.hpp" +#include "utils/memory_limit.hpp" +#include "utils/segfault_handler.hpp" +#include "utils/perf/timetracer.hpp" + +#include "k_range.hpp" +#include "version.hpp" + +namespace spades { +void assemble_genome(bool mpi); +} + +struct TimeTracerRAII { + TimeTracerRAII(llvm::StringRef program_name, + unsigned granularity = 500, + const std::string &prefix = "", const std::string &suffix = "") { + time_trace_file_ = prefix + "spades_time_trace_" + suffix + ".json"; + llvm::timeTraceProfilerInitialize(granularity, program_name); + } + ~TimeTracerRAII() { + if (auto E = llvm::timeTraceProfilerWrite(time_trace_file_, "spades-core")) { + handleAllErrors(std::move(E), + [&](const llvm::StringError &SE) { + ERROR("" << SE.getMessage() << "\n"); + }); + return; + } else { + INFO("Time trace is written to: " << time_trace_file_); + } + llvm::timeTraceProfilerCleanup(); + } + + std::string time_trace_file_; +}; + +void load_config(const std::vector& cfg_fns) { + for (const auto& s : cfg_fns) { + CHECK_FATAL_ERROR(exists(s), "File " << s << " doesn't exist or can't be read!"); + } + + cfg::create_instance(cfg_fns); + + create_directory(cfg::get().output_dir); + create_directory(cfg::get().tmp_dir); + + create_directory(cfg::get().temp_bin_reads_path); +} + +void create_console_logger(const std::filesystem::path& dir, std::filesystem::path log_prop_fn) { + using namespace logging; + + if (!exists(log_prop_fn)) + log_prop_fn = dir / log_prop_fn; + + logger *lg = create_logger(exists(log_prop_fn) ? log_prop_fn : ""); + lg->add_writer(std::make_shared()); + attach_logger(lg); +} + +int main(int argc, char **argv) { + utils::perf_counter pc; + + const size_t GB = 1 << 30; + + srand(42); + srandom(42); + + bool init = partask::init(); + INFO("MPI init: " << (init ? "done" : "failed")); + + try { + using namespace debruijn_graph; + + std::filesystem::path cfg_dir = std::filesystem::path(argv[1]).parent_path(); + + std::vector cfg_fns; + for (int i = 1; i < argc; ++i) { + cfg_fns.push_back(argv[i]); + } + + // read configuration file (dataset path etc.) + load_config(cfg_fns); + + create_console_logger(cfg_dir, cfg::get().log_filename); + for (const auto& cfg_fn : cfg_fns) + INFO("Loaded config from " << cfg_fn); + + VERIFY(cfg::get().K >= runtime_k::MIN_K && cfg::get().K < runtime_k::MAX_K); + VERIFY(cfg::get().K % 2 != 0); + + utils::limit_memory(cfg::get().max_memory * GB); + + // assemble it! + START_BANNER("hpcSPAdes"); + INFO("Maximum k-mer length: " << runtime_k::MAX_K); + INFO("Assembling dataset (" << cfg::get().dataset_file << ") with K=" << cfg::get().K); + INFO("Maximum # of threads to use (adjusted due to OMP capabilities): " << cfg::get().max_threads); + std::unique_ptr traceraii; + if (cfg::get().tt.enable || cfg::get().developer_mode) { + traceraii.reset(new TimeTracerRAII(argv[0], + cfg::get().tt.granularity, + cfg::get().output_dir, std::to_string(cfg::get().K))); + INFO("Time tracing is enabled"); + } + + TIME_TRACE_SCOPE("spades"); + spades::assemble_genome(true); + } catch (std::bad_alloc const &e) { + std::cerr << "Not enough memory to run SPAdes. " << e.what() << std::endl; + MPI_Abort(MPI_COMM_WORLD, EINTR); + return EINTR; + } catch (std::exception const &e) { + std::cerr << "Exception caught " << e.what() << std::endl; + MPI_Abort(MPI_COMM_WORLD, EINTR); + return EINTR; + } catch (...) 
{ + std::cerr << "Unknown exception caught " << std::endl; + MPI_Abort(MPI_COMM_WORLD, EINTR); + return EINTR; + } + + unsigned ms = (unsigned) pc.time_ms(); + unsigned secs = (ms / 1000) % 60; + unsigned mins = (ms / 1000 / 60) % 60; + unsigned hours = (ms / 1000 / 60 / 60); + INFO("Assembling time: " << hours << " hours " << mins << " minutes " << secs << " seconds"); + + // OK + int success = partask::finalize(); + VERIFY(success); + return 0; +} diff --git a/src/projects/hpcspades/pipeline.cpp b/src/projects/hpcspades/pipeline.cpp new file mode 100644 index 0000000000..d111a794e3 --- /dev/null +++ b/src/projects/hpcspades/pipeline.cpp @@ -0,0 +1,315 @@ +//*************************************************************************** +//* Copyright (c) 2023-2024 SPAdes team +//* Copyright (c) 2020-2022 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include "projects/spades/load_graph.hpp" +#include "projects/spades/gap_closer.hpp" +#include "projects/spades/mismatch_correction.hpp" +#include "projects/spades/pair_info_count.hpp" +#include "projects/spades/second_phase_setup.hpp" +#include "projects/spades/repeat_resolving.hpp" +#include "distance_estimation_mpi.hpp" +#include "projects/spades/hybrid_aligning.hpp" +#include "projects/spades/chromosome_removal.hpp" +#include "projects/spades/series_analysis.hpp" +#include "projects/spades/contig_output_stage.hpp" +#include "projects/spades/extract_domains.hpp" +#include "projects/spades/domain_graph_construction.hpp" +#include "projects/spades/restricted_edges_filling.hpp" +#include "projects/spades/wastewater_disentangle.hpp" +#include "library/library.hpp" +#include "pipeline/graph_pack.hpp" +#include "pipeline/stage.hpp" +#include "pipeline/mpi_stage.hpp" +#include "alignment/kmer_mapper.hpp" + +#include "stages/genomic_info_filler.hpp" +#include "stages/read_conversion.hpp" +#include "stages/construction.hpp" +#include "stages/construction_mpi.hpp" +#include "stages/simplification.hpp" +#include "stages/ss_edge_split.hpp" +#include "configs/config_struct.hpp" + +namespace spades { + +static bool MetaCompatibleLibraries() { + const auto& libs = cfg::get().ds.reads; + if (libs.lib_count() > 2) + return false; + + size_t paired_end_libs = 0, long_read_libs = 0; + for (const auto &lib : libs) { + auto type = lib.type(); + paired_end_libs += (type == io::LibraryType::PairedEnd); + long_read_libs += + (type == io::LibraryType::TSLReads || + type == io::LibraryType::PacBioReads || + type == io::LibraryType::NanoporeReads); + } + + return (paired_end_libs == 1 && long_read_libs <= 1); +} + +static bool HybridLibrariesPresent() { + for (const auto &lib : cfg::get().ds.reads) + if (lib.is_hybrid_lib()) + return true; + + return false; +} + +static bool AssemblyGraphPresent() { + for (const auto &lib : cfg::get().ds.reads) + if (lib.is_assembly_graph()) + return true; + + return false; +} + +static std::string GetContigName(std::string contig_id, size_t cov) { + std::string res = std::to_string(cov); + while (res.length() < 4) { + res = "_" + res; + } + return contig_id + res; +} + +static debruijn_graph::ContigOutput::OutputList GetMetaplasmidOutput(size_t cov) { + return {{debruijn_graph::ContigOutput::Kind::PlasmidContigs, + GetContigName(cfg::get().co.contigs_name, cov) }}; +} + +static void AddMetaplasmidStages(StageManager &SPAdes) { + size_t cov = cfg::get().pd->additive_step; + size_t add = 
cfg::get().pd->additive_step; + double multiplier = cfg::get().pd->relative_step; + size_t max_cov = 600; + SPAdes.add(GetMetaplasmidOutput(0)); + while (cov < max_cov) { + SPAdes.add(cov); + SPAdes.add(); + SPAdes.add(GetMetaplasmidOutput(cov)); + cov = std::max(cov + add, size_t((double) cov*multiplier)); + } +} + +static debruijn_graph::ContigOutput::OutputList GetPreliminaryStageOutput() { + using namespace debruijn_graph; + + return { + {ContigOutput::Kind::GFAGraph, "strain_graph"}, + {ContigOutput::Kind::FinalContigs, cfg::get().co.contigs_name} + }; +} + +static debruijn_graph::ContigOutput::OutputList GetNonFinalStageOutput() { + return { { debruijn_graph::ContigOutput::Kind::BinaryContigs, "simplified_contigs"} }; +} + +static debruijn_graph::ContigOutput::OutputList GetBeforeRROutput() { + using namespace debruijn_graph; + + return { + { ContigOutput::Kind::GFAGraph, "assembly_graph_after_simplification"}, + { ContigOutput::Kind::EdgeSequences, "before_rr"} + }; +} + +static debruijn_graph::ContigOutput::OutputList GetFinalStageOutput() { + using namespace debruijn_graph; + + return { + { ContigOutput::Kind::EdgeSequences, "before_rr" }, + { ContigOutput::Kind::GFAGraph, "assembly_graph_with_scaffolds" }, + { ContigOutput::Kind::FASTGGraph, "assembly_graph" }, + { ContigOutput::Kind::FinalContigs, cfg::get().co.contigs_name }, + { ContigOutput::Kind::Scaffolds, cfg::get().co.scaffolds_name } + }; +} + +static void AddPreliminarySimplificationStages(StageManager &SPAdes) { + using namespace debruijn_graph::config; + pipeline_type mode = cfg::get().mode; + + SPAdes.add(true); + if (cfg::get().gap_closer_enable && cfg::get().gc.after_simplify) + SPAdes.add("prelim_gapcloser"); + + if (cfg::get().use_intermediate_contigs) { + SPAdes.add(true); + SPAdes.add(true); + SPAdes.add(true); + + if (cfg::get().hm) + SPAdes.add(); + + SPAdes.add(GetPreliminaryStageOutput()) + .add(); + if (cfg::get().hm) + SPAdes.add(); + } +} + +static void AddSimplificationStages(StageManager &SPAdes) { + VERIFY(!cfg::get().gc.before_raw_simplify || !cfg::get().gc.before_simplify); + bool two_step_rr = cfg::get().two_step_rr && cfg::get().rr_enable; + + if (cfg::get().gap_closer_enable && + cfg::get().gc.before_raw_simplify) + SPAdes.add("early_gapcloser"); + + // Using two_step_rr is hacky here. Fix soon! 
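+    // The same two_step_rr flag is checked again a few lines below: when it is set,
+    // AddPreliminarySimplificationStages() (defined above) schedules a preliminary
+    // simplification round, and optionally intermediate contig output, before the
+    // main simplification pass.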
+ SPAdes.add(two_step_rr); + + if (cfg::get().gap_closer_enable && + cfg::get().gc.before_simplify) + SPAdes.add("early_gapcloser"); + + if (two_step_rr) + AddPreliminarySimplificationStages(SPAdes); + + SPAdes.add(); + + if (cfg::get().gap_closer_enable && cfg::get().gc.after_simplify) + SPAdes.add("late_gapcloser"); + if (cfg::get().sewage) + SPAdes.add(); + + SPAdes.add(); + + if (cfg::get().correct_mismatches) + SPAdes.add(); + + if (cfg::get().ss_coverage_splitter.enabled) + SPAdes.add(); +} + +static void AddConstructionStages(StageManager &SPAdes, bool mpi = false) { + using namespace debruijn_graph::config; + pipeline_type mode = cfg::get().mode; + + if (mpi) + SPAdes.add(); + else + SPAdes.add(); + if (!PipelineHelper::IsMetagenomicPipeline(mode)) + SPAdes.add(); +} + +static void AddRepeatResolutionStages(StageManager &SPAdes) { + using namespace debruijn_graph::config; + + if (!cfg::get().series_analysis.empty()) + SPAdes.add(); + + SPAdes.add() + .add() + .add(); +} + +class FakeStageOnlyforDataSyncDoesNothingElse : public spades::AssemblyStage { +public: + FakeStageOnlyforDataSyncDoesNothingElse() + : AssemblyStage("Fake Stage Only for Data Sync", "fake_stage_sync_data") { } + + void run(graph_pack::GraphPack&, const char *) {} +}; + +void assemble_genome(bool mpi = false) { + using namespace debruijn_graph::config; + pipeline_type mode = cfg::get().mode; + + INFO("SPAdes started"); + + // Perform various sanity checks + if (mode == pipeline_type::meta && !MetaCompatibleLibraries()) { + FATAL_ERROR("Sorry, current version of metaSPAdes can work either with single library (paired-end only) " + "or in hybrid paired-end + (TSLR or PacBio or Nanopore) mode."); + } else if (AssemblyGraphPresent() && + (mode != pipeline_type::metaextrachromosomal && + !cfg::get().hm)) { + // Disallow generic assembly graph inputs for now + FATAL_ERROR("Assembly graph inputs are supported only for plasmid / metaextrachromosomal and / bgc modes!"); + } + + INFO("Starting from stage: " << cfg::get().entry_point); + + std::unique_ptr SPAdes; + SavesPolicy saves_policy(cfg::get().checkpoints, + cfg::get().output_saves, cfg::get().load_from); + if (mpi) { + SPAdes.reset(new MPIStageManager(saves_policy)); + } else { + SPAdes.reset(new StageManager(saves_policy)); + } + + if (SPAdes->saves_policy().EnabledAnyCheckpoint()) + create_directory(cfg::get().output_saves); + + bool two_step_rr = cfg::get().two_step_rr && cfg::get().rr_enable; + INFO("Two-step repeat resolution " << (two_step_rr ? "enabled" : "disabled")); + + graph_pack::GraphPack conj_gp(cfg::get().K, + cfg::get().tmp_dir, + two_step_rr ? cfg::get().ds.reads.lib_count() + 1 + : cfg::get().ds.reads.lib_count(), + cfg::get().ds.reference_genome, + cfg::get().flanking_range, + cfg::get().pos.max_mapping_gap, + cfg::get().pos.max_gap_diff); + if (cfg::get().need_mapping) { + INFO("Will need read mapping, kmer mapper will be attached"); + conj_gp.get_mutable>().Attach(); + } + + // Build the pipeline + SPAdes->add(); + + if (!AssemblyGraphPresent()) { + AddConstructionStages(*SPAdes, mpi); + if (cfg::get().sewage) + SPAdes->add(); + + AddSimplificationStages(*SPAdes); + + SPAdes->add(cfg::get().main_iteration ? + GetBeforeRROutput() : GetNonFinalStageOutput()); + } else { + SPAdes->add(); + } + + if (cfg::get().main_iteration) { + // Not metaextrachromosomal! + if (mode == pipeline_type::plasmid) + SPAdes->add(); + + if (HybridLibrariesPresent()) + SPAdes->add(); + + // No graph modification allowed after HybridLibrariesAligning stage! 
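+    // Consequently, the stages scheduled from here on (repeat resolution, optional
+    // metaplasmid iterations and the final contig/graph output) must not modify the
+    // assembly graph.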
+ + if (cfg::get().rr_enable) + AddRepeatResolutionStages(*SPAdes); + + if (mode == pipeline_type::metaextrachromosomal) + AddMetaplasmidStages(*SPAdes); + else + SPAdes->add(GetFinalStageOutput()); + + if (cfg::get().hm) + SPAdes->add(); + } + + SPAdes->run(conj_gp, cfg::get().entry_point.c_str()); + + // For informing spades.py about estimated params + write_lib_data(cfg::get().output_dir / "final"); + + INFO("SPAdes finished"); +} + +} diff --git a/src/projects/spades/distance_estimation.cpp b/src/projects/spades/distance_estimation.cpp index c8f6f20384..f7f5455640 100644 --- a/src/projects/spades/distance_estimation.cpp +++ b/src/projects/spades/distance_estimation.cpp @@ -31,7 +31,11 @@ namespace debruijn_graph { * * Need this histogram for edges which occur more then one time or for find out how much time we need to repeat the loop. */ -void DistanceEstimation::run(graph_pack::GraphPack &gp, const char*) { +void DistanceEstimation::run(graph_pack::GraphPack &gp, const char* s) { + DistanceEstimationInner().run(gp, s); +} + +void DistanceEstimationInner::run(graph_pack::GraphPack &gp, const char *) { using namespace omnigraph::de; using namespace distance_estimation; @@ -58,10 +62,10 @@ void DistanceEstimation::run(graph_pack::GraphPack &gp, const char*) { if (lib.data().mean_insert_size != 0.0) { INFO("Processing library #" << i); - EstimatePairedDistances(clustered_indices[i], graph, lib, paired_indices[i], + runEstimatePairedDistances(clustered_indices[i], graph, lib, paired_indices[i], max_repeat_length, config.de); if (cfg::get().pe_params.param_set.scaffolder_options.cluster_info) - EstimateScaffoldingDistances(scaffolding_indices[i], graph, lib, paired_indices[i], + runEstimateScaffoldingDistances(scaffolding_indices[i], graph, lib, paired_indices[i], config.ade, config.de); } @@ -72,4 +76,23 @@ void DistanceEstimation::run(graph_pack::GraphPack &gp, const char*) { } } +void DistanceEstimationInner::runEstimatePairedDistances(omnigraph::de::PairedInfoIndexT &clustered_index, + const Graph &graph, + const io::SequencingLibrary &lib, + const omnigraph::de::UnclusteredPairedInfoIndexT &paired_index, + size_t max_repeat_length, + const debruijn_graph::config::distance_estimator &de_config) { + distance_estimation::EstimatePairedDistances(clustered_index, graph, lib, paired_index, max_repeat_length, de_config); +} + + +void DistanceEstimationInner::runEstimateScaffoldingDistances( + omnigraph::de::PairedInfoIndexT &scaffolding_index, const Graph &graph, + const io::SequencingLibrary &lib, + const omnigraph::de::UnclusteredPairedInfoIndexT &paired_index, + const debruijn_graph::config::smoothing_distance_estimator &ade, + const debruijn_graph::config::distance_estimator &de_config) { + distance_estimation::EstimateScaffoldingDistances(scaffolding_index, graph, lib, paired_index, ade, de_config); +} + } diff --git a/src/projects/spades/distance_estimation.hpp b/src/projects/spades/distance_estimation.hpp index 66e887c785..92a85ea5fd 100644 --- a/src/projects/spades/distance_estimation.hpp +++ b/src/projects/spades/distance_estimation.hpp @@ -8,15 +8,35 @@ #pragma once -#include +#include "assembly_graph/core/graph.hpp" +#include "paired_info/paired_info.hpp" #include "pipeline/stage.hpp" namespace debruijn_graph { -class DistanceEstimation : public spades::MPIAssemblyStage { +class DistanceEstimationInner { +protected: + virtual void runEstimatePairedDistances(omnigraph::de::PairedInfoIndexT &clustered_index, + const Graph &graph, + const io::SequencingLibrary &lib, + const 
omnigraph::de::UnclusteredPairedInfoIndexT &paired_index, + size_t max_repeat_length, + const debruijn_graph::config::distance_estimator &de_config); + + virtual void runEstimateScaffoldingDistances(omnigraph::de::PairedInfoIndexT &scaffolding_index, + const Graph &graph, + const io::SequencingLibrary &lib, + const omnigraph::de::UnclusteredPairedInfoIndexT &paired_index, + const debruijn_graph::config::smoothing_distance_estimator &ade, + const debruijn_graph::config::distance_estimator &de_config); +public: + void run(graph_pack::GraphPack &gp, const char *); +}; + +class DistanceEstimation : public spades::AssemblyStage { public: DistanceEstimation(bool preliminary = false) - : MPIAssemblyStage(preliminary ? "Preliminary Distance Estimation" : "Distance Estimation", + : AssemblyStage(preliminary ? "Preliminary Distance Estimation" : "Distance Estimation", preliminary ? "distance_estimation_preliminary" : "distance_estimation") {} void run(graph_pack::GraphPack &gp, const char*) override; diff --git a/src/projects/spades/distance_estimation_mpi.cpp b/src/projects/spades/distance_estimation_mpi.cpp new file mode 100644 index 0000000000..80e7c95d41 --- /dev/null +++ b/src/projects/spades/distance_estimation_mpi.cpp @@ -0,0 +1,34 @@ +//*************************************************************************** +//* Copyright (c) 2021 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include "distance_estimation_mpi.hpp" + +#include + +namespace debruijn_graph { + void DistanceEstimationMPI::run(GraphPack &gp, const char* s) { + DistanceEstimationInnerMPI().run(gp, s); + } + + void DistanceEstimationInnerMPI::runEstimatePairedDistances(omnigraph::de::PairedInfoIndexT &clustered_index, + const Graph &graph, + const io::SequencingLibrary &lib, + const omnigraph::de::UnclusteredPairedInfoIndexT &paired_index, + size_t max_repeat_length, + const debruijn_graph::config::distance_estimator &de_config) { + distance_estimation::EstimatePairedDistancesMPI(clustered_index, graph, lib, paired_index, max_repeat_length, de_config); + } + + + void DistanceEstimationInnerMPI::runEstimateScaffoldingDistances( + omnigraph::de::PairedInfoIndexT &scaffolding_index, const Graph &graph, + const io::SequencingLibrary &lib, + const omnigraph::de::UnclusteredPairedInfoIndexT &paired_index, + const debruijn_graph::config::smoothing_distance_estimator &ade, + const debruijn_graph::config::distance_estimator &de_config) { + distance_estimation::EstimateScaffoldingDistancesMPI(scaffolding_index, graph, lib, paired_index, ade, de_config); + } +} diff --git a/src/projects/spades/distance_estimation_mpi.hpp b/src/projects/spades/distance_estimation_mpi.hpp new file mode 100644 index 0000000000..8febaf91e3 --- /dev/null +++ b/src/projects/spades/distance_estimation_mpi.hpp @@ -0,0 +1,43 @@ +//*************************************************************************** +//* Copyright (c) 2015 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. 
+//*************************************************************************** + +#pragma once + +#include +#include +#include +#include "pipeline/stage.hpp" +#include + +namespace debruijn_graph { + class DistanceEstimationInnerMPI : public DistanceEstimationInner { + protected: + void runEstimatePairedDistances(omnigraph::de::PairedInfoIndexT &clustered_index, + const Graph &graph, + const io::SequencingLibrary &lib, + const omnigraph::de::UnclusteredPairedInfoIndexT &paired_index, + size_t max_repeat_length, + const debruijn_graph::config::distance_estimator &de_config) override; + + void runEstimateScaffoldingDistances(omnigraph::de::PairedInfoIndexT &scaffolding_index, + const Graph &graph, + const io::SequencingLibrary &lib, + const omnigraph::de::UnclusteredPairedInfoIndexT &paired_index, + const debruijn_graph::config::smoothing_distance_estimator &ade, + const debruijn_graph::config::distance_estimator &de_config) override; + }; + + class DistanceEstimationMPI : public spades::MPIAssemblyStage { + public: + DistanceEstimationMPI(bool preliminary = false) + : MPIAssemblyStage(preliminary ? "Preliminary Distance Estimation" : "Distance Estimation", + preliminary ? "distance_estimation_preliminary" : "distance_estimation") {} + + void run(GraphPack &gp, const char *) override; + }; +} + diff --git a/src/projects/spades/pipeline.cpp b/src/projects/spades/pipeline.cpp index e208f39503..43e912f2fc 100644 --- a/src/projects/spades/pipeline.cpp +++ b/src/projects/spades/pipeline.cpp @@ -140,9 +140,9 @@ static void AddPreliminarySimplificationStages(StageManager &SPAdes) { SPAdes.add("prelim_gapcloser"); if (cfg::get().use_intermediate_contigs) { - SPAdes.add(true) - .add(true) - .add(true); + SPAdes.add(true); + SPAdes.add(true); + SPAdes.add(true); if (cfg::get().hm) SPAdes.add(); @@ -178,6 +178,7 @@ static void AddSimplificationStages(StageManager &SPAdes) { SPAdes.add("late_gapcloser"); if (cfg::get().sewage) SPAdes.add(); + SPAdes.add(); if (cfg::get().correct_mismatches) From 6849808c9700c864e38ceebff94859e97f9646d8 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Thu, 14 Oct 2021 12:24:46 +0300 Subject: [PATCH 056/102] separate construction_mpi --- src/common/stages/CMakeLists.txt | 2 +- src/projects/hpcspades/CMakeLists.txt | 2 +- src/projects/hpcspades/common/CMakeLists.txt | 3 ++- .../construction/debruijn_graph_constructor_mpi.hpp | 2 +- src/projects/hpcspades/common/stages/CMakeLists.txt | 13 +++++++++++++ .../hpcspades}/common/stages/construction_mpi.cpp | 2 +- .../hpcspades}/common/stages/construction_mpi.hpp | 0 src/projects/hpcspades/pipeline.cpp | 12 ++++-------- src/projects/spades/pipeline.cpp | 10 +++------- 9 files changed, 26 insertions(+), 20 deletions(-) rename src/{ => projects/hpcspades}/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp (97%) create mode 100644 src/projects/hpcspades/common/stages/CMakeLists.txt rename src/{ => projects/hpcspades}/common/stages/construction_mpi.cpp (99%) rename src/{ => projects/hpcspades}/common/stages/construction_mpi.hpp (100%) diff --git a/src/common/stages/CMakeLists.txt b/src/common/stages/CMakeLists.txt index c6bf4cd057..69a7483a59 100644 --- a/src/common/stages/CMakeLists.txt +++ b/src/common/stages/CMakeLists.txt @@ -10,7 +10,7 @@ project(stages CXX) set(stages_src read_conversion.cpp construction.cpp simplification.cpp ss_edge_split.cpp genomic_info_filler.cpp) if (MPI_FOUND) - set(stages_src ${stages_src} test_mpi.cpp construction_mpi.cpp) + set(stages_src ${stages_src} test_mpi.cpp) endif() 
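# For reference: construction_mpi.cpp, dropped from this list, moves to
# src/projects/hpcspades/common/stages/, where it is built as a separate static library,
#   add_library(stages-mpi STATIC construction_mpi.cpp)
# which spades-hpc links via the stages-mpi entry added to HPCSPADES_LIB later in this patch.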
add_library(stages STATIC ${stages_src}) diff --git a/src/projects/hpcspades/CMakeLists.txt b/src/projects/hpcspades/CMakeLists.txt index dbfe30f0ba..fe30654d2c 100644 --- a/src/projects/hpcspades/CMakeLists.txt +++ b/src/projects/hpcspades/CMakeLists.txt @@ -11,7 +11,7 @@ add_subdirectory(common) add_library(spades-stages-hpc STATIC distance_estimation_mpi.cpp) set(HPCSPADES_SRC pipeline.cpp ../../projects/spades/series_analysis.cpp ../../projects/mts/contig_abundance.cpp) -set(HPCSPADES_LIB spades-stages-hpc spades-stages graphio common_modules paired_info_mpi ${COMMON_LIBRARIES}) +set(HPCSPADES_LIB spades-stages-hpc spades-stages graphio common_modules paired_info_mpi stages-mpi ${COMMON_LIBRARIES}) add_executable(spades-hpc main_mpi.cpp ${HPCSPADES_SRC}) diff --git a/src/projects/hpcspades/common/CMakeLists.txt b/src/projects/hpcspades/common/CMakeLists.txt index d87e26ab4d..32ffad2690 100644 --- a/src/projects/hpcspades/common/CMakeLists.txt +++ b/src/projects/hpcspades/common/CMakeLists.txt @@ -7,4 +7,5 @@ project(common_modules_mpi CXX) -add_subdirectory(paired_info) \ No newline at end of file +add_subdirectory(paired_info) +add_subdirectory(stages) diff --git a/src/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp b/src/projects/hpcspades/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp similarity index 97% rename from src/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp rename to src/projects/hpcspades/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp index 6fa6372670..323054ca3b 100644 --- a/src/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp +++ b/src/projects/hpcspades/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp @@ -7,7 +7,7 @@ #include "pipeline/partask_mpi.hpp" #include "io/binary/graph.hpp" -#include "debruijn_graph_constructor.hpp" +#include "common/assembly_graph/construction/debruijn_graph_constructor.hpp" namespace debruijn_graph { template diff --git a/src/projects/hpcspades/common/stages/CMakeLists.txt b/src/projects/hpcspades/common/stages/CMakeLists.txt new file mode 100644 index 0000000000..810225c653 --- /dev/null +++ b/src/projects/hpcspades/common/stages/CMakeLists.txt @@ -0,0 +1,13 @@ +############################################################################ +# Copyright (c) 2015 Saint Petersburg State University +# Copyright (c) 2011-2014 Saint Petersburg Academic University +# All Rights Reserved +# See file LICENSE for details. 
+############################################################################ + +project(stages-mpi CXX) + +set(STAGES_MPI_SRC construction_mpi.cpp) + +add_library(stages-mpi STATIC + ${STAGES_MPI_SRC}) diff --git a/src/common/stages/construction_mpi.cpp b/src/projects/hpcspades/common/stages/construction_mpi.cpp similarity index 99% rename from src/common/stages/construction_mpi.cpp rename to src/projects/hpcspades/common/stages/construction_mpi.cpp index de2c3dadd0..e1cba80008 100644 --- a/src/common/stages/construction_mpi.cpp +++ b/src/projects/hpcspades/common/stages/construction_mpi.cpp @@ -10,7 +10,7 @@ #include "alignment/edge_index.hpp" #include "assembly_graph/construction/early_simplification.hpp" -#include "assembly_graph/construction/debruijn_graph_constructor_mpi.hpp" +#include "../assembly_graph/construction/debruijn_graph_constructor_mpi.hpp" #include "io/dataset_support/dataset_readers.hpp" #include "io/dataset_support/read_converter.hpp" #include "io/reads/coverage_filtering_read_wrapper.hpp" diff --git a/src/common/stages/construction_mpi.hpp b/src/projects/hpcspades/common/stages/construction_mpi.hpp similarity index 100% rename from src/common/stages/construction_mpi.hpp rename to src/projects/hpcspades/common/stages/construction_mpi.hpp diff --git a/src/projects/hpcspades/pipeline.cpp b/src/projects/hpcspades/pipeline.cpp index d111a794e3..a49031bf37 100644 --- a/src/projects/hpcspades/pipeline.cpp +++ b/src/projects/hpcspades/pipeline.cpp @@ -28,8 +28,7 @@ #include "stages/genomic_info_filler.hpp" #include "stages/read_conversion.hpp" -#include "stages/construction.hpp" -#include "stages/construction_mpi.hpp" +#include "common/stages/construction_mpi.hpp" #include "stages/simplification.hpp" #include "stages/ss_edge_split.hpp" #include "configs/config_struct.hpp" @@ -188,14 +187,11 @@ static void AddSimplificationStages(StageManager &SPAdes) { SPAdes.add(); } -static void AddConstructionStages(StageManager &SPAdes, bool mpi = false) { +static void AddConstructionStages(StageManager &SPAdes) { using namespace debruijn_graph::config; pipeline_type mode = cfg::get().mode; - if (mpi) - SPAdes.add(); - else - SPAdes.add(); + SPAdes.add(); if (!PipelineHelper::IsMetagenomicPipeline(mode)) SPAdes.add(); } @@ -270,7 +266,7 @@ void assemble_genome(bool mpi = false) { SPAdes->add(); if (!AssemblyGraphPresent()) { - AddConstructionStages(*SPAdes, mpi); + AddConstructionStages(*SPAdes); if (cfg::get().sewage) SPAdes->add(); diff --git a/src/projects/spades/pipeline.cpp b/src/projects/spades/pipeline.cpp index 43e912f2fc..65171fe05a 100644 --- a/src/projects/spades/pipeline.cpp +++ b/src/projects/spades/pipeline.cpp @@ -29,7 +29,6 @@ #include "stages/genomic_info_filler.hpp" #include "stages/read_conversion.hpp" #include "stages/construction.hpp" -#include "stages/construction_mpi.hpp" #include "stages/simplification.hpp" #include "stages/ss_edge_split.hpp" #include "configs/config_struct.hpp" @@ -188,14 +187,11 @@ static void AddSimplificationStages(StageManager &SPAdes) { SPAdes.add(); } -static void AddConstructionStages(StageManager &SPAdes, bool mpi = false) { +static void AddConstructionStages(StageManager &SPAdes) { using namespace debruijn_graph::config; pipeline_type mode = cfg::get().mode; - if (mpi) - SPAdes.add(); - else - SPAdes.add(); + SPAdes.add(); if (!PipelineHelper::IsMetagenomicPipeline(mode)) SPAdes.add(); } @@ -270,7 +266,7 @@ void assemble_genome(bool mpi = false) { SPAdes->add(); if (!AssemblyGraphPresent()) { - AddConstructionStages(*SPAdes, mpi); + 
AddConstructionStages(*SPAdes); if (cfg::get().sewage) SPAdes->add(); From 35918083ec5b3a24864434ed2028c99b700eaedd Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Thu, 14 Oct 2021 12:30:00 +0300 Subject: [PATCH 057/102] separate test_mpi --- src/common/stages/CMakeLists.txt | 5 +- .../hpcspades/common/stages/CMakeLists.txt | 2 +- .../hpcspades/common/stages/test_mpi.cpp | 106 ++++++++++++++++++ 3 files changed, 108 insertions(+), 5 deletions(-) create mode 100644 src/projects/hpcspades/common/stages/test_mpi.cpp diff --git a/src/common/stages/CMakeLists.txt b/src/common/stages/CMakeLists.txt index 69a7483a59..89e3f33bc0 100644 --- a/src/common/stages/CMakeLists.txt +++ b/src/common/stages/CMakeLists.txt @@ -9,9 +9,6 @@ project(stages CXX) set(stages_src read_conversion.cpp construction.cpp simplification.cpp ss_edge_split.cpp genomic_info_filler.cpp) -if (MPI_FOUND) - set(stages_src ${stages_src} test_mpi.cpp) -endif() - add_library(stages STATIC ${stages_src}) + target_link_libraries(stages coverage_model pipeline gqf input) diff --git a/src/projects/hpcspades/common/stages/CMakeLists.txt b/src/projects/hpcspades/common/stages/CMakeLists.txt index 810225c653..6d91cad5ef 100644 --- a/src/projects/hpcspades/common/stages/CMakeLists.txt +++ b/src/projects/hpcspades/common/stages/CMakeLists.txt @@ -7,7 +7,7 @@ project(stages-mpi CXX) -set(STAGES_MPI_SRC construction_mpi.cpp) +set(STAGES_MPI_SRC construction_mpi.cpp test_mpi.cpp) add_library(stages-mpi STATIC ${STAGES_MPI_SRC}) diff --git a/src/projects/hpcspades/common/stages/test_mpi.cpp b/src/projects/hpcspades/common/stages/test_mpi.cpp new file mode 100644 index 0000000000..120b20a98f --- /dev/null +++ b/src/projects/hpcspades/common/stages/test_mpi.cpp @@ -0,0 +1,106 @@ +//*************************************************************************** +//* Copyright (c) 2018 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include "pipeline/partask_mpi.hpp" +#include "pipeline/mpi_stage.hpp" +#include "pipeline/stage.hpp" + +#include +#include +#include +#include +#include + +namespace debruijn_graph { + +class ArraySum { +public: + ArraySum(const std::string &message = "") : message_{message} {}; + ArraySum(const ArraySum&) = delete; + ArraySum(ArraySum&&) = default; + + std::string message_; + ArraySum(std::istream &is) { std::getline(is, message_); } + + std::ostream &serialize(std::ostream &os) const { return os << message_; } + + template + auto make_splitter(size_t n, const Data &data) { + size_t N = data.size(); + auto splitter = [N, n, i = size_t(0)](std::ostream &os, size_t /*node*/) mutable -> bool { + if (i == n) return false; + size_t begin = i * N / n; + size_t end = (i + 1) * N / n; + ++i; + os << begin << " " << end << " "; + return true; + }; + + return splitter; + }; + + template + void process(std::istream &is, std::ostream &os, const Data &data) { + INFO("MESSAGE: " << message_); + long long int sum = 0; +#pragma omp parallel reduction(+ : sum) + while (true) { + size_t begin, end; + bool exit = false; +#pragma omp critical + { + if (is.peek() == EOF || !(is >> begin >> end)) { + exit = true; + } else { + DEBUG("Extracted range: " << begin << " " << end); + } + } + if (exit) break; + for (size_t i = begin; i < end; ++i) { + sum += data[i]; + } + } + INFO("Computed sum: " << sum); + os << sum; + } + + auto merge(const std::vector &piss, ...) 
{ + long long int sum = 0; + for (auto &pis : piss) { + long long int local_sum; + *pis >> local_sum; + sum += local_sum; + } + + return sum; + }; +}; + +class TestMPI : public spades::MPIAssemblyStage { +public: + TestMPI() : MPIAssemblyStage("Test MPI", "test_mpi") {} + + void run(graph_pack::GraphPack& /*gp*/, const char *) override { + INFO("TestMPI started"); + partask::TaskRegistry treg; + + const size_t N = 100000; + std::array data; + std::iota(data.begin(), data.end(), 1); + + auto job = treg.add(std::cref(data)); + treg.listen(); + + if (treg.master()) { + auto res = job("Message1"); + INFO("JOB RESULT: " << res); + } + + treg.stop_listening(); + } +}; + +} // namespace debruijn_graph From 37653931e5cd54630f3215646993d6d85f374702 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Fri, 19 Nov 2021 15:01:04 +0300 Subject: [PATCH 058/102] update distEst Arhitecture --- .../hpcspades/distance_estimation_mpi.cpp | 25 ++--------- .../hpcspades/distance_estimation_mpi.hpp | 27 ++---------- src/projects/spades/distance_estimation.cpp | 41 +++++++----------- src/projects/spades/distance_estimation.hpp | 35 +++++++-------- .../spades/distance_estimation_mpi.cpp | 34 --------------- .../spades/distance_estimation_mpi.hpp | 43 ------------------- 6 files changed, 41 insertions(+), 164 deletions(-) delete mode 100644 src/projects/spades/distance_estimation_mpi.cpp delete mode 100644 src/projects/spades/distance_estimation_mpi.hpp diff --git a/src/projects/hpcspades/distance_estimation_mpi.cpp b/src/projects/hpcspades/distance_estimation_mpi.cpp index c2f2b8bf0a..fa2cec59ae 100644 --- a/src/projects/hpcspades/distance_estimation_mpi.cpp +++ b/src/projects/hpcspades/distance_estimation_mpi.cpp @@ -9,26 +9,7 @@ #include "common/paired_info/distance_estimation_utils.hpp" namespace debruijn_graph { - void DistanceEstimationMPI::run(graph_pack::GraphPack &gp, const char* s) { - DistanceEstimationInnerMPI().run(gp, s); - } - - void DistanceEstimationInnerMPI::runEstimatePairedDistances(omnigraph::de::PairedInfoIndexT &clustered_index, - const Graph &graph, - const io::SequencingLibrary &lib, - const omnigraph::de::UnclusteredPairedInfoIndexT &paired_index, - size_t max_repeat_length, - const debruijn_graph::config::distance_estimator &de_config) { - distance_estimation::EstimatePairedDistancesMPI(clustered_index, graph, lib, paired_index, max_repeat_length, de_config); - } - - - void DistanceEstimationInnerMPI::runEstimateScaffoldingDistances( - omnigraph::de::PairedInfoIndexT &scaffolding_index, const Graph &graph, - const io::SequencingLibrary &lib, - const omnigraph::de::UnclusteredPairedInfoIndexT &paired_index, - const debruijn_graph::config::smoothing_distance_estimator &ade, - const debruijn_graph::config::distance_estimator &de_config) { - distance_estimation::EstimateScaffoldingDistancesMPI(scaffolding_index, graph, lib, paired_index, ade, de_config); - } +void DistanceEstimationMPI::run(graph_pack::GraphPack &gp, const char* s) { + DistanceEstimationBase::run(gp, s, distance_estimation::EstimatePairedDistancesMPI, distance_estimation::EstimateScaffoldingDistancesMPI); +} } diff --git a/src/projects/hpcspades/distance_estimation_mpi.hpp b/src/projects/hpcspades/distance_estimation_mpi.hpp index a52c8fc131..562cf4185e 100644 --- a/src/projects/hpcspades/distance_estimation_mpi.hpp +++ b/src/projects/hpcspades/distance_estimation_mpi.hpp @@ -7,31 +7,11 @@ #pragma once -#include "pipeline/mpi_stage.hpp" -#include "assembly_graph/core/graph.hpp" -#include "paired_info/paired_info.hpp" 
-#include "pipeline/stage.hpp" -#include +#include "common/pipeline/mpi_stage.hpp" +#include "projects/spades/distance_estimation.hpp" namespace debruijn_graph { - class DistanceEstimationInnerMPI : public DistanceEstimationInner { - protected: - void runEstimatePairedDistances(omnigraph::de::PairedInfoIndexT &clustered_index, - const Graph &graph, - const io::SequencingLibrary &lib, - const omnigraph::de::UnclusteredPairedInfoIndexT &paired_index, - size_t max_repeat_length, - const debruijn_graph::config::distance_estimator &de_config) override; - - void runEstimateScaffoldingDistances(omnigraph::de::PairedInfoIndexT &scaffolding_index, - const Graph &graph, - const io::SequencingLibrary &lib, - const omnigraph::de::UnclusteredPairedInfoIndexT &paired_index, - const debruijn_graph::config::smoothing_distance_estimator &ade, - const debruijn_graph::config::distance_estimator &de_config) override; - }; - - class DistanceEstimationMPI : public spades::MPIAssemblyStage { + class DistanceEstimationMPI : public DistanceEstimationBase, public spades::MPIAssemblyStage { public: DistanceEstimationMPI(bool preliminary = false) : MPIAssemblyStage(preliminary ? "Preliminary Distance Estimation" : "Distance Estimation", @@ -40,3 +20,4 @@ namespace debruijn_graph { void run(graph_pack::GraphPack &gp, const char *) override; }; } + diff --git a/src/projects/spades/distance_estimation.cpp b/src/projects/spades/distance_estimation.cpp index f7f5455640..5c5860c662 100644 --- a/src/projects/spades/distance_estimation.cpp +++ b/src/projects/spades/distance_estimation.cpp @@ -20,7 +20,6 @@ #include namespace debruijn_graph { - /* * Input: raw_paired_indices -- the map from pairs of edges to histogram of estimated distance between them. * Output: clustered_indices -- the map from pairs of edges to histogram of distance, but now clustering @@ -32,10 +31,22 @@ namespace debruijn_graph { * Need this histogram for edges which occur more then one time or for find out how much time we need to repeat the loop. 
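 * The clustering itself is shared in DistanceEstimationBase::run below; the plain and MPI
 * stages differ only in which estimator callbacks they pass in: EstimatePairedDistances /
 * EstimateScaffoldingDistances here vs. their MPI counterparts in DistanceEstimationMPI.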
*/ void DistanceEstimation::run(graph_pack::GraphPack &gp, const char* s) { - DistanceEstimationInner().run(gp, s); + DistanceEstimationBase::run(gp, s, distance_estimation::EstimatePairedDistances, distance_estimation::EstimateScaffoldingDistances); } -void DistanceEstimationInner::run(graph_pack::GraphPack &gp, const char *) { +void DistanceEstimationBase::run(graph_pack::GraphPack &gp, const char *, + const std::function &clustered_index, + const Graph &graph, + const SequencingLibrary &lib, + const UnclusteredPairedInfoIndexT &paired_index, + size_t max_repeat_length, + const distance_estimator &de_config)>& runEstimatePairedDistances, + const std::function &scaffolding_index, + const Graph &graph, + const SequencingLibrary &lib, + const UnclusteredPairedInfoIndexT &paired_index, + const smoothing_distance_estimator &ade, + const distance_estimator &de_config)>& runEstimateScaffoldingDistances) { using namespace omnigraph::de; using namespace distance_estimation; @@ -63,10 +74,10 @@ void DistanceEstimationInner::run(graph_pack::GraphPack &gp, const char *) { if (lib.data().mean_insert_size != 0.0) { INFO("Processing library #" << i); runEstimatePairedDistances(clustered_indices[i], graph, lib, paired_indices[i], - max_repeat_length, config.de); + max_repeat_length, config.de); if (cfg::get().pe_params.param_set.scaffolder_options.cluster_info) runEstimateScaffoldingDistances(scaffolding_indices[i], graph, lib, paired_indices[i], - config.ade, config.de); + config.ade, config.de); } if (!cfg::get().preserve_raw_paired_index) { @@ -75,24 +86,4 @@ void DistanceEstimationInner::run(graph_pack::GraphPack &gp, const char *) { } } } - -void DistanceEstimationInner::runEstimatePairedDistances(omnigraph::de::PairedInfoIndexT &clustered_index, - const Graph &graph, - const io::SequencingLibrary &lib, - const omnigraph::de::UnclusteredPairedInfoIndexT &paired_index, - size_t max_repeat_length, - const debruijn_graph::config::distance_estimator &de_config) { - distance_estimation::EstimatePairedDistances(clustered_index, graph, lib, paired_index, max_repeat_length, de_config); -} - - -void DistanceEstimationInner::runEstimateScaffoldingDistances( - omnigraph::de::PairedInfoIndexT &scaffolding_index, const Graph &graph, - const io::SequencingLibrary &lib, - const omnigraph::de::UnclusteredPairedInfoIndexT &paired_index, - const debruijn_graph::config::smoothing_distance_estimator &ade, - const debruijn_graph::config::distance_estimator &de_config) { - distance_estimation::EstimateScaffoldingDistances(scaffolding_index, graph, lib, paired_index, ade, de_config); -} - } diff --git a/src/projects/spades/distance_estimation.hpp b/src/projects/spades/distance_estimation.hpp index 92a85ea5fd..f78c472ddd 100644 --- a/src/projects/spades/distance_estimation.hpp +++ b/src/projects/spades/distance_estimation.hpp @@ -13,27 +13,28 @@ #include "pipeline/stage.hpp" namespace debruijn_graph { + using namespace omnigraph::de; + using namespace io; + using namespace debruijn_graph::config; -class DistanceEstimationInner { -protected: - virtual void runEstimatePairedDistances(omnigraph::de::PairedInfoIndexT &clustered_index, - const Graph &graph, - const io::SequencingLibrary &lib, - const omnigraph::de::UnclusteredPairedInfoIndexT &paired_index, - size_t max_repeat_length, - const debruijn_graph::config::distance_estimator &de_config); - - virtual void runEstimateScaffoldingDistances(omnigraph::de::PairedInfoIndexT &scaffolding_index, - const Graph &graph, - const io::SequencingLibrary &lib, - const 
omnigraph::de::UnclusteredPairedInfoIndexT &paired_index, - const debruijn_graph::config::smoothing_distance_estimator &ade, - const debruijn_graph::config::distance_estimator &de_config); +class DistanceEstimationBase { public: - void run(graph_pack::GraphPack &gp, const char *); + void run(graph_pack::GraphPack &gp, const char *, + const std::function &clustered_index, + const Graph &graph, + const SequencingLibrary &lib, + const UnclusteredPairedInfoIndexT &paired_index, + size_t max_repeat_length, + const distance_estimator &de_config)> &runEstimatePairedDistances, + const std::function &scaffolding_index, + const Graph &graph, + const SequencingLibrary &lib, + const UnclusteredPairedInfoIndexT &paired_index, + const smoothing_distance_estimator &ade, + const distance_estimator &de_config)> &runEstimateScaffoldingDistances); }; -class DistanceEstimation : public spades::AssemblyStage { +class DistanceEstimation : public DistanceEstimationBase, public spades::AssemblyStage { public: DistanceEstimation(bool preliminary = false) : AssemblyStage(preliminary ? "Preliminary Distance Estimation" : "Distance Estimation", diff --git a/src/projects/spades/distance_estimation_mpi.cpp b/src/projects/spades/distance_estimation_mpi.cpp deleted file mode 100644 index 80e7c95d41..0000000000 --- a/src/projects/spades/distance_estimation_mpi.cpp +++ /dev/null @@ -1,34 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2021 Saint Petersburg State University -//* All Rights Reserved -//* See file LICENSE for details. -//*************************************************************************** - -#include "distance_estimation_mpi.hpp" - -#include - -namespace debruijn_graph { - void DistanceEstimationMPI::run(GraphPack &gp, const char* s) { - DistanceEstimationInnerMPI().run(gp, s); - } - - void DistanceEstimationInnerMPI::runEstimatePairedDistances(omnigraph::de::PairedInfoIndexT &clustered_index, - const Graph &graph, - const io::SequencingLibrary &lib, - const omnigraph::de::UnclusteredPairedInfoIndexT &paired_index, - size_t max_repeat_length, - const debruijn_graph::config::distance_estimator &de_config) { - distance_estimation::EstimatePairedDistancesMPI(clustered_index, graph, lib, paired_index, max_repeat_length, de_config); - } - - - void DistanceEstimationInnerMPI::runEstimateScaffoldingDistances( - omnigraph::de::PairedInfoIndexT &scaffolding_index, const Graph &graph, - const io::SequencingLibrary &lib, - const omnigraph::de::UnclusteredPairedInfoIndexT &paired_index, - const debruijn_graph::config::smoothing_distance_estimator &ade, - const debruijn_graph::config::distance_estimator &de_config) { - distance_estimation::EstimateScaffoldingDistancesMPI(scaffolding_index, graph, lib, paired_index, ade, de_config); - } -} diff --git a/src/projects/spades/distance_estimation_mpi.hpp b/src/projects/spades/distance_estimation_mpi.hpp deleted file mode 100644 index 8febaf91e3..0000000000 --- a/src/projects/spades/distance_estimation_mpi.hpp +++ /dev/null @@ -1,43 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2015 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. 
-//*************************************************************************** - -#pragma once - -#include -#include -#include -#include "pipeline/stage.hpp" -#include - -namespace debruijn_graph { - class DistanceEstimationInnerMPI : public DistanceEstimationInner { - protected: - void runEstimatePairedDistances(omnigraph::de::PairedInfoIndexT &clustered_index, - const Graph &graph, - const io::SequencingLibrary &lib, - const omnigraph::de::UnclusteredPairedInfoIndexT &paired_index, - size_t max_repeat_length, - const debruijn_graph::config::distance_estimator &de_config) override; - - void runEstimateScaffoldingDistances(omnigraph::de::PairedInfoIndexT &scaffolding_index, - const Graph &graph, - const io::SequencingLibrary &lib, - const omnigraph::de::UnclusteredPairedInfoIndexT &paired_index, - const debruijn_graph::config::smoothing_distance_estimator &ade, - const debruijn_graph::config::distance_estimator &de_config) override; - }; - - class DistanceEstimationMPI : public spades::MPIAssemblyStage { - public: - DistanceEstimationMPI(bool preliminary = false) - : MPIAssemblyStage(preliminary ? "Preliminary Distance Estimation" : "Distance Estimation", - preliminary ? "distance_estimation_preliminary" : "distance_estimation") {} - - void run(GraphPack &gp, const char *) override; - }; -} - From df328d82ab663d72f722b53dafb4e4544abc639d Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Tue, 23 Nov 2021 13:52:20 +0300 Subject: [PATCH 059/102] make SeqMapperNot in GCMPI as in GC --- src/common/alignment/sequence_mapper_notifier.hpp | 6 ++++++ src/projects/spades/gap_closer.cpp | 9 ++++----- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/common/alignment/sequence_mapper_notifier.hpp b/src/common/alignment/sequence_mapper_notifier.hpp index 575d5cebe1..3f2e0d89ef 100644 --- a/src/common/alignment/sequence_mapper_notifier.hpp +++ b/src/common/alignment/sequence_mapper_notifier.hpp @@ -189,6 +189,12 @@ class SequenceMapperNotifierMPI : public SequenceMapperNotifier { INFO("Listeners synced"); } } + + template + void ProcessLibrary(io::ReadStreamList& streams, + const SequenceMapperT& mapper, size_t threads_count = 0) { + return ProcessLibrary(streams, 0, mapper, threads_count); + } }; } // namespace debruijn_graph diff --git a/src/projects/spades/gap_closer.cpp b/src/projects/spades/gap_closer.cpp index ac4c0bfe45..afcbbdbd68 100644 --- a/src/projects/spades/gap_closer.cpp +++ b/src/projects/spades/gap_closer.cpp @@ -494,18 +494,17 @@ void GapClosing::run(graph_pack::GraphPack &gp, const char *) { return; } - SequenceMapperNotifierMPI notifier(cfg::get().ds.reads.lib_count()); - size_t num_readers = partask::overall_num_threads(); - auto& dataset = cfg::get_writable().ds; for (size_t i = 0; i < dataset.reads.lib_count(); ++i) { if (dataset.reads[i].type() != io::LibraryType::PairedEnd) continue; - notifier.Subscribe(&gcpif, i); + SequenceMapperNotifierMPI notifier; + size_t num_readers = partask::overall_num_threads(); + notifier.Subscribe(&gcpif); io::BinaryPairedStreams paired_streams = paired_binary_readers(dataset.reads[i], false, 0, false, num_readers); - notifier.ProcessLibrary(paired_streams, i, *gcpif.GetMapper()); + notifier.ProcessLibrary(paired_streams, *gcpif.GetMapper()); INFO("Initializing gap closer"); g.clear_state(); // FIXME Hack-hack-hack required for uniform id distribution on master and slaves From 9ccb1dfea6166e9397358081342e7181746ad61d Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Tue, 23 Nov 2021 14:40:32 +0300 Subject: [PATCH 060/102] 
GapCloserBase --- src/projects/spades/gap_closer.cpp | 18 +++++++++--------- src/projects/spades/gap_closer.hpp | 24 ++++++++++++++++++++++-- 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/src/projects/spades/gap_closer.cpp b/src/projects/spades/gap_closer.cpp index afcbbdbd68..f86fc9a4a2 100644 --- a/src/projects/spades/gap_closer.cpp +++ b/src/projects/spades/gap_closer.cpp @@ -14,7 +14,6 @@ #include "modules/simplification/compressor.hpp" #include "paired_info/concurrent_pair_info_buffer.hpp" #include "pipeline/sequence_mapper_gp_api.hpp" -#include "pipeline/partask_mpi.hpp" #include "io/binary/edge_index.hpp" #include @@ -469,13 +468,13 @@ class GapCloser { DECL_LOGGER("GapCloser"); }; -void GapClosing::run(graph_pack::GraphPack &gp, const char *) { +void GapClosingBase::execute(graph_pack::GraphPack &gp, const char *) { visualization::graph_labeler::DefaultLabeler labeler(gp.get(), gp.get>()); stats::detail_info_printer printer(gp, labeler, cfg::get().output_dir); printer(config::info_printer_pos::before_first_gap_closer); size_t cnt_pe = 0; - for (const auto& lib : cfg::get().ds.reads.libraries()) { + for (const auto &lib: cfg::get().ds.reads.libraries()) { if (lib.type() != io::LibraryType::PairedEnd) continue; @@ -494,17 +493,14 @@ void GapClosing::run(graph_pack::GraphPack &gp, const char *) { return; } - auto& dataset = cfg::get_writable().ds; + auto &dataset = cfg::get_writable().ds; for (size_t i = 0; i < dataset.reads.lib_count(); ++i) { if (dataset.reads[i].type() != io::LibraryType::PairedEnd) continue; - SequenceMapperNotifierMPI notifier; - size_t num_readers = partask::overall_num_threads(); - notifier.Subscribe(&gcpif); io::BinaryPairedStreams paired_streams = paired_binary_readers(dataset.reads[i], false, - 0, false, num_readers); - notifier.ProcessLibrary(paired_streams, *gcpif.GetMapper()); + 0, false, num_readers); + processLibrary(&gcpif, *gcpif.GetMapper(), paired_streams); INFO("Initializing gap closer"); g.clear_state(); // FIXME Hack-hack-hack required for uniform id distribution on master and slaves @@ -515,4 +511,8 @@ void GapClosing::run(graph_pack::GraphPack &gp, const char *) { } } +void GapClosing::run(graph_pack::GraphPack &gp, const char *s) { + execute(gp, s); +} + } diff --git a/src/projects/spades/gap_closer.hpp b/src/projects/spades/gap_closer.hpp index c01151aee7..62df43b69a 100644 --- a/src/projects/spades/gap_closer.hpp +++ b/src/projects/spades/gap_closer.hpp @@ -7,15 +7,35 @@ //*************************************************************************** #pragma once + +#include "alignment/sequence_mapper_notifier.hpp" #include "pipeline/mpi_stage.hpp" #include "pipeline/stage.hpp" +#include "io/reads/io_helper.hpp" namespace debruijn_graph { -class GapClosing : public spades::MPIAssemblyStage { +class GapClosingBase { + protected: + size_t num_readers = 0; + virtual void processLibrary(SequenceMapperListener* listener, const SequenceMapper& mapper, io::BinaryPairedStreams& paired_streams) = 0; + public: + void execute(graph_pack::GraphPack &gp, const char *); +}; + +class GapClosing : public GapClosingBase, public spades::MPIAssemblyStage { + protected: + void processLibrary(SequenceMapperListener* listener, const SequenceMapper& mapper, io::BinaryPairedStreams& paired_streams) override { + SequenceMapperNotifierMPI notifier; + notifier.Subscribe(listener); + notifier.ProcessLibrary(paired_streams, mapper); + } + public: GapClosing(const char* id) - : MPIAssemblyStage("Gap Closer (parmap)", id) {} + : MPIAssemblyStage("Gap Closer 
(parmap)", id) { + num_readers = partask::overall_num_threads(); + } void run(graph_pack::GraphPack &gp, const char*) override; }; From d101a7ba862694cf38d985f47a2fa0fae94e4954 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Tue, 23 Nov 2021 16:43:40 +0300 Subject: [PATCH 061/102] Separate MPI gap closer --- src/projects/hpcspades/gap_closer_mpi.hpp | 37 +++++++++++++++++++++++ src/projects/hpcspades/pipeline.cpp | 10 +++--- src/projects/spades/gap_closer.hpp | 9 ++---- 3 files changed, 45 insertions(+), 11 deletions(-) create mode 100644 src/projects/hpcspades/gap_closer_mpi.hpp diff --git a/src/projects/hpcspades/gap_closer_mpi.hpp b/src/projects/hpcspades/gap_closer_mpi.hpp new file mode 100644 index 0000000000..60432da86c --- /dev/null +++ b/src/projects/hpcspades/gap_closer_mpi.hpp @@ -0,0 +1,37 @@ +//*************************************************************************** +//* Copyright (c) 2023-2024 SPAdes team +//* Copyright (c) 2015-2022 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#pragma once + +#include "projects/spades/gap_closer.hpp" +#include "alignment/sequence_mapper_notifier.hpp" +#include "pipeline/mpi_stage.hpp" +#include "io/reads/io_helper.hpp" + +namespace debruijn_graph { + +class GapClosingMPI : public GapClosingBase, public spades::MPIAssemblyStage { + protected: + void processLibrary(SequenceMapperListener* listener, const SequenceMapper& mapper, io::BinaryPairedStreams& paired_streams) override { + SequenceMapperNotifierMPI notifier; + notifier.Subscribe(listener); + notifier.ProcessLibrary(paired_streams, mapper); + } + + public: + GapClosingMPI(const char* id) + : MPIAssemblyStage("Gap Closer (parmap)", id) { + num_readers = partask::overall_num_threads(); + } + + void run(graph_pack::GraphPack &gp, const char* s) override { + execute(gp, s); + } +}; + +} diff --git a/src/projects/hpcspades/pipeline.cpp b/src/projects/hpcspades/pipeline.cpp index a49031bf37..a3b432e812 100644 --- a/src/projects/hpcspades/pipeline.cpp +++ b/src/projects/hpcspades/pipeline.cpp @@ -6,7 +6,7 @@ //*************************************************************************** #include "projects/spades/load_graph.hpp" -#include "projects/spades/gap_closer.hpp" +#include "gap_closer_mpi.hpp" #include "projects/spades/mismatch_correction.hpp" #include "projects/spades/pair_info_count.hpp" #include "projects/spades/second_phase_setup.hpp" @@ -136,7 +136,7 @@ static void AddPreliminarySimplificationStages(StageManager &SPAdes) { SPAdes.add(true); if (cfg::get().gap_closer_enable && cfg::get().gc.after_simplify) - SPAdes.add("prelim_gapcloser"); + SPAdes.add("prelim_gapcloser"); if (cfg::get().use_intermediate_contigs) { SPAdes.add(true); @@ -159,14 +159,14 @@ static void AddSimplificationStages(StageManager &SPAdes) { if (cfg::get().gap_closer_enable && cfg::get().gc.before_raw_simplify) - SPAdes.add("early_gapcloser"); + SPAdes.add("early_gapcloser"); // Using two_step_rr is hacky here. Fix soon! 
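    // Background on the GapClosingMPI stages wired in here: GapClosingBase::execute() holds
    // the shared gap-closing logic, and the two stage classes differ only in processLibrary().
    // GapClosing drives the mapper through SequenceMapperNotifier, while GapClosingMPI uses
    // SequenceMapperNotifierMPI and partask::overall_num_threads() binary readers
    // (see gap_closer.hpp and gap_closer_mpi.hpp).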
SPAdes.add(two_step_rr); if (cfg::get().gap_closer_enable && cfg::get().gc.before_simplify) - SPAdes.add("early_gapcloser"); + SPAdes.add("early_gapcloser"); if (two_step_rr) AddPreliminarySimplificationStages(SPAdes); @@ -174,7 +174,7 @@ static void AddSimplificationStages(StageManager &SPAdes) { SPAdes.add(); if (cfg::get().gap_closer_enable && cfg::get().gc.after_simplify) - SPAdes.add("late_gapcloser"); + SPAdes.add("late_gapcloser"); if (cfg::get().sewage) SPAdes.add(); diff --git a/src/projects/spades/gap_closer.hpp b/src/projects/spades/gap_closer.hpp index 62df43b69a..1c5e4898fe 100644 --- a/src/projects/spades/gap_closer.hpp +++ b/src/projects/spades/gap_closer.hpp @@ -9,7 +9,6 @@ #pragma once #include "alignment/sequence_mapper_notifier.hpp" -#include "pipeline/mpi_stage.hpp" #include "pipeline/stage.hpp" #include "io/reads/io_helper.hpp" @@ -23,19 +22,17 @@ class GapClosingBase { void execute(graph_pack::GraphPack &gp, const char *); }; -class GapClosing : public GapClosingBase, public spades::MPIAssemblyStage { +class GapClosing : public GapClosingBase, public spades::AssemblyStage { protected: void processLibrary(SequenceMapperListener* listener, const SequenceMapper& mapper, io::BinaryPairedStreams& paired_streams) override { - SequenceMapperNotifierMPI notifier; + SequenceMapperNotifier notifier; notifier.Subscribe(listener); notifier.ProcessLibrary(paired_streams, mapper); } public: GapClosing(const char* id) - : MPIAssemblyStage("Gap Closer (parmap)", id) { - num_readers = partask::overall_num_threads(); - } + : AssemblyStage("Gap Closer", id) {} void run(graph_pack::GraphPack &gp, const char*) override; }; From 47c28e05f420daf7eb75d862caba061e11825b7e Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Tue, 23 Nov 2021 21:24:25 +0300 Subject: [PATCH 062/102] mismatch_corrector with functor --- .../alignment/sequence_mapper_notifier.hpp | 15 ++++++ src/projects/spades/mismatch_correction.cpp | 51 ++++--------------- 2 files changed, 25 insertions(+), 41 deletions(-) diff --git a/src/common/alignment/sequence_mapper_notifier.hpp b/src/common/alignment/sequence_mapper_notifier.hpp index 3f2e0d89ef..c4758cc897 100644 --- a/src/common/alignment/sequence_mapper_notifier.hpp +++ b/src/common/alignment/sequence_mapper_notifier.hpp @@ -197,6 +197,21 @@ class SequenceMapperNotifierMPI : public SequenceMapperNotifier { } }; +template +void ProcessLibrary(SequenceMapperListener* listener, const SequenceMapper& mapper, io::ReadStreamList& streams) { + SequenceMapperNotifier notifier; + notifier.Subscribe(listener); + notifier.ProcessLibrary(streams, mapper); +} + + +template +void ProcessLibraryMPI(SequenceMapperListener* listener, const SequenceMapper& mapper, io::ReadStreamList& streams) { + SequenceMapperNotifierMPI notifier; + notifier.Subscribe(listener); + notifier.ProcessLibrary(streams, mapper); +} + } // namespace debruijn_graph diff --git a/src/projects/spades/mismatch_correction.cpp b/src/projects/spades/mismatch_correction.cpp index 040c5924a7..cefc02d9ba 100644 --- a/src/projects/spades/mismatch_correction.cpp +++ b/src/projects/spades/mismatch_correction.cpp @@ -324,11 +324,14 @@ class MismatchShallNotPass { private: typedef typename Graph::EdgeId EdgeId; typedef typename Graph::VertexId VertexId; + typedef std::function&, io::ReadStreamList& streams)> ProccessLibFuncT; graph_pack::GraphPack &gp_; Graph &graph_; const size_t k_; const double relative_threshold_; + const ProccessLibFuncT& proccess_lib_func_; + const size_t num_readers_; EdgeId CorrectNucl(EdgeId edge, 
size_t position, char nucl) { VERIFY(position >= k_); @@ -437,7 +440,7 @@ class MismatchShallNotPass { SequenceMapperNotifier notifier; notifier.Subscribe(&statistics); - auto &reads = cfg::get_writable().ds.reads[i]; + auto &reads = dataset.reads[i]; auto single_streams = single_binary_readers(reads, /*followed by rc */true, /*binary*/true); notifier.ProcessLibrary(single_streams, *mapper); } @@ -445,36 +448,14 @@ class MismatchShallNotPass { return CorrectAllEdges(statistics); } - size_t ParallelStopMismatchIterationMPI() { - INFO("Collect potential mismatches"); - MismatchStatistics statistics(gp_); - INFO("Potential mismatches collected"); - - SequenceMapperNotifierMPI notifier(cfg::get_writable().ds.reads.lib_count()); - - auto& dataset = cfg::get_writable().ds; - - auto mapper = MapperInstance(gp_); - for (size_t i = 0; i < dataset.reads.lib_count(); ++i) { - if (!dataset.reads[i].is_mismatch_correctable()) - continue; - - notifier.Subscribe(&statistics, i); - auto &reads = dataset.reads[i]; - size_t num_readers = partask::overall_num_threads(); - auto single_streams = single_binary_readers(reads, /*followed by rc */true, /*binary*/true, num_readers); - notifier.ProcessLibrary(single_streams, i, *mapper); - } - - return CorrectAllEdges(statistics); - } - public: - MismatchShallNotPass(graph_pack::GraphPack &gp, double relative_threshold = 1.5) : + MismatchShallNotPass(const ProccessLibFuncT& processLib, graph_pack::GraphPack &gp, double relative_threshold = 1.5, size_t num_readers = 0) : gp_(gp), graph_(gp.get_mutable()), k_(gp.k()), - relative_threshold_(relative_threshold) { + relative_threshold_(relative_threshold), + proccess_lib_func_(processLib), + num_readers_(num_readers) { VERIFY(relative_threshold >= 1); } @@ -490,26 +471,14 @@ class MismatchShallNotPass { } return res; } - - size_t ParallelStopAllMismatchesMPI(size_t max_iterations = 1) { - size_t res = 0; - while (max_iterations > 0) { - size_t last = ParallelStopMismatchIterationMPI(); - res += last; - if (last == 0) - break; - max_iterations--; - } - return res; - } }; } // namespace mismatches void MismatchCorrection::run(graph_pack::GraphPack &gp, const char*) { EnsureBasicMapping(gp); - size_t corrected = mismatches::MismatchShallNotPass(gp, 2). - ParallelStopAllMismatchesMPI(1); + size_t corrected = mismatches::MismatchShallNotPass(ProcessLibraryMPI, gp, 2, partask::overall_num_threads()). 
+ ParallelStopAllMismatches(1); INFO("Corrected " << corrected << " nucleotides"); } From b5817c4d77ac1d2f8e29628f1744e8f1567007d9 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Tue, 23 Nov 2021 22:45:44 +0300 Subject: [PATCH 063/102] declarate MismatchShallNotPass to hpp --- src/projects/spades/mismatch_correction.cpp | 456 ++++++++++---------- src/projects/spades/mismatch_correction.hpp | 53 ++- 2 files changed, 270 insertions(+), 239 deletions(-) diff --git a/src/projects/spades/mismatch_correction.cpp b/src/projects/spades/mismatch_correction.cpp index cefc02d9ba..2d7c77e719 100644 --- a/src/projects/spades/mismatch_correction.cpp +++ b/src/projects/spades/mismatch_correction.cpp @@ -49,291 +49,282 @@ std::vector split_iterator(size_t chunks, Iter b, Iter e, size_t n) { namespace debruijn_graph { namespace mismatches { -struct NuclCount { - std::array counts_; + struct NuclCount { + std::array counts_; - NuclCount() - : counts_{} {} + NuclCount() + : counts_{} {} - size_t &operator[](size_t nucl) { - return counts_[nucl]; - } - - size_t operator[](size_t nucl) const { - return counts_[nucl]; - } - - NuclCount &operator+=(const NuclCount &other) { - counts_[0] += other.counts_[0]; - counts_[1] += other.counts_[1]; - counts_[2] += other.counts_[2]; - counts_[3] += other.counts_[3]; - return *this; - } - - void BinWrite(std::ostream &os) const { - io::binary::BinWrite(os, counts_); - } + size_t &operator[](size_t nucl) { + return counts_[nucl]; + } - void BinRead(std::istream &is) { - io::binary::BinRead(is, counts_); - } -}; + size_t operator[](size_t nucl) const { + return counts_[nucl]; + } -struct MismatchEdgeInfo { - NuclCount operator[](size_t i) const { - auto it = info_.find(uint32_t(i)); - if (it == info_.end()) - return NuclCount(); - else - return it->second; - } + NuclCount &operator+=(const NuclCount &other) { + counts_[0] += other.counts_[0]; + counts_[1] += other.counts_[1]; + counts_[2] += other.counts_[2]; + counts_[3] += other.counts_[3]; + return *this; + } - void operator+=(const MismatchEdgeInfo &other) { - for (const auto &entry : other.info_) { - info_[entry.first] += entry.second; + void BinWrite(std::ostream &os) const { + io::binary::BinWrite(os, counts_); } - } - void increment(size_t position, size_t nucl) { - info_[uint32_t(position)][nucl] += 1; - } + void BinRead(std::istream &is) { + io::binary::BinRead(is, counts_); + } + }; + + struct MismatchEdgeInfo { + NuclCount operator[](size_t i) const { + auto it = info_.find(uint32_t(i)); + if (it == info_.end()) + return NuclCount(); + else + return it->second; + } - void ClearValues() { - for (auto &kv : info_) { - kv.second = NuclCount(); + void operator+=(const MismatchEdgeInfo &other) { + for (const auto &entry: other.info_) { + info_[entry.first] += entry.second; + } } - } -public: - phmap::flat_hash_map info_; + void increment(size_t position, size_t nucl) { + info_[uint32_t(position)][nucl] += 1; + } - void BinWrite(std::ostream &os) const { - io::binary::BinWrite(os, info_); - } + void ClearValues() { + for (auto &kv: info_) { + kv.second = NuclCount(); + } + } - void BinRead(std::istream &is) { - io::binary::BinRead(is, info_); - } -}; + public: + phmap::flat_hash_map info_; -class MismatchStatistics : public SequenceMapperListener { -private: - typedef Graph::EdgeId EdgeId; - typedef phmap::node_hash_map InnerMismatchStatistics; - typedef typename InnerMismatchStatistics::const_iterator const_iterator; - InnerMismatchStatistics statistics_; - std::vector statistics_buffers_; + void 
BinWrite(std::ostream &os) const { + io::binary::BinWrite(os, info_); + } - typedef phmap::node_hash_map> MismatchCandidates; - MismatchCandidates candidates_; + void BinRead(std::istream &is) { + io::binary::BinRead(is, info_); + } + }; - const Graph &g_; + class MismatchStatistics : public SequenceMapperListener { + private: + typedef Graph::EdgeId EdgeId; + typedef phmap::node_hash_map InnerMismatchStatistics; + typedef typename InnerMismatchStatistics::const_iterator const_iterator; + InnerMismatchStatistics statistics_; + std::vector statistics_buffers_; - template - void CollectPotentialMismatches(const graph_pack::GraphPack &gp, Iter b, Iter e, MismatchCandidates &candidates) { - const auto &index = gp.get>(); + typedef phmap::node_hash_map> MismatchCandidates; + MismatchCandidates candidates_; - for (const auto &mentry : adt::make_range(b, e)) { - // Kmer mapper iterator dereferences to pair (KMer, KMer), not to the reference! - const RtSeq &from = mentry.first; - const RtSeq &to = mentry.second; + const Graph &g_; - // No need to do anything if the target is not in the graph. - // This certainly expects normalized index. - if (!index.contains(to)) - continue; + template + void CollectPotentialMismatches(const graph_pack::GraphPack &gp, Iter b, Iter e, MismatchCandidates &candidates) { + const auto &index = gp.get>(); - size_t cnt = 0; - std::array cnt_arr{}; + for (const auto &mentry: adt::make_range(b, e)) { + // Kmer mapper iterator dereferences to pair (KMer, KMer), not to the reference! + const RtSeq &from = mentry.first; + const RtSeq &to = mentry.second; - for (size_t i = 0; i < from.size(); i++) { - if (from[i] == to[i]) + // No need to do anything if the target is not in the graph. + // This certainly expects normalized index. + if (!index.contains(to)) continue; - cnt += 1; - cnt_arr[(i * 4) / from.size()] += 1; - } + size_t cnt = 0; + std::array cnt_arr{}; - // No mismatches, no cookies - if (cnt == 0) - continue; + for (size_t i = 0; i < from.size(); i++) { + if (from[i] == to[i]) + continue; - // If there are too many mismatches, then it means erroneous mapping - if (cnt > from.size() / 3) - continue; + cnt += 1; + cnt_arr[(i * 4) / from.size()] += 1; + } - // These conditions are to avoid excessive indels: if two/third of - // nucleotides in first/last quarter are mismatches, then it means - // erroneous mapping - if (cnt_arr[0] > from.size() / 6 || cnt_arr[3] > from.size() / 6) - continue; + // No mismatches, no cookies + if (cnt == 0) + continue; - const auto &position = index.get(to); - for (size_t i = 0; i < from.size(); i++) { - if (from[i] == to[i]) + // If there are too many mismatches, then it means erroneous mapping + if (cnt > from.size() / 3) continue; - if (position.second > std::numeric_limits::max()) + // These conditions are to avoid excessive indels: if two/third of + // nucleotides in first/last quarter are mismatches, then it means + // erroneous mapping + if (cnt_arr[0] > from.size() / 6 || cnt_arr[3] > from.size() / 6) continue; - //FIXME add only canonical edges? - candidates[position.first].insert(uint32_t(position.second + i)); + const auto &position = index.get(to); + for (size_t i = 0; i < from.size(); i++) { + if (from[i] == to[i]) + continue; + + if (position.second > std::numeric_limits::max()) + continue; + + //FIXME add only canonical edges? 
+ candidates[position.first].insert(uint32_t(position.second + i)); + } } } - } - void CollectPotentialMismatches(const graph_pack::GraphPack &gp) { - size_t nthreads = omp_get_max_threads(); - const auto &kmer_mapper = gp.get>(); - auto iters = split_iterator(nthreads, kmer_mapper.begin(), kmer_mapper.end(), kmer_mapper.size()); - VERIFY(iters.front() == kmer_mapper.begin()); - VERIFY(iters.back() == kmer_mapper.end()); + void CollectPotentialMismatches(const graph_pack::GraphPack &gp) { + size_t nthreads = omp_get_max_threads(); + const auto &kmer_mapper = gp.get>(); + auto iters = split_iterator(nthreads, kmer_mapper.begin(), kmer_mapper.end(), kmer_mapper.size()); + VERIFY(iters.front() == kmer_mapper.begin()); + VERIFY(iters.back() == kmer_mapper.end()); - std::vector potential_mismatches(nthreads); + std::vector potential_mismatches(nthreads); # pragma omp parallel for - for (size_t i = 0; i < nthreads; ++i) { - CollectPotentialMismatches(gp, iters[i], iters[i + 1], potential_mismatches[i]); - } - - for (auto &entry : potential_mismatches) { - for (const auto &candidate : entry) { - candidates_[candidate.first].insert(candidate.second.begin(), - candidate.second.end()); + for (size_t i = 0; i < nthreads; ++i) { + CollectPotentialMismatches(gp, iters[i], iters[i + 1], potential_mismatches[i]); } - entry.clear(); - } - { - size_t edges = candidates_.size(); - size_t positions = 0; - for (const auto &candidate : candidates_) { - positions += candidate.second.size(); + for (auto &entry: potential_mismatches) { + for (const auto &candidate: entry) { + candidates_[candidate.first].insert(candidate.second.begin(), + candidate.second.end()); + } + entry.clear(); } - INFO("Total " << edges << " edges (out of " << gp.get().e_size() << ") with " << positions << " potential mismatch positions (" - << double(positions) / double(edges) << " positions per edge)"); + { + size_t edges = candidates_.size(); + size_t positions = 0; + for (const auto &candidate: candidates_) { + positions += candidate.second.size(); + } + + INFO("Total " << edges << " edges (out of " << gp.get().e_size() << ") with " << positions + << " potential mismatch positions (" + << double(positions) / double(edges) << " positions per edge)"); + } } - } - template - void ProcessSingleReadImpl(size_t thread_index, const Read& read, const MappingPath &path) { - // VERIFY(path.size() <= 1); - if (path.size() != 1) // TODO Use only_simple feature - return; + template + void ProcessSingleReadImpl(size_t thread_index, const Read &read, const MappingPath &path) { + // VERIFY(path.size() <= 1); + if (path.size() != 1) // TODO Use only_simple feature + return; - EdgeId e = path[0].first; - MappingRange mr = path[0].second; - const Sequence &s_read = read.sequence(); - auto &buffer = statistics_buffers_[thread_index]; + EdgeId e = path[0].first; + MappingRange mr = path[0].second; + const Sequence &s_read = read.sequence(); + auto &buffer = statistics_buffers_[thread_index]; - if (mr.initial_range.size() != mr.mapped_range.size()) - return; + if (mr.initial_range.size() != mr.mapped_range.size()) + return; - auto it = candidates_.find(e); - if (it == candidates_.end()) - return; + auto it = candidates_.find(e); + if (it == candidates_.end()) + return; - const Sequence &s_edge = g_.EdgeNucls(e); - size_t len = mr.initial_range.size() + g_.k(); - size_t cnt = 0; - for (size_t i = 0; i < len; i++) { - cnt += (s_read[mr.initial_range.start_pos + i] != - s_edge[mr.mapped_range.start_pos + i]); - } + const Sequence &s_edge = g_.EdgeNucls(e); + 
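            // The loop below counts read-vs-edge mismatches over the mapped range; reads with
            // more than k/3 mismatching positions are treated as mis-mapped and skipped,
            // mirroring the from.size()/3 cutoff used when collecting candidate positions above.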
size_t len = mr.initial_range.size() + g_.k(); + size_t cnt = 0; + for (size_t i = 0; i < len; i++) { + cnt += (s_read[mr.initial_range.start_pos + i] != + s_edge[mr.mapped_range.start_pos + i]); + } - if (cnt > g_.k() / 3) - return; + if (cnt > g_.k() / 3) + return; - TRACE("statistics might be changing"); - for (size_t i = 0; i < len; i++) { - size_t pos = mr.mapped_range.start_pos + i; - if (pos > std::numeric_limits::max()) - continue; + TRACE("statistics might be changing"); + for (size_t i = 0; i < len; i++) { + size_t pos = mr.mapped_range.start_pos + i; + if (pos > std::numeric_limits::max()) + continue; - if (!it->second.count(uint32_t(pos))) - continue; + if (!it->second.count(uint32_t(pos))) + continue; - char nucl_code = s_read[mr.initial_range.start_pos + i]; - buffer[e].increment(pos, nucl_code); + char nucl_code = s_read[mr.initial_range.start_pos + i]; + buffer[e].increment(pos, nucl_code); + } } - } - void Merge(InnerMismatchStatistics &other_statistics) { - for (auto &e_info : other_statistics) { - statistics_[e_info.first] += e_info.second; - e_info.second.ClearValues(); + void Merge(InnerMismatchStatistics &other_statistics) { + for (auto &e_info: other_statistics) { + statistics_[e_info.first] += e_info.second; + e_info.second.ClearValues(); + } } - } -public: - MismatchStatistics(const graph_pack::GraphPack &gp): - g_(gp.get()) { - CollectPotentialMismatches(gp); - } + public: + MismatchStatistics(const graph_pack::GraphPack &gp) : + g_(gp.get()) { + CollectPotentialMismatches(gp); + } - void StartProcessLibrary(size_t threads_count) override { - statistics_buffers_.clear(); - statistics_buffers_.resize(threads_count, statistics_); - } + void StartProcessLibrary(size_t threads_count) override { + statistics_buffers_.clear(); + statistics_buffers_.resize(threads_count, statistics_); + } - void StopProcessLibrary() override { - statistics_buffers_.clear(); - } + void StopProcessLibrary() override { + statistics_buffers_.clear(); + } - void ProcessSingleRead(size_t thread_index, const io::SingleReadSeq &read, const MappingPath &path) override { - ProcessSingleReadImpl(thread_index, read, path); - } + void ProcessSingleRead(size_t thread_index, const io::SingleReadSeq &read, + const MappingPath &path) override { + ProcessSingleReadImpl(thread_index, read, path); + } - void ProcessSingleRead(size_t thread_index, const io::SingleRead &read, const MappingPath &path) override { - ProcessSingleReadImpl(thread_index, read, path); - } + void + ProcessSingleRead(size_t thread_index, const io::SingleRead &read, const MappingPath &path) override { + ProcessSingleReadImpl(thread_index, read, path); + } - void MergeBuffer(size_t thread_index) override { - Merge(statistics_buffers_[thread_index]); - } + void MergeBuffer(size_t thread_index) override { + Merge(statistics_buffers_[thread_index]); + } - void Serialize(std::ostream &os) const override { - io::binary::BinWrite(os, statistics_); - } + void Serialize(std::ostream &os) const override { + io::binary::BinWrite(os, statistics_); + } - void Deserialize(std::istream &is) override { - io::binary::BinRead(is, statistics_); - } + void Deserialize(std::istream &is) override { + io::binary::BinRead(is, statistics_); + } - void MergeFromStream(std::istream &is) override { - InnerMismatchStatistics other_statistics; - io::binary::BinRead(is, other_statistics); - Merge(other_statistics); - } + void MergeFromStream(std::istream &is) override { + InnerMismatchStatistics other_statistics; + io::binary::BinRead(is, other_statistics); + 
Merge(other_statistics); + } - const_iterator begin() const { - return statistics_.begin(); - } + const_iterator begin() const { + return statistics_.begin(); + } - const_iterator end() const { - return statistics_.end(); - } + const_iterator end() const { + return statistics_.end(); + } - const_iterator find(const EdgeId &edge) const { - return statistics_.find(edge); - } -}; - -class MismatchShallNotPass { -private: - typedef typename Graph::EdgeId EdgeId; - typedef typename Graph::VertexId VertexId; - typedef std::function&, io::ReadStreamList& streams)> ProccessLibFuncT; - - graph_pack::GraphPack &gp_; - Graph &graph_; - const size_t k_; - const double relative_threshold_; - const ProccessLibFuncT& proccess_lib_func_; - const size_t num_readers_; - - EdgeId CorrectNucl(EdgeId edge, size_t position, char nucl) { + const_iterator find(const EdgeId &edge) const { + return statistics_.find(edge); + } + }; + + + EdgeId MismatchShallNotPass::CorrectNucl(EdgeId edge, size_t position, char nucl) { VERIFY(position >= k_); if (position + 1 < graph_.length(edge)) { auto tmp = graph_.SplitEdge(edge, position + 1); @@ -355,7 +346,7 @@ class MismatchShallNotPass { return position > k_ ? edge : glued; } - EdgeId CorrectNucls(EdgeId edge, const std::vector> &mismatches) { + EdgeId MismatchShallNotPass::CorrectNucls(EdgeId edge, const std::vector> &mismatches) { // Nothing to correct, bail out. // Note that this might be a correctness thing as well, as we're calling Compress // down below. @@ -372,7 +363,8 @@ class MismatchShallNotPass { return tmp; } - std::vector> FindMismatches(EdgeId edge, const MismatchEdgeInfo &statistics) const { + std::vector> + MismatchShallNotPass::FindMismatches(EdgeId edge, const MismatchEdgeInfo &statistics) const { std::vector> to_correct; const Sequence &s_edge = graph_.EdgeNucls(edge); for (size_t i = k_; i < graph_.length(edge); i++) { @@ -393,7 +385,7 @@ class MismatchShallNotPass { return to_correct; } - size_t CorrectEdge(EdgeId edge, const MismatchEdgeInfo &statistics) { + size_t MismatchShallNotPass::CorrectEdge(EdgeId edge, const MismatchEdgeInfo &statistics) { auto to_correct = FindMismatches(edge, statistics); EdgeId new_edge = CorrectNucls(edge, to_correct); if (new_edge == EdgeId()) @@ -402,7 +394,7 @@ class MismatchShallNotPass { return to_correct.size(); } - size_t CorrectAllEdges(const MismatchStatistics &statistics) { + size_t MismatchShallNotPass::CorrectAllEdges(const MismatchStatistics &statistics) { size_t res = 0; phmap::btree_set conjugate_fix; @@ -411,7 +403,7 @@ class MismatchShallNotPass { conjugate_fix.insert(e); } - for (EdgeId e : conjugate_fix) { + for (EdgeId e: conjugate_fix) { DEBUG("processing edge" << graph_.int_id(e)); auto stat_it = statistics.find(e); @@ -426,12 +418,12 @@ class MismatchShallNotPass { return res; } - size_t ParallelStopMismatchIteration() { + size_t MismatchShallNotPass::ParallelStopMismatchIteration() { INFO("Collect potential mismatches"); MismatchStatistics statistics(gp_); INFO("Potential mismatches collected"); - auto& dataset = cfg::get_writable().ds; + auto &dataset = cfg::get_writable().ds; auto mapper = MapperInstance(gp_); for (size_t i = 0; i < dataset.reads.lib_count(); ++i) { @@ -448,8 +440,8 @@ class MismatchShallNotPass { return CorrectAllEdges(statistics); } -public: - MismatchShallNotPass(const ProccessLibFuncT& processLib, graph_pack::GraphPack &gp, double relative_threshold = 1.5, size_t num_readers = 0) : + MismatchShallNotPass::MismatchShallNotPass(const ProccessLibFuncT &processLib, 
graph_pack::GraphPack &gp, + double relative_threshold, size_t num_readers) : gp_(gp), graph_(gp.get_mutable()), k_(gp.k()), @@ -460,7 +452,7 @@ class MismatchShallNotPass { } - size_t ParallelStopAllMismatches(size_t max_iterations = 1) { + size_t MismatchShallNotPass::ParallelStopAllMismatches(size_t max_iterations = 1) { size_t res = 0; while (max_iterations > 0) { size_t last = ParallelStopMismatchIteration(); @@ -471,8 +463,6 @@ class MismatchShallNotPass { } return res; } -}; - } // namespace mismatches void MismatchCorrection::run(graph_pack::GraphPack &gp, const char*) { diff --git a/src/projects/spades/mismatch_correction.hpp b/src/projects/spades/mismatch_correction.hpp index 4c75a216f5..cbfe5ca364 100644 --- a/src/projects/spades/mismatch_correction.hpp +++ b/src/projects/spades/mismatch_correction.hpp @@ -10,16 +10,57 @@ #include "pipeline/stage.hpp" #include "pipeline/mpi_stage.hpp" +#include "assembly_graph/core/graph.hpp" +#include "alignment/sequence_mapper_notifier.hpp" namespace debruijn_graph { + namespace mismatches { + struct MismatchEdgeInfo; + class MismatchStatistics; -class MismatchCorrection : public spades::MPIAssemblyStage { -public: - MismatchCorrection() - : MPIAssemblyStage("Mismatch Correction", "mismatch_correction") { } + class MismatchShallNotPass { + private: + typedef typename Graph::EdgeId EdgeId; + typedef typename Graph::VertexId VertexId; + typedef std::function &, + io::ReadStreamList &streams)> ProccessLibFuncT; - void run(graph_pack::GraphPack &gp, const char *) override; -}; + graph_pack::GraphPack &gp_; + Graph &graph_; + const size_t k_; + const double relative_threshold_; + const ProccessLibFuncT &proccess_lib_func_; + const size_t num_readers_; + EdgeId CorrectNucl(EdgeId edge, size_t position, char nucl); + + EdgeId CorrectNucls(EdgeId edge, const std::vector> &mismatches); + + std::vector> FindMismatches(EdgeId edge, const MismatchEdgeInfo &statistics) const; + + size_t CorrectEdge(EdgeId edge, const MismatchEdgeInfo &statistics); + + size_t CorrectAllEdges(const MismatchStatistics &statistics); + + size_t ParallelStopMismatchIteration(); + + public: + MismatchShallNotPass(const ProccessLibFuncT &processLib, graph_pack::GraphPack &gp, + double relative_threshold = 1.5, + size_t num_readers = 0); + + + size_t ParallelStopAllMismatches(size_t max_iterations); + }; + } + + + class MismatchCorrection : public spades::MPIAssemblyStage { + public: + MismatchCorrection() + : MPIAssemblyStage("Mismatch Correction", "mismatch_correction") {} + + void run(graph_pack::GraphPack &gp, const char *) override; + }; } From 6c541d1a53f74b49cab9c414a239e6172ef6d3ce Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Tue, 23 Nov 2021 23:02:36 +0300 Subject: [PATCH 064/102] separate MPI mismatch_correction --- .../hpcspades/mismatch_correction_mpi.hpp | 27 +++++++++++++++++++ src/projects/hpcspades/pipeline.cpp | 4 +-- src/projects/spades/mismatch_correction.cpp | 2 +- src/projects/spades/mismatch_correction.hpp | 5 ++-- 4 files changed, 32 insertions(+), 6 deletions(-) create mode 100644 src/projects/hpcspades/mismatch_correction_mpi.hpp diff --git a/src/projects/hpcspades/mismatch_correction_mpi.hpp b/src/projects/hpcspades/mismatch_correction_mpi.hpp new file mode 100644 index 0000000000..beb33f3226 --- /dev/null +++ b/src/projects/hpcspades/mismatch_correction_mpi.hpp @@ -0,0 +1,27 @@ +//*************************************************************************** +//* Copyright (c) 2023-2024 SPAdes team +//* All Rights Reserved +//* See file LICENSE for 
details. +//*************************************************************************** + +#pragma once + +#include "projects/spades/mismatch_correction.hpp" +#include "pipeline/mpi_stage.hpp" +#include "pipeline/graph_pack_helpers.h" + +namespace debruijn_graph { + class MismatchCorrectionMPI : public spades::MPIAssemblyStage { + public: + MismatchCorrectionMPI() + : MPIAssemblyStage("Mismatch Correction", "mismatch_correction") {} + + void run(graph_pack::GraphPack &gp, const char *) override { + EnsureBasicMapping(gp); + size_t corrected = mismatches::MismatchShallNotPass(ProcessLibraryMPI, gp, 2, partask::overall_num_threads()). + ParallelStopAllMismatches(1); + INFO("Corrected " << corrected << " nucleotides"); + } + }; +} + diff --git a/src/projects/hpcspades/pipeline.cpp b/src/projects/hpcspades/pipeline.cpp index a3b432e812..be765e16bc 100644 --- a/src/projects/hpcspades/pipeline.cpp +++ b/src/projects/hpcspades/pipeline.cpp @@ -7,7 +7,7 @@ #include "projects/spades/load_graph.hpp" #include "gap_closer_mpi.hpp" -#include "projects/spades/mismatch_correction.hpp" +#include "mismatch_correction_mpi.hpp" #include "projects/spades/pair_info_count.hpp" #include "projects/spades/second_phase_setup.hpp" #include "projects/spades/repeat_resolving.hpp" @@ -181,7 +181,7 @@ static void AddSimplificationStages(StageManager &SPAdes) { SPAdes.add(); if (cfg::get().correct_mismatches) - SPAdes.add(); + SPAdes.add(); if (cfg::get().ss_coverage_splitter.enabled) SPAdes.add(); diff --git a/src/projects/spades/mismatch_correction.cpp b/src/projects/spades/mismatch_correction.cpp index 2d7c77e719..6edd23506d 100644 --- a/src/projects/spades/mismatch_correction.cpp +++ b/src/projects/spades/mismatch_correction.cpp @@ -467,7 +467,7 @@ namespace mismatches { void MismatchCorrection::run(graph_pack::GraphPack &gp, const char*) { EnsureBasicMapping(gp); - size_t corrected = mismatches::MismatchShallNotPass(ProcessLibraryMPI, gp, 2, partask::overall_num_threads()). + size_t corrected = mismatches::MismatchShallNotPass(ProcessLibrary, gp, 2, partask::overall_num_threads()). 
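This is the point where the MPI dependency moves out of the common stage: the new MismatchCorrectionMPI above carries the partask/MPI machinery, and the hunks that follow revert MismatchCorrection itself to single-node behaviour, so the plain spades binary no longer pulls in mpi_stage.hpp. The hpcSPAdes pipeline then registers the wrapper instead of the original stage; the template argument of add() was lost in the hunk above and is assumed here to be the new stage type:

// hpcSPAdes pipeline (sketch): swap in the MPI wrapper, keep the same guard.
if (cfg::get().correct_mismatches)
    SPAdes.add<MismatchCorrectionMPI>();
// The regular spades pipeline keeps the original: SPAdes.add<MismatchCorrection>();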
ParallelStopAllMismatches(1); INFO("Corrected " << corrected << " nucleotides"); } diff --git a/src/projects/spades/mismatch_correction.hpp b/src/projects/spades/mismatch_correction.hpp index cbfe5ca364..511e31c3c2 100644 --- a/src/projects/spades/mismatch_correction.hpp +++ b/src/projects/spades/mismatch_correction.hpp @@ -9,7 +9,6 @@ #pragma once #include "pipeline/stage.hpp" -#include "pipeline/mpi_stage.hpp" #include "assembly_graph/core/graph.hpp" #include "alignment/sequence_mapper_notifier.hpp" @@ -55,10 +54,10 @@ namespace debruijn_graph { } - class MismatchCorrection : public spades::MPIAssemblyStage { + class MismatchCorrection : public spades::AssemblyStage { public: MismatchCorrection() - : MPIAssemblyStage("Mismatch Correction", "mismatch_correction") {} + : AssemblyStage("Mismatch Correction", "mismatch_correction") {} void run(graph_pack::GraphPack &gp, const char *) override; }; From be0eeeeeaac9df1d5b10eba30b132835d5364864 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Wed, 24 Nov 2021 21:25:40 +0300 Subject: [PATCH 065/102] make pair_info_count consistent with master version --- src/common/paired_info/paired_info_utils.cpp | 59 ++++++++++++++--- src/common/paired_info/paired_info_utils.hpp | 5 ++ src/projects/spades/pair_info_count.cpp | 67 +------------------- 3 files changed, 55 insertions(+), 76 deletions(-) diff --git a/src/common/paired_info/paired_info_utils.cpp b/src/common/paired_info/paired_info_utils.cpp index a756f3cede..a9b78cc621 100644 --- a/src/common/paired_info/paired_info_utils.cpp +++ b/src/common/paired_info/paired_info_utils.cpp @@ -168,25 +168,41 @@ void FillPairedIndex(const Graph &graph, } class DEFilter : public SequenceMapperListener { - public: +public: DEFilter(paired_info::PairedInfoFilter &filter, const Graph &g) : bf_(filter), g_(g) {} void ProcessPairedRead(size_t, - const io::PairedRead&, - const MappingPath& read1, - const MappingPath& read2) override { + const io::PairedRead &, + const MappingPath &read1, + const MappingPath &read2) override { ProcessPairedRead(read1, read2); } + void ProcessPairedRead(size_t, - const io::PairedReadSeq&, - const MappingPath& read1, - const MappingPath& read2) override { + const io::PairedReadSeq &, + const MappingPath &read1, + const MappingPath &read2) override { ProcessPairedRead(read1, read2); } - private: - void ProcessPairedRead(const MappingPath& path1, - const MappingPath& path2) { + + void Serialize(std::ostream &os) const override { + io::binary::BinWrite(os, bf_); + } + + void Deserialize(std::istream &is) override { + io::binary::BinRead(is, bf_); + } + + void MergeFromStream(std::istream &is) override { + paired_info::PairedInfoFilter remote; + io::binary::BinRead(is, remote); + bf_.merge(remote); + } + +private: + void ProcessPairedRead(const MappingPath &path1, + const MappingPath &path2) { for (size_t i = 0; i < path1.size(); ++i) { EdgeId edge1 = path1.edge_at(i); for (size_t j = 0; j < path2.size(); ++j) { @@ -225,5 +241,28 @@ std::unique_ptr FillEdgePairFilter(const Graph &graph, return filter; } +std::unique_ptr FillEdgePairFilterMPI(const Graph &graph, + const SequenceMapperNotifier::SequenceMapperT &mapper, + SequencingLib &reads, + size_t edgepairs) { + auto filter = std::make_unique( + [](const std::pair &e, uint64_t seed) { + uint64_t h1 = e.first.hash(); + return XXH3_64bits_withSeed(&h1, sizeof(h1), (e.second.hash() * seed) ^ seed); + }, + 12 * edgepairs); + + SequenceMapperNotifierMPI notifier; + DEFilter filter_counter(*filter, graph); + 
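DEFilter is the listener behind this filter: for every mapped read pair it adds each (edge1, edge2) combination, plus the conjugate pair, to a counting Bloom filter sized at roughly 12 cells per expected edge pair. The three methods it gains here, Serialize, Deserialize and MergeFromStream, form the contract the MPI notifier relies on to combine per-rank results, and any listener implementing them can be reduced the same way. A schematic mergeable listener, where only the virtual methods come from the hunks above and the counter payload is made up for illustration:

// Schematic of the merge contract used by SequenceMapperNotifierMPI:
// each rank fills its own copy, then the states are folded together pairwise.
class CountingListener : public SequenceMapperListener {
    size_t count_ = 0;                        // illustrative payload only

public:
    void Serialize(std::ostream &os) const override {
        io::binary::BinWrite(os, count_);     // ship the full local state
    }
    void Deserialize(std::istream &is) override {
        io::binary::BinRead(is, count_);      // replace local state (broadcast sync)
    }
    void MergeFromStream(std::istream &is) override {
        size_t remote = 0;
        io::binary::BinRead(is, remote);      // fold another rank's state into ours
        count_ += remote;
    }
};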
notifier.Subscribe(&filter_counter); + + VERIFY(reads.data().unmerged_read_length != 0); + size_t num_readers = partask::overall_num_threads(); + auto stream = paired_binary_readers(reads, /*followed by rc*/false, 0, /*include merged*/true, num_readers); + notifier.ProcessLibrary(stream, mapper); + + return filter; +} + } diff --git a/src/common/paired_info/paired_info_utils.hpp b/src/common/paired_info/paired_info_utils.hpp index 72c18d6449..dcd6143d3e 100644 --- a/src/common/paired_info/paired_info_utils.hpp +++ b/src/common/paired_info/paired_info_utils.hpp @@ -38,5 +38,10 @@ std::unique_ptr FillEdgePairFilter(const debruijn_graph::Graph const debruijn_graph::SequenceMapper &mapper, SequencingLib &reads, size_t edgepairs); + +std::unique_ptr FillEdgePairFilterMPI(const debruijn_graph::Graph &gp, + const debruijn_graph::SequenceMapper &mapper, + SequencingLib &reads, + size_t edgepairs); } diff --git a/src/projects/spades/pair_info_count.cpp b/src/projects/spades/pair_info_count.cpp index ec1b64902a..8b589c927d 100644 --- a/src/projects/spades/pair_info_count.cpp +++ b/src/projects/spades/pair_info_count.cpp @@ -42,55 +42,6 @@ std::shared_ptr> ChooseProperMapper(const graph_pack::Grap return MapperInstance(gp); } -class DEFilter : public SequenceMapperListener { - public: - DEFilter(paired_info::PairedInfoFilter &filter, const Graph &g) - : bf_(filter), g_(g) {} - - void ProcessPairedRead(size_t, - const io::PairedRead&, - const MappingPath& read1, - const MappingPath& read2) override { - ProcessPairedRead(read1, read2); - } - void ProcessPairedRead(size_t, - const io::PairedReadSeq&, - const MappingPath& read1, - const MappingPath& read2) override { - ProcessPairedRead(read1, read2); - } - - void Serialize(std::ostream &os) const override { - io::binary::BinWrite(os, bf_); - } - - void Deserialize(std::istream &is) override { - io::binary::BinRead(is, bf_); - } - - void MergeFromStream(std::istream &is) override { - paired_info::PairedInfoFilter remote; - io::binary::BinRead(is, remote); - bf_.merge(remote); - } - - private: - void ProcessPairedRead(const MappingPath& path1, - const MappingPath& path2) { - for (size_t i = 0; i < path1.size(); ++i) { - EdgeId edge1 = path1.edge_at(i); - for (size_t j = 0; j < path2.size(); ++j) { - EdgeId edge2 = path2.edge_at(j); - bf_.add({edge1, edge2}); - bf_.add({g_.conjugate(edge2), g_.conjugate(edge1)}); - } - } - } - - paired_info::PairedInfoFilter &bf_; - const Graph &g_; -}; - bool HasGoodRRLibs() { for (const auto &lib : cfg::get().ds.reads) { if (lib.is_contig_lib()) @@ -249,24 +200,8 @@ void PairInfoCount::run(graph_pack::GraphPack &gp, const char *) { // Only filter paired-end libraries if (filter_threshold && lib.type() == io::LibraryType::PairedEnd) { - filter.reset(new paired_info::PairedInfoFilter([](const std::pair &e, uint64_t seed) { - uint64_t h1 = e.first.hash(); - return XXH3_64bits_withSeed(&h1, sizeof(h1), (e.second.hash() * seed) ^ seed); - }, - 12 * edgepairs)); - INFO("Filtering data for library #" << i); - { - SequenceMapperNotifierMPI notifier(cfg::get_writable().ds.reads.lib_count()); - DEFilter filter_counter(*filter, graph); - notifier.Subscribe(&filter_counter, i); - - VERIFY(lib.data().unmerged_read_length != 0); - size_t num_readers = partask::overall_num_threads(); - auto reads = paired_binary_readers(lib, /*followed by rc*/false, - 0, /*include merged*/true, num_readers); - notifier.ProcessLibrary(reads, i, *ChooseProperMapper(gp, lib)); - } + filter = paired_info::FillEdgePairFilterMPI(graph, 
*ChooseProperMapper(gp, lib), lib, edgepairs); } INFO("Mapping library #" << i); From 18acaa25a76ae9b11a2a5fd32dce2557cb171b16 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Wed, 24 Nov 2021 23:49:21 +0300 Subject: [PATCH 066/102] functor for FillEdgePairFilter --- src/common/paired_info/paired_info_utils.cpp | 33 +++----------------- src/common/paired_info/paired_info_utils.hpp | 17 +++++----- src/projects/spades/pair_info_count.cpp | 2 +- 3 files changed, 15 insertions(+), 37 deletions(-) diff --git a/src/common/paired_info/paired_info_utils.cpp b/src/common/paired_info/paired_info_utils.cpp index a9b78cc621..58ec4abc4e 100644 --- a/src/common/paired_info/paired_info_utils.cpp +++ b/src/common/paired_info/paired_info_utils.cpp @@ -220,7 +220,9 @@ class DEFilter : public SequenceMapperListener { std::unique_ptr FillEdgePairFilter(const Graph &graph, const SequenceMapperNotifier::SequenceMapperT &mapper, SequencingLib &reads, - size_t edgepairs) { + size_t edgepairs, + const MapLibFuncT& map_lib_fun, + size_t num_readers) { auto filter = std::make_unique( [](const std::pair &e, uint64_t seed) { // Note that EdgeId::hash is essentially an identity function, so we'd need to @@ -230,39 +232,12 @@ std::unique_ptr FillEdgePairFilter(const Graph &graph, }, 12 * edgepairs); - SequenceMapperNotifier notifier; - DEFilter filter_counter(*filter, graph); - notifier.Subscribe(&filter_counter); - - VERIFY(reads.data().unmerged_read_length != 0); - auto stream = paired_binary_readers(reads, /*followed by rc*/false, 0, /*include merged*/true); - notifier.ProcessLibrary(stream, mapper); - - return filter; -} - -std::unique_ptr FillEdgePairFilterMPI(const Graph &graph, - const SequenceMapperNotifier::SequenceMapperT &mapper, - SequencingLib &reads, - size_t edgepairs) { - auto filter = std::make_unique( - [](const std::pair &e, uint64_t seed) { - uint64_t h1 = e.first.hash(); - return XXH3_64bits_withSeed(&h1, sizeof(h1), (e.second.hash() * seed) ^ seed); - }, - 12 * edgepairs); - - SequenceMapperNotifierMPI notifier; DEFilter filter_counter(*filter, graph); - notifier.Subscribe(&filter_counter); - VERIFY(reads.data().unmerged_read_length != 0); - size_t num_readers = partask::overall_num_threads(); auto stream = paired_binary_readers(reads, /*followed by rc*/false, 0, /*include merged*/true, num_readers); - notifier.ProcessLibrary(stream, mapper); + map_lib_fun(&filter_counter, mapper, stream); return filter; } - } diff --git a/src/common/paired_info/paired_info_utils.hpp b/src/common/paired_info/paired_info_utils.hpp index dcd6143d3e..88055eafc5 100644 --- a/src/common/paired_info/paired_info_utils.hpp +++ b/src/common/paired_info/paired_info_utils.hpp @@ -9,12 +9,14 @@ #include "paired_info.hpp" -#include "adt/bf.hpp" #include "alignment/sequence_mapper_fwd.hpp" +#include "alignment/sequence_mapper_notifier.hpp" #include "assembly_graph/core/graph.hpp" #include "library/library_data.hpp" #include "library/library_fwd.hpp" +#include "adt/bf.hpp" + namespace paired_info { using SequencingLib = io::SequencingLibrary; @@ -22,6 +24,10 @@ using PairedInfoFilter = bf::counting_bloom_filter, 2>; using PairedIndex = omnigraph::de::UnclusteredPairedInfoIndexT; +typedef std::function &, + io::ReadStreamList &streams)> MapLibFuncT; + bool CollectLibInformation(const debruijn_graph::Graph &gp, const debruijn_graph::SequenceMapper &mapper, size_t &edgepairs, SequencingLib &reads, @@ -37,11 +43,8 @@ void FillPairedIndex(const debruijn_graph::Graph &gp, std::unique_ptr FillEdgePairFilter(const 
debruijn_graph::Graph &gp, const debruijn_graph::SequenceMapper &mapper, SequencingLib &reads, - size_t edgepairs); - -std::unique_ptr FillEdgePairFilterMPI(const debruijn_graph::Graph &gp, - const debruijn_graph::SequenceMapper &mapper, - SequencingLib &reads, - size_t edgepairs); + size_t edgepairs, + const MapLibFuncT& map_lib_fun, + size_t num_readers = 0); } diff --git a/src/projects/spades/pair_info_count.cpp b/src/projects/spades/pair_info_count.cpp index 8b589c927d..cbb145f672 100644 --- a/src/projects/spades/pair_info_count.cpp +++ b/src/projects/spades/pair_info_count.cpp @@ -201,7 +201,7 @@ void PairInfoCount::run(graph_pack::GraphPack &gp, const char *) { // Only filter paired-end libraries if (filter_threshold && lib.type() == io::LibraryType::PairedEnd) { INFO("Filtering data for library #" << i); - filter = paired_info::FillEdgePairFilterMPI(graph, *ChooseProperMapper(gp, lib), lib, edgepairs); + filter = paired_info::FillEdgePairFilter(graph, *ChooseProperMapper(gp, lib), lib, edgepairs, ProcessLibraryMPI, partask::overall_num_threads()); } INFO("Mapping library #" << i); From ccf2e8097188a673903a84a73a515efc43a33905 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Thu, 25 Nov 2021 12:06:17 +0300 Subject: [PATCH 067/102] separate PairInfoCount MPI --- .../alignment/sequence_mapper_notifier.hpp | 24 ++++++++-- .../hpcspades/pair_info_count_mpi.hpp | 30 ++++++++++++ src/projects/hpcspades/pipeline.cpp | 6 +-- src/projects/spades/pair_info_count.cpp | 46 ++++++++++++------- src/projects/spades/pair_info_count.hpp | 24 ++++++++-- 5 files changed, 102 insertions(+), 28 deletions(-) create mode 100644 src/projects/hpcspades/pair_info_count_mpi.hpp diff --git a/src/common/alignment/sequence_mapper_notifier.hpp b/src/common/alignment/sequence_mapper_notifier.hpp index c4758cc897..f503967ebc 100644 --- a/src/common/alignment/sequence_mapper_notifier.hpp +++ b/src/common/alignment/sequence_mapper_notifier.hpp @@ -197,21 +197,35 @@ class SequenceMapperNotifierMPI : public SequenceMapperNotifier { } }; + template -void ProcessLibrary(SequenceMapperListener* listener, const SequenceMapper& mapper, io::ReadStreamList& streams) { +void ProcessLibraryFewListeners(const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) { SequenceMapperNotifier notifier; - notifier.Subscribe(listener); + for (auto listener : listeners) { + notifier.Subscribe(listener); + } notifier.ProcessLibrary(streams, mapper); } - template -void ProcessLibraryMPI(SequenceMapperListener* listener, const SequenceMapper& mapper, io::ReadStreamList& streams) { +void ProcessLibraryMPIFewListeners(const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) { SequenceMapperNotifierMPI notifier; - notifier.Subscribe(listener); + for (auto listener : listeners) { + notifier.Subscribe(listener); + } notifier.ProcessLibrary(streams, mapper); } +template +void ProcessLibrary(SequenceMapperListener* listener, const SequenceMapper& mapper, io::ReadStreamList& streams) { + ProcessLibraryFewListeners({listener}, mapper, streams); +} + +template +void ProcessLibraryMPI(SequenceMapperListener* listener, const SequenceMapper& mapper, io::ReadStreamList& streams) { + ProcessLibraryMPIFewListeners({listener}, mapper, streams); +} + } // namespace debruijn_graph diff --git a/src/projects/hpcspades/pair_info_count_mpi.hpp b/src/projects/hpcspades/pair_info_count_mpi.hpp new file mode 100644 index 0000000000..5cc117faa9 --- /dev/null +++ 
b/src/projects/hpcspades/pair_info_count_mpi.hpp @@ -0,0 +1,30 @@ +//*************************************************************************** +//* Copyright (c) 2015-2021 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#pragma once + +#include "projects/spades/pair_info_count.hpp" +#include "alignment/sequence_mapper_notifier.hpp" +#include "pipeline/mpi_stage.hpp" + +namespace debruijn_graph { + class PairInfoCountMPI : public PairInfoCountBase, public spades::MPIAssemblyStage { + public: + PairInfoCountMPI(bool preliminary = false) + : MPIAssemblyStage(preliminary ? "Preliminary Paired Information Counting" : "Paired Information Counting", + preliminary ? "late_pair_info_count_preliminary" : "late_pair_info_count") {} + + void run(graph_pack::GraphPack &gp, const char* s) override { + execute(gp, s, ProcessLibraryMPI, + ProcessLibraryMPIFewListeners, + ProcessLibraryMPIFewListeners, + partask::overall_num_threads()); + } + }; + +} + diff --git a/src/projects/hpcspades/pipeline.cpp b/src/projects/hpcspades/pipeline.cpp index be765e16bc..89b0d32107 100644 --- a/src/projects/hpcspades/pipeline.cpp +++ b/src/projects/hpcspades/pipeline.cpp @@ -8,7 +8,7 @@ #include "projects/spades/load_graph.hpp" #include "gap_closer_mpi.hpp" #include "mismatch_correction_mpi.hpp" -#include "projects/spades/pair_info_count.hpp" +#include "pair_info_count_mpi.hpp" #include "projects/spades/second_phase_setup.hpp" #include "projects/spades/repeat_resolving.hpp" #include "distance_estimation_mpi.hpp" @@ -139,7 +139,7 @@ static void AddPreliminarySimplificationStages(StageManager &SPAdes) { SPAdes.add("prelim_gapcloser"); if (cfg::get().use_intermediate_contigs) { - SPAdes.add(true); + SPAdes.add(true); SPAdes.add(true); SPAdes.add(true); @@ -202,7 +202,7 @@ static void AddRepeatResolutionStages(StageManager &SPAdes) { if (!cfg::get().series_analysis.empty()) SPAdes.add(); - SPAdes.add() + SPAdes.add() .add() .add(); } diff --git a/src/projects/spades/pair_info_count.cpp b/src/projects/spades/pair_info_count.cpp index cbb145f672..df10a3dddd 100644 --- a/src/projects/spades/pair_info_count.cpp +++ b/src/projects/spades/pair_info_count.cpp @@ -103,12 +103,14 @@ bool ShouldObtainSingleReadsPaths(size_t ilib) { } size_t ProcessSingleReads(graph_pack::GraphPack &gp, size_t ilib, - bool use_binary = true, bool map_paired = false) { + const PairInfoCountBase::MapSingleSeqLibFuncT & map_single_seq_lib_func, + const PairInfoCountBase::MapSingleLibFuncT & map_single_lib_func, + size_t num_readers = 0, bool use_binary = true, bool map_paired = false) { //FIXME make const auto& reads = cfg::get_writable().ds.reads[ilib]; const auto &graph = gp.get(); - SequenceMapperNotifierMPI notifier(cfg::get_writable().ds.reads.lib_count()); + std::vector listeners; auto &single_long_reads = gp.get_mutable>()[ilib]; auto& trusted_paths = gp.get_mutable()[ilib]; @@ -116,7 +118,7 @@ size_t ProcessSingleReads(graph_pack::GraphPack &gp, size_t ilib, if (ShouldObtainSingleReadsPaths(ilib) || reads.is_contig_lib()) { //FIXME pretty awful, would be much better if listeners were shared ptrs - notifier.Subscribe(&read_mapper); + listeners.push_back(&read_mapper); cfg::get_writable().ds.reads[ilib].data().single_reads_mapped = true; } @@ -125,25 +127,35 @@ size_t ProcessSingleReads(graph_pack::GraphPack &gp, size_t ilib, if 
(cfg::get().calculate_coverage_for_each_lib) { INFO("Will calculate lib coverage as well"); map_paired = true; - notifier.Subscribe(&ss_coverage_filler); + listeners.push_back(&ss_coverage_filler); } auto mapper_ptr = ChooseProperMapper(gp, reads); - size_t num_readers = partask::overall_num_threads(); if (use_binary) { auto single_streams = single_binary_readers(reads, false, map_paired, num_readers); - notifier.ProcessLibrary(single_streams, ilib, *mapper_ptr); + map_single_seq_lib_func(listeners, *mapper_ptr, single_streams); } else { auto single_streams = single_easy_readers(reads, false, map_paired, /*handle Ns*/false); - notifier.ProcessLibrary(single_streams, ilib, *mapper_ptr); + map_single_lib_func(listeners, *mapper_ptr, single_streams); } return single_long_reads.size(); } } // namespace -void PairInfoCount::run(graph_pack::GraphPack &gp, const char *) { +void PairInfoCount::run(graph_pack::GraphPack &gp, const char *s) { + execute(gp, s, ProcessLibrary, + ProcessLibraryFewListeners, + ProcessLibraryFewListeners, + partask::overall_num_threads()); +} + +void PairInfoCountBase::execute(graph_pack::GraphPack &gp, const char *, + const PairInfoCountBase::MapPairLibFuncT & map_pair_lib_func, + const PairInfoCountBase::MapSingleSeqLibFuncT & map_single_seq_lib_func, + const PairInfoCountBase::MapSingleLibFuncT & map_single_lib_func, + size_t num_readers) { InitRRIndices(gp); EnsureBasicMapping(gp); @@ -162,7 +174,7 @@ void PairInfoCount::run(graph_pack::GraphPack &gp, const char *) { continue; } else if (lib.is_contig_lib()) { INFO("Mapping contigs library #" << i); - ProcessSingleReads(gp, i, false); + ProcessSingleReads(gp, i, map_single_seq_lib_func, map_single_lib_func, num_readers, false); } else { if (lib.is_paired()) { INFO("Estimating insert size for library #" << i); @@ -186,14 +198,14 @@ void PairInfoCount::run(graph_pack::GraphPack &gp, const char *) { } INFO(" Insert size = " << lib_data.mean_insert_size << - ", deviation = " << lib_data.insert_size_deviation << - ", left quantile = " << lib_data.insert_size_left_quantile << - ", right quantile = " << lib_data.insert_size_right_quantile << - ", read length = " << lib_data.unmerged_read_length); + ", deviation = " << lib_data.insert_size_deviation << + ", left quantile = " << lib_data.insert_size_left_quantile << + ", right quantile = " << lib_data.insert_size_right_quantile << + ", read length = " << lib_data.unmerged_read_length); if (lib_data.mean_insert_size < 1.1 * (double) rl) WARN("Estimated mean insert size " << lib_data.mean_insert_size - << " is very small compared to read length " << rl); + << " is very small compared to read length " << rl); std::unique_ptr filter; unsigned filter_threshold = cfg::get().de.raw_filter_threshold; @@ -201,7 +213,8 @@ void PairInfoCount::run(graph_pack::GraphPack &gp, const char *) { // Only filter paired-end libraries if (filter_threshold && lib.type() == io::LibraryType::PairedEnd) { INFO("Filtering data for library #" << i); - filter = paired_info::FillEdgePairFilter(graph, *ChooseProperMapper(gp, lib), lib, edgepairs, ProcessLibraryMPI, partask::overall_num_threads()); + filter = paired_info::FillEdgePairFilter(graph, *ChooseProperMapper(gp, lib), lib, edgepairs, + map_pair_lib_func, num_readers); } INFO("Mapping library #" << i); @@ -224,7 +237,7 @@ void PairInfoCount::run(graph_pack::GraphPack &gp, const char *) { if (ShouldObtainSingleReadsPaths(i) || ShouldObtainLibCoverage()) { cfg::get_writable().use_single_reads |= ShouldObtainSingleReadsPaths(i); INFO("Mapping single 
reads of library #" << i); - size_t n = ProcessSingleReads(gp, i, /*use_binary*/true, /*map_paired*/true); + size_t n = ProcessSingleReads(gp, i, map_single_seq_lib_func, map_single_lib_func, num_readers, /*use_binary*/true, /*map_paired*/true); INFO("Total paths obtained from single reads: " << n); } } @@ -232,5 +245,4 @@ void PairInfoCount::run(graph_pack::GraphPack &gp, const char *) { DetachEdgeIndex(gp); } - } // namespace debruijn_graph diff --git a/src/projects/spades/pair_info_count.hpp b/src/projects/spades/pair_info_count.hpp index e7a7cd51ac..a50b658e48 100644 --- a/src/projects/spades/pair_info_count.hpp +++ b/src/projects/spades/pair_info_count.hpp @@ -8,15 +8,33 @@ #pragma once +#include "alignment/sequence_mapper_notifier.hpp" #include "pipeline/stage.hpp" -#include "pipeline/mpi_stage.hpp" namespace debruijn_graph { +class PairInfoCountBase { +public: + typedef std::function &, + io::ReadStreamList &streams)> MapPairLibFuncT; -class PairInfoCount : public spades::MPIAssemblyStage { + typedef std::function&, + const debruijn_graph::SequenceMapper &, + io::ReadStreamList &streams)> MapSingleSeqLibFuncT; + + typedef std::function&, + const debruijn_graph::SequenceMapper &, + io::ReadStreamList &streams)> MapSingleLibFuncT; + + void execute(graph_pack::GraphPack &gp, const char *, const MapPairLibFuncT&, + const MapSingleSeqLibFuncT&, + const MapSingleLibFuncT&, size_t num_readers=0); +}; + +class PairInfoCount : public PairInfoCountBase, public spades::AssemblyStage { public: PairInfoCount(bool preliminary = false) - : MPIAssemblyStage(preliminary ? "Preliminary Paired Information Counting" : "Paired Information Counting", + : AssemblyStage(preliminary ? "Preliminary Paired Information Counting" : "Paired Information Counting", preliminary ? 
"late_pair_info_count_preliminary" : "late_pair_info_count") {} void run(graph_pack::GraphPack &gp, const char*) override; From 520f6f1d318855286ba1de922593bae816ca9a9e Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Sat, 27 Nov 2021 21:15:50 +0300 Subject: [PATCH 068/102] MapLibFabric --- .../alignment/sequence_mapper_notifier.hpp | 91 +++++++++++++------ src/common/paired_info/paired_info_utils.cpp | 4 +- src/common/paired_info/paired_info_utils.hpp | 2 +- .../hpcspades/mismatch_correction_mpi.hpp | 2 +- .../hpcspades/pair_info_count_mpi.hpp | 5 +- src/projects/spades/mismatch_correction.cpp | 7 +- src/projects/spades/mismatch_correction.hpp | 6 +- src/projects/spades/pair_info_count.cpp | 26 +++--- src/projects/spades/pair_info_count.hpp | 16 +--- 9 files changed, 87 insertions(+), 72 deletions(-) diff --git a/src/common/alignment/sequence_mapper_notifier.hpp b/src/common/alignment/sequence_mapper_notifier.hpp index f503967ebc..b5b0baf789 100644 --- a/src/common/alignment/sequence_mapper_notifier.hpp +++ b/src/common/alignment/sequence_mapper_notifier.hpp @@ -197,34 +197,71 @@ class SequenceMapperNotifierMPI : public SequenceMapperNotifier { } }; +class MapLibBase { +public: + virtual void operator() (const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const = 0; + virtual void operator() (const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const = 0; + virtual void operator() (const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const = 0; + virtual void operator() (const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const = 0; + + template + void operator() (SequenceMapperListener* listener, const SequenceMapper& mapper, Streams& streams) const { + this->operator() (std::vector(1, listener), mapper, streams); + } +}; + +class MapLibFunc : public MapLibBase { +public: + void operator() (const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const override { + MapLib(listeners, mapper, streams); + } + void operator() (const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const override { + MapLib(listeners, mapper, streams); + } + void operator() (const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const override { + MapLib(listeners, mapper, streams); + } + void operator() (const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const override { + MapLib(listeners, mapper, streams); + } -template -void ProcessLibraryFewListeners(const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) { - SequenceMapperNotifier notifier; - for (auto listener : listeners) { - notifier.Subscribe(listener); - } - notifier.ProcessLibrary(streams, mapper); -} - -template -void ProcessLibraryMPIFewListeners(const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) { - SequenceMapperNotifierMPI notifier; - for (auto listener : listeners) { - notifier.Subscribe(listener); - } - notifier.ProcessLibrary(streams, mapper); -} - -template -void ProcessLibrary(SequenceMapperListener* listener, const SequenceMapper& mapper, io::ReadStreamList& streams) { - ProcessLibraryFewListeners({listener}, mapper, streams); -} - -template -void ProcessLibraryMPI(SequenceMapperListener* listener, const SequenceMapper& mapper, io::ReadStreamList& streams) { - 
ProcessLibraryMPIFewListeners({listener}, mapper, streams); -} +private: + template + void MapLib(const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const { + SequenceMapperNotifier notifier; + for (auto listener: listeners) { + notifier.Subscribe(listener); + } + notifier.ProcessLibrary(streams, mapper); + } +}; + + +class MapLibFuncMPI : public MapLibBase { +public: + void operator() (const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const override { + MapLibMPI(listeners, mapper, streams); + } + void operator() (const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const override { + MapLibMPI(listeners, mapper, streams); + } + void operator() (const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const override { + MapLibMPI(listeners, mapper, streams); + } + void operator() (const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const override { + MapLibMPI(listeners, mapper, streams); + } + +private: + template + void MapLibMPI(const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const { + SequenceMapperNotifierMPI notifier; + for (auto listener: listeners) { + notifier.Subscribe(listener); + } + notifier.ProcessLibrary(streams, mapper); + } +}; } // namespace debruijn_graph diff --git a/src/common/paired_info/paired_info_utils.cpp b/src/common/paired_info/paired_info_utils.cpp index 58ec4abc4e..b09af6388c 100644 --- a/src/common/paired_info/paired_info_utils.cpp +++ b/src/common/paired_info/paired_info_utils.cpp @@ -221,7 +221,7 @@ std::unique_ptr FillEdgePairFilter(const Graph &graph, const SequenceMapperNotifier::SequenceMapperT &mapper, SequencingLib &reads, size_t edgepairs, - const MapLibFuncT& map_lib_fun, + const MapLibBase& map_lib_fun, size_t num_readers) { auto filter = std::make_unique( [](const std::pair &e, uint64_t seed) { @@ -237,7 +237,7 @@ std::unique_ptr FillEdgePairFilter(const Graph &graph, auto stream = paired_binary_readers(reads, /*followed by rc*/false, 0, /*include merged*/true, num_readers); map_lib_fun(&filter_counter, mapper, stream); + return filter; } } - diff --git a/src/common/paired_info/paired_info_utils.hpp b/src/common/paired_info/paired_info_utils.hpp index 88055eafc5..ab0bb44ed5 100644 --- a/src/common/paired_info/paired_info_utils.hpp +++ b/src/common/paired_info/paired_info_utils.hpp @@ -44,7 +44,7 @@ std::unique_ptr FillEdgePairFilter(const debruijn_graph::Graph const debruijn_graph::SequenceMapper &mapper, SequencingLib &reads, size_t edgepairs, - const MapLibFuncT& map_lib_fun, + const debruijn_graph::MapLibBase& map_lib_fun, size_t num_readers = 0); } diff --git a/src/projects/hpcspades/mismatch_correction_mpi.hpp b/src/projects/hpcspades/mismatch_correction_mpi.hpp index beb33f3226..d7e176b480 100644 --- a/src/projects/hpcspades/mismatch_correction_mpi.hpp +++ b/src/projects/hpcspades/mismatch_correction_mpi.hpp @@ -18,7 +18,7 @@ namespace debruijn_graph { void run(graph_pack::GraphPack &gp, const char *) override { EnsureBasicMapping(gp); - size_t corrected = mismatches::MismatchShallNotPass(ProcessLibraryMPI, gp, 2, partask::overall_num_threads()). + size_t corrected = mismatches::MismatchShallNotPass(MapLibFuncMPI(), gp, 2, partask::overall_num_threads()). 
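After this patch the algorithmic code is parameterised by a MapLibBase functor and no longer cares whether it runs on one node or many: MapLibFunc drives a plain SequenceMapperNotifier, MapLibFuncMPI the MPI one, and call sites such as the two above simply pass the functor they want. A minimal caller looks roughly like this; the template parameters, which are stripped in the hunks above, are filled in here as an assumption:

// Sketch: the same mapping code, driven by whichever functor the caller picks.
void CountLibrary(const MapLibBase &map_lib,
                  const SequenceMapper<Graph> &mapper,
                  io::ReadStreamList<io::PairedReadSeq> &streams,
                  SequenceMapperListener *listener) {
    // One call hides the difference between MapLibFunc() (single node)
    // and MapLibFuncMPI() (streams spread over all MPI ranks).
    map_lib(listener, mapper, streams);
}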
ParallelStopAllMismatches(1); INFO("Corrected " << corrected << " nucleotides"); } diff --git a/src/projects/hpcspades/pair_info_count_mpi.hpp b/src/projects/hpcspades/pair_info_count_mpi.hpp index 5cc117faa9..3c326e8041 100644 --- a/src/projects/hpcspades/pair_info_count_mpi.hpp +++ b/src/projects/hpcspades/pair_info_count_mpi.hpp @@ -19,10 +19,7 @@ namespace debruijn_graph { preliminary ? "late_pair_info_count_preliminary" : "late_pair_info_count") {} void run(graph_pack::GraphPack &gp, const char* s) override { - execute(gp, s, ProcessLibraryMPI, - ProcessLibraryMPIFewListeners, - ProcessLibraryMPIFewListeners, - partask::overall_num_threads()); + execute(gp, s, MapLibFuncMPI(), partask::overall_num_threads()); } }; diff --git a/src/projects/spades/mismatch_correction.cpp b/src/projects/spades/mismatch_correction.cpp index 6edd23506d..dfdcc457ec 100644 --- a/src/projects/spades/mismatch_correction.cpp +++ b/src/projects/spades/mismatch_correction.cpp @@ -440,7 +440,7 @@ namespace mismatches { return CorrectAllEdges(statistics); } - MismatchShallNotPass::MismatchShallNotPass(const ProccessLibFuncT &processLib, graph_pack::GraphPack &gp, + MismatchShallNotPass::MismatchShallNotPass(const MapLibBase &processLib, graph_pack::GraphPack &gp, double relative_threshold, size_t num_readers) : gp_(gp), graph_(gp.get_mutable()), @@ -467,8 +467,9 @@ namespace mismatches { void MismatchCorrection::run(graph_pack::GraphPack &gp, const char*) { EnsureBasicMapping(gp); - size_t corrected = mismatches::MismatchShallNotPass(ProcessLibrary, gp, 2, partask::overall_num_threads()). - ParallelStopAllMismatches(1); + size_t corrected = + mismatches::MismatchShallNotPass(MapLibFunc(), gp, 2, partask::overall_num_threads()) + .ParallelStopAllMismatches(1); INFO("Corrected " << corrected << " nucleotides"); } diff --git a/src/projects/spades/mismatch_correction.hpp b/src/projects/spades/mismatch_correction.hpp index 511e31c3c2..555e621b5d 100644 --- a/src/projects/spades/mismatch_correction.hpp +++ b/src/projects/spades/mismatch_correction.hpp @@ -21,14 +21,12 @@ namespace debruijn_graph { private: typedef typename Graph::EdgeId EdgeId; typedef typename Graph::VertexId VertexId; - typedef std::function &, - io::ReadStreamList &streams)> ProccessLibFuncT; graph_pack::GraphPack &gp_; Graph &graph_; const size_t k_; const double relative_threshold_; - const ProccessLibFuncT &proccess_lib_func_; + const MapLibBase &proccess_lib_func_; const size_t num_readers_; EdgeId CorrectNucl(EdgeId edge, size_t position, char nucl); @@ -44,7 +42,7 @@ namespace debruijn_graph { size_t ParallelStopMismatchIteration(); public: - MismatchShallNotPass(const ProccessLibFuncT &processLib, graph_pack::GraphPack &gp, + MismatchShallNotPass(const MapLibBase &processLib, graph_pack::GraphPack &gp, double relative_threshold = 1.5, size_t num_readers = 0); diff --git a/src/projects/spades/pair_info_count.cpp b/src/projects/spades/pair_info_count.cpp index df10a3dddd..63c348cadf 100644 --- a/src/projects/spades/pair_info_count.cpp +++ b/src/projects/spades/pair_info_count.cpp @@ -103,9 +103,10 @@ bool ShouldObtainSingleReadsPaths(size_t ilib) { } size_t ProcessSingleReads(graph_pack::GraphPack &gp, size_t ilib, - const PairInfoCountBase::MapSingleSeqLibFuncT & map_single_seq_lib_func, - const PairInfoCountBase::MapSingleLibFuncT & map_single_lib_func, - size_t num_readers = 0, bool use_binary = true, bool map_paired = false) { + const MapLibBase & map_lib_func, + size_t num_readers = 0, + bool use_binary = true, + bool map_paired = false) { 
//FIXME make const auto& reads = cfg::get_writable().ds.reads[ilib]; const auto &graph = gp.get(); @@ -133,11 +134,11 @@ size_t ProcessSingleReads(graph_pack::GraphPack &gp, size_t ilib, auto mapper_ptr = ChooseProperMapper(gp, reads); if (use_binary) { auto single_streams = single_binary_readers(reads, false, map_paired, num_readers); - map_single_seq_lib_func(listeners, *mapper_ptr, single_streams); + map_lib_func(listeners, *mapper_ptr, single_streams); } else { auto single_streams = single_easy_readers(reads, false, map_paired, /*handle Ns*/false); - map_single_lib_func(listeners, *mapper_ptr, single_streams); + map_lib_func(listeners, *mapper_ptr, single_streams); } return single_long_reads.size(); @@ -145,16 +146,11 @@ size_t ProcessSingleReads(graph_pack::GraphPack &gp, size_t ilib, } // namespace void PairInfoCount::run(graph_pack::GraphPack &gp, const char *s) { - execute(gp, s, ProcessLibrary, - ProcessLibraryFewListeners, - ProcessLibraryFewListeners, - partask::overall_num_threads()); + execute(gp, s, MapLibFunc(), partask::overall_num_threads()); } void PairInfoCountBase::execute(graph_pack::GraphPack &gp, const char *, - const PairInfoCountBase::MapPairLibFuncT & map_pair_lib_func, - const PairInfoCountBase::MapSingleSeqLibFuncT & map_single_seq_lib_func, - const PairInfoCountBase::MapSingleLibFuncT & map_single_lib_func, + const MapLibBase &map_lib_func, size_t num_readers) { InitRRIndices(gp); EnsureBasicMapping(gp); @@ -174,7 +170,7 @@ void PairInfoCountBase::execute(graph_pack::GraphPack &gp, const char *, continue; } else if (lib.is_contig_lib()) { INFO("Mapping contigs library #" << i); - ProcessSingleReads(gp, i, map_single_seq_lib_func, map_single_lib_func, num_readers, false); + ProcessSingleReads(gp, i, map_lib_func, num_readers, false); } else { if (lib.is_paired()) { INFO("Estimating insert size for library #" << i); @@ -214,7 +210,7 @@ void PairInfoCountBase::execute(graph_pack::GraphPack &gp, const char *, if (filter_threshold && lib.type() == io::LibraryType::PairedEnd) { INFO("Filtering data for library #" << i); filter = paired_info::FillEdgePairFilter(graph, *ChooseProperMapper(gp, lib), lib, edgepairs, - map_pair_lib_func, num_readers); + map_lib_func, num_readers); } INFO("Mapping library #" << i); @@ -237,7 +233,7 @@ void PairInfoCountBase::execute(graph_pack::GraphPack &gp, const char *, if (ShouldObtainSingleReadsPaths(i) || ShouldObtainLibCoverage()) { cfg::get_writable().use_single_reads |= ShouldObtainSingleReadsPaths(i); INFO("Mapping single reads of library #" << i); - size_t n = ProcessSingleReads(gp, i, map_single_seq_lib_func, map_single_lib_func, num_readers, /*use_binary*/true, /*map_paired*/true); + size_t n = ProcessSingleReads(gp, i, map_lib_func, num_readers, /*use_binary*/true, /*map_paired*/true); INFO("Total paths obtained from single reads: " << n); } } diff --git a/src/projects/spades/pair_info_count.hpp b/src/projects/spades/pair_info_count.hpp index a50b658e48..a9b311bb67 100644 --- a/src/projects/spades/pair_info_count.hpp +++ b/src/projects/spades/pair_info_count.hpp @@ -14,21 +14,7 @@ namespace debruijn_graph { class PairInfoCountBase { public: - typedef std::function &, - io::ReadStreamList &streams)> MapPairLibFuncT; - - typedef std::function&, - const debruijn_graph::SequenceMapper &, - io::ReadStreamList &streams)> MapSingleSeqLibFuncT; - - typedef std::function&, - const debruijn_graph::SequenceMapper &, - io::ReadStreamList &streams)> MapSingleLibFuncT; - - void execute(graph_pack::GraphPack &gp, const char *, const 
MapPairLibFuncT&, - const MapSingleSeqLibFuncT&, - const MapSingleLibFuncT&, size_t num_readers=0); + void execute(graph_pack::GraphPack &gp, const char *, const MapLibBase&, size_t num_readers=0); }; class PairInfoCount : public PairInfoCountBase, public spades::AssemblyStage { From 0ebac0008c6bb9d88081a0edd0ee582ac0676ac6 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Mon, 29 Nov 2021 18:35:59 +0300 Subject: [PATCH 069/102] Separate SeqMapperNotifier --- .../alignment/sequence_mapper_notifier.cpp | 24 ---- .../alignment/sequence_mapper_notifier.hpp | 90 -------------- src/projects/hpcspades/CMakeLists.txt | 2 +- src/projects/hpcspades/common/CMakeLists.txt | 1 + .../hpcspades/common/alignment/CMakeLists.txt | 12 ++ .../sequence_mapper_notifier_mpi.cpp | 34 ++++++ .../sequence_mapper_notifier_mpi.hpp | 112 ++++++++++++++++++ .../hpcspades/distance_estimation_mpi.hpp | 2 + src/projects/hpcspades/gap_closer_mpi.hpp | 2 +- .../hpcspades/mismatch_correction_mpi.hpp | 1 + .../hpcspades/pair_info_count_mpi.hpp | 2 +- src/projects/spades/pair_info_count.cpp | 2 +- 12 files changed, 166 insertions(+), 118 deletions(-) create mode 100644 src/projects/hpcspades/common/alignment/CMakeLists.txt create mode 100644 src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.cpp create mode 100644 src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.hpp diff --git a/src/common/alignment/sequence_mapper_notifier.cpp b/src/common/alignment/sequence_mapper_notifier.cpp index 6c33e343fd..70518b96b8 100644 --- a/src/common/alignment/sequence_mapper_notifier.cpp +++ b/src/common/alignment/sequence_mapper_notifier.cpp @@ -13,30 +13,6 @@ #include "io/reads/read_stream_vector.hpp" namespace debruijn_graph { - -void SequenceMapperNotifierMPI::PyramidMergeMPI(SequenceMapperListener &listener) { - size_t mpi_size = partask::world_size(); - size_t mpi_rank = partask::world_rank(); - const size_t deadbeef = 0xDEADBEEF; - - for (size_t step = 1; step < mpi_size; step *= 2) { - if ((mpi_rank % (2*step) == 0) && (mpi_rank + step < mpi_size)) { - partask::InputMPIStream is(mpi_rank + step); - size_t sz; - io::binary::BinRead(is, sz); - VERIFY_MSG(sz == deadbeef, "Listener type: " << typeid(listener).name()); - listener.MergeFromStream(is); - io::binary::BinRead(is, sz); - VERIFY_MSG(sz == deadbeef, "Listener type: " << typeid(listener).name()); - } else if (mpi_rank % (2*step) == step) { - partask::OutputMPIStream os(mpi_rank - step); - io::binary::BinWrite(os, deadbeef); - listener.Serialize(os); - io::binary::BinWrite(os, deadbeef); - } - } -} - SequenceMapperNotifier::SequenceMapperNotifier(size_t lib_count) : listeners_(lib_count) {} diff --git a/src/common/alignment/sequence_mapper_notifier.hpp b/src/common/alignment/sequence_mapper_notifier.hpp index b5b0baf789..c0c0470278 100644 --- a/src/common/alignment/sequence_mapper_notifier.hpp +++ b/src/common/alignment/sequence_mapper_notifier.hpp @@ -16,7 +16,6 @@ #include "io/reads/paired_read.hpp" #include "io/reads/read_stream_vector.hpp" #include "utils/perf/timetracer.hpp" -#include "pipeline/partask_mpi.hpp" #include #include @@ -135,68 +134,6 @@ class SequenceMapperNotifier { std::vector > listeners_; //first vector's size = count libs }; - -class SequenceMapperNotifierMPI : public SequenceMapperNotifier { - void PyramidMergeMPI(SequenceMapperListener &listener); - -public: - using SequenceMapperNotifier::SequenceMapperNotifier; - - template - void ProcessLibrary(io::ReadStreamList& streams, - size_t lib_index, const SequenceMapperT& 
mapper, size_t threads_count = 0) { - INFO("ProcessLibraryMPI started"); - // Select streams - std::vector chunks = partask::chunks_rr(streams.size()); - INFO("Selected streams: " << chunks); - - partask::execute_on_subset(streams, chunks, - [&](io::ReadStreamList& local_streams) { - // Run ProcessLibrary - INFO("Running ProcessLibrary"); - SequenceMapperNotifier::ProcessLibrary(local_streams, lib_index, mapper, threads_count); - INFO("ProcessLibrary done"); - }); - - INFO("Merging results..."); - for (const auto& listener : listeners_[lib_index]) { - INFO("Merging listener " << listener->name()); - PyramidMergeMPI(*listener); - } - INFO("Listeners merged"); - - if (partask::world_size() > 1) { - const size_t deadbeef = 0xDEADBEEF; - INFO("Syncing listeners..."); - if (partask::master()) { - partask::OutputMPIStreamBcast os(0); - for (const auto& listener : listeners_[lib_index]) { - io::binary::BinWrite(os, deadbeef); - listener->Serialize(os); - io::binary::BinWrite(os, deadbeef); - } - } else { - partask::InputMPIStreamBcast is(0); - for (const auto& listener : listeners_[lib_index]) { - size_t sz; - io::binary::BinRead(is, sz); - VERIFY(sz == deadbeef); - listener->Deserialize(is); - io::binary::BinRead(is, sz); - VERIFY(sz == deadbeef); - } - } - INFO("Listeners synced"); - } - } - - template - void ProcessLibrary(io::ReadStreamList& streams, - const SequenceMapperT& mapper, size_t threads_count = 0) { - return ProcessLibrary(streams, 0, mapper, threads_count); - } -}; - class MapLibBase { public: virtual void operator() (const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const = 0; @@ -236,33 +173,6 @@ class MapLibFunc : public MapLibBase { } }; - -class MapLibFuncMPI : public MapLibBase { -public: - void operator() (const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const override { - MapLibMPI(listeners, mapper, streams); - } - void operator() (const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const override { - MapLibMPI(listeners, mapper, streams); - } - void operator() (const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const override { - MapLibMPI(listeners, mapper, streams); - } - void operator() (const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const override { - MapLibMPI(listeners, mapper, streams); - } - -private: - template - void MapLibMPI(const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const { - SequenceMapperNotifierMPI notifier; - for (auto listener: listeners) { - notifier.Subscribe(listener); - } - notifier.ProcessLibrary(streams, mapper); - } -}; - } // namespace debruijn_graph diff --git a/src/projects/hpcspades/CMakeLists.txt b/src/projects/hpcspades/CMakeLists.txt index fe30654d2c..24a07be5b4 100644 --- a/src/projects/hpcspades/CMakeLists.txt +++ b/src/projects/hpcspades/CMakeLists.txt @@ -11,7 +11,7 @@ add_subdirectory(common) add_library(spades-stages-hpc STATIC distance_estimation_mpi.cpp) set(HPCSPADES_SRC pipeline.cpp ../../projects/spades/series_analysis.cpp ../../projects/mts/contig_abundance.cpp) -set(HPCSPADES_LIB spades-stages-hpc spades-stages graphio common_modules paired_info_mpi stages-mpi ${COMMON_LIBRARIES}) +set(HPCSPADES_LIB spades-stages-hpc spades-stages graphio common_modules paired_info_mpi stages-mpi alignment_mpi ${COMMON_LIBRARIES}) add_executable(spades-hpc main_mpi.cpp ${HPCSPADES_SRC}) diff --git 
a/src/projects/hpcspades/common/CMakeLists.txt b/src/projects/hpcspades/common/CMakeLists.txt index 32ffad2690..0a6e1783c7 100644 --- a/src/projects/hpcspades/common/CMakeLists.txt +++ b/src/projects/hpcspades/common/CMakeLists.txt @@ -8,4 +8,5 @@ project(common_modules_mpi CXX) add_subdirectory(paired_info) +add_subdirectory(alignment) add_subdirectory(stages) diff --git a/src/projects/hpcspades/common/alignment/CMakeLists.txt b/src/projects/hpcspades/common/alignment/CMakeLists.txt new file mode 100644 index 0000000000..ad896b644b --- /dev/null +++ b/src/projects/hpcspades/common/alignment/CMakeLists.txt @@ -0,0 +1,12 @@ +############################################################################ +# Copyright (c) 2021 Saint Petersburg State University +# All Rights Reserved +# See file LICENSE for details. +############################################################################ + +project(alignment_mpi CXX) + +add_library(alignment_mpi STATIC + sequence_mapper_notifier_mpi.cpp) + +target_link_libraries(alignment_mpi modules) diff --git a/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.cpp b/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.cpp new file mode 100644 index 0000000000..73c177d733 --- /dev/null +++ b/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.cpp @@ -0,0 +1,34 @@ +//*************************************************************************** +//* Copyright (c) 2021 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include "sequence_mapper_notifier_mpi.hpp" + +#include "io/reads/read_stream_vector.hpp" + +namespace debruijn_graph { + void SequenceMapperNotifierMPI::PyramidMergeMPI(SequenceMapperListener &listener) { + size_t mpi_size = partask::world_size(); + size_t mpi_rank = partask::world_rank(); + const size_t deadbeef = 0xDEADBEEF; + + for (size_t step = 1; step < mpi_size; step *= 2) { + if ((mpi_rank % (2*step) == 0) && (mpi_rank + step < mpi_size)) { + partask::InputMPIStream is(mpi_rank + step); + size_t sz; + io::binary::BinRead(is, sz); + VERIFY_MSG(sz == deadbeef, "Listener type: " << typeid(listener).name()); + listener.MergeFromStream(is); + io::binary::BinRead(is, sz); + VERIFY_MSG(sz == deadbeef, "Listener type: " << typeid(listener).name()); + } else if (mpi_rank % (2*step) == step) { + partask::OutputMPIStream os(mpi_rank - step); + io::binary::BinWrite(os, deadbeef); + listener.Serialize(os); + io::binary::BinWrite(os, deadbeef); + } + } + } +} // namespace debruijn_graph diff --git a/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.hpp b/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.hpp new file mode 100644 index 0000000000..3a8509904e --- /dev/null +++ b/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.hpp @@ -0,0 +1,112 @@ +//*************************************************************************** +//* Copyright (c) 2015-2021 Saint Petersburg State University +//* Copyright (c) 2011-2014 Saint Petersburg Academic University +//* All Rights Reserved +//* See file LICENSE for details. 
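PyramidMergeMPI above reduces the per-rank listener states in log2(world_size) rounds: at step s every rank divisible by 2s receives from rank + s and merges, while rank + s serializes its state and drops out of later rounds. With four ranks that is 1 to 0 and 3 to 2 in the first round, then 2 to 0, so rank 0 ends up holding the combined result; the 0xDEADBEEF words written around each payload are only framing checks. Stripped of that framing, the loop is:

// Pairwise (pyramid) reduction of one listener's state over MPI ranks.
void PyramidMerge(SequenceMapperListener &listener) {
    size_t size = partask::world_size();
    size_t rank = partask::world_rank();
    for (size_t step = 1; step < size; step *= 2) {
        if (rank % (2 * step) == 0 && rank + step < size) {
            partask::InputMPIStream is(rank + step);   // receive the partner's state
            listener.MergeFromStream(is);              // fold it into ours
        } else if (rank % (2 * step) == step) {
            partask::OutputMPIStream os(rank - step);  // send our state downwards
            listener.Serialize(os);                    // this rank is done after sending
        }
    }
}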
+//*************************************************************************** + +#pragma once + +#include "alignment/sequence_mapper_notifier.hpp" +#include "alignment/sequence_mapper_fwd.hpp" + +#include "assembly_graph/paths/mapping_path.hpp" +#include "assembly_graph/core/graph.hpp" +#include "io/reads/paired_read.hpp" +#include "io/reads/read_stream_vector.hpp" + +#include "utils/perf/timetracer.hpp" +#include "pipeline/partask_mpi.hpp" + +#include +#include + +namespace debruijn_graph { + class SequenceMapperNotifierMPI : public SequenceMapperNotifier { + void PyramidMergeMPI(SequenceMapperListener &listener); + + public: + using SequenceMapperNotifier::SequenceMapperNotifier; + + template + void ProcessLibrary(io::ReadStreamList& streams, + size_t lib_index, const SequenceMapperT& mapper, size_t threads_count = 0) { + INFO("ProcessLibraryMPI started"); + // Select streams + std::vector chunks = partask::chunks_rr(streams.size()); + INFO("Selected streams: " << chunks); + + partask::execute_on_subset(streams, chunks, + [&](io::ReadStreamList& local_streams) { + // Run ProcessLibrary + INFO("Running ProcessLibrary"); + SequenceMapperNotifier::ProcessLibrary(local_streams, lib_index, mapper, threads_count); + INFO("ProcessLibrary done"); + }); + + INFO("Merging results..."); + for (const auto& listener : listeners_[lib_index]) { + INFO("Merging listener " << listener->name()); + PyramidMergeMPI(*listener); + } + INFO("Listeners merged"); + + if (partask::world_size() > 1) { + const size_t deadbeef = 0xDEADBEEF; + INFO("Syncing listeners..."); + if (partask::master()) { + partask::OutputMPIStreamBcast os(0); + for (const auto& listener : listeners_[lib_index]) { + io::binary::BinWrite(os, deadbeef); + listener->Serialize(os); + io::binary::BinWrite(os, deadbeef); + } + } else { + partask::InputMPIStreamBcast is(0); + for (const auto& listener : listeners_[lib_index]) { + size_t sz; + io::binary::BinRead(is, sz); + VERIFY(sz == deadbeef); + listener->Deserialize(is); + io::binary::BinRead(is, sz); + VERIFY(sz == deadbeef); + } + } + INFO("Listeners synced"); + } + } + + template + void ProcessLibrary(io::ReadStreamList& streams, + const SequenceMapperT& mapper, size_t threads_count = 0) { + return ProcessLibrary(streams, 0, mapper, threads_count); + } + }; + + class MapLibFuncMPI : public MapLibBase { + public: + void operator() (const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const override { + MapLibMPI(listeners, mapper, streams); + } + void operator() (const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const override { + MapLibMPI(listeners, mapper, streams); + } + void operator() (const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const override { + MapLibMPI(listeners, mapper, streams); + } + void operator() (const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const override { + MapLibMPI(listeners, mapper, streams); + } + + private: + template + void MapLibMPI(const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const { + SequenceMapperNotifierMPI notifier; + for (auto listener: listeners) { + notifier.Subscribe(listener); + } + notifier.ProcessLibrary(streams, mapper); + } + }; + +} // namespace debruijn_graph diff --git a/src/projects/hpcspades/distance_estimation_mpi.hpp b/src/projects/hpcspades/distance_estimation_mpi.hpp index 562cf4185e..285913641c 100644 --- 
a/src/projects/hpcspades/distance_estimation_mpi.hpp +++ b/src/projects/hpcspades/distance_estimation_mpi.hpp @@ -9,6 +9,8 @@ #include "common/pipeline/mpi_stage.hpp" #include "projects/spades/distance_estimation.hpp" +#include "common/alignment/sequence_mapper_notifier_mpi.hpp" + namespace debruijn_graph { class DistanceEstimationMPI : public DistanceEstimationBase, public spades::MPIAssemblyStage { diff --git a/src/projects/hpcspades/gap_closer_mpi.hpp b/src/projects/hpcspades/gap_closer_mpi.hpp index 60432da86c..e137ccac07 100644 --- a/src/projects/hpcspades/gap_closer_mpi.hpp +++ b/src/projects/hpcspades/gap_closer_mpi.hpp @@ -9,7 +9,7 @@ #pragma once #include "projects/spades/gap_closer.hpp" -#include "alignment/sequence_mapper_notifier.hpp" +#include "common/alignment/sequence_mapper_notifier_mpi.hpp" #include "pipeline/mpi_stage.hpp" #include "io/reads/io_helper.hpp" diff --git a/src/projects/hpcspades/mismatch_correction_mpi.hpp b/src/projects/hpcspades/mismatch_correction_mpi.hpp index d7e176b480..8ce0d7a913 100644 --- a/src/projects/hpcspades/mismatch_correction_mpi.hpp +++ b/src/projects/hpcspades/mismatch_correction_mpi.hpp @@ -7,6 +7,7 @@ #pragma once #include "projects/spades/mismatch_correction.hpp" +#include "common/alignment/sequence_mapper_notifier_mpi.hpp" #include "pipeline/mpi_stage.hpp" #include "pipeline/graph_pack_helpers.h" diff --git a/src/projects/hpcspades/pair_info_count_mpi.hpp b/src/projects/hpcspades/pair_info_count_mpi.hpp index 3c326e8041..218a3406ea 100644 --- a/src/projects/hpcspades/pair_info_count_mpi.hpp +++ b/src/projects/hpcspades/pair_info_count_mpi.hpp @@ -8,7 +8,7 @@ #pragma once #include "projects/spades/pair_info_count.hpp" -#include "alignment/sequence_mapper_notifier.hpp" +#include "common/alignment/sequence_mapper_notifier_mpi.hpp" #include "pipeline/mpi_stage.hpp" namespace debruijn_graph { diff --git a/src/projects/spades/pair_info_count.cpp b/src/projects/spades/pair_info_count.cpp index 63c348cadf..8bc6635e81 100644 --- a/src/projects/spades/pair_info_count.cpp +++ b/src/projects/spades/pair_info_count.cpp @@ -146,7 +146,7 @@ size_t ProcessSingleReads(graph_pack::GraphPack &gp, size_t ilib, } // namespace void PairInfoCount::run(graph_pack::GraphPack &gp, const char *s) { - execute(gp, s, MapLibFunc(), partask::overall_num_threads()); + execute(gp, s, MapLibFunc()); } void PairInfoCountBase::execute(graph_pack::GraphPack &gp, const char *, From 28927de4dff63189c2d92a488c6feaa5fc31f8ea Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Tue, 30 Nov 2021 20:07:39 +0300 Subject: [PATCH 070/102] PerfectHashMapperBuilder MPI --- src/common/kmer_index/ph_map/kmer_maps.hpp | 2 + .../kmer_index/ph_map/perfect_hash_map.hpp | 1 + .../ph_map/perfect_hash_map_builder.hpp | 75 +++++++++++-------- 3 files changed, 46 insertions(+), 32 deletions(-) diff --git a/src/common/kmer_index/ph_map/kmer_maps.hpp b/src/common/kmer_index/ph_map/kmer_maps.hpp index 8f32b67342..0c47049f52 100644 --- a/src/common/kmer_index/ph_map/kmer_maps.hpp +++ b/src/common/kmer_index/ph_map/kmer_maps.hpp @@ -159,6 +159,7 @@ class KeyStoringMap : public PerfectHashMap { } friend struct KeyStoringIndexBuilder; + friend struct KeyStoringIndexBuilderMPI; }; template, class StoringType = SimpleStoring> @@ -194,6 +195,7 @@ class KeyIteratingMap : public PerfectHashMap { } friend struct KeyIteratingIndexBuilder; + friend struct KeyIteratingIndexBuilderMPI; }; } diff --git a/src/common/kmer_index/ph_map/perfect_hash_map.hpp b/src/common/kmer_index/ph_map/perfect_hash_map.hpp index 
bbd4e971a2..bfd0859e9e 100644 --- a/src/common/kmer_index/ph_map/perfect_hash_map.hpp +++ b/src/common/kmer_index/ph_map/perfect_hash_map.hpp @@ -160,6 +160,7 @@ class PerfectHashMap : public IndexWrapper { const auto& values() const { return data_; } friend struct PerfectHashMapBuilder; + friend struct PerfectHashMapBuilderMPI; protected: void resize(size_t sz) { diff --git a/src/common/kmer_index/ph_map/perfect_hash_map_builder.hpp b/src/common/kmer_index/ph_map/perfect_hash_map_builder.hpp index 5b22553bd7..986d0737b6 100644 --- a/src/common/kmer_index/ph_map/perfect_hash_map_builder.hpp +++ b/src/common/kmer_index/ph_map/perfect_hash_map_builder.hpp @@ -20,13 +20,13 @@ struct PerfectHashMapBuilder { template kmers::KMerDiskStorage BuildIndex(PerfectHashMap &index, - Counter& counter, size_t bucket_num, + Counter &counter, size_t bucket_num, size_t thread_num, bool save_final = false) const { TIME_TRACE_SCOPE("PerfectHashMapBuilder::BuildIndex"); using KMerIndex = typename PerfectHashMap::KMerIndexT; - kmers::KMerIndexBuilder builder((unsigned)bucket_num, (unsigned)thread_num); + kmers::KMerIndexBuilder builder((unsigned) bucket_num, (unsigned) thread_num); auto res = builder.BuildIndex(*index.index_ptr_, counter, save_final); index.resize(res.total_kmers()); @@ -35,19 +35,21 @@ struct PerfectHashMapBuilder { template void BuildIndex(PerfectHashMap &index, - const KMerStorage& storage, size_t thread_num) const { + const KMerStorage &storage, size_t thread_num) const { TIME_TRACE_SCOPE("PerfectHashMapBuilder::BuildIndex"); using KMerIndex = typename PerfectHashMap::KMerIndexT; - kmers::KMerIndexBuilder builder(0, (unsigned)thread_num); + kmers::KMerIndexBuilder builder(0, (unsigned) thread_num); builder.BuildIndex(*index.index_ptr_, storage); index.resize(storage.total_kmers()); } +}; +struct PerfectHashMapBuilderMPI { template void BuildIndexMPI(PerfectHashMap &index, - KMerStorage& storage, bool save_final = true) const { + KMerStorage &storage, bool save_final = true) const { using KMerIndex = typename PerfectHashMap::KMerIndexT; kmers::KMerIndexBuilderMPI builder; @@ -104,7 +106,7 @@ struct CQFHashMapBuilder { struct KeyStoringIndexBuilder { template void BuildIndex(KeyStoringMap &index, - Counter& counter, size_t bucket_num, + Counter &counter, size_t bucket_num, size_t thread_num) const { auto res = phm_builder_.BuildIndex(index, counter, bucket_num, thread_num, true); VERIFY(!index.kmers_.get()); @@ -112,9 +114,14 @@ struct KeyStoringIndexBuilder { index.SortUniqueKMers(); } +private: + PerfectHashMapBuilder phm_builder_; +}; + +struct KeyStoringIndexBuilderMPI { template void BuildIndexMPI(KeyStoringMap &index, - KMerStorage& kmerstorage, bool save_final = true) const { + KMerStorage &kmerstorage, bool save_final = true) const { phm_builder_.BuildIndexMPI(index, kmerstorage, save_final); if (partask::master()) { VERIFY(!index.kmers_.get()); @@ -123,39 +130,44 @@ struct KeyStoringIndexBuilder { } } - private: - PerfectHashMapBuilder phm_builder_; +private: + PerfectHashMapBuilderMPI phm_builder_; }; struct KeyIteratingIndexBuilder { template void BuildIndex(KeyIteratingMap &index, - Counter& counter, size_t bucket_num, + Counter &counter, size_t bucket_num, size_t thread_num) const { auto res = phm_builder_.BuildIndex(index, counter, bucket_num, thread_num, true); index.kmers_ = res.final_kmers(); } - template - void BuildIndexMPI(KeyIteratingMap &index, - KMerStorage& kmerstorage, bool save_final = true) const { - phm_builder_.BuildIndexMPI(index, kmerstorage, save_final); - 
std::string final_kmers_file; - if (partask::master()) { - index.kmers_ = kmerstorage.final_kmers(); - final_kmers_file = index.kmers_->file(); - } - // MPI code leaked so far( TODO do smth with this - partask::broadcast(final_kmers_file); - if (partask::worker()) { - index.kmers_ = fs::tmp::acquire_temp_file(final_kmers_file); - index.kmers_->release(); +private: + PerfectHashMapBuilder phm_builder_; +}; + +struct KeyIteratingIndexBuilderMPI { + template + void BuildIndexMPI(KeyIteratingMap &index, + KMerStorage& kmerstorage, bool save_final = true) const { + phm_builder_.BuildIndexMPI(index, kmerstorage, save_final); + std::string final_kmers_file; + if (partask::master()) { + index.kmers_ = kmerstorage.final_kmers(); + final_kmers_file = index.kmers_->file(); + } + // MPI code leaked so far( TODO do smth with this + partask::broadcast(final_kmers_file); + if (partask::worker()) { + index.kmers_ = fs::tmp::acquire_temp_file(final_kmers_file); + index.kmers_->release(); + } + INFO("Final K-mers file: " << final_kmers_file); } - INFO("Final K-mers file: " << final_kmers_file); - } - private: - PerfectHashMapBuilder phm_builder_; + private: + PerfectHashMapBuilderMPI phm_builder_; }; template @@ -188,19 +200,18 @@ void BuildIndex(PerfectHashMap &index, template void BuildIndexMPI(PerfectHashMap &index, KMerStorage &storage, bool save_final = true) { - PerfectHashMapBuilder().BuildIndexMPI(index, storage, save_final); + PerfectHashMapBuilderMPI().BuildIndexMPI(index, storage, save_final); } template void BuildIndexMPI(KeyStoringMap &index, KMerStorage &storage, bool save_final = true) { - KeyStoringIndexBuilder().BuildIndexMPI(index, storage, save_final); + KeyStoringIndexBuilderMPI().BuildIndexMPI(index, storage, save_final); } template void BuildIndexMPI(KeyIteratingMap &index, KMerStorage &storage, bool save_final = true) { - KeyIteratingIndexBuilder().BuildIndexMPI(index, storage, save_final); + KeyIteratingIndexBuilderMPI().BuildIndexMPI(index, storage, save_final); } - } From 28da1f156a0e665d6ca411e320379721b531de5b Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Tue, 30 Nov 2021 22:05:54 +0300 Subject: [PATCH 071/102] move mpi_kmer_index_builder to mpi dir --- .../extension_index/kmer_extension_index_builder_mpi.hpp | 2 +- src/common/kmer_index/ph_map/perfect_hash_map_builder.hpp | 2 +- .../common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) rename src/{ => projects/hpcspades}/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp (98%) diff --git a/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp b/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp index f63a6112b5..1b31e9327f 100644 --- a/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp +++ b/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp @@ -10,7 +10,7 @@ #include "kmer_extension_index_builder.hpp" #include "kmer_index/kmer_mph/kmer_index_builder.hpp" -#include "kmer_index/kmer_mph/kmer_index_builder_mpi.hpp" +#include "projects/hpcspades/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp" #include "kmer_index/kmer_mph/kmer_splitters.hpp" #include "kmer_index/kmer_counting.hpp" #include "kmer_index/ph_map/perfect_hash_map_builder.hpp" diff --git a/src/common/kmer_index/ph_map/perfect_hash_map_builder.hpp b/src/common/kmer_index/ph_map/perfect_hash_map_builder.hpp index 986d0737b6..fe70bf9512 100644 --- a/src/common/kmer_index/ph_map/perfect_hash_map_builder.hpp 
+++ b/src/common/kmer_index/ph_map/perfect_hash_map_builder.hpp @@ -11,7 +11,7 @@ #include "kmer_maps.hpp" #include "cqf_hash_map.hpp" #include "kmer_index/kmer_mph/kmer_index_builder.hpp" -#include "kmer_index/kmer_mph/kmer_index_builder_mpi.hpp" +#include "projects/hpcspades/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp" #include "utils/perf/timetracer.hpp" namespace kmers { diff --git a/src/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp b/src/projects/hpcspades/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp similarity index 98% rename from src/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp rename to src/projects/hpcspades/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp index 1aee9c04e6..0ec5682b77 100644 --- a/src/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp +++ b/src/projects/hpcspades/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp @@ -5,8 +5,8 @@ //* See file LICENSE for details. //*************************************************************************** -#include "kmer_index_builder.hpp" -#include "kmer_buckets.hpp" +#include "kmer_index/kmer_mph/kmer_index_builder.hpp" +#include "kmer_index/kmer_mph/kmer_buckets.hpp" #include "pipeline/partask_mpi.hpp" From 973549737b6221e6d7141804e30641225197a1ca Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Tue, 30 Nov 2021 22:16:59 +0300 Subject: [PATCH 072/102] separate perfect_hash_map_builder_mpi --- .../kmer_extension_index_builder_mpi.hpp | 2 +- .../ph_map/perfect_hash_map_builder.hpp | 69 --------------- .../ph_map/perfect_hash_map_builder_mpi.hpp | 86 +++++++++++++++++++ 3 files changed, 87 insertions(+), 70 deletions(-) create mode 100644 src/projects/hpcspades/common/kmer_index/ph_map/perfect_hash_map_builder_mpi.hpp diff --git a/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp b/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp index 1b31e9327f..c312449da9 100644 --- a/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp +++ b/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp @@ -13,7 +13,7 @@ #include "projects/hpcspades/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp" #include "kmer_index/kmer_mph/kmer_splitters.hpp" #include "kmer_index/kmer_counting.hpp" -#include "kmer_index/ph_map/perfect_hash_map_builder.hpp" +#include "projects/hpcspades/common/kmer_index/ph_map/perfect_hash_map_builder_mpi.hpp" #include "io/reads/multifile_reader.hpp" #include "pipeline/partask_mpi.hpp" diff --git a/src/common/kmer_index/ph_map/perfect_hash_map_builder.hpp b/src/common/kmer_index/ph_map/perfect_hash_map_builder.hpp index fe70bf9512..86a513f6f6 100644 --- a/src/common/kmer_index/ph_map/perfect_hash_map_builder.hpp +++ b/src/common/kmer_index/ph_map/perfect_hash_map_builder.hpp @@ -46,18 +46,6 @@ struct PerfectHashMapBuilder { } }; -struct PerfectHashMapBuilderMPI { - template - void BuildIndexMPI(PerfectHashMap &index, - KMerStorage &storage, bool save_final = true) const { - using KMerIndex = typename PerfectHashMap::KMerIndexT; - - kmers::KMerIndexBuilderMPI builder; - size_t sz = builder.BuildIndexMPI(*index.index_ptr_, storage, save_final); - index.resize(sz); - } -}; - struct CQFHashMapBuilder { static uint64_t hash_64(uint64_t key, uint64_t mask) { key = (~key + (key << 21)) & mask; // key = (key << 21) - key - 1; @@ -118,22 +106,6 @@ struct KeyStoringIndexBuilder { PerfectHashMapBuilder phm_builder_; }; -struct KeyStoringIndexBuilderMPI { - template - void 
BuildIndexMPI(KeyStoringMap &index, - KMerStorage &kmerstorage, bool save_final = true) const { - phm_builder_.BuildIndexMPI(index, kmerstorage, save_final); - if (partask::master()) { - VERIFY(!index.kmers_.get()); - index.kmers_file_ = kmerstorage.final_kmers(); - index.SortUniqueKMers(); - } - } - -private: - PerfectHashMapBuilderMPI phm_builder_; -}; - struct KeyIteratingIndexBuilder { template void BuildIndex(KeyIteratingMap &index, @@ -147,29 +119,6 @@ struct KeyIteratingIndexBuilder { PerfectHashMapBuilder phm_builder_; }; -struct KeyIteratingIndexBuilderMPI { - template - void BuildIndexMPI(KeyIteratingMap &index, - KMerStorage& kmerstorage, bool save_final = true) const { - phm_builder_.BuildIndexMPI(index, kmerstorage, save_final); - std::string final_kmers_file; - if (partask::master()) { - index.kmers_ = kmerstorage.final_kmers(); - final_kmers_file = index.kmers_->file(); - } - // MPI code leaked so far( TODO do smth with this - partask::broadcast(final_kmers_file); - if (partask::worker()) { - index.kmers_ = fs::tmp::acquire_temp_file(final_kmers_file); - index.kmers_->release(); - } - INFO("Final K-mers file: " << final_kmers_file); - } - - private: - PerfectHashMapBuilderMPI phm_builder_; -}; - template void BuildIndex(KeyIteratingMap &index, Counter& counter, size_t bucket_num, @@ -196,22 +145,4 @@ void BuildIndex(PerfectHashMap &index, const KMerStorage& storage, size_t thread_num) { PerfectHashMapBuilder().BuildIndex(index, storage, thread_num); } - -template -void BuildIndexMPI(PerfectHashMap &index, - KMerStorage &storage, bool save_final = true) { - PerfectHashMapBuilderMPI().BuildIndexMPI(index, storage, save_final); -} - -template -void BuildIndexMPI(KeyStoringMap &index, - KMerStorage &storage, bool save_final = true) { - KeyStoringIndexBuilderMPI().BuildIndexMPI(index, storage, save_final); -} - -template -void BuildIndexMPI(KeyIteratingMap &index, - KMerStorage &storage, bool save_final = true) { - KeyIteratingIndexBuilderMPI().BuildIndexMPI(index, storage, save_final); -} } diff --git a/src/projects/hpcspades/common/kmer_index/ph_map/perfect_hash_map_builder_mpi.hpp b/src/projects/hpcspades/common/kmer_index/ph_map/perfect_hash_map_builder_mpi.hpp new file mode 100644 index 0000000000..ed75107021 --- /dev/null +++ b/src/projects/hpcspades/common/kmer_index/ph_map/perfect_hash_map_builder_mpi.hpp @@ -0,0 +1,86 @@ +#pragma once +//*************************************************************************** +//* Copyright (c) 2021 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. 
+//*************************************************************************** + +#include "kmer_index/ph_map/perfect_hash_map.hpp" +#include "kmer_index/ph_map/kmer_maps.hpp" +#include "kmer_index/ph_map/cqf_hash_map.hpp" + +#include "..//kmer_mph/kmer_index_builder_mpi.hpp" +#include "kmer_index/kmer_mph/kmer_index_builder.hpp" +#include "kmer_index/kmer_mph/kmer_splitters.hpp" +#include "common/utils/perf/timetracer.hpp" + +namespace kmers { + struct PerfectHashMapBuilderMPI { + template + void BuildIndexMPI(PerfectHashMap &index, + KMerStorage &storage, bool save_final = true) const { + using KMerIndex = typename PerfectHashMap::KMerIndexT; + + kmers::KMerIndexBuilderMPI builder; + size_t sz = builder.BuildIndexMPI(*index.index_ptr_, storage, save_final); + index.resize(sz); + } + }; + + struct KeyStoringIndexBuilderMPI { + template + void BuildIndexMPI(KeyStoringMap &index, + KMerStorage &kmerstorage, bool save_final = true) const { + phm_builder_.BuildIndexMPI(index, kmerstorage, save_final); + if (partask::master()) { + VERIFY(!index.kmers_.get()); + index.kmers_file_ = kmerstorage.final_kmers(); + index.SortUniqueKMers(); + } + } + + private: + PerfectHashMapBuilderMPI phm_builder_; + }; + + struct KeyIteratingIndexBuilderMPI { + template + void BuildIndexMPI(KeyIteratingMap &index, + KMerStorage& kmerstorage, bool save_final = true) const { + phm_builder_.BuildIndexMPI(index, kmerstorage, save_final); + std::string final_kmers_file; + if (partask::master()) { + index.kmers_ = kmerstorage.final_kmers(); + final_kmers_file = index.kmers_->file(); + } + // MPI code leaked so far( TODO do smth with this + partask::broadcast(final_kmers_file); + if (partask::worker()) { + index.kmers_ = fs::tmp::acquire_temp_file(final_kmers_file); + index.kmers_->release(); + } + INFO("Final K-mers file: " << final_kmers_file); + } + + private: + PerfectHashMapBuilderMPI phm_builder_; + }; + + template + void BuildIndexMPI(PerfectHashMap &index, + KMerStorage &storage, bool save_final = true) { + PerfectHashMapBuilderMPI().BuildIndexMPI(index, storage, save_final); + } + + template + void BuildIndexMPI(KeyStoringMap &index, + KMerStorage &storage, bool save_final = true) { + KeyStoringIndexBuilderMPI().BuildIndexMPI(index, storage, save_final); + } + + template + void BuildIndexMPI(KeyIteratingMap &index, + KMerStorage &storage, bool save_final = true) { + KeyIteratingIndexBuilderMPI().BuildIndexMPI(index, storage, save_final); + } +} From 26da67bef47691d887408b20af618bfc93e8c39c Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Tue, 30 Nov 2021 22:36:06 +0300 Subject: [PATCH 073/102] move kmer_extension_index_builder_mpi to mpi dir --- src/common/assembly_graph/CMakeLists.txt | 4 ---- src/common/kmer_index/ph_map/perfect_hash_map_builder.hpp | 1 - .../extension_index/kmer_extension_index_builder_mpi.hpp | 8 ++++---- src/projects/hpcspades/common/stages/construction_mpi.cpp | 2 +- 4 files changed, 5 insertions(+), 10 deletions(-) rename src/{ => projects/hpcspades}/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp (98%) diff --git a/src/common/assembly_graph/CMakeLists.txt b/src/common/assembly_graph/CMakeLists.txt index 717e2c792c..09dcbd7ce9 100644 --- a/src/common/assembly_graph/CMakeLists.txt +++ b/src/common/assembly_graph/CMakeLists.txt @@ -18,7 +18,3 @@ add_library(assembly_graph STATIC ../alignment/edge_index_refiller.cpp) target_link_libraries(assembly_graph utils llvm-support) - -if (MPI_FOUND) - target_link_libraries(assembly_graph ${MPI_LIBRARIES}) -endif() diff 
--git a/src/common/kmer_index/ph_map/perfect_hash_map_builder.hpp b/src/common/kmer_index/ph_map/perfect_hash_map_builder.hpp index 86a513f6f6..4e63cd9fdd 100644 --- a/src/common/kmer_index/ph_map/perfect_hash_map_builder.hpp +++ b/src/common/kmer_index/ph_map/perfect_hash_map_builder.hpp @@ -11,7 +11,6 @@ #include "kmer_maps.hpp" #include "cqf_hash_map.hpp" #include "kmer_index/kmer_mph/kmer_index_builder.hpp" -#include "projects/hpcspades/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp" #include "utils/perf/timetracer.hpp" namespace kmers { diff --git a/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp b/src/projects/hpcspades/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp similarity index 98% rename from src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp rename to src/projects/hpcspades/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp index c312449da9..41fa120174 100644 --- a/src/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp +++ b/src/projects/hpcspades/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp @@ -6,14 +6,14 @@ #pragma once -#include "kmer_extension_index.hpp" -#include "kmer_extension_index_builder.hpp" +#include "kmer_index/extension_index/kmer_extension_index.hpp" +#include "kmer_index/extension_index/kmer_extension_index_builder.hpp" #include "kmer_index/kmer_mph/kmer_index_builder.hpp" -#include "projects/hpcspades/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp" +#include "../kmer_mph/kmer_index_builder_mpi.hpp" #include "kmer_index/kmer_mph/kmer_splitters.hpp" #include "kmer_index/kmer_counting.hpp" -#include "projects/hpcspades/common/kmer_index/ph_map/perfect_hash_map_builder_mpi.hpp" +#include "../ph_map/perfect_hash_map_builder_mpi.hpp" #include "io/reads/multifile_reader.hpp" #include "pipeline/partask_mpi.hpp" diff --git a/src/projects/hpcspades/common/stages/construction_mpi.cpp b/src/projects/hpcspades/common/stages/construction_mpi.cpp index e1cba80008..353f02bf1e 100644 --- a/src/projects/hpcspades/common/stages/construction_mpi.cpp +++ b/src/projects/hpcspades/common/stages/construction_mpi.cpp @@ -16,7 +16,7 @@ #include "io/reads/coverage_filtering_read_wrapper.hpp" #include "io/reads/multifile_reader.hpp" #include "kmer_index/ph_map/coverage_hash_map_builder.hpp" -#include "kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp" +#include "../kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp" #include "modules/graph_construction.hpp" #include "pipeline/genomic_info.hpp" #include "pipeline/graph_pack.hpp" From 932636db431bf1cf8b59047fcef5e9af4193f2ed Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Wed, 11 Sep 2024 17:20:17 -0700 Subject: [PATCH 074/102] Move partask and mpi stage to hpcSPAdes --- src/common/pipeline/CMakeLists.txt | 11 +- src/common/pipeline/stage.cpp | 5 +- src/common/stages/test_mpi.cpp | 106 ------------------ src/projects/hpcspades/CMakeLists.txt | 2 +- src/projects/hpcspades/common/CMakeLists.txt | 1 + .../sequence_mapper_notifier_mpi.hpp | 2 +- .../debruijn_graph_constructor_mpi.hpp | 2 +- .../kmer_extension_index_builder_mpi.hpp | 6 +- .../kmer_mph/kmer_index_builder_mpi.hpp | 2 +- .../paired_info/distance_estimation.hpp | 2 +- .../hpcspades/common/pipeline/CMakeLists.txt | 11 ++ .../hpcspades}/common/pipeline/mpi_stage.cpp | 5 +- .../hpcspades}/common/pipeline/mpi_stage.hpp | 2 +- .../common/pipeline/partask_mpi.hpp | 0 
.../common/stages/construction_mpi.cpp | 4 +- .../common/stages/construction_mpi.hpp | 2 +- .../hpcspades/common/stages/test_mpi.cpp | 5 +- src/projects/hpcspades/gap_closer_mpi.hpp | 2 +- src/projects/hpcspades/main_mpi.cpp | 2 +- .../hpcspades/mismatch_correction_mpi.hpp | 2 +- .../hpcspades/pair_info_count_mpi.hpp | 2 +- src/projects/hpcspades/pipeline.cpp | 2 +- src/projects/spades/mismatch_correction.cpp | 3 +- src/projects/spades/pipeline.cpp | 7 +- 24 files changed, 38 insertions(+), 150 deletions(-) delete mode 100644 src/common/stages/test_mpi.cpp create mode 100644 src/projects/hpcspades/common/pipeline/CMakeLists.txt rename src/{ => projects/hpcspades}/common/pipeline/mpi_stage.cpp (99%) rename src/{ => projects/hpcspades}/common/pipeline/mpi_stage.hpp (99%) rename src/{ => projects/hpcspades}/common/pipeline/partask_mpi.hpp (100%) diff --git a/src/common/pipeline/CMakeLists.txt b/src/common/pipeline/CMakeLists.txt index ac507cada5..77467a5b1b 100644 --- a/src/common/pipeline/CMakeLists.txt +++ b/src/common/pipeline/CMakeLists.txt @@ -14,14 +14,5 @@ set(pipeline_src sequence_mapper_gp_api.cpp stage.cpp) -if (MPI_FOUND) - set(pipeline_src ${pipeline_src} mpi_stage.cpp) -endif() - add_library(pipeline STATIC ${pipeline_src}) - -if (MPI_FOUND) - target_link_libraries(pipeline binary_io path_extend input llvm-support library configs alignment ${MPI_LIBRARIES}) -else() - target_link_libraries(pipeline binary_io path_extend input llvm-support library configs alignment) -endif() +target_link_libraries(pipeline binary_io path_extend input llvm-support library configs alignment) diff --git a/src/common/pipeline/stage.cpp b/src/common/pipeline/stage.cpp index 619f1d0589..ab3165fd78 100644 --- a/src/common/pipeline/stage.cpp +++ b/src/common/pipeline/stage.cpp @@ -9,7 +9,6 @@ #include "stage.hpp" #include "graph_pack_helpers.h" -#include "partask_mpi.hpp" #include "io/binary/graph_pack.hpp" #include "io/dataset_support/read_converter.hpp" @@ -36,11 +35,9 @@ void AssemblyStage::load(graph_pack::GraphPack& gp, io::binary::FullPackIO().Load(p, gp); debruijn_graph::config::load_lib_data(p); - // FIXME: Should not be here - partask::critical_ordered([] { io::ConvertIfNeeded(cfg::get_writable().ds.reads, cfg::get().max_threads); }); + io::ConvertIfNeeded(cfg::get_writable().ds.reads, cfg::get().max_threads); } - void AssemblyStage::save(const graph_pack::GraphPack& gp, const std::filesystem::path &save_to, const char* prefix) const { diff --git a/src/common/stages/test_mpi.cpp b/src/common/stages/test_mpi.cpp deleted file mode 100644 index 120b20a98f..0000000000 --- a/src/common/stages/test_mpi.cpp +++ /dev/null @@ -1,106 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2018 Saint Petersburg State University -//* All Rights Reserved -//* See file LICENSE for details. 
-//*************************************************************************** - -#include "pipeline/partask_mpi.hpp" -#include "pipeline/mpi_stage.hpp" -#include "pipeline/stage.hpp" - -#include -#include -#include -#include -#include - -namespace debruijn_graph { - -class ArraySum { -public: - ArraySum(const std::string &message = "") : message_{message} {}; - ArraySum(const ArraySum&) = delete; - ArraySum(ArraySum&&) = default; - - std::string message_; - ArraySum(std::istream &is) { std::getline(is, message_); } - - std::ostream &serialize(std::ostream &os) const { return os << message_; } - - template - auto make_splitter(size_t n, const Data &data) { - size_t N = data.size(); - auto splitter = [N, n, i = size_t(0)](std::ostream &os, size_t /*node*/) mutable -> bool { - if (i == n) return false; - size_t begin = i * N / n; - size_t end = (i + 1) * N / n; - ++i; - os << begin << " " << end << " "; - return true; - }; - - return splitter; - }; - - template - void process(std::istream &is, std::ostream &os, const Data &data) { - INFO("MESSAGE: " << message_); - long long int sum = 0; -#pragma omp parallel reduction(+ : sum) - while (true) { - size_t begin, end; - bool exit = false; -#pragma omp critical - { - if (is.peek() == EOF || !(is >> begin >> end)) { - exit = true; - } else { - DEBUG("Extracted range: " << begin << " " << end); - } - } - if (exit) break; - for (size_t i = begin; i < end; ++i) { - sum += data[i]; - } - } - INFO("Computed sum: " << sum); - os << sum; - } - - auto merge(const std::vector &piss, ...) { - long long int sum = 0; - for (auto &pis : piss) { - long long int local_sum; - *pis >> local_sum; - sum += local_sum; - } - - return sum; - }; -}; - -class TestMPI : public spades::MPIAssemblyStage { -public: - TestMPI() : MPIAssemblyStage("Test MPI", "test_mpi") {} - - void run(graph_pack::GraphPack& /*gp*/, const char *) override { - INFO("TestMPI started"); - partask::TaskRegistry treg; - - const size_t N = 100000; - std::array data; - std::iota(data.begin(), data.end(), 1); - - auto job = treg.add(std::cref(data)); - treg.listen(); - - if (treg.master()) { - auto res = job("Message1"); - INFO("JOB RESULT: " << res); - } - - treg.stop_listening(); - } -}; - -} // namespace debruijn_graph diff --git a/src/projects/hpcspades/CMakeLists.txt b/src/projects/hpcspades/CMakeLists.txt index 24a07be5b4..3bb1f2f7fa 100644 --- a/src/projects/hpcspades/CMakeLists.txt +++ b/src/projects/hpcspades/CMakeLists.txt @@ -11,7 +11,7 @@ add_subdirectory(common) add_library(spades-stages-hpc STATIC distance_estimation_mpi.cpp) set(HPCSPADES_SRC pipeline.cpp ../../projects/spades/series_analysis.cpp ../../projects/mts/contig_abundance.cpp) -set(HPCSPADES_LIB spades-stages-hpc spades-stages graphio common_modules paired_info_mpi stages-mpi alignment_mpi ${COMMON_LIBRARIES}) +set(HPCSPADES_LIB spades-stages-hpc spades-stages graphio common_modules paired_info_mpi stages-mpi alignment_mpi pipeline_mpi ${COMMON_LIBRARIES}) add_executable(spades-hpc main_mpi.cpp ${HPCSPADES_SRC}) diff --git a/src/projects/hpcspades/common/CMakeLists.txt b/src/projects/hpcspades/common/CMakeLists.txt index 0a6e1783c7..5e28536c3d 100644 --- a/src/projects/hpcspades/common/CMakeLists.txt +++ b/src/projects/hpcspades/common/CMakeLists.txt @@ -10,3 +10,4 @@ project(common_modules_mpi CXX) add_subdirectory(paired_info) add_subdirectory(alignment) add_subdirectory(stages) +add_subdirectory(pipeline) \ No newline at end of file diff --git a/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.hpp 
b/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.hpp index 3a8509904e..25c4a71ce6 100644 --- a/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.hpp +++ b/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.hpp @@ -16,7 +16,7 @@ #include "io/reads/read_stream_vector.hpp" #include "utils/perf/timetracer.hpp" -#include "pipeline/partask_mpi.hpp" +#include "projects/hpcspades/common/pipeline/partask_mpi.hpp" #include #include diff --git a/src/projects/hpcspades/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp b/src/projects/hpcspades/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp index 323054ca3b..5046927f1e 100644 --- a/src/projects/hpcspades/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp +++ b/src/projects/hpcspades/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp @@ -5,7 +5,7 @@ //* See file LICENSE for details. //*************************************************************************** -#include "pipeline/partask_mpi.hpp" +#include "projects/hpcspades/common/pipeline/partask_mpi.hpp" #include "io/binary/graph.hpp" #include "common/assembly_graph/construction/debruijn_graph_constructor.hpp" diff --git a/src/projects/hpcspades/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp b/src/projects/hpcspades/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp index 41fa120174..5c5c22834b 100644 --- a/src/projects/hpcspades/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp +++ b/src/projects/hpcspades/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp @@ -10,13 +10,13 @@ #include "kmer_index/extension_index/kmer_extension_index_builder.hpp" #include "kmer_index/kmer_mph/kmer_index_builder.hpp" -#include "../kmer_mph/kmer_index_builder_mpi.hpp" +#include "projects/hpcspades/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp" #include "kmer_index/kmer_mph/kmer_splitters.hpp" #include "kmer_index/kmer_counting.hpp" -#include "../ph_map/perfect_hash_map_builder_mpi.hpp" +#include "projects/hpcspades/common/kmer_index/ph_map/perfect_hash_map_builder_mpi.hpp" #include "io/reads/multifile_reader.hpp" -#include "pipeline/partask_mpi.hpp" +#include "projects/hpcspades/common/pipeline/partask_mpi.hpp" namespace kmers { class DeBruijnExtensionIndexBuilderMPI : public DeBruijnExtensionIndexBuilder { diff --git a/src/projects/hpcspades/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp b/src/projects/hpcspades/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp index 0ec5682b77..b5c759af95 100644 --- a/src/projects/hpcspades/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp +++ b/src/projects/hpcspades/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp @@ -8,7 +8,7 @@ #include "kmer_index/kmer_mph/kmer_index_builder.hpp" #include "kmer_index/kmer_mph/kmer_buckets.hpp" -#include "pipeline/partask_mpi.hpp" +#include "projects/hpcspades/common/pipeline/partask_mpi.hpp" namespace kmers { template diff --git a/src/projects/hpcspades/common/paired_info/distance_estimation.hpp b/src/projects/hpcspades/common/paired_info/distance_estimation.hpp index f6c7ba8eef..35ed59a96e 100644 --- a/src/projects/hpcspades/common/paired_info/distance_estimation.hpp +++ b/src/projects/hpcspades/common/paired_info/distance_estimation.hpp @@ -8,7 +8,7 @@ #define MPI_DISTANCE_ESTIMATION_HPP_ #include "common/paired_info/distance_estimation.hpp" -#include "pipeline/partask_mpi.hpp" +#include 
"projects/hpcspades/common/pipeline/partask_mpi.hpp" namespace omnigraph { namespace de { diff --git a/src/projects/hpcspades/common/pipeline/CMakeLists.txt b/src/projects/hpcspades/common/pipeline/CMakeLists.txt new file mode 100644 index 0000000000..10a01e1f9b --- /dev/null +++ b/src/projects/hpcspades/common/pipeline/CMakeLists.txt @@ -0,0 +1,11 @@ +############################################################################ +# Copyright (c) 2023-2024 SPAdes team +# All Rights Reserved +# See file LICENSE for details. +############################################################################ + +project(pipeline_mpi CXX) + +add_library(pipeline_mpi STATIC mpi_stage.cpp) + +target_link_libraries(pipeline_mpi pipeline ${MPI_LIBRARIES}) diff --git a/src/common/pipeline/mpi_stage.cpp b/src/projects/hpcspades/common/pipeline/mpi_stage.cpp similarity index 99% rename from src/common/pipeline/mpi_stage.cpp rename to src/projects/hpcspades/common/pipeline/mpi_stage.cpp index f15e514b34..d5d8319e38 100644 --- a/src/common/pipeline/mpi_stage.cpp +++ b/src/projects/hpcspades/common/pipeline/mpi_stage.cpp @@ -4,10 +4,11 @@ //* See file LICENSE for details. //*************************************************************************** +#include "mpi_stage.hpp" +#include "partask_mpi.hpp" + #include "pipeline/stage.hpp" -#include "pipeline/mpi_stage.hpp" -#include "partask_mpi.hpp" #include "io/binary/graph_pack.hpp" #include "io/dataset_support/read_converter.hpp" #include "utils/logger/log_writers.hpp" diff --git a/src/common/pipeline/mpi_stage.hpp b/src/projects/hpcspades/common/pipeline/mpi_stage.hpp similarity index 99% rename from src/common/pipeline/mpi_stage.hpp rename to src/projects/hpcspades/common/pipeline/mpi_stage.hpp index b758cdc38a..533d4d20ab 100644 --- a/src/common/pipeline/mpi_stage.hpp +++ b/src/projects/hpcspades/common/pipeline/mpi_stage.hpp @@ -6,7 +6,7 @@ #pragma once -#include "stage.hpp" +#include "pipeline/stage.hpp" #include #include diff --git a/src/common/pipeline/partask_mpi.hpp b/src/projects/hpcspades/common/pipeline/partask_mpi.hpp similarity index 100% rename from src/common/pipeline/partask_mpi.hpp rename to src/projects/hpcspades/common/pipeline/partask_mpi.hpp diff --git a/src/projects/hpcspades/common/stages/construction_mpi.cpp b/src/projects/hpcspades/common/stages/construction_mpi.cpp index 353f02bf1e..f70f7952e8 100644 --- a/src/projects/hpcspades/common/stages/construction_mpi.cpp +++ b/src/projects/hpcspades/common/stages/construction_mpi.cpp @@ -20,8 +20,8 @@ #include "modules/graph_construction.hpp" #include "pipeline/genomic_info.hpp" #include "pipeline/graph_pack.hpp" -#include "pipeline/mpi_stage.hpp" -#include "pipeline/partask_mpi.hpp" +#include "projects/hpcspades/common/pipeline/mpi_stage.hpp" +#include "projects/hpcspades/common/pipeline/partask_mpi.hpp" #include "utils/filesystem/temporary.hpp" namespace debruijn_graph { diff --git a/src/projects/hpcspades/common/stages/construction_mpi.hpp b/src/projects/hpcspades/common/stages/construction_mpi.hpp index b3e2d1279c..43efbbf653 100644 --- a/src/projects/hpcspades/common/stages/construction_mpi.hpp +++ b/src/projects/hpcspades/common/stages/construction_mpi.hpp @@ -6,7 +6,7 @@ #pragma once -#include "pipeline/mpi_stage.hpp" +#include "projects/hpcspades/common/pipeline/mpi_stage.hpp" namespace debruijn_graph { diff --git a/src/projects/hpcspades/common/stages/test_mpi.cpp b/src/projects/hpcspades/common/stages/test_mpi.cpp index 120b20a98f..d1df3920fe 100644 --- 
a/src/projects/hpcspades/common/stages/test_mpi.cpp +++ b/src/projects/hpcspades/common/stages/test_mpi.cpp @@ -4,9 +4,8 @@ //* See file LICENSE for details. //*************************************************************************** -#include "pipeline/partask_mpi.hpp" -#include "pipeline/mpi_stage.hpp" -#include "pipeline/stage.hpp" +#include "projects/hpcspades/common/pipeline/mpi_stage.hpp" +#include "projects/hpcspades/common/pipeline/partask_mpi.hpp" #include #include diff --git a/src/projects/hpcspades/gap_closer_mpi.hpp b/src/projects/hpcspades/gap_closer_mpi.hpp index e137ccac07..77646bd2ce 100644 --- a/src/projects/hpcspades/gap_closer_mpi.hpp +++ b/src/projects/hpcspades/gap_closer_mpi.hpp @@ -10,7 +10,7 @@ #include "projects/spades/gap_closer.hpp" #include "common/alignment/sequence_mapper_notifier_mpi.hpp" -#include "pipeline/mpi_stage.hpp" +#include "common/pipeline/mpi_stage.hpp" #include "io/reads/io_helper.hpp" namespace debruijn_graph { diff --git a/src/projects/hpcspades/main_mpi.cpp b/src/projects/hpcspades/main_mpi.cpp index c85379b614..36ac919672 100644 --- a/src/projects/hpcspades/main_mpi.cpp +++ b/src/projects/hpcspades/main_mpi.cpp @@ -7,7 +7,7 @@ //*************************************************************************** #include "configs/config_struct.hpp" -#include "pipeline/partask_mpi.hpp" +#include "common/pipeline/partask_mpi.hpp" #include "utils/logger/mpi_log_writers.hpp" #include "utils/memory_limit.hpp" diff --git a/src/projects/hpcspades/mismatch_correction_mpi.hpp b/src/projects/hpcspades/mismatch_correction_mpi.hpp index 8ce0d7a913..0e9882c338 100644 --- a/src/projects/hpcspades/mismatch_correction_mpi.hpp +++ b/src/projects/hpcspades/mismatch_correction_mpi.hpp @@ -8,7 +8,7 @@ #include "projects/spades/mismatch_correction.hpp" #include "common/alignment/sequence_mapper_notifier_mpi.hpp" -#include "pipeline/mpi_stage.hpp" +#include "common/pipeline/mpi_stage.hpp" #include "pipeline/graph_pack_helpers.h" namespace debruijn_graph { diff --git a/src/projects/hpcspades/pair_info_count_mpi.hpp b/src/projects/hpcspades/pair_info_count_mpi.hpp index 218a3406ea..b559268d1c 100644 --- a/src/projects/hpcspades/pair_info_count_mpi.hpp +++ b/src/projects/hpcspades/pair_info_count_mpi.hpp @@ -9,7 +9,7 @@ #include "projects/spades/pair_info_count.hpp" #include "common/alignment/sequence_mapper_notifier_mpi.hpp" -#include "pipeline/mpi_stage.hpp" +#include "common/pipeline/mpi_stage.hpp" namespace debruijn_graph { class PairInfoCountMPI : public PairInfoCountBase, public spades::MPIAssemblyStage { diff --git a/src/projects/hpcspades/pipeline.cpp b/src/projects/hpcspades/pipeline.cpp index 89b0d32107..246942a2f5 100644 --- a/src/projects/hpcspades/pipeline.cpp +++ b/src/projects/hpcspades/pipeline.cpp @@ -23,7 +23,7 @@ #include "library/library.hpp" #include "pipeline/graph_pack.hpp" #include "pipeline/stage.hpp" -#include "pipeline/mpi_stage.hpp" +#include "common/pipeline/mpi_stage.hpp" #include "alignment/kmer_mapper.hpp" #include "stages/genomic_info_filler.hpp" diff --git a/src/projects/spades/mismatch_correction.cpp b/src/projects/spades/mismatch_correction.cpp index dfdcc457ec..a1a4832820 100644 --- a/src/projects/spades/mismatch_correction.cpp +++ b/src/projects/spades/mismatch_correction.cpp @@ -23,7 +23,6 @@ #include "io/binary/binary.hpp" #include "io/binary/types/phmap.hpp" #include "io/binary/graph_pack.hpp" -#include "pipeline/partask_mpi.hpp" template std::vector split_iterator(size_t chunks, Iter b, Iter e, size_t n) { @@ -468,7 +467,7 @@ 
namespace mismatches { void MismatchCorrection::run(graph_pack::GraphPack &gp, const char*) { EnsureBasicMapping(gp); size_t corrected = - mismatches::MismatchShallNotPass(MapLibFunc(), gp, 2, partask::overall_num_threads()) + mismatches::MismatchShallNotPass(MapLibFunc(), gp, 2) .ParallelStopAllMismatches(1); INFO("Corrected " << corrected << " nucleotides"); } diff --git a/src/projects/spades/pipeline.cpp b/src/projects/spades/pipeline.cpp index 65171fe05a..1021c96f41 100644 --- a/src/projects/spades/pipeline.cpp +++ b/src/projects/spades/pipeline.cpp @@ -22,7 +22,6 @@ #include "library/library.hpp" #include "pipeline/graph_pack.hpp" #include "pipeline/stage.hpp" -#include "pipeline/mpi_stage.hpp" #include "alignment/kmer_mapper.hpp" #include "wastewater_disentangle.hpp" @@ -237,11 +236,7 @@ void assemble_genome(bool mpi = false) { std::unique_ptr SPAdes; SavesPolicy saves_policy(cfg::get().checkpoints, cfg::get().output_saves, cfg::get().load_from); - if (mpi) { - SPAdes.reset(new MPIStageManager(saves_policy)); - } else { - SPAdes.reset(new StageManager(saves_policy)); - } + SPAdes.reset(new StageManager(saves_policy)); if (SPAdes->saves_policy().EnabledAnyCheckpoint()) create_directory(cfg::get().output_saves); From 790d295adbca3f89930c71565748b0a02bb1713e Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Thu, 2 Dec 2021 16:18:07 +0300 Subject: [PATCH 075/102] run_on_load stage type --- src/common/pipeline/stage.cpp | 14 ++++++++++++-- src/common/pipeline/stage.hpp | 1 + src/common/stages/read_conversion.hpp | 1 + .../hpcspades/common/pipeline/mpi_stage.cpp | 9 +++++++-- 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/src/common/pipeline/stage.cpp b/src/common/pipeline/stage.cpp index ab3165fd78..f14bcc8c98 100644 --- a/src/common/pipeline/stage.cpp +++ b/src/common/pipeline/stage.cpp @@ -34,8 +34,6 @@ void AssemblyStage::load(graph_pack::GraphPack& gp, auto p = dir / BASE_NAME; io::binary::FullPackIO().Load(p, gp); debruijn_graph::config::load_lib_data(p); - - io::ConvertIfNeeded(cfg::get_writable().ds.reads, cfg::get().max_threads); } void AssemblyStage::save(const graph_pack::GraphPack& gp, @@ -193,6 +191,18 @@ void StageManager::run(graph_pack::GraphPack& g, const char* start_from) { auto start_stage = prepare_run(g, start_from); + for (auto cur_stage = stages_.begin(); cur_stage != start_stage; ++cur_stage) { + AssemblyStage *stage = cur_stage->get(); + if (stage->run_on_load()) { + INFO("STAGE == " << stage->name() << " (id: " << stage->id() << ")"); + stage->prepare(g, start_from); + { + TIME_TRACE_SCOPE(stage->name()); + stage->run(g); + } + } + } + for (; start_stage != stages_.end(); ++start_stage) { AssemblyStage *stage = start_stage->get(); diff --git a/src/common/pipeline/stage.hpp b/src/common/pipeline/stage.hpp index 817f736d25..962582b202 100644 --- a/src/common/pipeline/stage.hpp +++ b/src/common/pipeline/stage.hpp @@ -44,6 +44,7 @@ class AssemblyStage { virtual void run(graph_pack::GraphPack &, const char *started_from = nullptr) = 0; virtual bool distributed() const { return false; } virtual bool constant() const { return false; } + virtual bool run_on_load() const {return false; } private: const char *name_; diff --git a/src/common/stages/read_conversion.hpp b/src/common/stages/read_conversion.hpp index 141e2d6963..ba05cb71d1 100644 --- a/src/common/stages/read_conversion.hpp +++ b/src/common/stages/read_conversion.hpp @@ -19,6 +19,7 @@ class ReadConversion : public AssemblyStage { void run(graph_pack::GraphPack &, const char *) override; void 
load(graph_pack::GraphPack &, const std::filesystem::path &load_from, const char *prefix = nullptr) override; void save(const graph_pack::GraphPack &, const std::filesystem::path &save_to, const char *prefix = nullptr) const override; + bool run_on_load() const override { return true; } }; } diff --git a/src/projects/hpcspades/common/pipeline/mpi_stage.cpp b/src/projects/hpcspades/common/pipeline/mpi_stage.cpp index d5d8319e38..1bd3eff40f 100644 --- a/src/projects/hpcspades/common/pipeline/mpi_stage.cpp +++ b/src/projects/hpcspades/common/pipeline/mpi_stage.cpp @@ -169,8 +169,13 @@ void MPIStageManager::run(graph_pack::GraphPack& g, // assume that the previous was parallel for the sake of simplicity of the // implementation. bool pparallel = true; - for (; start_stage != stages().end(); ++start_stage) { - AssemblyStage *stage = start_stage->get(); + + for (auto current_stage = stages().begin(); current_stage != stages().end(); ++current_stage) { + AssemblyStage *stage = current_stage->get(); + if (current_stage < start_stage && !stage->run_on_load()) { + continue; + } + bool cparallel = stage->distributed(); if (cparallel) { From db9db375bda8109b8cfedbcb1f5d67cacbdf36cd Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Thu, 2 Dec 2021 23:38:42 +0300 Subject: [PATCH 076/102] namespace spaces --- .../sequence_mapper_notifier_mpi.cpp | 38 ++-- .../sequence_mapper_notifier_mpi.hpp | 168 ++++++++--------- .../ph_map/perfect_hash_map_builder_mpi.hpp | 114 ++++++------ .../paired_info/distance_estimation.cpp | 37 ++-- .../paired_info/distance_estimation.hpp | 170 +++++++++--------- .../paired_info/distance_estimation_utils.cpp | 36 ++-- .../paired_info/distance_estimation_utils.hpp | 114 ++++++------ .../common/stages/construction_mpi.hpp | 2 - .../hpcspades/common/stages/test_mpi.cpp | 2 - .../hpcspades/distance_estimation_mpi.cpp | 3 +- .../hpcspades/distance_estimation_mpi.hpp | 16 +- .../hpcspades/mismatch_correction_mpi.hpp | 22 +-- .../hpcspades/pair_info_count_mpi.hpp | 19 +- 13 files changed, 373 insertions(+), 368 deletions(-) diff --git a/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.cpp b/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.cpp index 73c177d733..65e3f2390b 100644 --- a/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.cpp +++ b/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.cpp @@ -9,26 +9,26 @@ #include "io/reads/read_stream_vector.hpp" namespace debruijn_graph { - void SequenceMapperNotifierMPI::PyramidMergeMPI(SequenceMapperListener &listener) { - size_t mpi_size = partask::world_size(); - size_t mpi_rank = partask::world_rank(); - const size_t deadbeef = 0xDEADBEEF; +void SequenceMapperNotifierMPI::PyramidMergeMPI(SequenceMapperListener &listener) { + size_t mpi_size = partask::world_size(); + size_t mpi_rank = partask::world_rank(); + const size_t deadbeef = 0xDEADBEEF; - for (size_t step = 1; step < mpi_size; step *= 2) { - if ((mpi_rank % (2*step) == 0) && (mpi_rank + step < mpi_size)) { - partask::InputMPIStream is(mpi_rank + step); - size_t sz; - io::binary::BinRead(is, sz); - VERIFY_MSG(sz == deadbeef, "Listener type: " << typeid(listener).name()); - listener.MergeFromStream(is); - io::binary::BinRead(is, sz); - VERIFY_MSG(sz == deadbeef, "Listener type: " << typeid(listener).name()); - } else if (mpi_rank % (2*step) == step) { - partask::OutputMPIStream os(mpi_rank - step); - io::binary::BinWrite(os, deadbeef); - listener.Serialize(os); - io::binary::BinWrite(os, deadbeef); - 
} + for (size_t step = 1; step < mpi_size; step *= 2) { + if ((mpi_rank % (2 * step) == 0) && (mpi_rank + step < mpi_size)) { + partask::InputMPIStream is(mpi_rank + step); + size_t sz; + io::binary::BinRead(is, sz); + VERIFY_MSG(sz == deadbeef, "Listener type: " << typeid(listener).name()); + listener.MergeFromStream(is); + io::binary::BinRead(is, sz); + VERIFY_MSG(sz == deadbeef, "Listener type: " << typeid(listener).name()); + } else if (mpi_rank % (2 * step) == step) { + partask::OutputMPIStream os(mpi_rank - step); + io::binary::BinWrite(os, deadbeef); + listener.Serialize(os); + io::binary::BinWrite(os, deadbeef); } } +} } // namespace debruijn_graph diff --git a/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.hpp b/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.hpp index 25c4a71ce6..cfc5c0b95b 100644 --- a/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.hpp +++ b/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.hpp @@ -22,91 +22,99 @@ #include namespace debruijn_graph { - class SequenceMapperNotifierMPI : public SequenceMapperNotifier { - void PyramidMergeMPI(SequenceMapperListener &listener); - - public: - using SequenceMapperNotifier::SequenceMapperNotifier; - - template - void ProcessLibrary(io::ReadStreamList& streams, - size_t lib_index, const SequenceMapperT& mapper, size_t threads_count = 0) { - INFO("ProcessLibraryMPI started"); - // Select streams - std::vector chunks = partask::chunks_rr(streams.size()); - INFO("Selected streams: " << chunks); - - partask::execute_on_subset(streams, chunks, - [&](io::ReadStreamList& local_streams) { - // Run ProcessLibrary - INFO("Running ProcessLibrary"); - SequenceMapperNotifier::ProcessLibrary(local_streams, lib_index, mapper, threads_count); - INFO("ProcessLibrary done"); - }); - - INFO("Merging results..."); - for (const auto& listener : listeners_[lib_index]) { - INFO("Merging listener " << listener->name()); - PyramidMergeMPI(*listener); - } - INFO("Listeners merged"); - - if (partask::world_size() > 1) { - const size_t deadbeef = 0xDEADBEEF; - INFO("Syncing listeners..."); - if (partask::master()) { - partask::OutputMPIStreamBcast os(0); - for (const auto& listener : listeners_[lib_index]) { - io::binary::BinWrite(os, deadbeef); - listener->Serialize(os); - io::binary::BinWrite(os, deadbeef); - } - } else { - partask::InputMPIStreamBcast is(0); - for (const auto& listener : listeners_[lib_index]) { - size_t sz; - io::binary::BinRead(is, sz); - VERIFY(sz == deadbeef); - listener->Deserialize(is); - io::binary::BinRead(is, sz); - VERIFY(sz == deadbeef); - } - } - INFO("Listeners synced"); - } - } +class SequenceMapperNotifierMPI : public SequenceMapperNotifier { + void PyramidMergeMPI(SequenceMapperListener &listener); - template - void ProcessLibrary(io::ReadStreamList& streams, - const SequenceMapperT& mapper, size_t threads_count = 0) { - return ProcessLibrary(streams, 0, mapper, threads_count); - } - }; +public: + using SequenceMapperNotifier::SequenceMapperNotifier; - class MapLibFuncMPI : public MapLibBase { - public: - void operator() (const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const override { - MapLibMPI(listeners, mapper, streams); - } - void operator() (const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const override { - MapLibMPI(listeners, mapper, streams); - } - void operator() (const std::vector& listeners, const SequenceMapper& mapper, 
io::ReadStreamList& streams) const override { - MapLibMPI(listeners, mapper, streams); - } - void operator() (const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const override { - MapLibMPI(listeners, mapper, streams); + template + void ProcessLibrary(io::ReadStreamList &streams, + size_t lib_index, const SequenceMapperT &mapper, size_t threads_count = 0) { + INFO("ProcessLibraryMPI started"); + // Select streams + std::vector chunks = partask::chunks_rr(streams.size()); + INFO("Selected streams: " << chunks); + + partask::execute_on_subset(streams, chunks, + [&](io::ReadStreamList &local_streams) { + // Run ProcessLibrary + INFO("Running ProcessLibrary"); + SequenceMapperNotifier::ProcessLibrary(local_streams, lib_index, mapper, + threads_count); + INFO("ProcessLibrary done"); + }); + + INFO("Merging results..."); + for (const auto &listener: listeners_[lib_index]) { + INFO("Merging listener " << listener->name()); + PyramidMergeMPI(*listener); } + INFO("Listeners merged"); - private: - template - void MapLibMPI(const std::vector& listeners, const SequenceMapper& mapper, io::ReadStreamList& streams) const { - SequenceMapperNotifierMPI notifier; - for (auto listener: listeners) { - notifier.Subscribe(listener); + if (partask::world_size() > 1) { + const size_t deadbeef = 0xDEADBEEF; + INFO("Syncing listeners..."); + if (partask::master()) { + partask::OutputMPIStreamBcast os(0); + for (const auto &listener: listeners_[lib_index]) { + io::binary::BinWrite(os, deadbeef); + listener->Serialize(os); + io::binary::BinWrite(os, deadbeef); + } + } else { + partask::InputMPIStreamBcast is(0); + for (const auto &listener: listeners_[lib_index]) { + size_t sz; + io::binary::BinRead(is, sz); + VERIFY(sz == deadbeef); + listener->Deserialize(is); + io::binary::BinRead(is, sz); + VERIFY(sz == deadbeef); + } } - notifier.ProcessLibrary(streams, mapper); + INFO("Listeners synced"); } - }; + } + + template + void ProcessLibrary(io::ReadStreamList &streams, + const SequenceMapperT &mapper, size_t threads_count = 0) { + return ProcessLibrary(streams, 0, mapper, threads_count); + } +}; + +class MapLibFuncMPI : public MapLibBase { +public: + void operator()(const std::vector &listeners, const SequenceMapper &mapper, + io::ReadStreamList &streams) const override { + MapLibMPI(listeners, mapper, streams); + } + void operator()(const std::vector &listeners, const SequenceMapper &mapper, + io::ReadStreamList &streams) const override { + MapLibMPI(listeners, mapper, streams); + } + + void operator()(const std::vector &listeners, const SequenceMapper &mapper, + io::ReadStreamList &streams) const override { + MapLibMPI(listeners, mapper, streams); + } + + void operator()(const std::vector &listeners, const SequenceMapper &mapper, + io::ReadStreamList &streams) const override { + MapLibMPI(listeners, mapper, streams); + } + +private: + template + void MapLibMPI(const std::vector &listeners, const SequenceMapper &mapper, + io::ReadStreamList &streams) const { + SequenceMapperNotifierMPI notifier; + for (auto listener: listeners) { + notifier.Subscribe(listener); + } + notifier.ProcessLibrary(streams, mapper); + } +}; } // namespace debruijn_graph diff --git a/src/projects/hpcspades/common/kmer_index/ph_map/perfect_hash_map_builder_mpi.hpp b/src/projects/hpcspades/common/kmer_index/ph_map/perfect_hash_map_builder_mpi.hpp index ed75107021..c96115f9b8 100644 --- a/src/projects/hpcspades/common/kmer_index/ph_map/perfect_hash_map_builder_mpi.hpp +++ 
b/src/projects/hpcspades/common/kmer_index/ph_map/perfect_hash_map_builder_mpi.hpp @@ -15,72 +15,72 @@ #include "common/utils/perf/timetracer.hpp" namespace kmers { - struct PerfectHashMapBuilderMPI { - template - void BuildIndexMPI(PerfectHashMap &index, - KMerStorage &storage, bool save_final = true) const { - using KMerIndex = typename PerfectHashMap::KMerIndexT; - - kmers::KMerIndexBuilderMPI builder; - size_t sz = builder.BuildIndexMPI(*index.index_ptr_, storage, save_final); - index.resize(sz); - } - }; - - struct KeyStoringIndexBuilderMPI { - template - void BuildIndexMPI(KeyStoringMap &index, - KMerStorage &kmerstorage, bool save_final = true) const { - phm_builder_.BuildIndexMPI(index, kmerstorage, save_final); - if (partask::master()) { - VERIFY(!index.kmers_.get()); - index.kmers_file_ = kmerstorage.final_kmers(); - index.SortUniqueKMers(); - } - } - - private: - PerfectHashMapBuilderMPI phm_builder_; - }; - - struct KeyIteratingIndexBuilderMPI { - template - void BuildIndexMPI(KeyIteratingMap &index, - KMerStorage& kmerstorage, bool save_final = true) const { - phm_builder_.BuildIndexMPI(index, kmerstorage, save_final); - std::string final_kmers_file; - if (partask::master()) { - index.kmers_ = kmerstorage.final_kmers(); - final_kmers_file = index.kmers_->file(); - } - // MPI code leaked so far( TODO do smth with this - partask::broadcast(final_kmers_file); - if (partask::worker()) { - index.kmers_ = fs::tmp::acquire_temp_file(final_kmers_file); - index.kmers_->release(); - } - INFO("Final K-mers file: " << final_kmers_file); - } - - private: - PerfectHashMapBuilderMPI phm_builder_; - }; - +struct PerfectHashMapBuilderMPI { template void BuildIndexMPI(PerfectHashMap &index, - KMerStorage &storage, bool save_final = true) { - PerfectHashMapBuilderMPI().BuildIndexMPI(index, storage, save_final); + KMerStorage &storage, bool save_final = true) const { + using KMerIndex = typename PerfectHashMap::KMerIndexT; + + kmers::KMerIndexBuilderMPI builder; + size_t sz = builder.BuildIndexMPI(*index.index_ptr_, storage, save_final); + index.resize(sz); } +}; +struct KeyStoringIndexBuilderMPI { template void BuildIndexMPI(KeyStoringMap &index, - KMerStorage &storage, bool save_final = true) { - KeyStoringIndexBuilderMPI().BuildIndexMPI(index, storage, save_final); + KMerStorage &kmerstorage, bool save_final = true) const { + phm_builder_.BuildIndexMPI(index, kmerstorage, save_final); + if (partask::master()) { + VERIFY(!index.kmers_.get()); + index.kmers_file_ = kmerstorage.final_kmers(); + index.SortUniqueKMers(); + } } + private: + PerfectHashMapBuilderMPI phm_builder_; +}; + +struct KeyIteratingIndexBuilderMPI { template void BuildIndexMPI(KeyIteratingMap &index, - KMerStorage &storage, bool save_final = true) { - KeyIteratingIndexBuilderMPI().BuildIndexMPI(index, storage, save_final); + KMerStorage& kmerstorage, bool save_final = true) const { + phm_builder_.BuildIndexMPI(index, kmerstorage, save_final); + std::string final_kmers_file; + if (partask::master()) { + index.kmers_ = kmerstorage.final_kmers(); + final_kmers_file = index.kmers_->file(); + } + // MPI code leaked so far( TODO do smth with this + partask::broadcast(final_kmers_file); + if (partask::worker()) { + index.kmers_ = fs::tmp::acquire_temp_file(final_kmers_file); + index.kmers_->release(); + } + INFO("Final K-mers file: " << final_kmers_file); } + + private: + PerfectHashMapBuilderMPI phm_builder_; +}; + +template +void BuildIndexMPI(PerfectHashMap &index, + KMerStorage &storage, bool save_final = true) { + 
PerfectHashMapBuilderMPI().BuildIndexMPI(index, storage, save_final); +} + +template +void BuildIndexMPI(KeyStoringMap &index, + KMerStorage &storage, bool save_final = true) { + KeyStoringIndexBuilderMPI().BuildIndexMPI(index, storage, save_final); +} + +template +void BuildIndexMPI(KeyIteratingMap &index, + KMerStorage &storage, bool save_final = true) { + KeyIteratingIndexBuilderMPI().BuildIndexMPI(index, storage, save_final); +} } diff --git a/src/projects/hpcspades/common/paired_info/distance_estimation.cpp b/src/projects/hpcspades/common/paired_info/distance_estimation.cpp index 7fdd916dfc..1958f64697 100644 --- a/src/projects/hpcspades/common/paired_info/distance_estimation.cpp +++ b/src/projects/hpcspades/common/paired_info/distance_estimation.cpp @@ -7,27 +7,28 @@ #include "distance_estimation.hpp" namespace omnigraph { - namespace de { - using namespace debruijn_graph; +namespace de { +using namespace debruijn_graph; - void DistanceEstimatorMPI::Estimate(OutPairedIndex &result, size_t nthreads) const { - this->Init(); - const auto &index = this->index(); +void DistanceEstimatorMPI::Estimate(OutPairedIndex &result, size_t nthreads) const { + this->Init(); + const auto &index = this->index(); - DEBUG("Collecting edge infos"); - std::vector edges; - for (EdgeId e : this->graph().edges()) - edges.push_back(e); + DEBUG("Collecting edge infos"); + std::vector edges; + for (EdgeId e : this->graph().edges()) + edges.push_back(e); - partask::TaskRegistry treg; - auto dist_estimator_mpi = treg.add(std::cref(index), std::cref(*dist_estimator_), std::ref(result)); - treg.listen(); + partask::TaskRegistry treg; + auto dist_estimator_mpi = treg.add(std::cref(index), std::cref(*dist_estimator_), + std::ref(result)); + treg.listen(); - if (partask::master()) { - dist_estimator_mpi(edges, nthreads); - } - treg.stop_listening(); - partask::broadcast(result); - } + if (partask::master()) { + dist_estimator_mpi(edges, nthreads); } + treg.stop_listening(); + partask::broadcast(result); +} +} } diff --git a/src/projects/hpcspades/common/paired_info/distance_estimation.hpp b/src/projects/hpcspades/common/paired_info/distance_estimation.hpp index 35ed59a96e..95635f4405 100644 --- a/src/projects/hpcspades/common/paired_info/distance_estimation.hpp +++ b/src/projects/hpcspades/common/paired_info/distance_estimation.hpp @@ -11,98 +11,100 @@ #include "projects/hpcspades/common/pipeline/partask_mpi.hpp" namespace omnigraph { - namespace de { - class DistanceEstimatorMPI : public DistanceEstimator { - typedef DistanceEstimator base; - typedef std::vector GraphLengths; - typedef std::vector > EstimHist; - typedef std::pair EdgePair; - - protected: - typedef typename base::InPairedIndex InPairedIndex; - typedef typename base::OutPairedIndex OutPairedIndex; - typedef typename base::InHistogram InHistogram; - typedef typename base::OutHistogram OutHistogram; - - public: - DistanceEstimatorMPI(const debruijn_graph::Graph &graph, - const InPairedIndex &index, - const GraphDistanceFinder &distance_finder, - size_t linkage_distance, size_t max_distance, - std::unique_ptr base_dist_estimator) - : base(graph, index, distance_finder, linkage_distance, max_distance), - dist_estimator_(std::move(base_dist_estimator)) {} - - class DistanceEstimatorTask { - DistanceEstimatorTask() = default; - - public: - DistanceEstimatorTask(std::vector &edges, - unsigned int nthreads) : edges_(edges), nthreads_(nthreads) {}; - - DistanceEstimatorTask(std::istream &is) { - io::binary::BinRead(is, edges_, nthreads_); - - } - - std::ostream 
&serialize(std::ostream &os) const { - io::binary::BinWrite(os, edges_, nthreads_); - return os; - } - - auto make_splitter(size_t, const InPairedIndex &, const DistanceEstimator &, - PairedInfoIndexT & /*result*/) { - return partask::make_seq_along_generator(edges_); - } - - void process(std::istream &is, std::ostream &os, const InPairedIndex &index, - const DistanceEstimator &self, PairedInfoIndexT & /*result*/) { - DEBUG("Processing"); - auto edges_id = partask::get_seq(is); - PairedInfoBuffersT buffer(self.graph(), nthreads_); +namespace de { +class DistanceEstimatorMPI : public DistanceEstimator { + typedef DistanceEstimator base; + typedef std::vector GraphLengths; + typedef std::vector> EstimHist; + typedef std::pair EdgePair; + +protected: + typedef typename base::InPairedIndex InPairedIndex; + typedef typename base::OutPairedIndex OutPairedIndex; + typedef typename base::InHistogram InHistogram; + typedef typename base::OutHistogram OutHistogram; + +public: + DistanceEstimatorMPI(const debruijn_graph::Graph &graph, + const InPairedIndex &index, + const GraphDistanceFinder &distance_finder, + size_t linkage_distance, size_t max_distance, + std::unique_ptr base_dist_estimator) + : base(graph, index, distance_finder, linkage_distance, max_distance), + dist_estimator_(std::move(base_dist_estimator)) {} + + virtual ~DistanceEstimatorMPI() = default; + + class DistanceEstimatorTask { + DistanceEstimatorTask() = default; + + public: + DistanceEstimatorTask(std::vector &edges, + unsigned int nthreads) : edges_(edges), nthreads_(nthreads) {}; + + DistanceEstimatorTask(std::istream &is) { + io::binary::BinRead(is, edges_, nthreads_); + + } + + std::ostream &serialize(std::ostream &os) const { + io::binary::BinWrite(os, edges_, nthreads_); + return os; + } + + auto make_splitter(size_t, const InPairedIndex &, const DistanceEstimator &, + PairedInfoIndexT & /*result*/) { + return partask::make_seq_along_generator(edges_); + } + + void process(std::istream &is, std::ostream &os, const InPairedIndex &index, + const DistanceEstimator &self, PairedInfoIndexT & /*result*/) { + DEBUG("Processing"); + auto edges_id = partask::get_seq(is); + PairedInfoBuffersT buffer(self.graph(), nthreads_); # pragma omp parallel for num_threads(nthreads_) schedule(guided, 10) - for (size_t i = 0; i < edges_id.size(); ++i) { - debruijn_graph::EdgeId edge = edges_[edges_id[i]]; - self.ProcessEdge(edge, index, buffer[omp_get_thread_num()]); - } - - buffer.BinWrite(os); - buffer.Clear(); - } + for (size_t i = 0; i < edges_id.size(); ++i) { + debruijn_graph::EdgeId edge = edges_[edges_id[i]]; + self.ProcessEdge(edge, index, buffer[omp_get_thread_num()]); + } - auto merge(const std::vector &piss, - const InPairedIndex &, - const DistanceEstimator &self, - PairedInfoIndexT &result) { - for (auto pis: piss) { - PairedInfoBuffersT buffer(self.graph(), nthreads_); - buffer.BinRead(*pis); - for (size_t j = 0; j < nthreads_; ++j) { - result.Merge(buffer[j]); - buffer[j].clear(); - } - } + buffer.BinWrite(os); + buffer.Clear(); + } + + auto merge(const std::vector &piss, + const InPairedIndex &, + const DistanceEstimator &self, + PairedInfoIndexT &result) { + for (auto pis: piss) { + PairedInfoBuffersT buffer(self.graph(), nthreads_); + buffer.BinRead(*pis); + for (size_t j = 0; j < nthreads_; ++j) { + result.Merge(buffer[j]); + buffer[j].clear(); } + } + } - private: - std::vector edges_; - unsigned nthreads_; - }; + private: + std::vector edges_; + unsigned nthreads_; + }; - void Estimate(OutPairedIndex &result, size_t 
nthreads) const override; + void Estimate(OutPairedIndex &result, size_t nthreads) const override; - friend DistanceEstimatorTask; - private: - std::unique_ptr dist_estimator_; + friend DistanceEstimatorTask; + private: + std::unique_ptr dist_estimator_; - const std::string Name() const override { - const std::string my_name = dist_estimator_->Name() + "_MPI"; - return my_name; - } - - DECL_LOGGER("DistanceEstimatorMPI"); - }; + const std::string Name() const override { + const std::string my_name = dist_estimator_->Name() + "_MPI"; + return my_name; } + + DECL_LOGGER("DistanceEstimatorMPI"); +}; +} } #endif /* MPI_DISTANCE_ESTIMATION_HPP_ */ diff --git a/src/projects/hpcspades/common/paired_info/distance_estimation_utils.cpp b/src/projects/hpcspades/common/paired_info/distance_estimation_utils.cpp index 342d863254..b5adff907d 100644 --- a/src/projects/hpcspades/common/paired_info/distance_estimation_utils.cpp +++ b/src/projects/hpcspades/common/paired_info/distance_estimation_utils.cpp @@ -8,23 +8,23 @@ #include "distance_estimation.hpp" namespace distance_estimation { - void EstimateScaffoldingDistancesMPI(PairedInfoIndexT &scaffolding_index, - const debruijn_graph::Graph &graph, - const io::SequencingLibrary &lib, - const UnclusteredPairedInfoIndexT &paired_index, - const debruijn_graph::config::smoothing_distance_estimator &ade, - const debruijn_graph::config::distance_estimator &de_config) { - EstimateScaffoldingDistancesInner(scaffolding_index, graph, lib, - paired_index, ade, de_config, MPIScaffoldDistanceEstimatorFabric()); - } +void EstimateScaffoldingDistancesMPI(PairedInfoIndexT &scaffolding_index, + const debruijn_graph::Graph &graph, + const io::SequencingLibrary &lib, + const UnclusteredPairedInfoIndexT &paired_index, + const debruijn_graph::config::smoothing_distance_estimator &ade, + const debruijn_graph::config::distance_estimator &de_config) { + EstimateScaffoldingDistancesInner(scaffolding_index, graph, lib, + paired_index, ade, de_config, MPIScaffoldDistanceEstimatorFabric()); +} - void EstimatePairedDistancesMPI(PairedInfoIndexT &clustered_index, - const debruijn_graph::Graph &graph, - const io::SequencingLibrary &lib, - const UnclusteredPairedInfoIndexT &paired_index, - size_t max_repeat_length, - const debruijn_graph::config::distance_estimator &de_config) { - EstimatePairedDistancesInner(clustered_index, graph, lib, paired_index, - max_repeat_length, de_config, MPIDistanceEstimatorFabric()); - } +void EstimatePairedDistancesMPI(PairedInfoIndexT &clustered_index, + const debruijn_graph::Graph &graph, + const io::SequencingLibrary &lib, + const UnclusteredPairedInfoIndexT &paired_index, + size_t max_repeat_length, + const debruijn_graph::config::distance_estimator &de_config) { + EstimatePairedDistancesInner(clustered_index, graph, lib, paired_index, + max_repeat_length, de_config, MPIDistanceEstimatorFabric()); +} } diff --git a/src/projects/hpcspades/common/paired_info/distance_estimation_utils.hpp b/src/projects/hpcspades/common/paired_info/distance_estimation_utils.hpp index 9bb8d2acbb..65d75b7940 100644 --- a/src/projects/hpcspades/common/paired_info/distance_estimation_utils.hpp +++ b/src/projects/hpcspades/common/paired_info/distance_estimation_utils.hpp @@ -10,66 +10,66 @@ #include "distance_estimation.hpp" namespace distance_estimation { - using omnigraph::de::DistanceEstimator; - using omnigraph::de::DistanceEstimatorMPI; +using omnigraph::de::DistanceEstimator; +using omnigraph::de::DistanceEstimatorMPI; - class MPIDistanceEstimatorFabric : public 
AbstractDistanceEstimatorFabric { - public: - std::unique_ptr getDistanceEstimator(const debruijn_graph::Graph &graph, - const distance_estimation::UnclusteredPairedInfoIndexT &index, - const omnigraph::de::GraphDistanceFinder &distance_finder, - size_t linkage_distance, - size_t max_distance) const override { - auto estimator_base = std::make_unique(graph, index, distance_finder, - linkage_distance, max_distance); - return std::unique_ptr(new DistanceEstimatorMPI(graph, index, - distance_finder, - linkage_distance, - max_distance, - std::move(estimator_base))); - } - }; +class MPIDistanceEstimatorFabric : public AbstractDistanceEstimatorFabric { + public: + std::unique_ptr getDistanceEstimator(const debruijn_graph::Graph &graph, + const distance_estimation::UnclusteredPairedInfoIndexT &index, + const omnigraph::de::GraphDistanceFinder &distance_finder, + size_t linkage_distance, + size_t max_distance) const override { + auto estimator_base = std::make_unique(graph, index, distance_finder, + linkage_distance, max_distance); + return std::unique_ptr(new DistanceEstimatorMPI(graph, index, + distance_finder, + linkage_distance, + max_distance, + std::move(estimator_base))); + } +}; - class MPIScaffoldDistanceEstimatorFabric : public AbstractScaffoldDistanceEstimatorFabric { - public: - std::unique_ptr getDistanceEstimator(const debruijn_graph::Graph &graph, - const distance_estimation::UnclusteredPairedInfoIndexT &histogram, - const omnigraph::de::GraphDistanceFinder &dist_finder, - std::function weight_f, - size_t linkage_distance, - size_t max_distance, size_t threshold, - double range_coeff, double delta_coeff, - size_t cutoff, - size_t min_peak_points, - double percentage, - double derivative_threshold) const override { - auto estimator_base = std::unique_ptr( - new omnigraph::de::SmoothingDistanceEstimator(graph, histogram, dist_finder, weight_f, - linkage_distance, max_distance, threshold, - range_coeff, delta_coeff, cutoff, min_peak_points, - percentage, derivative_threshold)); +class MPIScaffoldDistanceEstimatorFabric : public AbstractScaffoldDistanceEstimatorFabric { + public: + std::unique_ptr getDistanceEstimator(const debruijn_graph::Graph &graph, + const distance_estimation::UnclusteredPairedInfoIndexT &histogram, + const omnigraph::de::GraphDistanceFinder &dist_finder, + std::function weight_f, + size_t linkage_distance, + size_t max_distance, size_t threshold, + double range_coeff, double delta_coeff, + size_t cutoff, + size_t min_peak_points, + double percentage, + double derivative_threshold) const override { + auto estimator_base = std::unique_ptr( + new omnigraph::de::SmoothingDistanceEstimator(graph, histogram, dist_finder, weight_f, + linkage_distance, max_distance, threshold, + range_coeff, delta_coeff, cutoff, min_peak_points, + percentage, derivative_threshold)); - return std::unique_ptr(new DistanceEstimatorMPI(graph, histogram, - dist_finder, - linkage_distance, - max_distance, - std::move(estimator_base))); - } - }; + return std::unique_ptr(new DistanceEstimatorMPI(graph, histogram, + dist_finder, + linkage_distance, + max_distance, + std::move(estimator_base))); + } +}; - void EstimateScaffoldingDistancesMPI(PairedInfoIndexT &scaffolding_index, - const debruijn_graph::Graph &graph, - const io::SequencingLibrary &lib, - const UnclusteredPairedInfoIndexT &paired_index, - const debruijn_graph::config::smoothing_distance_estimator &ade, - const debruijn_graph::config::distance_estimator &de_config = - debruijn_graph::config::distance_estimator()); +void 
EstimateScaffoldingDistancesMPI(PairedInfoIndexT &scaffolding_index, + const debruijn_graph::Graph &graph, + const io::SequencingLibrary &lib, + const UnclusteredPairedInfoIndexT &paired_index, + const debruijn_graph::config::smoothing_distance_estimator &ade, + const debruijn_graph::config::distance_estimator &de_config = + debruijn_graph::config::distance_estimator()); - void EstimatePairedDistancesMPI(PairedInfoIndexT &clustered_index, - const debruijn_graph::Graph &graph, - const io::SequencingLibrary &lib, - const UnclusteredPairedInfoIndexT &paired_index, - size_t max_repeat_length = std::numeric_limits::max(), - const debruijn_graph::config::distance_estimator &de_config = - debruijn_graph::config::distance_estimator()); +void EstimatePairedDistancesMPI(PairedInfoIndexT &clustered_index, + const debruijn_graph::Graph &graph, + const io::SequencingLibrary &lib, + const UnclusteredPairedInfoIndexT &paired_index, + size_t max_repeat_length = std::numeric_limits::max(), + const debruijn_graph::config::distance_estimator &de_config = + debruijn_graph::config::distance_estimator()); } diff --git a/src/projects/hpcspades/common/stages/construction_mpi.hpp b/src/projects/hpcspades/common/stages/construction_mpi.hpp index 43efbbf653..966fd62d91 100644 --- a/src/projects/hpcspades/common/stages/construction_mpi.hpp +++ b/src/projects/hpcspades/common/stages/construction_mpi.hpp @@ -9,7 +9,6 @@ #include "projects/hpcspades/common/pipeline/mpi_stage.hpp" namespace debruijn_graph { - struct ConstructionStorage; class ConstructionMPI : public spades::MPICompositeStageDeferred { @@ -20,6 +19,5 @@ class ConstructionMPI : public spades::MPICompositeStageDeferred namespace debruijn_graph { - class ArraySum { public: ArraySum(const std::string &message = "") : message_{message} {}; @@ -101,5 +100,4 @@ class TestMPI : public spades::MPIAssemblyStage { treg.stop_listening(); } }; - } // namespace debruijn_graph diff --git a/src/projects/hpcspades/distance_estimation_mpi.cpp b/src/projects/hpcspades/distance_estimation_mpi.cpp index fa2cec59ae..5cc3645a82 100644 --- a/src/projects/hpcspades/distance_estimation_mpi.cpp +++ b/src/projects/hpcspades/distance_estimation_mpi.cpp @@ -10,6 +10,7 @@ namespace debruijn_graph { void DistanceEstimationMPI::run(graph_pack::GraphPack &gp, const char* s) { - DistanceEstimationBase::run(gp, s, distance_estimation::EstimatePairedDistancesMPI, distance_estimation::EstimateScaffoldingDistancesMPI); + DistanceEstimationBase::run(gp, s, distance_estimation::EstimatePairedDistancesMPI, + distance_estimation::EstimateScaffoldingDistancesMPI); } } diff --git a/src/projects/hpcspades/distance_estimation_mpi.hpp b/src/projects/hpcspades/distance_estimation_mpi.hpp index 285913641c..dbacdc72ae 100644 --- a/src/projects/hpcspades/distance_estimation_mpi.hpp +++ b/src/projects/hpcspades/distance_estimation_mpi.hpp @@ -9,17 +9,15 @@ #include "common/pipeline/mpi_stage.hpp" #include "projects/spades/distance_estimation.hpp" -#include "common/alignment/sequence_mapper_notifier_mpi.hpp" - namespace debruijn_graph { - class DistanceEstimationMPI : public DistanceEstimationBase, public spades::MPIAssemblyStage { - public: - DistanceEstimationMPI(bool preliminary = false) - : MPIAssemblyStage(preliminary ? "Preliminary Distance Estimation" : "Distance Estimation", - preliminary ? 
"distance_estimation_preliminary" : "distance_estimation") {} +class DistanceEstimationMPI : public DistanceEstimationBase, public spades::MPIAssemblyStage { +public: + DistanceEstimationMPI(bool preliminary = false) + : MPIAssemblyStage(preliminary ? "Preliminary Distance Estimation" : "Distance Estimation", + preliminary ? "distance_estimation_preliminary" : "distance_estimation") {} - void run(graph_pack::GraphPack &gp, const char *) override; - }; + void run(graph_pack::GraphPack &gp, const char *) override; +}; } diff --git a/src/projects/hpcspades/mismatch_correction_mpi.hpp b/src/projects/hpcspades/mismatch_correction_mpi.hpp index 0e9882c338..e5b421a2c9 100644 --- a/src/projects/hpcspades/mismatch_correction_mpi.hpp +++ b/src/projects/hpcspades/mismatch_correction_mpi.hpp @@ -12,17 +12,17 @@ #include "pipeline/graph_pack_helpers.h" namespace debruijn_graph { - class MismatchCorrectionMPI : public spades::MPIAssemblyStage { - public: - MismatchCorrectionMPI() - : MPIAssemblyStage("Mismatch Correction", "mismatch_correction") {} +class MismatchCorrectionMPI : public spades::MPIAssemblyStage { +public: + MismatchCorrectionMPI() + : MPIAssemblyStage("Mismatch Correction", "mismatch_correction") {} - void run(graph_pack::GraphPack &gp, const char *) override { - EnsureBasicMapping(gp); - size_t corrected = mismatches::MismatchShallNotPass(MapLibFuncMPI(), gp, 2, partask::overall_num_threads()). - ParallelStopAllMismatches(1); - INFO("Corrected " << corrected << " nucleotides"); - } - }; + void run(graph_pack::GraphPack &gp, const char *) override { + EnsureBasicMapping(gp); + size_t corrected = mismatches::MismatchShallNotPass(MapLibFuncMPI(), gp, 2, partask::overall_num_threads()). + ParallelStopAllMismatches(1); + INFO("Corrected " << corrected << " nucleotides"); + } +}; } diff --git a/src/projects/hpcspades/pair_info_count_mpi.hpp b/src/projects/hpcspades/pair_info_count_mpi.hpp index b559268d1c..89acb1adcf 100644 --- a/src/projects/hpcspades/pair_info_count_mpi.hpp +++ b/src/projects/hpcspades/pair_info_count_mpi.hpp @@ -12,16 +12,15 @@ #include "common/pipeline/mpi_stage.hpp" namespace debruijn_graph { - class PairInfoCountMPI : public PairInfoCountBase, public spades::MPIAssemblyStage { - public: - PairInfoCountMPI(bool preliminary = false) - : MPIAssemblyStage(preliminary ? "Preliminary Paired Information Counting" : "Paired Information Counting", - preliminary ? "late_pair_info_count_preliminary" : "late_pair_info_count") {} +class PairInfoCountMPI : public PairInfoCountBase, public spades::MPIAssemblyStage { + public: + PairInfoCountMPI(bool preliminary = false) + : MPIAssemblyStage(preliminary ? "Preliminary Paired Information Counting" : "Paired Information Counting", + preliminary ? 
"late_pair_info_count_preliminary" : "late_pair_info_count") {} - void run(graph_pack::GraphPack &gp, const char* s) override { - execute(gp, s, MapLibFuncMPI(), partask::overall_num_threads()); - } - }; + void run(graph_pack::GraphPack &gp, const char* s) override { + execute(gp, s, MapLibFuncMPI(), partask::overall_num_threads()); + } +}; } - From ddfa32c43a16eab6a5b853c604715a5ae282b352 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Thu, 2 Dec 2021 23:41:51 +0300 Subject: [PATCH 077/102] rename function by code style --- src/projects/hpcspades/gap_closer_mpi.hpp | 5 +++-- src/projects/spades/gap_closer.cpp | 2 +- src/projects/spades/gap_closer.hpp | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/projects/hpcspades/gap_closer_mpi.hpp b/src/projects/hpcspades/gap_closer_mpi.hpp index 77646bd2ce..ee1d522dc4 100644 --- a/src/projects/hpcspades/gap_closer_mpi.hpp +++ b/src/projects/hpcspades/gap_closer_mpi.hpp @@ -16,8 +16,9 @@ namespace debruijn_graph { class GapClosingMPI : public GapClosingBase, public spades::MPIAssemblyStage { - protected: - void processLibrary(SequenceMapperListener* listener, const SequenceMapper& mapper, io::BinaryPairedStreams& paired_streams) override { +protected: + void ProcessLibrary(SequenceMapperListener *listener, const SequenceMapper &mapper, + io::BinaryPairedStreams &paired_streams) override { SequenceMapperNotifierMPI notifier; notifier.Subscribe(listener); notifier.ProcessLibrary(paired_streams, mapper); diff --git a/src/projects/spades/gap_closer.cpp b/src/projects/spades/gap_closer.cpp index f86fc9a4a2..e46bb6081b 100644 --- a/src/projects/spades/gap_closer.cpp +++ b/src/projects/spades/gap_closer.cpp @@ -500,7 +500,7 @@ void GapClosingBase::execute(graph_pack::GraphPack &gp, const char *) { io::BinaryPairedStreams paired_streams = paired_binary_readers(dataset.reads[i], false, 0, false, num_readers); - processLibrary(&gcpif, *gcpif.GetMapper(), paired_streams); + ProcessLibrary(&gcpif, *gcpif.GetMapper(), paired_streams); INFO("Initializing gap closer"); g.clear_state(); // FIXME Hack-hack-hack required for uniform id distribution on master and slaves diff --git a/src/projects/spades/gap_closer.hpp b/src/projects/spades/gap_closer.hpp index 1c5e4898fe..5b86c22990 100644 --- a/src/projects/spades/gap_closer.hpp +++ b/src/projects/spades/gap_closer.hpp @@ -17,14 +17,14 @@ namespace debruijn_graph { class GapClosingBase { protected: size_t num_readers = 0; - virtual void processLibrary(SequenceMapperListener* listener, const SequenceMapper& mapper, io::BinaryPairedStreams& paired_streams) = 0; + virtual void ProcessLibrary(SequenceMapperListener* listener, const SequenceMapper& mapper, io::BinaryPairedStreams& paired_streams) = 0; public: void execute(graph_pack::GraphPack &gp, const char *); }; class GapClosing : public GapClosingBase, public spades::AssemblyStage { protected: - void processLibrary(SequenceMapperListener* listener, const SequenceMapper& mapper, io::BinaryPairedStreams& paired_streams) override { + void ProcessLibrary(SequenceMapperListener* listener, const SequenceMapper& mapper, io::BinaryPairedStreams& paired_streams) override { SequenceMapperNotifier notifier; notifier.Subscribe(listener); notifier.ProcessLibrary(paired_streams, mapper); From 3262e35fb83fa0ff16cc83a554c858ff9d3ea726 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Mon, 6 Dec 2021 22:44:52 +0300 Subject: [PATCH 078/102] delete mpi from local pipeline --- src/projects/spades/main.cpp | 2 +- src/projects/spades/pipeline.cpp | 2 +- 2 
files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/projects/spades/main.cpp b/src/projects/spades/main.cpp index 027b441f00..b79ce686d6 100644 --- a/src/projects/spades/main.cpp +++ b/src/projects/spades/main.cpp @@ -17,7 +17,7 @@ #include "version.hpp" namespace spades { -void assemble_genome(bool mpi = false); +void assemble_genome(); } struct TimeTracerRAII { diff --git a/src/projects/spades/pipeline.cpp b/src/projects/spades/pipeline.cpp index 1021c96f41..dfec73fe23 100644 --- a/src/projects/spades/pipeline.cpp +++ b/src/projects/spades/pipeline.cpp @@ -214,7 +214,7 @@ class FakeStageOnlyforDataSyncDoesNothingElse : public spades::AssemblyStage { void run(graph_pack::GraphPack&, const char *) {} }; -void assemble_genome(bool mpi = false) { +void assemble_genome() { using namespace debruijn_graph::config; pipeline_type mode = cfg::get().mode; From f3772ec0dd86c06e05a84ac5ccab6227cc0aec02 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Mon, 6 Dec 2021 23:54:33 +0300 Subject: [PATCH 079/102] separate logger mpi --- src/common/utils/CMakeLists.txt | 4 - src/projects/hpcspades/CMakeLists.txt | 2 +- src/projects/hpcspades/common/CMakeLists.txt | 3 +- .../common/kmer_index/CMakeLists.txt | 13 ++ .../kmer_index}/logger/mpi_log_writers.cpp | 0 .../kmer_index}/logger/mpi_log_writers.hpp | 2 +- .../hpcspades/common/pipeline/mpi_stage.cpp | 8 +- .../hpcspades/common/pipeline/mpi_stage.hpp | 10 +- .../common/stages/construction_mpi.cpp | 2 +- .../common/stages/construction_mpi.hpp | 2 +- .../hpcspades/common/stages/test_mpi.cpp | 2 +- .../hpcspades/common/utils/CMakeLists.txt | 12 ++ .../common/utils/logger/mpi_log_writers.cpp | 48 ++++++ .../common/utils/logger/mpi_log_writers.hpp | 21 +++ .../hpcspades/distance_estimation_mpi.hpp | 2 +- src/projects/hpcspades/gap_closer_mpi.hpp | 3 +- src/projects/hpcspades/main_mpi.cpp | 6 +- .../hpcspades/mismatch_correction_mpi.hpp | 2 +- .../hpcspades/pair_info_count_mpi.hpp | 4 +- src/projects/hpcspades/pipeline.cpp | 38 ++--- src/projects/spades/main_mpi.cpp | 143 ------------------ src/test/mpi/mpi_test.cpp | 2 +- 22 files changed, 138 insertions(+), 191 deletions(-) create mode 100644 src/projects/hpcspades/common/kmer_index/CMakeLists.txt rename src/{common/utils => projects/hpcspades/common/kmer_index}/logger/mpi_log_writers.cpp (100%) rename src/{common/utils => projects/hpcspades/common/kmer_index}/logger/mpi_log_writers.hpp (93%) create mode 100644 src/projects/hpcspades/common/utils/CMakeLists.txt create mode 100644 src/projects/hpcspades/common/utils/logger/mpi_log_writers.cpp create mode 100644 src/projects/hpcspades/common/utils/logger/mpi_log_writers.hpp delete mode 100644 src/projects/spades/main_mpi.cpp diff --git a/src/common/utils/CMakeLists.txt b/src/common/utils/CMakeLists.txt index 1bd56312ed..978f3ff739 100644 --- a/src/common/utils/CMakeLists.txt +++ b/src/common/utils/CMakeLists.txt @@ -18,10 +18,6 @@ set(utils_src logger/log_writers_thread.cpp ) -if (MPI_FOUND) - set(utils_src ${utils_src} logger/mpi_log_writers.cpp) -endif() - if (READLINE_FOUND) set(utils_src ${utils_src} autocompletion.cpp) endif() diff --git a/src/projects/hpcspades/CMakeLists.txt b/src/projects/hpcspades/CMakeLists.txt index 3bb1f2f7fa..616384fad1 100644 --- a/src/projects/hpcspades/CMakeLists.txt +++ b/src/projects/hpcspades/CMakeLists.txt @@ -11,7 +11,7 @@ add_subdirectory(common) add_library(spades-stages-hpc STATIC distance_estimation_mpi.cpp) set(HPCSPADES_SRC pipeline.cpp ../../projects/spades/series_analysis.cpp 
../../projects/mts/contig_abundance.cpp) -set(HPCSPADES_LIB spades-stages-hpc spades-stages graphio common_modules paired_info_mpi stages-mpi alignment_mpi pipeline_mpi ${COMMON_LIBRARIES}) +set(HPCSPADES_LIB spades-stages-hpc spades-stages graphio common_modules paired_info_mpi stages-mpi alignment_mpi pipeline_mpi utils_mpi ${COMMON_LIBRARIES}) add_executable(spades-hpc main_mpi.cpp ${HPCSPADES_SRC}) diff --git a/src/projects/hpcspades/common/CMakeLists.txt b/src/projects/hpcspades/common/CMakeLists.txt index 5e28536c3d..c292e254fc 100644 --- a/src/projects/hpcspades/common/CMakeLists.txt +++ b/src/projects/hpcspades/common/CMakeLists.txt @@ -10,4 +10,5 @@ project(common_modules_mpi CXX) add_subdirectory(paired_info) add_subdirectory(alignment) add_subdirectory(stages) -add_subdirectory(pipeline) \ No newline at end of file +add_subdirectory(pipeline) +add_subdirectory(utils) \ No newline at end of file diff --git a/src/projects/hpcspades/common/kmer_index/CMakeLists.txt b/src/projects/hpcspades/common/kmer_index/CMakeLists.txt new file mode 100644 index 0000000000..cdda8504c2 --- /dev/null +++ b/src/projects/hpcspades/common/kmer_index/CMakeLists.txt @@ -0,0 +1,13 @@ +############################################################################ +# Copyright (c) 2015 Saint Petersburg State University +# Copyright (c) 2011-2014 Saint Petersburg Academic University +# All Rights Reserved +# See file LICENSE for details. +############################################################################ + +project(utils_mpi CXX) + +set(utils_mpi_src logger/mpi_log_writers.cpp) + +add_library(utils_mpi STATIC ${utils_mpi_src}) +target_link_libraries(utils_mpi ${COMMON_LIBRARIES}) \ No newline at end of file diff --git a/src/common/utils/logger/mpi_log_writers.cpp b/src/projects/hpcspades/common/kmer_index/logger/mpi_log_writers.cpp similarity index 100% rename from src/common/utils/logger/mpi_log_writers.cpp rename to src/projects/hpcspades/common/kmer_index/logger/mpi_log_writers.cpp diff --git a/src/common/utils/logger/mpi_log_writers.hpp b/src/projects/hpcspades/common/kmer_index/logger/mpi_log_writers.hpp similarity index 93% rename from src/common/utils/logger/mpi_log_writers.hpp rename to src/projects/hpcspades/common/kmer_index/logger/mpi_log_writers.hpp index 86813662a6..deb4ecf29a 100644 --- a/src/common/utils/logger/mpi_log_writers.hpp +++ b/src/projects/hpcspades/common/kmer_index/logger/mpi_log_writers.hpp @@ -6,7 +6,7 @@ #pragma once -#include "logger.hpp" +#include "common/utils/logger/logger.hpp" namespace logging { diff --git a/src/projects/hpcspades/common/pipeline/mpi_stage.cpp b/src/projects/hpcspades/common/pipeline/mpi_stage.cpp index 1bd3eff40f..71d551fb73 100644 --- a/src/projects/hpcspades/common/pipeline/mpi_stage.cpp +++ b/src/projects/hpcspades/common/pipeline/mpi_stage.cpp @@ -28,7 +28,7 @@ class PhaseIdComparator { id_ = pos + 1; } - bool operator()(const std::unique_ptr &phase) const { + bool operator()(const std::unique_ptr &phase) const { return 0 == strcmp(id_, phase->id()); } @@ -37,7 +37,7 @@ class PhaseIdComparator { }; } -namespace spades { +namespace spades_mpi { void MPICompositeStageBase::run(graph_pack::GraphPack& gp, const char* started_from) { @@ -134,7 +134,7 @@ void MPICompositeStageBase::run(graph_pack::GraphPack& gp, fini(gp); } -MPIStageManager::MPIStageManager(SavesPolicy policy) +MPIStageManager::MPIStageManager(spades::SavesPolicy policy) : StageManager(policy), world_size_(1), rank_(0), first_(false) { int initialized = 0; 
MPI_Initialized(&initialized); @@ -171,7 +171,7 @@ void MPIStageManager::run(graph_pack::GraphPack& g, bool pparallel = true; for (auto current_stage = stages().begin(); current_stage != stages().end(); ++current_stage) { - AssemblyStage *stage = current_stage->get(); + spades::AssemblyStage *stage = current_stage->get(); if (current_stage < start_stage && !stage->run_on_load()) { continue; } diff --git a/src/projects/hpcspades/common/pipeline/mpi_stage.hpp b/src/projects/hpcspades/common/pipeline/mpi_stage.hpp index 533d4d20ab..74ce9e1e13 100644 --- a/src/projects/hpcspades/common/pipeline/mpi_stage.hpp +++ b/src/projects/hpcspades/common/pipeline/mpi_stage.hpp @@ -12,13 +12,13 @@ #include #include -namespace spades { +namespace spades_mpi { class MPIAssemblyStage; -class MPIAssemblyStage : public AssemblyStage { +class MPIAssemblyStage : public spades::AssemblyStage { public: - using AssemblyStage::AssemblyStage; + using spades::AssemblyStage::AssemblyStage; bool master() const; bool worker() const; @@ -135,9 +135,9 @@ class MPICompositeStageDeferred : public MPICompositeStageWithStorage { std::unique_ptr storage_; }; -class MPIStageManager : public StageManager { +class MPIStageManager : public spades::StageManager { public: - MPIStageManager(SavesPolicy policy = SavesPolicy()); + MPIStageManager(spades::SavesPolicy policy = spades::SavesPolicy()); ~MPIStageManager(); void run(graph_pack::GraphPack &g, diff --git a/src/projects/hpcspades/common/stages/construction_mpi.cpp b/src/projects/hpcspades/common/stages/construction_mpi.cpp index f70f7952e8..cc563cb1f8 100644 --- a/src/projects/hpcspades/common/stages/construction_mpi.cpp +++ b/src/projects/hpcspades/common/stages/construction_mpi.cpp @@ -747,7 +747,7 @@ class PHMCoverageFiller : public ConstructionMPI::Phase { } // namespace ConstructionMPI::ConstructionMPI() - : spades::MPICompositeStageDeferred("de Bruijn graph construction", "construction") { + : spades_mpi::MPICompositeStageDeferred("de Bruijn graph construction", "construction") { if (cfg::get().con.read_cov_threshold) add(); diff --git a/src/projects/hpcspades/common/stages/construction_mpi.hpp b/src/projects/hpcspades/common/stages/construction_mpi.hpp index 966fd62d91..970c558b14 100644 --- a/src/projects/hpcspades/common/stages/construction_mpi.hpp +++ b/src/projects/hpcspades/common/stages/construction_mpi.hpp @@ -11,7 +11,7 @@ namespace debruijn_graph { struct ConstructionStorage; -class ConstructionMPI : public spades::MPICompositeStageDeferred { +class ConstructionMPI : public spades_mpi::MPICompositeStageDeferred { public: ConstructionMPI(); ~ConstructionMPI(); diff --git a/src/projects/hpcspades/common/stages/test_mpi.cpp b/src/projects/hpcspades/common/stages/test_mpi.cpp index cfbad46ba3..39024f4e88 100644 --- a/src/projects/hpcspades/common/stages/test_mpi.cpp +++ b/src/projects/hpcspades/common/stages/test_mpi.cpp @@ -77,7 +77,7 @@ class ArraySum { }; }; -class TestMPI : public spades::MPIAssemblyStage { +class TestMPI : public spades_mpi::MPIAssemblyStage { public: TestMPI() : MPIAssemblyStage("Test MPI", "test_mpi") {} diff --git a/src/projects/hpcspades/common/utils/CMakeLists.txt b/src/projects/hpcspades/common/utils/CMakeLists.txt new file mode 100644 index 0000000000..14ede2a414 --- /dev/null +++ b/src/projects/hpcspades/common/utils/CMakeLists.txt @@ -0,0 +1,12 @@ +############################################################################ +# Copyright (c) 2023-2024 SPAdes team +# All Rights Reserved +# See file LICENSE for details. 
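# Keeping utils_mpi as its own small static library means the MPI-aware console
# log writer no longer pulls an MPI dependency into the core utils library
# (mpi_log_writers.cpp is removed from src/common/utils above); the spades-hpc
# binary picks it up through HPCSPADES_LIB in projects/hpcspades/CMakeLists.txt.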
+############################################################################ + +project(utils_mpi CXX) + +set(utils_mpi_src logger/mpi_log_writers.cpp) + +add_library(utils_mpi STATIC ${utils_mpi_src}) +target_link_libraries(utils_mpi ${COMMON_LIBRARIES}) diff --git a/src/projects/hpcspades/common/utils/logger/mpi_log_writers.cpp b/src/projects/hpcspades/common/utils/logger/mpi_log_writers.cpp new file mode 100644 index 0000000000..11e244244d --- /dev/null +++ b/src/projects/hpcspades/common/utils/logger/mpi_log_writers.cpp @@ -0,0 +1,48 @@ +//*************************************************************************** +//* Copyright (c) 2020 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include "mpi_log_writers.hpp" + +#include "utils/filesystem/path_helper.hpp" + +#include +#include + +namespace logging { + +void mpi_console_writer::write_msg(double time, size_t cmem, size_t max_rss, level l, const std::filesystem::path& file, size_t line_num, + const char *source, const char *msg) { + const std::string node_info = nodeinfo(); + if (cmem != -1ull) + std::cout << fmt::format("NODE {:s} | {:14s} {:>5s} / {:<5s} {:6.6s} {:24.24s} ({:26.26s}:{:4d}) {:s}", + node_info, + utils::human_readable_time(time), utils::human_readable_memory(cmem), + utils::human_readable_memory(max_rss), logging::level_name(l), + source, file.filename().c_str(), int(line_num), msg) + << std::endl; + else + std::cout << fmt::format("NODE {:s} | {:14s} {:^5s} {:6.6s} {:24.24s} ({:26.26s}:{:4d}) {:s}", + node_info, + utils::human_readable_time(time), utils::human_readable_memory(max_rss), + logging::level_name(l), source, file.filename().c_str(), int(line_num), msg) + << std::endl; +} + +std::string mpi_console_writer::nodeinfo() const { + int initialized, finalized; + MPI_Initialized(&initialized); + MPI_Finalized(&finalized); + if (initialized && !finalized) { + int world_rank, world_size; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + return fmt::format("{:>2d}/{:<2d}", world_rank + 1, world_size); + } else { + return fmt::format("{:^5}", "N/A"); + } +} + +} // logging diff --git a/src/projects/hpcspades/common/utils/logger/mpi_log_writers.hpp b/src/projects/hpcspades/common/utils/logger/mpi_log_writers.hpp new file mode 100644 index 0000000000..deb4ecf29a --- /dev/null +++ b/src/projects/hpcspades/common/utils/logger/mpi_log_writers.hpp @@ -0,0 +1,21 @@ +//*************************************************************************** +//* Copyright (c) 2018 Saint Petersburg State University +//* All Rights Reserved +//* See file LICENSE for details. 
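//*
//* Usage sketch (illustrative; mirrors the logger setup in hpcspades' main_mpi.cpp):
//* this writer prefixes every log line with "NODE r/n" (1-based MPI rank and world
//* size) and is attached once per process right after the logger is created:
//*
//*   logging::logger *lg = logging::create_logger("");
//*   lg->add_writer(std::make_shared<logging::mpi_console_writer>());
//*   logging::attach_logger(lg);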
+//*************************************************************************** + +#pragma once + +#include "common/utils/logger/logger.hpp" + +namespace logging { + +struct mpi_console_writer : public writer { + + void write_msg(double time, size_t cmem, size_t max_rss, level l, const std::filesystem::path& file, size_t line_num, + const char *source, const char *msg); +private: + std::string nodeinfo() const; +}; + +} // namespace logging diff --git a/src/projects/hpcspades/distance_estimation_mpi.hpp b/src/projects/hpcspades/distance_estimation_mpi.hpp index dbacdc72ae..64abe377d9 100644 --- a/src/projects/hpcspades/distance_estimation_mpi.hpp +++ b/src/projects/hpcspades/distance_estimation_mpi.hpp @@ -11,7 +11,7 @@ #include "projects/spades/distance_estimation.hpp" namespace debruijn_graph { -class DistanceEstimationMPI : public DistanceEstimationBase, public spades::MPIAssemblyStage { +class DistanceEstimationMPI : public DistanceEstimationBase, public spades_mpi::MPIAssemblyStage { public: DistanceEstimationMPI(bool preliminary = false) : MPIAssemblyStage(preliminary ? "Preliminary Distance Estimation" : "Distance Estimation", diff --git a/src/projects/hpcspades/gap_closer_mpi.hpp b/src/projects/hpcspades/gap_closer_mpi.hpp index ee1d522dc4..25c15e6de8 100644 --- a/src/projects/hpcspades/gap_closer_mpi.hpp +++ b/src/projects/hpcspades/gap_closer_mpi.hpp @@ -14,8 +14,7 @@ #include "io/reads/io_helper.hpp" namespace debruijn_graph { - -class GapClosingMPI : public GapClosingBase, public spades::MPIAssemblyStage { +class GapClosingMPI : public GapClosingBase, public spades_mpi::MPIAssemblyStage { protected: void ProcessLibrary(SequenceMapperListener *listener, const SequenceMapper &mapper, io::BinaryPairedStreams &paired_streams) override { diff --git a/src/projects/hpcspades/main_mpi.cpp b/src/projects/hpcspades/main_mpi.cpp index 36ac919672..92e9f616f9 100644 --- a/src/projects/hpcspades/main_mpi.cpp +++ b/src/projects/hpcspades/main_mpi.cpp @@ -9,7 +9,7 @@ #include "configs/config_struct.hpp" #include "common/pipeline/partask_mpi.hpp" -#include "utils/logger/mpi_log_writers.hpp" +#include "common/utils/logger/mpi_log_writers.hpp" #include "utils/memory_limit.hpp" #include "utils/segfault_handler.hpp" #include "utils/perf/timetracer.hpp" @@ -17,7 +17,7 @@ #include "k_range.hpp" #include "version.hpp" -namespace spades { +namespace spades_mpi { void assemble_genome(bool mpi); } @@ -115,7 +115,7 @@ int main(int argc, char **argv) { } TIME_TRACE_SCOPE("spades"); - spades::assemble_genome(true); + spades_mpi::assemble_genome(true); } catch (std::bad_alloc const &e) { std::cerr << "Not enough memory to run SPAdes. 
" << e.what() << std::endl; MPI_Abort(MPI_COMM_WORLD, EINTR); diff --git a/src/projects/hpcspades/mismatch_correction_mpi.hpp b/src/projects/hpcspades/mismatch_correction_mpi.hpp index e5b421a2c9..75d0aee26a 100644 --- a/src/projects/hpcspades/mismatch_correction_mpi.hpp +++ b/src/projects/hpcspades/mismatch_correction_mpi.hpp @@ -12,7 +12,7 @@ #include "pipeline/graph_pack_helpers.h" namespace debruijn_graph { -class MismatchCorrectionMPI : public spades::MPIAssemblyStage { +class MismatchCorrectionMPI : public spades_mpi::MPIAssemblyStage { public: MismatchCorrectionMPI() : MPIAssemblyStage("Mismatch Correction", "mismatch_correction") {} diff --git a/src/projects/hpcspades/pair_info_count_mpi.hpp b/src/projects/hpcspades/pair_info_count_mpi.hpp index 89acb1adcf..6ca31ebc2f 100644 --- a/src/projects/hpcspades/pair_info_count_mpi.hpp +++ b/src/projects/hpcspades/pair_info_count_mpi.hpp @@ -12,8 +12,8 @@ #include "common/pipeline/mpi_stage.hpp" namespace debruijn_graph { -class PairInfoCountMPI : public PairInfoCountBase, public spades::MPIAssemblyStage { - public: +class PairInfoCountMPI : public PairInfoCountBase, public spades_mpi::MPIAssemblyStage { +public: PairInfoCountMPI(bool preliminary = false) : MPIAssemblyStage(preliminary ? "Preliminary Paired Information Counting" : "Paired Information Counting", preliminary ? "late_pair_info_count_preliminary" : "late_pair_info_count") {} diff --git a/src/projects/hpcspades/pipeline.cpp b/src/projects/hpcspades/pipeline.cpp index 246942a2f5..c9110ad0d6 100644 --- a/src/projects/hpcspades/pipeline.cpp +++ b/src/projects/hpcspades/pipeline.cpp @@ -33,7 +33,7 @@ #include "stages/ss_edge_split.hpp" #include "configs/config_struct.hpp" -namespace spades { +namespace spades_mpi { static bool MetaCompatibleLibraries() { const auto& libs = cfg::get().ds.reads; @@ -82,7 +82,7 @@ static debruijn_graph::ContigOutput::OutputList GetMetaplasmidOutput(size_t cov) GetContigName(cfg::get().co.contigs_name, cov) }}; } -static void AddMetaplasmidStages(StageManager &SPAdes) { +static void AddMetaplasmidStages(spades::StageManager &SPAdes) { size_t cov = cfg::get().pd->additive_step; size_t add = cfg::get().pd->additive_step; double multiplier = cfg::get().pd->relative_step; @@ -130,7 +130,7 @@ static debruijn_graph::ContigOutput::OutputList GetFinalStageOutput() { }; } -static void AddPreliminarySimplificationStages(StageManager &SPAdes) { +static void AddPreliminarySimplificationStages(spades::StageManager &SPAdes) { using namespace debruijn_graph::config; pipeline_type mode = cfg::get().mode; @@ -153,7 +153,7 @@ static void AddPreliminarySimplificationStages(StageManager &SPAdes) { } } -static void AddSimplificationStages(StageManager &SPAdes) { +static void AddSimplificationStages(spades::StageManager &SPAdes) { VERIFY(!cfg::get().gc.before_raw_simplify || !cfg::get().gc.before_simplify); bool two_step_rr = cfg::get().two_step_rr && cfg::get().rr_enable; @@ -187,7 +187,7 @@ static void AddSimplificationStages(StageManager &SPAdes) { SPAdes.add(); } -static void AddConstructionStages(StageManager &SPAdes) { +static void AddConstructionStages(spades::StageManager &SPAdes) { using namespace debruijn_graph::config; pipeline_type mode = cfg::get().mode; @@ -196,7 +196,7 @@ static void AddConstructionStages(StageManager &SPAdes) { SPAdes.add(); } -static void AddRepeatResolutionStages(StageManager &SPAdes) { +static void AddRepeatResolutionStages(spades::StageManager &SPAdes) { using namespace debruijn_graph::config; if (!cfg::get().series_analysis.empty()) @@ 
-210,7 +210,7 @@ static void AddRepeatResolutionStages(StageManager &SPAdes) { class FakeStageOnlyforDataSyncDoesNothingElse : public spades::AssemblyStage { public: FakeStageOnlyforDataSyncDoesNothingElse() - : AssemblyStage("Fake Stage Only for Data Sync", "fake_stage_sync_data") { } + : spades::AssemblyStage("Fake Stage Only for Data Sync", "fake_stage_sync_data") { } void run(graph_pack::GraphPack&, const char *) {} }; @@ -234,13 +234,13 @@ void assemble_genome(bool mpi = false) { INFO("Starting from stage: " << cfg::get().entry_point); - std::unique_ptr SPAdes; - SavesPolicy saves_policy(cfg::get().checkpoints, - cfg::get().output_saves, cfg::get().load_from); + std::unique_ptr SPAdes; + spades::SavesPolicy saves_policy(cfg::get().checkpoints, + cfg::get().output_saves, cfg::get().load_from); if (mpi) { SPAdes.reset(new MPIStageManager(saves_policy)); } else { - SPAdes.reset(new StageManager(saves_policy)); + SPAdes.reset(new spades::StageManager(saves_policy)); } if (SPAdes->saves_policy().EnabledAnyCheckpoint()) @@ -250,20 +250,20 @@ void assemble_genome(bool mpi = false) { INFO("Two-step repeat resolution " << (two_step_rr ? "enabled" : "disabled")); graph_pack::GraphPack conj_gp(cfg::get().K, - cfg::get().tmp_dir, - two_step_rr ? cfg::get().ds.reads.lib_count() + 1 - : cfg::get().ds.reads.lib_count(), - cfg::get().ds.reference_genome, - cfg::get().flanking_range, - cfg::get().pos.max_mapping_gap, - cfg::get().pos.max_gap_diff); + cfg::get().tmp_dir, + two_step_rr ? cfg::get().ds.reads.lib_count() + 1 + : cfg::get().ds.reads.lib_count(), + cfg::get().ds.reference_genome, + cfg::get().flanking_range, + cfg::get().pos.max_mapping_gap, + cfg::get().pos.max_gap_diff); if (cfg::get().need_mapping) { INFO("Will need read mapping, kmer mapper will be attached"); conj_gp.get_mutable>().Attach(); } // Build the pipeline - SPAdes->add(); + SPAdes->add(); if (!AssemblyGraphPresent()) { AddConstructionStages(*SPAdes); diff --git a/src/projects/spades/main_mpi.cpp b/src/projects/spades/main_mpi.cpp deleted file mode 100644 index c85379b614..0000000000 --- a/src/projects/spades/main_mpi.cpp +++ /dev/null @@ -1,143 +0,0 @@ -//*************************************************************************** -//* Copyright (c) 2023-2024 SPAdes team -//* Copyright (c) 2015-2022 Saint Petersburg State University -//* Copyright (c) 2011-2014 Saint Petersburg Academic University -//* All Rights Reserved -//* See file LICENSE for details. 
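// Note: the MPI entry point now lives in src/projects/hpcspades/main_mpi.cpp,
// which this patch also updates; the old copy under src/projects/spades is the
// file deleted below.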
-//*************************************************************************** - -#include "configs/config_struct.hpp" -#include "pipeline/partask_mpi.hpp" - -#include "utils/logger/mpi_log_writers.hpp" -#include "utils/memory_limit.hpp" -#include "utils/segfault_handler.hpp" -#include "utils/perf/timetracer.hpp" - -#include "k_range.hpp" -#include "version.hpp" - -namespace spades { -void assemble_genome(bool mpi); -} - -struct TimeTracerRAII { - TimeTracerRAII(llvm::StringRef program_name, - unsigned granularity = 500, - const std::string &prefix = "", const std::string &suffix = "") { - time_trace_file_ = prefix + "spades_time_trace_" + suffix + ".json"; - llvm::timeTraceProfilerInitialize(granularity, program_name); - } - ~TimeTracerRAII() { - if (auto E = llvm::timeTraceProfilerWrite(time_trace_file_, "spades-core")) { - handleAllErrors(std::move(E), - [&](const llvm::StringError &SE) { - ERROR("" << SE.getMessage() << "\n"); - }); - return; - } else { - INFO("Time trace is written to: " << time_trace_file_); - } - llvm::timeTraceProfilerCleanup(); - } - - std::string time_trace_file_; -}; - -void load_config(const std::vector& cfg_fns) { - for (const auto& s : cfg_fns) { - CHECK_FATAL_ERROR(exists(s), "File " << s << " doesn't exist or can't be read!"); - } - - cfg::create_instance(cfg_fns); - - create_directory(cfg::get().output_dir); - create_directory(cfg::get().tmp_dir); - - create_directory(cfg::get().temp_bin_reads_path); -} - -void create_console_logger(const std::filesystem::path& dir, std::filesystem::path log_prop_fn) { - using namespace logging; - - if (!exists(log_prop_fn)) - log_prop_fn = dir / log_prop_fn; - - logger *lg = create_logger(exists(log_prop_fn) ? log_prop_fn : ""); - lg->add_writer(std::make_shared()); - attach_logger(lg); -} - -int main(int argc, char **argv) { - utils::perf_counter pc; - - const size_t GB = 1 << 30; - - srand(42); - srandom(42); - - bool init = partask::init(); - INFO("MPI init: " << (init ? "done" : "failed")); - - try { - using namespace debruijn_graph; - - std::filesystem::path cfg_dir = std::filesystem::path(argv[1]).parent_path(); - - std::vector cfg_fns; - for (int i = 1; i < argc; ++i) { - cfg_fns.push_back(argv[i]); - } - - // read configuration file (dataset path etc.) - load_config(cfg_fns); - - create_console_logger(cfg_dir, cfg::get().log_filename); - for (const auto& cfg_fn : cfg_fns) - INFO("Loaded config from " << cfg_fn); - - VERIFY(cfg::get().K >= runtime_k::MIN_K && cfg::get().K < runtime_k::MAX_K); - VERIFY(cfg::get().K % 2 != 0); - - utils::limit_memory(cfg::get().max_memory * GB); - - // assemble it! - START_BANNER("hpcSPAdes"); - INFO("Maximum k-mer length: " << runtime_k::MAX_K); - INFO("Assembling dataset (" << cfg::get().dataset_file << ") with K=" << cfg::get().K); - INFO("Maximum # of threads to use (adjusted due to OMP capabilities): " << cfg::get().max_threads); - std::unique_ptr traceraii; - if (cfg::get().tt.enable || cfg::get().developer_mode) { - traceraii.reset(new TimeTracerRAII(argv[0], - cfg::get().tt.granularity, - cfg::get().output_dir, std::to_string(cfg::get().K))); - INFO("Time tracing is enabled"); - } - - TIME_TRACE_SCOPE("spades"); - spades::assemble_genome(true); - } catch (std::bad_alloc const &e) { - std::cerr << "Not enough memory to run SPAdes. " << e.what() << std::endl; - MPI_Abort(MPI_COMM_WORLD, EINTR); - return EINTR; - } catch (std::exception const &e) { - std::cerr << "Exception caught " << e.what() << std::endl; - MPI_Abort(MPI_COMM_WORLD, EINTR); - return EINTR; - } catch (...) 
{ - std::cerr << "Unknown exception caught " << std::endl; - MPI_Abort(MPI_COMM_WORLD, EINTR); - return EINTR; - } - - unsigned ms = (unsigned) pc.time_ms(); - unsigned secs = (ms / 1000) % 60; - unsigned mins = (ms / 1000 / 60) % 60; - unsigned hours = (ms / 1000 / 60 / 60); - INFO("Assembling time: " << hours << " hours " << mins << " minutes " << secs << " seconds"); - - // OK - int success = partask::finalize(); - VERIFY(success); - return 0; -} diff --git a/src/test/mpi/mpi_test.cpp b/src/test/mpi/mpi_test.cpp index 849a61536d..91ba0c7738 100644 --- a/src/test/mpi/mpi_test.cpp +++ b/src/test/mpi/mpi_test.cpp @@ -13,7 +13,7 @@ #include "pipeline/partask_mpi.hpp" -#include "utils/logger/mpi_log_writers.hpp" +#include "mpi/common/utils/logger/mpi_log_writers.hpp" void create_console_logger() { using namespace logging; From 751b78220ed5398c668af5d25e2782fa022b59f0 Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Tue, 11 Jan 2022 11:21:17 +0300 Subject: [PATCH 080/102] Add time tracer annotations for MPI stage manager. Some cleanup here and there --- src/common/pipeline/stage.cpp | 10 +- .../hpcspades/common/pipeline/mpi_stage.cpp | 107 +++++++++++------- 2 files changed, 71 insertions(+), 46 deletions(-) diff --git a/src/common/pipeline/stage.cpp b/src/common/pipeline/stage.cpp index f14bcc8c98..f843d4f7da 100644 --- a/src/common/pipeline/stage.cpp +++ b/src/common/pipeline/stage.cpp @@ -195,7 +195,10 @@ void StageManager::run(graph_pack::GraphPack& g, AssemblyStage *stage = cur_stage->get(); if (stage->run_on_load()) { INFO("STAGE == " << stage->name() << " (id: " << stage->id() << ")"); - stage->prepare(g, start_from); + { + TIME_TRACE_SCOPE("prepare", stage->name()); + stage->prepare(g, start_from); + } { TIME_TRACE_SCOPE(stage->name()); stage->run(g); @@ -207,7 +210,10 @@ void StageManager::run(graph_pack::GraphPack& g, AssemblyStage *stage = start_stage->get(); INFO("STAGE == " << stage->name() << " (id: " << stage->id() << ")"); - stage->prepare(g, start_from); + { + TIME_TRACE_SCOPE("prepare", stage->name()); + stage->prepare(g, start_from); + } { TIME_TRACE_SCOPE(stage->name()); stage->run(g, start_from); diff --git a/src/projects/hpcspades/common/pipeline/mpi_stage.cpp b/src/projects/hpcspades/common/pipeline/mpi_stage.cpp index 71d551fb73..f73f4292ce 100644 --- a/src/projects/hpcspades/common/pipeline/mpi_stage.cpp +++ b/src/projects/hpcspades/common/pipeline/mpi_stage.cpp @@ -11,6 +11,7 @@ #include "io/binary/graph_pack.hpp" #include "io/dataset_support/read_converter.hpp" +#include "utils/perf/timetracer.hpp" #include "utils/logger/log_writers.hpp" #include @@ -39,6 +40,31 @@ class PhaseIdComparator { namespace spades_mpi { +static void SyncWorld(graph_pack::GraphPack& gp, + bool master) { + INFO("Syncing world for MPI parallel section"); + const size_t deadbeef = 0xDEADBEEF; + if (master) { + TIME_TRACE_SCOPE("sync world", "master"); + partask::OutputMPIStreamBcast s(0); + io::binary::FullPackIO().BinWrite(s, gp); + io::binary::BinWrite(s, deadbeef); + debruijn_graph::config::write_lib_data(s); + io::binary::BinWrite(s, deadbeef); + } else { + TIME_TRACE_SCOPE("sync world", "worker"); + partask::InputMPIStreamBcast s(0); + io::binary::FullPackIO().BinRead(s, gp); + size_t db; + io::binary::BinRead(s, db); + VERIFY_MSG(db == deadbeef, "Values " << db << " " << deadbeef); + debruijn_graph::config::load_lib_data(s); + io::binary::BinRead(s, db); + VERIFY(db == deadbeef); + } + INFO("World synced"); +} + void MPICompositeStageBase::run(graph_pack::GraphPack& gp, const 
char* started_from) { // The logic here is as follows. By this time StageManager already called @@ -61,6 +87,7 @@ void MPICompositeStageBase::run(graph_pack::GraphPack& gp, std::string composite_id(id()); composite_id += ":"; composite_id += prev_phase->id(); + TIME_TRACE_SCOPE("load phase", composite_id); prev_phase->load(gp, parent_->saves_policy().SavesPath(), composite_id.c_str()); } } @@ -74,35 +101,23 @@ void MPICompositeStageBase::run(graph_pack::GraphPack& gp, PhaseBase *phase = start_phase->get(); bool cparallel = phase->distributed(); + // Execute distributed task both on master and worker if (cparallel) { + // If previous phase was not parallel, we need to sync the world if (!pparallel) { partask::critical_ordered([this] { if (worker()) { io::ConvertIfNeeded(cfg::get_writable().ds.reads, cfg::get().max_threads); } }); - INFO("Syncing world for MPI parallel section"); - const size_t deadbeef = 0xDEADBEEF; - if (master()) { - partask::OutputMPIStreamBcast s(0); - io::binary::FullPackIO().BinWrite(s, gp); - io::binary::BinWrite(s, deadbeef); - debruijn_graph::config::write_lib_data(s); - io::binary::BinWrite(s, deadbeef); - } else { - partask::InputMPIStreamBcast s(0); - io::binary::FullPackIO().BinRead(s, gp); - size_t db; - io::binary::BinRead(s, db); - VERIFY(db == deadbeef); - debruijn_graph::config::load_lib_data(s); - io::binary::BinRead(s, db); - VERIFY(db == deadbeef); - } - INFO("World synced"); + SyncWorld(gp, master()); } + INFO("MPI PROCEDURE == " << phase->name() << (master() ? " (master)" : " (worker)")); - phase->run(gp, started_from); + { + TIME_TRACE_SCOPE(phase->name(), master() ? " (master)" : " (worker)"); + phase->run(gp, started_from); + } // Do saves only on master node if (parent_->saves_policy().EnabledCheckpoints(id()) && master()) { @@ -110,17 +125,23 @@ void MPICompositeStageBase::run(graph_pack::GraphPack& gp, composite_id += ":"; composite_id += phase->id(); + TIME_TRACE_SCOPE("save phase", composite_id); phase->save(gp, parent_->saves_policy().SavesPath(), composite_id.c_str()); } } else { if (master()) { INFO("PROCEDURE == " << phase->name()); - phase->run(gp, started_from); + { + TIME_TRACE_SCOPE(phase->name(), "(non-distributed)"); + phase->run(gp, started_from); + } + if (parent_->saves_policy().EnabledCheckpoints(id())) { std::string composite_id(id()); composite_id += ":"; composite_id += phase->id(); + TIME_TRACE_SCOPE("save phase", composite_id); phase->save(gp, parent_->saves_policy().SavesPath(), composite_id.c_str()); } } else { @@ -176,38 +197,29 @@ void MPIStageManager::run(graph_pack::GraphPack& g, continue; } + // Execute distributed task both on master and worker bool cparallel = stage->distributed(); if (cparallel) { + // If the previous stage was not parallel, we need to sync the wolrd if (!pparallel) { partask::critical_ordered([this] { if (worker()) { io::ConvertIfNeeded(cfg::get_writable().ds.reads, cfg::get().max_threads); } }); - INFO("Syncing world for MPI parallel section"); - const size_t deadbeef = 0xDEADBEEF; - if (master()) { - partask::OutputMPIStreamBcast s(0); - io::binary::FullPackIO().BinWrite(s, g); - io::binary::BinWrite(s, deadbeef); - debruijn_graph::config::write_lib_data(s); - io::binary::BinWrite(s, deadbeef); - } else { - partask::InputMPIStreamBcast s(0); - io::binary::FullPackIO().BinRead(s, g); - size_t db; - io::binary::BinRead(s, db); - VERIFY_MSG(db == deadbeef, "Values " << db << " " << deadbeef); - debruijn_graph::config::load_lib_data(s); - io::binary::BinRead(s, db); - VERIFY(db == deadbeef); - } - 
INFO("World synced"); + SyncWorld(g, master()); } INFO("MPI STAGE == " << stage->name() << (master() ? " (master)" : " (worker)")); - stage->prepare(g, start_from); - stage->run(g, start_from); + { + TIME_TRACE_SCOPE("prepare", stage->name()); + stage->prepare(g, start_from); + } + + { + TIME_TRACE_SCOPE(stage->name(), master() ? " (master)" : " (worker)"); + stage->run(g, start_from); + } // Do saves only on master node if (saves_policy.EnabledCheckpoints(stage->id()) && master()) @@ -215,10 +227,17 @@ void MPIStageManager::run(graph_pack::GraphPack& g, } else { if (master()) { INFO("STAGE == " << stage->name()); - stage->prepare(g, start_from); - stage->run(g, start_from); + { + TIME_TRACE_SCOPE("prepare", stage->name()); + stage->prepare(g, start_from); + } + { + TIME_TRACE_SCOPE(stage->name(), "(non-distributed)"); + stage->run(g, start_from); + } if (saves_policy.EnabledCheckpoints(stage->id())) { auto prev_saves = saves_policy.GetLastCheckpoint(); + TIME_TRACE_SCOPE("save", saves_policy.SavesPath().c_str()); stage->save(g, saves_policy.SavesPath()); saves_policy.UpdateCheckpoint(stage->id()); if (!prev_saves.empty() && saves_policy.RemovePreviousCheckpoint()) { From b4bf0e51edb291484e8be31d9c6f52fe90d98eaf Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Tue, 11 Jan 2022 11:47:09 +0300 Subject: [PATCH 081/102] Do not overwrite time traces from different nodes --- src/projects/hpcspades/main_mpi.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/projects/hpcspades/main_mpi.cpp b/src/projects/hpcspades/main_mpi.cpp index 92e9f616f9..29bca4b5fb 100644 --- a/src/projects/hpcspades/main_mpi.cpp +++ b/src/projects/hpcspades/main_mpi.cpp @@ -17,6 +17,8 @@ #include "k_range.hpp" #include "version.hpp" +#include + namespace spades_mpi { void assemble_genome(bool mpi); } @@ -25,7 +27,7 @@ struct TimeTracerRAII { TimeTracerRAII(llvm::StringRef program_name, unsigned granularity = 500, const std::string &prefix = "", const std::string &suffix = "") { - time_trace_file_ = prefix + "spades_time_trace_" + suffix + ".json"; + time_trace_file_ = prefix + "spades_time_trace_" + suffix + "_" + std::to_string(partask::world_rank()) + ".json"; llvm::timeTraceProfilerInitialize(granularity, program_name); } ~TimeTracerRAII() { From 0a0f75b8d8dc719400f3a9e36b2cda529c825ad9 Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Tue, 11 Jan 2022 11:47:36 +0300 Subject: [PATCH 082/102] Cleanup --- .../alignment/sequence_mapper_notifier.hpp | 2 +- .../sequence_mapper_notifier_mpi.cpp | 33 +++++++++++++++-- .../sequence_mapper_notifier_mpi.hpp | 36 +++++-------------- 3 files changed, 40 insertions(+), 31 deletions(-) diff --git a/src/common/alignment/sequence_mapper_notifier.hpp b/src/common/alignment/sequence_mapper_notifier.hpp index c0c0470278..fbb5d7c342 100644 --- a/src/common/alignment/sequence_mapper_notifier.hpp +++ b/src/common/alignment/sequence_mapper_notifier.hpp @@ -131,7 +131,7 @@ class SequenceMapperNotifier { void NotifyMergeBuffer(size_t ilib, size_t ithread) const; protected: - std::vector > listeners_; //first vector's size = count libs + std::vector listeners_; //first vector's size = count libs }; class MapLibBase { diff --git a/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.cpp b/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.cpp index 65e3f2390b..2e8b343aca 100644 --- a/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.cpp +++ 
b/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.cpp @@ -6,8 +6,6 @@ #include "sequence_mapper_notifier_mpi.hpp" -#include "io/reads/read_stream_vector.hpp" - namespace debruijn_graph { void SequenceMapperNotifierMPI::PyramidMergeMPI(SequenceMapperListener &listener) { size_t mpi_size = partask::world_size(); @@ -31,4 +29,35 @@ void SequenceMapperNotifierMPI::PyramidMergeMPI(SequenceMapperListener &listener } } } + + +void SequenceMapperNotifierMPI::SyncListeners(ListenersContainer &listeners) { + if (partask::world_size() == 1) + return; + + const size_t deadbeef = 0xDEADBEEF; + INFO("Syncing listeners..."); + if (partask::master()) { + TIME_TRACE_SCOPE("sync listeners", "master"); + partask::OutputMPIStreamBcast os(0); + for (const auto &listener: listeners) { + io::binary::BinWrite(os, deadbeef); + listener->Serialize(os); + io::binary::BinWrite(os, deadbeef); + } + } else { + TIME_TRACE_SCOPE("sync listeners", "worker"); + partask::InputMPIStreamBcast is(0); + for (const auto &listener: listeners) { + size_t sz; + io::binary::BinRead(is, sz); + VERIFY(sz == deadbeef); + listener->Deserialize(is); + io::binary::BinRead(is, sz); + VERIFY(sz == deadbeef); + } + } + INFO("Listeners synced"); +} + } // namespace debruijn_graph diff --git a/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.hpp b/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.hpp index cfc5c0b95b..a8a8eddc2d 100644 --- a/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.hpp +++ b/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.hpp @@ -12,9 +12,7 @@ #include "assembly_graph/paths/mapping_path.hpp" #include "assembly_graph/core/graph.hpp" -#include "io/reads/paired_read.hpp" #include "io/reads/read_stream_vector.hpp" - #include "utils/perf/timetracer.hpp" #include "projects/hpcspades/common/pipeline/partask_mpi.hpp" @@ -24,6 +22,7 @@ namespace debruijn_graph { class SequenceMapperNotifierMPI : public SequenceMapperNotifier { void PyramidMergeMPI(SequenceMapperListener &listener); + void SyncListeners(ListenersContainer &listeners); public: using SequenceMapperNotifier::SequenceMapperNotifier; @@ -46,35 +45,16 @@ class SequenceMapperNotifierMPI : public SequenceMapperNotifier { }); INFO("Merging results..."); - for (const auto &listener: listeners_[lib_index]) { - INFO("Merging listener " << listener->name()); - PyramidMergeMPI(*listener); + { + TIME_TRACE_SCOPE("merge listeners"); + for (const auto &listener: listeners_[lib_index]) { + INFO("Merging listener " << listener->name()); + PyramidMergeMPI(*listener); + } } INFO("Listeners merged"); - if (partask::world_size() > 1) { - const size_t deadbeef = 0xDEADBEEF; - INFO("Syncing listeners..."); - if (partask::master()) { - partask::OutputMPIStreamBcast os(0); - for (const auto &listener: listeners_[lib_index]) { - io::binary::BinWrite(os, deadbeef); - listener->Serialize(os); - io::binary::BinWrite(os, deadbeef); - } - } else { - partask::InputMPIStreamBcast is(0); - for (const auto &listener: listeners_[lib_index]) { - size_t sz; - io::binary::BinRead(is, sz); - VERIFY(sz == deadbeef); - listener->Deserialize(is); - io::binary::BinRead(is, sz); - VERIFY(sz == deadbeef); - } - } - INFO("Listeners synced"); - } + SyncListeners(listeners_[lib_index]); } template From b833f1c1953e14b0944cb34644d529032baece4b Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Tue, 11 Jan 2022 13:16:51 +0300 Subject: [PATCH 083/102] Time tracing for partask --- 
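Note (placed after the --- separator, so it is not part of the commit message): this patch wraps the partask MPI primitives (barrier, membroadcast, memsend/memrecv, allreduce, the buffered send/recv and bcast helpers, broadcast, send/recv, and the TaskRegistry process/sync/merge hooks) in TIME_TRACE_SCOPE so their cost shows up in the per-node time trace enabled earlier in this series. TIME_TRACE_SCOPE itself is defined in utils/perf/timetracer.hpp and is not part of this diff; below is a minimal illustrative sketch of the RAII pattern it is assumed to follow. The names in the sketch are hypothetical, and the real macro presumably records events for the LLVM time-trace profiler initialized in main_mpi.cpp rather than printing.

// Illustrative sketch only -- not the actual TIME_TRACE_SCOPE implementation.
// An RAII guard: constructing it stamps the start time; destroying it at the
// end of the enclosing scope records the elapsed time under the given name.
#include <chrono>
#include <iostream>
#include <string>

class ScopedTimeTrace {
public:
    explicit ScopedTimeTrace(std::string name)
            : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}
    ~ScopedTimeTrace() {
        auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(
                      std::chrono::steady_clock::now() - start_).count();
        std::cerr << name_ << " took " << ms << " ms\n";
    }
private:
    std::string name_;
    std::chrono::steady_clock::time_point start_;
};

// Usage mirroring the annotations in this patch, e.g. for partask::barrier():
//     void barrier() {
//         ScopedTimeTrace t("partask::barrier");
//         MPI_Barrier(MPI_COMM_WORLD);
//     }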
.../hpcspades/common/pipeline/partask_mpi.hpp | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/projects/hpcspades/common/pipeline/partask_mpi.hpp b/src/projects/hpcspades/common/pipeline/partask_mpi.hpp index 329001c8ce..7341554c3e 100644 --- a/src/projects/hpcspades/common/pipeline/partask_mpi.hpp +++ b/src/projects/hpcspades/common/pipeline/partask_mpi.hpp @@ -13,6 +13,7 @@ #include "utils/verify.hpp" #include "utils/logger/logger.hpp" #include "utils/stl_utils.hpp" +#include "utils/perf/timetracer.hpp" #include #include @@ -108,6 +109,7 @@ inline bool initialized() { inline void barrier() { ASSERT_MAIN_THREAD; + TIME_TRACE_SCOPE("partask::barrier"); static size_t count = 0; DEBUG("barrier() called " << count << " times"); ++count; @@ -119,6 +121,7 @@ const size_t MPI_MAX_COUNT = 1 << 30; // Should be <= MAX_INT inline void membroadcast(void *p, size_t count, int root = 0) { ASSERT_MAIN_THREAD; + TIME_TRACE_SCOPE("partask::membroadcast"); static size_t call_count = 0; DEBUG("membroadcast() called " << call_count << " times"); ++call_count; @@ -135,6 +138,7 @@ inline void membroadcast(void *p, size_t count, int root = 0) { inline void memsend(const void *p, size_t count, int rank, int tag = 0) { ASSERT_MAIN_THREAD; + TIME_TRACE_SCOPE("partask::memsend"); char *cp = reinterpret_cast(const_cast(p)); while (count) { size_t block_size = std::min(count, MPI_MAX_COUNT); @@ -147,6 +151,7 @@ inline void memsend(const void *p, size_t count, int rank, int tag = 0) { inline void memrecv(void *p, size_t count, int rank, int tag = MPI_ANY_TAG) { ASSERT_MAIN_THREAD; + TIME_TRACE_SCOPE("partask::memrecv"); char *cp = reinterpret_cast(p); while (count) { size_t block_size = std::min(count, MPI_MAX_COUNT); @@ -272,6 +277,7 @@ inline MPI_Datatype mpi_datatype() { template void allreduce(T *recvbuf, size_t count, MPI_Op op) { ASSERT_MAIN_THREAD; + TIME_TRACE_SCOPE("partask::allreduce"); DEBUG("allreduce started for " << count << " objects of type " << typeid(T).name()); using NoneVoidT = std::conditional_t::value, char, T>; NoneVoidT *crecvbuf = reinterpret_cast(recvbuf); @@ -362,6 +368,7 @@ struct MsgInfo { // buffers should have sizeof(MsgInfo) free bytes before the beginning! 
inline void mpi_send_buffer(char *buffer, size_t count, int destination, int tag, bool flag) { ASSERT_MAIN_THREAD; + TIME_TRACE_SCOPE("partask::mpi_send_buffer"); DEBUG("mpi_send_buffer() called"); MsgInfo info{count, flag}; memcpy(buffer - sizeof(info), &info, sizeof(info)); @@ -373,6 +380,7 @@ inline void mpi_send_buffer(char *buffer, size_t count, int destination, int tag inline MsgInfo mpi_recv_buffer(char *buffer, size_t buffer_size, int source, int tag) { DEBUG("mpi_recv_buffer() called"); + TIME_TRACE_SCOPE("partask::mpi_recv_buffer"); size_t all_count = buffer_size + sizeof(MsgInfo); VERIFY(all_count <= std::numeric_limits::max()); MPI_Status status; @@ -389,6 +397,7 @@ inline MsgInfo mpi_recv_buffer(char *buffer, size_t buffer_size, int source, int inline void mpi_send_buffer_bcast(char *buffer, size_t count, size_t buffer_size, int root, bool flag) { ASSERT_MAIN_THREAD; + TIME_TRACE_SCOPE("partask::mpi_send_buffer_bcast"); DEBUG("mpi_send_buffer_bcast() called"); MsgInfo info{count, flag}; VERIFY(info.count || info.flag); @@ -401,6 +410,7 @@ inline void mpi_send_buffer_bcast(char *buffer, size_t count, size_t buffer_size inline MsgInfo mpi_recv_buffer_bcast(char *buffer, size_t buffer_size, int root) { ASSERT_MAIN_THREAD; + TIME_TRACE_SCOPE("partask::mpi_recv_buffer_bcast"); DEBUG("mpi_recv_buffer_bcast() called"); size_t all_count = buffer_size + sizeof(MsgInfo); int rc = MPI_Bcast(buffer - sizeof(MsgInfo), static_cast(all_count), MPI_BYTE, root, MPI_COMM_WORLD); // count should be the same! @@ -568,6 +578,7 @@ class InputMPIBuffer : public std::streambuf { inline void mpi_send_buffer_async(char *buffer, size_t count, int destination, int tag, bool flag, MPI_Request &req) { ASSERT_MAIN_THREAD; + TIME_TRACE_SCOPE("partask::mpi_send_buffer_async"); DEBUG("mpi_send_buffer() called"); MsgInfo info{count, flag}; memcpy(buffer - sizeof(info), &info, sizeof(info)); @@ -579,6 +590,7 @@ inline void mpi_send_buffer_async(char *buffer, size_t count, int destination, i inline void mpi_recv_buffer_async(char *buffer, size_t buffer_size, int source, int tag, MPI_Request &req) { DEBUG("mpi_recv_buffer() called"); + TIME_TRACE_SCOPE("partask::mpi_recv_buffer_async"); size_t all_count = buffer_size + sizeof(MsgInfo); VERIFY(all_count <= std::numeric_limits::max()); int rc = MPI_Irecv(buffer - sizeof(MsgInfo), static_cast(all_count), MPI_BYTE, source, tag, MPI_COMM_WORLD, &req); @@ -586,6 +598,7 @@ inline void mpi_recv_buffer_async(char *buffer, size_t buffer_size, int source, } inline MsgInfo mpi_recv_buffer_wait(char *buffer, MPI_Request &req) { + TIME_TRACE_SCOPE("partask::mpi_recv_buffer_wait"); MPI_Status status; MPI_Wait(&req, &status); int actual_count; @@ -599,6 +612,7 @@ inline MsgInfo mpi_recv_buffer_wait(char *buffer, MPI_Request &req) { inline void mpi_send_buffer_bcast_async(char *buffer, size_t count, size_t buffer_size, int root, bool flag, MPI_Request &req) { ASSERT_MAIN_THREAD; + TIME_TRACE_SCOPE("partask::mpi_send_buffer_bcast_async"); DEBUG("mpi_send_buffer_bcast_async() called. 
count = " << count << " flag " << flag); MsgInfo info{count, flag}; VERIFY(info.count || info.flag); @@ -611,6 +625,7 @@ inline void mpi_send_buffer_bcast_async(char *buffer, size_t count, size_t buffe inline void mpi_recv_buffer_bcast_async(char *buffer, size_t buffer_size, int root, MPI_Request &req) { ASSERT_MAIN_THREAD; + TIME_TRACE_SCOPE("partask::mpi_recv_buffer_bcast_async"); DEBUG("mpi_recv_buffer_bcast() called"); size_t all_count = buffer_size + sizeof(MsgInfo); int rc = MPI_Ibcast(buffer - sizeof(MsgInfo), static_cast(all_count), MPI_BYTE, root, MPI_COMM_WORLD, &req); // count should be the same! @@ -618,6 +633,7 @@ inline void mpi_recv_buffer_bcast_async(char *buffer, size_t buffer_size, int ro } inline MsgInfo mpi_recv_buffer_bcast_wait(char *buffer, MPI_Request &req) { + TIME_TRACE_SCOPE("partask::mpi_recv_buffer_bcast_wait"); MPI_Wait(&req, MPI_STATUS_IGNORE); MsgInfo info; memcpy(&info, buffer - sizeof(MsgInfo), sizeof(info)); @@ -959,6 +975,8 @@ class ChunkedStringStream : public MPIStream template void broadcast(T &data, Serialize &&serialize, Deserialize &&deserialize, int root = 0) { + TIME_TRACE_SCOPE("partask::broadcast"); + ASSERT_MAIN_THREAD; DEBUG("Broadcasting of type " << typeid(T).name()); @@ -983,6 +1001,8 @@ template void broadcast_full_dump(T &data, Serialize &&serialize, Deserialize &&deserialize, int root = 0) { ASSERT_MAIN_THREAD; + TIME_TRACE_SCOPE("partask::broadcast_full_dump"); + DEBUG("Broadcasting of type " << typeid(T).name()); static size_t call_count = 0; @@ -1042,6 +1064,8 @@ auto broadcast(T &data, int root = 0) -> decltype(std::declval auto send(const T &data, Serialize &&serialize, int destination, int tag = 0) -> decltype(std::forward(serialize)(declref(), data), void()) { ASSERT_MAIN_THREAD; + TIME_TRACE_SCOPE("partask::send"); + OutputMPIStream os(destination, tag); DEBUG("Serialization..."); std::forward(serialize)(os, data); @@ -1050,6 +1074,8 @@ auto send(const T &data, Serialize &&serialize, int destination, int tag = 0) -> template auto recv(T &data, Deserialize &&deserialize, int source, int tag = MPI_ANY_TAG) -> decltype(std::forward(deserialize)(declref(), data), void()) { ASSERT_MAIN_THREAD; + TIME_TRACE_SCOPE("partask::recv"); + InputMPIStream is(source, tag); DEBUG("Serialization..."); std::forward(deserialize)(is, data); @@ -1171,15 +1197,18 @@ class TaskRegistry { } void process(std::istream &is, std::ostream &os) override { + TIME_TRACE_SCOPE("partask::task:process"); process_impl(is, os); } void sync(void) override { + TIME_TRACE_SCOPE("partask::task:sync"); sync_impl(); } template > decltype(auto) merge(std::enable_if_t &piss) { + TIME_TRACE_SCOPE("partask::task:merge"); auto merge_args = std::tuple_cat(std::make_tuple(piss), locals_); auto merge_call = [&](auto &&... ts) { return task_.merge(std::forward(ts)...); }; return std::apply(merge_call, merge_args); @@ -1258,6 +1287,7 @@ class TaskRegistry { template decltype(auto) operator()(Args &&... 
args) const { + TIME_TRACE_SCOPE("partask::job"); VERIFY(task_registry_.world_rank_ == 0); Task task(std::forward(args)...); @@ -1522,6 +1552,7 @@ auto make_vector_splitter(size_t n, const std::vector& data) { template auto all_equal(const T &v) -> decltype(broadcast(*new T(v)), T(v) == T(v), bool()) { + TIME_TRACE_SCOPE("partask::all_equal"); T cv(v); broadcast(cv); return v == cv; @@ -1529,6 +1560,8 @@ auto all_equal(const T &v) -> decltype(broadcast(*new T(v)), T(v) == T(v), bool( template auto critical_ordered(F &&f) -> decltype(std::forward(f)()) { + TIME_TRACE_SCOPE("partask::critically_ordered"); + using wrap_type = decltype(std::forward(f)(), detail::wrap_void()); std::unique_ptr pwrap; From eb09ca0a72ca162e80d48d96c599d1bc49856314 Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Tue, 11 Jan 2022 15:51:51 +0300 Subject: [PATCH 084/102] Better time tracing --- .../alignment/sequence_mapper_notifier.hpp | 5 +- .../debruijn_graph_constructor.hpp | 46 +++++++++++++------ src/common/io/binary/graph_pack.cpp | 5 +- src/common/utils/stl_utils.hpp | 29 +++++++++++- .../debruijn_graph_constructor_mpi.hpp | 10 +++- .../kmer_extension_index_builder_mpi.hpp | 1 + .../hpcspades/common/pipeline/partask_mpi.hpp | 4 ++ .../common/stages/construction_mpi.cpp | 7 ++- 8 files changed, 85 insertions(+), 22 deletions(-) diff --git a/src/common/alignment/sequence_mapper_notifier.hpp b/src/common/alignment/sequence_mapper_notifier.hpp index fbb5d7c342..a628002153 100644 --- a/src/common/alignment/sequence_mapper_notifier.hpp +++ b/src/common/alignment/sequence_mapper_notifier.hpp @@ -16,6 +16,7 @@ #include "io/reads/paired_read.hpp" #include "io/reads/read_stream_vector.hpp" #include "utils/perf/timetracer.hpp" +#include "utils/stl_utils.hpp" #include #include @@ -49,8 +50,8 @@ class SequenceMapperListener { VERIFY_MSG(false, "MergeFromStream() is not implemented"); } - virtual const char* name() const { - return typeid(*this).name(); + virtual const std::string name() const { + return utils::type_name(typeid(*this).name()); } virtual ~SequenceMapperListener() {} diff --git a/src/common/assembly_graph/construction/debruijn_graph_constructor.hpp b/src/common/assembly_graph/construction/debruijn_graph_constructor.hpp index 8cc868929a..fb6549d264 100644 --- a/src/common/assembly_graph/construction/debruijn_graph_constructor.hpp +++ b/src/common/assembly_graph/construction/debruijn_graph_constructor.hpp @@ -12,6 +12,7 @@ #include "kmer_index/extension_index/kmer_extension_index.hpp" #include "utils/parallel/openmp_wrapper.h" #include "utils/parallel/parallel_wrapper.hpp" +#include "utils/perf/timetracer.hpp" #include namespace debruijn_graph { @@ -318,6 +319,8 @@ class UnbranchingPathExtractor { //TODO very large vector is returned. But I hate to make all those artificial changes that can fix it. const std::vector ExtractUnbranchingPaths(std::vector &its) const { + TIME_TRACE_SCOPE("UnbranchingPathExtractor::ExtractUnbranchingPaths"); + INFO("Extracting unbranching paths"); if (its.size() == 0) { INFO("No input iterators, returning empty vector"); @@ -354,6 +357,8 @@ class UnbranchingPathExtractor { // This methods collects all loops that were not extracted by finding // unbranching paths because there are no junctions on loops. 
const std::vector CollectLoops(unsigned nchunks) { + TIME_TRACE_SCOPE("UnbranchingPathExtractor::CollectLoops"); + INFO("Collecting perfect loops"); auto its = origin_.kmer_begin(nchunks); std::vector > starts(its.size()); @@ -507,10 +512,16 @@ class FastGraphFromSequencesConstructor { graph.ereserve(2*seq_size + seq_size / 100); INFO("Collecting link records") - CollectLinkRecords(helper, graph, records, sequences); + { + TIME_TRACE_SCOPE("ConstructGraph::CollectLinkRecords"); + CollectLinkRecords(helper, graph, records, sequences); + } INFO("Ordering link records") // We sort by Vertex and then by EdgeID and RC/Start mask in order to combine together records accociated with the same vertex with a special order in each group - parallel::sort(records.begin(), records.end(), LinkRecord::CompareByVertexKMerEdgeIdAndMask); + { + TIME_TRACE_SCOPE("ConstructGraph::OrderLinkRecords"); + parallel::sort(records.begin(), records.end(), LinkRecord::CompareByVertexKMerEdgeIdAndMask); + } INFO("Sorting done"); // Now we extract starting positions of each vertex group @@ -525,22 +536,28 @@ class FastGraphFromSequencesConstructor { // Now we sort vertices by their lowest edge and mask (they are unique since each edge has only one start and one stop). // It is a deterministic order while ordering by vertex kmer perfect hash is not (hashes are dependent on nthreads/nnodes) INFO("Sorting LinkRecords..."); - parallel::sort(unique_record_indices.begin(), unique_record_indices.end(), - [&records](size_t i, size_t j) { return records[i].EdgeAndMask() < records[j].EdgeAndMask(); }); + { + TIME_TRACE_SCOPE("ConstructGraph::SortLinkRecords"); + parallel::sort(unique_record_indices.begin(), unique_record_indices.end(), + [&records](size_t i, size_t j) { return records[i].EdgeAndMask() < records[j].EdgeAndMask(); }); + } INFO("LinkRecords sorted"); size_t size = unique_record_indices.size(); INFO("Total " << size << " vertices to create"); graph.vreserve(2 * size + size / 100); INFO("Connecting the graph"); - uint64_t min_id = graph.min_id(); -# pragma omp parallel for schedule(guided) - for (size_t vertex_num = 0; vertex_num < size; ++vertex_num) { - size_t i = unique_record_indices[vertex_num]; - - VertexId v = helper.CreateVertex(DeBruijnVertexData(graph.k()), min_id + (vertex_num << 1)); - for (size_t j = i; j < records.size() && records[j].GetHash() == records[i].GetHash(); j++) { - LinkEdge(helper, graph, v, records[j].GetEdge(), records[j].IsStart(), records[j].IsRC()); + { + TIME_TRACE_SCOPE("ConstructGraph::ConnectGraph"); + uint64_t min_id = graph.min_id(); +# pragma omp parallel for schedule(guided) + for (size_t vertex_num = 0; vertex_num < size; ++vertex_num) { + size_t i = unique_record_indices[vertex_num]; + + VertexId v = helper.CreateVertex(DeBruijnVertexData(graph.k()), min_id + (vertex_num << 1)); + for (size_t j = i; j < records.size() && records[j].GetHash() == records[i].GetHash(); j++) { + LinkEdge(helper, graph, v, records[j].GetEdge(), records[j].IsStart(), records[j].IsRC()); + } } } } @@ -574,7 +591,10 @@ class DeBruijnGraphExtentionConstructor { else edge_sequences = UnbranchingPathExtractor(origin_, kmer_size_).ExtractUnbranchingPaths(nchunks); INFO("Sorting edges..."); - parallel::sort(edge_sequences.begin(), edge_sequences.end(), Sequence::RawCompare); + { + TIME_TRACE_SCOPE("Sorting edges"); + parallel::sort(edge_sequences.begin(), edge_sequences.end(), Sequence::RawCompare); + } INFO("Edges sorted"); FastGraphFromSequencesConstructor(kmer_size_, origin_).ConstructGraph(graph_, 
edge_sequences); } diff --git a/src/common/io/binary/graph_pack.cpp b/src/common/io/binary/graph_pack.cpp index e83f57e1f8..a9b681201a 100644 --- a/src/common/io/binary/graph_pack.cpp +++ b/src/common/io/binary/graph_pack.cpp @@ -17,6 +17,7 @@ #include "paired_index.hpp" #include "positions.hpp" #include "trusted_paths.hpp" +#include "utils/stl_utils.hpp" namespace io { @@ -90,7 +91,7 @@ class Loader { */ template void Load() { - INFO("Trying to load " << typeid(T).name()); + INFO("Trying to load " << utils::type_name()); auto &component = gp.get_mutable(); if (component.IsAttached()) component.Detach(); @@ -121,7 +122,7 @@ class BinReader { */ template void Read() { - INFO("Trying to read " << typeid(T).name()); + INFO("Trying to read " << utils::type_name()); auto &component = gp.get_mutable(); if (component.IsAttached()) component.Detach(); diff --git a/src/common/utils/stl_utils.hpp b/src/common/utils/stl_utils.hpp index 545ef507c8..4f3fa55dfe 100644 --- a/src/common/utils/stl_utils.hpp +++ b/src/common/utils/stl_utils.hpp @@ -18,6 +18,8 @@ #include #include +#include + namespace utils { template @@ -197,8 +199,33 @@ static inline void trim(std::string &s) { ltrim(s); } +template +std::string type_name() { + using TR = typename std::remove_reference::type; + std::unique_ptr + own(abi::__cxa_demangle(typeid(TR).name(), nullptr, nullptr, nullptr), + std::free); + std::string r = own != nullptr ? own.get() : typeid(TR).name(); + if (std::is_const::value) + r += " const"; + if (std::is_volatile::value) + r += " volatile"; + if (std::is_lvalue_reference::value) + r += "&"; + else if (std::is_rvalue_reference::value) + r += "&&"; + return r; +} + +static inline std::string type_name(const char *name) { + std::unique_ptr + own(abi::__cxa_demangle(name, nullptr, nullptr, nullptr), + std::free); + return own != nullptr ? 
own.get() : name; } +} // namespace utils + namespace std { template std::ostream &operator<<(std::ostream &os, std::pair const &pair) { @@ -244,4 +271,4 @@ std::ostream &operator<<(std::ostream &os, const std::map &map) { return os; } -} +} // namespace std diff --git a/src/projects/hpcspades/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp b/src/projects/hpcspades/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp index 5046927f1e..f8c50e9c2f 100644 --- a/src/projects/hpcspades/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp +++ b/src/projects/hpcspades/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp @@ -55,7 +55,10 @@ class DeBruijnGraphExtentionConstructorTask { UnbranchingPathExtractor extractor(index, g.k()); auto seqs = extractor.ExtractUnbranchingPaths(local_iters); - index.RemoveSequences(seqs); + { + TIME_TRACE_SCOPE("RemoveSequences"); + index.RemoveSequences(seqs); + } partask::allreduce(index.raw_data(), index.raw_size(), MPI_BAND); io::binary::BinWrite(os, partask::fast_local_transfer(seqs)); } @@ -81,7 +84,10 @@ class DeBruijnGraphExtentionConstructorTask { } INFO("Sorting edges..."); - parallel::sort(seqs.begin(), seqs.end(), Sequence::RawCompare); + { + TIME_TRACE_SCOPE("Sorting edges"); + parallel::sort(seqs.begin(), seqs.end(), Sequence::RawCompare); + } INFO("Sorting edges finished"); FastGraphFromSequencesConstructor(g.k(), index).ConstructGraph(g, seqs); diff --git a/src/projects/hpcspades/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp b/src/projects/hpcspades/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp index 5c5c22834b..8e6a8ba540 100644 --- a/src/projects/hpcspades/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp +++ b/src/projects/hpcspades/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp @@ -204,6 +204,7 @@ class MergeKMerFilesTask { } void process(std::istream &is, std::ostream &) { + TIME_TRACE_SCOPE("MergeKMerFiles"); std::vector residuals; while (is.get() && is) { size_t i; diff --git a/src/projects/hpcspades/common/pipeline/partask_mpi.hpp b/src/projects/hpcspades/common/pipeline/partask_mpi.hpp index 7341554c3e..1b7badcfda 100644 --- a/src/projects/hpcspades/common/pipeline/partask_mpi.hpp +++ b/src/projects/hpcspades/common/pipeline/partask_mpi.hpp @@ -1198,17 +1198,20 @@ class TaskRegistry { void process(std::istream &is, std::ostream &os) override { TIME_TRACE_SCOPE("partask::task:process"); + TIME_TRACE_SCOPE(utils::type_name()); process_impl(is, os); } void sync(void) override { TIME_TRACE_SCOPE("partask::task:sync"); + TIME_TRACE_SCOPE(utils::type_name()); sync_impl(); } template > decltype(auto) merge(std::enable_if_t &piss) { TIME_TRACE_SCOPE("partask::task:merge"); + TIME_TRACE_SCOPE(utils::type_name()); auto merge_args = std::tuple_cat(std::make_tuple(piss), locals_); auto merge_call = [&](auto &&... ts) { return task_.merge(std::forward(ts)...); }; return std::apply(merge_call, merge_args); @@ -1243,6 +1246,7 @@ class TaskRegistry { template std::enable_if_t sync_impl() { + TIME_TRACE_SCOPE(utils::type_name()); auto sync_call = [this](auto &&... 
ts) { return task_.sync(std::forward(ts)...); }; std::apply(sync_call, locals_); } diff --git a/src/projects/hpcspades/common/stages/construction_mpi.cpp b/src/projects/hpcspades/common/stages/construction_mpi.cpp index cc563cb1f8..94d2ed907e 100644 --- a/src/projects/hpcspades/common/stages/construction_mpi.cpp +++ b/src/projects/hpcspades/common/stages/construction_mpi.cpp @@ -708,8 +708,11 @@ class PHMCoverageFiller : public ConstructionMPI::Phase { } INFO("Filling coverage and flanking coverage from PHM"); - FillCoverageAndFlankingFromPHM(coverage_map, - gp.get_mutable(), gp.get_mutable>()); + { + TIME_TRACE_SCOPE("FillCoverageAndFlankingFromPHM"); + FillCoverageAndFlankingFromPHM(coverage_map, + gp.get_mutable(), gp.get_mutable>()); + } std::vector hist; size_t maxcov = 0; From 864332a5c402817e1bce1656710b8972b080663e Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Tue, 11 Jan 2022 16:20:17 +0300 Subject: [PATCH 085/102] More information --- .../construction/debruijn_graph_constructor_mpi.hpp | 1 - src/projects/hpcspades/common/pipeline/partask_mpi.hpp | 10 +++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/projects/hpcspades/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp b/src/projects/hpcspades/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp index f8c50e9c2f..67cb661e3f 100644 --- a/src/projects/hpcspades/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp +++ b/src/projects/hpcspades/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp @@ -76,7 +76,6 @@ class DeBruijnGraphExtentionConstructorTask { } if (collect_loops_) { - INFO("Collecting perfect loops"); UnbranchingPathExtractor extractor(index, g.k()); std::vector loops = extractor.CollectLoops(omp_get_max_threads()); seqs.insert(seqs.end(), diff --git a/src/projects/hpcspades/common/pipeline/partask_mpi.hpp b/src/projects/hpcspades/common/pipeline/partask_mpi.hpp index 1b7badcfda..283a52b203 100644 --- a/src/projects/hpcspades/common/pipeline/partask_mpi.hpp +++ b/src/projects/hpcspades/common/pipeline/partask_mpi.hpp @@ -975,7 +975,7 @@ class ChunkedStringStream : public MPIStream template void broadcast(T &data, Serialize &&serialize, Deserialize &&deserialize, int root = 0) { - TIME_TRACE_SCOPE("partask::broadcast"); + TIME_TRACE_SCOPE("partask::broadcast", utils::type_name()); ASSERT_MAIN_THREAD; DEBUG("Broadcasting of type " << typeid(T).name()); @@ -1001,7 +1001,7 @@ template () + "+" + utils::type_name()); ASSERT_MAIN_THREAD; DEBUG("Broadcasting of types " << typeid(T1).name() << " " << typeid(T2).name()); @@ -1028,7 +1028,7 @@ void broadcast2(T1 &data1, Serialize1 &&serialize1, Deserialize1 &&deserialize1, template void broadcast_full_dump(T &data, Serialize &&serialize, Deserialize &&deserialize, int root = 0) { ASSERT_MAIN_THREAD; - TIME_TRACE_SCOPE("partask::broadcast_full_dump"); + TIME_TRACE_SCOPE("partask::broadcast_full_dump", utils::type_name()); DEBUG("Broadcasting of type " << typeid(T).name()); @@ -1064,7 +1064,7 @@ auto broadcast(T &data, int root = 0) -> decltype(std::declval auto send(const T &data, Serialize &&serialize, int destination, int tag = 0) -> decltype(std::forward(serialize)(declref(), data), void()) { ASSERT_MAIN_THREAD; - TIME_TRACE_SCOPE("partask::send"); + TIME_TRACE_SCOPE("partask::send", utils::type_name()); OutputMPIStream os(destination, tag); DEBUG("Serialization..."); @@ -1074,7 +1074,7 @@ auto send(const T &data, Serialize &&serialize, int destination, int tag = 0) -> 
template auto recv(T &data, Deserialize &&deserialize, int source, int tag = MPI_ANY_TAG) -> decltype(std::forward(deserialize)(declref(), data), void()) { ASSERT_MAIN_THREAD; - TIME_TRACE_SCOPE("partask::recv"); + TIME_TRACE_SCOPE("partask::recv", utils::type_name()); InputMPIStream is(source, tag); DEBUG("Serialization..."); From a20187f3e4c0d266ed6e6a2fe9ceaf2f8fb4718d Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Tue, 11 Jan 2022 18:36:25 +0300 Subject: [PATCH 086/102] Annotate PathExtend --- .../paths/bidirectional_path_container.hpp | 9 +++- .../modules/path_extend/pe_resolver.cpp | 12 +++++ .../modules/path_extend/pipeline/launcher.cpp | 47 +++++++++++++++---- 3 files changed, 59 insertions(+), 9 deletions(-) diff --git a/src/common/assembly_graph/paths/bidirectional_path_container.hpp b/src/common/assembly_graph/paths/bidirectional_path_container.hpp index ea3335fe7d..d8e79f7c6f 100644 --- a/src/common/assembly_graph/paths/bidirectional_path_container.hpp +++ b/src/common/assembly_graph/paths/bidirectional_path_container.hpp @@ -8,6 +8,9 @@ #pragma once #include "bidirectional_path.hpp" + +#include "utils/perf/timetracer.hpp" + #include #include #include @@ -116,7 +119,7 @@ class PathContainer { return ppair; } - + // This guy acquires the ownership of paths std::pair AddPair(std::unique_ptr p, std::unique_ptr cp) { @@ -146,6 +149,8 @@ class PathContainer { } void SortByLength(bool desc = true) { + TIME_TRACE_SCOPE("PathContainer::SortByLength"); + std::stable_sort(data_.begin(), data_.end(), [=](const PathPair& p1, const PathPair& p2) { if (p1.first->Empty() || p2.first->Empty() || p1.first->Length() != p2.first->Length()) { return desc ? p1.first->Length() > p2.first->Length() @@ -173,6 +178,8 @@ class PathContainer { } void FilterPaths(func::TypedPredicate pred) { + TIME_TRACE_SCOPE("PathContainer::FilterPaths"); + DEBUG("Filtering paths based on predicate"); for (auto &pp : data_) { if (pred(*pp.first)) { diff --git a/src/common/modules/path_extend/pe_resolver.cpp b/src/common/modules/path_extend/pe_resolver.cpp index c76377ed1e..7b33371f49 100644 --- a/src/common/modules/path_extend/pe_resolver.cpp +++ b/src/common/modules/path_extend/pe_resolver.cpp @@ -10,6 +10,8 @@ #include "path_deduplicator.hpp" #include "path_extender.hpp" +#include "utils/perf/timetracer.hpp" + namespace path_extend { using namespace debruijn_graph; @@ -17,6 +19,8 @@ using namespace debruijn_graph; void Deduplicate(const Graph &g, PathContainer &paths, GraphCoverageMap &coverage_map, size_t min_edge_len, size_t max_path_diff, bool equal_only) { + TIME_TRACE_SCOPE("DeduplicatePaths"); + //add sorting to guarantee survival of longest paths if max_path_diff used //paths.SortByLength(false); PathDeduplicator deduplicator(g, paths, coverage_map, min_edge_len, max_path_diff, equal_only); @@ -38,6 +42,8 @@ static bool InTwoEdgeCycle(EdgeId e, const Graph &g) { } PathContainer PathExtendResolver::MakeSimpleSeeds() const { + TIME_TRACE_SCOPE("PEResolver::MakeSimpleSeeds"); + PathContainer edges; for (EdgeId e : g_.canonical_edges()) { if (g_.int_id(e) <= 0 || InTwoEdgeCycle(e, g_)) @@ -48,6 +54,8 @@ PathContainer PathExtendResolver::MakeSimpleSeeds() const { } PathContainer PathExtendResolver::ExtendSeeds(PathContainer &seeds, CompositeExtender &composite_extender) const { + TIME_TRACE_SCOPE("PEResolver::ExtendSeeds"); + PathContainer paths; composite_extender.GrowAll(seeds, paths); return paths; @@ -57,6 +65,8 @@ PathContainer PathExtendResolver::ExtendSeeds(PathContainer &seeds, CompositeExt void 
PathExtendResolver::RemoveOverlaps(PathContainer &paths, GraphCoverageMap &coverage_map, size_t min_edge_len, size_t max_path_diff, bool end_start_only, bool cut_all) const { + TIME_TRACE_SCOPE("PEResolver::RemoveOverlaps"); + INFO("Removing overlaps"); //VERIFY(min_edge_len == 0 && max_path_diff == 0); if (!cut_all) { @@ -81,6 +91,8 @@ void PathExtendResolver::RemoveOverlaps(PathContainer &paths, GraphCoverageMap & } void PathExtendResolver::AddUncoveredEdges(PathContainer &paths, GraphCoverageMap &coverageMap) const { + TIME_TRACE_SCOPE("PEResolver::AddUncoveredEdges"); + for (EdgeId e : g_.canonical_edges()) { if (coverageMap.IsCovered(e)) continue; diff --git a/src/common/modules/path_extend/pipeline/launcher.cpp b/src/common/modules/path_extend/pipeline/launcher.cpp index f336d00894..68c41365e7 100644 --- a/src/common/modules/path_extend/pipeline/launcher.cpp +++ b/src/common/modules/path_extend/pipeline/launcher.cpp @@ -29,6 +29,7 @@ using namespace omnigraph::de; std::vector> PathExtendLauncher::ConstructPairedConnectionConditions(const ScaffoldingUniqueEdgeStorage& edge_storage) const { + TIME_TRACE_SCOPE("ConstructPairedConnectionConditions"); std::vector> conditions; const pe_config::ParamSetT::ScaffoldGraphParamsT ¶ms = params_.pset.scaffold_graph_params; @@ -109,8 +110,7 @@ void PathExtendLauncher::PrintScaffoldGraph(const scaffold_graph::ScaffoldGraph void PathExtendLauncher::MakeAndOutputScaffoldGraph() const { - if (!params_.pset.scaffold_graph_params.construct) - return; + TIME_TRACE_SCOPE("MakeAndOutputScaffoldGraph"); auto scaffold_graph = ConstructScaffoldGraph(unique_data_.main_unique_storage_); if (params_.pset.scaffold_graph_params.output) { @@ -131,7 +131,7 @@ void PathExtendLauncher::MakeAndOutputScaffoldGraph() const { void PathExtendLauncher::CountMisassembliesWithReference(const PathContainer &paths) const { if (!gp_.get().size()) return; - + bool use_main_storage = params_.pset.genome_consistency_checker.use_main_storage; size_t unresolvable_gap = unique_data_.main_unique_storage_.min_length(); ScaffoldingUniqueEdgeStorage tmp_storage; @@ -170,9 +170,10 @@ void PathExtendLauncher::CountMisassembliesWithReference(const PathContainer &pa } void PathExtendLauncher::CheckCoverageUniformity() { + TIME_TRACE_SCOPE("CheckCoverageUniformity"); if (params_.mode != config::pipeline_type::base) return; - + CoverageUniformityAnalyzer coverage_analyzer(graph_, std::min(size_t(1000), stats::Nx(graph_, 50) - 1)); double median_coverage = coverage_analyzer.CountMedianCoverage(); double uniformity_fraction = coverage_analyzer.UniformityFraction(unique_data_.unique_variation_, median_coverage); @@ -182,6 +183,7 @@ void PathExtendLauncher::CheckCoverageUniformity() { } void PathExtendLauncher::EstimateUniqueEdgesParams() { + TIME_TRACE_SCOPE("EstimateUniqueEdgesParams"); bool uniform_coverage = false; if (params_.pset.uniqueness_analyser.enabled) { INFO("Autodetecting unique edge set parameters..."); @@ -209,6 +211,8 @@ void PathExtendLauncher::EstimateUniqueEdgesParams() { void PathExtendLauncher::FillUniqueEdgeStorage() { + TIME_TRACE_SCOPE("FillUniqueEdgeStorage"); + ScaffoldingUniqueEdgeAnalyzer unique_edge_analyzer(gp_, unique_data_.min_unique_length_, unique_data_.unique_variation_); unique_edge_analyzer.FillUniqueEdgeStorage(unique_data_.main_unique_storage_); } @@ -234,6 +238,8 @@ void PathExtendLauncher::DebugOutputPaths(const PathContainer &paths, const std: } void FilterInterstandBulges(PathContainer &paths) { + TIME_TRACE_SCOPE("FilterInterstandBulges"); + DEBUG ("Try 
to delete paths with interstand bulges"); for (auto iter = paths.begin(); iter != paths.end(); ++iter) { if (EndsWithInterstrandBulge(iter.get())) @@ -249,11 +255,13 @@ void FilterInterstandBulges(PathContainer &paths) { void PathExtendLauncher::RemoveOverlapsAndArtifacts(PathContainer &paths, GraphCoverageMap &cover_map, const PathExtendResolver &resolver) const { + TIME_TRACE_SCOPE("PathExtend::RemoveOverlapsAndArtifacts"); + INFO("Finalizing paths"); INFO("Deduplicating paths"); Deduplicate(graph_, paths, cover_map, params_.min_edge_len, - params_.max_path_diff); + params_.max_path_diff); INFO("Paths deduplicated"); @@ -277,6 +285,8 @@ void PathExtendLauncher::RemoveOverlapsAndArtifacts(PathContainer &paths, void PathExtendLauncher::CleanPaths(PathContainer &paths, const pe_config::ParamSetT::PathFiltrationT &path_filtration) const { + TIME_TRACE_SCOPE("PathExtend::CleanPaths"); + if (path_filtration.enabled) { paths.FilterPaths(LengthPathCondition(GetLengthCutoff(path_filtration.min_length, path_filtration.rel_cutoff))); paths.FilterPaths(func::And(CoveragePathCondition(graph_, path_filtration.min_coverage), @@ -301,6 +311,8 @@ size_t PathExtendLauncher::GetLengthCutoff(size_t abs_cutoff, double rel_cutoff) } void PathExtendLauncher::TraverseLoops(PathContainer &paths, GraphCoverageMap &cover_map) const { + TIME_TRACE_SCOPE("PathExtend::TraverseLoops"); + INFO("Traversing tandem repeats"); LoopTraverser @@ -341,6 +353,8 @@ Extenders PathExtendLauncher::ConstructMPExtenders(const ExtendersGenerator &gen } void PathExtendLauncher::FillPathContainer(size_t lib_index, size_t size_threshold) { + TIME_TRACE_SCOPE("PathExtend::FillPathContainer"); + INFO("filling path container"); if (dataset_info_.reads[lib_index].type() == io::LibraryType::TrustedContigs) { auto& trusted_paths = gp_.get_mutable()[lib_index]; @@ -368,6 +382,8 @@ void PathExtendLauncher::FillPathContainer(size_t lib_index, size_t size_thresho void PathExtendLauncher::FillLongReadsCoverageMaps() { + TIME_TRACE_SCOPE("PathExtend::FillLongReadsCoverageMaps"); + DEBUG("long reads start ") for (size_t lib_index = 0; lib_index < dataset_info_.reads.lib_count(); lib_index++) { DEBUG("lib_index" << lib_index); @@ -379,7 +395,9 @@ void PathExtendLauncher::FillLongReadsCoverageMaps() { } } -void PathExtendLauncher::FillPBUniqueEdgeStorages() { +void PathExtendLauncher::FillPBUniqueEdgeStorages() { + TIME_TRACE_SCOPE("PathExtend::FillPBUniqueEdgeStorages"); + //FIXME magic constants //FIXME need to change for correct usage of prelimnary contigs in loops ScaffoldingUniqueEdgeAnalyzer unique_edge_analyzer_pb(gp_, 500, 0.5); @@ -417,6 +435,8 @@ Extenders PathExtendLauncher::ConstructPBExtenders(const ExtendersGenerator &gen Extenders PathExtendLauncher::ConstructExtenders(const GraphCoverageMap &cover_map, UsedUniqueStorage &used_unique_storage) { + TIME_TRACE_SCOPE("PathExtend::ConstructExtenders"); + INFO("Creating main extenders, unique edge length = " << unique_data_.min_unique_length_); if (!config::PipelineHelper::IsPlasmidPipeline(params_.mode) && (support_.SingleReadsMapped() || support_.HasLongReads())) FillLongReadsCoverageMaps(); @@ -452,6 +472,8 @@ Extenders PathExtendLauncher::ConstructExtenders(const GraphCoverageMap &cover_m void PathExtendLauncher::PolishPaths(const PathContainer &paths, PathContainer &result, const GraphCoverageMap& /* cover_map */) const { + TIME_TRACE_SCOPE("PathExtend::PolishPaths"); + //Fixes distances for paths gaps and tries to fill them in INFO("Closing gaps in paths"); @@ -487,6 +509,8 @@ void 
PathExtendLauncher::PolishPaths(const PathContainer &paths, PathContainer & } void PathExtendLauncher::FilterPaths(PathContainer &contig_paths) { + TIME_TRACE_SCOPE("PathExtend::FilterPaths"); + auto default_filtration = params_.pset.path_filtration.end(); for (auto it = params_.pset.path_filtration.begin(); it != params_.pset.path_filtration.end(); ++it) { if (!it->second.enabled) @@ -514,6 +538,8 @@ void PathExtendLauncher::FilterPaths(PathContainer &contig_paths) { } void PathExtendLauncher::AddFLPaths(PathContainer &paths) const { + TIME_TRACE_SCOPE("PathExtend::AddFLPaths"); + bool fl_paths_added = false; const auto &single_long_reads = gp_.get>(); for (size_t lib_index = 0; lib_index < dataset_info_.reads.lib_count(); lib_index++) { @@ -579,6 +605,7 @@ void PathExtendLauncher::SelectStrandSpecificPaths(PathContainer &paths) const { if (!params_.ss.ss_enabled) return; + TIME_TRACE_SCOPE("PathExtend::SelectStrandSpecificPaths"); INFO("Paths will be printed according to strand-specific coverage"); size_t lib_index = 0; while (lib_index < dataset_info_.reads.lib_count() && !dataset_info_.reads[lib_index].is_graph_constructable()) { @@ -600,6 +627,8 @@ void MakeConjugateEdgePairsDump(ConjugateDeBruijnGraph const & graph) { } void PathExtendLauncher::Launch() { + TIME_TRACE_SCOPE("PathExtend"); + INFO("ExSPAnder repeat resolving tool started"); create_directory(params_.output_dir); create_directory(params_.etc_dir); @@ -607,12 +636,14 @@ void PathExtendLauncher::Launch() { CheckCoverageUniformity(); if (!config::PipelineHelper::IsPlasmidPipeline(params_.mode) && support_.NeedsUniqueEdgeStorage()) { + TIME_TRACE_SCOPE("FillUniqueEdges"); //Fill the storage to enable unique edge check EstimateUniqueEdgesParams(); FillUniqueEdgeStorage(); } - MakeAndOutputScaffoldGraph(); + if (params_.pset.scaffold_graph_params.construct) + MakeAndOutputScaffoldGraph(); PathContainer fl_paths; AddFLPaths(fl_paths); @@ -646,7 +677,7 @@ void PathExtendLauncher::Launch() { //TODO does path polishing correctly work with coverage map PolishPaths(paths, contig_paths, cover_map); } - + GraphCoverageMap polished_map(graph_, contig_paths, true); DebugOutputPaths(contig_paths, "polished_paths"); TraverseLoops(contig_paths, polished_map); From d4d8a8de3bc1daa5f752e595195fd94e45aab37f Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Tue, 11 Jan 2022 19:03:06 +0300 Subject: [PATCH 087/102] A bit more verbosity --- src/common/modules/path_extend/loop_traverser.cpp | 2 ++ src/common/modules/path_extend/overlap_remover.cpp | 2 ++ src/common/modules/path_extend/overlap_remover.hpp | 3 +++ 3 files changed, 7 insertions(+) diff --git a/src/common/modules/path_extend/loop_traverser.cpp b/src/common/modules/path_extend/loop_traverser.cpp index f1626fe3e1..640fe622c8 100644 --- a/src/common/modules/path_extend/loop_traverser.cpp +++ b/src/common/modules/path_extend/loop_traverser.cpp @@ -93,6 +93,8 @@ bool LoopTraverser::IsEndInsideComponent(const BidirectionalPath &path, EdgeId c } bool LoopTraverser::TraverseLoop(EdgeId start, EdgeId end, const std::set &component_set) { + TIME_TRACE_SCOPE("LoopTraverser::TraverseLoop"); + DEBUG("start " << g_.int_id(start) << " end " << g_.int_id(end)); BidirectionalPathSet start_cover_paths = cov_map_.GetCoveringPaths(start); BidirectionalPathSet end_cover_paths = cov_map_.GetCoveringPaths(end); diff --git a/src/common/modules/path_extend/overlap_remover.cpp b/src/common/modules/path_extend/overlap_remover.cpp index 9b21cad843..c6799b38b1 100644 --- 
a/src/common/modules/path_extend/overlap_remover.cpp +++ b/src/common/modules/path_extend/overlap_remover.cpp @@ -267,6 +267,8 @@ void PathSplitter::SplitPath(BidirectionalPath * const p, const std::set } void PathSplitter::Split() { + TIME_TRACE_SCOPE("PathSplitter::Split"); + std::vector> tmp_paths; for (const auto &entry : paths_) tmp_paths.emplace_back(entry.first.get(), entry.second.get()); diff --git a/src/common/modules/path_extend/overlap_remover.hpp b/src/common/modules/path_extend/overlap_remover.hpp index 655651bd41..8221d89d7b 100644 --- a/src/common/modules/path_extend/overlap_remover.hpp +++ b/src/common/modules/path_extend/overlap_remover.hpp @@ -10,6 +10,7 @@ #include "assembly_graph/core/graph.hpp" #include "assembly_graph/paths/bidirectional_path.hpp" #include "sequence/range.hpp" +#include "utils/perf/timetracer.hpp" namespace path_extend { @@ -113,6 +114,8 @@ class OverlapRemover { //Note that during start/end removal all repeat instance have to be cut void MarkOverlaps(bool end_start_only, bool retain_one_copy) { + TIME_TRACE_SCOPE("OverlapRemover::MarkOverlaps"); + VERIFY(!end_start_only || !retain_one_copy); INFO("Marking start/end overlaps"); InnerMarkOverlaps(/*end/start overlaps only*/ true, /*retain one copy*/ false); From 70d957c12b99d75ea946031e8ea5773832f9a3bf Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Tue, 11 Jan 2022 19:21:37 +0300 Subject: [PATCH 088/102] More events + some cleanups --- .../bidirectional_path_io/io_support.cpp | 2 + .../paired_info/distance_estimation_utils.cpp | 14 +++- .../paired_info/distance_estimation_utils.hpp | 76 ++++++++++--------- src/common/paired_info/pair_info_improver.hpp | 2 + .../paired_info/distance_estimation_utils.cpp | 4 + .../paired_info/distance_estimation_utils.hpp | 8 +- src/projects/spades/contig_output_stage.cpp | 28 +++++-- 7 files changed, 85 insertions(+), 49 deletions(-) diff --git a/src/common/assembly_graph/paths/bidirectional_path_io/io_support.cpp b/src/common/assembly_graph/paths/bidirectional_path_io/io_support.cpp index 99bbd7c635..4aa9dd9f6f 100644 --- a/src/common/assembly_graph/paths/bidirectional_path_io/io_support.cpp +++ b/src/common/assembly_graph/paths/bidirectional_path_io/io_support.cpp @@ -161,6 +161,8 @@ void path_extend::ScaffoldBreaker::SplitPath(const BidirectionalPath &path, Path } void path_extend::ScaffoldBreaker::Break(const PathContainer &paths, PathContainer &result) const { + TIME_TRACE_SCOPE("ScaffoldBreaker::Break"); + for (auto it = paths.begin(); it != paths.end(); ++it) { SplitPath(it.get(), result); } diff --git a/src/common/paired_info/distance_estimation_utils.cpp b/src/common/paired_info/distance_estimation_utils.cpp index 7517d15963..db3bcb1240 100644 --- a/src/common/paired_info/distance_estimation_utils.cpp +++ b/src/common/paired_info/distance_estimation_utils.cpp @@ -20,6 +20,8 @@ namespace distance_estimation { void EstimateWithEstimator(PairedInfoIndexT &clustered_index, const AbstractDistanceEstimator &estimator, AbstractPairInfoChecker &checker) { + TIME_TRACE_SCOPE("EstimateWithEstimator"); + DEBUG("Estimating distances"); estimator.Estimate(clustered_index, omp_get_max_threads()); @@ -106,7 +108,7 @@ namespace distance_estimation { PairInfoWeightChecker checker(graph, 0.); DEBUG("Weight Filter Done"); - auto estimator = distance_estimator_fabric.getDistanceEstimator(graph, paired_index, dist_finder, + auto estimator = distance_estimator_fabric.GetDistanceEstimator(graph, paired_index, dist_finder, [&](int i) { return wrapper.CountWeight(i); }, @@ 
-136,7 +138,7 @@ namespace distance_estimation { INFO("Weight Filter Done"); - auto estimator = distance_estimator_fabric.getDistanceEstimator(graph, paired_index, dist_finder, + auto estimator = distance_estimator_fabric.GetDistanceEstimator(graph, paired_index, dist_finder, linkage_distance, max_distance); EstimateWithEstimator(clustered_index, *estimator, checker); @@ -146,8 +148,8 @@ namespace distance_estimation { INFO("The refining of clustered pair information has been finished "); // if so, it resolves such conflicts. INFO("Improving paired information"); - PairInfoImprover(graph, clustered_index, lib, max_repeat_length).ImprovePairedInfo( - omp_get_max_threads()); + PairInfoImprover(graph, clustered_index, lib, max_repeat_length) + .ImprovePairedInfo(omp_get_max_threads()); } void EstimateScaffoldingDistances(PairedInfoIndexT &scaffolding_index, @@ -156,6 +158,8 @@ namespace distance_estimation { const UnclusteredPairedInfoIndexT &paired_index, const debruijn_graph::config::smoothing_distance_estimator &ade, const debruijn_graph::config::distance_estimator &de_config) { + TIME_TRACE_SCOPE("EstimateScaffoldingDistances"); + EstimateScaffoldingDistancesInner(scaffolding_index, graph, lib, paired_index, ade, de_config, ScaffoldDistanceEstimatorFabric()); } @@ -166,6 +170,8 @@ namespace distance_estimation { const UnclusteredPairedInfoIndexT &paired_index, size_t max_repeat_length, const debruijn_graph::config::distance_estimator &de_config) { + TIME_TRACE_SCOPE("EstimatePairedDistances"); + EstimatePairedDistancesInner(clustered_index, graph, lib, paired_index, max_repeat_length, de_config, DistanceEstimatorFabric()); } diff --git a/src/common/paired_info/distance_estimation_utils.hpp b/src/common/paired_info/distance_estimation_utils.hpp index a3224d2f1a..32fd82d2cf 100644 --- a/src/common/paired_info/distance_estimation_utils.hpp +++ b/src/common/paired_info/distance_estimation_utils.hpp @@ -25,20 +25,22 @@ namespace distance_estimation { class AbstractDistanceEstimatorFabric { public: - virtual std::unique_ptr getDistanceEstimator(const debruijn_graph::Graph &graph, - const distance_estimation::UnclusteredPairedInfoIndexT &index, - const omnigraph::de::GraphDistanceFinder &distance_finder, - size_t linkage_distance, - size_t max_distance) const = 0; + virtual std::unique_ptr + GetDistanceEstimator(const debruijn_graph::Graph &graph, + const distance_estimation::UnclusteredPairedInfoIndexT &index, + const omnigraph::de::GraphDistanceFinder &distance_finder, + size_t linkage_distance, + size_t max_distance) const = 0; }; class DistanceEstimatorFabric : public AbstractDistanceEstimatorFabric { public: - std::unique_ptr getDistanceEstimator(const debruijn_graph::Graph &graph, - const distance_estimation::UnclusteredPairedInfoIndexT &index, - const omnigraph::de::GraphDistanceFinder &distance_finder, - size_t linkage_distance, - size_t max_distance) const override { + std::unique_ptr + GetDistanceEstimator(const debruijn_graph::Graph &graph, + const distance_estimation::UnclusteredPairedInfoIndexT &index, + const omnigraph::de::GraphDistanceFinder &distance_finder, + size_t linkage_distance, + size_t max_distance) const override { return std::make_unique(graph, index, distance_finder, linkage_distance, max_distance); } @@ -46,30 +48,32 @@ namespace distance_estimation { class AbstractScaffoldDistanceEstimatorFabric { public: - virtual std::unique_ptr getDistanceEstimator(const debruijn_graph::Graph &graph, - const distance_estimation::UnclusteredPairedInfoIndexT &histogram, - const 
omnigraph::de::GraphDistanceFinder &dist_finder, - std::function weight_f, - size_t linkage_distance, size_t max_distance, size_t threshold, - double range_coeff, double delta_coeff, - size_t cutoff, - size_t min_peak_points, - double percentage, - double derivative_threshold) const = 0; + virtual std::unique_ptr + GetDistanceEstimator(const debruijn_graph::Graph &graph, + const distance_estimation::UnclusteredPairedInfoIndexT &histogram, + const omnigraph::de::GraphDistanceFinder &dist_finder, + std::function weight_f, + size_t linkage_distance, size_t max_distance, size_t threshold, + double range_coeff, double delta_coeff, + size_t cutoff, + size_t min_peak_points, + double percentage, + double derivative_threshold) const = 0; }; class ScaffoldDistanceEstimatorFabric : public AbstractScaffoldDistanceEstimatorFabric { public: - std::unique_ptr getDistanceEstimator(const debruijn_graph::Graph &graph, - const distance_estimation::UnclusteredPairedInfoIndexT &histogram, - const omnigraph::de::GraphDistanceFinder &dist_finder, - std::function weight_f, - size_t linkage_distance, size_t max_distance, size_t threshold, - double range_coeff, double delta_coeff, - size_t cutoff, - size_t min_peak_points, - double percentage, - double derivative_threshold) const override { + std::unique_ptr + GetDistanceEstimator(const debruijn_graph::Graph &graph, + const distance_estimation::UnclusteredPairedInfoIndexT &histogram, + const omnigraph::de::GraphDistanceFinder &dist_finder, + std::function weight_f, + size_t linkage_distance, size_t max_distance, size_t threshold, + double range_coeff, double delta_coeff, + size_t cutoff, + size_t min_peak_points, + double percentage, + double derivative_threshold) const override { return std::unique_ptr( new omnigraph::de::SmoothingDistanceEstimator(graph, histogram, dist_finder, weight_f, linkage_distance, max_distance, threshold, @@ -106,12 +110,12 @@ namespace distance_estimation { DistanceEstimatorFabric()); void EstimateScaffoldingDistances(PairedInfoIndexT &scaffolding_index, - const debruijn_graph::Graph &graph, - const io::SequencingLibrary &lib, - const UnclusteredPairedInfoIndexT &paired_index, - const debruijn_graph::config::smoothing_distance_estimator &ade, - const debruijn_graph::config::distance_estimator &de_config = - debruijn_graph::config::distance_estimator()); + const debruijn_graph::Graph &graph, + const io::SequencingLibrary &lib, + const UnclusteredPairedInfoIndexT &paired_index, + const debruijn_graph::config::smoothing_distance_estimator &ade, + const debruijn_graph::config::distance_estimator &de_config = + debruijn_graph::config::distance_estimator()); void EstimatePairedDistances(PairedInfoIndexT &clustered_index, const debruijn_graph::Graph &graph, diff --git a/src/common/paired_info/pair_info_improver.hpp b/src/common/paired_info/pair_info_improver.hpp index 48168ae21a..1756e7daf6 100644 --- a/src/common/paired_info/pair_info_improver.hpp +++ b/src/common/paired_info/pair_info_improver.hpp @@ -51,6 +51,8 @@ class PairInfoImprover { : graph_(g), index_(clustered_index), lib_(lib), max_repeat_length_(max_repeat_length) { } void ImprovePairedInfo(unsigned num_threads = 1) { + TIME_TRACE_SCOPE("PairInfoImprover"); + CorrectPairedInfo(num_threads); CorrectPairedInfo(num_threads); } diff --git a/src/projects/hpcspades/common/paired_info/distance_estimation_utils.cpp b/src/projects/hpcspades/common/paired_info/distance_estimation_utils.cpp index b5adff907d..091211cf65 100644 --- 
a/src/projects/hpcspades/common/paired_info/distance_estimation_utils.cpp +++ b/src/projects/hpcspades/common/paired_info/distance_estimation_utils.cpp @@ -14,6 +14,8 @@ void EstimateScaffoldingDistancesMPI(PairedInfoIndexT &sc const UnclusteredPairedInfoIndexT &paired_index, const debruijn_graph::config::smoothing_distance_estimator &ade, const debruijn_graph::config::distance_estimator &de_config) { + TIME_TRACE_SCOPE("EstimateScaffoldingDistancesMPI"); + EstimateScaffoldingDistancesInner(scaffolding_index, graph, lib, paired_index, ade, de_config, MPIScaffoldDistanceEstimatorFabric()); } @@ -24,6 +26,8 @@ void EstimatePairedDistancesMPI(PairedInfoIndexT &cluster const UnclusteredPairedInfoIndexT &paired_index, size_t max_repeat_length, const debruijn_graph::config::distance_estimator &de_config) { + TIME_TRACE_SCOPE("EstimatePairedDistancesMPI"); + EstimatePairedDistancesInner(clustered_index, graph, lib, paired_index, max_repeat_length, de_config, MPIDistanceEstimatorFabric()); } diff --git a/src/projects/hpcspades/common/paired_info/distance_estimation_utils.hpp b/src/projects/hpcspades/common/paired_info/distance_estimation_utils.hpp index 65d75b7940..61c909c076 100644 --- a/src/projects/hpcspades/common/paired_info/distance_estimation_utils.hpp +++ b/src/projects/hpcspades/common/paired_info/distance_estimation_utils.hpp @@ -14,8 +14,8 @@ using omnigraph::de::DistanceEstimator; using omnigraph::de::DistanceEstimatorMPI; class MPIDistanceEstimatorFabric : public AbstractDistanceEstimatorFabric { - public: - std::unique_ptr getDistanceEstimator(const debruijn_graph::Graph &graph, +public: + std::unique_ptr GetDistanceEstimator(const debruijn_graph::Graph &graph, const distance_estimation::UnclusteredPairedInfoIndexT &index, const omnigraph::de::GraphDistanceFinder &distance_finder, size_t linkage_distance, @@ -31,8 +31,8 @@ class MPIDistanceEstimatorFabric : public AbstractDistanceEstimatorFabric { }; class MPIScaffoldDistanceEstimatorFabric : public AbstractScaffoldDistanceEstimatorFabric { - public: - std::unique_ptr getDistanceEstimator(const debruijn_graph::Graph &graph, +public: + std::unique_ptr GetDistanceEstimator(const debruijn_graph::Graph &graph, const distance_estimation::UnclusteredPairedInfoIndexT &histogram, const omnigraph::de::GraphDistanceFinder &dist_finder, std::function weight_f, diff --git a/src/projects/spades/contig_output_stage.cpp b/src/projects/spades/contig_output_stage.cpp index 970acf6f7c..9c4ccea613 100644 --- a/src/projects/spades/contig_output_stage.cpp +++ b/src/projects/spades/contig_output_stage.cpp @@ -59,7 +59,7 @@ path_extend::PathContainer GetCircularScaffolds(const path_extend::PathContainer res.Create(entry.first); } - + INFO("Got " << res.size() << " circular scaffolds"); return res; } @@ -73,9 +73,9 @@ path_extend::PathContainer GetTipScaffolds(const path_extend::PathContainer &sc_ !forbidden_vertices.count(path.g().EdgeStart(path.Front())) || !forbidden_vertices.count(path.g().EdgeEnd(path.Back()))) continue; - + res.Create(entry.first); - + } INFO("Got " << res.size() << " linear scaffolds"); @@ -129,12 +129,14 @@ void ContigOutput::run(graph_pack::GraphPack &gp, const char*) { const auto &graph = gp.get(); if (outputs_.count(Kind::BinaryContigs)) { + TIME_TRACE_SCOPE("ContigOutput::BinaryContigs"); std::filesystem::path contigs_output_dir = output_dir / outputs_[Kind::BinaryContigs]; create_directory(contigs_output_dir); io::ReadConverter::ConvertEdgeSequencesToBinary(graph, contigs_output_dir, cfg::get().max_threads); } if 
(outputs_.count(Kind::EdgeSequences)) { + TIME_TRACE_SCOPE("ContigOutput::EdgeSequences"); OutputEdgeSequences(graph, output_dir / outputs_[Kind::EdgeSequences]); } @@ -143,6 +145,8 @@ void ContigOutput::run(graph_pack::GraphPack &gp, const char*) { const auto &components = gp.get(); if (outputs_.count(Kind::GFAGraph)) { + TIME_TRACE_SCOPE("ContigOutput::GFAGraph"); + io::EdgeNamingF naming_f = config::PipelineHelper::IsPlasmidPipeline(cfg::get().mode) && components.IsFilled()? PlasmidNamingF(io::IdNamingF(), components) : @@ -158,6 +162,8 @@ void ContigOutput::run(graph_pack::GraphPack &gp, const char*) { std::optional fastg_writer; if (outputs_.count(Kind::FASTGGraph)) { + TIME_TRACE_SCOPE("ContigOutput::FASTGGraph"); + io::EdgeNamingF naming_f = config::PipelineHelper::IsPlasmidPipeline(cfg::get().mode) && components.IsFilled()? PlasmidNamingF(io::BasicNamingF(), components) : @@ -176,6 +182,8 @@ void ContigOutput::run(graph_pack::GraphPack &gp, const char*) { contig_paths.size(); if (output_contig_paths) { + TIME_TRACE_SCOPE("ContigOutput::ContigPaths"); + ContigWriter writer(graph, MakeContigNameGenerator(cfg::get().mode, gp)); bool output_broken_scaffolds = cfg::get().pe_params.param_set.scaffolder_options.enabled && @@ -184,6 +192,8 @@ void ContigOutput::run(graph_pack::GraphPack &gp, const char*) { (outputs_.count(Kind::FinalContigs) || outputs_.count(Kind::PlasmidContigs)); if (output_broken_scaffolds) { + TIME_TRACE_SCOPE("ContigOutput::BrokenScaffolds"); + int min_overlap = int(gp.k()); switch (cfg::get().co.obs_mode) { default: @@ -209,12 +219,16 @@ void ContigOutput::run(graph_pack::GraphPack &gp, const char*) { broken_scaffolds.FilterEmptyPaths(); broken_scaffolds.SortByLength(); - if (outputs_.count(Kind::FinalContigs)) + if (outputs_.count(Kind::FinalContigs)) { + TIME_TRACE_SCOPE("ContigOutput::FinalContigs"); + writer.OutputPaths(broken_scaffolds, CreatePathsWriters(output_dir / (outputs_[Kind::FinalContigs]), fastg_writer)); + } if (outputs_.count(Kind::PlasmidContigs)) { + TIME_TRACE_SCOPE("ContigOutput::PlasmidContigs"); if (!gp.count("used_edges")) gp.add("used_edges", UsedEdges(graph)); PathContainer circulars = GetCircularScaffolds(broken_scaffolds, gp.get_mutable("used_edges"), cfg::get().pd->min_circular_length); @@ -243,11 +257,15 @@ void ContigOutput::run(graph_pack::GraphPack &gp, const char*) { } } - if (outputs_.count(Kind::Scaffolds)) + if (outputs_.count(Kind::Scaffolds)) { + TIME_TRACE_SCOPE("ContigOutput::Scaffolds"); + writer.OutputPaths(contig_paths, CreatePathsWriters(output_dir / outputs_[Kind::Scaffolds], fastg_writer, gfa_writer)); + } } else if (outputs_.count(Kind::FinalContigs)) { + TIME_TRACE_SCOPE("ContigOutput::FinalContigs"); OutputEdgeSequences(graph, output_dir / outputs_[Kind::FinalContigs]); } } From 7a01e5873e7fda67babd2dfbeb6a7438604cb89e Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Thu, 13 Jan 2022 15:03:09 +0300 Subject: [PATCH 089/102] call process lib func in Mismatch Corrector --- src/projects/spades/mismatch_correction.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/projects/spades/mismatch_correction.cpp b/src/projects/spades/mismatch_correction.cpp index a1a4832820..27e0e0fde8 100644 --- a/src/projects/spades/mismatch_correction.cpp +++ b/src/projects/spades/mismatch_correction.cpp @@ -429,11 +429,10 @@ namespace mismatches { if (!dataset.reads[i].is_mismatch_correctable()) continue; - SequenceMapperNotifier notifier; - notifier.Subscribe(&statistics); auto &reads = dataset.reads[i]; auto 
single_streams = single_binary_readers(reads, /*followed by rc */true, /*binary*/true); - notifier.ProcessLibrary(single_streams, *mapper); + + proccess_lib_func_(&statistics, *mapper, single_streams); } return CorrectAllEdges(statistics); From ea29617bd01990da9f53e902c818de8a7a36eaa4 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Fri, 1 Apr 2022 19:21:21 +0300 Subject: [PATCH 090/102] fix: allreduce only in sync --- src/common/kmer_index/kmer_mph/kmer_index_builder.hpp | 4 ++-- src/common/kmer_index/kmer_mph/kmer_splitter.hpp | 4 +++- .../construction/debruijn_graph_constructor_mpi.hpp | 10 +++++----- .../hpcspades/common/stages/construction_mpi.cpp | 5 ++++- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/common/kmer_index/kmer_mph/kmer_index_builder.hpp b/src/common/kmer_index/kmer_mph/kmer_index_builder.hpp index 41523d9648..59ec907607 100644 --- a/src/common/kmer_index/kmer_mph/kmer_index_builder.hpp +++ b/src/common/kmer_index/kmer_mph/kmer_index_builder.hpp @@ -323,10 +323,10 @@ class KMerDiskCounter : public KMerCounter { } } INFO("K-mer counting done. There are " << kmers << " kmers in total. "); - if (!kmers) { + /*if (!kmers) { FATAL_ERROR("No kmers were extracted from reads. Check the read lengths and k-mer length settings"); exit(-1); - } + }*/ return res; } diff --git a/src/common/kmer_index/kmer_mph/kmer_splitter.hpp b/src/common/kmer_index/kmer_mph/kmer_splitter.hpp index 747110a4f1..f1db95b3a3 100644 --- a/src/common/kmer_index/kmer_mph/kmer_splitter.hpp +++ b/src/common/kmer_index/kmer_mph/kmer_splitter.hpp @@ -78,8 +78,10 @@ class KMerSortingSplitter : public KMerSplitter { // Determine the set of output files RawKMers out; auto tmp_prefix = this->work_dir_->tmp_file("kmers_raw"); - for (unsigned i = 0; i < num_files_; ++i) + for (unsigned i = 0; i < num_files_; ++i) { out.emplace_back(tmp_prefix->CreateDep(std::to_string(i))); + fclose(fopen(out.back()->file().c_str(), "ab")); + } size_t file_limit = num_files_ + 2*nthreads; size_t res = utils::limit_file(file_limit); diff --git a/src/projects/hpcspades/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp b/src/projects/hpcspades/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp index 67cb661e3f..f827bb33a7 100644 --- a/src/projects/hpcspades/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp +++ b/src/projects/hpcspades/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp @@ -55,11 +55,6 @@ class DeBruijnGraphExtentionConstructorTask { UnbranchingPathExtractor extractor(index, g.k()); auto seqs = extractor.ExtractUnbranchingPaths(local_iters); - { - TIME_TRACE_SCOPE("RemoveSequences"); - index.RemoveSequences(seqs); - } - partask::allreduce(index.raw_data(), index.raw_size(), MPI_BAND); io::binary::BinWrite(os, partask::fast_local_transfer(seqs)); } @@ -75,6 +70,11 @@ class DeBruijnGraphExtentionConstructorTask { } } + { + TIME_TRACE_SCOPE("RemoveSequences"); + index.RemoveSequences(seqs); + } + if (collect_loops_) { UnbranchingPathExtractor extractor(index, g.k()); std::vector loops = extractor.CollectLoops(omp_get_max_threads()); diff --git a/src/projects/hpcspades/common/stages/construction_mpi.cpp b/src/projects/hpcspades/common/stages/construction_mpi.cpp index 94d2ed907e..b91462a832 100644 --- a/src/projects/hpcspades/common/stages/construction_mpi.cpp +++ b/src/projects/hpcspades/common/stages/construction_mpi.cpp @@ -441,7 +441,6 @@ class TipClippingTaskBase { size_t kpo_mers_removed = process_iner(index, 
local_iters); INFO("K+1-mers removed: " << kpo_mers_removed); - partask::allreduce(index.raw_data(), index.raw_size(), MPI_BAND); io::binary::BinWrite(os, kpo_mers_removed); } @@ -453,6 +452,10 @@ class TipClippingTaskBase { return kpo_mers_removed; } + void sync(Index &index) { + partask::allreduce(index.raw_data(), index.raw_size(), MPI_BAND); + } + private: virtual size_t process_iner(Index &, std::vector& /*local_iters*/) = 0; }; From dbe03b412833bd2efe0e6932cd6f5ebb49c7776b Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Mon, 4 Apr 2022 18:00:33 +0300 Subject: [PATCH 091/102] fix: block size/NNodes --- .../hpcspades/common/pipeline/partask_mpi.hpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/projects/hpcspades/common/pipeline/partask_mpi.hpp b/src/projects/hpcspades/common/pipeline/partask_mpi.hpp index 283a52b203..492e679c4e 100644 --- a/src/projects/hpcspades/common/pipeline/partask_mpi.hpp +++ b/src/projects/hpcspades/common/pipeline/partask_mpi.hpp @@ -119,6 +119,13 @@ inline void barrier() { const size_t MPI_MAX_COUNT = 1 << 30; // Should be <= MAX_INT +inline size_t get_block_size(size_t count) { + //BLOCK_SIZE * NNODES should be <= MAX_INT + //bug fix in openMPI: https://github.com/open-mpi/ompi/commit/fe07940cfd5507871ce2a747a6c88149cc8096af + size_t nodes = world_size(); + return std::min(count, MPI_MAX_COUNT/nodes); +} + inline void membroadcast(void *p, size_t count, int root = 0) { ASSERT_MAIN_THREAD; TIME_TRACE_SCOPE("partask::membroadcast"); @@ -128,7 +135,7 @@ inline void membroadcast(void *p, size_t count, int root = 0) { char *cp = reinterpret_cast(p); while (count) { - size_t block_size = std::min(count, MPI_MAX_COUNT); + size_t block_size = get_block_size(count); int ret = MPI_Bcast(cp, static_cast(block_size), MPI_BYTE, root, MPI_COMM_WORLD); VERIFY(ret == MPI_SUCCESS); cp += block_size; @@ -141,7 +148,7 @@ inline void memsend(const void *p, size_t count, int rank, int tag = 0) { TIME_TRACE_SCOPE("partask::memsend"); char *cp = reinterpret_cast(const_cast(p)); while (count) { - size_t block_size = std::min(count, MPI_MAX_COUNT); + size_t block_size = get_block_size(count); int ret = MPI_Send(cp, static_cast(block_size), MPI_BYTE, rank, tag, MPI_COMM_WORLD); VERIFY(ret == MPI_SUCCESS); cp += block_size; @@ -154,7 +161,7 @@ inline void memrecv(void *p, size_t count, int rank, int tag = MPI_ANY_TAG) { TIME_TRACE_SCOPE("partask::memrecv"); char *cp = reinterpret_cast(p); while (count) { - size_t block_size = std::min(count, MPI_MAX_COUNT); + size_t block_size = get_block_size(count); int ret = MPI_Recv(cp, static_cast(block_size), MPI_BYTE, rank, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE); VERIFY(ret == MPI_SUCCESS); cp += block_size; @@ -282,7 +289,7 @@ void allreduce(T *recvbuf, size_t count, MPI_Op op) { using NoneVoidT = std::conditional_t::value, char, T>; NoneVoidT *crecvbuf = reinterpret_cast(recvbuf); while (count) { - size_t block_size = std::min(count, MPI_MAX_COUNT); + size_t block_size = get_block_size(count); int ret = MPI_Allreduce(MPI_IN_PLACE, crecvbuf, static_cast(block_size), mpi_datatype(), op, MPI_COMM_WORLD); VERIFY(ret == MPI_SUCCESS); crecvbuf += block_size; From ae78af75362161a32f9d07e39cbccfbdafaf7f2a Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Wed, 6 Apr 2022 12:48:08 +0300 Subject: [PATCH 092/102] sequence mapper notifier MPI in paired info counter --- src/common/paired_info/paired_info_utils.cpp | 24 ++++++++++++++++---- src/common/paired_info/paired_info_utils.hpp | 2 +-
src/projects/hpcspades/pipeline.cpp | 6 ++--- src/projects/spades/pair_info_count.cpp | 2 +- 4 files changed, 24 insertions(+), 10 deletions(-) diff --git a/src/common/paired_info/paired_info_utils.cpp b/src/common/paired_info/paired_info_utils.cpp index b09af6388c..8dcbd03941 100644 --- a/src/common/paired_info/paired_info_utils.cpp +++ b/src/common/paired_info/paired_info_utils.cpp @@ -76,11 +76,25 @@ class EdgePairCounterFiller : public SequenceMapperListener { } } + void Serialize(std::ostream &os) const override { + io::binary::BinWrite(os, counter_); + } + + void Deserialize(std::istream &is) override { + io::binary::BinRead(is, counter_); + } + + void MergeFromStream(std::istream &is) override { + EdgePairCounterFiller remote(*this); + remote.Deserialize(is); + counter_.merge(remote.counter_); + } + std::vector buf_; EdgePairCounter counter_; }; -bool CollectLibInformation(const Graph &graph, +bool CollectLibInformation(const Graph &graph, const MapLibBase &process_libs, const SequenceMapperNotifier::SequenceMapperT &mapper, size_t &edgepairs, SequencingLib &reads, size_t edge_length_threshold) { @@ -88,15 +102,15 @@ bool CollectLibInformation(const Graph &graph, InsertSizeCounter hist_counter(graph, edge_length_threshold); EdgePairCounterFiller pcounter(omp_get_max_threads()); - SequenceMapperNotifier notifier; - notifier.Subscribe(&hist_counter); - notifier.Subscribe(&pcounter); + std::vector listeners; + listeners.push_back(&hist_counter); + listeners.push_back(&pcounter); auto &data = reads.data(); auto paired_streams = paired_binary_readers(reads, /*followed by rc*/false, /*insert_size*/0, /*include_merged*/true); + process_libs(listeners, mapper, paired_streams); - notifier.ProcessLibrary(paired_streams, mapper); //Check read length after lib processing since mate pairs a not used until this step VERIFY(reads.data().unmerged_read_length != 0); diff --git a/src/common/paired_info/paired_info_utils.hpp b/src/common/paired_info/paired_info_utils.hpp index ab0bb44ed5..022fac5a85 100644 --- a/src/common/paired_info/paired_info_utils.hpp +++ b/src/common/paired_info/paired_info_utils.hpp @@ -28,7 +28,7 @@ typedef std::function &, io::ReadStreamList &streams)> MapLibFuncT; -bool CollectLibInformation(const debruijn_graph::Graph &gp, +bool CollectLibInformation(const debruijn_graph::Graph &gp, const debruijn_graph::MapLibBase &process_libs, const debruijn_graph::SequenceMapper &mapper, size_t &edgepairs, SequencingLib &reads, size_t edge_length_threshold); diff --git a/src/projects/hpcspades/pipeline.cpp b/src/projects/hpcspades/pipeline.cpp index c9110ad0d6..74b15f831c 100644 --- a/src/projects/hpcspades/pipeline.cpp +++ b/src/projects/hpcspades/pipeline.cpp @@ -139,9 +139,9 @@ static void AddPreliminarySimplificationStages(spades::StageManager &SPAdes) { SPAdes.add("prelim_gapcloser"); if (cfg::get().use_intermediate_contigs) { - SPAdes.add(true); - SPAdes.add(true); - SPAdes.add(true); + SPAdes.add(true) + .add(true) + .add(true); if (cfg::get().hm) SPAdes.add(); diff --git a/src/projects/spades/pair_info_count.cpp b/src/projects/spades/pair_info_count.cpp index 8bc6635e81..68ddc583a4 100644 --- a/src/projects/spades/pair_info_count.cpp +++ b/src/projects/spades/pair_info_count.cpp @@ -179,7 +179,7 @@ void PairInfoCountBase::execute(graph_pack::GraphPack &gp, const char *, size_t k = cfg::get().K; size_t edgepairs = 0; - if (!paired_info::CollectLibInformation(graph, *ChooseProperMapper(gp, lib), + if (!paired_info::CollectLibInformation(graph, map_lib_func, *ChooseProperMapper(gp, 
lib), edgepairs, lib, edge_length_threshold)) { cfg::get_writable().ds.reads[i].data().mean_insert_size = 0.0; WARN("Unable to estimate insert size for paired library #" << i); From 5dbc8fc70694b9c464c9272c55ea3228741fb520 Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Wed, 6 Apr 2022 13:35:29 +0300 Subject: [PATCH 093/102] PairedInfoCounter SeqMapNot MPI --- src/common/paired_info/paired_info_utils.cpp | 10 +++++----- src/common/paired_info/paired_info_utils.hpp | 2 +- src/projects/binspreader/paired_end.cpp | 3 ++- src/projects/spades/pair_info_count.cpp | 2 +- src/projects/spades_tools/gmapper.cpp | 1 + 5 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/common/paired_info/paired_info_utils.cpp b/src/common/paired_info/paired_info_utils.cpp index 8dcbd03941..0729f3524e 100644 --- a/src/common/paired_info/paired_info_utils.cpp +++ b/src/common/paired_info/paired_info_utils.cpp @@ -140,7 +140,7 @@ bool CollectLibInformation(const Graph &graph, const MapLibBase &process_libs, return !data.insert_size_distribution.empty(); } -void FillPairedIndex(const Graph &graph, +void FillPairedIndex(const Graph &graph, const MapLibBase &process_lib, const SequenceMapperNotifier::SequenceMapperT &mapper, SequencingLib &reads, PairedIndex &index, @@ -148,7 +148,6 @@ void FillPairedIndex(const Graph &graph, unsigned round_thr, bool use_binary) { const auto &data = reads.data(); - SequenceMapperNotifier notifier; INFO("Left insert size quantile " << data.insert_size_left_quantile << ", right insert size quantile " << data.insert_size_right_quantile << ", filtering threshold " << filter_threshold << @@ -168,16 +167,17 @@ void FillPairedIndex(const Graph &graph, } LatePairedIndexFiller pif(graph, weight, round_thr, index); - notifier.Subscribe(&pif); + std::vector listeners; + listeners.push_back(&pif); if (use_binary) { auto paired_streams = paired_binary_readers(reads, /*followed by rc*/false, (size_t) data.mean_insert_size, /*include merged*/true); - notifier.ProcessLibrary(paired_streams, mapper); + process_lib(listeners, mapper, paired_streams); } else { auto paired_streams = paired_easy_readers(reads, /*followed by rc*/false, (size_t)data.mean_insert_size, /*use_orientation*/false); - notifier.ProcessLibrary(paired_streams, mapper); + process_lib(listeners, mapper, paired_streams); } } diff --git a/src/common/paired_info/paired_info_utils.hpp b/src/common/paired_info/paired_info_utils.hpp index 022fac5a85..0b659e8ff1 100644 --- a/src/common/paired_info/paired_info_utils.hpp +++ b/src/common/paired_info/paired_info_utils.hpp @@ -33,7 +33,7 @@ bool CollectLibInformation(const debruijn_graph::Graph &gp, const debruijn_graph size_t &edgepairs, SequencingLib &reads, size_t edge_length_threshold); -void FillPairedIndex(const debruijn_graph::Graph &gp, +void FillPairedIndex(const debruijn_graph::Graph &gp, const debruijn_graph::MapLibBase &process_libs, const debruijn_graph::SequenceMapper &mapper, SequencingLib &reads, PairedIndex &index, diff --git a/src/projects/binspreader/paired_end.cpp b/src/projects/binspreader/paired_end.cpp index 90e8730fd1..4c0372c268 100644 --- a/src/projects/binspreader/paired_end.cpp +++ b/src/projects/binspreader/paired_end.cpp @@ -13,6 +13,7 @@ #include "paired_info/paired_info_utils.hpp" #include "alignment/kmer_sequence_mapper.hpp" +#include "alignment/sequence_mapper_notifier.hpp" #include "io/binary/paired_index.hpp" #include "io/dataset_support/read_converter.hpp" @@ -40,7 +41,7 @@ void FillPairedEndLinks(LinkIndex &pe_links, if (!bin_load) { 
alignment::ShortKMerReadMapper mapper(graph, workdir); - paired_info::FillPairedIndex(graph, + paired_info::FillPairedIndex(graph, MapLibFunc(), mapper, lib, index, { }, 0, std::numeric_limits::max()); diff --git a/src/projects/spades/pair_info_count.cpp b/src/projects/spades/pair_info_count.cpp index 68ddc583a4..84df721d66 100644 --- a/src/projects/spades/pair_info_count.cpp +++ b/src/projects/spades/pair_info_count.cpp @@ -224,7 +224,7 @@ void PairInfoCountBase::execute(graph_pack::GraphPack &gp, const char *, round_thr = unsigned(std::min(cfg::get().de.max_distance_coeff * lib.data().insert_size_deviation * cfg::get().de.rounding_coeff, cfg::get().de.rounding_thr)); - paired_info::FillPairedIndex(graph, *ChooseProperMapper(gp, lib), + paired_info::FillPairedIndex(graph, map_lib_func,*ChooseProperMapper(gp, lib), lib, gp.get_mutable()[i], std::move(filter), filter_threshold, round_thr); } diff --git a/src/projects/spades_tools/gmapper.cpp b/src/projects/spades_tools/gmapper.cpp index 202d474a1a..0de962556a 100644 --- a/src/projects/spades_tools/gmapper.cpp +++ b/src/projects/spades_tools/gmapper.cpp @@ -244,6 +244,7 @@ int main(int argc, char* argv[]) { io::ReadConverter::ConvertToBinary(lib, pool.get()); paired_info::FillPairedIndex(graph, + MapLibFunc(), mapper, lib, index, { }, 0, std::numeric_limits::max()); From b51eb6c861e00b2487b1a5d3a458c17124460dac Mon Sep 17 00:00:00 2001 From: Olga Kunyavsksya Date: Mon, 11 Apr 2022 15:39:03 +0300 Subject: [PATCH 094/102] split streams on allthreads cnt --- src/common/paired_info/paired_info_utils.cpp | 9 +++++---- src/common/paired_info/paired_info_utils.hpp | 5 +++-- src/projects/spades/mismatch_correction.cpp | 2 +- src/projects/spades/pair_info_count.cpp | 4 ++-- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/common/paired_info/paired_info_utils.cpp b/src/common/paired_info/paired_info_utils.cpp index 0729f3524e..d5af5e072d 100644 --- a/src/common/paired_info/paired_info_utils.cpp +++ b/src/common/paired_info/paired_info_utils.cpp @@ -97,7 +97,8 @@ class EdgePairCounterFiller : public SequenceMapperListener { bool CollectLibInformation(const Graph &graph, const MapLibBase &process_libs, const SequenceMapperNotifier::SequenceMapperT &mapper, size_t &edgepairs, SequencingLib &reads, - size_t edge_length_threshold) { + size_t edge_length_threshold, + size_t num_readers) { INFO("Estimating insert size (takes a while)"); InsertSizeCounter hist_counter(graph, edge_length_threshold); EdgePairCounterFiller pcounter(omp_get_max_threads()); @@ -108,7 +109,7 @@ bool CollectLibInformation(const Graph &graph, const MapLibBase &process_libs, auto &data = reads.data(); auto paired_streams = paired_binary_readers(reads, /*followed by rc*/false, /*insert_size*/0, - /*include_merged*/true); + /*include_merged*/true, num_readers); process_libs(listeners, mapper, paired_streams); //Check read length after lib processing since mate pairs a not used until this step @@ -145,7 +146,7 @@ void FillPairedIndex(const Graph &graph, const MapLibBase &process_lib, SequencingLib &reads, PairedIndex &index, std::unique_ptr filter, unsigned filter_threshold, - unsigned round_thr, bool use_binary) { + unsigned round_thr, bool use_binary, size_t num_readers) { const auto &data = reads.data(); INFO("Left insert size quantile " << data.insert_size_left_quantile << @@ -172,7 +173,7 @@ void FillPairedIndex(const Graph &graph, const MapLibBase &process_lib, if (use_binary) { auto paired_streams = paired_binary_readers(reads, /*followed by rc*/false, (size_t) 
data.mean_insert_size, - /*include merged*/true); + /*include merged*/true, num_readers); process_lib(listeners, mapper, paired_streams); } else { auto paired_streams = paired_easy_readers(reads, /*followed by rc*/false, diff --git a/src/common/paired_info/paired_info_utils.hpp b/src/common/paired_info/paired_info_utils.hpp index 0b659e8ff1..f0bbe05a76 100644 --- a/src/common/paired_info/paired_info_utils.hpp +++ b/src/common/paired_info/paired_info_utils.hpp @@ -31,14 +31,15 @@ typedef std::function &mapper, size_t &edgepairs, SequencingLib &reads, - size_t edge_length_threshold); + size_t edge_length_threshold, + size_t num_readers = 0); void FillPairedIndex(const debruijn_graph::Graph &gp, const debruijn_graph::MapLibBase &process_libs, const debruijn_graph::SequenceMapper &mapper, SequencingLib &reads, PairedIndex &index, std::unique_ptr filter, unsigned filter_threshold, - unsigned round_thr = 0, bool use_binary = true); + unsigned round_thr = 0, bool use_binary = true, size_t num_readers = 0); std::unique_ptr FillEdgePairFilter(const debruijn_graph::Graph &gp, const debruijn_graph::SequenceMapper &mapper, diff --git a/src/projects/spades/mismatch_correction.cpp b/src/projects/spades/mismatch_correction.cpp index 27e0e0fde8..b5f3e177b7 100644 --- a/src/projects/spades/mismatch_correction.cpp +++ b/src/projects/spades/mismatch_correction.cpp @@ -430,7 +430,7 @@ namespace mismatches { continue; auto &reads = dataset.reads[i]; - auto single_streams = single_binary_readers(reads, /*followed by rc */true, /*binary*/true); + auto single_streams = single_binary_readers(reads, /*followed by rc */true, /*binary*/true, num_readers_); proccess_lib_func_(&statistics, *mapper, single_streams); } diff --git a/src/projects/spades/pair_info_count.cpp b/src/projects/spades/pair_info_count.cpp index 84df721d66..d976b8b398 100644 --- a/src/projects/spades/pair_info_count.cpp +++ b/src/projects/spades/pair_info_count.cpp @@ -180,7 +180,7 @@ void PairInfoCountBase::execute(graph_pack::GraphPack &gp, const char *, size_t edgepairs = 0; if (!paired_info::CollectLibInformation(graph, map_lib_func, *ChooseProperMapper(gp, lib), - edgepairs, lib, edge_length_threshold)) { + edgepairs, lib, edge_length_threshold, num_readers)) { cfg::get_writable().ds.reads[i].data().mean_insert_size = 0.0; WARN("Unable to estimate insert size for paired library #" << i); if (rl > 0 && rl <= k) { @@ -226,7 +226,7 @@ void PairInfoCountBase::execute(graph_pack::GraphPack &gp, const char *, paired_info::FillPairedIndex(graph, map_lib_func,*ChooseProperMapper(gp, lib), lib, gp.get_mutable()[i], - std::move(filter), filter_threshold, round_thr); + std::move(filter), filter_threshold, round_thr, true, num_readers); } } From 2d31adf11dd599e4b549cc6018cae6915582ed21 Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Wed, 11 Sep 2024 18:21:39 -0700 Subject: [PATCH 095/102] Move MPI detection down to project --- src/cmake/deps.cmake | 29 --------------------- src/cmake/includes.cmake | 4 --- src/projects/hpcspades/CMakeLists.txt | 37 +++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 33 deletions(-) diff --git a/src/cmake/deps.cmake b/src/cmake/deps.cmake index ad1f82778e..bb27036aeb 100644 --- a/src/cmake/deps.cmake +++ b/src/cmake/deps.cmake @@ -19,35 +19,6 @@ find_package(Readline QUIET) set(CURSES_NEED_NCURSES TRUE) find_package(Curses QUIET) -set(MPI_DETERMINE_LIBRARY_VERSION TRUE) -find_package(MPI) -if (MPI_FOUND) - # Determine MPI vendor and MPI runtime version - # 
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/MPIVendorName.c.in" - # "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/MPIVendorName.c" - # IMMEDIATE @ONLY) - # try_run(MPI_VENDOR_NAME_RUN MPI_HAVE_VENDOR_NAME - # ${CMAKE_BINARY_DIR} - # "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/MPIVendorName.c" - # RUN_OUTPUT_VARIABLE MPI_RUNTIME_NAME) - # configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/MPIVendorVersion.c.in" - # "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/MPIVendorVersion.c" - # IMMEDIATE @ONLY) - # try_run(MPI_VENDOR_VERSION_RUN MPI_HAVE_VENDOR_VERSION - # ${CMAKE_BINARY_DIR} - # "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/MPIVendorVersion.c" - # RUN_OUTPUT_VARIABLE MPI_RUNTIME_VERSION) - message(STATUS "Detected MPI runtime: ${MPI_C_LIBRARY_VERSION_STRING}") - - if ("${MPI_C_LIBRARY_VERSION_STRING}" MATCHES "^Open MPI") - string(REGEX REPLACE "Open MPI v([0-9]+).*" "\\1" OPENMPI_MAJOR_VERSION "${MPI_C_LIBRARY_VERSION_STRING}") - message(STATUS "Open MPI runtime detected, major version: ${OPENMPI_MAJOR_VERSION}") - if (OPENMPI_MAJOR_VERSION STREQUAL 3) - message(FATAL_ERROR "Open MPI version ${OPENMPI_MAJOR_VERSION}.x is known to be buggy") - endif() - endif() -endif() - # Use included boost unless explicitly specified if (NOT SPADES_BOOST_ROOT) set(BOOST_ROOT "${EXT_DIR}/include") diff --git a/src/cmake/includes.cmake b/src/cmake/includes.cmake index 0ea44afd71..a43b6594c2 100644 --- a/src/cmake/includes.cmake +++ b/src/cmake/includes.cmake @@ -13,7 +13,3 @@ endif() if (SPADES_USE_JEMALLOC) include_directories("$/../include") endif() - -if (MPI_FOUND) - include_directories("${MPI_INCLUDE_PATH}") -endif() diff --git a/src/projects/hpcspades/CMakeLists.txt b/src/projects/hpcspades/CMakeLists.txt index 616384fad1..4f0dbbf4fc 100644 --- a/src/projects/hpcspades/CMakeLists.txt +++ b/src/projects/hpcspades/CMakeLists.txt @@ -6,6 +6,43 @@ project(hpcspades CXX) +set(MPI_DETERMINE_LIBRARY_VERSION TRUE) +find_package(MPI) +if (MPI_FOUND) + # Determine MPI vendor and MPI runtime version + # configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/MPIVendorName.c.in" + # "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/MPIVendorName.c" + # IMMEDIATE @ONLY) + # try_run(MPI_VENDOR_NAME_RUN MPI_HAVE_VENDOR_NAME + # ${CMAKE_BINARY_DIR} + # "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/MPIVendorName.c" + # RUN_OUTPUT_VARIABLE MPI_RUNTIME_NAME) + # configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/MPIVendorVersion.c.in" + # "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/MPIVendorVersion.c" + # IMMEDIATE @ONLY) + # try_run(MPI_VENDOR_VERSION_RUN MPI_HAVE_VENDOR_VERSION + # ${CMAKE_BINARY_DIR} + # "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/MPIVendorVersion.c" + # RUN_OUTPUT_VARIABLE MPI_RUNTIME_VERSION) + message(STATUS "Detected MPI runtime: ${MPI_C_LIBRARY_VERSION_STRING}") + + if ("${MPI_C_LIBRARY_VERSION_STRING}" MATCHES "^Open MPI") + string(REGEX REPLACE "Open MPI v([0-9]+).*" "\\1" OPENMPI_MAJOR_VERSION "${MPI_C_LIBRARY_VERSION_STRING}") + message(STATUS "Open MPI runtime detected, major version: ${OPENMPI_MAJOR_VERSION}") + if (OPENMPI_MAJOR_VERSION STREQUAL 3) + message(FATAL_ERROR "Open MPI version ${OPENMPI_MAJOR_VERSION}.x is known to be buggy") + endif() + endif() +endif() + +if (NOT MPI_ENABLE AND (NOT MPI_FOUND OR MPI_DISABLE)) + message(FATAL_ERROR "hpcSPAdees requires MPI to be enabled") +endif() + +if (MPI_FOUND) + include_directories("${MPI_INCLUDE_PATH}") +endif() + add_subdirectory(common) 
add_library(spades-stages-hpc STATIC distance_estimation_mpi.cpp) From 155b203ca180b48a263c1088588b10aaa0e105a2 Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Wed, 11 Sep 2024 18:43:48 -0700 Subject: [PATCH 096/102] Normalize include paths --- src/projects/hpcspades/CMakeLists.txt | 3 ++- src/projects/hpcspades/distance_estimation_mpi.cpp | 2 +- src/projects/hpcspades/distance_estimation_mpi.hpp | 2 +- src/projects/hpcspades/gap_closer_mpi.hpp | 6 +++--- src/projects/hpcspades/main_mpi.cpp | 4 ++-- src/projects/hpcspades/mismatch_correction_mpi.hpp | 6 +++--- src/projects/hpcspades/{common => mpi}/CMakeLists.txt | 0 .../hpcspades/{common => mpi}/alignment/CMakeLists.txt | 0 .../alignment/sequence_mapper_notifier_mpi.cpp | 0 .../alignment/sequence_mapper_notifier_mpi.hpp | 2 +- .../construction/debruijn_graph_constructor_mpi.hpp | 4 ++-- .../hpcspades/{common => mpi}/kmer_index/CMakeLists.txt | 0 .../extension_index/kmer_extension_index_builder_mpi.hpp | 6 +++--- .../kmer_index/kmer_mph/kmer_index_builder_mpi.hpp | 2 +- .../{common => mpi}/kmer_index/logger/mpi_log_writers.cpp | 0 .../{common => mpi}/kmer_index/logger/mpi_log_writers.hpp | 0 .../kmer_index/ph_map/perfect_hash_map_builder_mpi.hpp | 2 +- .../hpcspades/{common => mpi}/paired_info/CMakeLists.txt | 0 .../{common => mpi}/paired_info/distance_estimation.cpp | 0 .../{common => mpi}/paired_info/distance_estimation.hpp | 2 +- .../paired_info/distance_estimation_utils.cpp | 0 .../paired_info/distance_estimation_utils.hpp | 0 .../hpcspades/{common => mpi}/pipeline/CMakeLists.txt | 0 .../hpcspades/{common => mpi}/pipeline/mpi_stage.cpp | 0 .../hpcspades/{common => mpi}/pipeline/mpi_stage.hpp | 0 .../hpcspades/{common => mpi}/pipeline/partask_mpi.hpp | 0 .../hpcspades/{common => mpi}/stages/CMakeLists.txt | 0 .../hpcspades/{common => mpi}/stages/construction_mpi.cpp | 8 ++++---- .../hpcspades/{common => mpi}/stages/construction_mpi.hpp | 2 +- .../hpcspades/{common => mpi}/stages/test_mpi.cpp | 4 ++-- .../hpcspades/{common => mpi}/utils/CMakeLists.txt | 0 .../{common => mpi}/utils/logger/mpi_log_writers.cpp | 0 .../{common => mpi}/utils/logger/mpi_log_writers.hpp | 0 src/projects/hpcspades/pair_info_count_mpi.hpp | 4 ++-- src/projects/hpcspades/pipeline.cpp | 4 ++-- 35 files changed, 32 insertions(+), 31 deletions(-) rename src/projects/hpcspades/{common => mpi}/CMakeLists.txt (100%) rename src/projects/hpcspades/{common => mpi}/alignment/CMakeLists.txt (100%) rename src/projects/hpcspades/{common => mpi}/alignment/sequence_mapper_notifier_mpi.cpp (100%) rename src/projects/hpcspades/{common => mpi}/alignment/sequence_mapper_notifier_mpi.hpp (98%) rename src/projects/hpcspades/{common => mpi}/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp (96%) rename src/projects/hpcspades/{common => mpi}/kmer_index/CMakeLists.txt (100%) rename src/projects/hpcspades/{common => mpi}/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp (98%) rename src/projects/hpcspades/{common => mpi}/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp (99%) rename src/projects/hpcspades/{common => mpi}/kmer_index/logger/mpi_log_writers.cpp (100%) rename src/projects/hpcspades/{common => mpi}/kmer_index/logger/mpi_log_writers.hpp (100%) rename src/projects/hpcspades/{common => mpi}/kmer_index/ph_map/perfect_hash_map_builder_mpi.hpp (98%) rename src/projects/hpcspades/{common => mpi}/paired_info/CMakeLists.txt (100%) rename src/projects/hpcspades/{common => mpi}/paired_info/distance_estimation.cpp (100%) rename 
src/projects/hpcspades/{common => mpi}/paired_info/distance_estimation.hpp (98%) rename src/projects/hpcspades/{common => mpi}/paired_info/distance_estimation_utils.cpp (100%) rename src/projects/hpcspades/{common => mpi}/paired_info/distance_estimation_utils.hpp (100%) rename src/projects/hpcspades/{common => mpi}/pipeline/CMakeLists.txt (100%) rename src/projects/hpcspades/{common => mpi}/pipeline/mpi_stage.cpp (100%) rename src/projects/hpcspades/{common => mpi}/pipeline/mpi_stage.hpp (100%) rename src/projects/hpcspades/{common => mpi}/pipeline/partask_mpi.hpp (100%) rename src/projects/hpcspades/{common => mpi}/stages/CMakeLists.txt (100%) rename src/projects/hpcspades/{common => mpi}/stages/construction_mpi.cpp (99%) rename src/projects/hpcspades/{common => mpi}/stages/construction_mpi.hpp (91%) rename src/projects/hpcspades/{common => mpi}/stages/test_mpi.cpp (95%) rename src/projects/hpcspades/{common => mpi}/utils/CMakeLists.txt (100%) rename src/projects/hpcspades/{common => mpi}/utils/logger/mpi_log_writers.cpp (100%) rename src/projects/hpcspades/{common => mpi}/utils/logger/mpi_log_writers.hpp (100%) diff --git a/src/projects/hpcspades/CMakeLists.txt b/src/projects/hpcspades/CMakeLists.txt index 4f0dbbf4fc..0b3885c4e4 100644 --- a/src/projects/hpcspades/CMakeLists.txt +++ b/src/projects/hpcspades/CMakeLists.txt @@ -43,7 +43,8 @@ if (MPI_FOUND) include_directories("${MPI_INCLUDE_PATH}") endif() -add_subdirectory(common) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) +add_subdirectory(mpi) add_library(spades-stages-hpc STATIC distance_estimation_mpi.cpp) diff --git a/src/projects/hpcspades/distance_estimation_mpi.cpp b/src/projects/hpcspades/distance_estimation_mpi.cpp index 5cc3645a82..fff6e71e4f 100644 --- a/src/projects/hpcspades/distance_estimation_mpi.cpp +++ b/src/projects/hpcspades/distance_estimation_mpi.cpp @@ -6,7 +6,7 @@ #include "distance_estimation_mpi.hpp" -#include "common/paired_info/distance_estimation_utils.hpp" +#include "mpi/paired_info/distance_estimation_utils.hpp" namespace debruijn_graph { void DistanceEstimationMPI::run(graph_pack::GraphPack &gp, const char* s) { diff --git a/src/projects/hpcspades/distance_estimation_mpi.hpp b/src/projects/hpcspades/distance_estimation_mpi.hpp index 64abe377d9..1f79e9560c 100644 --- a/src/projects/hpcspades/distance_estimation_mpi.hpp +++ b/src/projects/hpcspades/distance_estimation_mpi.hpp @@ -7,7 +7,7 @@ #pragma once -#include "common/pipeline/mpi_stage.hpp" +#include "mpi/pipeline/mpi_stage.hpp" #include "projects/spades/distance_estimation.hpp" namespace debruijn_graph { diff --git a/src/projects/hpcspades/gap_closer_mpi.hpp b/src/projects/hpcspades/gap_closer_mpi.hpp index 25c15e6de8..bdfb7663bc 100644 --- a/src/projects/hpcspades/gap_closer_mpi.hpp +++ b/src/projects/hpcspades/gap_closer_mpi.hpp @@ -8,10 +8,10 @@ #pragma once -#include "projects/spades/gap_closer.hpp" -#include "common/alignment/sequence_mapper_notifier_mpi.hpp" -#include "common/pipeline/mpi_stage.hpp" +#include "mpi/alignment/sequence_mapper_notifier_mpi.hpp" +#include "mpi/pipeline/mpi_stage.hpp" #include "io/reads/io_helper.hpp" +#include "projects/spades/gap_closer.hpp" namespace debruijn_graph { class GapClosingMPI : public GapClosingBase, public spades_mpi::MPIAssemblyStage { diff --git a/src/projects/hpcspades/main_mpi.cpp b/src/projects/hpcspades/main_mpi.cpp index 29bca4b5fb..ac310c5a0e 100644 --- a/src/projects/hpcspades/main_mpi.cpp +++ b/src/projects/hpcspades/main_mpi.cpp @@ -7,9 +7,9 @@ 
//*************************************************************************** #include "configs/config_struct.hpp" -#include "common/pipeline/partask_mpi.hpp" +#include "mpi/pipeline/partask_mpi.hpp" -#include "common/utils/logger/mpi_log_writers.hpp" +#include "mpi/utils/logger/mpi_log_writers.hpp" #include "utils/memory_limit.hpp" #include "utils/segfault_handler.hpp" #include "utils/perf/timetracer.hpp" diff --git a/src/projects/hpcspades/mismatch_correction_mpi.hpp b/src/projects/hpcspades/mismatch_correction_mpi.hpp index 75d0aee26a..9d90d54c47 100644 --- a/src/projects/hpcspades/mismatch_correction_mpi.hpp +++ b/src/projects/hpcspades/mismatch_correction_mpi.hpp @@ -6,10 +6,10 @@ #pragma once -#include "projects/spades/mismatch_correction.hpp" -#include "common/alignment/sequence_mapper_notifier_mpi.hpp" -#include "common/pipeline/mpi_stage.hpp" +#include "mpi/alignment/sequence_mapper_notifier_mpi.hpp" +#include "mpi/pipeline/mpi_stage.hpp" #include "pipeline/graph_pack_helpers.h" +#include "projects/spades/mismatch_correction.hpp" namespace debruijn_graph { class MismatchCorrectionMPI : public spades_mpi::MPIAssemblyStage { diff --git a/src/projects/hpcspades/common/CMakeLists.txt b/src/projects/hpcspades/mpi/CMakeLists.txt similarity index 100% rename from src/projects/hpcspades/common/CMakeLists.txt rename to src/projects/hpcspades/mpi/CMakeLists.txt diff --git a/src/projects/hpcspades/common/alignment/CMakeLists.txt b/src/projects/hpcspades/mpi/alignment/CMakeLists.txt similarity index 100% rename from src/projects/hpcspades/common/alignment/CMakeLists.txt rename to src/projects/hpcspades/mpi/alignment/CMakeLists.txt diff --git a/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.cpp b/src/projects/hpcspades/mpi/alignment/sequence_mapper_notifier_mpi.cpp similarity index 100% rename from src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.cpp rename to src/projects/hpcspades/mpi/alignment/sequence_mapper_notifier_mpi.cpp diff --git a/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.hpp b/src/projects/hpcspades/mpi/alignment/sequence_mapper_notifier_mpi.hpp similarity index 98% rename from src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.hpp rename to src/projects/hpcspades/mpi/alignment/sequence_mapper_notifier_mpi.hpp index a8a8eddc2d..1d0029d1eb 100644 --- a/src/projects/hpcspades/common/alignment/sequence_mapper_notifier_mpi.hpp +++ b/src/projects/hpcspades/mpi/alignment/sequence_mapper_notifier_mpi.hpp @@ -14,7 +14,7 @@ #include "assembly_graph/core/graph.hpp" #include "io/reads/read_stream_vector.hpp" #include "utils/perf/timetracer.hpp" -#include "projects/hpcspades/common/pipeline/partask_mpi.hpp" +#include "mpi/pipeline/partask_mpi.hpp" #include #include diff --git a/src/projects/hpcspades/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp b/src/projects/hpcspades/mpi/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp similarity index 96% rename from src/projects/hpcspades/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp rename to src/projects/hpcspades/mpi/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp index f827bb33a7..fdbf16cf9d 100644 --- a/src/projects/hpcspades/common/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp +++ b/src/projects/hpcspades/mpi/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp @@ -5,9 +5,9 @@ //* See file LICENSE for details. 
//*************************************************************************** -#include "projects/hpcspades/common/pipeline/partask_mpi.hpp" +#include "mpi/pipeline/partask_mpi.hpp" #include "io/binary/graph.hpp" -#include "common/assembly_graph/construction/debruijn_graph_constructor.hpp" +#include "assembly_graph/construction/debruijn_graph_constructor.hpp" namespace debruijn_graph { template diff --git a/src/projects/hpcspades/common/kmer_index/CMakeLists.txt b/src/projects/hpcspades/mpi/kmer_index/CMakeLists.txt similarity index 100% rename from src/projects/hpcspades/common/kmer_index/CMakeLists.txt rename to src/projects/hpcspades/mpi/kmer_index/CMakeLists.txt diff --git a/src/projects/hpcspades/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp b/src/projects/hpcspades/mpi/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp similarity index 98% rename from src/projects/hpcspades/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp rename to src/projects/hpcspades/mpi/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp index 8e6a8ba540..8fbb25861a 100644 --- a/src/projects/hpcspades/common/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp +++ b/src/projects/hpcspades/mpi/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp @@ -10,13 +10,13 @@ #include "kmer_index/extension_index/kmer_extension_index_builder.hpp" #include "kmer_index/kmer_mph/kmer_index_builder.hpp" -#include "projects/hpcspades/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp" +#include "mpi/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp" #include "kmer_index/kmer_mph/kmer_splitters.hpp" #include "kmer_index/kmer_counting.hpp" -#include "projects/hpcspades/common/kmer_index/ph_map/perfect_hash_map_builder_mpi.hpp" +#include "mpi/kmer_index/ph_map/perfect_hash_map_builder_mpi.hpp" #include "io/reads/multifile_reader.hpp" -#include "projects/hpcspades/common/pipeline/partask_mpi.hpp" +#include "mpi/pipeline/partask_mpi.hpp" namespace kmers { class DeBruijnExtensionIndexBuilderMPI : public DeBruijnExtensionIndexBuilder { diff --git a/src/projects/hpcspades/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp b/src/projects/hpcspades/mpi/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp similarity index 99% rename from src/projects/hpcspades/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp rename to src/projects/hpcspades/mpi/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp index b5c759af95..8055b1942f 100644 --- a/src/projects/hpcspades/common/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp +++ b/src/projects/hpcspades/mpi/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp @@ -8,7 +8,7 @@ #include "kmer_index/kmer_mph/kmer_index_builder.hpp" #include "kmer_index/kmer_mph/kmer_buckets.hpp" -#include "projects/hpcspades/common/pipeline/partask_mpi.hpp" +#include "mpi/pipeline/partask_mpi.hpp" namespace kmers { template diff --git a/src/projects/hpcspades/common/kmer_index/logger/mpi_log_writers.cpp b/src/projects/hpcspades/mpi/kmer_index/logger/mpi_log_writers.cpp similarity index 100% rename from src/projects/hpcspades/common/kmer_index/logger/mpi_log_writers.cpp rename to src/projects/hpcspades/mpi/kmer_index/logger/mpi_log_writers.cpp diff --git a/src/projects/hpcspades/common/kmer_index/logger/mpi_log_writers.hpp b/src/projects/hpcspades/mpi/kmer_index/logger/mpi_log_writers.hpp similarity index 100% rename from src/projects/hpcspades/common/kmer_index/logger/mpi_log_writers.hpp rename to 
src/projects/hpcspades/mpi/kmer_index/logger/mpi_log_writers.hpp diff --git a/src/projects/hpcspades/common/kmer_index/ph_map/perfect_hash_map_builder_mpi.hpp b/src/projects/hpcspades/mpi/kmer_index/ph_map/perfect_hash_map_builder_mpi.hpp similarity index 98% rename from src/projects/hpcspades/common/kmer_index/ph_map/perfect_hash_map_builder_mpi.hpp rename to src/projects/hpcspades/mpi/kmer_index/ph_map/perfect_hash_map_builder_mpi.hpp index c96115f9b8..9a69da431c 100644 --- a/src/projects/hpcspades/common/kmer_index/ph_map/perfect_hash_map_builder_mpi.hpp +++ b/src/projects/hpcspades/mpi/kmer_index/ph_map/perfect_hash_map_builder_mpi.hpp @@ -9,7 +9,7 @@ #include "kmer_index/ph_map/kmer_maps.hpp" #include "kmer_index/ph_map/cqf_hash_map.hpp" -#include "..//kmer_mph/kmer_index_builder_mpi.hpp" +#include "mpi/kmer_index/kmer_mph/kmer_index_builder_mpi.hpp" #include "kmer_index/kmer_mph/kmer_index_builder.hpp" #include "kmer_index/kmer_mph/kmer_splitters.hpp" #include "common/utils/perf/timetracer.hpp" diff --git a/src/projects/hpcspades/common/paired_info/CMakeLists.txt b/src/projects/hpcspades/mpi/paired_info/CMakeLists.txt similarity index 100% rename from src/projects/hpcspades/common/paired_info/CMakeLists.txt rename to src/projects/hpcspades/mpi/paired_info/CMakeLists.txt diff --git a/src/projects/hpcspades/common/paired_info/distance_estimation.cpp b/src/projects/hpcspades/mpi/paired_info/distance_estimation.cpp similarity index 100% rename from src/projects/hpcspades/common/paired_info/distance_estimation.cpp rename to src/projects/hpcspades/mpi/paired_info/distance_estimation.cpp diff --git a/src/projects/hpcspades/common/paired_info/distance_estimation.hpp b/src/projects/hpcspades/mpi/paired_info/distance_estimation.hpp similarity index 98% rename from src/projects/hpcspades/common/paired_info/distance_estimation.hpp rename to src/projects/hpcspades/mpi/paired_info/distance_estimation.hpp index 95635f4405..e6a0894bef 100644 --- a/src/projects/hpcspades/common/paired_info/distance_estimation.hpp +++ b/src/projects/hpcspades/mpi/paired_info/distance_estimation.hpp @@ -8,7 +8,7 @@ #define MPI_DISTANCE_ESTIMATION_HPP_ #include "common/paired_info/distance_estimation.hpp" -#include "projects/hpcspades/common/pipeline/partask_mpi.hpp" +#include "mpi/pipeline/partask_mpi.hpp" namespace omnigraph { namespace de { diff --git a/src/projects/hpcspades/common/paired_info/distance_estimation_utils.cpp b/src/projects/hpcspades/mpi/paired_info/distance_estimation_utils.cpp similarity index 100% rename from src/projects/hpcspades/common/paired_info/distance_estimation_utils.cpp rename to src/projects/hpcspades/mpi/paired_info/distance_estimation_utils.cpp diff --git a/src/projects/hpcspades/common/paired_info/distance_estimation_utils.hpp b/src/projects/hpcspades/mpi/paired_info/distance_estimation_utils.hpp similarity index 100% rename from src/projects/hpcspades/common/paired_info/distance_estimation_utils.hpp rename to src/projects/hpcspades/mpi/paired_info/distance_estimation_utils.hpp diff --git a/src/projects/hpcspades/common/pipeline/CMakeLists.txt b/src/projects/hpcspades/mpi/pipeline/CMakeLists.txt similarity index 100% rename from src/projects/hpcspades/common/pipeline/CMakeLists.txt rename to src/projects/hpcspades/mpi/pipeline/CMakeLists.txt diff --git a/src/projects/hpcspades/common/pipeline/mpi_stage.cpp b/src/projects/hpcspades/mpi/pipeline/mpi_stage.cpp similarity index 100% rename from src/projects/hpcspades/common/pipeline/mpi_stage.cpp rename to 
src/projects/hpcspades/mpi/pipeline/mpi_stage.cpp diff --git a/src/projects/hpcspades/common/pipeline/mpi_stage.hpp b/src/projects/hpcspades/mpi/pipeline/mpi_stage.hpp similarity index 100% rename from src/projects/hpcspades/common/pipeline/mpi_stage.hpp rename to src/projects/hpcspades/mpi/pipeline/mpi_stage.hpp diff --git a/src/projects/hpcspades/common/pipeline/partask_mpi.hpp b/src/projects/hpcspades/mpi/pipeline/partask_mpi.hpp similarity index 100% rename from src/projects/hpcspades/common/pipeline/partask_mpi.hpp rename to src/projects/hpcspades/mpi/pipeline/partask_mpi.hpp diff --git a/src/projects/hpcspades/common/stages/CMakeLists.txt b/src/projects/hpcspades/mpi/stages/CMakeLists.txt similarity index 100% rename from src/projects/hpcspades/common/stages/CMakeLists.txt rename to src/projects/hpcspades/mpi/stages/CMakeLists.txt diff --git a/src/projects/hpcspades/common/stages/construction_mpi.cpp b/src/projects/hpcspades/mpi/stages/construction_mpi.cpp similarity index 99% rename from src/projects/hpcspades/common/stages/construction_mpi.cpp rename to src/projects/hpcspades/mpi/stages/construction_mpi.cpp index b91462a832..c4c5cb5783 100644 --- a/src/projects/hpcspades/common/stages/construction_mpi.cpp +++ b/src/projects/hpcspades/mpi/stages/construction_mpi.cpp @@ -10,18 +10,18 @@ #include "alignment/edge_index.hpp" #include "assembly_graph/construction/early_simplification.hpp" -#include "../assembly_graph/construction/debruijn_graph_constructor_mpi.hpp" +#include "mpi/assembly_graph/construction/debruijn_graph_constructor_mpi.hpp" #include "io/dataset_support/dataset_readers.hpp" #include "io/dataset_support/read_converter.hpp" #include "io/reads/coverage_filtering_read_wrapper.hpp" #include "io/reads/multifile_reader.hpp" #include "kmer_index/ph_map/coverage_hash_map_builder.hpp" -#include "../kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp" +#include "mpi/kmer_index/extension_index/kmer_extension_index_builder_mpi.hpp" #include "modules/graph_construction.hpp" #include "pipeline/genomic_info.hpp" #include "pipeline/graph_pack.hpp" -#include "projects/hpcspades/common/pipeline/mpi_stage.hpp" -#include "projects/hpcspades/common/pipeline/partask_mpi.hpp" +#include "mpi/pipeline/mpi_stage.hpp" +#include "mpi/pipeline/partask_mpi.hpp" #include "utils/filesystem/temporary.hpp" namespace debruijn_graph { diff --git a/src/projects/hpcspades/common/stages/construction_mpi.hpp b/src/projects/hpcspades/mpi/stages/construction_mpi.hpp similarity index 91% rename from src/projects/hpcspades/common/stages/construction_mpi.hpp rename to src/projects/hpcspades/mpi/stages/construction_mpi.hpp index 970c558b14..c2a2ba2345 100644 --- a/src/projects/hpcspades/common/stages/construction_mpi.hpp +++ b/src/projects/hpcspades/mpi/stages/construction_mpi.hpp @@ -6,7 +6,7 @@ #pragma once -#include "projects/hpcspades/common/pipeline/mpi_stage.hpp" +#include "mpi/pipeline/mpi_stage.hpp" namespace debruijn_graph { struct ConstructionStorage; diff --git a/src/projects/hpcspades/common/stages/test_mpi.cpp b/src/projects/hpcspades/mpi/stages/test_mpi.cpp similarity index 95% rename from src/projects/hpcspades/common/stages/test_mpi.cpp rename to src/projects/hpcspades/mpi/stages/test_mpi.cpp index 39024f4e88..f521f7c6c9 100644 --- a/src/projects/hpcspades/common/stages/test_mpi.cpp +++ b/src/projects/hpcspades/mpi/stages/test_mpi.cpp @@ -4,8 +4,8 @@ //* See file LICENSE for details. 
//*************************************************************************** -#include "projects/hpcspades/common/pipeline/mpi_stage.hpp" -#include "projects/hpcspades/common/pipeline/partask_mpi.hpp" +#include "mpi/pipeline/mpi_stage.hpp" +#include "mpi/pipeline/partask_mpi.hpp" #include #include diff --git a/src/projects/hpcspades/common/utils/CMakeLists.txt b/src/projects/hpcspades/mpi/utils/CMakeLists.txt similarity index 100% rename from src/projects/hpcspades/common/utils/CMakeLists.txt rename to src/projects/hpcspades/mpi/utils/CMakeLists.txt diff --git a/src/projects/hpcspades/common/utils/logger/mpi_log_writers.cpp b/src/projects/hpcspades/mpi/utils/logger/mpi_log_writers.cpp similarity index 100% rename from src/projects/hpcspades/common/utils/logger/mpi_log_writers.cpp rename to src/projects/hpcspades/mpi/utils/logger/mpi_log_writers.cpp diff --git a/src/projects/hpcspades/common/utils/logger/mpi_log_writers.hpp b/src/projects/hpcspades/mpi/utils/logger/mpi_log_writers.hpp similarity index 100% rename from src/projects/hpcspades/common/utils/logger/mpi_log_writers.hpp rename to src/projects/hpcspades/mpi/utils/logger/mpi_log_writers.hpp diff --git a/src/projects/hpcspades/pair_info_count_mpi.hpp b/src/projects/hpcspades/pair_info_count_mpi.hpp index 6ca31ebc2f..143ecc4ff5 100644 --- a/src/projects/hpcspades/pair_info_count_mpi.hpp +++ b/src/projects/hpcspades/pair_info_count_mpi.hpp @@ -8,8 +8,8 @@ #pragma once #include "projects/spades/pair_info_count.hpp" -#include "common/alignment/sequence_mapper_notifier_mpi.hpp" -#include "common/pipeline/mpi_stage.hpp" +#include "mpi/alignment/sequence_mapper_notifier_mpi.hpp" +#include "mpi/pipeline/mpi_stage.hpp" namespace debruijn_graph { class PairInfoCountMPI : public PairInfoCountBase, public spades_mpi::MPIAssemblyStage { diff --git a/src/projects/hpcspades/pipeline.cpp b/src/projects/hpcspades/pipeline.cpp index 74b15f831c..8acf95ef8c 100644 --- a/src/projects/hpcspades/pipeline.cpp +++ b/src/projects/hpcspades/pipeline.cpp @@ -23,12 +23,12 @@ #include "library/library.hpp" #include "pipeline/graph_pack.hpp" #include "pipeline/stage.hpp" -#include "common/pipeline/mpi_stage.hpp" +#include "mpi/pipeline/mpi_stage.hpp" #include "alignment/kmer_mapper.hpp" #include "stages/genomic_info_filler.hpp" #include "stages/read_conversion.hpp" -#include "common/stages/construction_mpi.hpp" +#include "mpi/stages/construction_mpi.hpp" #include "stages/simplification.hpp" #include "stages/ss_edge_split.hpp" #include "configs/config_struct.hpp" From 220ddade9a258a7d895aeb658f0e132cbdaccd9c Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Wed, 18 Sep 2024 15:02:48 -0700 Subject: [PATCH 097/102] Add spades.py bits --- src/projects/spades/pipeline/spades.py | 59 ++++- .../spades_pipeline/commands_parser.py | 43 +++- .../executors/executor_local.py | 16 +- .../spades_pipeline/executors/executor_mpi.py | 67 ++++++ .../executors/executor_save_mpi_sh.py | 34 +++ .../executors/executor_save_yaml.py | 8 +- .../spades_pipeline/executors/executors.py | 215 +++++++++++++++++- .../spades_pipeline/options_parser.py | 62 +++++ .../spades_pipeline/options_storage.py | 2 + .../stages/spades_iteration_stage.py | 5 +- 10 files changed, 492 insertions(+), 19 deletions(-) create mode 100644 src/projects/spades/pipeline/spades_pipeline/executors/executor_mpi.py create mode 100644 src/projects/spades/pipeline/spades_pipeline/executors/executor_save_mpi_sh.py diff --git a/src/projects/spades/pipeline/spades.py b/src/projects/spades/pipeline/spades.py index 
258c0a4902..8a5cc1feab 100755 --- a/src/projects/spades/pipeline/spades.py +++ b/src/projects/spades/pipeline/spades.py @@ -36,7 +36,6 @@ import options_parser from stages.pipeline import Pipeline -import executor_local import executor_save_yaml def print_used_values(cfg, log): @@ -561,6 +560,20 @@ def build_pipeline(pipeline, cfg, output_files, tmp_configs_dir, dataset_data, l terminating_stage.add_to_pipeline(pipeline, cfg, output_files, tmp_configs_dir, dataset_data, log, bin_home, ext_python_modules_home, python_modules_home) +def get_executor(log): + import importlib + module_name = "executor_" + options_storage.args.grid_engine + executor_module = importlib.import_module(module_name) + return executor_module.Executor(log) + + +def get_sh_dump_executor(log): + if options_storage.args.grid_engine == "local": + return executor_save_yaml.Executor(log) + else: + import executor_save_mpi_sh + return executor_save_mpi_sh.Executor(log) + def check_dir_is_empty(dir_name): if dir_name is not None and \ @@ -577,7 +590,7 @@ def init_parser(args): output_dir = options_parser.get_output_dir_from_args() if output_dir is None: support.error("the output_dir is not set! It is a mandatory parameter (-o output_dir).") - + command_line, options, script, err_msg = get_options_from_params( os.path.join(output_dir, "params.txt"), args[0]) @@ -597,6 +610,8 @@ def main(args): options_parser.usage(spades_version) sys.exit(0) + jobs = [] + executor = None pipeline = Pipeline() log = create_logger() @@ -624,17 +639,45 @@ def main(args): pipeline.generate_configs(cfg, spades_home, tmp_configs_dir) commands = pipeline.get_commands(cfg) - executor = executor_save_yaml.Executor(log) + executor = get_sh_dump_executor(log) executor.execute(commands) - if not options_storage.args.only_generate_config: - executor = executor_local.Executor(log) - executor.execute(commands) + executor = get_executor(log) + if options_storage.args.grid_engine != "local": + executor.dump_commands(commands, os.path.join(options_storage.args.output_dir, "run_spades_on_cluster.sh")) + + if options_storage.args.only_generate_config: + jobs = None + else: + jobs = executor.execute(commands) + + if jobs is not None and len(jobs): + last_job = jobs[-1] + if options_storage.args.grid_wait: + log.info("Waiting for the last job: " + last_job) + executor.join(last_job) + else: + log.info("Last job name: " + last_job) + + is_result = (options_storage.args.grid_engine != "save_yaml" and ( + not options_storage.args.only_generate_config)) and \ + (options_storage.args.grid_wait or options_storage.args.grid_engine == "local") + if is_result: + # TODO make it executor method executor.is_fake() print_info_about_output_files(cfg, log, output_files) if not support.log_warnings(log): - log.info("\n======= SPAdes pipeline finished.") - + if is_result: + log.info("\n======= SPAdes pipeline finished.") # otherwise it finished WITH WARNINGS + else: + log.info("\n======= SPAdes pipeline submitted.") + + except KeyboardInterrupt: + log.info("Ctrl + C pressed, killing jobs...") + if executor is not None: + for job in jobs: + log.info("Killing " + job) + executor.kill(job) except Exception: exc_type, exc_value, _ = sys.exc_info() if exc_type == SystemExit: diff --git a/src/projects/spades/pipeline/spades_pipeline/commands_parser.py b/src/projects/spades/pipeline/spades_pipeline/commands_parser.py index 8b58207136..2d1610820f 100644 --- a/src/projects/spades/pipeline/spades_pipeline/commands_parser.py +++ 
b/src/projects/spades/pipeline/spades_pipeline/commands_parser.py @@ -16,11 +16,16 @@ class Command(object): def __init__(self, STAGE, path, args, short_name, config_dir="", + mpi_support=False, job_uuid="", del_after=None, output_files=None): self.STAGE = STAGE self.path = path self.args = args self.short_name = short_name + self.mpi_support = mpi_support + self.job_uuid = self.generate_job_uuid() + if job_uuid != "": + self.job_uuid = job_uuid self.config_dir = config_dir self.del_after = del_after if self.del_after is None: @@ -30,11 +35,33 @@ def __init__(self, STAGE, path, args, short_name, config_dir="", self.output_files = [] def to_list(self): - return [self.path] + self.args + return [self.path.format(spades_core="spades-core")] + self.args + + def to_sh_list(self): + if self.path == sys.executable: + return ["$PYTHON"] + self.args + return [self.path.format(spades_core="spades-core")] + self.args + + def to_mpi_list(self): + return [self.path.format(spades_core="spades-hpc")] + self.args + + def to_mpi_sh_list(self): + if self.path == sys.executable: + return ["$PYTHON"] + self.args + return [self.path.format(spades_core="spades-hpc")] + self.args def __str__(self): return ' '.join(self.to_list()) + def sh_str(self): + return ' '.join(self.to_sh_list()) + + def mpi_str(self): + return ' '.join(self.to_mpi_list()) + + def mpi_sh_str(self): + return ' '.join(self.to_mpi_sh_list()) + def run(self, log): support.sys_call(self.to_list(), log) @@ -43,10 +70,17 @@ def to_dict(self): "path": self.path, "args": self.args, "short_name": self.short_name, + "mpi_support": self.mpi_support, + "job_uuid": self.job_uuid, "config_dir": self.config_dir, "output_files": self.output_files, "del_after": self.del_after} + def generate_job_uuid(self): + return self.STAGE.replace(' ', '_') + "_" + \ + ''.join([random.choice(string.ascii_uppercase + string.digits) for k in range(32)]) + + def write_commands_to_sh(commands, output_file): with open(output_file, 'w') as fw: @@ -55,6 +89,13 @@ def write_commands_to_sh(commands, output_file): fw.write(command.__str__() + "\n") +def write_commands_to_mpi_sh(commands, output_file): + with open(output_file, 'w') as fw: + fw.write("set -e\n") + for command in commands: + fw.write(command.mpi_str() + "\n") + + def write_commands_to_yaml(commands, output_file): import pyyaml3 as yaml diff --git a/src/projects/spades/pipeline/spades_pipeline/executors/executor_local.py b/src/projects/spades/pipeline/spades_pipeline/executors/executor_local.py index 6a38563c09..2828081a40 100644 --- a/src/projects/spades/pipeline/spades_pipeline/executors/executor_local.py +++ b/src/projects/spades/pipeline/spades_pipeline/executors/executor_local.py @@ -20,10 +20,9 @@ def __init__(self, log): super(Executor, self).__init__(log) def execute(self, commands): - for num in range(len(commands)): - command = commands[num] + for num, command in enumerate(commands): + stage_checkpoint_path = options_storage.get_stage_filename(num, command.short_name) if options_storage.args.continue_mode: - stage_checkpoint_path = options_storage. get_stage_filename(num, command.short_name) if os.path.isfile(stage_checkpoint_path) and \ ("_start" not in command.short_name) and \ ("_finish" not in command.short_name): @@ -51,7 +50,8 @@ def execute(self, commands): "pipeline (--stop-after was set to '%s'). 
" "You can continue later with --continue or " "--restart-from options\n" % options_storage.args.stop_after) - break + return None + return None def rm_files(self, command): if options_storage.args.no_clear_after: @@ -76,4 +76,10 @@ def touch_file(self, command, num): path = options_storage.get_stage_filename(num, command.short_name) if not os.path.exists(os.path.dirname(path)): os.makedirs(os.path.dirname(path)) - open(path, 'a').close() \ No newline at end of file + open(path, 'a').close() + + def join(self, job_name): + assert (job_name is None) + + def kill(self, job_name): + assert (job_name is None) diff --git a/src/projects/spades/pipeline/spades_pipeline/executors/executor_mpi.py b/src/projects/spades/pipeline/spades_pipeline/executors/executor_mpi.py new file mode 100644 index 0000000000..e624f4163d --- /dev/null +++ b/src/projects/spades/pipeline/spades_pipeline/executors/executor_mpi.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 + +############################################################################ +# Copyright (c) 2023-2024 SPAdes team +# All Rights Reserved +# See file LICENSE for details. +############################################################################ + +import os +import shutil +import support +import executors +import commands_parser +import options_storage +import executor_local + + +class Executor(executor_local.Executor): + def execute(self, commands): + for num, command in enumerate(commands): + stage_checkpoint_path = options_storage.get_stage_filename(num, command.short_name) + + if options_storage.args.continue_mode: + if os.path.isfile(stage_checkpoint_path) and \ + ("_start" not in command.short_name) and \ + ("_finish" not in command.short_name): + self.log.info("===== Skipping %s (already processed)" % command.STAGE) + continue + + if "_finish" not in command.short_name: + self.log.info("\n===== %s started. \n" % command.STAGE) + + # `true' command does nothing, it corresponds to an arbitrary stage + # used for cleanup, restart-from, and other such stuff. We skip its + # actual running for the sake of log purity and beauty + if command.__str__() != "true": + if (command.mpi_support): + # cmd = "mpiexec -np 4 xterm -e gdb -ex run --args " + command.__str__() + valgrind = "valgrind" if options_storage.args.grid_valgrind else "" + cmd = "mpiexec --bind-to none -np {NODES} {VALGRIND} ".format(NODES=options_storage.args.grid_nnodes, VALGRIND=valgrind) + command.mpi_str() + self.log.info("\n== Running: %s\n" % cmd) + support.sys_call(cmd, self.log) + else: + self.log.info("\n== Running: %s\n" % command.__str__()) + command.run(self.log) + + + self.rm_files(command) + self.check_output(command) + + if "_start" not in command.short_name: + self.log.info("\n===== %s finished. \n" % command.STAGE) + + self.touch_file(command, num) + + if options_storage.args.stop_after == command.short_name or \ + ("_finish" in command.short_name and + options_storage.args.stop_after == command.short_name.split('_')[0]): + self.log.info("\n======= Skipping the rest of SPAdes " + "pipeline (--stop-after was set to '%s'). 
" + "You can continue later with --continue or " + "--restart-from options\n" % options_storage.args.stop_after) + return None + return None + + def dump_commands(self, commands, outputfile): + commands_parser.write_commands_to_mpi_sh(commands, outputfile) diff --git a/src/projects/spades/pipeline/spades_pipeline/executors/executor_save_mpi_sh.py b/src/projects/spades/pipeline/spades_pipeline/executors/executor_save_mpi_sh.py new file mode 100644 index 0000000000..97e6b93268 --- /dev/null +++ b/src/projects/spades/pipeline/spades_pipeline/executors/executor_save_mpi_sh.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 + +############################################################################ +# Copyright (c) 2023-2024 SPAdes team +# All Rights Reserved +# See file LICENSE for details. +############################################################################ + +import os +import executors +import commands_parser +import options_storage + + +class Executor(executors.ExecutorBase): + def __init__(self, log): + super(Executor, self).__init__(log) + + def execute(self, commands): + super(Executor, self).execute(commands) + commands_parser.write_commands_to_mpi_sh(commands, os.path.join(options_storage.args.output_dir, "run_spades.sh")) + commands_parser.write_commands_to_yaml(commands, + os.path.join(options_storage.args.output_dir, + "run_spades.yaml")) + return None + + def dump_commands(self, commands, outputfile): + commands_parser.write_commands_to_mpi_sh(commands, outputfile) + + def join(self, job_name): + assert (job_name is None) + + def kill(self, job_name): + assert (job_name is None) diff --git a/src/projects/spades/pipeline/spades_pipeline/executors/executor_save_yaml.py b/src/projects/spades/pipeline/spades_pipeline/executors/executor_save_yaml.py index 2b11048da8..97d3a8e263 100644 --- a/src/projects/spades/pipeline/spades_pipeline/executors/executor_save_yaml.py +++ b/src/projects/spades/pipeline/spades_pipeline/executors/executor_save_yaml.py @@ -23,9 +23,13 @@ def execute(self, commands): commands_parser.write_commands_to_yaml(commands, os.path.join(options_storage.args.output_dir, "run_spades.yaml")) + return None def dump_commands(self, commands, outputfile): commands_parser.write_commands_to_sh(commands, outputfile) - def touch_file(self, command): - pass + def join(self, job_name): + assert (job_name is None) + + def kill(self, job_name): + assert (job_name is None) diff --git a/src/projects/spades/pipeline/spades_pipeline/executors/executors.py b/src/projects/spades/pipeline/spades_pipeline/executors/executors.py index 9f46fa8325..c6e3d1131c 100644 --- a/src/projects/spades/pipeline/spades_pipeline/executors/executors.py +++ b/src/projects/spades/pipeline/spades_pipeline/executors/executors.py @@ -8,6 +8,7 @@ ############################################################################ import os +import sys from abc import ABCMeta, abstractmethod import options_storage @@ -30,5 +31,217 @@ def dump_commands(self, commands, outputfile): pass @abstractmethod - def touch_file(self, command): + def join(self, job_name): pass + + @abstractmethod + def kill(self, job_name): + pass + + +class ExecutorCluster(ExecutorBase): + grid_engine = None + grid_engine_submit_command = None + grid_engine_name_option = None + grid_engine_output_option = None + grid_engine_err_output_option = None + grid_engine_thread_option = None + grid_engine_dependency_option = None + grid_engine_memory_option = None + grid_engine_queue = None + grid_engine_minimum_node_mem = None + 
grid_engine_mpi_runtime = None + grid_engine_mpi_runtime_args = None + grid_engine_wait_command = None + grid_engine_kill_command = None + + def join(self, job_name): + support.sys_call(self.grid_engine_wait_command.format(JOB_NAME=job_name), log=self.log) + + def kill(self, job_name): + # support.sys_call(self.grid_engine_kill_command.format(JOB_NAME=job_name), log=self.log) + os.system(self.grid_engine_kill_command.format(JOB_NAME=job_name)) + + def run_cluster_command(self, cmd, uuid): + support.sys_call(cmd, log=self.log) + return uuid + + def execute(self, commands): + jobs = [] + def prev_id(): + if not jobs: + return "" + else: + return jobs[-1] + + for num, command in enumerate(commands): + stage_checkpoint_path = options_storage.get_stage_filename(num, command.short_name) + + if options_storage.args.continue_mode: + if os.path.isfile(stage_checkpoint_path) and \ + ("_start" not in command.short_name) and \ + ("_finish" not in command.short_name): + self.log.info("===== Skipping %s (already processed)" % command.STAGE) + continue + + if "_finish" not in command.short_name: + self.log.info("\n===== %s started. \n" % command.STAGE) + + # `true' command does nothing, it corresponds to an arbitrary stage + # used for cleanup, restart-from, and other such stuff We skip its + # actual running for the sake of log purity and beauty + if command.__str__() != "true": + self.log.info("\n==Submitting: %s\n" % command.__str__()) + if commands[num].mpi_support: + cmd = self.get_MPI_command(command, prev_id()) + else: + cmd = self.get_not_MPI_command(command, prev_id()) + jid = self.run_cluster_command(cmd, command.job_uuid) + if "_start" not in command.short_name: + self.log.info("\n===== %s submitted. Job ID: %s \n" % (command.STAGE, jid)) + jobs.append(jid) + + touch_command = commands_parser.Command(command.STAGE + "_touch", + "touch", + [stage_checkpoint_path], + "touch", + job_uuid=command.job_uuid + "_touch") + + touch_jid = self.run_cluster_command(self.get_not_MPI_command(touch_command, prev_id()), touch_command.job_uuid) + jobs.append(touch_jid) + + # FIXME implement + # self.rm_files(command) + # self.check_output(command) + + if options_storage.args.stop_after == command.short_name or \ + ("_finish" in command.short_name and + options_storage.args.stop_after == command.short_name.split('_')[0]): + self.log.info("\n======= Skipping the rest of SPAdes " + "pipeline (--stop-after was set to '%s'). 
" + "You can continue later with --continue or " + "--restart-from options\n" % options_storage.args.stop_after) + break + return jobs + + def dump_commands(self, commands, outputfile): + with open(outputfile, 'w') as fw: + fw.write(self.get_MPI_sh_preambula() + "\n") + + prev_id = "" + for i in range(len(commands)): + if (commands[i].mpi_support): + cmd = self.get_MPI_sh_command(commands[i], prev_id) + else: + cmd = self.get_not_MPI_sh_command(commands[i], prev_id) + fw.write(cmd + "\n") + prev_id = commands[i].job_uuid + self.log.info("Commands were saved to " + outputfile) + + def get_MPI_sh_preambula(self): + preambula = "" + log_file = options_storage.args.output_dir + "/spades.log" + preambula += "LOG_OUT=\"" + self.grid_engine_output_option.format(OUT=log_file) + "\"\n" + preambula += "ERR_OUT=\"" + self.grid_engine_err_output_option.format(ERR=log_file) + "\"\n" + memory_in_kb = int(options_storage.args.memory * 1024 * 1024) + preambula += "QUEUE=\"" + self.grid_engine_queue.format(QUEUE=options_storage.args.grid_queue) + "\"\n" + preambula += "CLUSTER_ARGS=\"$QUEUE " + \ + self.grid_engine_memory_option.format(MEMORY=memory_in_kb, TOTAL_MEMORY=memory_in_kb * options_storage.args.grid_nnodes) + " " + \ + self.grid_engine_thread_option.format(NNODES=options_storage.args.grid_nnodes, + NCPUS=options_storage.args.threads, + NPROCESSORS=options_storage.args.grid_nnodes * options_storage.args.threads) + " " + \ + self.grid_engine_minimum_node_mem.format(MEMORY=memory_in_kb) + "\"\n" + preambula += "MPIRUN_ARGS=\"" + self.grid_engine_mpi_runtime_args.format( + NNODES=options_storage.args.grid_nnodes, + NCPUS=options_storage.args.threads) + "\"\n" + preambula += "PYTHON=\"" + sys.executable + "\"\n" + + return preambula + + def get_MPI_sh_command(self, command, prev_job_name=""): + cmd = self.grid_engine_submit_command + " " + cmd += self.grid_engine_name_option.format(JOB_NAME=command.job_uuid) + " " + cmd += "$LOG_OUT " + cmd += "$ERR_OUT " + if prev_job_name != "": + cmd += self.grid_engine_dependency_option.format(WAIT_TAG=prev_job_name) + " " + cmd += "$CLUSTER_ARGS " + cmd += self.grid_engine_mpi_runtime + " $MPIRUN_ARGS " + cmd1 = cmd + cmd = "# === STAGE " + command.STAGE + "(MPI) === \n" + cmd += "CMD=\"" + command.mpi_sh_str() + "\"\n\n" + cmd += cmd1 + cmd += "$CMD\n\n" + return cmd + + def get_not_MPI_sh_command(self, command, prev_job_name=""): + cmd = "#=== STAGE " + command.STAGE + " (not MPI) ===\n" + cmd += "CMD=\"" + command.sh_str() + "\"\n\n" + + cmd += self.grid_engine_submit_command + " " + cmd += self.grid_engine_name_option.format(JOB_NAME=command.job_uuid) + " " + cmd += "$LOG_OUT " + cmd += "$ERR_OUT " + if prev_job_name != "": + cmd += self.grid_engine_dependency_option.format(WAIT_TAG=prev_job_name) + " " + cmd += "$QUEUE " + cmd += "$CMD\n\n" + return cmd + + + def get_MPI_command(self, command, prev_job_name=""): + cmd = self.grid_engine_submit_command + " " + cmd += self.grid_engine_name_option.format(JOB_NAME=command.job_uuid) + " " + log_file = options_storage.args.output_dir + "/spades.log" + cmd += self.grid_engine_output_option.format(OUT=log_file) + " " + cmd += self.grid_engine_err_output_option.format(ERR=log_file) + " " + if prev_job_name != "": + cmd += self.grid_engine_dependency_option.format(WAIT_TAG=prev_job_name) + " " + cmd += self.grid_engine_queue.format(QUEUE=options_storage.args.grid_queue) + " " + memory_in_kb = int(options_storage.args.memory * 1024 * 1024) + cmd += self.grid_engine_memory_option.format(MEMORY=memory_in_kb, 
TOTAL_MEMORY=memory_in_kb * options_storage.args.grid_nnodes) + " " + cmd += self.grid_engine_thread_option.format(NNODES=options_storage.args.grid_nnodes, + NCPUS=options_storage.args.threads, + NPROCESSORS=options_storage.args.grid_nnodes * options_storage.args.threads) + " " + cmd += self.grid_engine_minimum_node_mem.format(MEMORY=memory_in_kb) + " " + + cmd += self.grid_engine_mpi_runtime + " " + self.grid_engine_mpi_runtime_args.format( + NNODES=options_storage.args.grid_nnodes, + NCPUS=options_storage.args.threads) + " " + + if options_storage.args.grid_profile: + name = command.STAGE + "_" + command.short_name + "_" + command.job_uuid + profile = options_storage.args.output_dir + "/" + name + ".prof" + profile_line = " -x CPUPROFILE={PROFILE} ompi_profile_helper.sh ".format(PROFILE=profile) + else: + profile_line = "" + + if options_storage.args.grid_valgrind: + valgrind_line = " valgrind --track-origins=yes " + else: + valgrind_line = "" + + if options_storage.args.grid_coredump: + coredump_line = "ulimit -c unlimited && " + else: + coredump_line = "" + cmd = coredump_line + " " + cmd + cmd += profile_line + " " + cmd += valgrind_line + " " + cmd += command.mpi_str() + return cmd + + + def get_not_MPI_command(self, command, prev_job_name=""): + cmd = self.grid_engine_submit_command + " " + cmd += self.grid_engine_name_option.format(JOB_NAME=command.job_uuid) + " " + log_file = options_storage.args.output_dir + "/spades.log" + cmd += self.grid_engine_output_option.format(OUT=log_file) + " " + cmd += self.grid_engine_err_output_option.format(ERR=log_file) + " " + if prev_job_name != "": + cmd += self.grid_engine_dependency_option.format(WAIT_TAG=prev_job_name) + " " + + cmd += self.grid_engine_queue.format(QUEUE=options_storage.args.grid_queue) + " " + + cmd += command.__str__() + return cmd diff --git a/src/projects/spades/pipeline/spades_pipeline/options_parser.py b/src/projects/spades/pipeline/spades_pipeline/options_parser.py index c617826bbb..23a9e017bf 100644 --- a/src/projects/spades/pipeline/spades_pipeline/options_parser.py +++ b/src/projects/spades/pipeline/spades_pipeline/options_parser.py @@ -799,6 +799,66 @@ def add_hidden_args(pgroup_hidden): if show_help_hidden else argparse.SUPPRESS, action="help") +def add_cluster_args(pgroup_cluster): + pgroup_cluster.add_argument("--grid-engine", + metavar="", + dest="grid_engine", + default="local", + help="run under grid control\n" + "('lsf', 'slurm', 'local', 'mpi', save_yaml') " + "[default: 'local']", + action="store") + pgroup_cluster.add_argument("--grid-queue", + metavar="", + dest="grid_queue", + default="mpi-rh74", + help="submits the jobs to one of the specified queues", + action="store") + pgroup_cluster.add_argument("--grid-nnodes", + metavar="", + dest="grid_nnodes", + type=int, + default=2, + help="specifies the number of processors", + action="store") + pgroup_cluster.add_argument("--grid-wait", + dest="grid_wait", + help="wait for job finish", + action="store_true") + pgroup_cluster.add_argument("--grid-qos", + dest="grid_qos", + default="lr_normal", + help="quality of service for the jobs (for SLURM grid engine)", + metavar="", + action="store") + pgroup_cluster.add_argument("--grid-partition", + dest="grid_partition", + default="lr3", + help="partition for the resource allocation (for SLURM grid enging)", + metavar="", + action="store") + pgroup_cluster.add_argument("--grid-account", + dest="grid_account", + default="ac_scsguest", + help="charge resources used by this run to specified account (for SLURM grid 
engine)", + metavar="", + action="store") + + show_help_hidden = ("--help-hidden" in sys.argv) + + pgroup_cluster.add_argument("--grid-profile", + dest="grid_profile", + action="store_true", + help="enable mpi task profiling (for SLURM grid engine, for developers only)" if show_help_hidden else argparse.SUPPRESS) + pgroup_cluster.add_argument("--grid-valgrind", + dest="grid_valgrind", + action="store_true", + help="run mpi tasks with valgrind (for SLURM grid engine, for developers only" if show_help_hidden else argparse.SUPPRESS) + pgroup_cluster.add_argument("--grid-coredump", + dest="grid_coredump", + action="store_true", + help="enable core dumps for mpi tasks (for SLURM cluster, for developers only" if show_help_hidden else argparse.SUPPRESS) + def create_parser(): parser = argparse.ArgumentParser(prog="spades.py", formatter_class=SpadesHelpFormatter, @@ -809,12 +869,14 @@ def create_parser(): pgroup_input_data = parser.add_argument_group("Input data") pgroup_pipeline = parser.add_argument_group("Pipeline options") pgroup_advanced = parser.add_argument_group("Advanced options") + pgroup_cluster = parser.add_argument_group('Cluster execution options') pgroup_hidden = parser.add_argument_group("Hidden options") add_basic_args(pgroup_basic) add_input_data_args(pgroup_input_data) add_pipeline_args(pgroup_pipeline) add_advanced_args(pgroup_advanced) + add_cluster_args(pgroup_cluster) add_hidden_args(pgroup_hidden) return parser diff --git a/src/projects/spades/pipeline/spades_pipeline/options_storage.py b/src/projects/spades/pipeline/spades_pipeline/options_storage.py index 693af58f86..bdb825526c 100644 --- a/src/projects/spades/pipeline/spades_pipeline/options_storage.py +++ b/src/projects/spades/pipeline/spades_pipeline/options_storage.py @@ -95,6 +95,8 @@ def get_stage_filename(stage_num, stage_short_name): stage_file_name = "stage_%d_%s" % (stage_num, stage_short_name) stage_checkpoint_path = os.path.join(args.output_dir, pipeline_state_dir, stage_file_name) + if not os.path.exists(os.path.dirname(stage_checkpoint_path)): + os.makedirs(os.path.dirname(stage_checkpoint_path)) return stage_checkpoint_path diff --git a/src/projects/spades/pipeline/spades_pipeline/stages/spades_iteration_stage.py b/src/projects/spades/pipeline/spades_pipeline/stages/spades_iteration_stage.py index 2b56dc654a..78ba619498 100644 --- a/src/projects/spades/pipeline/spades_pipeline/stages/spades_iteration_stage.py +++ b/src/projects/spades/pipeline/spades_pipeline/stages/spades_iteration_stage.py @@ -177,8 +177,9 @@ def get_command(self, cfg): command = [commands_parser.Command( STAGE="K%d" % self.K, - path=os.path.join(self.bin_home, "spades-core"), + path=os.path.join(self.bin_home, "{spades_core}"), args=args, config_dir=os.path.relpath(data_dir, options_storage.args.output_dir), - short_name=self.short_name)] + short_name=self.short_name, + mpi_support=True)] return command From f9ed06234ec1465a81e6b09c3c78d4e7c5ab1ed3 Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Wed, 18 Sep 2024 15:13:11 -0700 Subject: [PATCH 098/102] Add hpcSPAdes to list of known projects --- src/cmake/proj.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cmake/proj.cmake b/src/cmake/proj.cmake index 55fd7f9b1b..63b87ab42b 100644 --- a/src/cmake/proj.cmake +++ b/src/cmake/proj.cmake @@ -8,7 +8,7 @@ # Side-by-side subprojects layout: automatically set the # SPADES_EXTERNAL_${project}_SOURCE_DIR using SPADES_ALL_PROJECTS -set(SPADES_ALL_PROJECTS 
"spades;hammer;ionhammer;corrector;spaligner;spades_tools;binspreader;pathracer") +set(SPADES_ALL_PROJECTS "spades;hammer;ionhammer;corrector;spaligner;spades_tools;binspreader;pathracer;hpcspades") set(SPADES_EXTRA_PROJECTS "mts;online_vis;cds_subgraphs") set(SPADES_KNOWN_PROJECTS "${SPADES_ALL_PROJECTS};${SPADES_EXTRA_PROJECTS}") set(SPADES_ENABLE_PROJECTS "" CACHE STRING From fcaca1143304c4dfdeaca66b9ed9b826cc607e1b Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Fri, 20 Sep 2024 18:17:48 -0700 Subject: [PATCH 099/102] Add SLURM executor --- .../executors/executor_slurm.py | 136 ++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 src/projects/spades/pipeline/spades_pipeline/executors/executor_slurm.py diff --git a/src/projects/spades/pipeline/spades_pipeline/executors/executor_slurm.py b/src/projects/spades/pipeline/spades_pipeline/executors/executor_slurm.py new file mode 100644 index 0000000000..fa80813ef9 --- /dev/null +++ b/src/projects/spades/pipeline/spades_pipeline/executors/executor_slurm.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 + +############################################################################ +# Copyright (c) 2023-2024 SPAdes team +# All Rights Reserved +# See file LICENSE for details. +############################################################################ +import sys + +import executors +from abc import ABCMeta, abstractmethod +import options_storage +import support + + +class Executor(executors.ExecutorCluster): + grid_engine = "SLURM" + grid_engine_submit_command = "sbatch" + grid_engine_slurm_args = "--hint=compute_bound --mem-bind=verbose,none --exclusive --cpus-per-task {NCPUS} --open-mode=append --kill-on-invalid-dep=yes --mem {MEMORY_MB}M --partition={PARTITION} --account={ACCOUNT} --qos={QOS}" + grid_engine_output_option = "-o {OUT}" + grid_engine_err_output_option = "-e {ERR}" + grid_engine_job_name = "--job-name {JOB_NAME}" + grid_engine_set_command = "--wrap \"{COMMAND}\"" + grid_engine_dependency_option = "--dependency afterok:{WAIT_TAG}" + grid_engine_nodes = "--nodes={NNODES} --ntasks={NNODES}" + grid_engine_kill_command = "scancel {JOB_NAME}" + grid_engine_mpirun_args = "--use-hwthread-cpus --display-devel-allocation --display-devel-map --report-bindings --map-by ppr:1:NODE:PE={NCPUS} --rank-by NODE -x OMP_NUM_THREADS={NCPUS}" + + def join(self, job_name): + log_file = options_storage.args.output_dir + "/spades.log" + cmd = self.grid_engine_submit_command.format(COMMAND="true", JOB_NAME="wait", OUT=log_file, ERR=log_file, NCPUS=1) + cmd += " " + self.grid_engine_dependency_option.format(WAIT_TAG=job_name) + cmd += " " + self.grid_engine_credentials.format(PARTITION=options_storage.args.grid_partition, + ACCOUNT=options_storage.args.grid_account, + QOS=options_storage.args.grid_qos, + QUEUE=options_storage.args.grid_queue) + cmd += " --wait" + support.sys_call(cmd, log=self.log) + + def get_MPI_sh_preambula(self): + memory_mb = int(options_storage.args.memory * 1024) + preambula = "SLURM_ARGS=\"" + self.grid_engine_slurm_args.format(NCPUS=options_storage.args.threads, + MEMORY_MB=memory_mb, + PARTITION=options_storage.args.grid_partition, + ACCOUNT=options_storage.args.grid_account, + QOS=options_storage.args.grid_qos, + QUEUE=options_storage.args.grid_queue) + "\"\n" + preambula += "MPIRUN_ARGS=\"" + self.grid_engine_mpirun_args.format(NCPUS=options_storage.args.threads) + "\"\n" + log_file = options_storage.args.output_dir + "/spades.log" + preambula += "LOG_OUT=\"" + 
self.grid_engine_output_option.format(OUT=log_file) + "\"\n" + preambula += "ERR_OUT=\"" + self.grid_engine_err_output_option.format(ERR=log_file) + "\"\n" + preambula += "PYTHON=\"" + sys.executable + "\"\n" + return preambula + + def get_sh_command(self, command, prev_id, mpi): + cmd_str = "#=== STAGE " + command.STAGE + ("(MPI) ===\n" if mpi else "(not MPI) ===\n") + cmd_str += "CMD=\"" + command.mpi_sh_str() + "\"\n" + cmd_str += "SID1=$(" + self.grid_engine_submit_command + " $SLURM_ARGS " + \ + self.grid_engine_job_name.format(JOB_NAME=command.job_uuid) + " $LOG_OUT $ERR_OUT " + if mpi: + cmd_str += self.grid_engine_set_command.format(COMMAND="mpirun $MPIRUN_ARGS $CMD") + else: + cmd_str += self.grid_engine_set_command.format(COMMAND="$CMD") + + if prev_id != "": + cmd_str += " " + self.grid_engine_dependency_option.format(WAIT_TAG="$SID1") + + if mpi: + cmd_str += " " + self.grid_engine_nodes.format(NNODES=options_storage.args.grid_nnodes) + cmd_str += ")\n" + cmd_str += "SID1=\"${SID1##* }\"\n" + return cmd_str + + def get_command(self, command, prev_id, mpi): + log_file = options_storage.args.output_dir + "/spades.log" + if mpi: + if options_storage.args.grid_profile: + name = command.STAGE + "_" + command.short_name + "_" + command.job_uuid + profile = options_storage.args.output_dir + "/" + name + ".prof" + profile_line = "-x CPUPROFILE={PROFILE} ompi_profile_helper.sh".format(PROFILE=profile) + else: + profile_line = "" + + if options_storage.args.grid_valgrind: + # valgrind_line = "valgrind --track-origins=yes --suppressions=/global/software/sl-7.x86_64/modules/gcc/7.4.0/openmpi/4.0.1-gcc/share/openmpi/openmpi-valgrind.supp" + valgrind_line = "-x valgrind --track-origins=yes" + else: + valgrind_line = "" + + if options_storage.args.grid_coredump: + coredump_line = "ulimit -c unlimited;" + else: + coredump_line = "" + command_line = "{COREDUMP} mpirun ".format(COREDUMP=coredump_line) + self.grid_engine_mpirun_args.format(NCPUS=options_storage.args.threads) + \ + " {VALGRIND} {PROFILE}".format(PROFILE=profile_line, VALGRIND=valgrind_line) + " " + command.mpi_str() + else: + command_line = command.mpi_str() + cmd = self.grid_engine_submit_command + " " + memory_mb = int(options_storage.args.memory * 1024) + cmd += self.grid_engine_slurm_args.format(NCPUS=options_storage.args.threads, + MEMORY_MB=memory_mb, + PARTITION=options_storage.args.grid_partition, + ACCOUNT=options_storage.args.grid_account, + QOS=options_storage.args.grid_qos, + QUEUE=options_storage.args.grid_queue) + " " + cmd += self.grid_engine_job_name.format(JOB_NAME=command.job_uuid) + " " + cmd += self.grid_engine_err_output_option.format(ERR=log_file) + " " + cmd += self.grid_engine_output_option.format(OUT=log_file) + " " + cmd += self.grid_engine_set_command.format(COMMAND=command_line) + " " + + if prev_id != "": + cmd += " " + self.grid_engine_dependency_option.format(WAIT_TAG=prev_id) + + if mpi: + cmd += " " + self.grid_engine_nodes.format(NNODES=options_storage.args.grid_nnodes) + return cmd + + def get_MPI_command(self, command, prev_id=""): + return self.get_command(command, prev_id=prev_id, mpi=True) + + def get_not_MPI_command(self, command, prev_id=""): + return self.get_command(command, prev_id=prev_id, mpi=False) + + def get_MPI_sh_command(self, command, prev_id=""): + return self.get_sh_command(command, prev_id=prev_id, mpi=True) + + def get_not_MPI_sh_command(self, command, prev_id=""): + return self.get_sh_command(command, prev_id=prev_id, mpi=False) + + def run_cluster_command(self, cmd, uuid): 
+        import re, os
+        self.log.info("Submit cluster job: " + cmd)
+        output = os.popen(cmd).read()
+        jobid_search = re.search(r"^Submitted batch job (\d+)$", output)
+        assert jobid_search
+        return jobid_search.group(1)

From fcf0c5c81060df2f5ecfbfd9f3815f56a797d982 Mon Sep 17 00:00:00 2001
From: Anton Korobeynikov
Date: Fri, 20 Sep 2024 18:40:19 -0700
Subject: [PATCH 100/102] Better job names

---
 .../spades/pipeline/spades_pipeline/commands_parser.py   | 3 ++-
 .../pipeline/spades_pipeline/executors/executor_slurm.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/projects/spades/pipeline/spades_pipeline/commands_parser.py b/src/projects/spades/pipeline/spades_pipeline/commands_parser.py
index 2d1610820f..d6281b07ba 100644
--- a/src/projects/spades/pipeline/spades_pipeline/commands_parser.py
+++ b/src/projects/spades/pipeline/spades_pipeline/commands_parser.py
@@ -77,7 +77,8 @@ def to_dict(self):
                 "del_after": self.del_after}

     def generate_job_uuid(self):
-        return self.STAGE.replace(' ', '_') + "_" + \
+        return ('hpcSPAdes_' if self.mpi_support else 'SPAdes_') + \
+               self.STAGE.replace(' ', '_') + "_" + \
                ''.join([random.choice(string.ascii_uppercase + string.digits) for k in range(32)])


diff --git a/src/projects/spades/pipeline/spades_pipeline/executors/executor_slurm.py b/src/projects/spades/pipeline/spades_pipeline/executors/executor_slurm.py
index fa80813ef9..6702ec1c16 100644
--- a/src/projects/spades/pipeline/spades_pipeline/executors/executor_slurm.py
+++ b/src/projects/spades/pipeline/spades_pipeline/executors/executor_slurm.py
@@ -53,7 +53,7 @@ def get_MPI_sh_preambula(self):
         return preambula

     def get_sh_command(self, command, prev_id, mpi):
-        cmd_str = "#=== STAGE " + command.STAGE + ("(MPI) ===\n" if mpi else "(not MPI) ===\n")
+        cmd_str = "#=== STAGE " + command.STAGE + (" (MPI) ===\n" if mpi else " (not MPI) ===\n")
         cmd_str += "CMD=\"" + command.mpi_sh_str() + "\"\n"
         cmd_str += "SID1=$(" + self.grid_engine_submit_command + " $SLURM_ARGS " + \
                    self.grid_engine_job_name.format(JOB_NAME=command.job_uuid) + " $LOG_OUT $ERR_OUT "

From 9573ebf47d06749da32137a249d62b9053e8edca Mon Sep 17 00:00:00 2001
From: Anton Korobeynikov
Date: Fri, 20 Sep 2024 21:08:23 -0700
Subject: [PATCH 101/102] Fix some defaults

---
 .../executors/executor_slurm.py       | 17 +++++--------
 .../spades_pipeline/options_parser.py | 24 +++++++------------
 2 files changed, 15 insertions(+), 26 deletions(-)

diff --git a/src/projects/spades/pipeline/spades_pipeline/executors/executor_slurm.py b/src/projects/spades/pipeline/spades_pipeline/executors/executor_slurm.py
index 6702ec1c16..9add6a369f 100644
--- a/src/projects/spades/pipeline/spades_pipeline/executors/executor_slurm.py
+++ b/src/projects/spades/pipeline/spades_pipeline/executors/executor_slurm.py
@@ -16,7 +16,7 @@ class Executor(executors.ExecutorCluster):
     grid_engine = "SLURM"
     grid_engine_submit_command = "sbatch"
-    grid_engine_slurm_args = "--hint=compute_bound --mem-bind=verbose,none --exclusive --cpus-per-task {NCPUS} --open-mode=append --kill-on-invalid-dep=yes --mem {MEMORY_MB}M --partition={PARTITION} --account={ACCOUNT} --qos={QOS}"
+    grid_engine_slurm_args = "--hint=compute_bound --mem-bind=verbose,none --cpus-per-task {NCPUS} --open-mode=append --kill-on-invalid-dep=yes --mem {MEMORY_MB}M --time {TIME} {EXTRA}"
     grid_engine_output_option = "-o {OUT}"
     grid_engine_err_output_option = "-e {ERR}"
     grid_engine_job_name = "--job-name {JOB_NAME}"
@@ -30,10 +30,7 @@ def join(self, job_name):
         log_file = options_storage.args.output_dir + "/spades.log"
         cmd = self.grid_engine_submit_command.format(COMMAND="true", JOB_NAME="wait", OUT=log_file, ERR=log_file, NCPUS=1)
         cmd += " " + self.grid_engine_dependency_option.format(WAIT_TAG=job_name)
-        cmd += " " + self.grid_engine_credentials.format(PARTITION=options_storage.args.grid_partition,
-                                                         ACCOUNT=options_storage.args.grid_account,
-                                                         QOS=options_storage.args.grid_qos,
-                                                         QUEUE=options_storage.args.grid_queue)
+        cmd += " " + self.grid_engine_credentials.format(QUEUE=options_storage.args.grid_queue)
         cmd += " --wait"
         support.sys_call(cmd, log=self.log)

@@ -41,9 +38,8 @@ def get_MPI_sh_preambula(self):
         memory_mb = int(options_storage.args.memory * 1024)
         preambula = "SLURM_ARGS=\"" + self.grid_engine_slurm_args.format(NCPUS=options_storage.args.threads,
                                                                          MEMORY_MB=memory_mb,
-                                                                         PARTITION=options_storage.args.grid_partition,
-                                                                         ACCOUNT=options_storage.args.grid_account,
-                                                                         QOS=options_storage.args.grid_qos,
+                                                                         TIME=options_storage.args.grid_time,
+                                                                         EXTRA=options_storage.args.grid_extra,
                                                                          QUEUE=options_storage.args.grid_queue) + "\"\n"
         preambula += "MPIRUN_ARGS=\"" + self.grid_engine_mpirun_args.format(NCPUS=options_storage.args.threads) + "\"\n"
         log_file = options_storage.args.output_dir + "/spades.log"
@@ -99,9 +95,8 @@ def get_command(self, command, prev_id, mpi):
         memory_mb = int(options_storage.args.memory * 1024)
         cmd += self.grid_engine_slurm_args.format(NCPUS=options_storage.args.threads,
                                                   MEMORY_MB=memory_mb,
-                                                  PARTITION=options_storage.args.grid_partition,
-                                                  ACCOUNT=options_storage.args.grid_account,
-                                                  QOS=options_storage.args.grid_qos,
+                                                  TIME=options_storage.args.grid_time,
+                                                  EXTRA=options_storage.args.grid_extra,
                                                   QUEUE=options_storage.args.grid_queue) + " "
         cmd += self.grid_engine_job_name.format(JOB_NAME=command.job_uuid) + " "
         cmd += self.grid_engine_err_output_option.format(ERR=log_file) + " "
diff --git a/src/projects/spades/pipeline/spades_pipeline/options_parser.py b/src/projects/spades/pipeline/spades_pipeline/options_parser.py
index 23a9e017bf..7df4ee46eb 100644
--- a/src/projects/spades/pipeline/spades_pipeline/options_parser.py
+++ b/src/projects/spades/pipeline/spades_pipeline/options_parser.py
@@ -811,7 +811,7 @@ def add_cluster_args(pgroup_cluster):
     pgroup_cluster.add_argument("--grid-queue",
                                 metavar="",
                                 dest="grid_queue",
-                                default="mpi-rh74",
+                                default="standard",
                                 help="submits the jobs to one of the specified queues",
                                 action="store")
     pgroup_cluster.add_argument("--grid-nnodes",
@@ -825,22 +825,16 @@ def add_cluster_args(pgroup_cluster):
                                 dest="grid_wait",
                                 help="wait for job finish",
                                 action="store_true")
-    pgroup_cluster.add_argument("--grid-qos",
-                                dest="grid_qos",
-                                default="lr_normal",
-                                help="quality of service for the jobs (for SLURM grid engine)",
+    pgroup_cluster.add_argument("--grid-extra",
+                                dest="grid_extra",
+                                default="",
+                                help="any extra commands",
                                 metavar="",
                                 action="store")
-    pgroup_cluster.add_argument("--grid-partition",
-                                dest="grid_partition",
-                                default="lr3",
-                                help="partition for the resource allocation (for SLURM grid enging)",
-                                metavar="",
-                                action="store")
-    pgroup_cluster.add_argument("--grid-account",
-                                dest="grid_account",
-                                default="ac_scsguest",
-                                help="charge resources used by this run to specified account (for SLURM grid engine)",
+    pgroup_cluster.add_argument("--grid-time",
+                                dest="grid_time",
+                                default="1:00:00",
+                                help="time limit",
                                 metavar="",
                                 action="store")

From a7d7ecfcd1d518c1089135dbe900e8c2a295ff3c Mon Sep 17 00:00:00 2001
From: Anton Korobeynikov
Date: Fri, 20 Sep 2024 23:19:07 -0700
Subject: [PATCH 102/102] Run stuff via srun by default
---
 .../pipeline/spades_pipeline/executors/executor_slurm.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/projects/spades/pipeline/spades_pipeline/executors/executor_slurm.py b/src/projects/spades/pipeline/spades_pipeline/executors/executor_slurm.py
index 9add6a369f..049a6cb4fc 100644
--- a/src/projects/spades/pipeline/spades_pipeline/executors/executor_slurm.py
+++ b/src/projects/spades/pipeline/spades_pipeline/executors/executor_slurm.py
@@ -24,7 +24,7 @@ class Executor(executors.ExecutorCluster):
     grid_engine_dependency_option = "--dependency afterok:{WAIT_TAG}"
     grid_engine_nodes = "--nodes={NNODES} --ntasks={NNODES}"
     grid_engine_kill_command = "scancel {JOB_NAME}"
-    grid_engine_mpirun_args = "--use-hwthread-cpus --display-devel-allocation --display-devel-map --report-bindings --map-by ppr:1:NODE:PE={NCPUS} --rank-by NODE -x OMP_NUM_THREADS={NCPUS}"
+    grid_engine_srun_args = "--cpus-per-task {NCPUS}"

     def join(self, job_name):
         log_file = options_storage.args.output_dir + "/spades.log"
@@ -41,7 +41,7 @@ def get_MPI_sh_preambula(self):
                                                                          TIME=options_storage.args.grid_time,
                                                                          EXTRA=options_storage.args.grid_extra,
                                                                          QUEUE=options_storage.args.grid_queue) + "\"\n"
-        preambula += "MPIRUN_ARGS=\"" + self.grid_engine_mpirun_args.format(NCPUS=options_storage.args.threads) + "\"\n"
+        preambula += "SRUN_ARGS=\"" + self.grid_engine_srun_args.format(NCPUS=options_storage.args.threads) + "\"\n"
         log_file = options_storage.args.output_dir + "/spades.log"
         preambula += "LOG_OUT=\"" + self.grid_engine_output_option.format(OUT=log_file) + "\"\n"
         preambula += "ERR_OUT=\"" + self.grid_engine_err_output_option.format(ERR=log_file) + "\"\n"
@@ -54,7 +54,7 @@ def get_sh_command(self, command, prev_id, mpi):
         cmd_str += "SID1=$(" + self.grid_engine_submit_command + " $SLURM_ARGS " + \
                    self.grid_engine_job_name.format(JOB_NAME=command.job_uuid) + " $LOG_OUT $ERR_OUT "
         if mpi:
-            cmd_str += self.grid_engine_set_command.format(COMMAND="mpirun $MPIRUN_ARGS $CMD")
+            cmd_str += self.grid_engine_set_command.format(COMMAND="srun $SRUN_ARGS $CMD")
         else:
             cmd_str += self.grid_engine_set_command.format(COMMAND="$CMD")

@@ -87,7 +87,7 @@ def get_command(self, command, prev_id, mpi):
                 coredump_line = "ulimit -c unlimited;"
             else:
                 coredump_line = ""
-            command_line = "{COREDUMP} mpirun ".format(COREDUMP=coredump_line) + self.grid_engine_mpirun_args.format(NCPUS=options_storage.args.threads) + \
+            command_line = "{COREDUMP} srun ".format(COREDUMP=coredump_line) + self.grid_engine_srun_args.format(NCPUS=options_storage.args.threads) + \
                            " {VALGRIND} {PROFILE}".format(PROFILE=profile_line, VALGRIND=valgrind_line) + " " + command.mpi_str()
         else:
             command_line = command.mpi_str()