From 8d7f4248f500a8ff2e3f961d9bd3a257b413d3de Mon Sep 17 00:00:00 2001 From: Ajay Panyala Date: Thu, 23 May 2024 21:50:02 -0700 Subject: [PATCH] add gpn back --- src/tamm/execution_context.cpp | 6 +++++- src/tamm/execution_context.hpp | 6 ++++++ src/tamm/rmm_memory_pool.hpp | 27 +++++++++++++-------------- tests/tamm/Test_CCSD.cpp | 5 ++++- tests/tamm/Test_Mult_Ops.cpp | 5 ++++- 5 files changed, 32 insertions(+), 17 deletions(-) diff --git a/src/tamm/execution_context.cpp b/src/tamm/execution_context.cpp index 465606d52..0335eebc4 100644 --- a/src/tamm/execution_context.cpp +++ b/src/tamm/execution_context.cpp @@ -6,6 +6,7 @@ #include "labeled_tensor.hpp" #include "memory_manager.hpp" #include "proc_group.hpp" +#include "rmm_memory_pool.hpp" #include "runtime_engine.hpp" namespace tamm { @@ -37,7 +38,8 @@ ExecutionContext::ExecutionContext(ProcGroup pg, DistributionKind default_dist_k #else ranks_pn_ = GA_Cluster_nprocs(GA_Cluster_proc_nodeid(pg.rank().value())); #endif - nnodes_ = pg.size().value() / ranks_pn_; + nnodes_ = pg.size().value() / ranks_pn_; + gpus_pn_ = ranks_pn_ / ranks_per_gpu_pool(); #if defined(__APPLE__) { @@ -64,6 +66,8 @@ ExecutionContext::ExecutionContext(ProcGroup pg, DistributionKind default_dist_k minfo_.gpu_name = getDeviceName() + ", " + getRuntimeVersion(); gpuMemGetInfo(&free_, &minfo_.gpu_mem_per_device); minfo_.gpu_mem_per_device /= (1024 * 1024 * 1024.0); // GiB + minfo_.gpu_mem_per_node = minfo_.gpu_mem_per_device * gpus_pn_; + minfo_.total_gpu_mem = minfo_.gpu_mem_per_device * nnodes_ * gpus_pn_; } #endif } diff --git a/src/tamm/execution_context.hpp b/src/tamm/execution_context.hpp index 42067f2ab..f09d49e71 100644 --- a/src/tamm/execution_context.hpp +++ b/src/tamm/execution_context.hpp @@ -366,9 +366,12 @@ class ExecutionContext { int nnodes() const { return nnodes_; } int ppn() const { return ranks_pn_; } + int gpn() const { return gpus_pn_; } struct meminfo { size_t gpu_mem_per_device; // single gpu mem per rank (GiB) + size_t 
gpu_mem_per_node; // total gpu mem per node (GiB) + size_t total_gpu_mem; // total gpu mem across all nodes (GiB) size_t cpu_mem_per_node; // cpu mem on single node (GiB) size_t total_cpu_mem; // total cpu mem across all nodes (GiB) std::string cpu_name; // cpu name @@ -387,6 +390,8 @@ class ExecutionContext { if(has_gpu_) { std::cout << "[" << minfo_.gpu_name << "] : " << std::endl; std::cout << " GPU memory per device (GiB): " << minfo_.gpu_mem_per_device << std::endl; + std::cout << " GPU memory per node (GiB): " << minfo_.gpu_mem_per_node << std::endl; + std::cout << " Total GPU memory (GiB): " << minfo_.total_gpu_mem << std::endl; } std::cout << "}" << std::endl; } @@ -456,6 +461,7 @@ class ExecutionContext { std::shared_ptr re_; int nnodes_; int ranks_pn_; + int gpus_pn_{0}; bool has_gpu_{false}; ExecutionHW exhw_{ExecutionHW::CPU}; meminfo minfo_; diff --git a/src/tamm/rmm_memory_pool.hpp b/src/tamm/rmm_memory_pool.hpp index 6882d0187..5b1f79df4 100644 --- a/src/tamm/rmm_memory_pool.hpp +++ b/src/tamm/rmm_memory_pool.hpp @@ -50,17 +50,6 @@ static const uint32_t tamm_cpu_pool = [] { return usingcpupool; }(); -// TAMM_RANKS_PER_GPU_POOL -static uint32_t tamm_rpg = [] { - uint32_t usingrpg = 1; -// This env is only applicable to DPCPP backend -#ifdef USE_DPCPP - if(const char* tammrpg = std::getenv("TAMM_RANKS_PER_GPU_POOL")) { - usingrpg = std::atoi(tammrpg); - } -#endif // USE_DPCPP - return usingrpg; -}(); } // namespace detail class RMMMemoryManager { @@ -80,8 +69,12 @@ class RMMMemoryManager { private: RMMMemoryManager() { initialize(); } + // TAMM_RANKS_PER_GPU_POOL + uint32_t tamm_rpg; public: + uint32_t get_rpg() { return tamm_rpg; } + #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP) /// Returns a RMM device pool handle device_pool_mr& getDeviceMemoryPool() { return *(deviceMR.get()); } @@ -117,6 +110,7 @@ class RMMMemoryManager { void initialize() { if(this->invalid_state) { + tamm_rpg = 1; // Number of user-MPI ranks is needed for 
efficient CPU-pool size int ranks_pn_ = 0; #if defined(USE_UPCXX) @@ -128,10 +122,13 @@ class RMMMemoryManager { long max_host_bytes{0}; #if defined(USE_DPCPP) + if(const char* tammrpg = std::getenv("TAMM_RANKS_PER_GPU_POOL")) { + tamm_rpg = std::atoi(tammrpg); + } // if binding more than 1 rank per GPU ensure that // TAMM_RANKS_PER_GPU_POOL is set appropriately EXPECTS_STR( - (detail::tamm_rpg >= 1), + (tamm_rpg >= 1), "[TAMM ERROR]: TAMM_RANKS_PER_GPU_POOL env variable needs to be set to atleast 1!"); #endif @@ -161,7 +158,7 @@ class RMMMemoryManager { if(ranks_pn_ > ngpus_per_node) { EXPECTS_STR((ranks_pn_ % ngpus_per_node == 0), "[TAMM ERROR]: num_ranks_per_node is not a multiple of num_gpus_per_node!"); - detail::tamm_rpg = ranks_pn_ / ngpus_per_node; + tamm_rpg = ranks_pn_ / ngpus_per_node; } #endif // USE_CUDA, USE_HIP @@ -242,7 +239,7 @@ class RMMMemoryManager { size_t free{}, total{}; gpuMemGetInfo(&free, &total); size_t max_device_bytes{0}; - max_device_bytes = ((detail::tamm_gpu_pool / 100.0) * free) / detail::tamm_rpg; + max_device_bytes = ((detail::tamm_gpu_pool / 100.0) * free) / tamm_rpg; deviceMR = std::make_unique(new rmm::mr::gpu_memory_resource, max_device_bytes); @@ -266,6 +263,8 @@ class RMMMemoryManager { RMMMemoryManager& operator=(RMMMemoryManager&&) = delete; }; +static inline uint32_t ranks_per_gpu_pool() { return RMMMemoryManager::getInstance().get_rpg(); } + // The reset pool & reinitialize only is being used for the (T) segement of cannonical static inline void reset_rmm_pool() { RMMMemoryManager::getInstance().reset(); } diff --git a/tests/tamm/Test_CCSD.cpp b/tests/tamm/Test_CCSD.cpp index 8c04dc9c2..83f645328 100644 --- a/tests/tamm/Test_CCSD.cpp +++ b/tests/tamm/Test_CCSD.cpp @@ -306,7 +306,10 @@ int main(int argc, char* argv[]) { auto cur_local_time = localtime(&current_time_t); std::cout << std::endl << "date: " << std::put_time(cur_local_time, "%c") << std::endl; std::cout << "nnodes: " << ec.nnodes() << ", "; - std::cout << "nproc: " 
<< ec.nnodes() * ec.ppn() << std::endl; + std::cout << "nproc_per_node: " << ec.ppn() << ", "; + std::cout << "nproc_total: " << ec.nnodes() * ec.ppn() << ", "; + std::cout << "ngpus_per_node: " << ec.gpn() << ", "; + std::cout << "ngpus_total: " << ec.nnodes() * ec.gpn() << std::endl; ec.print_mem_info(); std::cout << std::endl; std::cout << "basis functions: " << nbf << ", occ_alpha: " << n_occ_alpha diff --git a/tests/tamm/Test_Mult_Ops.cpp b/tests/tamm/Test_Mult_Ops.cpp index 285118346..e5dc777a4 100644 --- a/tests/tamm/Test_Mult_Ops.cpp +++ b/tests/tamm/Test_Mult_Ops.cpp @@ -450,7 +450,10 @@ int main(int argc, char* argv[]) { std::cout << std::endl << "date: " << std::put_time(cur_local_time, "%c") << std::endl; std::cout << "nnodes: " << ec.nnodes() << ", "; - std::cout << "nproc: " << ec.nnodes() * ec.ppn() << std::endl; + std::cout << "nproc_per_node: " << ec.ppn() << ", "; + std::cout << "nproc_total: " << ec.nnodes() * ec.ppn() << ", "; + std::cout << "ngpus_per_node: " << ec.gpn() << ", "; + std::cout << "ngpus_total: " << ec.nnodes() * ec.gpn() << std::endl; std::cout << "dim, tile sizes = " << is_size << ", " << tile_size << std::endl; ec.print_mem_info(); std::cout << std::endl << std::endl;