From 8d7f4248f500a8ff2e3f961d9bd3a257b413d3de Mon Sep 17 00:00:00 2001 From: Ajay Panyala Date: Thu, 23 May 2024 21:50:02 -0700 Subject: [PATCH] add gpn back --- src/tamm/execution_context.cpp | 6 +++++- src/tamm/execution_context.hpp | 6 ++++++ src/tamm/rmm_memory_pool.hpp | 27 +++++++++++++-------------- tests/tamm/Test_CCSD.cpp | 5 ++++- tests/tamm/Test_Mult_Ops.cpp | 5 ++++- 5 files changed, 32 insertions(+), 17 deletions(-) diff --git a/src/tamm/execution_context.cpp b/src/tamm/execution_context.cpp index 465606d52..0335eebc4 100644 --- a/src/tamm/execution_context.cpp +++ b/src/tamm/execution_context.cpp @@ -6,6 +6,7 @@ #include "labeled_tensor.hpp" #include "memory_manager.hpp" #include "proc_group.hpp" +#include "rmm_memory_pool.hpp" #include "runtime_engine.hpp" namespace tamm { @@ -37,7 +38,8 @@ ExecutionContext::ExecutionContext(ProcGroup pg, DistributionKind default_dist_k #else ranks_pn_ = GA_Cluster_nprocs(GA_Cluster_proc_nodeid(pg.rank().value())); #endif - nnodes_ = pg.size().value() / ranks_pn_; + nnodes_ = pg.size().value() / ranks_pn_; + gpus_pn_ = ranks_pn_ / ranks_per_gpu_pool(); #if defined(__APPLE__) { @@ -64,6 +66,8 @@ ExecutionContext::ExecutionContext(ProcGroup pg, DistributionKind default_dist_k minfo_.gpu_name = getDeviceName() + ", " + getRuntimeVersion(); gpuMemGetInfo(&free_, &minfo_.gpu_mem_per_device); minfo_.gpu_mem_per_device /= (1024 * 1024 * 1024.0); // GiB + minfo_.gpu_mem_per_node = minfo_.gpu_mem_per_device * gpus_pn_; + minfo_.total_gpu_mem = minfo_.gpu_mem_per_device * nnodes_ * gpus_pn_; } #endif } diff --git a/src/tamm/execution_context.hpp b/src/tamm/execution_context.hpp index 42067f2ab..f09d49e71 100644 --- a/src/tamm/execution_context.hpp +++ b/src/tamm/execution_context.hpp @@ -366,9 +366,12 @@ class ExecutionContext { int nnodes() const { return nnodes_; } int ppn() const { return ranks_pn_; } + int gpn() const { return gpus_pn_; } struct meminfo { size_t gpu_mem_per_device; // single gpu mem per rank (GiB) + size_t 
gpu_mem_per_node; // total gpu mem per node (GiB) + size_t total_gpu_mem; // total gpu mem across all nodes (GiB) size_t cpu_mem_per_node; // cpu mem on single node (GiB) size_t total_cpu_mem; // total cpu mem across all nodes (GiB) std::string cpu_name; // cpu name @@ -387,6 +390,8 @@ class ExecutionContext { if(has_gpu_) { std::cout << "[" << minfo_.gpu_name << "] : " << std::endl; std::cout << " GPU memory per device (GiB): " << minfo_.gpu_mem_per_device << std::endl; + std::cout << " GPU memory per node (GiB): " << minfo_.gpu_mem_per_node << std::endl; + std::cout << " Total GPU memory (GiB): " << minfo_.total_gpu_mem << std::endl; } std::cout << "}" << std::endl; } @@ -456,6 +461,7 @@ class ExecutionContext { std::shared_ptr re_; int nnodes_; int ranks_pn_; + int gpus_pn_{0}; bool has_gpu_{false}; ExecutionHW exhw_{ExecutionHW::CPU}; meminfo minfo_; diff --git a/src/tamm/rmm_memory_pool.hpp b/src/tamm/rmm_memory_pool.hpp index 6882d0187..5b1f79df4 100644 --- a/src/tamm/rmm_memory_pool.hpp +++ b/src/tamm/rmm_memory_pool.hpp @@ -50,17 +50,6 @@ static const uint32_t tamm_cpu_pool = [] { return usingcpupool; }(); -// TAMM_RANKS_PER_GPU_POOL -static uint32_t tamm_rpg = [] { - uint32_t usingrpg = 1; -// This env is only applicable to DPCPP backend -#ifdef USE_DPCPP - if(const char* tammrpg = std::getenv("TAMM_RANKS_PER_GPU_POOL")) { - usingrpg = std::atoi(tammrpg); - } -#endif // USE_DPCPP - return usingrpg; -}(); } // namespace detail class RMMMemoryManager { @@ -80,8 +69,12 @@ class RMMMemoryManager { private: RMMMemoryManager() { initialize(); } + // TAMM_RANKS_PER_GPU_POOL + uint32_t tamm_rpg; public: + uint32_t get_rpg() { return tamm_rpg; } + #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP) /// Returns a RMM device pool handle device_pool_mr& getDeviceMemoryPool() { return *(deviceMR.get()); } @@ -117,6 +110,7 @@ class RMMMemoryManager { void initialize() { if(this->invalid_state) { + tamm_rpg = 1; // Number of user-MPI ranks is needed for 
efficient CPU-pool size int ranks_pn_ = 0; #if defined(USE_UPCXX) @@ -128,10 +122,13 @@ class RMMMemoryManager { long max_host_bytes{0}; #if defined(USE_DPCPP) + if(const char* tammrpg = std::getenv("TAMM_RANKS_PER_GPU_POOL")) { + tamm_rpg = std::atoi(tammrpg); + } // if binding more than 1 rank per GPU ensure that // TAMM_RANKS_PER_GPU_POOL is set appropriately EXPECTS_STR( - (detail::tamm_rpg >= 1), + (tamm_rpg >= 1), "[TAMM ERROR]: TAMM_RANKS_PER_GPU_POOL env variable needs to be set to atleast 1!"); #endif @@ -161,7 +158,7 @@ class RMMMemoryManager { if(ranks_pn_ > ngpus_per_node) { EXPECTS_STR((ranks_pn_ % ngpus_per_node == 0), "[TAMM ERROR]: num_ranks_per_node is not a multiple of num_gpus_per_node!"); - detail::tamm_rpg = ranks_pn_ / ngpus_per_node; + tamm_rpg = ranks_pn_ / ngpus_per_node; } #endif // USE_CUDA, USE_HIP @@ -242,7 +239,7 @@ class RMMMemoryManager { size_t free{}, total{}; gpuMemGetInfo(&free, &total); size_t max_device_bytes{0}; - max_device_bytes = ((detail::tamm_gpu_pool / 100.0) * free) / detail::tamm_rpg; + max_device_bytes = ((detail::tamm_gpu_pool / 100.0) * free) / tamm_rpg; deviceMR = std::make_unique(new rmm::mr::gpu_memory_resource, max_device_bytes); @@ -266,6 +263,8 @@ class RMMMemoryManager { RMMMemoryManager& operator=(RMMMemoryManager&&) = delete; }; +static inline uint32_t ranks_per_gpu_pool() { return RMMMemoryManager::getInstance().get_rpg(); } + // The reset pool & reinitialize only is being used for the (T) segement of cannonical static inline void reset_rmm_pool() { RMMMemoryManager::getInstance().reset(); } diff --git a/tests/tamm/Test_CCSD.cpp b/tests/tamm/Test_CCSD.cpp index 8c04dc9c2..83f645328 100644 --- a/tests/tamm/Test_CCSD.cpp +++ b/tests/tamm/Test_CCSD.cpp @@ -306,7 +306,10 @@ int main(int argc, char* argv[]) { auto cur_local_time = localtime(&current_time_t); std::cout << std::endl << "date: " << std::put_time(cur_local_time, "%c") << std::endl; std::cout << "nnodes: " << ec.nnodes() << ", "; - std::cout << "nproc: " 
<< ec.nnodes() * ec.ppn() << std::endl; + std::cout << "nproc_per_node: " << ec.ppn() << ", "; + std::cout << "nproc_total: " << ec.nnodes() * ec.ppn() << ", "; + std::cout << "ngpus_per_node: " << ec.gpn() << ", "; + std::cout << "ngpus_total: " << ec.nnodes() * ec.gpn() << std::endl; ec.print_mem_info(); std::cout << std::endl; std::cout << "basis functions: " << nbf << ", occ_alpha: " << n_occ_alpha diff --git a/tests/tamm/Test_Mult_Ops.cpp b/tests/tamm/Test_Mult_Ops.cpp index 285118346..e5dc777a4 100644 --- a/tests/tamm/Test_Mult_Ops.cpp +++ b/tests/tamm/Test_Mult_Ops.cpp @@ -450,7 +450,10 @@ int main(int argc, char* argv[]) { std::cout << std::endl << "date: " << std::put_time(cur_local_time, "%c") << std::endl; std::cout << "nnodes: " << ec.nnodes() << ", "; - std::cout << "nproc: " << ec.nnodes() * ec.ppn() << std::endl; + std::cout << "nproc_per_node: " << ec.ppn() << ", "; + std::cout << "nproc_total: " << ec.nnodes() * ec.ppn() << ", "; + std::cout << "ngpus_per_node: " << ec.gpn() << ", "; + std::cout << "ngpus_total: " << ec.nnodes() * ec.gpn() << std::endl; std::cout << "dim, tile sizes = " << is_size << ", " << tile_size << std::endl; ec.print_mem_info(); std::cout << std::endl << std::endl;