add gpn back
ajaypanyala committed May 24, 2024
1 parent de9bcc2 commit 8d7f424
Showing 5 changed files with 32 additions and 17 deletions.
src/tamm/execution_context.cpp (6 changes: 5 additions & 1 deletion)
@@ -6,6 +6,7 @@
#include "labeled_tensor.hpp"
#include "memory_manager.hpp"
#include "proc_group.hpp"
#include "rmm_memory_pool.hpp"
#include "runtime_engine.hpp"

namespace tamm {
@@ -37,7 +38,8 @@ ExecutionContext::ExecutionContext(ProcGroup pg, DistributionKind default_dist_k
#else
ranks_pn_ = GA_Cluster_nprocs(GA_Cluster_proc_nodeid(pg.rank().value()));
#endif
nnodes_  = pg.size().value() / ranks_pn_;
gpus_pn_ = ranks_pn_ / ranks_per_gpu_pool();

#if defined(__APPLE__)
{
Expand All @@ -64,6 +66,8 @@ ExecutionContext::ExecutionContext(ProcGroup pg, DistributionKind default_dist_k
minfo_.gpu_name = getDeviceName() + ", " + getRuntimeVersion();
gpuMemGetInfo(&free_, &minfo_.gpu_mem_per_device);
minfo_.gpu_mem_per_device /= (1024 * 1024 * 1024.0); // GiB
minfo_.gpu_mem_per_node = minfo_.gpu_mem_per_device * gpus_pn_;
minfo_.total_gpu_mem = minfo_.gpu_mem_per_device * nnodes_ * gpus_pn_;
}
#endif
}
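The gpus_pn_ assignment above derives the per-node GPU count from the per-node rank count and the ranks_per_gpu_pool() accessor added in rmm_memory_pool.hpp below. A minimal standalone sketch of the arithmetic, using assumed example values (8 ranks per node, 2 ranks sharing each GPU pool):

    #include <cstdint>
    #include <iostream>

    int main() {
      // Assumed example values; the real code gets these from GA/MPI and the RMM pool.
      int      ranks_pn = 8;                                // MPI ranks per node (GA_Cluster_nprocs)
      uint32_t rpg      = 2;                                // what ranks_per_gpu_pool() would return
      int      gpus_pn  = ranks_pn / static_cast<int>(rpg); // 8 / 2 = 4 GPUs per node
      std::cout << "gpus_pn = " << gpus_pn << "\n";         // with nnodes_ = 2, that is 8 GPUs total
    }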
src/tamm/execution_context.hpp (6 changes: 6 additions & 0 deletions)
@@ -366,9 +366,12 @@ class ExecutionContext {

int nnodes() const { return nnodes_; }
int ppn() const { return ranks_pn_; }
int gpn() const { return gpus_pn_; }

struct meminfo {
size_t gpu_mem_per_device; // memory of a single gpu device (GiB)
size_t gpu_mem_per_node; // total gpu mem per node (GiB)
size_t total_gpu_mem; // total gpu mem across all nodes (GiB)
size_t cpu_mem_per_node; // cpu mem on single node (GiB)
size_t total_cpu_mem; // total cpu mem across all nodes (GiB)
std::string cpu_name; // cpu name
@@ -387,6 +390,8 @@
if(has_gpu_) {
std::cout << "[" << minfo_.gpu_name << "] : " << std::endl;
std::cout << " GPU memory per device (GiB): " << minfo_.gpu_mem_per_device << std::endl;
std::cout << " GPU memory per node (GiB): " << minfo_.gpu_mem_per_node << std::endl;
std::cout << " Total GPU memory (GiB): " << minfo_.total_gpu_mem << std::endl;
}
std::cout << "}" << std::endl;
}
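With the two new meminfo fields populated, print_mem_info() adds per-node and job-wide GPU totals to the report. Hypothetical output for the GPU block, assuming 2 nodes with 4 GPUs of 40 GiB each (the device name and runtime version are invented examples; the surrounding CPU lines are elided):

    [NVIDIA A100, 12.2] :
     GPU memory per device (GiB): 40
     GPU memory per node (GiB): 160
     Total GPU memory (GiB): 320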
@@ -456,6 +461,7 @@ class ExecutionContext {
std::shared_ptr<RuntimeEngine> re_;
int nnodes_;
int ranks_pn_;
int gpus_pn_{0};
bool has_gpu_{false};
ExecutionHW exhw_{ExecutionHW::CPU};
meminfo minfo_;
src/tamm/rmm_memory_pool.hpp (27 changes: 13 additions & 14 deletions)
@@ -50,17 +50,6 @@ static const uint32_t tamm_cpu_pool = [] {
return usingcpupool;
}();

// TAMM_RANKS_PER_GPU_POOL
static uint32_t tamm_rpg = [] {
uint32_t usingrpg = 1;
// This env is only applicable to DPCPP backend
#ifdef USE_DPCPP
if(const char* tammrpg = std::getenv("TAMM_RANKS_PER_GPU_POOL")) {
usingrpg = std::atoi(tammrpg);
}
#endif // USE_DPCPP
return usingrpg;
}();
} // namespace detail

class RMMMemoryManager {
@@ -80,8 +69,12 @@ class RMMMemoryManager {

private:
RMMMemoryManager() { initialize(); }
// TAMM_RANKS_PER_GPU_POOL
uint32_t tamm_rpg;

public:
uint32_t get_rpg() { return tamm_rpg; }

#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
/// Returns an RMM device pool handle
device_pool_mr& getDeviceMemoryPool() { return *(deviceMR.get()); }
@@ -117,6 +110,7 @@

void initialize() {
if(this->invalid_state) {
tamm_rpg = 1;
// The per-node count of user MPI ranks is needed to size the CPU pool efficiently
int ranks_pn_ = 0;
#if defined(USE_UPCXX)
@@ -128,10 +122,13 @@
long max_host_bytes{0};

#if defined(USE_DPCPP)
if(const char* tammrpg = std::getenv("TAMM_RANKS_PER_GPU_POOL")) {
tamm_rpg = std::atoi(tammrpg);
}
// If binding more than one rank per GPU, ensure that
// TAMM_RANKS_PER_GPU_POOL is set appropriately.
EXPECTS_STR(
(detail::tamm_rpg >= 1),
(tamm_rpg >= 1),
"[TAMM ERROR]: TAMM_RANKS_PER_GPU_POOL env variable needs to be set to atleast 1!");
#endif

@@ -161,7 +158,7 @@
if(ranks_pn_ > ngpus_per_node) {
EXPECTS_STR((ranks_pn_ % ngpus_per_node == 0),
"[TAMM ERROR]: num_ranks_per_node is not a multiple of num_gpus_per_node!");
detail::tamm_rpg = ranks_pn_ / ngpus_per_node;
tamm_rpg = ranks_pn_ / ngpus_per_node;
}
#endif // USE_CUDA, USE_HIP

@@ -242,7 +239,7 @@
size_t free{}, total{};
gpuMemGetInfo(&free, &total);
size_t max_device_bytes{0};
max_device_bytes = ((detail::tamm_gpu_pool / 100.0) * free) / detail::tamm_rpg;
max_device_bytes = ((detail::tamm_gpu_pool / 100.0) * free) / tamm_rpg;

deviceMR =
std::make_unique<device_pool_mr>(new rmm::mr::gpu_memory_resource, max_device_bytes);
@@ -266,6 +263,8 @@
RMMMemoryManager& operator=(RMMMemoryManager&&) = delete;
};

static inline uint32_t ranks_per_gpu_pool() { return RMMMemoryManager::getInstance().get_rpg(); }
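ranks_per_gpu_pool() replaces the old namespace-scope detail::tamm_rpg: the value now lives inside the singleton and is assigned during initialize(), so the DPCPP env override and the CUDA/HIP oversubscription path both write the same field before any reader sees it. A hypothetical call site (the tamm:: qualification and surrounding context are assumptions):

    // Hypothetical usage sketch; assumes the header places these symbols in namespace tamm.
    uint32_t rpg = tamm::ranks_per_gpu_pool(); // 1 unless TAMM_RANKS_PER_GPU_POOL or oversubscription raises it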

// The pool reset & reinitialize are only used for the (T) segment of the canonical implementation
static inline void reset_rmm_pool() { RMMMemoryManager::getInstance().reset(); }

tests/tamm/Test_CCSD.cpp (5 changes: 4 additions & 1 deletion)
@@ -306,7 +306,10 @@ int main(int argc, char* argv[]) {
auto cur_local_time = localtime(&current_time_t);
std::cout << std::endl << "date: " << std::put_time(cur_local_time, "%c") << std::endl;
std::cout << "nnodes: " << ec.nnodes() << ", ";
std::cout << "nproc: " << ec.nnodes() * ec.ppn() << std::endl;
std::cout << "nproc_per_node: " << ec.ppn() << ", ";
std::cout << "nproc_total: " << ec.nnodes() * ec.ppn() << ", ";
std::cout << "ngpus_per_node: " << ec.gpn() << ", ";
std::cout << "ngpus_total: " << ec.nnodes() * ec.gpn() << std::endl;
ec.print_mem_info();
std::cout << std::endl;
std::cout << "basis functions: " << nbf << ", occ_alpha: " << n_occ_alpha
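Test_CCSD (and Test_Mult_Ops below) now break the process count out per node and add the new GPU counts on the same line. A hypothetical log line for the 2-node, 8-rank-per-node, 4-GPU-per-node layout assumed in the sketches above:

    nnodes: 2, nproc_per_node: 8, nproc_total: 16, ngpus_per_node: 4, ngpus_total: 8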
tests/tamm/Test_Mult_Ops.cpp (5 changes: 4 additions & 1 deletion)
@@ -450,7 +450,10 @@ int main(int argc, char* argv[]) {
std::cout << std::endl << "date: " << std::put_time(cur_local_time, "%c") << std::endl;

std::cout << "nnodes: " << ec.nnodes() << ", ";
std::cout << "nproc: " << ec.nnodes() * ec.ppn() << std::endl;
std::cout << "nproc_per_node: " << ec.ppn() << ", ";
std::cout << "nproc_total: " << ec.nnodes() * ec.ppn() << ", ";
std::cout << "ngpus_per_node: " << ec.gpn() << ", ";
std::cout << "ngpus_total: " << ec.nnodes() * ec.gpn() << std::endl;
std::cout << "dim, tile sizes = " << is_size << ", " << tile_size << std::endl;
ec.print_mem_info();
std::cout << std::endl << std::endl;
