Hardware dev count (#131)

* fix hardware device count API
* fix a few headers
NWChemEx · Aug 14, 2024 · 4c5a9f7 · 4c5a9f7
1 parent c494f65
commit 4c5a9f7
Show file tree

Hide file tree

Showing 2 changed files with 35 additions and 25 deletions.
diff --git a/src/tamm/gpu_streams.hpp b/src/tamm/gpu_streams.hpp
@@ -1,6 +1,8 @@
 #pragma once
 
 #include "tamm/errors.hpp"
+#include <array>
+#include <memory>
 #include <optional>
 #include <sstream>
 #include <utility>
@@ -115,6 +117,38 @@ static inline void getDeviceCount(int* id) {
 #endif
 }
 
+/**
+ * @brief Counts the GPUs/GCDs/Xe-stacks/tiles physically present on this node.
+ *
+ * The count is obtained by running a vendor-specific shell pipeline through popen()
+ * and parsing the number it prints. Unlike getDeviceCount() above, this query is
+ * intended to be unaffected by device-masking environment variables such as
+ * CUDA_VISIBLE_DEVICES, ROCR_VISIBLE_DEVICES or ZE_AFFINITY_MASK.
+ *
+ * @param[out] gpus_per_node Receives the parsed device count. It is dereferenced
+ *                           unconditionally, so it must not be null.
+ *
+ * @throws std::runtime_error if no query command is available for the current
+ *         build/backend, if popen() fails, or if the command output does not
+ *         start with a number.
+ */
+static inline void getHardwareGPUCount(int* gpus_per_node) {
+  std::string m_call;
+
+#if defined(USE_CUDA)
+  m_call = "nvidia-smi --query-gpu=name --format=csv,noheader | wc -l";
+#elif defined(USE_HIP)
+  m_call = "rocm-smi --alldevices | grep \"AMD INSTINCT\" | wc -l";
+#elif defined(USE_DPCPP)
+  // A SYCL build can sit on top of several native backends, so the query tool is
+  // chosen from the backend of the platform owning the device sycl_get_device(0) returns.
+  sycl::platform pltf    = sycl_get_device(0)->get_platform();
+  const auto     backend = pltf.get_backend();
+  if(backend == sycl::backend::ext_oneapi_level_zero || backend == sycl::backend::opencl) {
+    m_call = "cat /sys/class/drm/card*/gt/gt*/id | wc -l";
+  }
+  else if(backend == sycl::backend::ext_oneapi_cuda) {
+    m_call = "nvidia-smi --query-gpu=name --format=csv,noheader | wc -l";
+  }
+  else if(backend == sycl::backend::ext_oneapi_hip) {
+    m_call = "rocm-smi --alldevices | grep \"AMD INSTINCT\" | wc -l";
+  }
+  // Any other backend leaves m_call empty; that case is reported just below.
+#endif
+
+  // Without this guard an empty command would be handed to the shell, produce no
+  // output, and surface later as an uninformative integer-parse exception.
+  if(m_call.empty()) {
+    throw std::runtime_error("getHardwareGPUCount: no GPU query command for this build/backend");
+  }
+
+  // unique_ptr with pclose as deleter closes the pipe on every exit path, including throws.
+  std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(m_call.c_str(), "r"), pclose);
+  if(!pipe) { throw std::runtime_error("popen() failed!"); }
+
+  // Collect the complete output of the command; fgets() NUL-terminates each chunk.
+  std::array<char, 128> buffer;
+  std::string           result;
+  while(fgets(buffer.data(), static_cast<int>(buffer.size()), pipe.get()) != nullptr) {
+    result += buffer.data();
+  }
+
+  // Require a leading digit (after optional whitespace) so that empty or garbled
+  // output is reported together with the command that produced it.
+  const auto first   = result.find_first_not_of(" \t\r\n");
+  const bool numeric = first != std::string::npos && result[first] >= '0' && result[first] <= '9';
+  if(!numeric) {
+    throw std::runtime_error("getHardwareGPUCount: unexpected output from '" + m_call +
+                             "': '" + result + "'");
+  }
+
+  *gpus_per_node = std::stoi(result);
+}
+
 static inline std::string getDeviceName() {
 #if defined(USE_CUDA)
   cudaDeviceProp prop;

diff --git a/src/tamm/rmm_memory_pool.hpp b/src/tamm/rmm_memory_pool.hpp
@@ -120,31 +120,7 @@ class RMMMemoryManager {
     world_rank_ = GA_Nodeid();
 #endif // USE_UPCXX
 
-    if(world_rank_ == 0) {
-      std::array<char, 128> buffer;
-      std::string           result;
-
-#if defined(USE_CUDA)
-      const std::string m_call = "nvidia-smi --query-gpu=name --format=csv,noheader | wc -l";
-#elif defined(USE_HIP)
-      const std::string m_call = "rocm-smi --showmemvendor | wc -l";
-#elif defined(USE_DPCPP)
-      const std::string m_call = "ONEAPI_DEVICE_SELECTOR=level_zero:gpu sycl-ls | wc -l";
-#endif
-
-      std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(m_call.c_str(), "r"), pclose);
-      if(!pipe) { throw std::runtime_error("popen() failed!"); }
-      while(fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) { result += buffer.data(); }
-
-#if defined(USE_CUDA)
-      ngpus_per_node = stoi(result);
-#elif defined(USE_HIP)
-      // - 6 is to remove the empty lines from the output
-      ngpus_per_node = stoi(result) - 6;
-#elif defined(USE_DPCPP)
-      ngpus_per_node           = stoi(result);
-#endif
-    }
+    if(world_rank_ == 0) { tamm::getHardwareGPUCount(&ngpus_per_node); }
 
 #if defined(USE_UPCXX)
     upcxx::broadcast(&tamm_rpg, 0).wait();