Skip to content

Commit

Permalink
Hardware dev count (#131)
Browse files Browse the repository at this point in the history
* fix hardware device count API

* fix a few headers
  • Loading branch information
abagusetty authored Aug 14, 2024
1 parent c494f65 commit 4c5a9f7
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 25 deletions.
34 changes: 34 additions & 0 deletions src/tamm/gpu_streams.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#pragma once

#include "tamm/errors.hpp"
#include <array>
#include <memory>
#include <optional>
#include <sstream>
#include <utility>
Expand Down Expand Up @@ -115,6 +117,38 @@ static inline void getDeviceCount(int* id) {
#endif
}

// Reports the hardware count of GPUs/GCDs/Xe-stacks/tiles on the current
// node by running a vendor tool (or reading sysfs) through the shell and
// parsing the line count printed by `wc -l`.
//
// Unlike getDeviceCount() above, this method is not affected by the masking
// env variables like CUDA/ROCR_VISIBLE_DEVICES or ZE_AFFINITY_MASK.
//
// gpus_per_node: output parameter, receives the parsed count. Must not be
//                null. It is left untouched if an exception is thrown.
//
// Throws std::runtime_error when:
//   - gpus_per_node is null,
//   - no query command is known for the compiled/active GPU backend,
//   - popen() fails,
//   - the command output does not start with an integer.
static inline void getHardwareGPUCount(int* gpus_per_node) {
  if(gpus_per_node == nullptr) {
    throw std::runtime_error("getHardwareGPUCount(): gpus_per_node must not be null");
  }

  std::string m_call;

#if defined(USE_CUDA)
  m_call = "nvidia-smi --query-gpu=name --format=csv,noheader | wc -l";
#elif defined(USE_HIP)
  m_call = "rocm-smi --alldevices | grep \"AMD INSTINCT\" | wc -l";
#elif defined(USE_DPCPP)
  // Pick the query that matches the backend the SYCL device actually runs on.
  const sycl::platform pltf    = sycl_get_device(0)->get_platform();
  const auto           backend = pltf.get_backend();
  if(backend == sycl::backend::ext_oneapi_level_zero || backend == sycl::backend::opencl) {
    m_call = "cat /sys/class/drm/card*/gt/gt*/id | wc -l";
  }
  else if(backend == sycl::backend::ext_oneapi_cuda) {
    m_call = "nvidia-smi --query-gpu=name --format=csv,noheader | wc -l";
  }
  else if(backend == sycl::backend::ext_oneapi_hip) {
    m_call = "rocm-smi --alldevices | grep \"AMD INSTINCT\" | wc -l";
  }
#endif

  // Without this guard an empty command would be handed to the shell, produce
  // no output, and surface later as an unexplained std::stoi failure.
  if(m_call.empty()) {
    throw std::runtime_error(
      "getHardwareGPUCount(): no hardware query command is available for this GPU backend");
  }

  // Functor deleter: closes the pipe on every exit path and avoids the
  // attribute warnings some compilers emit for decltype(&pclose).
  struct PipeCloser {
    void operator()(FILE* fp) const { pclose(fp); }
  };

  std::unique_ptr<FILE, PipeCloser> pipe(popen(m_call.c_str(), "r"));
  if(!pipe) { throw std::runtime_error("popen() failed!"); }

  std::array<char, 128> buffer{};
  std::string           result;
  while(fgets(buffer.data(), static_cast<int>(buffer.size()), pipe.get()) != nullptr) {
    result += buffer.data();
  }

  int count = 0;
  try {
    count = std::stoi(result);
  } catch(const std::logic_error&) {
    // std::stoi reports failures via std::invalid_argument / std::out_of_range,
    // both of which derive from std::logic_error; rethrow with context.
    throw std::runtime_error(
      "getHardwareGPUCount(): could not parse a device count from the output of: " + m_call);
  }

  *gpus_per_node = count;
}

static inline std::string getDeviceName() {
#if defined(USE_CUDA)
cudaDeviceProp prop;
Expand Down
26 changes: 1 addition & 25 deletions src/tamm/rmm_memory_pool.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,31 +120,7 @@ class RMMMemoryManager {
world_rank_ = GA_Nodeid();
#endif // USE_UPCXX

if(world_rank_ == 0) {
std::array<char, 128> buffer;
std::string result;

#if defined(USE_CUDA)
const std::string m_call = "nvidia-smi --query-gpu=name --format=csv,noheader | wc -l";
#elif defined(USE_HIP)
const std::string m_call = "rocm-smi --showmemvendor | wc -l";
#elif defined(USE_DPCPP)
const std::string m_call = "ONEAPI_DEVICE_SELECTOR=level_zero:gpu sycl-ls | wc -l";
#endif

std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(m_call.c_str(), "r"), pclose);
if(!pipe) { throw std::runtime_error("popen() failed!"); }
while(fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) { result += buffer.data(); }

#if defined(USE_CUDA)
ngpus_per_node = stoi(result);
#elif defined(USE_HIP)
// - 6 is to remove the empty lines from the output
ngpus_per_node = stoi(result) - 6;
#elif defined(USE_DPCPP)
ngpus_per_node = stoi(result);
#endif
}
if(world_rank_ == 0) { tamm::getHardwareGPUCount(&ngpus_per_node); }

#if defined(USE_UPCXX)
upcxx::broadcast(&tamm_rpg, 0).wait();
Expand Down

0 comments on commit 4c5a9f7

Please sign in to comment.