Commit 5b23876

gpu ga

ajaypanyala committed Aug 22, 2023
1 parent 76fa8f6 commit 5b23876

Showing 5 changed files with 641 additions and 538 deletions.
1 change: 1 addition & 0 deletions src/tamm/CMakeLists.txt
@@ -86,6 +86,7 @@ set(TAMM_INCLUDES
label_translator.hpp
opmin.hpp
"${tamm_gpu_INCS}"
cpu_memory_pool.hpp
# spin_tensor.hpp
)

87 changes: 87 additions & 0 deletions src/tamm/cpu_memory_pool.hpp
@@ -0,0 +1,87 @@
#pragma once

#include <cstddef>
#include <cstdint>
#include <new>
#include <unordered_map>
#include <vector>
#if __APPLE__
#include <sys/sysctl.h>
#else
#include <sys/sysinfo.h>
#endif

namespace tamm {

class CPUPooledStorageManager {
protected:
// used memory
size_t used_memory_ = 0;
// percentage of reserved memory
int reserve_;
// memory pool
std::unordered_map<size_t, std::vector<void*>> memory_pool_;

private:
CPUPooledStorageManager() { reserve_ = 90; }
~CPUPooledStorageManager() { ReleaseAll(); }

public:
void* allocate(size_t sizeInBytes) {
// don't allocate anything if the user requested zero bytes
if(0 == sizeInBytes) { return nullptr; }
auto&& reuse_it = memory_pool_.find(sizeInBytes);
if(reuse_it == memory_pool_.end() || reuse_it->second.size() == 0) {
size_t free{}, total{};
#if __APPLE__
// sysinfo() is Linux-only: on macOS query total RAM via sysctl and treat it
// all as free, since free memory is not cheaply queryable there.
uint64_t memsize{};
size_t len = sizeof(memsize);
sysctlbyname("hw.memsize", &memsize, &len, nullptr, 0);
total = memsize;
free = total;
#else
struct sysinfo cpumeminfo_;
sysinfo(&cpumeminfo_);
total = cpumeminfo_.totalram * cpumeminfo_.mem_unit;
free = cpumeminfo_.freeram * cpumeminfo_.mem_unit;
#endif

// Flush the pool when free memory dips below the reserve threshold or the
// request would not fit in the remainder above it.
if(free <= total * reserve_ / 100 || sizeInBytes > free - total * reserve_ / 100) {
ReleaseAll();
}

void* ret = ::operator new(sizeInBytes);

used_memory_ += sizeInBytes;
return ret;
}
else {
auto&& reuse_pool = reuse_it->second;
auto ret = reuse_pool.back();
reuse_pool.pop_back();
return ret;
}
}
void deallocate(void* ptr, size_t sizeInBytes) {
auto&& reuse_pool = memory_pool_[sizeInBytes];
reuse_pool.push_back(ptr);
}

// void cpuMemset(void*& ptr, size_t sizeInBytes, bool blocking = false) {}

void ReleaseAll() {
for(auto&& i: memory_pool_) {
for(auto&& j: i.second) {
::operator delete(j);
used_memory_ -= i.first;
}
}
memory_pool_.clear();
}

/// Returns the instance of the CPU pooled storage manager singleton.
inline static CPUPooledStorageManager& getInstance() {
static CPUPooledStorageManager d_m{};
return d_m;
}

CPUPooledStorageManager(const CPUPooledStorageManager&) = delete;
CPUPooledStorageManager& operator=(const CPUPooledStorageManager&) = delete;
CPUPooledStorageManager(CPUPooledStorageManager&&) = delete;
CPUPooledStorageManager& operator=(CPUPooledStorageManager&&) = delete;

}; // class CPUPooledStorageManager

} // namespace tamm
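
For context, a minimal usage sketch of the pool above (not part of the commit; the buffer size is illustrative):

#include "tamm/cpu_memory_pool.hpp"

int main() {
  auto& pool = tamm::CPUPooledStorageManager::getInstance();
  // First request of this size comes from ::operator new.
  void* buf = pool.allocate(1024 * sizeof(double));
  // deallocate() caches the block in the pool instead of freeing it...
  pool.deallocate(buf, 1024 * sizeof(double));
  // ...so an equally sized request is served from the cache, not the OS.
  void* again = pool.allocate(1024 * sizeof(double));
  pool.deallocate(again, 1024 * sizeof(double));
  return 0;
}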
123 changes: 88 additions & 35 deletions src/tamm/gpu_streams.hpp
@@ -2,6 +2,8 @@

#include "tamm/errors.hpp"
#include <map>
#include <sstream>
#include <vector>

#if defined(USE_CUDA)
#include <cublas_v2.h>
@@ -13,6 +15,7 @@
#include <rocblas.h>
#elif defined(USE_DPCPP)
#include "sycl_device.hpp"
#include <oneapi/mkl/blas.hpp>
#endif

namespace tamm {
@@ -26,27 +29,30 @@ using gpuMemcpyKind = hipMemcpyKind;
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice

#define HIP_CHECK(err) \
do { \
hipError_t err_ = (err); \
if(err_ != hipSuccess) { \
std::printf("HIP Exception code: %s at %s : %d\n", hipGetErrorString(err_), __FILE__, \
__LINE__); \
throw std::runtime_error("hip runtime error"); \
} \
#define HIP_CHECK(FUNC) \
do { \
hipError_t err_ = (FUNC); \
if(err_ != hipSuccess) { \
std::ostringstream msg; \
msg << "HIP Error: " << hipGetErrorString(err_) << ", at " << __FILE__ << " : " << __LINE__ \
<< std::endl; \
throw std::runtime_error(msg.str()); \
} \
} while(0)

#define ROCBLAS_CHECK(err) \
do { \
rocblas_status err_ = (err); \
if(err_ != rocblas_status_success) { \
std::printf("rocblas Exception code: %s at %s : %d\n", rocblas_status_to_string(err_), \
__FILE__, __LINE__); \
throw std::runtime_error("rocblas runtime error"); \
} \
#define ROCBLAS_CHECK(FUNC) \
do { \
rocblas_status err_ = (FUNC); \
if(err_ != rocblas_status_success) { \
std::ostringstream msg; \
msg << "ROCBLAS Error: " << rocblas_status_to_string(err_) << ", at " << __FILE__ << " : " \
<< __LINE__ << std::endl; \
throw std::runtime_error(msg.str()); \
} \
} while(0)
#endif // USE_HIP

#elif defined(USE_CUDA)
#if defined(USE_CUDA)
using gpuStream_t = cudaStream_t;
using gpuEvent_t = cudaEvent_t;
using gpuBlasHandle_t = cublasHandle_t;
@@ -55,27 +61,30 @@ using gpuMemcpyKind = cudaMemcpyKind;
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice

#define CUDA_CHECK(err) \
do { \
cudaError_t err_ = (err); \
if(err_ != cudaSuccess) { \
std::printf("CUDA Exception code: %s at %s : %d\n", /*cudaGetErrorString*/ (err_), __FILE__, \
__LINE__); \
throw std::runtime_error("cuda runtime error"); \
} \
#define CUDA_CHECK(FUNC) \
do { \
cudaError_t err_ = (FUNC); \
if(err_ != cudaSuccess) { \
std::ostringstream msg; \
msg << "CUDA Error: " << cudaGetErrorString(err_) << ", at " << __FILE__ << " : " \
<< __LINE__ << std::endl; \
throw std::runtime_error(msg.str()); \
} \
} while(0)

#define CUBLAS_CHECK(err) \
do { \
cublasStatus_t err_ = (err); \
if(err_ != CUBLAS_STATUS_SUCCESS) { \
std::printf("cublas Exception code: %s at %s : %d\n", /*cublasGetStatusString*/ (err_), \
__FILE__, __LINE__); \
throw std::runtime_error("cublas runtime error"); \
} \
#define CUBLAS_CHECK(FUNC) \
do { \
cublasStatus_t err_ = (FUNC); \
if(err_ != CUBLAS_STATUS_SUCCESS) { \
std::ostringstream msg; \
msg << "CUBLAS Error: " << /*cublasGetStatusString*/ (err_) << ", at " << __FILE__ << " : " \
<< __LINE__ << std::endl; \
throw std::runtime_error(msg.str()); \
} \
} while(0)
#endif // USE_CUDA

#elif defined(USE_DPCPP)
#if defined(USE_DPCPP)
using gpuStream_t = sycl::queue;
using gpuEvent_t = sycl::event;
using gpuMemcpyKind = int;
@@ -93,7 +102,19 @@ auto sycl_asynchandler = [](sycl::exception_list exceptions) {
}
}
};
#endif

#define ONEMKLBLAS_CHECK(FUNC) \
do { \
try { \
(FUNC);                                                                                        \
} catch(oneapi::mkl::exception const& ex) { \
std::ostringstream msg; \
msg << "oneMKL Error: " << ex.what() << ", at " << __FILE__ << " : " << __LINE__ \
<< std::endl; \
throw std::runtime_error(msg.str()); \
} \
} while(0)
#endif // USE_DPCPP
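
As a usage sketch (not part of the commit), any backend call is wrapped in the matching macro so a failure surfaces as a std::runtime_error carrying file and line; the handle and buffer size are illustrative, assuming USE_CUDA:

cublasHandle_t handle;
void* dptr = nullptr;
CUDA_CHECK(cudaMalloc(&dptr, 1024));  // throws with file/line on failure
CUBLAS_CHECK(cublasCreate(&handle));  // same pattern for cuBLAS status codes
CUBLAS_CHECK(cublasDestroy(handle));
CUDA_CHECK(cudaFree(dptr));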

static inline void getDeviceCount(int* id) {
#if defined(USE_CUDA)
@@ -125,6 +146,16 @@ static inline void gpuSetDevice(int active_device) {
#endif
}

static inline void gpuGetDevice(int* active_device) {
#ifdef USE_CUDA
CUDA_CHECK(cudaGetDevice(active_device));
#elif defined(USE_HIP)
HIP_CHECK(hipGetDevice(active_device));
#elif defined(USE_DPCPP)
syclGetDevice(active_device);
#endif
}
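
A round-trip sketch of the device helpers (illustrative, not part of the commit; assumes one backend is enabled):

int ndev = 0, current = -1;
getDeviceCount(&ndev);   // number of visible GPUs
gpuSetDevice(ndev - 1);  // make the last device active
gpuGetDevice(&current);  // read back the active device id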

template<typename T>
static void gpuMemcpyAsync(T* dst, const T* src, size_t count, gpuMemcpyKind kind,
gpuStream_t& stream) {
@@ -138,6 +169,26 @@ static void gpuMemcpyAsync(T* dst, const T* src, size_t count, gpuMemcpyKind kind,
#endif
}

static inline void gpuMemsetAsync(void*& ptr, size_t sizeInBytes, gpuStream_t stream) {
#if defined(USE_DPCPP)
stream.memset(ptr, 0, sizeInBytes);
#elif defined(USE_HIP)
hipMemsetAsync(ptr, 0, sizeInBytes, stream);
#elif defined(USE_CUDA)
cudaMemsetAsync(ptr, 0, sizeInBytes, stream);
#endif
}

static inline void gpuStreamSynchronize(gpuStream_t stream) {
#if defined(USE_DPCPP)
stream.wait();
#elif defined(USE_HIP)
hipStreamSynchronize(stream);
#elif defined(USE_CUDA)
cudaStreamSynchronize(stream);
#endif
}
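
Taken together, the stream helpers compose as below (a sketch, not part of the commit; assumes USE_CUDA, and that gpuMemcpyAsync counts elements of T rather than bytes):

gpuStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream));
void* src = nullptr;
void* dst = nullptr;
CUDA_CHECK(cudaMalloc(&src, 256 * sizeof(double)));
CUDA_CHECK(cudaMalloc(&dst, 256 * sizeof(double)));
gpuMemsetAsync(src, 256 * sizeof(double), stream);  // zero src on-stream
gpuMemcpyAsync(static_cast<double*>(dst), static_cast<const double*>(src), 256,
               gpuMemcpyDeviceToDevice, stream);    // on-stream device copy
gpuStreamSynchronize(stream);                       // block until both finish
CUDA_CHECK(cudaFree(src));
CUDA_CHECK(cudaFree(dst));
CUDA_CHECK(cudaStreamDestroy(stream));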

class GPUStreamPool {
protected:
bool _initialized{false};
@@ -241,4 +292,6 @@ class GPUStreamPool
GPUStreamPool& operator=(GPUStreamPool&&) = delete;
};

// This API needs to be defined after the GPUStreamPool class, since the class
// must be fully defined before this method can use it.
} // namespace tamm