Commit 5b23876

gpu ga

ajaypanyala committed Aug 22, 2023
1 parent 76fa8f6 commit 5b23876

Showing 5 changed files with 641 additions and 538 deletions.
1 change: 1 addition & 0 deletions src/tamm/CMakeLists.txt
@@ -86,6 +86,7 @@ set(TAMM_INCLUDES
label_translator.hpp
opmin.hpp
"${tamm_gpu_INCS}"
cpu_memory_pool.hpp
# spin_tensor.hpp
)

87 changes: 87 additions & 0 deletions src/tamm/cpu_memory_pool.hpp
@@ -0,0 +1,87 @@
#pragma once

#include <cstddef>
#include <cstdint>
#include <new>
#include <unordered_map>
#include <vector>
#if __APPLE__
#include <sys/sysctl.h>
#else
#include <sys/sysinfo.h>
#endif

namespace tamm {

class CPUPooledStorageManager {
protected:
// used memory
size_t used_memory_ = 0;
// percentage of reserved memory
int reserve_;
// memory pool
std::unordered_map<size_t, std::vector<void*>> memory_pool_;

private:
CPUPooledStorageManager() { reserve_ = 90; }
~CPUPooledStorageManager() { ReleaseAll(); }

public:
void* allocate(size_t sizeInBytes) {
// don't allocate anything if the user requested zero bytes
if(0 == sizeInBytes) { return nullptr; }
auto&& reuse_it = memory_pool_.find(sizeInBytes);
if(reuse_it == memory_pool_.end() || reuse_it->second.size() == 0) {
size_t free{}, total{};
#if __APPLE__
// sysinfo() is Linux-only: on macOS query total RAM via sysctl and treat it
// all as free, since free memory is not cheaply queryable there.
uint64_t memsize{};
size_t len = sizeof(memsize);
sysctlbyname("hw.memsize", &memsize, &len, nullptr, 0);
total = memsize;
free = total;
#else
struct sysinfo cpumeminfo_;
sysinfo(&cpumeminfo_);
total = cpumeminfo_.totalram * cpumeminfo_.mem_unit;
free = cpumeminfo_.freeram * cpumeminfo_.mem_unit;
#endif

// Flush the pool when free memory dips below the reserve threshold or the
// request would not fit in the remainder above it.
if(free <= total * reserve_ / 100 || sizeInBytes > free - total * reserve_ / 100) {
ReleaseAll();
}

void* ret = ::operator new(sizeInBytes);

used_memory_ += sizeInBytes;
return ret;
}
else {
auto&& reuse_pool = reuse_it->second;
auto ret = reuse_pool.back();
reuse_pool.pop_back();
return ret;
}
}
void deallocate(void* ptr, size_t sizeInBytes) {
auto&& reuse_pool = memory_pool_[sizeInBytes];
reuse_pool.push_back(ptr);
}

// void cpuMemset(void*& ptr, size_t sizeInBytes, bool blocking = false) {}

void ReleaseAll() {
for(auto&& i: memory_pool_) {
for(auto&& j: i.second) {
::operator delete(j);
used_memory_ -= i.first;
}
}
memory_pool_.clear();
}

/// Returns the instance of the CPU pooled storage manager singleton.
inline static CPUPooledStorageManager& getInstance() {
static CPUPooledStorageManager d_m{};
return d_m;
}

CPUPooledStorageManager(const CPUPooledStorageManager&) = delete;
CPUPooledStorageManager& operator=(const CPUPooledStorageManager&) = delete;
CPUPooledStorageManager(CPUPooledStorageManager&&) = delete;
CPUPooledStorageManager& operator=(CPUPooledStorageManager&&) = delete;

}; // class CPUPooledStorageManager

} // namespace tamm
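
For context, a minimal usage sketch of the pool above (not part of the commit; the buffer size is illustrative):

#include "tamm/cpu_memory_pool.hpp"

int main() {
  auto& pool = tamm::CPUPooledStorageManager::getInstance();
  // First request of this size comes from ::operator new.
  void* buf = pool.allocate(1024 * sizeof(double));
  // deallocate() caches the block in the pool instead of freeing it...
  pool.deallocate(buf, 1024 * sizeof(double));
  // ...so an equally sized request is served from the cache, not the OS.
  void* again = pool.allocate(1024 * sizeof(double));
  pool.deallocate(again, 1024 * sizeof(double));
  return 0;
}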
123 changes: 88 additions & 35 deletions src/tamm/gpu_streams.hpp
@@ -2,6 +2,8 @@

#include "tamm/errors.hpp"
#include <map>
#include <sstream>
#include <vector>

#if defined(USE_CUDA)
#include <cublas_v2.h>
@@ -13,6 +15,7 @@
#include <rocblas.h>
#elif defined(USE_DPCPP)
#include "sycl_device.hpp"
#include <oneapi/mkl/blas.hpp>
#endif

namespace tamm {
@@ -26,27 +29,30 @@ using gpuMemcpyKind = hipMemcpyKind;
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice

#define HIP_CHECK(err) \
do { \
hipError_t err_ = (err); \
if(err_ != hipSuccess) { \
std::printf("HIP Exception code: %s at %s : %d\n", hipGetErrorString(err_), __FILE__, \
__LINE__); \
throw std::runtime_error("hip runtime error"); \
} \
#define HIP_CHECK(FUNC) \
do { \
hipError_t err_ = (FUNC); \
if(err_ != hipSuccess) { \
std::ostringstream msg; \
msg << "HIP Error: " << hipGetErrorString(err_) << ", at " << __FILE__ << " : " << __LINE__ \
<< std::endl; \
throw std::runtime_error(msg.str()); \
} \
} while(0)

#define ROCBLAS_CHECK(err) \
do { \
rocblas_status err_ = (err); \
if(err_ != rocblas_status_success) { \
std::printf("rocblas Exception code: %s at %s : %d\n", rocblas_status_to_string(err_), \
__FILE__, __LINE__); \
throw std::runtime_error("rocblas runtime error"); \
} \
#define ROCBLAS_CHECK(FUNC) \
do { \
rocblas_status err_ = (FUNC); \
if(err_ != rocblas_status_success) { \
std::ostringstream msg; \
msg << "ROCBLAS Error: " << rocblas_status_to_string(err_) << ", at " << __FILE__ << " : " \
<< __LINE__ << std::endl; \
throw std::runtime_error(msg.str()); \
} \
} while(0)
#endif // USE_HIP

#elif defined(USE_CUDA)
#if defined(USE_CUDA)
using gpuStream_t = cudaStream_t;
using gpuEvent_t = cudaEvent_t;
using gpuBlasHandle_t = cublasHandle_t;
@@ -55,27 +61,30 @@ using gpuMemcpyKind = cudaMemcpyKind;
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice

#define CUDA_CHECK(err) \
do { \
cudaError_t err_ = (err); \
if(err_ != cudaSuccess) { \
std::printf("CUDA Exception code: %s at %s : %d\n", /*cudaGetErrorString*/ (err_), __FILE__, \
__LINE__); \
throw std::runtime_error("cuda runtime error"); \
} \
#define CUDA_CHECK(FUNC) \
do { \
cudaError_t err_ = (FUNC); \
if(err_ != cudaSuccess) { \
std::ostringstream msg; \
msg << "CUDA Error: " << cudaGetErrorString(err_) << ", at " << __FILE__ << " : " \
<< __LINE__ << std::endl; \
throw std::runtime_error(msg.str()); \
} \
} while(0)

#define CUBLAS_CHECK(err) \
do { \
cublasStatus_t err_ = (err); \
if(err_ != CUBLAS_STATUS_SUCCESS) { \
std::printf("cublas Exception code: %s at %s : %d\n", /*cublasGetStatusString*/ (err_), \
__FILE__, __LINE__); \
throw std::runtime_error("cublas runtime error"); \
} \
#define CUBLAS_CHECK(FUNC) \
do { \
cublasStatus_t err_ = (FUNC); \
if(err_ != CUBLAS_STATUS_SUCCESS) { \
std::ostringstream msg; \
msg << "CUBLAS Error: " << /*cublasGetStatusString*/ (err_) << ", at " << __FILE__ << " : " \
<< __LINE__ << std::endl; \
throw std::runtime_error(msg.str()); \
} \
} while(0)
#endif // USE_CUDA

#elif defined(USE_DPCPP)
#if defined(USE_DPCPP)
using gpuStream_t = sycl::queue;
using gpuEvent_t = sycl::event;
using gpuMemcpyKind = int;
@@ -93,7 +102,19 @@ auto sycl_asynchandler = [](sycl::exception_list exceptions) {
}
}
};
#endif

#define ONEMKLBLAS_CHECK(FUNC) \
do { \
try { \
(FUNC);                                                                                        \
} catch(oneapi::mkl::exception const& ex) { \
std::ostringstream msg; \
msg << "oneMKL Error: " << ex.what() << ", at " << __FILE__ << " : " << __LINE__ \
<< std::endl; \
throw std::runtime_error(msg.str()); \
} \
} while(0)
#endif // USE_DPCPP
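
As a usage sketch (not part of the commit), any backend call is wrapped in the matching macro so a failure surfaces as a std::runtime_error carrying file and line; the handle and buffer size are illustrative, assuming USE_CUDA:

cublasHandle_t handle;
void* dptr = nullptr;
CUDA_CHECK(cudaMalloc(&dptr, 1024));  // throws with file/line on failure
CUBLAS_CHECK(cublasCreate(&handle));  // same pattern for cuBLAS status codes
CUBLAS_CHECK(cublasDestroy(handle));
CUDA_CHECK(cudaFree(dptr));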

static inline void getDeviceCount(int* id) {
#if defined(USE_CUDA)
@@ -125,6 +146,16 @@ static inline void gpuSetDevice(int active_device) {
#endif
}

static inline void gpuGetDevice(int* active_device) {
#ifdef USE_CUDA
CUDA_CHECK(cudaGetDevice(active_device));
#elif defined(USE_HIP)
HIP_CHECK(hipGetDevice(active_device));
#elif defined(USE_DPCPP)
syclGetDevice(active_device);
#endif
}
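
A round-trip sketch of the device helpers (illustrative, not part of the commit; assumes one backend is enabled):

int ndev = 0, current = -1;
getDeviceCount(&ndev);   // number of visible GPUs
gpuSetDevice(ndev - 1);  // make the last device active
gpuGetDevice(&current);  // read back the active device id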

template<typename T>
static void gpuMemcpyAsync(T* dst, const T* src, size_t count, gpuMemcpyKind kind,
gpuStream_t& stream) {
@@ -138,6 +169,26 @@ static void gpuMemcpyAsync(T* dst, const T* src, size_t count, gpuMemcpyKind kind,
#endif
}

static inline void gpuMemsetAsync(void*& ptr, size_t sizeInBytes, gpuStream_t stream) {
#if defined(USE_DPCPP)
stream.memset(ptr, 0, sizeInBytes);
#elif defined(USE_HIP)
hipMemsetAsync(ptr, 0, sizeInBytes, stream);
#elif defined(USE_CUDA)
cudaMemsetAsync(ptr, 0, sizeInBytes, stream);
#endif
}

static inline void gpuStreamSynchronize(gpuStream_t stream) {
#if defined(USE_DPCPP)
stream.wait();
#elif defined(USE_HIP)
hipStreamSynchronize(stream);
#elif defined(USE_CUDA)
cudaStreamSynchronize(stream);
#endif
}
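
Taken together, the stream helpers compose as below (a sketch, not part of the commit; assumes USE_CUDA, and that gpuMemcpyAsync counts elements of T rather than bytes):

gpuStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream));
void* src = nullptr;
void* dst = nullptr;
CUDA_CHECK(cudaMalloc(&src, 256 * sizeof(double)));
CUDA_CHECK(cudaMalloc(&dst, 256 * sizeof(double)));
gpuMemsetAsync(src, 256 * sizeof(double), stream);  // zero src on-stream
gpuMemcpyAsync(static_cast<double*>(dst), static_cast<const double*>(src), 256,
               gpuMemcpyDeviceToDevice, stream);    // on-stream device copy
gpuStreamSynchronize(stream);                       // block until both finish
CUDA_CHECK(cudaFree(src));
CUDA_CHECK(cudaFree(dst));
CUDA_CHECK(cudaStreamDestroy(stream));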

class GPUStreamPool {
protected:
bool _initialized{false};
@@ -241,4 +292,6 @@ class GPUStreamPool
GPUStreamPool& operator=(GPUStreamPool&&) = delete;
};

// This API needs to be defined after the GPUStreamPool class, since the class
// must be fully defined before this method can use it.
} // namespace tamm