diff --git a/docs/user_guide/tensor_operations.rst b/docs/user_guide/tensor_operations.rst index e2616a8b0..9e64deec1 100644 --- a/docs/user_guide/tensor_operations.rst +++ b/docs/user_guide/tensor_operations.rst @@ -360,8 +360,6 @@ Examples using labels: .. code:: cpp - // This seems overly complex for multiplication with all the dependancies - // Constructed TiledIndexSpaces // TiledIndexSpace AO{AO_is, tile_size}; // TiledIndexSpace MO{MO_is, tile_sizes}; diff --git a/src/tamm/execution_context.hpp b/src/tamm/execution_context.hpp index 131c92069..4cf0a0446 100644 --- a/src/tamm/execution_context.hpp +++ b/src/tamm/execution_context.hpp @@ -358,7 +358,7 @@ class ExecutionContext { std::string get_profile_header() { std::string pheader = "ID;Level;OP;total_op_time_min;total_op_time_max;total_op_time_avg;"; pheader += "get_time_min;get_time_max;get_time_avg;"; - pheader += "gemm_time_min;gemm_time_max;gemm_time_avg;"; + pheader += "block_compute_time_min;block_compute_time_max;block_compute_time_avg;"; pheader += "copy_time_min;copy_time_max;copy_time_avg;"; pheader += "acc_time_min;acc_time_max;acc_time_avg"; return pheader; diff --git a/src/tamm/multop.hpp b/src/tamm/multop.hpp index 0ad5f97d6..c47a53297 100644 --- a/src/tamm/multop.hpp +++ b/src/tamm/multop.hpp @@ -470,6 +470,7 @@ class MultOp: public Op { add_bufs.push_back(ab); { + TimerGuard tg_bc{&oprof.multOpBCTime}; TensorElType1* cbuf_dev_ptr{nullptr}; TensorElType1* cbuf_tmp_dev_ptr{nullptr}; #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP) @@ -485,16 +486,12 @@ class MultOp: public Op { csize * sizeof(TensorElType1), thandle); } #endif - { - TimerGuard tg_dgemm{&oprof.multOpDgemmTime}; - kernels::block_multiply( + kernels::block_multiply( #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP) - th_a, th_b, + th_a, th_b, #endif - thandle, alpha_, abuf, adims_sz, rhs1_int_labels_, bbuf, bdims_sz, rhs2_int_labels_, - cscale, ab->cbuf_, cdims_sz, lhs_int_labels_, hw, true, cbuf_dev_ptr, - cbuf_tmp_dev_ptr); - } + thandle, alpha_, abuf, adims_sz, rhs1_int_labels_, bbuf, bdims_sz, rhs2_int_labels_, + cscale, ab->cbuf_, cdims_sz, lhs_int_labels_, hw, true, cbuf_dev_ptr, cbuf_tmp_dev_ptr); #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP) if(hw == ExecutionHW::GPU) { @@ -781,6 +778,7 @@ class MultOp: public Op { // A*B { + TimerGuard tg_bc{&oprof.multOpBCTime}; #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP) TensorElType2* abuf_dev{nullptr}; TensorElType3* bbuf_dev{nullptr}; @@ -792,15 +790,12 @@ class MultOp: public Op { } #endif - { - TimerGuard tg_dgemm{&oprof.multOpDgemmTime}; - kernels::block_multiply( + kernels::block_multiply( #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP) - abuf_dev, bbuf_dev, + abuf_dev, bbuf_dev, #endif - thandle, alpha_, abuf, adims_sz, rhs1_int_labels_, bbuf, bdims_sz, rhs2_int_labels_, - cscale, cbuf, cdims_sz, lhs_int_labels_, hw, false, cbuf_dev_ptr, cbuf_tmp_dev_ptr); - } + thandle, alpha_, abuf, adims_sz, rhs1_int_labels_, bbuf, bdims_sz, rhs2_int_labels_, + cscale, cbuf, cdims_sz, lhs_int_labels_, hw, false, cbuf_dev_ptr, cbuf_tmp_dev_ptr); #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP) if(hw == ExecutionHW::GPU) { @@ -820,6 +815,7 @@ class MultOp: public Op { #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP) // copy to host if(hw == ExecutionHW::GPU) { + TimerGuard tg_bc{&oprof.multOpBCTime}; TensorElType1* cbuf_tmp{nullptr}; cbuf_tmp = static_cast(memHostPool.allocate(csize * sizeof(TensorElType1))); diff --git a/src/tamm/op_profiler.hpp b/src/tamm/op_profiler.hpp index b8e74394e..764252cc3 100644 --- a/src/tamm/op_profiler.hpp +++ b/src/tamm/op_profiler.hpp @@ -16,15 +16,15 @@ class OpProfiler { double tgetTime = 0; double taddTime = 0; double twaitTime = 0; - double tgemmTime = 0; + double tBCTime = 0; double tcopyTime = 0; double tbarrierTime = 0; - double multOpGetTime = 0; - double multOpAddTime = 0; - double multOpWaitTime = 0; - double multOpCopyTime = 0; - double multOpDgemmTime = 0; + double multOpGetTime = 0; + double multOpAddTime = 0; + double multOpWaitTime = 0; + double multOpCopyTime = 0; + double multOpBCTime = 0; inline static OpProfiler& instance() { static OpProfiler op_prof; diff --git a/src/tamm/scheduler.hpp b/src/tamm/scheduler.hpp index 578ba00e0..4a3e34811 100644 --- a/src/tamm/scheduler.hpp +++ b/src/tamm/scheduler.hpp @@ -186,11 +186,11 @@ class Scheduler { oprof.taddTime += oprof.multOpAddTime; oprof.tgetTime += oprof.multOpGetTime; oprof.twaitTime += oprof.multOpWaitTime; - oprof.tgemmTime += oprof.multOpDgemmTime; + oprof.tBCTime += oprof.multOpBCTime; oprof.tcopyTime += oprof.multOpCopyTime; oprof.multOpGetTime = 0; oprof.multOpWaitTime = 0; - oprof.multOpDgemmTime = 0; + oprof.multOpBCTime = 0; oprof.multOpAddTime = 0; oprof.multOpCopyTime = 0; } @@ -213,10 +213,10 @@ class Scheduler { auto t1 = misc_end; // double nranks = 1.0 * ec_.pg().size().value(); - oprof.multOpGetTime = 0; - oprof.multOpDgemmTime = 0; - oprof.multOpAddTime = 0; - oprof.multOpCopyTime = 0; + oprof.multOpGetTime = 0; + oprof.multOpBCTime = 0; + oprof.multOpAddTime = 0; + oprof.multOpCopyTime = 0; std::vector load_imbalance_times; std::vector op_times; @@ -238,10 +238,10 @@ class Scheduler { // - t2)).count()); // level_times.push_back(std::chrono::duration_cast>((t3 - // t1)).count()); multop_get_times.push_back(oprof.multOpGetTime); - // multop_dgemm_times.push_back(oprof.multOpDgemmTime); + // multop_dgemm_times.push_back(oprof.multOpBCTime); // multop_add_times.push_back(oprof.multOpAddTime); // oprof.multOpGetTime = 0; - // oprof.multOpDgemmTime = 0; + // oprof.multOpBCTime = 0; // oprof.multOpAddTime = 0; // t1 = t3; } @@ -254,13 +254,13 @@ class Scheduler { op_times.push_back( std::chrono::duration_cast>((t3 - t2)).count()); multop_get_times.push_back(oprof.multOpGetTime); - multop_dgemm_times.push_back(oprof.multOpDgemmTime); + multop_dgemm_times.push_back(oprof.multOpBCTime); multop_add_times.push_back(oprof.multOpAddTime); multop_copy_times.push_back(oprof.multOpCopyTime); - oprof.multOpGetTime = 0; - oprof.multOpDgemmTime = 0; - oprof.multOpAddTime = 0; - oprof.multOpCopyTime = 0; + oprof.multOpGetTime = 0; + oprof.multOpBCTime = 0; + oprof.multOpAddTime = 0; + oprof.multOpCopyTime = 0; } auto t2 = std::chrono::high_resolution_clock::now(); ec().pg().barrier(); @@ -270,10 +270,10 @@ class Scheduler { // - t2)).count()); // level_times.push_back(std::chrono::duration_cast>((t3 - // t1)).count()); multop_get_times.push_back(oprof.multOpGetTime); - // multop_dgemm_times.push_back(oprof.multOpDgemmTime); + // multop_dgemm_times.push_back(oprof.multOpBCTime); // multop_add_times.push_back(oprof.multOpAddTime); // oprof.multOpGetTime = 0; - // oprof.multOpDgemmTime = 0; + // oprof.multOpBCTime = 0; // oprof.multOpAddTime = 0; start_idx_ = ops_.size(); ec().set_ac(IndexedAC(nullptr, 0)); diff --git a/tests/tamm/Test_CCSD.cpp b/tests/tamm/Test_CCSD.cpp index 63d86b2dc..f2d1438af 100644 --- a/tests/tamm/Test_CCSD.cpp +++ b/tests/tamm/Test_CCSD.cpp @@ -418,6 +418,7 @@ void ccsd_t2_cs(Scheduler& sch, const TiledIndexSpace& MO, const TiledIndexSpace // A*B { + TimerGuard tg_bc{&oprof.multOpBCTime}; #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP) TensorElType2* abuf_dev{nullptr}; TensorElType3* bbuf_dev{nullptr}; @@ -431,16 +432,13 @@ void ccsd_t2_cs(Scheduler& sch, const TiledIndexSpace& MO, const TiledIndexSpace gpuMemcpyAsync(bbuf_dev, bbuf, bsize, gpuMemcpyHostToDevice, thandle); } #endif - { - TimerGuard tg_dgemm{&oprof.multOpDgemmTime}; - kernels::block_multiply( + + kernels::block_multiply( #if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP) - abuf_dev, bbuf_dev, + abuf_dev, bbuf_dev, #endif - thandle, 1.0, abuf, adims_sz, rhs1_int_labels_, bbuf, bdims_sz, rhs2_int_labels_, - cscale, cbuf.data(), cdims_sz, lhs_int_labels_, hw, false, cbuf_dev_ptr, - cbuf_tmp_dev_ptr); - } + thandle, 1.0, abuf, adims_sz, rhs1_int_labels_, bbuf, bdims_sz, rhs2_int_labels_, cscale, + cbuf.data(), cdims_sz, lhs_int_labels_, hw, false, cbuf_dev_ptr, cbuf_tmp_dev_ptr); #if(defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)) if(hw == ExecutionHW::GPU) { @@ -459,6 +457,7 @@ void ccsd_t2_cs(Scheduler& sch, const TiledIndexSpace& MO, const TiledIndexSpace #if(defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)) // copy to host if(hw == ExecutionHW::GPU) { + TimerGuard tg_bc{&oprof.multOpBCTime}; TensorElType1* cbuf_tmp{nullptr}; cbuf_tmp = static_cast(memHostPool.allocate(csize * sizeof(TensorElType1))); std::memset(cbuf_tmp, 0, csize * sizeof(TensorElType1)); @@ -605,7 +604,7 @@ int main(int argc, char* argv[]) { Scheduler sch{ec}; - bool profile = false; + bool profile = true; if(ec.print()) { std::cout << tamm_git_info() << std::endl;