Skip to content

Commit

Permalink
profiling update
Browse files Browse the repository at this point in the history
  • Loading branch information
ajaypanyala committed Nov 9, 2024
1 parent eb93889 commit f9ef0f8
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 48 deletions.
2 changes: 0 additions & 2 deletions docs/user_guide/tensor_operations.rst
Original file line number Diff line number Diff line change
Expand Up @@ -360,8 +360,6 @@ Examples using labels:

.. code:: cpp
// This seems overly complex for multiplication with all the dependencies
// Constructed TiledIndexSpaces
// TiledIndexSpace AO{AO_is, tile_size};
// TiledIndexSpace MO{MO_is, tile_sizes};
Expand Down
2 changes: 1 addition & 1 deletion src/tamm/execution_context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ class ExecutionContext {
std::string get_profile_header() {
std::string pheader = "ID;Level;OP;total_op_time_min;total_op_time_max;total_op_time_avg;";
pheader += "get_time_min;get_time_max;get_time_avg;";
pheader += "gemm_time_min;gemm_time_max;gemm_time_avg;";
pheader += "block_compute_time_min;block_compute_time_max;block_compute_time_avg;";
pheader += "copy_time_min;copy_time_max;copy_time_avg;";
pheader += "acc_time_min;acc_time_max;acc_time_avg";
return pheader;
Expand Down
26 changes: 11 additions & 15 deletions src/tamm/multop.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,7 @@ class MultOp: public Op {
add_bufs.push_back(ab);

{
TimerGuard tg_bc{&oprof.multOpBCTime};
TensorElType1* cbuf_dev_ptr{nullptr};
TensorElType1* cbuf_tmp_dev_ptr{nullptr};
#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
Expand All @@ -485,16 +486,12 @@ class MultOp: public Op {
csize * sizeof(TensorElType1), thandle);
}
#endif
{
TimerGuard tg_dgemm{&oprof.multOpDgemmTime};
kernels::block_multiply<T, TensorElType1, TensorElType2, TensorElType3>(
kernels::block_multiply<T, TensorElType1, TensorElType2, TensorElType3>(
#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
th_a, th_b,
th_a, th_b,
#endif
thandle, alpha_, abuf, adims_sz, rhs1_int_labels_, bbuf, bdims_sz, rhs2_int_labels_,
cscale, ab->cbuf_, cdims_sz, lhs_int_labels_, hw, true, cbuf_dev_ptr,
cbuf_tmp_dev_ptr);
}
thandle, alpha_, abuf, adims_sz, rhs1_int_labels_, bbuf, bdims_sz, rhs2_int_labels_,
cscale, ab->cbuf_, cdims_sz, lhs_int_labels_, hw, true, cbuf_dev_ptr, cbuf_tmp_dev_ptr);

#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
if(hw == ExecutionHW::GPU) {
Expand Down Expand Up @@ -781,6 +778,7 @@ class MultOp: public Op {

// A*B
{
TimerGuard tg_bc{&oprof.multOpBCTime};
#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
TensorElType2* abuf_dev{nullptr};
TensorElType3* bbuf_dev{nullptr};
Expand All @@ -792,15 +790,12 @@ class MultOp: public Op {
}
#endif

{
TimerGuard tg_dgemm{&oprof.multOpDgemmTime};
kernels::block_multiply<T, TensorElType1, TensorElType2, TensorElType3>(
kernels::block_multiply<T, TensorElType1, TensorElType2, TensorElType3>(
#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
abuf_dev, bbuf_dev,
abuf_dev, bbuf_dev,
#endif
thandle, alpha_, abuf, adims_sz, rhs1_int_labels_, bbuf, bdims_sz, rhs2_int_labels_,
cscale, cbuf, cdims_sz, lhs_int_labels_, hw, false, cbuf_dev_ptr, cbuf_tmp_dev_ptr);
}
thandle, alpha_, abuf, adims_sz, rhs1_int_labels_, bbuf, bdims_sz, rhs2_int_labels_,
cscale, cbuf, cdims_sz, lhs_int_labels_, hw, false, cbuf_dev_ptr, cbuf_tmp_dev_ptr);

#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
if(hw == ExecutionHW::GPU) {
Expand All @@ -820,6 +815,7 @@ class MultOp: public Op {
#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
// copy to host
if(hw == ExecutionHW::GPU) {
TimerGuard tg_bc{&oprof.multOpBCTime};
TensorElType1* cbuf_tmp{nullptr};
cbuf_tmp =
static_cast<TensorElType1*>(memHostPool.allocate(csize * sizeof(TensorElType1)));
Expand Down
12 changes: 6 additions & 6 deletions src/tamm/op_profiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,15 @@ class OpProfiler {
double tgetTime = 0;
double taddTime = 0;
double twaitTime = 0;
double tgemmTime = 0;
double tBCTime = 0;
double tcopyTime = 0;
double tbarrierTime = 0;

double multOpGetTime = 0;
double multOpAddTime = 0;
double multOpWaitTime = 0;
double multOpCopyTime = 0;
double multOpDgemmTime = 0;
double multOpGetTime = 0;
double multOpAddTime = 0;
double multOpWaitTime = 0;
double multOpCopyTime = 0;
double multOpBCTime = 0;

inline static OpProfiler& instance() {
static OpProfiler op_prof;
Expand Down
30 changes: 15 additions & 15 deletions src/tamm/scheduler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,11 +186,11 @@ class Scheduler {
oprof.taddTime += oprof.multOpAddTime;
oprof.tgetTime += oprof.multOpGetTime;
oprof.twaitTime += oprof.multOpWaitTime;
oprof.tgemmTime += oprof.multOpDgemmTime;
oprof.tBCTime += oprof.multOpBCTime;
oprof.tcopyTime += oprof.multOpCopyTime;
oprof.multOpGetTime = 0;
oprof.multOpWaitTime = 0;
oprof.multOpDgemmTime = 0;
oprof.multOpBCTime = 0;
oprof.multOpAddTime = 0;
oprof.multOpCopyTime = 0;
}
Expand All @@ -213,10 +213,10 @@ class Scheduler {
auto t1 = misc_end;

// double nranks = 1.0 * ec_.pg().size().value();
oprof.multOpGetTime = 0;
oprof.multOpDgemmTime = 0;
oprof.multOpAddTime = 0;
oprof.multOpCopyTime = 0;
oprof.multOpGetTime = 0;
oprof.multOpBCTime = 0;
oprof.multOpAddTime = 0;
oprof.multOpCopyTime = 0;

std::vector<double> load_imbalance_times;
std::vector<double> op_times;
Expand All @@ -238,10 +238,10 @@ class Scheduler {
// - t2)).count());
// level_times.push_back(std::chrono::duration_cast<std::chrono::duration<double>>((t3 -
// t1)).count()); multop_get_times.push_back(oprof.multOpGetTime);
// multop_dgemm_times.push_back(oprof.multOpDgemmTime);
// multop_dgemm_times.push_back(oprof.multOpBCTime);
// multop_add_times.push_back(oprof.multOpAddTime);
// oprof.multOpGetTime = 0;
// oprof.multOpDgemmTime = 0;
// oprof.multOpBCTime = 0;
// oprof.multOpAddTime = 0;
// t1 = t3;
}
Expand All @@ -254,13 +254,13 @@ class Scheduler {
op_times.push_back(
std::chrono::duration_cast<std::chrono::duration<double>>((t3 - t2)).count());
multop_get_times.push_back(oprof.multOpGetTime);
multop_dgemm_times.push_back(oprof.multOpDgemmTime);
multop_dgemm_times.push_back(oprof.multOpBCTime);
multop_add_times.push_back(oprof.multOpAddTime);
multop_copy_times.push_back(oprof.multOpCopyTime);
oprof.multOpGetTime = 0;
oprof.multOpDgemmTime = 0;
oprof.multOpAddTime = 0;
oprof.multOpCopyTime = 0;
oprof.multOpGetTime = 0;
oprof.multOpBCTime = 0;
oprof.multOpAddTime = 0;
oprof.multOpCopyTime = 0;
}
auto t2 = std::chrono::high_resolution_clock::now();
ec().pg().barrier();
Expand All @@ -270,10 +270,10 @@ class Scheduler {
// - t2)).count());
// level_times.push_back(std::chrono::duration_cast<std::chrono::duration<double>>((t3 -
// t1)).count()); multop_get_times.push_back(oprof.multOpGetTime);
// multop_dgemm_times.push_back(oprof.multOpDgemmTime);
// multop_dgemm_times.push_back(oprof.multOpBCTime);
// multop_add_times.push_back(oprof.multOpAddTime);
// oprof.multOpGetTime = 0;
// oprof.multOpDgemmTime = 0;
// oprof.multOpBCTime = 0;
// oprof.multOpAddTime = 0;
start_idx_ = ops_.size();
ec().set_ac(IndexedAC(nullptr, 0));
Expand Down
17 changes: 8 additions & 9 deletions tests/tamm/Test_CCSD.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,7 @@ void ccsd_t2_cs(Scheduler& sch, const TiledIndexSpace& MO, const TiledIndexSpace

// A*B
{
TimerGuard tg_bc{&oprof.multOpBCTime};
#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
TensorElType2* abuf_dev{nullptr};
TensorElType3* bbuf_dev{nullptr};
Expand All @@ -431,16 +432,13 @@ void ccsd_t2_cs(Scheduler& sch, const TiledIndexSpace& MO, const TiledIndexSpace
gpuMemcpyAsync<TensorElType3>(bbuf_dev, bbuf, bsize, gpuMemcpyHostToDevice, thandle);
}
#endif
{
TimerGuard tg_dgemm{&oprof.multOpDgemmTime};
kernels::block_multiply<T, TensorElType1, TensorElType2, TensorElType3>(

kernels::block_multiply<T, TensorElType1, TensorElType2, TensorElType3>(
#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
abuf_dev, bbuf_dev,
abuf_dev, bbuf_dev,
#endif
thandle, 1.0, abuf, adims_sz, rhs1_int_labels_, bbuf, bdims_sz, rhs2_int_labels_,
cscale, cbuf.data(), cdims_sz, lhs_int_labels_, hw, false, cbuf_dev_ptr,
cbuf_tmp_dev_ptr);
}
thandle, 1.0, abuf, adims_sz, rhs1_int_labels_, bbuf, bdims_sz, rhs2_int_labels_, cscale,
cbuf.data(), cdims_sz, lhs_int_labels_, hw, false, cbuf_dev_ptr, cbuf_tmp_dev_ptr);

#if(defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP))
if(hw == ExecutionHW::GPU) {
Expand All @@ -459,6 +457,7 @@ void ccsd_t2_cs(Scheduler& sch, const TiledIndexSpace& MO, const TiledIndexSpace
#if(defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP))
// copy to host
if(hw == ExecutionHW::GPU) {
TimerGuard tg_bc{&oprof.multOpBCTime};
TensorElType1* cbuf_tmp{nullptr};
cbuf_tmp = static_cast<TensorElType1*>(memHostPool.allocate(csize * sizeof(TensorElType1)));
std::memset(cbuf_tmp, 0, csize * sizeof(TensorElType1));
Expand Down Expand Up @@ -605,7 +604,7 @@ int main(int argc, char* argv[]) {

Scheduler sch{ec};

bool profile = false;
bool profile = true;

if(ec.print()) {
std::cout << tamm_git_info() << std::endl;
Expand Down

0 comments on commit f9ef0f8

Please sign in to comment.