Skip to content

Commit

Permalink
Fix profiler to report time spent on GPU kernels again instead of on …
Browse files Browse the repository at this point in the history
…'wait for parallel tasks'. (#8453)

Fix profiler to report time spent on GPU kernels again instead of on 'wait for parallel tasks'.
  • Loading branch information
mcourteaux authored Nov 4, 2024
1 parent 6ef825c commit 9ba1829
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 7 deletions.
12 changes: 11 additions & 1 deletion src/Profiling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,12 @@ class InjectProfiling : public IRMutator {
unconditionally_set_current_func(stack.back())});
}

/** Wrap statement `s` so that the statement produced by
 * decr_active_threads(profiler_instance) runs before it and the one
 * produced by incr_active_threads(profiler_instance) runs after it.
 * Nothing else is emitted: in particular, this helper adds no statement
 * that changes the currently recorded func/task id. */
Stmt suspend_thread_but_keep_task_id(const Stmt &s) {
    const Stmt leave_active_set = decr_active_threads(profiler_instance);
    const Stmt rejoin_active_set = incr_active_threads(profiler_instance);
    return Block::make({leave_active_set, s, rejoin_active_set});
}

private:
using IRMutator::visit;

Expand Down Expand Up @@ -499,7 +505,11 @@ class InjectProfiling : public IRMutator {
Stmt stmt = For::make(op->name, op->min, op->extent, op->for_type, op->partition_policy, op->device_api, body);

if (update_active_threads) {
stmt = suspend_thread(stmt);
if (Internal::is_gpu(op->for_type)) {
stmt = suspend_thread_but_keep_task_id(stmt);
} else {
stmt = suspend_thread(stmt);
}
}

return stmt;
Expand Down
3 changes: 2 additions & 1 deletion src/runtime/HalideRuntime.h
Original file line number Diff line number Diff line change
Expand Up @@ -910,7 +910,8 @@ extern int halide_device_sync(void *user_context, struct halide_buffer_t *buf);
* without specifying any buffer to synchronize on.
* Calling this with a null device_interface is always illegal.
*/
extern int halide_device_sync_global(void *user_context, const struct halide_device_interface_t *device_interface);
extern int halide_device_sync_global(void *user_context,
const struct halide_device_interface_t *device_interface);

/** Allocate device memory to back a halide_buffer_t. */
extern int halide_device_malloc(void *user_context, struct halide_buffer_t *buf,
Expand Down
5 changes: 3 additions & 2 deletions src/runtime/device_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -237,11 +237,12 @@ WEAK int halide_device_sync(void *user_context, struct halide_buffer_t *buf) {
* This variation of the synchronizing is useful when a synchronization is desirable
* without specifying any buffer to synchronize on.
*/
WEAK int halide_device_sync_global(void *user_context, const struct halide_device_interface_t *device_interface) {
WEAK int halide_device_sync_global(void *user_context,
const struct halide_device_interface_t *device_interface) {
if (device_interface == nullptr) {
return halide_error_code_no_device_interface;
}
// This function calls immediately the device_interface implementation to syncrhonize on
// This function calls immediately the device_interface implementation to synchronize on
// "no buffer" (i.e., nullptr buffer) to trigger a "global" device sync.
return device_interface->impl->device_sync(user_context, nullptr);
}
Expand Down
7 changes: 4 additions & 3 deletions src/runtime/profiler_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -517,9 +517,10 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st
for (int i = 0; i < p->num_funcs; i++) {
halide_profiler_func_stats *fs = p->funcs + i;

// The first id is always a catch-all overhead
// slot. Only report overhead time if it's non-zero
if (i == 0 && fs->time == 0) {
// The first id is always a catch-all overhead slot (notably containing the asserts).
// The second id is always the "wait for parallel tasks" slot.
// Only report these times if they are non-zero.
if ((i == 0 || i == 1) && fs->time == 0) {
continue;
}

Expand Down

0 comments on commit 9ba1829

Please sign in to comment.