Skip to content

Commit

Permalink
ocl: removed unused code
Browse files Browse the repository at this point in the history
* Removed support for sharing streams among threads, removed support for flush-bits.
* OPENCL_LIBSMM_VALIDATE_SMM: rely on libxsmm_matdiff_epsilon.
* Revised console output (acc_bench_smm).
* Code cleanup.
  • Loading branch information
hfp committed Oct 11, 2023
1 parent 81c6213 commit 1cbc5f3
Show file tree
Hide file tree
Showing 6 changed files with 77 additions and 136 deletions.
20 changes: 12 additions & 8 deletions src/acc/acc_bench_smm.c
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ int main(int argc, char* argv[]) {
const char *snc = NULL, *sna = NULL, *snb = NULL;
FILE* file = NULL;
#if defined(USE_LIBXSMM) && defined(VALIDATE)
double maxerror = 0;
double maxdiff = 0;
#else
DBCSR_MARK_USED(check);
#endif
Expand Down Expand Up @@ -480,16 +480,20 @@ int main(int argc, char* argv[]) {
/* transfer result from device to host for validation */
CHECK(c_dbcsr_acc_memcpy_d2h(cmat_dev, cmat_hst, sizeof(ELEM_TYPE) * mn * nc, stream), &result, check);
CHECK(c_dbcsr_acc_stream_sync(stream), &result, check);
# if LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER
# if defined(USE_LIBXSMM)
if (EXIT_SUCCESS == result) {
libxsmm_matdiff_info diff;
/* validate result buffers at once (including excess/padded space) */
result = libxsmm_matdiff(&diff, LIBXSMM_DATATYPE(ELEM_TYPE), mn, nc, gold_hst, cmat_hst, &mn, &mn);
if (EXIT_SUCCESS == result) {
const double relerror = 1.0 - diff.rsq;
PRINTF("rel.error: %g", relerror);
if (maxerror < relerror && NULL != file) maxerror = relerror;
if (0 < relerror) {
# if defined(USE_LIBXSMM) && LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER
const double epsilon = libxsmm_matdiff_epsilon(&diff); /* 1.0 - diff.rsq */
# else
const double epsilon = diff.normf_rel;
# endif
PRINTF("diff.cur: %g", epsilon);
if (maxdiff < epsilon && NULL != file) maxdiff = epsilon;
if (0 < epsilon) {
if (LIBXSMM_NOTNAN(diff.v_tst)) {
PRINTF(" (%g != %g)\n", diff.v_ref, diff.v_tst);
}
Expand All @@ -500,7 +504,7 @@ int main(int argc, char* argv[]) {
else {
PRINTF("\n");
}
if (0 < check && check < relerror) result = EXIT_FAILURE;
if (0 < check && check < epsilon) result = EXIT_FAILURE;
}
}
# endif
Expand Down Expand Up @@ -546,7 +550,7 @@ int main(int argc, char* argv[]) {
#endif
CHECK(c_dbcsr_acc_finalize(), NULL, check);
#if defined(USE_LIBXSMM) && defined(VALIDATE)
if (1 < nok) printf("\nmax.error: %g\n", maxerror);
if (1 < nok) printf("\ndiff.max: %g\n", maxdiff);
#endif
if (EXIT_SUCCESS != result) {
if (NULL != file) fclose(file);
Expand Down
34 changes: 12 additions & 22 deletions src/acc/opencl/acc_opencl.c
Original file line number Diff line number Diff line change
Expand Up @@ -207,9 +207,8 @@ int c_dbcsr_acc_init(void) {
const char *const env_devmatch = getenv("ACC_OPENCL_DEVMATCH"), *const env_devtype = getenv("ACC_OPENCL_DEVTYPE");
const char *const env_priority = getenv("ACC_OPENCL_PRIORITY"), *const env_xhints = getenv("ACC_OPENCL_XHINTS");
const char *const env_devcopy = getenv("ACC_OPENCL_DEVCOPY"), *const env_verbose = getenv("ACC_OPENCL_VERBOSE");
const char *const env_dump_acc = getenv("ACC_OPENCL_DUMP"), *const env_share = getenv("ACC_OPENCL_SHARE");
const char *const env_device = getenv("ACC_OPENCL_DEVICE"), *const env_async = getenv("ACC_OPENCL_ASYNC");
const char *const env_flush = getenv("ACC_OPENCL_FLUSH"), *const env_timer = getenv("ACC_OPENCL_TIMER");
const char *const env_device = getenv("ACC_OPENCL_DEVICE"), *const env_dump_acc = getenv("ACC_OPENCL_DUMP");
const char *const env_async = getenv("ACC_OPENCL_ASYNC"), *const env_timer = getenv("ACC_OPENCL_TIMER");
const char* const env_dump = (NULL != env_dump_acc ? env_dump_acc : getenv("IGC_ShaderDumpEnable"));
# if defined(ACC_OPENCL_NCCS) && (0 < ACC_OPENCL_NCCS)
const char *const env_zex = getenv("ZEX_NUMBER_OF_CCS"), *const env_nccs = getenv("ACC_OPENCL_NCCS");
Expand All @@ -233,17 +232,12 @@ int c_dbcsr_acc_init(void) {
c_dbcsr_acc_opencl_config.priority = (NULL == env_priority ? /*default*/ 3 : atoi(env_priority));
c_dbcsr_acc_opencl_config.devcopy = (NULL == env_devcopy ? /*default*/ 0 : atoi(env_devcopy));
c_dbcsr_acc_opencl_config.xhints = (NULL == env_xhints ? /*default*/ 5 : atoi(env_xhints));
c_dbcsr_acc_opencl_config.share = (NULL == env_share ? /*default*/ 0 : atoi(env_share));
c_dbcsr_acc_opencl_config.async = (NULL == env_async ? /*default*/ 3 : atoi(env_async));
c_dbcsr_acc_opencl_config.flush = (NULL == env_flush ? /*default*/ 0 : atoi(env_flush));
c_dbcsr_acc_opencl_config.dump = (NULL == env_dump ? /*default*/ 0 : atoi(env_dump));
if (EXIT_SUCCESS != c_dbcsr_acc_opencl_device_uid(NULL /*device*/, env_devmatch, &c_dbcsr_acc_opencl_config.devmatch)) {
c_dbcsr_acc_opencl_config.devmatch = 1;
}
libxsmm_init();
/* sanitize ACC_OPENCL_SHARE */
if (1 == c_dbcsr_acc_opencl_config.share) c_dbcsr_acc_opencl_config.share = 2;
else if (0 > c_dbcsr_acc_opencl_config.share) c_dbcsr_acc_opencl_config.share = 0;
if (NULL != env_timer && (c_dbcsr_acc_opencl_timer_host == atoi(env_timer) ||
(env_timer == LIBXSMM_STRISTR(env_timer, "host") && 4 == strlen(env_timer)) ||
(env_timer == LIBXSMM_STRISTR(env_timer, "cpu") && 3 == strlen(env_timer))))
Expand Down Expand Up @@ -1044,22 +1038,18 @@ int c_dbcsr_acc_set_active_device(int device_id) {


/**
 * Waits for the streams registered for the given thread: walks the thread's slice of
 * c_dbcsr_acc_opencl_config.streams (ACC_OPENCL_STREAMS_MAXCOUNT slots per thread) and
 * calls c_dbcsr_acc_stream_sync on each entry, stopping at the first empty (NULL) slot
 * or at the first failing sync. Returns EXIT_SUCCESS, or the result of the failing sync.
 *
 * NOTE(review): this listing is a rendered diff without +/- markers, i.e., the previous
 * body (guarded by config.flush/config.share) and the new unconditional body appear
 * interleaved and the braces do not balance as shown. Presumably the guarded variant is
 * the removed one (the commit message mentions removing stream sharing and flush-bits);
 * confirm against the repository before treating these lines as compilable code.
 */
int c_dbcsr_acc_opencl_device_synchronize(int thread_id) {
/* first stream slot belonging to thread_id */
void** const streams = c_dbcsr_acc_opencl_config.streams + ACC_OPENCL_STREAMS_MAXCOUNT * thread_id;
int result = EXIT_SUCCESS;
/* NOTE(review): presumably the removed variant; it only synchronizes if bit 4 of
 * config.flush is unset, and config.share is zero or divides thread_id. */
if ((0 == (4 & c_dbcsr_acc_opencl_config.flush)) &&
(0 == c_dbcsr_acc_opencl_config.share || 0 == (thread_id % c_dbcsr_acc_opencl_config.share)))
{
void** const streams = c_dbcsr_acc_opencl_config.streams + ACC_OPENCL_STREAMS_MAXCOUNT * thread_id;
int i = 0;
assert(0 <= thread_id && thread_id < c_dbcsr_acc_opencl_config.nthreads);
assert(NULL != c_dbcsr_acc_opencl_config.streams);
for (; i < ACC_OPENCL_STREAMS_MAXCOUNT; ++i) {
void* const stream = streams[i];
if (NULL != stream) {
result = c_dbcsr_acc_stream_sync(stream);
if (EXIT_SUCCESS != result) break;
}
else break;
/* NOTE(review): presumably the added variant; same loop without the flush/share guard. */
int i = 0;
assert(0 <= thread_id && thread_id < c_dbcsr_acc_opencl_config.nthreads);
assert(NULL != c_dbcsr_acc_opencl_config.streams);
for (; i < ACC_OPENCL_STREAMS_MAXCOUNT; ++i) {
void* const stream = streams[i];
if (NULL != stream) {
result = c_dbcsr_acc_stream_sync(stream);
if (EXIT_SUCCESS != result) break; /* report the first failing sync */
}
else break; /* the first empty slot ends the walk */
}
return result;
}
Expand Down
7 changes: 2 additions & 5 deletions src/acc/opencl/acc_opencl.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@
# define ACC_OPENCL_MAXSTRLEN 48
#endif
#if !defined(ACC_OPENCL_DEVICES_MAXCOUNT)
# define ACC_OPENCL_DEVICES_MAXCOUNT 256
# define ACC_OPENCL_DEVICES_MAXCOUNT 64
#endif
/** Counted on a per-thread basis! */
#if !defined(ACC_OPENCL_HANDLES_MAXCOUNT)
Expand Down Expand Up @@ -112,6 +112,7 @@
# define ACC_OPENCL_STREAM_PRIORITIES
# endif
#endif
/** Use DBCSR's profile for detailed timings */
#if !defined(ACC_OPENCL_PROFILE) && 0
# define ACC_OPENCL_PROFILE
#endif
Expand Down Expand Up @@ -262,12 +263,8 @@ typedef struct c_dbcsr_acc_opencl_config_t {
cl_int devcopy;
/** Execution-hints (command stream). */
cl_int xhints;
/** Share streams across threads. */
cl_int share;
/** Asynchronous memory ops. */
cl_int async;
/** Flush level. */
cl_int flush;
/** Dump level. */
cl_int dump;
} c_dbcsr_acc_opencl_config_t;
Expand Down
7 changes: 1 addition & 6 deletions src/acc/opencl/acc_opencl_event.c
Original file line number Diff line number Diff line change
Expand Up @@ -192,12 +192,7 @@ int c_dbcsr_acc_event_query(void* event, c_dbcsr_acc_bool_t* has_occurred) {
# endif
assert(NULL != event && NULL != has_occurred);
result = clGetEventInfo(*ACC_OPENCL_EVENT(event), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, NULL);
if (CL_SUCCESS == result && 0 <= status) {
*has_occurred = (CL_COMPLETE == status ? 1 : 0);
if (0 == *has_occurred && 0 != (8 & c_dbcsr_acc_opencl_config.flush)) {
result = c_dbcsr_acc_opencl_device_synchronize(ACC_OPENCL_OMP_TID());
}
}
if (CL_SUCCESS == result && 0 <= status) *has_occurred = (CL_COMPLETE == status ? 1 : 0);
else { /* error state */
# if defined(ACC_OPENCL_EVENT_CREATE)
if (CL_SUCCESS == result) result = EXIT_FAILURE;
Expand Down
135 changes: 42 additions & 93 deletions src/acc/opencl/acc_opencl_stream.c
Original file line number Diff line number Diff line change
Expand Up @@ -110,86 +110,52 @@ int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int priority) {
cl_device_id device = NULL;
result = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &device, NULL);
if (CL_SUCCESS == result) {
const int s = c_dbcsr_acc_opencl_config.share;
if (0 == s || 0 == (tid % s)) {
if (0 != c_dbcsr_acc_opencl_config.device[tid].intel) {
const int xhints = ((1 == c_dbcsr_acc_opencl_config.xhints || 0 > c_dbcsr_acc_opencl_config.xhints)
? (0 != c_dbcsr_acc_opencl_config.device[tid].intel ? 1 : 0)
: (c_dbcsr_acc_opencl_config.xhints >> 1));
if (0 != (1 & xhints)) { /* attempt to enable command aggregation */
const ACC_OPENCL_STREAM_PROPERTIES_TYPE props[4] = {
CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0 /* terminator */
};
const cl_command_queue q = ACC_OPENCL_CREATE_COMMAND_QUEUE(context, device, props, &result);
if (CL_SUCCESS == result) {
c_dbcsr_acc_opencl_config.timer = c_dbcsr_acc_opencl_timer_host; /* force host-timer */
clReleaseCommandQueue(q);
}
else result = CL_SUCCESS;
}
if (0 != (2 & xhints)) { /* attempt to enable queue families */
struct {
cl_command_queue_properties properties;
cl_bitfield capabilities;
cl_uint count;
char name[64 /*CL_QUEUE_FAMILY_MAX_NAME_SIZE_INTEL*/];
} intel_qfprops[16];
size_t nbytes = 0, i;
if (CL_SUCCESS == clGetDeviceInfo(device, 0x418B /*CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL*/, sizeof(intel_qfprops),
intel_qfprops, &nbytes))
{
for (i = 0; (i * sizeof(*intel_qfprops)) < nbytes; ++i) {
if (0 /*CL_QUEUE_DEFAULT_CAPABILITIES_INTEL*/ == intel_qfprops[i].capabilities && 1 < intel_qfprops[i].count) {
const int j = (0 /*terminator*/ == properties[2] ? 2 : 4);
properties[j + 0] = 0x418C; /* CL_QUEUE_FAMILY_INTEL */
properties[j + 1] = (int)i;
properties[j + 2] = 0x418D; /* CL_QUEUE_INDEX_INTEL */
properties[j + 3] = (i + offset) % intel_qfprops[i].count;
properties[j + 4] = 0; /* terminator */
break;
}
}
}
if (0 != c_dbcsr_acc_opencl_config.device[tid].intel) {
const int xhints = ((1 == c_dbcsr_acc_opencl_config.xhints || 0 > c_dbcsr_acc_opencl_config.xhints)
? (0 != c_dbcsr_acc_opencl_config.device[tid].intel ? 1 : 0)
: (c_dbcsr_acc_opencl_config.xhints >> 1));
if (0 != (1 & xhints)) { /* attempt to enable command aggregation */
const ACC_OPENCL_STREAM_PROPERTIES_TYPE props[4] = {
CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0 /* terminator */
};
const cl_command_queue q = ACC_OPENCL_CREATE_COMMAND_QUEUE(context, device, props, &result);
if (CL_SUCCESS == result) {
c_dbcsr_acc_opencl_config.timer = c_dbcsr_acc_opencl_timer_host; /* force host-timer */
clReleaseCommandQueue(q);
}
else result = CL_SUCCESS;
}
if ((c_dbcsr_acc_opencl_timer_device == c_dbcsr_acc_opencl_config.timer) &&
(3 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity))
{
properties[1] = CL_QUEUE_PROFILING_ENABLE;
}
queue = ACC_OPENCL_CREATE_COMMAND_QUEUE(context, device, properties, &result);
}
else { /* attempt to share existing stream */
const int maxn = c_dbcsr_acc_opencl_config.nthreads;
cl_command_queue stream = NULL;
assert(0 < tid);
for (i = 0; i < maxn; ++i) {
int j = i + tid - 1, t = (j < maxn ? j : (j - maxn));
if (0 < t && t != tid) { /* avoid cloning master's and own streams */
streams = c_dbcsr_acc_opencl_config.streams + ACC_OPENCL_STREAMS_MAXCOUNT * t;
for (j = 0; j < ACC_OPENCL_STREAMS_MAXCOUNT; ++j) {
if (NULL != streams[j]) {
stream = *ACC_OPENCL_STREAM(streams[j]);
i = maxn; /* break outer loop */
if (0 != (2 & xhints)) { /* attempt to enable queue families */
struct {
cl_command_queue_properties properties;
cl_bitfield capabilities;
cl_uint count;
char name[64 /*CL_QUEUE_FAMILY_MAX_NAME_SIZE_INTEL*/];
} intel_qfprops[16];
size_t nbytes = 0, i;
if (CL_SUCCESS == clGetDeviceInfo(device, 0x418B /*CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL*/, sizeof(intel_qfprops),
intel_qfprops, &nbytes))
{
for (i = 0; (i * sizeof(*intel_qfprops)) < nbytes; ++i) {
if (0 /*CL_QUEUE_DEFAULT_CAPABILITIES_INTEL*/ == intel_qfprops[i].capabilities && 1 < intel_qfprops[i].count) {
const int j = (0 /*terminator*/ == properties[2] ? 2 : 4);
properties[j + 0] = 0x418C; /* CL_QUEUE_FAMILY_INTEL */
properties[j + 1] = (int)i;
properties[j + 2] = 0x418D; /* CL_QUEUE_INDEX_INTEL */
properties[j + 3] = (i + offset) % intel_qfprops[i].count;
properties[j + 4] = 0; /* terminator */
break;
}
}
}
}
if (NULL != stream) { /* clone existing stream */
result = clRetainCommandQueue(stream);
}
else {
for (i = 0; i < ACC_OPENCL_STREAMS_MAXCOUNT; ++i) { /* adopt master's stream created last */
if (NULL != c_dbcsr_acc_opencl_config.streams[i]) {
stream = *ACC_OPENCL_STREAM(c_dbcsr_acc_opencl_config.streams[i]);
result = clRetainCommandQueue(stream);
}
else break;
}
}
if (EXIT_SUCCESS == result) queue = stream;
}
if ((c_dbcsr_acc_opencl_timer_device == c_dbcsr_acc_opencl_config.timer) &&
(3 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity))
{
properties[1] = CL_QUEUE_PROFILING_ENABLE;
}
queue = ACC_OPENCL_CREATE_COMMAND_QUEUE(context, device, properties, &result);
}
}
else {
Expand Down Expand Up @@ -267,14 +233,8 @@ int c_dbcsr_acc_stream_destroy(void* stream) {
memmove(streams + i, streams + j, sizeof(void*) * (ACC_OPENCL_STREAMS_MAXCOUNT - j));
}
streams[ACC_OPENCL_STREAMS_MAXCOUNT - j] = NULL;
/* consider breaking outer loop */
if (0 == c_dbcsr_acc_opencl_config.share) {
tid = c_dbcsr_acc_opencl_config.nthreads;
result = result_release; /* promote */
}
else if (EXIT_SUCCESS != result_release) {
tid = c_dbcsr_acc_opencl_config.nthreads;
}
tid = c_dbcsr_acc_opencl_config.nthreads; /* leave outer loop */
result = result_release; /* promote */
break;
}
else if (NULL == streams[i]) { /* compact streams */
Expand Down Expand Up @@ -335,7 +295,7 @@ int c_dbcsr_acc_stream_priority_range(int* least, int* greatest) {
int c_dbcsr_acc_stream_sync(void* stream) {
int result = EXIT_SUCCESS;
# if defined(ACC_OPENCL_STREAM_PRIORITIES)
const int* const priority = (0 == (1 & c_dbcsr_acc_opencl_config.flush) ? NULL : c_dbcsr_acc_opencl_stream_priority(stream));
const int* const priority = NULL;
# endif
# if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE)
int routine_handle;
Expand All @@ -344,18 +304,7 @@ int c_dbcsr_acc_stream_sync(void* stream) {
c_dbcsr_timeset((const char**)&routine_name_ptr, &routine_name_len, &routine_handle);
# endif
assert(NULL != stream);
# if defined(ACC_OPENCL_STREAM_PRIORITIES)
if (NULL != priority && CL_QUEUE_PRIORITY_HIGH_KHR <= *priority && CL_QUEUE_PRIORITY_MED_KHR > *priority) {
if (0 != (2 & c_dbcsr_acc_opencl_config.flush)) {
result = clFlush(*ACC_OPENCL_STREAM(stream));
}
}
else
# endif
{
const cl_command_queue queue = *ACC_OPENCL_STREAM(stream);
result = clFinish(queue);
}
result = clFinish(*ACC_OPENCL_STREAM(stream));
# if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE)
c_dbcsr_timestop(&routine_handle);
# endif
Expand Down
10 changes: 8 additions & 2 deletions src/acc/opencl/smm/opencl_libsmm.c
Original file line number Diff line number Diff line change
Expand Up @@ -1672,7 +1672,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
&blob, precision, m_max, n_max, k_max, m_max, k_max, m_max, LIBXSMM_GEMM_FLAG_NONE, LIBXSMM_PREFETCH_NONE);
const size_t scratch_size = psize + asize + bsize + csize + csize + k_max * n_max * typesize +
5 * (LIBXSMM_ALIGNMENT - 1) /*alignments*/;
scratch = libxsmm_aligned_malloc(scratch_size, LIBXSMM_ALIGNMENT);
scratch = libxsmm_aligned_scratch(scratch_size, LIBXSMM_ALIGNMENT);
if (NULL != desc && NULL != scratch) {
pinp = (int*)scratch;
ainp = (char*)LIBXSMM_UP2((uintptr_t)pinp + psize, LIBXSMM_ALIGNMENT);
Expand Down Expand Up @@ -1783,10 +1783,16 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
/* some result may be validated multiple times in case of duplicated c-indexes */
for (i = 0; i < ((size_t)stack_size * nparams); i += nparams) {
const size_t ic = (size_t)(params[i + 2] - 1) * typesize;
double epsilon = 0;
libxsmm_matdiff_info diff;
libxsmm_matdiff(
&diff, (libxsmm_datatype)precision, m_max, n_max, gold + ic, test + ic, &m_max /*ldref*/, &m_max /*ldtst*/);
if (tolerance < diff.normf_rel) {
# if LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER
epsilon = libxsmm_matdiff_epsilon(&diff);
# else
epsilon = diff.normf_rel;
# endif
if (tolerance < epsilon) {
if (0 == c_dbcsr_acc_opencl_config.verbosity) {
fprintf(stderr, "libsmm_acc_process(size=%i, type=%s, m=%i, n=%i, k=%i, max=%i, stream=%p)", stack_size,
dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m_max, n_max, k_max,
Expand Down

0 comments on commit 1cbc5f3

Please sign in to comment.