Skip to content

Commit

Permalink
ocl: revised device-split, additional tuning param, and other improvements
Browse files Browse the repository at this point in the history

* Split into the maximum number of sub-devices if ACC_OPENCL_DEVSPLIT=1
  - If 1<ACC_OPENCL_DEVSPLIT, always split according to CL_DEVICE_PARTITION_EQUALLY
  - Prioritize CL_DEVICE_AFFINITY_DOMAIN_NUMA if ACC_OPENCL_DEVSPLIT=0|1
* Experimental support for XF. Remove XF=0 when storing JSON.
  - Handle the extension flag (XF), which currently has only one state bit.
* Other improvements/changes
  - Check the finally tuned kernel even when handling a signal (recursion).
  - Adjusted built-in default (OPENCL_LIBSMM_SMM_AL).
  • Loading branch information
hfp committed Oct 5, 2023
1 parent a8fd5be commit 685304a
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 54 deletions.
23 changes: 15 additions & 8 deletions src/acc/opencl/acc_opencl.c
Original file line number Diff line number Diff line change
Expand Up @@ -331,12 +331,14 @@ int c_dbcsr_acc_init(void) {
cl_device_partition_property properties[] = {
CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, CL_DEVICE_AFFINITY_DOMAIN_NUMA, /*terminator*/ 0};
cl_uint nunits = 0;
if (1 < devsplit &&
if (0 != devsplit &&
CL_SUCCESS == clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &nunits, NULL) &&
0 < nunits)
1 < nunits)
{
properties[0] = CL_DEVICE_PARTITION_EQUALLY;
properties[1] = (nunits + devsplit - 1) / devsplit;
if (1 < devsplit) {
properties[0] = CL_DEVICE_PARTITION_EQUALLY;
properties[1] = (nunits + devsplit - 1) / devsplit;
}
}
if ((NULL != env_devsplit && '0' == *env_devsplit) ||
(c_dbcsr_acc_opencl_config.ndevices + 1) == ACC_OPENCL_DEVICES_MAXCOUNT ||
Expand All @@ -347,7 +349,12 @@ int c_dbcsr_acc_init(void) {
++c_dbcsr_acc_opencl_config.ndevices;
}
# if defined(CL_VERSION_1_2)
else if (1 < n) { /* create subdevices */
else if (1 < n || 1 < nunits) { /* create subdevices */
if (1 < nunits) {
properties[0] = CL_DEVICE_PARTITION_EQUALLY;
properties[1] = 1;
n = nunits;
}
if (ACC_OPENCL_DEVICES_MAXCOUNT < (c_dbcsr_acc_opencl_config.ndevices + n)) {
n = (cl_uint)ACC_OPENCL_DEVICES_MAXCOUNT - c_dbcsr_acc_opencl_config.ndevices;
}
Expand Down Expand Up @@ -437,7 +444,7 @@ int c_dbcsr_acc_init(void) {
}
}
if (EXIT_SUCCESS == result && 0 < c_dbcsr_acc_opencl_config.ndevices) {
/* preselect any default device or prune to homogeneous set of GPUs */
/* preselect any default device or prune to homogeneous set of devices */
if (NULL == env_device || '\0' == *env_device) {
char tmp[ACC_OPENCL_BUFFERSIZE] = "";
ndevices = (cl_uint)c_dbcsr_acc_opencl_config.ndevices;
Expand All @@ -453,9 +460,9 @@ int c_dbcsr_acc_init(void) {
device_id = (int)i;
break;
}
else if (CL_DEVICE_TYPE_ALL == type && NULL == env_devtype && CL_DEVICE_TYPE_GPU == itype && device_id <= (int)i) {
else if (CL_DEVICE_TYPE_ALL == type && NULL == env_devtype /*&& CL_DEVICE_TYPE_GPU == itype*/ && device_id <= (int)i) {
result = clGetDeviceInfo(c_dbcsr_acc_opencl_config.devices[i], CL_DEVICE_NAME, ACC_OPENCL_BUFFERSIZE, buffer, NULL);
if (CL_SUCCESS == result /* prune for homogeneous set of GPUs */
if (CL_SUCCESS == result /* prune for homogeneous set of devices */
&& ('\0' == *tmp || 0 == strncmp(buffer, tmp, ACC_OPENCL_BUFFERSIZE)))
{
c_dbcsr_acc_opencl_config.ndevices = i + 1;
Expand Down
45 changes: 27 additions & 18 deletions src/acc/opencl/smm/opencl_libsmm.c
Original file line number Diff line number Diff line change
Expand Up @@ -807,11 +807,10 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v
cl_device_id active_device;
result = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &active_device, NULL);
if (CL_SUCCESS == result) {
const char* const param_format = "-DGLOBAL=%s -DINPLACE=%i -DFN=%s -DSM=%i -DSN=%i -DSWG=%i -DT=%s";
const char *const env_cl = getenv("OPENCL_LIBSMM_TRANS_BUILDOPTS"), *const env_bm = getenv("OPENCL_LIBSMM_TRANS_BM");
const char* const cmem = (EXIT_SUCCESS != opencl_libsmm_use_cmem(active_device) ? "global" : "constant");
const char *const env_options = getenv("OPENCL_LIBSMM_TRANS_BUILDOPTS"), *tname = "";
const char* const env_inplace = getenv("OPENCL_LIBSMM_TRANS_INPLACE");
const char* const env_bm = getenv("OPENCL_LIBSMM_TRANS_BM");
const char* const param_format = "-DGLOBAL=%s -DINPLACE=%i -DFN=%s -DSM=%i -DSN=%i -DSWG=%i -DT=%s";
const char *const env_inplace = getenv("OPENCL_LIBSMM_TRANS_INPLACE"), *tname = "";
# if defined(OPENCL_LIBSMM_TRANS_INPLACE)
const int inplace = ((m == n) && (NULL == env_inplace ? 1 : ('0' != *env_inplace)));
# else
Expand All @@ -837,7 +836,7 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v
default: assert('\0' == *tname);
}
new_config.wgsize = LIBXSMM_MIN((size_t)((m == bm || 0 == (m % bm)) ? bm : m), wgsize_max);
nchar = LIBXSMM_SNPRINTF(buffer, sizeof(buffer), "%s", NULL == env_options ? "" : env_options);
nchar = LIBXSMM_SNPRINTF(buffer, sizeof(buffer), "%s", NULL == env_cl ? "" : env_cl);
if (0 <= /*<*/ nchar && (int)sizeof(buffer) > nchar) {
nchar = LIBXSMM_SNPRINTF(
build_params, sizeof(build_params), param_format, cmem, inplace, fname, m, n, (int)new_config.wgsize, tname);
Expand Down Expand Up @@ -1255,6 +1254,14 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
const char *const env_tb = getenv("OPENCL_LIBSMM_SMM_TB"), *const env_tc = getenv("OPENCL_LIBSMM_SMM_TC");
const char *const env_ap = getenv("OPENCL_LIBSMM_SMM_AP"), *const env_aa = getenv("OPENCL_LIBSMM_SMM_AA");
const char *const env_ab = getenv("OPENCL_LIBSMM_SMM_AB"), *const env_ac = getenv("OPENCL_LIBSMM_SMM_AC");
const char *const env_xf = getenv("OPENCL_LIBSMM_SMM_XF"), *const env_cl = getenv("OPENCL_LIBSMM_SMM_BUILDOPTS");
const char* const intel_xf = "-cl-intel-256-GRF-per-thread";
const int cl_nonv = (0 != devinfo->intel || EXIT_SUCCESS != c_dbcsr_acc_opencl_device_vendor(
active_device, "nvidia", 0 /*use_platform_name*/));
const int cl_noamd =
0 != devinfo->intel || !cl_nonv ||
(EXIT_SUCCESS != c_dbcsr_acc_opencl_device_vendor(active_device, "amd", 0 /*use_platform_name*/) &&
EXIT_SUCCESS != c_dbcsr_acc_opencl_device_vendor(active_device, "amd", 1 /*use_platform_name*/));
const int blockm = ((NULL == env_bm || '\0' == *env_bm) ? 0 : atoi(env_bm));
const int blockn = ((NULL == env_bn || '\0' == *env_bn) ? 0 : atoi(env_bn));
const int blockk = ((NULL == env_bk || '\0' == *env_bk) ? 0 : atoi(env_bk));
Expand Down Expand Up @@ -1292,9 +1299,10 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
? (0 == kernel_idx ? (NULL == config ? /*default*/ 0 : config->nz) : /*default*/ 0)
: atoi(env_nz),
0, 1);
new_config.al = LIBXSMM_CLMP((NULL == env_al || '\0' == *env_al)
? (0 == kernel_idx ? (NULL == config ? /*default*/ 0 : config->al) : /*default*/ 0)
: atoi(env_al),
new_config.al = LIBXSMM_CLMP(
(NULL == env_al || '\0' == *env_al)
? (cl_noamd ? (0 == kernel_idx ? (NULL == config ? /*default*/ 0 : config->al) : /*default*/ 0) : 1)
: atoi(env_al),
0, 1);
new_config.tb = LIBXSMM_CLMP((NULL == env_tb || '\0' == *env_tb)
? (0 == kernel_idx ? (NULL == config ? /*default*/ 0 : config->tb) : /*default*/ 0)
Expand Down Expand Up @@ -1323,7 +1331,13 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
? (0 == kernel_idx ? (NULL == config ? /*default*/ default_ac : config->ac) : /*default*/ default_ac)
: atoi(env_ac),
0, 2);
new_config.flags = (NULL == config ? /*default*/ 0 : config->flags);
if (NULL == env_xf || '\0' == *env_xf) {
if (0 == devinfo->intel || NULL == env_cl || NULL == strstr(env_cl, intel_xf)) {
new_config.flags = (NULL == config ? /*default*/ 0 : config->flags);
}
else new_config.flags = 1;
}
else new_config.flags = atoi(env_xf);
if (0 >= new_config.s) new_config.s = stack_size;
if (0 == kernel_idx || 1 >= new_config.bs) new_config.bs = bs;
nbm = (m_max + new_config.bm - 1) / new_config.bm;
Expand Down Expand Up @@ -1398,7 +1412,6 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
}
if (new_config.wgsize[kernel_idx] <= wgsize_max) { /* SMMs can be potentially handled by device */
const char* const cmem = (EXIT_SUCCESS != opencl_libsmm_use_cmem(active_device) ? "global" : "constant");
const char* const env_options = getenv("OPENCL_LIBSMM_SMM_BUILDOPTS");
const char* const env_barrier = getenv("OPENCL_LIBSMM_SMM_BARRIER");
const char* const env_atomics = getenv("OPENCL_LIBSMM_SMM_ATOMICS");
const char* const env_nrepeat = getenv("SMM_NREPEAT");
Expand All @@ -1414,8 +1427,6 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
if (NULL == env_atomics || '0' != *env_atomics) {
/* atomics_force: attempt to force atomics without confirmation */
const int atomics_force = ((NULL == env_atomics || '\0' == *env_atomics) ? 0 : atoi(env_atomics));
const int cl_nonv = (EXIT_SUCCESS !=
c_dbcsr_acc_opencl_device_vendor(active_device, "nvidia", 0 /*use_platform_name*/));
if (NULL == env_atomics || '\0' == *env_atomics || 0 != atomics_force) {
cl_bitfield fp_atomics;
assert(dbcsr_type_real_8 == datatype || dbcsr_type_real_4 == datatype);
Expand Down Expand Up @@ -1456,10 +1467,8 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
}
else if (cl_nonv) {
int gfx90 = 0;
if ((EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_device, "amd", 0 /*use_platform_name*/) ||
EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_device, "amd", 1 /*use_platform_name*/)) &&
EXIT_SUCCESS == c_dbcsr_acc_opencl_device_name(active_device, buffer, ACC_OPENCL_BUFFERSIZE,
NULL /*platform*/, 0 /*platform_maxlen*/, /*cleanup*/ 1))
if (!cl_noamd && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_name(active_device, buffer, ACC_OPENCL_BUFFERSIZE,
NULL /*platform*/, 0 /*platform_maxlen*/, /*cleanup*/ 1))
{
const char* const gfxname = LIBXSMM_STRISTR(buffer, "gfx");
if (NULL != gfxname && 90 <= atoi(gfxname + 3)) gfx90 = 1;
Expand Down Expand Up @@ -1534,8 +1543,8 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
# else
const char* const cl_debug = "";
# endif
nchar = LIBXSMM_SNPRINTF(buffer, sizeof(buffer), "%s %s -cl-fast-relaxed-math -cl-denorms-are-zero",
NULL == env_options ? "" : env_options, cl_debug);
nchar = LIBXSMM_SNPRINTF(buffer, sizeof(buffer), "-cl-fast-relaxed-math -cl-denorms-are-zero %s %s %s",
NULL == env_cl ? "" : env_cl, (0 == new_config.flags || 0 == devinfo->intel) ? "" : intel_xf, cl_debug);
if (0 >= nchar || (int)sizeof(buffer) <= nchar) result = EXIT_FAILURE;
}
else result = EXIT_FAILURE;
Expand Down
Loading

0 comments on commit 685304a

Please sign in to comment.