From 86905e74b5a141c03c24fd29b77ee052d1e8d66f Mon Sep 17 00:00:00 2001 From: Hans Pabst Date: Fri, 29 Sep 2023 10:26:12 +0200 Subject: [PATCH] ocl: GCN atomics for Mi-GPUs and other improvements * GCN: Rely on builtin atomics (Mi-GPUs); TODO: tuned params and check if supported. * Allow relying on platform name instead of device name (c_dbcsr_acc_opencl_device_vendor). * Introduced optional kernel-flags (opencl_libsmm_smm_t); needs some followup work later. * Improved kernel by relying on work_group_broadcast in general. * improved documentation for CUDA/OpenCL stand-alone drivers * Modernized Shell script (acc_opencl.sh). * Improved some runtime error messages. * Some more debug/developer settings. --- src/acc/README.md | 20 +++++++-- src/acc/opencl/acc_opencl.c | 29 +++++++++---- src/acc/opencl/acc_opencl.h | 18 ++++---- src/acc/opencl/acc_opencl.sh | 12 +++--- src/acc/opencl/acc_opencl_stream.c | 20 ++++----- src/acc/opencl/smm/kernels/multiply.cl | 11 +++-- src/acc/opencl/smm/opencl_libsmm.c | 57 +++++++++++++++++++------- src/acc/opencl/smm/opencl_libsmm.h | 2 +- 8 files changed, 114 insertions(+), 55 deletions(-) diff --git a/src/acc/README.md b/src/acc/README.md index c595f09b70a..d753d0cf6c5 100644 --- a/src/acc/README.md +++ b/src/acc/README.md @@ -8,13 +8,25 @@ The code for both the CUDA and the HIP backend is unified, and can be found in t ## Drivers -There are two stand-alone sample codes or drivers exercising the ACC-interface. The driver code (only depending on above mentioned interfaces) can be built locally and in a rather self-contained fashion, i.e., no DBCSR library is needed (except runtime libraries such as CUDA, HIP, OpenCL). For OpenCL, the LIBXSMM library is mandatory. +There are two stand-alone sample codes or drivers exercising the ACC-interface. 
The driver code (only depending on above mentioned interfaces) can be built locally and in a rather self-contained fashion, i.e., no DBCSR library is needed (except runtime libraries such as CUDA, HIP, OpenCL). For OpenCL, the LIBXSMM library is mandatory and preferred as baseline and for validation in any case. The driver code expects LIBXSMM to be present and prebuilt in a folder `libxsmm` in parallel to DBCSR's root directory (`dbcsr`), for example: -To build the driver code, a folder `libxsmm` in parallel to DBCSR's root directory (`dbcsr`) is expected to be present and prebuilt (`make GNU=1` in LIBXSMM's root directory). To build the driver code, change into the respective backend folder (`cuda` or `opencl`), and invoke `make` (`DBG=0|1|2` is supported among other optional key-value pairs). +```bash +git clone -b main https://github.com/libxsmm/libxsmm.git +cd libxsmm +make GNU=1 -j +``` + +To build the driver code (`opencl` in the example below), change into the respective backend folder (`cuda` or `opencl`), and invoke `make` (`DBG=0|1|2` is supported among other optional key-value pairs). + +```bash +git clone https://github.com/cp2k/dbcsr.git +cd dbcsr/src/acc/opencl +make +``` -**NOTE**: To activate a certain device, the drivers consider an environment variable called `DEVICE`. For example, `DEVICE=1 ./acc_bench_trans` activates the second device (at least two devices must be discovered). +**NOTE**: To activate a certain device, the drivers consider an environment variable called `DEVICE`. For example, `DEVICE=1 ./acc_bench_trans` activates the second device (at least two devices must be discovered). This environment variable is implemented by the driver code and meant to work across backends, i.e., the OpenCL backend also supports `ACC_OPENCL_DEVICE=1` (see Developer Guide for the OpenCL backend). -The drivers support a few command line options (_nrepeat_, _stack_size_, _m_, _n_, ...). 
Command line arguments are positional but allow `0` as placeholder to access the default value (`acc_bench_smm 0 0 5 13 5` performs the default number of repetitions with the default stacksize when running the 5x13x5-kernel). For example, running the tranpose benchmark may look like: +The drivers support command line options (_nrepeat_, _stack_size_, _m_, _n_, ...). Command line arguments are positional but allow `0` as placeholder to refer to the default value (`acc_bench_smm 0 0 5 13 5` performs the default number of repetitions with the default stacksize when running the 5x13x5-kernel). For example, running the transpose benchmark may look like: ```bash $ OMP_PROC_BIND=TRUE ./acc_bench_trans 5 30000 23 23 diff --git a/src/acc/opencl/acc_opencl.c b/src/acc/opencl/acc_opencl.c index e0217897055..33b020c41e2 100644 --- a/src/acc/opencl/acc_opencl.c +++ b/src/acc/opencl/acc_opencl.c @@ -728,12 +728,23 @@ int c_dbcsr_acc_opencl_device_id(cl_device_id device, int* device_id, int* globa } -int c_dbcsr_acc_opencl_device_vendor(cl_device_id device, const char vendor[]) { +int c_dbcsr_acc_opencl_device_vendor(cl_device_id device, const char vendor[], int use_platform_name) { char buffer[ACC_OPENCL_BUFFERSIZE]; int result = EXIT_SUCCESS; assert(NULL != device && NULL != vendor); - ACC_OPENCL_CHECK( - clGetDeviceInfo(device, CL_DEVICE_VENDOR, ACC_OPENCL_BUFFERSIZE, buffer, NULL), "retrieve device vendor", result); + if (0 == use_platform_name) { + ACC_OPENCL_CHECK( + clGetDeviceInfo(device, CL_DEVICE_VENDOR, ACC_OPENCL_BUFFERSIZE, buffer, NULL), "retrieve device vendor", result); + } + else { + cl_platform_id platform_id; + ACC_OPENCL_CHECK( + clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform_id, NULL), "retrieve platform id", result); + if (EXIT_SUCCESS == result) { + ACC_OPENCL_CHECK( + clGetPlatformInfo(platform_id, CL_PLATFORM_NAME, ACC_OPENCL_BUFFERSIZE, buffer, NULL), "retrieve platform name", result); + } + } if (EXIT_SUCCESS == result) { 
result = (NULL != LIBXSMM_STRISTR(buffer, vendor) ? EXIT_SUCCESS : EXIT_FAILURE); } @@ -744,7 +755,7 @@ int c_dbcsr_acc_opencl_device_vendor(cl_device_id device, const char vendor[]) { int c_dbcsr_acc_opencl_device_uid(cl_device_id device, const char devname[], unsigned int* uid) { int result; if (NULL != uid) { - if (NULL != device && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(device, "intel")) { + if (NULL != device && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(device, "intel", 0 /*use_platform_name*/)) { result = clGetDeviceInfo(device, 0x4251 /*CL_DEVICE_ID_INTEL*/, sizeof(unsigned int), uid, NULL); } else result = EXIT_FAILURE; @@ -931,7 +942,9 @@ int c_dbcsr_acc_opencl_create_context(int thread_id, cl_device_id active_id) { } } } - else if (CL_INVALID_DEVICE == result && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "nvidia")) { + else if (CL_INVALID_DEVICE == result && + EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "nvidia", 0 /*use_platform_name*/)) + { fprintf(stderr, "WARN ACC/OpenCL: if MPI-ranks target the same device in exclusive mode,\n" " SMI must be used to enable sharing the device.\n"); } @@ -996,7 +1009,7 @@ int c_dbcsr_acc_opencl_set_active_device(int thread_id, int device_id) { c_dbcsr_acc_opencl_config.device[thread_id].uid = (cl_uint)-1; } c_dbcsr_acc_opencl_config.device[thread_id].intel = - (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "intel") ? CL_TRUE : CL_FALSE); + (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "intel", 0 /*use_platform_name*/) ? CL_TRUE : CL_FALSE); } } } @@ -1259,7 +1272,9 @@ int c_dbcsr_acc_opencl_kernel(int source_is_file, const char source[], const cha const int cl_std_len = (int)strlen(cl_std); nchar = LIBXSMM_SNPRINTF(buffer, sizeof(buffer), ACC_OPENCL_CPPBIN " -P -C -nostdinc -D__OPENCL_VERSION__=%u %s %s %s %s >%s.cl", 100 * level_major + 10 * level_minor, - EXIT_SUCCESS != c_dbcsr_acc_opencl_device_vendor(active_id, "nvidia") ? 
"" : "-D__NV_CL_C_VERSION", + EXIT_SUCCESS != c_dbcsr_acc_opencl_device_vendor(active_id, "nvidia", 0 /*use_platform_name*/) + ? "" + : "-D__NV_CL_C_VERSION", NULL != build_params ? build_params : "", buffer_name, sed_pattern, kernel_name); if (0 < nchar && (int)sizeof(buffer) > nchar && (0 == cl_std_len || (3 == write(file_tmp, "/*\n", 3) && cl_std_len == write(file_tmp, cl_std, cl_std_len) && diff --git a/src/acc/opencl/acc_opencl.h b/src/acc/opencl/acc_opencl.h index 533b5964dbd..f9a4708a807 100644 --- a/src/acc/opencl/acc_opencl.h +++ b/src/acc/opencl/acc_opencl.h @@ -135,14 +135,18 @@ # define ACC_OPENCL_OMP_TID() (/*main*/ 0) #endif -#if LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER -# define ACC_OPENCL_EXPECT(EXPR) LIBXSMM_EXPECT(EXPR) -#else -# define ACC_OPENCL_EXPECT(EXPR) \ - if (0 == (EXPR)) assert(0); +#if 1 +# if LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER +# define ACC_OPENCL_EXPECT(EXPR) LIBXSMM_EXPECT(EXPR) +# else +# define ACC_OPENCL_EXPECT(EXPR) \ + if (0 == (EXPR)) assert(0); +# endif +#else /* elide */ +# define ACC_OPENCL_EXPECT(EXPR) (void)(EXPR) #endif -#if !defined(NDEBUG) +#if !defined(NDEBUG) && 1 # define ACC_OPENCL_CHECK(EXPR, MSG, RESULT) \ do { \ if (EXIT_SUCCESS == (RESULT)) { \ @@ -301,7 +305,7 @@ int c_dbcsr_acc_opencl_device(int thread_id, cl_device_id* device); /** Get device-ID for given device, and optionally global device-ID. */ int c_dbcsr_acc_opencl_device_id(cl_device_id device, int* device_id, int* global_id); /** Confirm the vendor of the given device. */ -int c_dbcsr_acc_opencl_device_vendor(cl_device_id device, const char vendor[]); +int c_dbcsr_acc_opencl_device_vendor(cl_device_id device, const char vendor[], int use_platform_name); /** Capture or calculate UID based on the device-name. */ int c_dbcsr_acc_opencl_device_uid(cl_device_id device, const char devname[], unsigned int* uid); /** Based on the device-ID, return the device's UID (capture or calculate), device name, and platform name. 
*/ diff --git a/src/acc/opencl/acc_opencl.sh b/src/acc/opencl/acc_opencl.sh index fc74cbd1d62..1eac6a2a14b 100755 --- a/src/acc/opencl/acc_opencl.sh +++ b/src/acc/opencl/acc_opencl.sh @@ -80,7 +80,7 @@ then if [ "${CLFILE##*.}" = "cl" ]; then if [ -e "${CLFILE}" ]; then BNAME=$(${BASENAME} "${CLFILE}" .cl) - UNAME=$(echo "${BNAME}" | ${TR} '[:lower:]' '[:upper:]') + UNAME=$(${TR} '[:lower:]' '[:upper:]' <<<"${BNAME}") SNAME=OPENCL_LIBSMM_STRING_${UNAME} VNAME=opencl_libsmm_source_${BNAME} MNAME=OPENCL_LIBSMM_SOURCE_${UNAME} @@ -167,8 +167,8 @@ then SNAME=OPENCL_LIBSMM_STRING_PARAMS_SMM VNAME=opencl_libsmm_params_smm DNAME=opencl_libsmm_devices - MNAME=$(echo "${VNAME}" | ${TR} '[:lower:]' '[:upper:]') - NNAME=$(echo "${DNAME}" | ${TR} '[:lower:]' '[:upper:]') + MNAME=$(${TR} '[:lower:]' '[:upper:]' <<<"${VNAME}") + NNAME=$(${TR} '[:lower:]' '[:upper:]' <<<"${DNAME}") if [ "${DEVICES}" ]; then echo >>"${OFILE}" echo "#define ${MNAME} ${VNAME}" >>"${OFILE}" @@ -176,19 +176,19 @@ then CSVLINES=$(for CSVFILE in "${CSVFILES[@]}"; do ${SED} "1d;/^[[:space:]]*$/d;s/[\r]*$/\\\n\" \\\/" "${CSVFILE}"; done) IFS=$'\n' for LINE in ${CSVLINES}; do - I=0; IDEVICE=$(echo "${LINE}" | ${SED} "${DEVPAT}") + I=0; IDEVICE=$(${SED} "${DEVPAT}" <<<"${LINE}") for DEVICE in ${DEVICES}; do if [ "${DEVICE}" = "${IDEVICE}" ]; then break; fi I=$((I+1)); done - echo "${LINE}" | ${SED} "s/[^${DELIM}]*//;s/^/ \"${I}/" >>"${OFILE}" + ${SED} "s/[^${DELIM}]*//;s/^/ \"${I}/" <<<"${LINE}" >>"${OFILE}" done echo " \"\"" >>"${OFILE}" echo "static const char ${VNAME}[] = ${SNAME};" >>"${OFILE}" echo >>"${OFILE}" echo "#define ${NNAME} ${DNAME}" >>"${OFILE}" echo "static const char *const ${DNAME}[] = {" >>"${OFILE}" - I=0; S=","; NDEVICES=$(echo "${DEVICES}" | ${WC} -l) + I=0; S=","; NDEVICES=$(${WC} -l <<<"${DEVICES}") for DEVICE in ${DEVICES}; do I=$((I+1)); if [ "0" != "$((NDEVICES==I))" ]; then S=""; fi echo " \"${DEVICE}\"${S}" >>"${OFILE}" diff --git a/src/acc/opencl/acc_opencl_stream.c 
b/src/acc/opencl/acc_opencl_stream.c index a5f6c7f6920..3cb83bcf6b6 100644 --- a/src/acc/opencl/acc_opencl_stream.c +++ b/src/acc/opencl/acc_opencl_stream.c @@ -1,10 +1,10 @@ /*------------------------------------------------------------------------------------------------*/ -/* Copyright (C) by the DBCSR developers group - All rights reserved */ -/* This file is part of the DBCSR library. */ +/* Copyright (C) by the DBCSR developers group - All rights reserved */ +/* This file is part of the DBCSR library. */ /* */ -/* For information on the license, see the LICENSE file. */ -/* For further information please visit https://dbcsr.cp2k.org */ -/* SPDX-License-Identifier: GPL-2.0+ */ +/* For information on the license, see the LICENSE file. */ +/* For further information please visit https://dbcsr.cp2k.org */ +/* SPDX-License-Identifier: GPL-2.0+ */ /*------------------------------------------------------------------------------------------------*/ #if defined(__OPENCL) # include "acc_opencl.h" @@ -19,7 +19,6 @@ clCreateCommandQueue(CTX, DEV, (cl_command_queue_properties)(NULL != (PROPS) ? ((PROPS)[1]) : 0), RESULT) # endif - # if defined(__cplusplus) extern "C" { # endif @@ -27,14 +26,12 @@ extern "C" { int c_dbcsr_acc_opencl_stream_counter_base; int c_dbcsr_acc_opencl_stream_counter; - c_dbcsr_acc_opencl_info_stream_t* c_dbcsr_acc_opencl_info_stream(void* stream) { assert(NULL == stream || sizeof(c_dbcsr_acc_opencl_info_stream_t) <= (uintptr_t)stream); return ( NULL != stream ? 
((c_dbcsr_acc_opencl_info_stream_t*)((uintptr_t)stream - sizeof(c_dbcsr_acc_opencl_info_stream_t))) : NULL); } - const int* c_dbcsr_acc_opencl_stream_priority(const void* stream) { const int* result; # if !defined(ACC_OPENCL_STREAM_PRIORITIES) @@ -50,7 +47,6 @@ const int* c_dbcsr_acc_opencl_stream_priority(const void* stream) { return result; } - int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int priority) { ACC_OPENCL_STREAM_PROPERTIES_TYPE properties[8] = { CL_QUEUE_PROPERTIES, 0 /*placeholder*/, 0 /* terminator */ @@ -245,7 +241,6 @@ int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int priority) { ACC_OPENCL_RETURN_CAUSE(result, name); } - int c_dbcsr_acc_stream_destroy(void* stream) { int result = EXIT_SUCCESS; # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) @@ -297,7 +292,6 @@ int c_dbcsr_acc_stream_destroy(void* stream) { ACC_OPENCL_RETURN(result); } - int c_dbcsr_acc_stream_priority_range(int* least, int* greatest) { int result = ((NULL != least || NULL != greatest) ? 
EXIT_SUCCESS : EXIT_FAILURE); int priohi = -1, priolo = -1; @@ -321,7 +315,8 @@ int c_dbcsr_acc_stream_priority_range(int* least, int* greatest) { ACC_OPENCL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_EXTENSIONS, ACC_OPENCL_BUFFERSIZE, buffer, NULL), "retrieve platform extensions", result); if (EXIT_SUCCESS == result) { - if (NULL != strstr(buffer, "cl_khr_priority_hints") || EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "nvidia")) + if (NULL != strstr(buffer, "cl_khr_priority_hints") || + EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "nvidia", 0 /*use_platform_name*/)) { priohi = CL_QUEUE_PRIORITY_HIGH_KHR; priolo = CL_QUEUE_PRIORITY_LOW_KHR; @@ -337,7 +332,6 @@ int c_dbcsr_acc_stream_priority_range(int* least, int* greatest) { ACC_OPENCL_RETURN(result); } - int c_dbcsr_acc_stream_sync(void* stream) { int result = EXIT_SUCCESS; # if defined(ACC_OPENCL_STREAM_PRIORITIES) diff --git a/src/acc/opencl/smm/kernels/multiply.cl b/src/acc/opencl/smm/kernels/multiply.cl index 1119bf50de7..f9421de0b6d 100644 --- a/src/acc/opencl/smm/kernels/multiply.cl +++ b/src/acc/opencl/smm/kernels/multiply.cl @@ -545,14 +545,19 @@ FN(global T* restrict cdata, GLOBAL const T* restrict adata, GLOBAL const T* res # if defined(BARRIER) && (MAX(1, SGS) < SWG) && defined(SLM_A) BARRIER(CLK_LOCAL_MEM_FENCE); # endif -# if (WRK == SM) && (SGS >= SM) && !defined(SLM_A) && !defined(REG_A) +# if (WRK == SM) && (SM <= SGS || SM <= SWG) && !defined(SLM_A) && !defined(REG_A) const T a = AMK(idx, k); # endif UNROLL_FORCE(SM) for (short m = 0; m < SM; ++m) { -# if (WRK == SM) && (SGS >= SM) && !defined(SLM_A) && !defined(REG_A) +# if (200 /*CL_VERSION_2_0*/ <= __OPENCL_VERSION__) && !defined(SLM_A) && !defined(REG_A) && (WRK == SM) && \ + (SM <= SGS || SM <= SWG) /* size of subgroup or size of workgroup is sufficient */ +# if (SM <= SGS) CNM(idx, m) = MAD(sub_group_broadcast(a, m), b, CNM(idx, m)); -# else +# else + CNM(idx, m) = MAD(work_group_broadcast(a, m), b, 
CNM(idx, m)); +# endif +# else /* fallback */ CNM(idx, m) = MAD(AMK(m, k), b, CNM(idx, m)); # endif } diff --git a/src/acc/opencl/smm/opencl_libsmm.c b/src/acc/opencl/smm/opencl_libsmm.c index 4e0c5fcb3e6..b7f81900286 100644 --- a/src/acc/opencl/smm/opencl_libsmm.c +++ b/src/acc/opencl/smm/opencl_libsmm.c @@ -204,6 +204,7 @@ int opencl_libsmm_write_smm_params(FILE* stream, int only_key, const opencl_libs result += fprintf(stream, "%i%c%i%c%i%c%i%c %i%c%i%c %i%c%i%c%i%c %i%c%i%c %i%c%i%c%i%c%i", config->bs, d, config->bm, d, config->bn, d, config->bk, d, config->ws, d, config->wg, d, config->lu, d, config->nz, d, config->al, d, config->tb, d, config->tc, d, config->ap, d, config->aa, d, config->ab, d, config->ac); + if (0 != config->flags) result += fprintf(stream, "%c %i", d, config->flags); } } else { @@ -228,9 +229,8 @@ int opencl_libsmm_read_smm_params( char* parambuf, opencl_libsmm_smmkey_t* key, opencl_libsmm_smm_t* value, opencl_libsmm_perfest_t* perfest, char* device) { const char* const end = parambuf + strlen(parambuf); /* before strtok */ char* s = strtok(parambuf, ACC_OPENCL_DELIMS); - int result = EXIT_SUCCESS, i = 0, ivalue, consumed = 0, c = 0; const int opt_consumed = (NULL != perfest ? 2 : 0) + (NULL != device ? 
1 : 0); - const int max_consumed = opt_consumed + 19; + int result = EXIT_SUCCESS, i = 0, ivalue, consumed = 0, c = 0, max_consumed = opt_consumed + 19; double gflops; assert(NULL != key && NULL != value); for (; NULL != s; @@ -368,6 +368,13 @@ int opencl_libsmm_read_smm_params( ++consumed; } break; + case 22: + if (1 == sscanf(s, "%i", &ivalue)) { + value->flags = ivalue; + ++max_consumed; + ++consumed; + } + break; default: s = NULL; /* break */ } } @@ -469,7 +476,7 @@ int libsmm_acc_init(void) { } else { if (0 != c_dbcsr_acc_opencl_config.verbosity) { - fprintf(stderr, "WARN LIBSMM: failed to load tuned parameters!\n"); + fprintf(stderr, "WARN LIBSMM: failed to load tuned parameters from CSV-file!\n"); } break; /* invalid entry */ } @@ -557,7 +564,7 @@ int libsmm_acc_init(void) { } else { if (0 != c_dbcsr_acc_opencl_config.verbosity) { - fprintf(stderr, "WARN LIBSMM: failed to load tuned parameters!\n"); + fprintf(stderr, "WARN LIBSMM: failed to load embedded parameters!\n"); } break; } @@ -1316,6 +1323,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, ? (0 == kernel_idx ? (NULL == config ? /*default*/ default_ac : config->ac) : /*default*/ default_ac) : atoi(env_ac), 0, 2); + new_config.flags = (NULL == config ? /*default*/ 0 : config->flags); if (0 >= new_config.s) new_config.s = stack_size; if (0 == kernel_idx || 1 >= new_config.bs) new_config.bs = bs; nbm = (m_max + new_config.bm - 1) / new_config.bm; @@ -1406,7 +1414,8 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, if (NULL == env_atomics || '0' != *env_atomics) { /* atomics_force: attempt to force atomics without confirmation */ const int atomics_force = ((NULL == env_atomics || '\0' == *env_atomics) ? 
0 : atoi(env_atomics)); - const int cl_nonv = (EXIT_SUCCESS != c_dbcsr_acc_opencl_device_vendor(active_device, "nvidia")); + const int cl_nonv = (EXIT_SUCCESS != + c_dbcsr_acc_opencl_device_vendor(active_device, "nvidia", 0 /*use_platform_name*/)); if (NULL == env_atomics || '\0' == *env_atomics || 0 != atomics_force) { cl_bitfield fp_atomics; assert(dbcsr_type_real_8 == datatype || dbcsr_type_real_4 == datatype); @@ -1446,18 +1455,25 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, } } else if (cl_nonv) { - if (NULL != extensions[1] && 1 < bs && 1 == new_config.bn && new_config.bm >= m_max && 0 == new_config.al && - (0 == (m_max & 1) || (0 == devinfo->intel /*&& cl_nonv*/)) /* TODO */ - && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions + 1, 1)) - { - assert(dbcsr_type_real_4 == datatype); - atomic_expr2 = "-D\"ATOMIC_ADD2_GLOBAL(A,B)=atomic_add_global_cmpxchg2(A,B)\""; + if (EXIT_SUCCESS != c_dbcsr_acc_opencl_device_vendor(active_device, "amd", 1 /*use_platform_name*/)) { + if (NULL != extensions[1] && 1 < bs && 1 == new_config.bn && new_config.bm >= m_max && 0 == new_config.al && + (0 == (m_max & 1) || (0 == devinfo->intel /*&& cl_nonv*/)) /* TODO */ + && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions + 1, 1)) + { + assert(dbcsr_type_real_4 == datatype); + atomic_expr2 = "-D\"ATOMIC_ADD2_GLOBAL(A,B)=atomic_add_global_cmpxchg2(A,B)\""; + } + else { + extensions[1] = NULL; + } + atomic_exp = "atomic_add_global_cmpxchg(A,B)"; + atomic_ops = (dbcsr_type_real_4 == datatype ? "-DCMPXCHG=atomic_cmpxchg" : "-DCMPXCHG=atom_cmpxchg"); } else { - extensions[1] = NULL; + atomic_exp = (dbcsr_type_real_8 == datatype + ? "__builtin_amdgcn_global_atomic_fadd_f64(A,B,__ATOMIC_RELAXED)" + : "__builtin_amdgcn_global_atomic_fadd_f32(A,B,__ATOMIC_RELAXED)"); } - atomic_exp = "atomic_add_global_cmpxchg(A,B)"; - atomic_ops = (dbcsr_type_real_4 == datatype ? 
"-DCMPXCHG=atomic_cmpxchg" : "-DCMPXCHG=atom_cmpxchg"); } else { assert(NULL != atomic_ops && '\0' == *atomic_ops); @@ -1562,6 +1578,19 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, } } } +# if defined(NDEBUG) + else if (2 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) { + LIBXSMM_STDIO_ACQUIRE(); + fprintf(stderr, "WARNING: SMM-kernel "); + opencl_libsmm_write_smm_params( + stderr, 0 /*only_key*/, &key, NULL /*config*/, NULL /*delim*/, NULL /*begin*/, NULL /*close*/); + fprintf(stderr, "="); + opencl_libsmm_write_smm_params( + stderr, 0 /*only_key*/, &key, &new_config, NULL /*delim*/, NULL /*begin*/, NULL /*close*/); + fprintf(stderr, " failed to compile!\n"); + LIBXSMM_STDIO_RELEASE(); + } +# endif } } /* insufficient device capabilities */ diff --git a/src/acc/opencl/smm/opencl_libsmm.h b/src/acc/opencl/smm/opencl_libsmm.h index 0dfbea0c1b3..36e11e0f947 100644 --- a/src/acc/opencl/smm/opencl_libsmm.h +++ b/src/acc/opencl/smm/opencl_libsmm.h @@ -65,7 +65,7 @@ typedef struct opencl_libsmm_smm_t { size_t wgsize[2]; double gflops; /* (pseudo-)parameters (either pretuned or determined) */ - int s, bs, bm, bn, bk, ws, wg, lu, nz, al, tb, tc, ap, aa, ab, ac; + int s, bs, bm, bn, bk, ws, wg, lu, nz, al, tb, tc, ap, aa, ab, ac, flags; } opencl_libsmm_smm_t; /** Type to collect statistics about tuned SMM-kernels */