From 685304a35a4c8932a961b0a3dd1aba98c4f6936d Mon Sep 17 00:00:00 2001 From: Hans Pabst Date: Thu, 5 Oct 2023 10:16:42 +0200 Subject: [PATCH] ocl: revised device-split, additional tuning param, and other improvements * Split into maximum number of sub-devices if ACC_OPENCL_DEVSPLIT=1 - If 1 nchar) { nchar = LIBXSMM_SNPRINTF( build_params, sizeof(build_params), param_format, cmem, inplace, fname, m, n, (int)new_config.wgsize, tname); @@ -1255,6 +1254,14 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, const char *const env_tb = getenv("OPENCL_LIBSMM_SMM_TB"), *const env_tc = getenv("OPENCL_LIBSMM_SMM_TC"); const char *const env_ap = getenv("OPENCL_LIBSMM_SMM_AP"), *const env_aa = getenv("OPENCL_LIBSMM_SMM_AA"); const char *const env_ab = getenv("OPENCL_LIBSMM_SMM_AB"), *const env_ac = getenv("OPENCL_LIBSMM_SMM_AC"); + const char *const env_xf = getenv("OPENCL_LIBSMM_SMM_XF"), *const env_cl = getenv("OPENCL_LIBSMM_SMM_BUILDOPTS"); + const char* const intel_xf = "-cl-intel-256-GRF-per-thread"; + const int cl_nonv = (0 != devinfo->intel || EXIT_SUCCESS != c_dbcsr_acc_opencl_device_vendor( + active_device, "nvidia", 0 /*use_platform_name*/)); + const int cl_noamd = + 0 != devinfo->intel || !cl_nonv || + (EXIT_SUCCESS != c_dbcsr_acc_opencl_device_vendor(active_device, "amd", 0 /*use_platform_name*/) && + EXIT_SUCCESS != c_dbcsr_acc_opencl_device_vendor(active_device, "amd", 1 /*use_platform_name*/)); const int blockm = ((NULL == env_bm || '\0' == *env_bm) ? 0 : atoi(env_bm)); const int blockn = ((NULL == env_bn || '\0' == *env_bn) ? 0 : atoi(env_bn)); const int blockk = ((NULL == env_bk || '\0' == *env_bk) ? 0 : atoi(env_bk)); @@ -1292,9 +1299,10 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, ? (0 == kernel_idx ? (NULL == config ? /*default*/ 0 : config->nz) : /*default*/ 0) : atoi(env_nz), 0, 1); - new_config.al = LIBXSMM_CLMP((NULL == env_al || '\0' == *env_al) - ? (0 == kernel_idx ? (NULL == config ? /*default*/ 0 : config->al) : /*default*/ 0) - : atoi(env_al), + new_config.al = LIBXSMM_CLMP( + (NULL == env_al || '\0' == *env_al) + ? (cl_noamd ? (0 == kernel_idx ? (NULL == config ? /*default*/ 0 : config->al) : /*default*/ 0) : 1) + : atoi(env_al), 0, 1); new_config.tb = LIBXSMM_CLMP((NULL == env_tb || '\0' == *env_tb) ? (0 == kernel_idx ? (NULL == config ? /*default*/ 0 : config->tb) : /*default*/ 0) @@ -1323,7 +1331,13 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, ? (0 == kernel_idx ? (NULL == config ? /*default*/ default_ac : config->ac) : /*default*/ default_ac) : atoi(env_ac), 0, 2); - new_config.flags = (NULL == config ? /*default*/ 0 : config->flags); + if (NULL == env_xf || '\0' == *env_xf) { + if (0 == devinfo->intel || NULL == env_cl || NULL == strstr(env_cl, intel_xf)) { + new_config.flags = (NULL == config ? /*default*/ 0 : config->flags); + } + else new_config.flags = 1; + } + else new_config.flags = atoi(env_xf); if (0 >= new_config.s) new_config.s = stack_size; if (0 == kernel_idx || 1 >= new_config.bs) new_config.bs = bs; nbm = (m_max + new_config.bm - 1) / new_config.bm; @@ -1398,7 +1412,6 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, } if (new_config.wgsize[kernel_idx] <= wgsize_max) { /* SMMs can be potentially handled by device */ const char* const cmem = (EXIT_SUCCESS != opencl_libsmm_use_cmem(active_device) ? "global" : "constant"); - const char* const env_options = getenv("OPENCL_LIBSMM_SMM_BUILDOPTS"); const char* const env_barrier = getenv("OPENCL_LIBSMM_SMM_BARRIER"); const char* const env_atomics = getenv("OPENCL_LIBSMM_SMM_ATOMICS"); const char* const env_nrepeat = getenv("SMM_NREPEAT"); @@ -1414,8 +1427,6 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, if (NULL == env_atomics || '0' != *env_atomics) { /* atomics_force: attempt to force atomics without confirmation */ const int atomics_force = ((NULL == env_atomics || '\0' == *env_atomics) ? 0 : atoi(env_atomics)); - const int cl_nonv = (EXIT_SUCCESS != - c_dbcsr_acc_opencl_device_vendor(active_device, "nvidia", 0 /*use_platform_name*/)); if (NULL == env_atomics || '\0' == *env_atomics || 0 != atomics_force) { cl_bitfield fp_atomics; assert(dbcsr_type_real_8 == datatype || dbcsr_type_real_4 == datatype); @@ -1456,10 +1467,8 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, } else if (cl_nonv) { int gfx90 = 0; - if ((EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_device, "amd", 0 /*use_platform_name*/) || - EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_device, "amd", 1 /*use_platform_name*/)) && - EXIT_SUCCESS == c_dbcsr_acc_opencl_device_name(active_device, buffer, ACC_OPENCL_BUFFERSIZE, - NULL /*platform*/, 0 /*platform_maxlen*/, /*cleanup*/ 1)) + if (!cl_noamd && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_name(active_device, buffer, ACC_OPENCL_BUFFERSIZE, + NULL /*platform*/, 0 /*platform_maxlen*/, /*cleanup*/ 1)) { const char* const gfxname = LIBXSMM_STRISTR(buffer, "gfx"); if (NULL != gfxname && 90 <= atoi(gfxname + 3)) gfx90 = 1; @@ -1534,8 +1543,8 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, # else const char* const cl_debug = ""; # endif - nchar = LIBXSMM_SNPRINTF(buffer, sizeof(buffer), "%s %s -cl-fast-relaxed-math -cl-denorms-are-zero", - NULL == env_options ? "" : env_options, cl_debug); + nchar = LIBXSMM_SNPRINTF(buffer, sizeof(buffer), "-cl-fast-relaxed-math -cl-denorms-are-zero %s %s %s", + NULL == env_cl ? "" : env_cl, (0 == new_config.flags || 0 == devinfo->intel) ? "" : intel_xf, cl_debug); if (0 >= nchar || (int)sizeof(buffer) <= nchar) result = EXIT_FAILURE; } else result = EXIT_FAILURE; diff --git a/src/acc/opencl/smm/tune_multiply.py b/src/acc/opencl/smm/tune_multiply.py index bb9a95a819f..1d57dd93a7a 100755 --- a/src/acc/opencl/smm/tune_multiply.py +++ b/src/acc/opencl/smm/tune_multiply.py @@ -12,7 +12,8 @@ from opentuner import ConfigurationManipulator from opentuner import MeasurementInterface from opentuner import Result -from signal import signal, getsignal, SIGINT +from signal import signal, SIGINT, SIG_DFL +import copy import json import glob import sys @@ -70,6 +71,7 @@ def manipulator(self): self.bs = self.bm = self.bn = self.bk = self.ws = self.wg = self.lu = None self.nz = self.al = self.tb = self.tc = None self.ap = self.aa = self.ab = self.ac = None + self.xf = os.getenv("OPENCL_LIBSMM_SMM_XF") self.typename = self.typeid = None self.device = self.size = None self.gfbase = self.gflops = 0 @@ -78,14 +80,14 @@ def manipulator(self): self.exepath = os.path.join( os.path.dirname(sys.argv[0]), "..", "..", self.exename ) - run_result = ( # verbosity to capture device name and tuned parameters + self.run_result = ( # verbosity to capture device name and tuned parameters self.launch(["ACC_OPENCL_VERBOSE=2", "CHECK=0"], nrep=1) if (self.args.merge is None or 0 > self.args.merge) and (self.args.update is None or "" == self.args.update) else None ) - if run_result: - stdout = str(run_result["stdout"]) + if self.run_result: + stdout = str(self.run_result["stdout"]) if 0 >= self.args.size: size = re.search( "{}\\s+[0-9]+\\s+([0-9]+)".format(self.exepath), @@ -100,11 +102,11 @@ def manipulator(self): int(typename.group(1)) if typename and typename.group(1) else 0 ) devicepat = 'INFO ACC/OpenCL:\\s+ndevices=[0-9]+\\s+device[0-9]+="([^"]+)"' - device = re.search(devicepat, str(run_result["stderr"])) + device = re.search(devicepat, str(self.run_result["stderr"])) self.device = device.group(1) if device and device.group(1) else "" elif self.args.update is not None and "" != self.args.update: self.device = self.args.update - if run_result and 0 == run_result["returncode"]: + if self.run_result and 0 == self.run_result["returncode"]: seedpat = "INFO ACC/OpenCL:\\s+SMM-kernel\\s+{}={}\\s+gen=".format( "{t,m,n,k, bs,bm,bn,bk, ws,wg, lu,nz,al, tb,tc, ap,aa,ab,ac}", "{{{}, {}}}".format( # key and value @@ -116,12 +118,13 @@ def manipulator(self): "(-*[0-9]+),(-*[0-9]+)", # ws,wg "(-*[0-9]+),(-*[0-9]+),(-*[0-9]+)", # lu,nz,al "(-*[0-9]+),(-*[0-9]+)", # tb,tc - "(-*[0-9]+),(-*[0-9]+),(-*[0-9]+),(-*[0-9]+)", # ap,aa,ab,ac + "(-*[0-9]+),(-*[0-9]+),(-*[0-9]+),(-*[0-9]+)(, .+)*", # ap,aa,ab,ac[, ext] ), ), ) - seed = re.search(seedpat, str(run_result["stderr"])) - if 15 != (len(seed.groups()) if seed else 0): + seed = re.search(seedpat, str(self.run_result["stderr"])) + nprm = len(seed.groups()) if seed else 0 + if 15 > nprm: print("WARNING: missed to parse initial parameters!") # setup fixed and tunable parameters params, paramt = [], [] @@ -142,6 +145,11 @@ def manipulator(self): self.create_param("AA", params, paramt, seed, 13, 0, 3) self.create_param("AB", params, paramt, seed, 14, 0, 3) self.create_param("AC", params, paramt, seed, 15, 0, 2) + if self.xf is None and ( + 15 < nprm and seed.group(16) and 2 < len(seed.group(16)) + ): + self.xf = seed.group(16)[2:] + self.create_param("XF", params, paramt, self.xf, -1, 0, 1) if not paramt: sys.tracebacklimit = 0 raise RuntimeError( @@ -149,8 +157,7 @@ def manipulator(self): ) for param in params + paramt: manipulator.add_parameter(param) - # consider to update and/or merge JSONS (update first) - if ( + if ( # consider to update and/or merge JSONS (update first) (self.args.merge is not None and (0 <= self.args.merge or self.typeid)) or self.args.update is None or "" != self.args.update @@ -188,13 +195,19 @@ def manipulator(self): def create_param(self, name, params, paramt, match, match_id, value0, value1): """Append integer-parameter to either params or paramt list""" - if env_isfixed("OPENCL_LIBSMM_SMM_{}".format(name)): - value_fix = getattr(self.args, name.lower()) + value_fixed = env_isfixed("OPENCL_LIBSMM_SMM_{}".format(name)) + value_fix = getattr(self.args, name.lower(), None) if value_fixed else None + if value_fix is not None: params.append(IntegerParameter(name, value_fix, value_fix)) else: - value = ( - int(match.group(match_id)) if match and match.group(match_id) else None - ) + if 0 <= match_id: + value = ( + int(match.group(match_id)) + if match and match.group(match_id) + else None + ) + else: + value = int(match) if match is not None else 0 setattr(self, name.lower(), value) paramt.append(IntegerParameter(name, value0, value1)) @@ -232,6 +245,7 @@ def seed_configurations(self): "AA": self.aa if self.aa is not None else self.args.aa, "AB": self.ab if self.ab is not None else self.args.ab, "AC": self.ac if self.ac is not None else self.args.ac, + "XF": self.xf if self.xf is not None else 0, } ] @@ -258,20 +272,21 @@ def environment(self, config): "OPENCL_LIBSMM_SMM_AA={}".format(config["AA"]), "OPENCL_LIBSMM_SMM_AB={}".format(config["AB"]), "OPENCL_LIBSMM_SMM_AC={}".format(config["AC"]), + "OPENCL_LIBSMM_SMM_XF={}".format(config["XF"]), ] def run(self, desired_result, input, limit): """Run a configuration and return performance""" config = desired_result.configuration.data cfgenv = self.environment(config) - run_result = self.launch( + self.run_result = self.launch( cfgenv + ["CHECK={}".format(self.args.check)], verbose=self.args.verbose, ) - if 0 == run_result["returncode"]: + if 0 == self.run_result["returncode"]: performance = re.search( "device:\\s+([0-9]+[^ ]*) ms\\s+([0-9]+[^ ]*)", - str(run_result["stdout"]), + str(self.run_result["stdout"]), ) else: failed = " ".join(map(str, cfgenv)).replace("OPENCL_LIBSMM_SMM_", "") @@ -363,6 +378,7 @@ def merge_jsons(self, filenames): data["AA"] if "AA" in data else 1, data["AB"] if "AB" in data else 3, data["AC"] if "AC" in data else 0, + data["XF"] if "XF" in data else 0, filename, # last entry ) if key not in merged: @@ -457,7 +473,11 @@ def save_final_config(self, configuration, final=True): os.path.join(self.args.jsondir, ".{}.json".format(self.args.label)), "w", ) as file: - json.dump(config, file, sort_keys=True) + cfg = config + if "XF" in config and 0 == config["XF"]: + cfg = copy.deepcopy(config) + del cfg["XF"] + json.dump(cfg, file, sort_keys=True) file.write("\n") # append newline at EOF if final: if not filenames and glob.glob(self.args.csvfile): @@ -488,11 +508,15 @@ def save_final_config(self, configuration, final=True): filename, ) ) - # no validation in SIGINT (signal may be due to application) - if 0 == self.args.check and self.handle_sigint != getsignal(SIGINT): - run_result = self.launch(self.environment(config) + ["CHECK=1"]) - if 0 != run_result["returncode"]: - print("WARNING: tuned result seems to be incorrect!") + if ( # avoid recursion (self.handle_sigint != getsignal(SIGINT)) + self.run_result and 0 == self.run_result["returncode"] + ) and 0 == self.args.check: + signal(SIGINT, SIG_DFL) + self.run_result = self.launch( + self.environment(config) + ["CHECK=1"] + ) + if self.run_result and 0 != self.run_result["returncode"]: + print("WARNING: tuned result seems to be incorrect!") def handle_sigint(self, signum, frame): """Handle SIGINT or CTRL-C""" @@ -754,8 +778,6 @@ def handle_sigint(self, signum, frame): # OPENCL_LIBSMM_SMM_xx=tune|enabled|on must be given to permit tuning) if os.getenv("OPENCL_LIBSMM_SMM_WS") not in {"tune", "enabled", "on"}: os.environ["OPENCL_LIBSMM_SMM_WS"] = "{}".format(args.ws) - # if not os.getenv("OPENCL_LIBSMM_SMM_AL") in {"tune", "enabled", "on"}: - # os.environ["OPENCL_LIBSMM_SMM_AL"] = "{}".format(args.al) # fix tunables according to level of tuning if 1 <= args.tlevel or 0 > args.tlevel: os.environ["OPENCL_LIBSMM_SMM_BM"] = "{}".format(args.bm) @@ -774,4 +796,7 @@ def handle_sigint(self, signum, frame): if 0 == args.mb: args.mb = 64 # additional/depending arguments - SmmTuner.main(args) + try: + SmmTuner.main(args) + except: # noqa: E722 + pass