From 4c5c5a36e165f61bf72401fa6083bba13cc9c897 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 2 Oct 2024 22:33:34 +0200 Subject: [PATCH 01/47] {2023.06}[foss/2023a] cuDNN 8.9.2.26 w/ CUDA 12.1.1 --- .../eessi-2023.06-eb-4.9.4-2023a-CUDA.yml | 2 + eb_hooks.py | 174 +++++++++++++----- 2 files changed, 128 insertions(+), 48 deletions(-) create mode 100644 easystacks/software.eessi.io/2023.06/accel/nvidia/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml diff --git a/easystacks/software.eessi.io/2023.06/accel/nvidia/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml b/easystacks/software.eessi.io/2023.06/accel/nvidia/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml new file mode 100644 index 0000000000..d54780804b --- /dev/null +++ b/easystacks/software.eessi.io/2023.06/accel/nvidia/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml @@ -0,0 +1,2 @@ +easyconfigs: + - cuDNN-8.9.2.26-CUDA-12.1.1.eb diff --git a/eb_hooks.py b/eb_hooks.py index 79bdeeee0d..ce99ed1dfe 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -756,64 +756,141 @@ def post_postproc_cuda(self, *args, **kwargs): if 'libcudart' not in allowlist: raise EasyBuildError("Did not find 'libcudart' in allowlist: %s" % allowlist) - # iterate over all files in the CUDA installation directory - for dir_path, _, files in os.walk(self.installdir): - for filename in files: - full_path = os.path.join(dir_path, filename) - # we only really care about real files, i.e. 
not symlinks - if not os.path.islink(full_path): - # check if the current file name stub is part of the allowlist - basename = filename.split('.')[0] - if basename in allowlist: - self.log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path) - else: - self.log.debug("%s is not found in allowlist, so replacing it with symlink: %s", - basename, full_path) - # if it is not in the allowlist, delete the file and create a symlink to host_injections - - # the host_injections path is under a fixed repo/location for CUDA - host_inj_path = re.sub(EESSI_INSTALLATION_REGEX, HOST_INJECTIONS_LOCATION, full_path) - # CUDA itself doesn't care about compute capability so remove this duplication from - # under host_injections (symlink to a single CUDA installation for all compute - # capabilities) - accel_subdir = os.getenv("EESSI_ACCELERATOR_TARGET") - if accel_subdir: - host_inj_path = host_inj_path.replace("/accel/%s" % accel_subdir, '') - # make sure source and target of symlink are not the same - if full_path == host_inj_path: - raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you " - "are using this hook for an EESSI installation?", - full_path, host_inj_path) - remove_file(full_path) - symlink(host_inj_path, full_path) + # replace files that are not distributable with symlinks into + # host_injections + replace_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist) else: raise EasyBuildError("CUDA-specific hook triggered for non-CUDA easyconfig?!") +def post_postproc_cudnn(self, *args, **kwargs): + """ + Remove files from cuDNN installation that we are not allowed to ship, + and replace them with a symlink to a corresponding installation under host_injections. 
+ """ + + # We need to check if we are doing an EESSI-distributed installation + eessi_installation = bool(re.search(EESSI_INSTALLATION_REGEX, self.installdir)) + + if self.name == 'cuDNN' and eessi_installation: + print_msg("Replacing files in cuDNN installation that we can not ship with symlinks to host_injections...") + + allowlist = ['LICENSE'] + + # read cuDNN LICENSE, construct allowlist based on section "2. Distribution" that specifies list of files that can be shipped + license_path = os.path.join(self.installdir, 'LICENSE') + search_string = "2. Distribution. The following portions of the SDK are distributable under the Agreement:" + with open(license_path) as infile: + for line in infile: + if line.strip().startswith(search_string): + # remove search string, split into words, remove trailing + # dots '.' and only retain words starting with a dot '.' + distributable = line[len(search_string):] + for word in distributable.split(): + if word[0] == '.': + allowlist.append(word.rstrip('.')) + + allowlist = sorted(set(allowlist)) + self.log.info("Allowlist for files in cuDNN installation that can be redistributed: " + ', '.join(allowlist)) + + # replace files that are not distributable with symlinks into + # host_injections + replace_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist) + else: + raise EasyBuildError("cuDNN-specific hook triggered for non-cuDNN easyconfig?!") + + +def replace_non_distributable_files_with_symlinks(log, install_dir, package, allowlist): + """ + Replace files that cannot be distributed with symlinks into host_injections + """ + extension_based = { "CUDA": False, "cuDNN": True } + if not package in extension_based: + raise EasyBuildError("Don't know how to strip non-distributable files from package %s.", package) + + # iterate over all files in the package installation directory + for dir_path, _, files in os.walk(install_dir): + for filename in files: + full_path = os.path.join(dir_path, filename) 
+ # we only really care about real files, i.e. not symlinks + if not os.path.islink(full_path): + # check if the current file name stub is part of the allowlist + basename = filename.split('.')[0] + if extension_based[package]: + if '.' in filename: + extension = '.' + filename.split('.')[1] + if basename in allowlist: + log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path) + elif extension_based[package] and '.' in filename and extension in allowlist: + log.debug("%s is found in allowlist, so keeping it: %s", extension, full_path) + else: + if extension_based[package]: + print_name = filename + else: + print_name = basename + log.debug("%s is not found in allowlist, so replacing it with symlink: %s", + print_name, full_path) + # the host_injections path is under a fixed repo/location for CUDA or cuDNN + host_inj_path = re.sub(EESSI_INSTALLATION_REGEX, HOST_INJECTIONS_LOCATION, full_path) + # CUDA and cuDNN itself don't care about compute capability so remove this duplication from + # under host_injections (symlink to a single CUDA or cuDNN installation for all compute + # capabilities) + accel_subdir = os.getenv("EESSI_ACCELERATOR_TARGET") + if accel_subdir: + host_inj_path = host_inj_path.replace("/accel/%s" % accel_subdir, '') + # make sure source and target of symlink are not the same + if full_path == host_inj_path: + raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you " + "are using this hook for an EESSI installation?", + full_path, host_inj_path) + remove_file(full_path) + symlink(host_inj_path, full_path) + + def inject_gpu_property(ec): """ - Add 'gpu' property, via modluafooter easyconfig parameter + Add 'gpu' property EESSIVERSION envvars and drop dependencies to + build dependencies, via modluafooter easyconfig parameter """ ec_dict = ec.asdict() - # Check if CUDA is in the dependencies, if so add the 'gpu' Lmod property - if ('CUDA' in [dep[0] for dep in iter(ec_dict['dependencies'])]): - 
ec.log.info("Injecting gpu as Lmod arch property and envvar with CUDA version") + # Check if CUDA, cuDNN, you-name-it is in the dependencies, if so + # - drop dependency to build dependency + # - add 'gpu' Lmod property + # - add envvar with package version + packages_list = ( "CUDA", "cuDNN" ) + packages_version = { } + add_gpu_property = '' + + for package in packages_list: + # Check if package is in the dependencies, if so drop dependency to build + # dependency and set variable for later adding the 'gpu' Lmod property + if (package in [dep[0] for dep in iter(ec_dict['dependencies'])]): + add_gpu_property = 'add_property("arch","gpu")' + for dep in iter(ec_dict['dependencies']): + if package in dep[0]: + # make package a build dependency only (rpathing saves us from link errors) + ec.log.info("Dropping dependency on %s to build dependency" % package) + ec_dict['dependencies'].remove(dep) + if dep not in ec_dict['builddependencies']: + ec_dict['builddependencies'].append(dep) + # take note of version for creating the modluafooter + packages_version[package] = dep[1] + if add_gpu_property: + ec.log.info("Injecting gpu as Lmod arch property and envvars for dependencies with their version") key = 'modluafooter' - value = 'add_property("arch","gpu")' - cuda_version = 0 - for dep in iter(ec_dict['dependencies']): - # Make CUDA a build dependency only (rpathing saves us from link errors) - if 'CUDA' in dep[0]: - cuda_version = dep[1] - ec_dict['dependencies'].remove(dep) - if dep not in ec_dict['builddependencies']: - ec_dict['builddependencies'].append(dep) - value = '\n'.join([value, 'setenv("EESSICUDAVERSION","%s")' % cuda_version]) - if key in ec_dict: - if value not in ec_dict[key]: - ec[key] = '\n'.join([ec_dict[key], value]) + values = [add_gpu_property] + for package, version in packages_version.items(): + envvar = "EESSI%sVERSION" % package.upper() + values.append('setenv("%s","%s")' % (envvar, version)) + if not key in ec_dict: + ec[key] = '\n'.join(values) 
else: - ec[key] = value + new_value = ec_dict[key] + for value in values: + if not value in new_value: + new_value = '\n'.join([new_value, value]) + ec[key] = new_value + return ec @@ -873,4 +950,5 @@ def inject_gpu_property(ec): POST_POSTPROC_HOOKS = { 'CUDA': post_postproc_cuda, + 'cuDNN': post_postproc_cudnn, } From da7c1e4669f4b3947b059162f5e72de1d634d777 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 2 Oct 2024 23:27:02 +0200 Subject: [PATCH 02/47] use post sanity-check hook for cuDNN --- eb_hooks.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/eb_hooks.py b/eb_hooks.py index ce99ed1dfe..25eefaf27a 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -763,7 +763,13 @@ def post_postproc_cuda(self, *args, **kwargs): raise EasyBuildError("CUDA-specific hook triggered for non-CUDA easyconfig?!") -def post_postproc_cudnn(self, *args, **kwargs): +def post_sanitycheck_hook(self, *args, **kwargs): + """Main post-sanitycheck hook: trigger custom functions based on software name.""" + if self.name in POST_SANITYCHECK_HOOKS: + POST_SANITYCHECK_HOOKS[self.name](self, *args, **kwargs) + + +def post_sanitycheck_cudnn(self, *args, **kwargs): """ Remove files from cuDNN installation that we are not allowed to ship, and replace them with a symlink to a corresponding installation under host_injections. 
@@ -950,5 +956,8 @@ def inject_gpu_property(ec): POST_POSTPROC_HOOKS = { 'CUDA': post_postproc_cuda, - 'cuDNN': post_postproc_cudnn, +} + +POST_SANITYCHECK_HOOKS = { + 'cuDNN': post_sanitycheck_cudnn, } From 454d2bb0f5076704cdfb7e161e9a187cf6241f3a Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 3 Oct 2024 14:57:55 +0200 Subject: [PATCH 03/47] install cuDNN under host_injections before installing it under /cvmfs --- EESSI-install-software.sh | 9 +- install_scripts.sh | 5 +- .../eessi-2023.06-cuda-and-libraries.yml | 3 + .../nvidia/install_cuda_and_libraries.sh | 205 ++++++++++++++++++ 4 files changed, 218 insertions(+), 4 deletions(-) create mode 100644 scripts/gpu_support/nvidia/eessi-2023.06-cuda-and-libraries.yml create mode 100755 scripts/gpu_support/nvidia/install_cuda_and_libraries.sh diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index f9dd971a0d..d54da4a404 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -234,7 +234,7 @@ pr_diff=$(ls [0-9]*.diff | head -1) # for now, this just reinstalls all scripts. Note the most elegant, but works ${TOPDIR}/install_scripts.sh --prefix ${EESSI_PREFIX} -# Install full CUDA SDK in host_injections +# Install full CUDA SDK and cu* libraries in host_injections # Hardcode this for now, see if it works # TODO: We should make a nice yaml and loop over all CUDA versions in that yaml to figure out what to install # Allow skipping CUDA SDK install in e.g. CI environments @@ -250,9 +250,12 @@ else fi if [ -z "${skip_cuda_install}" ] || [ ! 
"${skip_cuda_install}" ]; then - ${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_host_injections.sh -c 12.1.1 --accept-cuda-eula + ${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh \ + -e ${EESSI_PREFIX}/scripts/gpu_support/nvidia/eessi-2023.06-cuda-and-libraries.yml \ + -t /tmp/temp \ + --accept-cuda-eula else - echo "Skipping installation of CUDA SDK in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found" + echo "Skipping installation of CUDA SDK and cu* libraries in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found" fi # Install NVIDIA drivers in host_injections (if they exist) diff --git a/install_scripts.sh b/install_scripts.sh index 11c7fc2a9f..df9bda3ad3 100755 --- a/install_scripts.sh +++ b/install_scripts.sh @@ -122,7 +122,10 @@ copy_files_by_list ${TOPDIR}/scripts ${INSTALL_PREFIX}/scripts "${script_files[@ # Copy files for the scripts/gpu_support/nvidia directory nvidia_files=( - install_cuda_host_injections.sh link_nvidia_host_libraries.sh + eessi-2023.06-cuda-and-libraries.yml + install_cuda_and_libraries.sh + install_cuda_host_injections.sh + link_nvidia_host_libraries.sh ) copy_files_by_list ${TOPDIR}/scripts/gpu_support/nvidia ${INSTALL_PREFIX}/scripts/gpu_support/nvidia "${nvidia_files[@]}" diff --git a/scripts/gpu_support/nvidia/eessi-2023.06-cuda-and-libraries.yml b/scripts/gpu_support/nvidia/eessi-2023.06-cuda-and-libraries.yml new file mode 100644 index 0000000000..e0e47bf2d8 --- /dev/null +++ b/scripts/gpu_support/nvidia/eessi-2023.06-cuda-and-libraries.yml @@ -0,0 +1,3 @@ +easyconfigs: + - CUDA-12.1.1.eb + - cuDNN-8.9.2.26-CUDA-12.1.1.eb diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh new file mode 100755 index 0000000000..2fea64d7a6 --- /dev/null +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -0,0 +1,205 @@ +#!/usr/bin/env 
bash + +# This script can be used to install CUDA and other libraries by NVIDIA under +# the `.../host_injections` directory. +# +# This provides the parts of the CUDA installation and other libriaries that +# cannot be redistributed as part of EESSI due to license limitations. While +# GPU-based software from EESSI will _run_ without these, installation of +# additional software that builds upon CUDA or other libraries requires that +# these installation are present under `host_injections`. +# +# The `host_injections` directory is a variant symlink that by default points to +# `/opt/eessi`, unless otherwise defined in the local CVMFS configuration (see +# https://cvmfs.readthedocs.io/en/stable/cpt-repo.html#variant-symlinks). For the +# installation to be successful, this directory needs to be writeable by the user +# executing this script. + +# Initialise our bash functions +TOPDIR=$(dirname $(realpath $BASH_SOURCE)) +source "$TOPDIR"/../../utils.sh + +# Function to display help message +show_help() { + echo "Usage: $0 [OPTIONS]" + echo "Options:" + echo " --help Display this help message" + echo " --accept-cuda-eula You _must_ accept the CUDA EULA to install" + echo " CUDA, see the EULA at" + echo " https://docs.nvidia.com/cuda/eula/index.html" + echo " -e, --easystack EASYSTACKFILE Path to easystack file that defines which" + echo " packages shall be installed" + echo " -t, --temp-dir /path/to/tmpdir Specify a location to use for temporary" + echo " storage during the installation of CUDA" + echo " and/or other libraries (must have" + echo " several GB available; depends on the number of installations)" +} + +# Initialize variables +eula_accepted=0 +EASYSTACKFILE= +TEMP_DIR= + +# Parse command-line options +while [[ $# -gt 0 ]]; do + case "$1" in + --help) + show_help + exit 0 + ;; + --accept-cuda-eula) + eula_accepted=1 + shift 1 + ;; + -e|--easystack) + if [ -n "$2" ]; then + EASYSTACKFILE="$2" + shift 2 + else + echo "Error: Argument required for $1" + 
show_help + exit 1 + fi + ;; + -t|--temp-dir) + if [ -n "$2" ]; then + TEMP_DIR="$2" + shift 2 + else + echo "Error: Argument required for $1" + show_help + exit 1 + fi + ;; + *) + show_help + fatal_error "Error: Unknown option: $1" + ;; + esac +done + +if [[ -z "${EASYSTACKFILE}" ]]; then + fatal_error "Need the name/path to an easystack file. See command line options\n" +fi + +# Make sure EESSI is initialised +check_eessi_initialised + +# As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` +# (CUDA is a binary installation so no need to worry too much about the EasyBuild setup) +export EESSI_SITE_INSTALL=${EESSI_SOFTWARE_PATH/versions/host_injections} + +# we need a directory we can use for temporary storage +if [[ -z "${TEMP_DIR}" ]]; then + tmpdir=$(mktemp -d) +else + mkdir -p ${TEMP_DIR} + tmpdir=$(mktemp -d --tmpdir=${TEMP_DIR} cuda_n_co.XXX) + if [[ ! -d "$tmpdir" ]] ; then + fatal_error "Could not create directory ${tmpdir}" + fi +fi +echo "Created temporary directory '${tmpdir}'" + +# workaround for EasyBuild not being found when loading "extend" module +module load EasyBuild/4.9.4 + +# load EESSI-extend/2023.06-easybuild module && verify that it is loaded +EESSI_EXTEND_MODULE="EESSI-extend/2023.06-easybuild" +module load ${EESSI_EXTEND_MODULE} +ret=$? +if [ "${ret}" -ne 0 ]; then + fatal_error "An error occured while trying to load ${EESSI_EXTEND_MODULE}\n" +fi + +# do a 'eb --dry-run-short' with the EASYSTACKFILE and determine list of packages +# to be installed +echo ">> Determining if packages specified in ${EASYSTACKFILE} are missing under ${EESSI_SITE_INSTALL}" +eb_dry_run_short_out=${tmpdir}/eb_dry_run_short.out +eb --dry-run-short --rebuild --easystack ${EASYSTACKFILE} 2>&1 | tee ${eb_dry_run_short_out} +ret=$? + +# Check if CUDA shall be installed +cuda_install_needed=0 +cat ${eb_dry_run_short_out} | grep "^ \* \[[xR]\]" | grep "module: CUDA/" +ret=$? 
+if [ "${ret}" -eq 0 ]; then + cuda_install_needed=1 +fi + +# Make sure the CUDA EULA is accepted if it shall be installed +if [ "${cuda_install_needed}" -eq 1 ] && [ "${eula_accepted}" -ne 1 ]; then + show_help + error="\nCUDA shall be installed. However, the CUDA EULA has not been accepted\nYou _must_ accept the CUDA EULA via the appropriate command line option.\n" + fatal_error "${error}" +fi + +# determine the number of packages to be installed (assume 5 GB + num_packages * +# 3GB space needed) +number_of_packages=$(cat ${eb_dry_run_short_out} | grep "^ \* \[[xR]\]" | sed -e 's/^.*module: //' | uniq | wc -l) +echo "number of packages to be (re-)installed: '${number_of_packages}'" +base_storage_space=$((5000000 + ${number_of_packages} * 3000000)) + +required_space_in_tmpdir=${base_storage_space} +# Let's see if we have sources and build locations defined if not, we use the temporary space +if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then + export EASYBUILD_BUILDPATH=${tmpdir}/build + required_space_in_tmpdir=$((required_space_in_tmpdir + ${base_storage_space})) +fi +if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then + export EASYBUILD_SOURCEPATH=${tmpdir}/sources + required_space_in_tmpdir=$((required_space_in_tmpdir + ${base_storage_space})) +fi + +# The install is pretty fat, you need lots of space for download/unpack/install +# (~3*${base_storage_space}*1000 Bytes), +# need to do a space check before we proceed +avail_space=$(df --output=avail "${EESSI_SITE_INSTALL}"/ | tail -n 1 | awk '{print $1}') +min_disk_storage=$((3 * ${base_storage_space})) +if (( avail_space < ${min_disk_storage} )); then + fatal_error "Need at least $(echo "${min_disk_storage} / 1000000" | bc) GB disk space to install CUDA and other libraries under ${EESSI_SITE_INSTALL}, exiting now..." 
+fi +avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') +if (( avail_space < required_space_in_tmpdir )); then + error="Need at least $(echo "${required_space_in_tmpdir} / 1000000" | bc) temporary disk space under ${tmpdir}.\n" + error="${error}Set the environment variable TEMP_DIR to a location with adequate space to pass this check." + error="${error}You can alternatively set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH" + error="${error}to reduce this requirement. Exiting now..." + fatal_error "${error}" +fi + +# Brief explanation of parameters: +# - prefix: using $tmpdir as default base directory for several EB settings +# - rebuild: we need the --rebuild option, as the CUDA module may or may not be on the +# `MODULEPATH` yet. Even if it is, we still want to redo this installation +# since it will provide the symlinked targets for the parts of the CUDA +# and/or other installation in the `.../versions/...` prefix +# - installpath-modules: We install the module in our `tmpdir` since we do not need the modulefile, +# we only care about providing the targets for the symlinks. +# - ${cuda_arg}: We only set the --accept-eula-for=CUDA option if CUDA will be installed and if +# this script was called with the argument --accept-cuda-eula. +# - hooks: We don't want hooks used in this install, we need vanilla +# installations of CUDA and/or other libraries +# - easystack: Path to easystack file that defines which packages shall be +# installed +cuda_arg= +if [[ ${eula_accepted} -eq 1 ]]; then + cuda_arg="--accept-eula-for=CUDA" +fi +touch "$tmpdir"/none.py +eb --prefix="$tmpdir" \ + --rebuild \ + --installpath-modules=${tmpdir} \ + "${cuda_arg}" \ + --hooks="$tmpdir"/none.py \ + --easystack ${EASYSTACKFILE} +ret=$? +if [ $ret -ne 0 ]; then + eb_last_log=$(unset EB_VERBOSE; eb --last-log) + cp -a ${eb_last_log} . + fatal_error "some installation failed, please check EasyBuild logs $(basename ${eb_last_log})..." 
+else + echo_green "all installations at ${EESSI_SITE_INSTALL}/software/... succeeded!" +fi +# clean up tmpdir +rm -rf "${tmpdir}" From 6824d75253b3ced2b4ce00f5a6f1fe5548bcbe41 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 3 Oct 2024 15:00:33 +0200 Subject: [PATCH 04/47] use post_postproc hook to convert some cuDNN files to symlinks --- eb_hooks.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/eb_hooks.py b/eb_hooks.py index 25eefaf27a..ce99ed1dfe 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -763,13 +763,7 @@ def post_postproc_cuda(self, *args, **kwargs): raise EasyBuildError("CUDA-specific hook triggered for non-CUDA easyconfig?!") -def post_sanitycheck_hook(self, *args, **kwargs): - """Main post-sanitycheck hook: trigger custom functions based on software name.""" - if self.name in POST_SANITYCHECK_HOOKS: - POST_SANITYCHECK_HOOKS[self.name](self, *args, **kwargs) - - -def post_sanitycheck_cudnn(self, *args, **kwargs): +def post_postproc_cudnn(self, *args, **kwargs): """ Remove files from cuDNN installation that we are not allowed to ship, and replace them with a symlink to a corresponding installation under host_injections. 
@@ -956,8 +950,5 @@ def inject_gpu_property(ec): POST_POSTPROC_HOOKS = { 'CUDA': post_postproc_cuda, -} - -POST_SANITYCHECK_HOOKS = { - 'cuDNN': post_sanitycheck_cudnn, + 'cuDNN': post_postproc_cudnn, } From 044e168572f110fc9420336ee868fe13df538bba Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 3 Oct 2024 20:46:32 +0200 Subject: [PATCH 05/47] explain idea for extension_based and reformat its definition --- eb_hooks.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/eb_hooks.py b/eb_hooks.py index ce99ed1dfe..2e4b41fe2a 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -804,7 +804,19 @@ def replace_non_distributable_files_with_symlinks(log, install_dir, package, all """ Replace files that cannot be distributed with symlinks into host_injections """ - extension_based = { "CUDA": False, "cuDNN": True } + # Different packages use different ways to specify which files or file + # 'types' may be redistributed. For CUDA, the 'EULA.txt' lists full file + # names. For cuDNN, the 'LICENSE' lists file endings/suffixes (e.g., '.so') + # that can be redistributed. + # The map 'extension_based' defines which of these two ways are employed. If + # full file names are used it maps a package name (key) to False (value). If + # endings/suffixes are used, it maps a package name to True. Later we can + # easily use this data structure to employ the correct method for + # postprocessing an installation. 
+ extension_based = { + "CUDA": False, + "cuDNN": True, + } if not package in extension_based: raise EasyBuildError("Don't know how to strip non-distributable files from package %s.", package) From f983fed3d06514da5534952d5588786273e0339f Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 3 Oct 2024 20:57:54 +0200 Subject: [PATCH 06/47] explain why we need to obtain the extension and improve cond expr --- eb_hooks.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/eb_hooks.py b/eb_hooks.py index 2e4b41fe2a..6b3acaba29 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -828,9 +828,13 @@ def replace_non_distributable_files_with_symlinks(log, install_dir, package, all if not os.path.islink(full_path): # check if the current file name stub is part of the allowlist basename = filename.split('.')[0] - if extension_based[package]: - if '.' in filename: - extension = '.' + filename.split('.')[1] + if extension_based[package] and '.' in filename: + # if the allowlist only contains extensions, we have to + # determine that from filename. we assume the extension is + # the second element when splitting the filename at dots + # (e.g., for 'libcudnn_adv_infer.so.8.9.2' the extension + # would be '.so') + extension = '.' + filename.split('.')[1] if basename in allowlist: log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path) elif extension_based[package] and '.' 
in filename and extension in allowlist: From 6e95efdb246fc4cfa43b14675170304f2c3ff5ae Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 3 Oct 2024 21:03:01 +0200 Subject: [PATCH 07/47] use local var for conditional expression + slightly reorder code --- eb_hooks.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/eb_hooks.py b/eb_hooks.py index 6b3acaba29..144f5b1333 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -826,18 +826,19 @@ def replace_non_distributable_files_with_symlinks(log, install_dir, package, all full_path = os.path.join(dir_path, filename) # we only really care about real files, i.e. not symlinks if not os.path.islink(full_path): - # check if the current file name stub is part of the allowlist - basename = filename.split('.')[0] - if extension_based[package] and '.' in filename: + check_by_extension = extension_based[package] and '.' in filename + if check_by_extension: # if the allowlist only contains extensions, we have to # determine that from filename. we assume the extension is # the second element when splitting the filename at dots # (e.g., for 'libcudnn_adv_infer.so.8.9.2' the extension # would be '.so') extension = '.' + filename.split('.')[1] + # check if the current file name stub or its extension is part of the allowlist + basename = filename.split('.')[0] if basename in allowlist: log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path) - elif extension_based[package] and '.' 
in filename and extension in allowlist: + elif check_by_extension and extension in allowlist: log.debug("%s is found in allowlist, so keeping it: %s", extension, full_path) else: if extension_based[package]: From 21916fd4e12ff7f880483e6e0f2c107288c6f361 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 3 Oct 2024 21:05:54 +0200 Subject: [PATCH 08/47] code golf --- eb_hooks.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/eb_hooks.py b/eb_hooks.py index 144f5b1333..61c96b921e 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -841,10 +841,7 @@ def replace_non_distributable_files_with_symlinks(log, install_dir, package, all elif check_by_extension and extension in allowlist: log.debug("%s is found in allowlist, so keeping it: %s", extension, full_path) else: - if extension_based[package]: - print_name = filename - else: - print_name = basename + print_name = filename if extension_based[package] else basename log.debug("%s is not found in allowlist, so replacing it with symlink: %s", print_name, full_path) # the host_injections path is under a fixed repo/location for CUDA or cuDNN From bf2684685070a199766f72fccfe0825a8ace573e Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 3 Oct 2024 21:16:41 +0200 Subject: [PATCH 09/47] improve comment (also anticipating additional cu* libraries in the future) --- eb_hooks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/eb_hooks.py b/eb_hooks.py index 61c96b921e..952b518d6b 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -846,9 +846,9 @@ def replace_non_distributable_files_with_symlinks(log, install_dir, package, all print_name, full_path) # the host_injections path is under a fixed repo/location for CUDA or cuDNN host_inj_path = re.sub(EESSI_INSTALLATION_REGEX, HOST_INJECTIONS_LOCATION, full_path) - # CUDA and cuDNN itself don't care about compute capability so remove this duplication from - # under host_injections (symlink to a single CUDA or cuDNN installation for all 
compute - # capabilities) + # CUDA and cu* libraries themselves don't care about compute capability so remove this + # duplication from under host_injections (symlink to a single CUDA or cu* library + # installation for all compute capabilities) accel_subdir = os.getenv("EESSI_ACCELERATOR_TARGET") if accel_subdir: host_inj_path = host_inj_path.replace("/accel/%s" % accel_subdir, '') From e1ba74ffa959316521461e448fe8370ed307c4c9 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 3 Oct 2024 21:21:20 +0200 Subject: [PATCH 10/47] improve parameter name --- eb_hooks.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/eb_hooks.py b/eb_hooks.py index 952b518d6b..6a4fdfe54b 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -800,7 +800,7 @@ def post_postproc_cudnn(self, *args, **kwargs): raise EasyBuildError("cuDNN-specific hook triggered for non-cuDNN easyconfig?!") -def replace_non_distributable_files_with_symlinks(log, install_dir, package, allowlist): +def replace_non_distributable_files_with_symlinks(log, install_dir, pkg_name, allowlist): """ Replace files that cannot be distributed with symlinks into host_injections """ @@ -817,8 +817,8 @@ def replace_non_distributable_files_with_symlinks(log, install_dir, package, all "CUDA": False, "cuDNN": True, } - if not package in extension_based: - raise EasyBuildError("Don't know how to strip non-distributable files from package %s.", package) + if not pkg_name in extension_based: + raise EasyBuildError("Don't know how to strip non-distributable files from package %s.", pkg_name) # iterate over all files in the package installation directory for dir_path, _, files in os.walk(install_dir): @@ -826,7 +826,7 @@ def replace_non_distributable_files_with_symlinks(log, install_dir, package, all full_path = os.path.join(dir_path, filename) # we only really care about real files, i.e. not symlinks if not os.path.islink(full_path): - check_by_extension = extension_based[package] and '.' 
in filename + check_by_extension = extension_based[pkg_name] and '.' in filename if check_by_extension: # if the allowlist only contains extensions, we have to # determine that from filename. we assume the extension is @@ -841,7 +841,7 @@ def replace_non_distributable_files_with_symlinks(log, install_dir, package, all elif check_by_extension and extension in allowlist: log.debug("%s is found in allowlist, so keeping it: %s", extension, full_path) else: - print_name = filename if extension_based[package] else basename + print_name = filename if extension_based[pkg_name] else basename log.debug("%s is not found in allowlist, so replacing it with symlink: %s", print_name, full_path) # the host_injections path is under a fixed repo/location for CUDA or cuDNN From 1f1eada7ba6535c1eb37a1cac6c249222d2700c0 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 3 Oct 2024 21:30:42 +0200 Subject: [PATCH 11/47] explain use of rstrip --- eb_hooks.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/eb_hooks.py b/eb_hooks.py index 6a4fdfe54b..0e9bbcfc90 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -786,8 +786,11 @@ def post_postproc_cudnn(self, *args, **kwargs): # remove search string, split into words, remove trailing # dots '.' and only retain words starting with a dot '.' distributable = line[len(search_string):] + # distributable looks like ' the runtime files .so and .dll.' + # note the '.' after '.dll' for word in distributable.split(): if word[0] == '.': + # rstrip is used to remove the '.' 
after '.dll' allowlist.append(word.rstrip('.')) allowlist = sorted(set(allowlist)) From fc00d0cf758e6f4b250cbbd3e33a2db53f046740 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 3 Oct 2024 21:45:53 +0200 Subject: [PATCH 12/47] raise error if search string wasn't found --- eb_hooks.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/eb_hooks.py b/eb_hooks.py index 0e9bbcfc90..0f3f1a593d 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -780,9 +780,11 @@ def post_postproc_cudnn(self, *args, **kwargs): # read cuDNN LICENSE, construct allowlist based on section "2. Distribution" that specifies list of files that can be shipped license_path = os.path.join(self.installdir, 'LICENSE') search_string = "2. Distribution. The following portions of the SDK are distributable under the Agreement:" + found_search_string = False with open(license_path) as infile: for line in infile: if line.strip().startswith(search_string): + found_search_string = True # remove search string, split into words, remove trailing # dots '.' and only retain words starting with a dot '.' distributable = line[len(search_string):] @@ -792,6 +794,11 @@ def post_postproc_cudnn(self, *args, **kwargs): if word[0] == '.': # rstrip is used to remove the '.' 
after '.dll' allowlist.append(word.rstrip('.')) + if not found_search_string: + # search string wasn't found in LICENSE file + raise EasyBuildError("search string '%s' was not found in license file '%s';" + "hence installation may be replaced by symlinks only", + search_string, license_path) allowlist = sorted(set(allowlist)) self.log.info("Allowlist for files in cuDNN installation that can be redistributed: " + ', '.join(allowlist)) From e968608a8d9478468315fc90dc373dcdf875a28e Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 3 Oct 2024 21:52:17 +0200 Subject: [PATCH 13/47] improved docstring --- eb_hooks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eb_hooks.py b/eb_hooks.py index 0f3f1a593d..85cb739391 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -873,8 +873,8 @@ def replace_non_distributable_files_with_symlinks(log, install_dir, pkg_name, al def inject_gpu_property(ec): """ - Add 'gpu' property EESSIVERSION envvars and drop dependencies to - build dependencies, via modluafooter easyconfig parameter + Add 'gpu' property and EESSIVERSION envvars via modluafooter + easyconfig parameter, and drop dependencies to build dependencies """ ec_dict = ec.asdict() # Check if CUDA, cuDNN, you-name-it is in the dependencies, if so From 97d5b67f0d0306f084039062d8ce17ea2127c3f4 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Fri, 4 Oct 2024 20:24:53 +0200 Subject: [PATCH 14/47] use TMPDIR as base for temporary storage --- EESSI-install-software.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index d54da4a404..7821655c07 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -249,10 +249,12 @@ else export skip_cuda_install=True fi +temp_install_storage=${TMPDIR}/temp_install_storage +mkdir -p ${temp_install_storage} if [ -z "${skip_cuda_install}" ] || [ ! 
"${skip_cuda_install}" ]; then ${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh \ -e ${EESSI_PREFIX}/scripts/gpu_support/nvidia/eessi-2023.06-cuda-and-libraries.yml \ - -t /tmp/temp \ + -t ${temp_install_storage} \ --accept-cuda-eula else echo "Skipping installation of CUDA SDK and cu* libraries in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found" From 8e7a0e80ab3cefa1f3c0c2b7fbce71f2206f6102 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Fri, 4 Oct 2024 20:47:02 +0200 Subject: [PATCH 15/47] attempt to use a single easystack file for CUDA/cu* packages --- EESSI-install-software.sh | 7 ++++++- .../nvidia/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml | 1 + install_scripts.sh | 12 +++++++++++- .../nvidia/eessi-2023.06-cuda-and-libraries.yml | 3 --- 4 files changed, 18 insertions(+), 5 deletions(-) delete mode 100644 scripts/gpu_support/nvidia/eessi-2023.06-cuda-and-libraries.yml diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 7821655c07..4c85fdfc31 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -251,9 +251,14 @@ fi temp_install_storage=${TMPDIR}/temp_install_storage mkdir -p ${temp_install_storage} +# Note the eessi...CUDA.yml file(s) is(are) copied by 'install_scripts.sh' from +# the EESSI/software-layer easystacks/software.eessi.io/2023.06/accel/nvidia +# directory to /cvmfs to avoid keeping them in sync manually. If more than one +# such file is used (e.g., because different EasyBuild versions were used), the +# install script 'install_cuda_and_libraries.sh' has to be run multiple times. if [ -z "${skip_cuda_install}" ] || [ ! 
"${skip_cuda_install}" ]; then ${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh \ - -e ${EESSI_PREFIX}/scripts/gpu_support/nvidia/eessi-2023.06-cuda-and-libraries.yml \ + -e ${EESSI_PREFIX}/scripts/gpu_support/nvidia/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml \ -t ${temp_install_storage} \ --accept-cuda-eula else diff --git a/easystacks/software.eessi.io/2023.06/accel/nvidia/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml b/easystacks/software.eessi.io/2023.06/accel/nvidia/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml index d54780804b..873c19aa33 100644 --- a/easystacks/software.eessi.io/2023.06/accel/nvidia/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml +++ b/easystacks/software.eessi.io/2023.06/accel/nvidia/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml @@ -1,2 +1,3 @@ easyconfigs: + - CUDA-12.1.1.eb - cuDNN-8.9.2.26-CUDA-12.1.1.eb diff --git a/install_scripts.sh b/install_scripts.sh index df9bda3ad3..ad73e769dd 100755 --- a/install_scripts.sh +++ b/install_scripts.sh @@ -122,13 +122,23 @@ copy_files_by_list ${TOPDIR}/scripts ${INSTALL_PREFIX}/scripts "${script_files[@ # Copy files for the scripts/gpu_support/nvidia directory nvidia_files=( - eessi-2023.06-cuda-and-libraries.yml install_cuda_and_libraries.sh install_cuda_host_injections.sh link_nvidia_host_libraries.sh ) copy_files_by_list ${TOPDIR}/scripts/gpu_support/nvidia ${INSTALL_PREFIX}/scripts/gpu_support/nvidia "${nvidia_files[@]}" +# special treatment for the easystack file(s) that lists CUDA and cu* libraries +# To be picked up by a build job they have to be stored under +# easystacks/software.eessi.io/2023.06/accel/nvidia/ on GitHub. +# To avoid keeping that file and the one that we distribute via CernVM-FS so +# users/sites can install the full CUDA SDK and cu* libraries under +# 'host_injections' we copy the above file to the right location under /cvmfs. 
+nvidia_host_injections_files=( + eessi-2023.06-eb-4.9.4-2023a-CUDA.yml +) +copy_files_by_list ${TOPDIR}/easystacks/software.eessi.io/2023.06/accel/nvidia ${INSTALL_PREFIX}/scripts/gpu_support/nvidia "${nvidia_host_injections_files[@]}" + # Copy over EasyBuild hooks file used for installations hook_files=( eb_hooks.py diff --git a/scripts/gpu_support/nvidia/eessi-2023.06-cuda-and-libraries.yml b/scripts/gpu_support/nvidia/eessi-2023.06-cuda-and-libraries.yml deleted file mode 100644 index e0e47bf2d8..0000000000 --- a/scripts/gpu_support/nvidia/eessi-2023.06-cuda-and-libraries.yml +++ /dev/null @@ -1,3 +0,0 @@ -easyconfigs: - - CUDA-12.1.1.eb - - cuDNN-8.9.2.26-CUDA-12.1.1.eb From 57f5a485796a859bada8751f70319319f4e27e75 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Fri, 4 Oct 2024 21:19:40 +0200 Subject: [PATCH 16/47] various improvements for inject_gpu_property --- eb_hooks.py | 49 +++++++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/eb_hooks.py b/eb_hooks.py index 85cb739391..3179ac170f 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -881,39 +881,44 @@ def inject_gpu_property(ec): # - drop dependency to build dependency # - add 'gpu' Lmod property # - add envvar with package version - packages_list = ( "CUDA", "cuDNN" ) - packages_version = { } + pkg_names = ( "CUDA", "cuDNN" ) + pkg_versions = { } add_gpu_property = '' - for package in packages_list: - # Check if package is in the dependencies, if so drop dependency to build + for pkg_name in pkg_names: + # Check if pkg_name is in the dependencies, if so drop dependency to build # dependency and set variable for later adding the 'gpu' Lmod property - if (package in [dep[0] for dep in iter(ec_dict['dependencies'])]): + # to '.remove' dependencies from ec_dict['dependencies'] we make a copy, + # iterate over the copy and can then safely use '.remove' on the original + # ec_dict['dependencies'].
+ deps = ec_dict['dependencies'][:] + if (pkg_name in [dep[0] for dep in deps]): add_gpu_property = 'add_property("arch","gpu")' - for dep in iter(ec_dict['dependencies']): - if package in dep[0]: - # make package a build dependency only (rpathing saves us from link errors) - ec.log.info("Dropping dependency on %s to build dependency" % package) + for dep in deps: + if pkg_name == dep[0]: + # make pkg_name a build dependency only (rpathing saves us from link errors) + ec.log.info("Dropping dependency on %s to build dependency" % pkg_name) ec_dict['dependencies'].remove(dep) if dep not in ec_dict['builddependencies']: ec_dict['builddependencies'].append(dep) # take note of version for creating the modluafooter - packages_version[package] = dep[1] + pkg_versions[pkg_name] = dep[1] if add_gpu_property: ec.log.info("Injecting gpu as Lmod arch property and envvars for dependencies with their version") - key = 'modluafooter' - values = [add_gpu_property] - for package, version in packages_version.items(): - envvar = "EESSI%sVERSION" % package.upper() - values.append('setenv("%s","%s")' % (envvar, version)) - if not key in ec_dict: - ec[key] = '\n'.join(values) + modluafooter = 'modluafooter' + extra_mod_footer_lines = [add_gpu_property] + for pkg_name, version in pkg_versions.items(): + envvar = "EESSI%sVERSION" % pkg_name.upper() + extra_mod_footer_lines.append('setenv("%s","%s")' % (envvar, version)) + # take into account that modluafooter may already be set + if modluafooter in ec_dict: + value = ec_dict[modluafooter] + for line in extra_mod_footer_lines: + if not line in value: + value = '\n'.join([value, line]) + ec[modluafooter] = value else: - new_value = ec_dict[key] - for value in values: - if not value in new_value: - new_value = '\n'.join([new_value, value]) - ec[key] = new_value + ec[modluafooter] = '\n'.join(extra_mod_footer_lines) return ec From 21ffc180068a967f28ad9cbd4b3d4e6f60b4407c Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Fri, 4 Oct 2024 
22:02:53 +0200 Subject: [PATCH 17/47] various improvements for install_cuda_and_libraries.sh --- .../nvidia/install_cuda_and_libraries.sh | 37 ++++++++++--------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index 2fea64d7a6..e13b9ad386 100755 --- a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -27,7 +27,7 @@ show_help() { echo " --accept-cuda-eula You _must_ accept the CUDA EULA to install" echo " CUDA, see the EULA at" echo " https://docs.nvidia.com/cuda/eula/index.html" - echo " -e, --easystack EASYSTACKFILE Path to easystack file that defines which" + echo " -e, --easystack EASYSTACK_FILE Path to easystack file that defines which" echo " packages shall be installed" echo " -t, --temp-dir /path/to/tmpdir Specify a location to use for temporary" echo " storage during the installation of CUDA" @@ -37,7 +37,7 @@ show_help() { # Initialize variables eula_accepted=0 -EASYSTACKFILE= +EASYSTACK_FILE= TEMP_DIR= # Parse command-line options @@ -53,7 +53,7 @@ while [[ $# -gt 0 ]]; do ;; -e|--easystack) if [ -n "$2" ]; then - EASYSTACKFILE="$2" + EASYSTACK_FILE="$2" shift 2 else echo "Error: Argument required for $1" @@ -78,7 +78,7 @@ while [[ $# -gt 0 ]]; do esac done -if [[ -z "${EASYSTACKFILE}" ]]; then +if [[ -z "${EASYSTACK_FILE}" ]]; then fatal_error "Need the name/path to an easystack file. See command line options\n" fi @@ -102,21 +102,24 @@ fi echo "Created temporary directory '${tmpdir}'" # workaround for EasyBuild not being found when loading "extend" module -module load EasyBuild/4.9.4 +# module load EasyBuild/4.9.4 # load EESSI-extend/2023.06-easybuild module && verify that it is loaded -EESSI_EXTEND_MODULE="EESSI-extend/2023.06-easybuild" +EESSI_EXTEND_MODULE="EESSI-extend/${EESSI_VERSION}-easybuild" module load ${EESSI_EXTEND_MODULE} ret=$? 
if [ "${ret}" -ne 0 ]; then fatal_error "An error occured while trying to load ${EESSI_EXTEND_MODULE}\n" fi -# do a 'eb --dry-run-short' with the EASYSTACKFILE and determine list of packages +# show EasyBuild configuration +eb --show-config + +# do a 'eb --dry-run-short' with the EASYSTACK_FILE and determine list of packages # to be installed -echo ">> Determining if packages specified in ${EASYSTACKFILE} are missing under ${EESSI_SITE_INSTALL}" +echo ">> Determining if packages specified in ${EASYSTACK_FILE} are missing under ${EESSI_SITE_INSTALL}" eb_dry_run_short_out=${tmpdir}/eb_dry_run_short.out -eb --dry-run-short --rebuild --easystack ${EASYSTACKFILE} 2>&1 | tee ${eb_dry_run_short_out} +eb --dry-run-short --rebuild --easystack ${EASYSTACK_FILE} 2>&1 | tee ${eb_dry_run_short_out} ret=$? # Check if CUDA shall be installed @@ -136,7 +139,7 @@ fi # determine the number of packages to be installed (assume 5 GB + num_packages * # 3GB space needed) -number_of_packages=$(cat ${eb_dry_run_short_out} | grep "^ \* \[[xR]\]" | sed -e 's/^.*module: //' | uniq | wc -l) +number_of_packages=$(cat ${eb_dry_run_short_out} | grep "^ \* \[[xR]\]" | sed -e 's/^.*module: //' | sort -u | wc -l) echo "number of packages to be (re-)installed: '${number_of_packages}'" base_storage_space=$((5000000 + ${number_of_packages} * 3000000)) @@ -176,28 +179,28 @@ fi # and/or other installation in the `.../versions/...` prefix # - installpath-modules: We install the module in our `tmpdir` since we do not need the modulefile, # we only care about providing the targets for the symlinks. -# - ${cuda_arg}: We only set the --accept-eula-for=CUDA option if CUDA will be installed and if +# - ${accept_eula_opt}: We only set the --accept-eula-for=CUDA option if CUDA will be installed and if # this script was called with the argument --accept-cuda-eula. 
# - hooks: We don't want hooks used in this install, we need vanilla # installations of CUDA and/or other libraries # - easystack: Path to easystack file that defines which packages shall be # installed -cuda_arg= +accept_eula_opt= if [[ ${eula_accepted} -eq 1 ]]; then - cuda_arg="--accept-eula-for=CUDA" + accept_eula_opt="--accept-eula-for=CUDA" fi touch "$tmpdir"/none.py eb --prefix="$tmpdir" \ --rebuild \ - --installpath-modules=${tmpdir} \ - "${cuda_arg}" \ + --installpath-modules=${tmpdir}/modules \ + "${accept_eula_opt}" \ --hooks="$tmpdir"/none.py \ - --easystack ${EASYSTACKFILE} + --easystack ${EASYSTACK_FILE} ret=$? if [ $ret -ne 0 ]; then eb_last_log=$(unset EB_VERBOSE; eb --last-log) cp -a ${eb_last_log} . - fatal_error "some installation failed, please check EasyBuild logs $(basename ${eb_last_log})..." + fatal_error "some installation failed, please check EasyBuild logs ${PWD}/$(basename ${eb_last_log})..." else echo_green "all installations at ${EESSI_SITE_INSTALL}/software/... succeeded!" 
fi From a3edc20e439f4916283da0421727efb470d729c0 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Fri, 4 Oct 2024 22:28:00 +0200 Subject: [PATCH 18/47] show available *CUDA* modules for easier debugging --- scripts/gpu_support/nvidia/install_cuda_and_libraries.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index e13b9ad386..7a8d1d74ba 100755 --- a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -101,8 +101,8 @@ else fi echo "Created temporary directory '${tmpdir}'" -# workaround for EasyBuild not being found when loading "extend" module -# module load EasyBuild/4.9.4 +echo "List available *CUDA* modules before loading EESSI-extend/${EESSI_VERSION}-easybuild" +module avail CUDA # load EESSI-extend/2023.06-easybuild module && verify that it is loaded EESSI_EXTEND_MODULE="EESSI-extend/${EESSI_VERSION}-easybuild" @@ -112,7 +112,11 @@ if [ "${ret}" -ne 0 ]; then fatal_error "An error occured while trying to load ${EESSI_EXTEND_MODULE}\n" fi +echo "List available *CUDA* modules after loading EESSI-extend/${EESSI_VERSION}-easybuild" +module avail CUDA + # show EasyBuild configuration +echo "Show EasyBuild configuration" eb --show-config # do a 'eb --dry-run-short' with the EASYSTACK_FILE and determine list of packages From 7f601dc60a1ec221eed71a874e1a30d48bfb3cce Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Fri, 4 Oct 2024 22:50:01 +0200 Subject: [PATCH 19/47] print and adjust MODULEPATH --- scripts/gpu_support/nvidia/install_cuda_and_libraries.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index 7a8d1d74ba..b36eeafd29 100755 --- a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ 
b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -101,6 +101,7 @@ else fi echo "Created temporary directory '${tmpdir}'" +echo "MODULEPATH=${MODULEPATH}" echo "List available *CUDA* modules before loading EESSI-extend/${EESSI_VERSION}-easybuild" module avail CUDA @@ -112,9 +113,15 @@ if [ "${ret}" -ne 0 ]; then fatal_error "An error occured while trying to load ${EESSI_EXTEND_MODULE}\n" fi +echo "MODULEPATH=${MODULEPATH}" echo "List available *CUDA* modules after loading EESSI-extend/${EESSI_VERSION}-easybuild" module avail CUDA +# use install_path/modules/all as MODULEPATH +SAVE_MODULEPATH=${MODULEPATH} +MODULEPATH=${EASYBUILD_INSTALLPATH}/modules/all +echo "set MODULEPATH=${MODULEPATH}" + # show EasyBuild configuration echo "Show EasyBuild configuration" eb --show-config From e3101b0c82d554823b33d9cab2abee074e330222 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Fri, 4 Oct 2024 23:26:47 +0200 Subject: [PATCH 20/47] implement option 3 to install module files in hidden directory --- scripts/gpu_support/nvidia/install_cuda_and_libraries.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index b36eeafd29..40bff34bb7 100755 --- a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -119,7 +119,7 @@ module avail CUDA # use install_path/modules/all as MODULEPATH SAVE_MODULEPATH=${MODULEPATH} -MODULEPATH=${EASYBUILD_INSTALLPATH}/modules/all +MODULEPATH=${EASYBUILD_INSTALLPATH}/.modules/all echo "set MODULEPATH=${MODULEPATH}" # show EasyBuild configuration @@ -150,7 +150,7 @@ fi # determine the number of packages to be installed (assume 5 GB + num_packages * # 3GB space needed) -number_of_packages=$(cat ${eb_dry_run_short_out} | grep "^ \* \[[xR]\]" | sed -e 's/^.*module: //' | sort -u | wc -l) +number_of_packages=$(cat ${eb_dry_run_short_out} | 
grep "^ \* \[[ ]\]" | sed -e 's/^.*module: //' | sort -u | wc -l) echo "number of packages to be (re-)installed: '${number_of_packages}'" base_storage_space=$((5000000 + ${number_of_packages} * 3000000)) @@ -203,7 +203,7 @@ fi touch "$tmpdir"/none.py eb --prefix="$tmpdir" \ --rebuild \ - --installpath-modules=${tmpdir}/modules \ + --installpath-modules=${EASYBUILD_INSTALLPATH}/.modules \ "${accept_eula_opt}" \ --hooks="$tmpdir"/none.py \ --easystack ${EASYSTACK_FILE} From e9018fb784d41983235768212efd7df88765bfc2 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 9 Oct 2024 16:34:38 +0200 Subject: [PATCH 21/47] Move to gpu_support/nvidia subdir --- .../eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 scripts/gpu_support/nvidia/eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml diff --git a/scripts/gpu_support/nvidia/eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml b/scripts/gpu_support/nvidia/eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml new file mode 100644 index 0000000000..24be8ddbc3 --- /dev/null +++ b/scripts/gpu_support/nvidia/eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml @@ -0,0 +1,5 @@ +# This EasyStack provides a list of all the EasyConfigs that should be installed in host_injections +# because they cannot (fully) be shipped as part of EESSI due to license constraints +easyconfigs: + - CUDA-12.1.1.eb + - cuDNN-8.9.2.26-CUDA-12.1.1.eb From 54753c33bb79605a5c13e68726e795de4ad6b23f Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 9 Oct 2024 16:40:51 +0200 Subject: [PATCH 22/47] Make comment more explicit that this is only about nvidia GPU support --- .../eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/gpu_support/nvidia/eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml b/scripts/gpu_support/nvidia/eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml index 
24be8ddbc3..2a6f9bd6f9 100644 --- a/scripts/gpu_support/nvidia/eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml +++ b/scripts/gpu_support/nvidia/eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml @@ -1,5 +1,5 @@ -# This EasyStack provides a list of all the EasyConfigs that should be installed in host_injections -# because they cannot (fully) be shipped as part of EESSI due to license constraints +# This EasyStack provides a list of all the EasyConfigs that should be installed in host_injections +# for nvidia GPU support, because they cannot (fully) be shipped as part of EESSI due to license constraints easyconfigs: - CUDA-12.1.1.eb - cuDNN-8.9.2.26-CUDA-12.1.1.eb From 2b76a54c0b1c4f1256679d6d1825c5590939d90b Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 9 Oct 2024 17:12:41 +0200 Subject: [PATCH 23/47] Moved easystack file --- .../eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename scripts/gpu_support/nvidia/{ => easystacks}/eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml (100%) diff --git a/scripts/gpu_support/nvidia/eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml b/scripts/gpu_support/nvidia/easystacks/eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml similarity index 100% rename from scripts/gpu_support/nvidia/eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml rename to scripts/gpu_support/nvidia/easystacks/eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml From 086ba5a570e70426d4bd52119247e094f1f9cabc Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 9 Oct 2024 17:14:37 +0200 Subject: [PATCH 24/47] Change how EESSI_SITE_INSTALL is used --- .../nvidia/install_cuda_and_libraries.sh | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index 40bff34bb7..7cbf253256 100755 --- 
a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -85,9 +85,8 @@ fi # Make sure EESSI is initialised check_eessi_initialised -# As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` -# (CUDA is a binary installation so no need to worry too much about the EasyBuild setup) -export EESSI_SITE_INSTALL=${EESSI_SOFTWARE_PATH/versions/host_injections} +# Make sure that `EESSI-extend` will install in the site installation path EESSI_SITE_SOFTWARE_PATH +export EESSI_SITE_INSTALL=1 # we need a directory we can use for temporary storage if [[ -z "${TEMP_DIR}" ]]; then @@ -128,7 +127,7 @@ eb --show-config # do a 'eb --dry-run-short' with the EASYSTACK_FILE and determine list of packages # to be installed -echo ">> Determining if packages specified in ${EASYSTACK_FILE} are missing under ${EESSI_SITE_INSTALL}" +echo ">> Determining if packages specified in ${EASYSTACK_FILE} are missing under ${EESSI_SITE_SOFTWARE_PATH}" eb_dry_run_short_out=${tmpdir}/eb_dry_run_short.out eb --dry-run-short --rebuild --easystack ${EASYSTACK_FILE} 2>&1 | tee ${eb_dry_run_short_out} ret=$? @@ -168,10 +167,10 @@ fi # The install is pretty fat, you need lots of space for download/unpack/install # (~3*${base_storage_space}*1000 Bytes), # need to do a space check before we proceed -avail_space=$(df --output=avail "${EESSI_SITE_INSTALL}"/ | tail -n 1 | awk '{print $1}') +avail_space=$(df --output=avail "${EESSI_SITE_SOFTWARE_PATH}"/ | tail -n 1 | awk '{print $1}') min_disk_storage=$((3 * ${base_storage_space})) if (( avail_space < ${min_disk_storage} )); then - fatal_error "Need at least $(echo "${min_disk_storage} / 1000000" | bc) GB disk space to install CUDA and other libraries under ${EESSI_SITE_INSTALL}, exiting now..." 
+ fatal_error "Need at least $(echo "${min_disk_storage} / 1000000" | bc) GB disk space to install CUDA and other libraries under ${EESSI_SITE_SOFTWARE_PATH}, exiting now..." fi avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') if (( avail_space < required_space_in_tmpdir )); then @@ -213,7 +212,7 @@ if [ $ret -ne 0 ]; then cp -a ${eb_last_log} . fatal_error "some installation failed, please check EasyBuild logs ${PWD}/$(basename ${eb_last_log})..." else - echo_green "all installations at ${EESSI_SITE_INSTALL}/software/... succeeded!" + echo_green "all installations at ${EESSI_SITE_SOFTWARE_PATH}/software/... succeeded!" fi # clean up tmpdir rm -rf "${tmpdir}" From ecd30ca4bdb9982ea42d0294b84cc60ff2165118 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 9 Oct 2024 17:50:35 +0200 Subject: [PATCH 25/47] First attempt at making this loop over EasyStack files, loading the correct easybuild version before loading eessi-extend --- .../nvidia/install_cuda_and_libraries.sh | 265 +++++++++--------- 1 file changed, 139 insertions(+), 126 deletions(-) diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index 7cbf253256..60f3bb30be 100755 --- a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -27,8 +27,9 @@ show_help() { echo " --accept-cuda-eula You _must_ accept the CUDA EULA to install" echo " CUDA, see the EULA at" echo " https://docs.nvidia.com/cuda/eula/index.html" - echo " -e, --easystack EASYSTACK_FILE Path to easystack file that defines which" - echo " packages shall be installed" + echo " --accept-cudnn-eula You _must_ accept the cuDNN EULA to install" + echo " cuDNN, see the EULA at" + echo " https://docs.nvidia.com/deeplearning/cudnn/latest/reference/eula.html" echo " -t, --temp-dir /path/to/tmpdir Specify a location to use for temporary" echo " storage during the installation 
of CUDA" echo " and/or other libraries (must have" @@ -36,7 +37,7 @@ show_help() { } # Initialize variables -eula_accepted=0 +cuda_eula_accepted=0 EASYSTACK_FILE= TEMP_DIR= @@ -48,18 +49,12 @@ while [[ $# -gt 0 ]]; do exit 0 ;; --accept-cuda-eula) - eula_accepted=1 + cuda_eula_accepted=1 shift 1 ;; - -e|--easystack) - if [ -n "$2" ]; then - EASYSTACK_FILE="$2" - shift 2 - else - echo "Error: Argument required for $1" - show_help - exit 1 - fi + --accept-cudnn-eula) + cudnn_eula_accepted=1 + shift 1 ;; -t|--temp-dir) if [ -n "$2" ]; then @@ -78,10 +73,6 @@ while [[ $# -gt 0 ]]; do esac done -if [[ -z "${EASYSTACK_FILE}" ]]; then - fatal_error "Need the name/path to an easystack file. See command line options\n" -fi - # Make sure EESSI is initialised check_eessi_initialised @@ -100,119 +91,141 @@ else fi echo "Created temporary directory '${tmpdir}'" -echo "MODULEPATH=${MODULEPATH}" -echo "List available *CUDA* modules before loading EESSI-extend/${EESSI_VERSION}-easybuild" -module avail CUDA +# use install_path/modules/all as MODULEPATH +SAVE_MODULEPATH=${MODULEPATH} -# load EESSI-extend/2023.06-easybuild module && verify that it is loaded -EESSI_EXTEND_MODULE="EESSI-extend/${EESSI_VERSION}-easybuild" -module load ${EESSI_EXTEND_MODULE} -ret=$? 
-if [ "${ret}" -ne 0 ]; then - fatal_error "An error occured while trying to load ${EESSI_EXTEND_MODULE}\n" -fi +for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*.yml; do + echo -e "Processing easystack file ${easystack_file}...\n\n" + + # determine version of EasyBuild module to load based on EasyBuild version included in name of easystack file + eb_version=$(echo ${EASYSTACK_FILE} | sed 's/.*eb-\([0-9.]*\).*/\1/g') + + # Load EasyBuild version for this easystack file _before_ loading EESSI-extend + module load EasyBuild/${eb_version} + module load EESSI-extend/${EESSI_VERSION}-easybuild + + # Install modules in hidden .modules dir to keep track of what was installed before + MODULEPATH=${EASYBUILD_INSTALLPATH}/.modules/all + echo "set MODULEPATH=${MODULEPATH}" + + # show EasyBuild configuration + echo "Show EasyBuild configuration" + eb --show-config + + # do a 'eb --dry-run-short' with the EASYSTACK_FILE and determine list of packages + # to be installed + echo ">> Determining if packages specified in ${EASYSTACK_FILE} are missing under ${EESSI_SITE_SOFTWARE_PATH}" + eb_dry_run_short_out=${tmpdir}/eb_dry_run_short.out + eb --dry-run-short --rebuild --easystack ${EASYSTACK_FILE} 2>&1 | tee ${eb_dry_run_short_out} + ret=$? + + # Check if CUDA shall be installed + cuda_install_needed=0 + cat ${eb_dry_run_short_out} | grep "^ \* \[[ ]\]" | grep "module: CUDA/" + ret=$? + if [ "${ret}" -eq 0 ]; then + cuda_install_needed=1 + fi -echo "MODULEPATH=${MODULEPATH}" -echo "List available *CUDA* modules after loading EESSI-extend/${EESSI_VERSION}-easybuild" -module avail CUDA + # Make sure the CUDA EULA is accepted if it shall be installed + if [ "${cuda_install_needed}" -eq 1 ] && [ "${cuda_eula_accepted}" -ne 1 ]; then + show_help + error="\nCUDA shall be installed. 
However, the CUDA EULA has not been accepted\nYou _must_ accept the CUDA EULA via the appropriate command line option.\n" + fatal_error "${error}" + fi -# use install_path/modules/all as MODULEPATH -SAVE_MODULEPATH=${MODULEPATH} -MODULEPATH=${EASYBUILD_INSTALLPATH}/.modules/all -echo "set MODULEPATH=${MODULEPATH}" - -# show EasyBuild configuration -echo "Show EasyBuild configuration" -eb --show-config - -# do a 'eb --dry-run-short' with the EASYSTACK_FILE and determine list of packages -# to be installed -echo ">> Determining if packages specified in ${EASYSTACK_FILE} are missing under ${EESSI_SITE_SOFTWARE_PATH}" -eb_dry_run_short_out=${tmpdir}/eb_dry_run_short.out -eb --dry-run-short --rebuild --easystack ${EASYSTACK_FILE} 2>&1 | tee ${eb_dry_run_short_out} -ret=$? - -# Check if CUDA shall be installed -cuda_install_needed=0 -cat ${eb_dry_run_short_out} | grep "^ \* \[[xR]\]" | grep "module: CUDA/" -ret=$? -if [ "${ret}" -eq 0 ]; then - cuda_install_needed=1 -fi + # Check if cuDNN shall be installed + cudnn_install_needed=0 + cat ${eb_dry_run_short_out} | grep "^ \* \[[ ]\]" | grep "module: cuDNN/" + ret=$? + if [ "${ret}" -eq 0 ]; then + cudnn_install_needed=1 + fi -# Make sure the CUDA EULA is accepted if it shall be installed -if [ "${cuda_install_needed}" -eq 1 ] && [ "${eula_accepted}" -ne 1 ]; then - show_help - error="\nCUDA shall be installed. However, the CUDA EULA has not been accepted\nYou _must_ accept the CUDA EULA via the appropriate command line option.\n" - fatal_error "${error}" -fi + # Make sure the cuDNN EULA is accepted if it shall be installed + if [ "${cudnn_install_needed}" -eq 1 ] && [ "${cudnn_eula_accepted}" -ne 1 ]; then + show_help + error="\ncuDNN shall be installed.
However, the cuDNNDA EULA has not been accepted\nYou _must_ accept the cuDNN EULA via the appropriate command line option.\n" + fatal_error "${error}" + fi -# determine the number of packages to be installed (assume 5 GB + num_packages * -# 3GB space needed) -number_of_packages=$(cat ${eb_dry_run_short_out} | grep "^ \* \[[ ]\]" | sed -e 's/^.*module: //' | sort -u | wc -l) -echo "number of packages to be (re-)installed: '${number_of_packages}'" -base_storage_space=$((5000000 + ${number_of_packages} * 3000000)) - -required_space_in_tmpdir=${base_storage_space} -# Let's see if we have sources and build locations defined if not, we use the temporary space -if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then - export EASYBUILD_BUILDPATH=${tmpdir}/build - required_space_in_tmpdir=$((required_space_in_tmpdir + ${base_storage_space})) -fi -if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then - export EASYBUILD_SOURCEPATH=${tmpdir}/sources - required_space_in_tmpdir=$((required_space_in_tmpdir + ${base_storage_space})) -fi + # determine the number of packages to be installed (assume 5 GB + num_packages * + # 3GB space needed). 
Both CUDA and cuDNN are about this size + number_of_packages=$(cat ${eb_dry_run_short_out} | grep "^ \* \[[ ]\]" | sed -e 's/^.*module: //' | sort -u | wc -l) + echo "number of packages to be (re-)installed: '${number_of_packages}'" + base_storage_space=$((5000000 + ${number_of_packages} * 3000000)) + + required_space_in_tmpdir=${base_storage_space} + # Let's see if we have sources and build locations defined if not, we use the temporary space + if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then + export EASYBUILD_BUILDPATH=${tmpdir}/build + required_space_in_tmpdir=$((required_space_in_tmpdir + ${base_storage_space})) + fi + if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then + export EASYBUILD_SOURCEPATH=${tmpdir}/sources + required_space_in_tmpdir=$((required_space_in_tmpdir + ${base_storage_space})) + fi + + # The install is pretty fat, you need lots of space for download/unpack/install + # (~3*${base_storage_space}*1000 Bytes), + # need to do a space check before we proceed + avail_space=$(df --output=avail "${EESSI_SITE_SOFTWARE_PATH}"/ | tail -n 1 | awk '{print $1}') + min_disk_storage=$((3 * ${base_storage_space})) + if (( avail_space < ${min_disk_storage} )); then + fatal_error "Need at least $(echo "${min_disk_storage} / 1000000" | bc) GB disk space to install CUDA and other libraries under ${EESSI_SITE_SOFTWARE_PATH}, exiting now..." + fi + avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') + if (( avail_space < required_space_in_tmpdir )); then + error="Need at least $(echo "${required_space_in_tmpdir} / 1000000" | bc) temporary disk space under ${tmpdir}.\n" + error="${error}Set the environment variable TEMP_DIR to a location with adequate space to pass this check." + error="${error}You can alternatively set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH" + error="${error}to reduce this requirement. Exiting now..." 
+ fatal_error "${error}" + fi -# The install is pretty fat, you need lots of space for download/unpack/install -# (~3*${base_storage_space}*1000 Bytes), -# need to do a space check before we proceed -avail_space=$(df --output=avail "${EESSI_SITE_SOFTWARE_PATH}"/ | tail -n 1 | awk '{print $1}') -min_disk_storage=$((3 * ${base_storage_space})) -if (( avail_space < ${min_disk_storage} )); then - fatal_error "Need at least $(echo "${min_disk_storage} / 1000000" | bc) GB disk space to install CUDA and other libraries under ${EESSI_SITE_SOFTWARE_PATH}, exiting now..." -fi -avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') -if (( avail_space < required_space_in_tmpdir )); then - error="Need at least $(echo "${required_space_in_tmpdir} / 1000000" | bc) temporary disk space under ${tmpdir}.\n" - error="${error}Set the environment variable TEMP_DIR to a location with adequate space to pass this check." - error="${error}You can alternatively set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH" - error="${error}to reduce this requirement. Exiting now..." - fatal_error "${error}" -fi + # Brief explanation of parameters: + # - prefix: using $tmpdir as default base directory for several EB settings + # - rebuild: we need the --rebuild option, as the CUDA module may or may not be on the + # `MODULEPATH` yet. Even if it is, we still want to redo this installation + # since it will provide the symlinked targets for the parts of the CUDA + # and/or other installation in the `.../versions/...` prefix + # - installpath-modules: We install the module in our `tmpdir` since we do not need the modulefile, + # we only care about providing the targets for the symlinks. + # - ${accept_eula_opt}: We only set the --accept-eula-for=CUDA option if CUDA will be installed and if + # this script was called with the argument --accept-cuda-eula. 
+ # - hooks: We don't want hooks used in this install, we need vanilla + # installations of CUDA and/or other libraries + # - easystack: Path to easystack file that defines which packages shall be + # installed + accept_eula_opt= + if [[ ${cuda_eula_accepted} -eq 1 ]]; then + accept_eula_opt="--accept-eula-for=CUDA" + fi + if [[ ${cudnn_eula_accepted} -eq 1 ]]; then + if [[ -z ${accept_eula_opt} ]]; then + accept_eula_opt="--accept-eula-for=cuDNN" + else + accept_eula_opt="${accept_eula_opt},cuDNN" + fi + fi + touch "$tmpdir"/none.py + eb --prefix="$tmpdir" \ + --installpath-modules=${EASYBUILD_INSTALLPATH}/.modules \ + "${accept_cuda_eula_opt}" \ + --hooks="$tmpdir"/none.py \ + --easystack ${EASYSTACK_FILE} + ret=$? + if [ $ret -ne 0 ]; then + eb_last_log=$(unset EB_VERBOSE; eb --last-log) + cp -a ${eb_last_log} . + fatal_error "some installation failed, please check EasyBuild logs ${PWD}/$(basename ${eb_last_log})..." + else + echo_green "all installations at ${EESSI_SITE_SOFTWARE_PATH}/software/... succeeded!" + fi -# Brief explanation of parameters: -# - prefix: using $tmpdir as default base directory for several EB settings -# - rebuild: we need the --rebuild option, as the CUDA module may or may not be on the -# `MODULEPATH` yet. Even if it is, we still want to redo this installation -# since it will provide the symlinked targets for the parts of the CUDA -# and/or other installation in the `.../versions/...` prefix -# - installpath-modules: We install the module in our `tmpdir` since we do not need the modulefile, -# we only care about providing the targets for the symlinks. -# - ${accept_eula_opt}: We only set the --accept-eula-for=CUDA option if CUDA will be installed and if -# this script was called with the argument --accept-cuda-eula. 
-# - hooks: We don't want hooks used in this install, we need vanilla -# installations of CUDA and/or other libraries -# - easystack: Path to easystack file that defines which packages shall be -# installed -accept_eula_opt= -if [[ ${eula_accepted} -eq 1 ]]; then - accept_eula_opt="--accept-eula-for=CUDA" -fi -touch "$tmpdir"/none.py -eb --prefix="$tmpdir" \ - --rebuild \ - --installpath-modules=${EASYBUILD_INSTALLPATH}/.modules \ - "${accept_eula_opt}" \ - --hooks="$tmpdir"/none.py \ - --easystack ${EASYSTACK_FILE} -ret=$? -if [ $ret -ne 0 ]; then - eb_last_log=$(unset EB_VERBOSE; eb --last-log) - cp -a ${eb_last_log} . - fatal_error "some installation failed, please check EasyBuild logs ${PWD}/$(basename ${eb_last_log})..." -else - echo_green "all installations at ${EESSI_SITE_SOFTWARE_PATH}/software/... succeeded!" -fi -# clean up tmpdir -rm -rf "${tmpdir}" + # clean up tmpdir + rm -rf "${tmpdir}" + + # Restore MODULEPATH for next loop iteration + MODUELPATH=${SAVE_MODULEPATH} +done From 18cdaa252b827744a7629d22d3dc150234292aca Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 9 Oct 2024 18:04:22 +0200 Subject: [PATCH 26/47] not sure why this is not working, see if this solves it --- scripts/gpu_support/nvidia/install_cuda_and_libraries.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index 60f3bb30be..1092472fc2 100755 --- a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -121,7 +121,7 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*.yml; do # Check if CUDA shall be installed cuda_install_needed=0 - cat ${eb_dry_run_short_out} | grep "^ \* \[[ ]\]" | grep "module: CUDA/" + cat ${eb_dry_run_short_out} | grep "^ \* \[[ ]\]" | grep "module: CUDA/" > /dev/null ret=$? 
if [ "${ret}" -eq 0 ]; then cuda_install_needed=1 @@ -136,7 +136,7 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*.yml; do # Check if cdDNN shall be installed cudnn_install_needed=0 - cat ${eb_dry_run_short_out} | grep "^ \* \[[ ]\]" | grep "module: cuDNN/" + cat ${eb_dry_run_short_out} | grep "^ \* \[[ ]\]" | grep "module: cuDNN/" > /dev/null ret=$? if [ "${ret}" -eq 0 ]; then cudnn_install_needed=1 @@ -205,7 +205,7 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*.yml; do if [[ -z ${accept_eula_opt} ]]; then accept_eula_opt="--accept-eula-for=cuDNN" else - accept_eula_opt="${accept_eula_opt},cuDNN" + accept_eula_opt="$accept_eula_opt,cuDNN" fi fi touch "$tmpdir"/none.py From 57164242647969dfcf4bcfdf7dbe50f197602064 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 9 Oct 2024 18:15:00 +0200 Subject: [PATCH 27/47] This was the only way in which I got this to work. Otherwise, it doesn't seem like the option is correctly accepted by EasyBuild - something related to bash variable expansion --- .../nvidia/install_cuda_and_libraries.sh | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index 1092472fc2..73a4a7f6a9 100755 --- a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -199,21 +199,25 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*.yml; do # installed accept_eula_opt= if [[ ${cuda_eula_accepted} -eq 1 ]]; then - accept_eula_opt="--accept-eula-for=CUDA" + accept_eula_opt="CUDA" fi if [[ ${cudnn_eula_accepted} -eq 1 ]]; then if [[ -z ${accept_eula_opt} ]]; then - accept_eula_opt="--accept-eula-for=cuDNN" + accept_eula_opt="cuDNN" else - accept_eula_opt="$accept_eula_opt,cuDNN" + accept_eula_opt="${accept_eula_opt},cuDNN" fi fi touch "$tmpdir"/none.py - eb --prefix="$tmpdir" \ - 
--installpath-modules=${EASYBUILD_INSTALLPATH}/.modules \ - "${accept_cuda_eula_opt}" \ - --hooks="$tmpdir"/none.py \ - --easystack ${EASYSTACK_FILE} + eb_args="--prefix=$tmpdir" + eb_args="$eb_args --installpath-modules=${EASYBUILD_INSTALLPATH}/.modules" + eb_args="$eb_args --hooks="$tmpdir"/none.py" + eb_args="$eb_args --easystack ${EASYSTACK_FILE}" + if [[ ! -z ${accept_eula_opt} ]]; then + eb_args="$eb_args --accept-eula-for=$accept_eula_opt" + fi + echo "Running eb $eb_args" + eb $eb_args ret=$? if [ $ret -ne 0 ]; then eb_last_log=$(unset EB_VERBOSE; eb --last-log) From 9f3853ca08d76def4a8b818cb0212693565d9be6 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 9 Oct 2024 21:17:04 +0200 Subject: [PATCH 28/47] Added include-easyblocks-from-commit --- .../eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/gpu_support/nvidia/easystacks/eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml b/scripts/gpu_support/nvidia/easystacks/eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml index 2a6f9bd6f9..4e3fffacca 100644 --- a/scripts/gpu_support/nvidia/easystacks/eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml +++ b/scripts/gpu_support/nvidia/easystacks/eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml @@ -2,4 +2,7 @@ # for nvidia GPU support, because they cannot (fully) be shipped as part of EESSI due to license constraints easyconfigs: - CUDA-12.1.1.eb - - cuDNN-8.9.2.26-CUDA-12.1.1.eb + - cuDNN-8.9.2.26-CUDA-12.1.1.eb: + options: + # Needed for support for --accept-uela-for option + include-easyblocks-from-commit: 11afb88ec55e0ca431cbe823696aa43e2a9bfca8 From b9017d446053807152cfb84681d10c896f9c6f81 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 9 Oct 2024 21:25:43 +0200 Subject: [PATCH 29/47] Easystack no longer passed as option. 
Comment is outdated, since we now _do_ have a separate EasyStack file in scripts/gpu_support/nvidia/easystacks --- EESSI-install-software.sh | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 4c85fdfc31..7c4c1036d1 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -251,16 +251,11 @@ fi temp_install_storage=${TMPDIR}/temp_install_storage mkdir -p ${temp_install_storage} -# Note the eessi...CUDA.yml file(s) is(are) copied by 'install_scripts.sh' from -# the EESSI/software-layer easystacks/software.eessi.io/2023.06/accel/nvidia -# directory to /cvmfs to avoid keeping them in sync manually. If more than one -# such file is used (e.g., because different EasyBuild versions were used), the -# install script 'install_cuda_and_libraries.sh' has to be run multiple times. if [ -z "${skip_cuda_install}" ] || [ ! "${skip_cuda_install}" ]; then ${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh \ - -e ${EESSI_PREFIX}/scripts/gpu_support/nvidia/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml \ -t ${temp_install_storage} \ - --accept-cuda-eula + --accept-cuda-eula \ + --accept-cudnn-eula else echo "Skipping installation of CUDA SDK and cu* libraries in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found" fi From eea2879a7d5afee4d3b5f6cbdabe7b925305c56f Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 9 Oct 2024 21:37:59 +0200 Subject: [PATCH 30/47] Make sure easystack file for host_injections is shipped --- install_scripts.sh | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/install_scripts.sh b/install_scripts.sh index ad73e769dd..b6b5ac92b0 100755 --- a/install_scripts.sh +++ b/install_scripts.sh @@ -128,16 +128,12 @@ nvidia_files=( ) copy_files_by_list ${TOPDIR}/scripts/gpu_support/nvidia ${INSTALL_PREFIX}/scripts/gpu_support/nvidia "${nvidia_files[@]}" -# special treatment 
for the easystack file(s) that lists CUDA and cu* libraries -# To be picked up by a build job they have to be stored under -# easystacks/software.eessi.io/2023.06/accel/nvidia/ on GitHub. -# To avoid keeping that file and the one that we distribute via CernVM-FS so -# users/sites can install the full CUDA SDK and cu* libraries under -# 'host_injections' we copy the above file to the right location under /cvmfs. -nvidia_host_injections_files=( - eessi-2023.06-eb-4.9.4-2023a-CUDA.yml +# Easystacks to be used to install software in host injections +host_injections_easystacks=( + eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml ) -copy_files_by_list ${TOPDIR}/easystacks/software.eessi.io/2023.06/accel/nvidia ${INSTALL_PREFIX}/scripts/gpu_support/nvidia "${nvidia_host_injections_files[@]}" +copy_files_by_list ${TOPDIR}/scripts/gpu_support/nvidia/easystacks \ +${INSTALL_PREFIX}/scripts/gpu_support/nvidia/easystacks "${host_injections_easystacks[@]}" # Copy over EasyBuild hooks file used for installations hook_files=( From 33199d78a34f8ead4d834af3225608ba253ccf21 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 9 Oct 2024 22:04:23 +0200 Subject: [PATCH 31/47] Remove rebuild, change comments that were out of date --- .../gpu_support/nvidia/install_cuda_and_libraries.sh | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index 73a4a7f6a9..b3da2a13da 100755 --- a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -116,7 +116,7 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*.yml; do # to be installed echo ">> Determining if packages specified in ${EASYSTACK_FILE} are missing under ${EESSI_SITE_SOFTWARE_PATH}" eb_dry_run_short_out=${tmpdir}/eb_dry_run_short.out - eb --dry-run-short --rebuild --easystack ${EASYSTACK_FILE} 2>&1 | tee 
${eb_dry_run_short_out} + eb --dry-run-short --easystack ${EASYSTACK_FILE} 2>&1 | tee ${eb_dry_run_short_out} ret=$? # Check if CUDA shall be installed @@ -185,12 +185,8 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*.yml; do # Brief explanation of parameters: # - prefix: using $tmpdir as default base directory for several EB settings - # - rebuild: we need the --rebuild option, as the CUDA module may or may not be on the - # `MODULEPATH` yet. Even if it is, we still want to redo this installation - # since it will provide the symlinked targets for the parts of the CUDA - # and/or other installation in the `.../versions/...` prefix - # - installpath-modules: We install the module in our `tmpdir` since we do not need the modulefile, - # we only care about providing the targets for the symlinks. + # - installpath-modules: We install the module in a hidden .modules, so that next time this script + # is run, it is not reinstalled. # - ${accept_eula_opt}: We only set the --accept-eula-for=CUDA option if CUDA will be installed and if # this script was called with the argument --accept-cuda-eula. 
# - hooks: We don't want hooks used in this install, we need vanilla From 06cd2eae5f169d3f7eb1951acef2175b0d8d3719 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 9 Oct 2024 22:07:25 +0200 Subject: [PATCH 32/47] Only loop over the easystacks with CUDA in the name --- scripts/gpu_support/nvidia/install_cuda_and_libraries.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index b3da2a13da..3d273f0fbd 100755 --- a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -94,7 +94,7 @@ echo "Created temporary directory '${tmpdir}'" # use install_path/modules/all as MODULEPATH SAVE_MODULEPATH=${MODULEPATH} -for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*.yml; do +for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do echo -e "Processing easystack file ${easystack_file}...\n\n" # determine version of EasyBuild module to load based on EasyBuild version included in name of easystack file From 0e7c9d8a4f5150c6734f35d0d5ebc13ba2990150 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Tue, 15 Oct 2024 09:54:14 +0200 Subject: [PATCH 33/47] add a bit more debug output, use *SITE_SOFTWARE_PATH and minor tweaks --- EESSI-install-software.sh | 1 + scripts/gpu_support/nvidia/install_cuda_and_libraries.sh | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index f20b1b6c64..235e29477e 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -249,6 +249,7 @@ fi # Allow skipping CUDA SDK install in e.g. CI environments # The install_cuda... script uses EasyBuild. So, we need to check if we have EB # or skip this step. 
+echo "Going to install full CUDA SDK and cu* libraries under host_injections if necessary" module_avail_out=$TMPDIR/ml.out module avail 2>&1 | grep EasyBuild &> ${module_avail_out} if [[ $? -eq 0 ]]; then diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index 3d273f0fbd..5250348b40 100755 --- a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -38,6 +38,7 @@ show_help() { # Initialize variables cuda_eula_accepted=0 +cudnn_eula_accepted=0 EASYSTACK_FILE= TEMP_DIR= @@ -91,7 +92,7 @@ else fi echo "Created temporary directory '${tmpdir}'" -# use install_path/modules/all as MODULEPATH +# use EESSI_SITE_SOFTWARE_PATH/.modules/all as MODULEPATH SAVE_MODULEPATH=${MODULEPATH} for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do @@ -101,11 +102,12 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do eb_version=$(echo ${EASYSTACK_FILE} | sed 's/.*eb-\([0-9.]*\).*/\1/g') # Load EasyBuild version for this easystack file _before_ loading EESSI-extend + module avail EasyBuild module load EasyBuild/${eb_version} module load EESSI-extend/${EESSI_VERSION}-easybuild # Install modules in hidden .modules dir to keep track of what was installed before - MODULEPATH=${EASYBUILD_INSTALLPATH}/.modules/all + MODULEPATH=${EESSI_SITE_SOFTWARE_PATH}/.modules/all echo "set MODULEPATH=${MODULEPATH}" # show EasyBuild configuration @@ -145,7 +147,7 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do # Make sure the cuDNN EULA is accepted if it shall be installed if [ "${cudnn_install_needed}" -eq 1 ] && [ "${cudnn_eula_accepted}" -ne 1 ]; then show_help - error="\ncuDNN shall be installed. However, the cuDNNDA EULA has not been accepted\nYou _must_ accept the cuDNN EULA via the appropriate command line option.\n" + error="\ncuDNN shall be installed. 
However, the cuDNN EULA has not been accepted\nYou _must_ accept the cuDNN EULA via the appropriate command line option.\n" fatal_error "${error}" fi From d57f8d8caa7dd17dd3e8e82a174f586998db08b0 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Tue, 15 Oct 2024 10:33:51 +0200 Subject: [PATCH 34/47] replace TAB with WHITESPACEs --- init/eessi_environment_variables | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/init/eessi_environment_variables b/init/eessi_environment_variables index d2daf40ace..60d69cc198 100644 --- a/init/eessi_environment_variables +++ b/init/eessi_environment_variables @@ -153,10 +153,10 @@ if [ -d $EESSI_PREFIX ]; then fi # Fix wrong path for RHEL >=8 libcurl - # This is required here because we ship curl in our compat layer. If we only provided - # curl as a module file we could instead do this via a `modluafooter` in an EasyBuild - # hook (or via an Lmod hook) - rhel_libcurl_file="/etc/pki/tls/certs/ca-bundle.crt" + # This is required here because we ship curl in our compat layer. 
If we only provided + # curl as a module file we could instead do this via a `modluafooter` in an EasyBuild + # hook (or via an Lmod hook) + rhel_libcurl_file="/etc/pki/tls/certs/ca-bundle.crt" if [ -f $rhel_libcurl_file ]; then show_msg "Found libcurl CAs file at RHEL location, setting CURL_CA_BUNDLE" export CURL_CA_BUNDLE=$rhel_libcurl_file From 02d3e1eaf6c62b6811f4920c8a1eaf77afb4ae08 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Tue, 15 Oct 2024 10:42:20 +0200 Subject: [PATCH 35/47] show more msgs when building and init full environment --- EESSI-install-software.sh | 2 +- scripts/gpu_support/nvidia/install_cuda_and_libraries.sh | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 235e29477e..31c3adeccb 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -173,7 +173,7 @@ fi # Set all the EESSI environment variables (respecting $EESSI_SOFTWARE_SUBDIR_OVERRIDE) # $EESSI_SILENT - don't print any messages # $EESSI_BASIC_ENV - give a basic set of environment variables -EESSI_SILENT=1 EESSI_BASIC_ENV=1 source $TOPDIR/init/eessi_environment_variables +EESSI_SILENT=0 EESSI_BASIC_ENV=0 source $TOPDIR/init/eessi_environment_variables if [[ -z ${EESSI_SOFTWARE_SUBDIR} ]]; then fatal_error "Failed to determine software subdirectory?!" 
diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index 5250348b40..7d30c65def 100755 --- a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -79,6 +79,7 @@ check_eessi_initialised # Make sure that `EESSI-extend` will install in the site installation path EESSI_SITE_SOFTWARE_PATH export EESSI_SITE_INSTALL=1 +echo "EESSI_SITE_SOFTWARE_PATH=${EESSI_SITE_SOFTWARE_PATH}" # we need a directory we can use for temporary storage if [[ -z "${TEMP_DIR}" ]]; then From b28017ad61c7bcc049edc522506b93b05d36f73b Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Tue, 15 Oct 2024 10:59:48 +0200 Subject: [PATCH 36/47] use zero length env vars --- EESSI-install-software.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 31c3adeccb..84c96ad369 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -173,7 +173,7 @@ fi # Set all the EESSI environment variables (respecting $EESSI_SOFTWARE_SUBDIR_OVERRIDE) # $EESSI_SILENT - don't print any messages # $EESSI_BASIC_ENV - give a basic set of environment variables -EESSI_SILENT=0 EESSI_BASIC_ENV=0 source $TOPDIR/init/eessi_environment_variables +EESSI_SILENT= EESSI_BASIC_ENV= source $TOPDIR/init/eessi_environment_variables if [[ -z ${EESSI_SOFTWARE_SUBDIR} ]]; then fatal_error "Failed to determine software subdirectory?!" 
From 20aacdca6acf7013bea66a283086c63e4cc9761b Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Tue, 15 Oct 2024 11:48:20 +0200 Subject: [PATCH 37/47] fix syntax issue --- .../eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/gpu_support/nvidia/easystacks/eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml b/scripts/gpu_support/nvidia/easystacks/eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml index 4e3fffacca..677627eed3 100644 --- a/scripts/gpu_support/nvidia/easystacks/eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml +++ b/scripts/gpu_support/nvidia/easystacks/eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml @@ -3,6 +3,6 @@ easyconfigs: - CUDA-12.1.1.eb - cuDNN-8.9.2.26-CUDA-12.1.1.eb: - options: - # Needed for support for --accept-uela-for option - include-easyblocks-from-commit: 11afb88ec55e0ca431cbe823696aa43e2a9bfca8 + options: + # Needed for support for --accept-uela-for option + include-easyblocks-from-commit: 11afb88ec55e0ca431cbe823696aa43e2a9bfca8 From 88bdb888586ff96a3a2335244327f7336adab2a5 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Tue, 15 Oct 2024 12:13:25 +0200 Subject: [PATCH 38/47] tweak variable expansion in test --- init/eessi_environment_variables | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/eessi_environment_variables b/init/eessi_environment_variables index 60d69cc198..125f3e5a93 100644 --- a/init/eessi_environment_variables +++ b/init/eessi_environment_variables @@ -10,7 +10,7 @@ function error() { function show_msg { # only echo msg if EESSI_SILENT is unset msg=$1 - if [[ -z ${EESSI_SILENT+x} ]]; then + if [[ -z ${EESSI_SILENT:+x} ]]; then echo "$msg" fi } From 7b8ba8b8a1a6e3c99dce3b74ffa1f568140ccfa7 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Tue, 15 Oct 2024 13:24:12 +0200 Subject: [PATCH 39/47] dont use hooks when installing into host_injections --- 
scripts/gpu_support/nvidia/install_cuda_and_libraries.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index 7d30c65def..3590cf3a86 100755 --- a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -99,6 +99,10 @@ SAVE_MODULEPATH=${MODULEPATH} for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do echo -e "Processing easystack file ${easystack_file}...\n\n" + # We don't want hooks used in this install, we need vanilla installations + touch "${tmpdir}"/none.py + export EASYBUILD_HOOKS="${tmpdir}/none.py" + # determine version of EasyBuild module to load based on EasyBuild version included in name of easystack file eb_version=$(echo ${EASYSTACK_FILE} | sed 's/.*eb-\([0-9.]*\).*/\1/g') From a95d546371afa4892ca51841527a6cda4907402c Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Tue, 15 Oct 2024 15:09:32 +0200 Subject: [PATCH 40/47] revert variable expansion and unset certain variables instead --- EESSI-install-software.sh | 4 +++- init/eessi_environment_variables | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 84c96ad369..467a484fc7 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -173,7 +173,9 @@ fi # Set all the EESSI environment variables (respecting $EESSI_SOFTWARE_SUBDIR_OVERRIDE) # $EESSI_SILENT - don't print any messages # $EESSI_BASIC_ENV - give a basic set of environment variables -EESSI_SILENT= EESSI_BASIC_ENV= source $TOPDIR/init/eessi_environment_variables +unset EESSI_SILENT +unset EESSI_BASIC_ENV +source $TOPDIR/init/eessi_environment_variables if [[ -z ${EESSI_SOFTWARE_SUBDIR} ]]; then fatal_error "Failed to determine software subdirectory?!" 
diff --git a/init/eessi_environment_variables b/init/eessi_environment_variables index 125f3e5a93..60d69cc198 100644 --- a/init/eessi_environment_variables +++ b/init/eessi_environment_variables @@ -10,7 +10,7 @@ function error() { function show_msg { # only echo msg if EESSI_SILENT is unset msg=$1 - if [[ -z ${EESSI_SILENT:+x} ]]; then + if [[ -z ${EESSI_SILENT+x} ]]; then echo "$msg" fi } From 45fa6b12ffa9a2d7b559f1c60ce6b71f94f4cc45 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Tue, 15 Oct 2024 19:59:05 +0200 Subject: [PATCH 41/47] log if Lmod rc/SitePackage are being created --- EESSI-install-software.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 467a484fc7..f05e8ea7f9 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -161,11 +161,13 @@ _eessi_software_path=${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_ _lmod_cfg_dir=${_eessi_software_path}/.lmod _lmod_rc_file=${_lmod_cfg_dir}/lmodrc.lua if [ ! -f ${_lmod_rc_file} ]; then + echo "Lmod file '${_lmod_rc_file}' does not exist yet; creating it..." command -V python3 python3 ${TOPDIR}/create_lmodrc.py ${_eessi_software_path} fi _lmod_sitepackage_file=${_lmod_cfg_dir}/SitePackage.lua if [ ! -f ${_lmod_sitepackage_file} ]; then + echo "Lmod file '${_lmod_sitepackage_file}' does not exist yet; creating it..." 
command -V python3 python3 ${TOPDIR}/create_lmodsitepackage.py ${_eessi_software_path} fi From c3482b269b892503a83080420f0317a3f98c5525 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Tue, 15 Oct 2024 21:11:05 +0200 Subject: [PATCH 42/47] show full path to Lmod RC/SitePackage when created --- EESSI-install-software.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index f05e8ea7f9..3ebc2a5f03 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -328,20 +328,20 @@ else done fi -echo ">> Creating/updating Lmod RC file..." export LMOD_CONFIG_DIR="${EASYBUILD_INSTALLPATH}/.lmod" lmod_rc_file="$LMOD_CONFIG_DIR/lmodrc.lua" lmodrc_changed=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^create_lmodrc.py$' > /dev/null; echo $?) if [ ! -f $lmod_rc_file ] || [ ${lmodrc_changed} == '0' ]; then + echo ">> Creating/updating Lmod RC file (${lmod_rc_file})..." python3 $TOPDIR/create_lmodrc.py ${EASYBUILD_INSTALLPATH} check_exit_code $? "$lmod_rc_file created" "Failed to create $lmod_rc_file" fi -echo ">> Creating/updating Lmod SitePackage.lua ..." export LMOD_PACKAGE_PATH="${EASYBUILD_INSTALLPATH}/.lmod" lmod_sitepackage_file="$LMOD_PACKAGE_PATH/SitePackage.lua" sitepackage_changed=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^create_lmodsitepackage.py$' > /dev/null; echo $?) if [ ! -f "$lmod_sitepackage_file" ] || [ "${sitepackage_changed}" == '0' ]; then + echo ">> Creating/updating Lmod SitePackage.lua (${lmod_sitepackage_file})..." python3 $TOPDIR/create_lmodsitepackage.py ${EASYBUILD_INSTALLPATH} check_exit_code $? 
"$lmod_sitepackage_file created" "Failed to create $lmod_sitepackage_file" fi From db90ca70d49da84a2283c1c805181835d055f00b Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Tue, 15 Oct 2024 22:26:25 +0200 Subject: [PATCH 43/47] adjust path to lua files if building accelerator software --- EESSI-install-software.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 3ebc2a5f03..7c51727bef 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -330,6 +330,11 @@ fi export LMOD_CONFIG_DIR="${EASYBUILD_INSTALLPATH}/.lmod" lmod_rc_file="$LMOD_CONFIG_DIR/lmodrc.lua" +if [[ ! -z ${EESSI_ACCELERATOR_TARGET} ]]; then + # EESSI_ACCELERATOR_TARGET is set, so let's remove the accelerator path from $lmod_rc_file + lmod_rc_file=$(echo ${lmod_rc_file} | sed "s@/accel/${EESSI_ACCELERATOR_TARGET}@@") + echo "Path to lmodrc.lua changed to '${lmod_rc_file}'" +fi lmodrc_changed=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^create_lmodrc.py$' > /dev/null; echo $?) if [ ! -f $lmod_rc_file ] || [ ${lmodrc_changed} == '0' ]; then echo ">> Creating/updating Lmod RC file (${lmod_rc_file})..." @@ -339,6 +344,11 @@ fi export LMOD_PACKAGE_PATH="${EASYBUILD_INSTALLPATH}/.lmod" lmod_sitepackage_file="$LMOD_PACKAGE_PATH/SitePackage.lua" +if [[ ! -z ${EESSI_ACCELERATOR_TARGET} ]]; then + # EESSI_ACCELERATOR_TARGET is set, so let's remove the accelerator path from $lmod_sitepackage_file + lmod_sitepackage_file=$(echo ${lmod_sitepackage_file} | sed "s@/accel/${EESSI_ACCELERATOR_TARGET}@@") + echo "Path to SitePackage.lua changed to '${lmod_sitepackage_file}'" +fi sitepackage_changed=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^create_lmodsitepackage.py$' > /dev/null; echo $?) if [ ! -f "$lmod_sitepackage_file" ] || [ "${sitepackage_changed}" == '0' ]; then echo ">> Creating/updating Lmod SitePackage.lua (${lmod_sitepackage_file})..." 
From 6a0223c944d8dd3cbf8943fbc7af9bc41ff45bcf Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 16 Oct 2024 10:42:14 +0200 Subject: [PATCH 44/47] small typo fixed --- scripts/gpu_support/nvidia/install_cuda_and_libraries.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index 3590cf3a86..0662dac761 100755 --- a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -234,5 +234,5 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do rm -rf "${tmpdir}" # Restore MODULEPATH for next loop iteration - MODUELPATH=${SAVE_MODULEPATH} + MODULEPATH=${SAVE_MODULEPATH} done From affe37ba5617909d0ad415cdcf3ce6f899150644 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Wed, 16 Oct 2024 11:45:07 +0200 Subject: [PATCH 45/47] add comment to clarify setting of MODULEPATH Co-authored-by: ocaisa --- scripts/gpu_support/nvidia/install_cuda_and_libraries.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index 0662dac761..f9d889c1a1 100755 --- a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -112,6 +112,8 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do module load EESSI-extend/${EESSI_VERSION}-easybuild # Install modules in hidden .modules dir to keep track of what was installed before + # (this action is temporary, and we do not call Lmod again within the current shell context, but in EasyBuild + # subshells, so loaded modules are not automatically unloaded) MODULEPATH=${EESSI_SITE_SOFTWARE_PATH}/.modules/all echo "set MODULEPATH=${MODULEPATH}" From d2d95e9a9b7a8e47cc574de8d4d41dfad338f6e2 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Wed, 16 Oct 2024 11:52:57 +0200 Subject: [PATCH 46/47] clarify need for option Co-authored-by: Kenneth Hoste --- .../eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/gpu_support/nvidia/easystacks/eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml b/scripts/gpu_support/nvidia/easystacks/eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml index 677627eed3..83e68077a2 100644 --- a/scripts/gpu_support/nvidia/easystacks/eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml +++ b/scripts/gpu_support/nvidia/easystacks/eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml @@ -4,5 +4,6 @@ easyconfigs: - CUDA-12.1.1.eb - cuDNN-8.9.2.26-CUDA-12.1.1.eb: options: - # Needed for support for --accept-uela-for option + # needed to enforce acceptance of EULA in cuDNN easyblock, + # see https://github.com/easybuilders/easybuild-easyblocks/pull/3473 include-easyblocks-from-commit: 11afb88ec55e0ca431cbe823696aa43e2a9bfca8 From 77f3bc9cdaf6ab836616a7724d904e830a2e2d5e Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 16 Oct 2024 12:03:51 +0200 Subject: [PATCH 47/47] revert to silent sourcing, keep initialising full environment and clarify in comments --- EESSI-install-software.sh | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 7c51727bef..c2900b9a30 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -173,11 +173,9 @@ if [ ! 
-f ${_lmod_sitepackage_file} ]; then fi # Set all the EESSI environment variables (respecting $EESSI_SOFTWARE_SUBDIR_OVERRIDE) -# $EESSI_SILENT - don't print any messages -# $EESSI_BASIC_ENV - give a basic set of environment variables -unset EESSI_SILENT -unset EESSI_BASIC_ENV -source $TOPDIR/init/eessi_environment_variables +# $EESSI_SILENT - don't print any messages if set (use 'unset EESSI_SILENT' to let script show messages) +# $EESSI_BASIC_ENV - give a basic set of environment variables if set (use 'EESSI_BASIC_ENV=' to let script initialise a full environment) +EESSI_SILENT=1 EESSI_BASIC_ENV= source $TOPDIR/init/eessi_environment_variables if [[ -z ${EESSI_SOFTWARE_SUBDIR} ]]; then fatal_error "Failed to determine software subdirectory?!"