Skip to content

Commit

Permalink
Merge branch '2023.06-software.eessi.io' into feature/dev.eessi.io_TEST
Browse files Browse the repository at this point in the history
  • Loading branch information
boegel committed Oct 11, 2024
2 parents 247a23d + 80ce564 commit 845865a
Show file tree
Hide file tree
Showing 41 changed files with 514 additions and 69 deletions.
36 changes: 36 additions & 0 deletions .github/workflows/test-software.eessi.io.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@ on:
workflow_dispatch:
permissions:
contents: read # to fetch code (actions/checkout)
env:
EESSI_ACCELERATOR_TARGETS: |
x86_64/amd/zen2:
- nvidia/cc80
x86_64/amd/zen3:
- nvidia/cc80
jobs:
check_missing:
runs-on: ubuntu-latest
Expand All @@ -21,6 +27,7 @@ jobs:
- aarch64/neoverse_v1
- x86_64/amd/zen2
- x86_64/amd/zen3
- x86_64/amd/zen4
- x86_64/intel/haswell
- x86_64/intel/skylake_avx512
- x86_64/generic
Expand Down Expand Up @@ -48,14 +55,43 @@ jobs:
export EESSI_PREFIX=/cvmfs/software.eessi.io/versions/${{matrix.EESSI_VERSION}}
export EESSI_OS_TYPE=linux
env | grep ^EESSI | sort
# first check the CPU-only builds for this CPU target
echo "just run check_missing_installations.sh (should use easystacks/software.eessi.io/${{matrix.EESSI_VERSION}}/eessi-${{matrix.EESSI_VERSION}}-*.yml)"
for easystack_file in $(ls easystacks/software.eessi.io/${{matrix.EESSI_VERSION}}/eessi-${{matrix.EESSI_VERSION}}-eb-*.yml); do
if [ ${{matrix.EESSI_SOFTWARE_SUBDIR_OVERRIDE}} = "x86_64/amd/zen4" ]; then
if grep -q 2022b <<<"${easystack_file}"; then
# skip the check of installed software on zen4 for foss/2022b builds
continue
elif grep -q CUDA <<<"${easystack_file}"; then
# skip the check of installed CUDA software in the CPU path for zen4
continue
fi
fi
echo "check missing installations for ${easystack_file}..."
./check_missing_installations.sh ${easystack_file}
ec=$?
if [[ ${ec} -ne 0 ]]; then echo "missing installations found for ${easystack_file}!" >&2; exit ${ec}; fi
done
# now check the accelerator builds for this CPU target
accelerators=$(echo "${EESSI_ACCELERATOR_TARGETS}" | yq ".${EESSI_SOFTWARE_SUBDIR_OVERRIDE}[]")
if [ -z ${accelerators} ]; then
echo "no accelerator targets defined for ${EESSI_SOFTWARE_SUBDIR_OVERRIDE}"
else
for accel in ${accelerators}; do
module use ${EESSI_SOFTWARE_PATH}/accel/${accel}/modules/all
echo "checking missing installations for accelerator ${accel} using modulepath: ${MODULEPATH}"
for easystack_file in $(ls easystacks/software.eessi.io/${{matrix.EESSI_VERSION}}/accel/$(dirname ${accel})/eessi-${{matrix.EESSI_VERSION}}-eb-*.yml); do
echo "check missing installations for ${easystack_file}..."
./check_missing_installations.sh ${easystack_file}
ec=$?
if [[ ${ec} -ne 0 ]]; then echo "missing installations found for ${easystack_file}!" >&2; exit ${ec}; fi
done
module unuse ${EESSI_SOFTWARE_PATH}/accel/${accel}/modules/all
done
fi
- name: Test check_missing_installations.sh with missing package (GCC/8.3.0)
run: |
export EESSI_SOFTWARE_SUBDIR_OVERRIDE=${{matrix.EESSI_SOFTWARE_SUBDIR_OVERRIDE}}
Expand Down
124 changes: 124 additions & 0 deletions .github/workflows/tests_archdetect_nvidia_gpu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# documentation: https://help.github.com/en/articles/workflow-syntax-for-github-actions
#
# Runs init/eessi_archdetect.sh and the EESSI init script (init/bash) against a set of
# fake nvidia-smi scripts (tests/archdetect/nvidia-smi/*.sh), and checks that the
# detected accelerator path and the resulting init output are as expected.
name: Tests for accelerator detection (NVIDIA GPU)
on:
  push:
  pull_request:
permissions:
  contents: read  # to fetch code (actions/checkout)
jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        fake_nvidia_smi_script:
          - none  # no nvidia-smi command
          - no_devices  # nvidia-smi command works, but no GPUs available
          - 1xa100  # cc80, supported with (at least) zen2 CPU
          - 2xa100  # cc80, supported with (at least) zen2 CPU
          - 4xa100  # cc80, supported with (at least) zen2 CPU
          - cc01  # non-existing GPU
      fail-fast: false
    steps:
      - name: checkout
        uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938  # v4.2.0

      # we deliberately do not use the eessi/github-action-eessi action,
      # because we want to control when the EESSI environment is initialized
      - name: Mount EESSI CernVM-FS repository
        uses: cvmfs-contrib/github-action-cvmfs@55899ca74cf78ab874bdf47f5a804e47c198743c  # v4.0
        with:
          cvmfs_config_package: https://github.com/EESSI/filesystem-layer/releases/download/latest/cvmfs-config-eessi_latest_all.deb
          cvmfs_http_proxy: DIRECT
          cvmfs_repositories: software.eessi.io

      - name: test accelerator detection
        run: |
          export EESSI_SOFTWARE_SUBDIR_OVERRIDE='x86_64/amd/zen2'
          variant="${{ matrix.fake_nvidia_smi_script }}"
          testdir="${PWD}/tests/archdetect/nvidia-smi"
          accel_modulepath_pattern="Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/.*/accel/.*/modules/all to \$MODULEPATH"

          # print the given message and abort the step with a non-zero exit code;
          # exiting explicitly (rather than from a subshell) ensures a failed check
          # always fails the job
          fail() {
              echo "$1"
              exit 1
          }

          # abort the step if the given pattern occurs anywhere in the init output
          expect_no_match() {
              local pattern="$1"
              echo ">>> checking for lack of pattern '${pattern}' in init output..."
              if grep "${pattern}" init.out; then
                  fail "unexpected match found for '${pattern}' in init output"
              fi
          }

          # put fake nvidia-smi command in place (unless we don't want to)
          if [[ "${variant}" != "none" ]]; then
              tmpdir=$(mktemp -d)
              ln -s "${testdir}/${variant}.sh" "${tmpdir}/nvidia-smi"
              export PATH="${tmpdir}:${PATH}"
          fi

          # first run with debugging enabled, just to show the output
          ./init/eessi_archdetect.sh -d accelpath || echo "non-zero exit code: $?"

          # verify output (or exit code if non-zero) against the expected output file
          out=$(./init/eessi_archdetect.sh accelpath || echo "non-zero exit code: $?")
          if [[ "${out}" != "$(cat "${testdir}/${variant}.output")" ]]; then
              echo "Test for '${variant}' FAILED: '${out}'" >&2
              exit 1
          fi
          echo "Test for '${variant}' PASSED: '${out}'"

          # run full EESSI init script, which picks up on the accelerator (if available);
          # sourcing happens in a pipeline (subshell), we only care about the output here
          echo
          . init/bash 2>&1 | tee init.out
          echo "-----------------------------------------------------------------------------"

          case "${variant}" in
              none|no_devices)
                  # no accelerator should be detected, and no accel module path should be added
                  pattern="archdetect could not detect any accelerators"
                  echo ">>> checking for pattern '${pattern}' in init output..."
                  grep "${pattern}" init.out || fail "FAILED 1"
                  expect_no_match "archdetect found supported accelerator"
                  expect_no_match "${accel_modulepath_pattern}"
                  ;;
              cc01)
                  # accelerator is detected, but there is no matching path for it
                  pattern="No matching path found in x86_64/amd/zen2 for accelerator detected by archdetect (accel/nvidia/cc01)"
                  echo ">>> checking for pattern '${pattern}' in init output..."
                  grep "${pattern}" init.out || fail "FAILED 1"
                  expect_no_match "${accel_modulepath_pattern}"
                  ;;
              *)
                  echo ">>> checking for 'accel/nvidia/cc80' in init output..."
                  grep "archdetect found supported accelerator for CPU target x86_64/amd/zen2: accel/nvidia/cc80" init.out || fail "FAILED 2"
                  grep "Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen2/accel/nvidia/cc80/modules/all to \$MODULEPATH" init.out || fail "FAILED 3"
                  ;;
          esac

          echo ">>> checking last line of init output..."
          tail -1 init.out | grep "Environment set up to use EESSI (2023.06), have fun!" || {
              echo "FAILED, full init output:"
              cat init.out
              exit 1
          }
          echo "All checks on init output PASSED"

      - name: test accelerator detection under $EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE + $EESSI_ACCELERATOR_TARGET_OVERRIDE
        run: |
          export EESSI_SOFTWARE_SUBDIR_OVERRIDE='x86_64/amd/zen2'
          export EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE='x86_64/amd/zen3'
          export EESSI_ACCELERATOR_TARGET_OVERRIDE='accel/nvidia/cc80'

          # print the given message and abort the step with a non-zero exit code
          fail() {
              echo "$1"
              exit 1
          }

          # run with debugging enabled, just to show the output
          ./init/eessi_archdetect.sh -d accelpath || echo "non-zero exit code: $?"

          # run full EESSI init script and verify that both overrides are honoured:
          # CPU software from zen2, accelerator software from zen3
          echo
          . init/bash 2>&1 | tee init.out
          echo "-----------------------------------------------------------------------------"
          echo ">>> checking for 'accel/nvidia/cc80' in init output..."
          grep "archdetect found supported accelerator for CPU target x86_64/amd/zen3: accel/nvidia/cc80" init.out || fail "FAILED 1"
          grep "Using x86_64/amd/zen2 as software subdirectory" init.out || fail "FAILED 2"
          grep "Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen2/modules/all to \$MODULEPATH" init.out || fail "FAILED 3"
          grep "Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen3/accel/nvidia/cc80/modules/all to \$MODULEPATH" init.out || fail "FAILED 4"
          echo "All checks on init output PASSED"
3 changes: 2 additions & 1 deletion .github/workflows/tests_eessi_module.yml
Original file line number Diff line number Diff line change
Expand Up @@ -72,14 +72,15 @@ jobs:
module load EESSI/${{matrix.EESSI_VERSION}}
env | grep -E '^(EESSI_S|EESSI_C)' | sort > "${moduleoutfile}"
module unload EESSI/${{matrix.EESSI_VERSION}}
source /cvmfs/software.eessi.io/versions/${{matrix.EESSI_VERSION}}/init/bash
source ./init/bash
env | grep -E '^(EESSI_S|EESSI_C)' | sort > "${sourceoutfile}"
cat "${moduleoutfile}"
cat "${sourceoutfile}"
if (diff "${moduleoutfile}" "${sourceoutfile}" > /dev/null); then
echo "Test for checking env variables PASSED"
else
echo "Test for checking env variables FAILED" >&2
diff "${moduleoutfile}" "${sourceoutfile}"
exit 1
fi
4 changes: 2 additions & 2 deletions EESSI-extend-2023.06-easybuild.eb
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ elseif (os.getenv("EESSI_SITE_INSTALL") ~= nil) then
if ((os.getenv("EESSI_PROJECT_INSTALL") ~= nil) or (os.getenv("EESSI_USER_INSTALL") ~= nil)) then
LmodError("You cannot use EESSI_SITE_INSTALL in combination with any other EESSI_*_INSTALL environment variables")
end
easybuild_installpath = string.gsub(os.getenv("EESSI_SOFTWARE_PATH"), 'versions', 'host_injections')
easybuild_installpath = os.getenv("EESSI_SITE_SOFTWARE_PATH")
else
-- Deal with user and project installs
project_install = os.getenv("EESSI_PROJECT_INSTALL")
Expand Down Expand Up @@ -166,7 +166,7 @@ elseif (project_modulepath ~= nil) then
end
-- Make sure EasyBuild itself is loaded
if not ( isloaded("EasyBuild") ) then
load("EasyBuild")
load(latest("EasyBuild"))
end
"""

Expand Down
37 changes: 35 additions & 2 deletions bot/check-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ else
[[ ${VERBOSE} -ne 0 ]] && echo " Slurm output file '"${job_out}"' NOT found"
fi


# ReFrame prints e.g.
#[----------] start processing checks
#[ RUN ] GROMACS_EESSI %benchmark_info=HECBioSim/Crambin %nb_impl=cpu %scale=2_nodes %module_name=GROMACS/2021.3-foss-2021a /d597cff4 @snellius:rome+default
Expand Down Expand Up @@ -76,8 +75,42 @@ fi
if [[ ! -z ${grep_reframe_failed} ]]; then
grep_reframe_result=${grep_reframe_failed}
else
grep_reframe_result=${grep_reframe_success}
# Grep the entire output of ReFrame, so that we can report it in the foldable section of the test report
GP_success_full='(?s)\[----------\] start processing checks.*?\[==========\] Finished on [a-zA-Z0-9 ]*'
# Grab the full ReFrame report, then cut the irrelevant parts
# Note that the character limit for messages in github is around 65k, so cutting is important
grep_reframe_success_full=$( \
grep -v "^>> searching for " ${job_dir}/${job_out} | \
# Use -z so the input is not split on newlines, which lets the (?s) pattern match across multiple lines
grep -Pzo "${GP_success_full}" | \
# Replace null character with newline, to undo the -z option
sed 's/\x00/\n/g' | \
# Remove the [ RUN ] lines from reframe, they are not very informative
grep -v -P '\[\s*RUN\s*]' | \
# Remove the line '[----------] all spawned checks have finished'
grep -v '\[-*\]' | \
# Remove the line '[==========] Finished on Mon Oct 7 21'
grep -v '\[=*\]' | \
# Remove blank line(s) from the report
grep -v '^$' | \
# Remove warnings about the local spawner not supporting memory requests
grep -v 'WARNING\: hooks\.req_memory_per_node does not support the scheduler you configured .local.*$' | \
# Strip color coding characters
sed 's/\x1B\[[0-9;]*m//g' | \
# Replace all newline characters with <br/>
sed ':a;N;$!ba;s/\n/<br\/>/g' | \
# Replace % with %%. Use \%\% to interpret both %% as (non-special) characters
sed 's/\%/\%\%/g' \
)
# TODO (optional): we could impose a character limit here, and truncate if too long
# (though we should do that before inserting the <br/> statements).
# If we do, we should probably re-append the final summary, e.g.
# [ PASSED ] Ran 10/10 test case(s) from 10 check(s) (0 failure(s), 0 skipped, 0 aborted)
# so that that is always displayed
# However, that's not implemented yet - let's see if this ever even becomes an issue
grep_reframe_result=${grep_reframe_success_full}
fi
echo "grep_reframe_result: ${grep_reframe_result}"

echo "[TEST]" > ${job_test_result_file}
if [[ ${SLURM_OUTPUT_FOUND} -eq 0 ]]; then
Expand Down
6 changes: 6 additions & 0 deletions create_lmodrc.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,12 @@ def error(msg):
error("Prefix directory %s does not exist!" % prefix)

lmodrc_path = os.path.join(prefix, DOT_LMOD, 'lmodrc.lua')
# Lmod itself doesn't care about the accelerator subdir so remove this duplication from
# the target path (if it exists)
accel_subdir = os.getenv("EESSI_ACCELERATOR_TARGET")
if accel_subdir:
lmodrc_path = lmodrc_path.replace("/accel/%s" % accel_subdir, '')

lmodrc_txt = TEMPLATE_LMOD_RC % {
'dot_lmod': DOT_LMOD,
'prefix': prefix,
Expand Down
17 changes: 12 additions & 5 deletions create_lmodsitepackage.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

DOT_LMOD = '.lmod'

hook_txt ="""require("strict")
hook_txt = """require("strict")
local hook = require("Hook")
local open = io.open
Expand Down Expand Up @@ -36,7 +36,7 @@
-- eessi_prefix_host_injections is the prefix with site-extensions (i.e. additional modules)
-- to the official EESSI modules, e.g. /cvmfs/software.eessi.io/host_injections/2023.06
local eessi_prefix_host_injections = string.gsub(eessi_prefix, 'versions', 'host_injections')
-- Check if the full modulepath starts with the eessi_prefix_*
return string.find(t.fn, "^" .. eessi_prefix) ~= nil or string.find(t.fn, "^" .. eessi_prefix_host_injections) ~= nil
end
Expand Down Expand Up @@ -103,15 +103,15 @@
if isFile(archSitePackage) then
dofile(archSitePackage)
end
end
local function eessi_cuda_enabled_load_hook(t)
local frameStk = require("FrameStk"):singleton()
local mt = frameStk:mt()
local simpleName = string.match(t.modFullName, "(.-)/")
-- If we try to load CUDA itself, check if the full CUDA SDK was installed on the host in host_injections.
-- If we try to load CUDA itself, check if the full CUDA SDK was installed on the host in host_injections.
-- This is required for end users to build additional CUDA software. If the full SDK isn't present, refuse
-- to load the CUDA module and print an informative message on how to set up GPU support for EESSI
local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/gpu/.\\n"
Expand Down Expand Up @@ -207,6 +207,7 @@
load_site_specific_hooks()
"""


def error(msg):
sys.stderr.write("ERROR: %s\n" % msg)
sys.exit(1)
Expand All @@ -221,12 +222,18 @@ def error(msg):
error("Prefix directory %s does not exist!" % prefix)

sitepackage_path = os.path.join(prefix, DOT_LMOD, 'SitePackage.lua')

# Lmod itself doesn't care about compute capability so remove this duplication from
# the install path (if it exists)
accel_subdir = os.getenv("EESSI_ACCELERATOR_TARGET")
if accel_subdir:
sitepackage_path = sitepackage_path.replace("/accel/%s" % accel_subdir, '')
try:
os.makedirs(os.path.dirname(sitepackage_path), exist_ok=True)
with open(sitepackage_path, 'w') as fp:
fp.write(hook_txt)
# Make sure that the created Lmod file has "read/write" for the user/group and "read" permissions for others
os.chmod(sitepackage_path, S_IREAD|S_IWRITE|S_IRGRP|S_IWGRP|S_IROTH)
os.chmod(sitepackage_path, S_IREAD | S_IWRITE | S_IRGRP | S_IWGRP | S_IROTH)

except (IOError, OSError) as err:
error("Failed to create %s: %s" % (sitepackage_path, err))
Expand Down
16 changes: 10 additions & 6 deletions create_tarball.sh
Original file line number Diff line number Diff line change
Expand Up @@ -89,13 +89,17 @@ for subdir in ${cpu_arch_subdir} ${cpu_arch_subdir}/accel/${accel_subdir}; do
done

# add a bit of debug output
echo "wrote file list to ${files_list}"
[ -r ${files_list} ] && cat ${files_list}
echo "wrote module file list to ${module_files_list}"
[ -r ${module_files_list} ] && cat ${module_files_list}
if [ -r ${files_list} ]; then
echo "wrote file list to ${files_list}"
cat ${files_list}
fi
if [ -r ${module_files_list} ]; then
echo "wrote module file list to ${module_files_list}"
cat ${module_files_list}

# Copy the module files list to current working dir for later use in the test step
cp ${module_files_list} ${current_workdir}/module_files.list.txt
# Copy the module files list to current working dir for later use in the test step
cp ${module_files_list} ${current_workdir}/module_files.list.txt
fi

topdir=${cvmfs_repo}/versions/

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
easyconfigs:
  - SciPy-bundle-2023.11-gfbf-2023b.eb
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
easyconfigs:
  - LAMMPS-2Aug2023_update2-foss-2023a-kokkos-CUDA-12.1.1.eb
  - ESPResSo-4.2.2-foss-2023a-CUDA-12.1.1.eb:
      # per-easyconfig options must be nested under the easyconfig entry
      options:
        # see https://github.com/easybuilders/easybuild-easyconfigs/pull/21440
        from-commit: 5525968921d7b5eae54f7d16391201e17ffae13c
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
easyconfigs:
  - CUDA-Samples-12.1-GCC-12.3.0-CUDA-12.1.1.eb:
      # use easyconfig that only installs a subset of the CUDA samples,
      # to circumvent problem with nvcc linking to glibc of host OS,
      # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19189;
      # and where additional samples are excluded because they fail to build on aarch64,
      # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19451
      options:
        from-pr: 19451
Loading

0 comments on commit 845865a

Please sign in to comment.