diff --git a/.github/workflows/run.yml b/.github/workflows/run.yml index dcef0ca4d..00739a882 100644 --- a/.github/workflows/run.yml +++ b/.github/workflows/run.yml @@ -90,3 +90,32 @@ jobs: --unsigned \ --update-index ci-buildcache \ $(spack find --format '/{hash}') + allocationmodifier: + runs-on: ubuntu-latest + steps: + - name: Checkout Benchpark + uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 + + - name: Add needed Python libs + run: | + pip install -r ./requirements.txt + + - name: Dry run amg2023/cuda on Sierra + run: | + ./bin/benchpark setup amg2023/cuda LLNL-Sierra-IBM-power9-V100-Infiniband workspace/ + . workspace/setup.sh + ramble \ + --workspace-dir workspace/amg2023/cuda/LLNL-Sierra-IBM-power9-V100-Infiniband/workspace \ + --disable-progress-bar \ + --disable-logger \ + workspace setup --dry-run + + - name: Dry run amg2023/cuda on Pascal + run: | + ./bin/benchpark setup amg2023/cuda LLNL-Pascal-Penguin-broadwell-P100-OmniPath workspace/ + . workspace/setup.sh + ramble \ + --workspace-dir workspace/amg2023/cuda/LLNL-Pascal-Penguin-broadwell-P100-OmniPath/workspace \ + --disable-progress-bar \ + --disable-logger \ + workspace setup --dry-run diff --git a/bin/benchpark b/bin/benchpark index e2466159d..b2c0be292 100755 --- a/bin/benchpark +++ b/bin/benchpark @@ -411,6 +411,10 @@ def benchpark_setup_handler(args): ramble_spack_experiment_configs_dir, include_fn, ) + os.symlink( + source_dir / "experiments" / "universal-resources" / "execute_experiment.tpl", + ramble_configs_dir / "execute_experiment.tpl", + ) spack_location = experiments_root / "spack" ramble_location = experiments_root / "ramble" diff --git a/configs/CSC-LUMI-HPECray-zen3-MI250X-Slingshot/variables.yaml b/configs/CSC-LUMI-HPECray-zen3-MI250X-Slingshot/variables.yaml index 6c86bdd0e..da7333c04 100644 --- a/configs/CSC-LUMI-HPECray-zen3-MI250X-Slingshot/variables.yaml +++ b/configs/CSC-LUMI-HPECray-zen3-MI250X-Slingshot/variables.yaml @@ -6,12 +6,15 @@ variables: gtl_flag: '' # to be overwritten by tests that need GTL rocm_arch: 'gfx90a' - batch_time: '02:00' - mpi_command: 'srun -N {n_nodes} -n {n_ranks}' - batch_submit: 'sbatch {execute_experiment}' - batch_nodes: '#SBATCH -N {n_nodes}' - batch_ranks: '#SBATCH -n {n_ranks}' - batch_timeout: '#SBATCH -t {batch_time}:00' - cpu_partition: '#SBATCH -p small' - gpu_partition: '#SBATCH -p small-g' + timeout: '120' + scheduler: "slurm" + # This describes the LUMI-G partition: https://docs.lumi-supercomputer.eu/hardware/lumig/ + sys_cores_per_node: "64" + sys_gpus_per_node: "8" + sys_mem_per_node: "512" + max_request: "1000" # n_ranks/n_nodes cannot exceed this + n_ranks: '1000001' # placeholder value + n_nodes: '1000001' # placeholder value + batch_submit: "placeholder" + mpi_command: "placeholder" diff --git a/configs/CSCS-Daint-HPECray-haswell-P100-Infiniband/variables.yaml b/configs/CSCS-Daint-HPECray-haswell-P100-Infiniband/variables.yaml index 682ab5315..5ce00dcbe 100644 --- a/configs/CSCS-Daint-HPECray-haswell-P100-Infiniband/variables.yaml +++ b/configs/CSCS-Daint-HPECray-haswell-P100-Infiniband/variables.yaml @@ -4,12 +4,17 @@ # SPDX-License-Identifier: Apache-2.0 variables: - batch_time: '02:00' - mpi_command: 'srun -N {n_nodes} -n {n_ranks}' - batch_submit: 'sbatch {execute_experiment}' - batch_nodes: '#SBATCH -N {n_nodes}' - batch_ranks: '#SBATCH -n {n_ranks}' - batch_timeout: '#SBATCH -t {batch_time}:00' default_cuda_version: '11.2.0' cuda_arch: '60' enable_mps: '/usr/tcetmp/bin/enable_mps' + timeout: '120' + scheduler: "slurm" + # This 
describes the XC50 compute nodes: https://www.cscs.ch/computers/piz-daint + sys_cores_per_node: "12" + sys_gpus_per_node: "1" + sys_mem_per_node: "64" + max_request: "1000" # n_ranks/n_nodes cannot exceed this + n_ranks: '1000001' # placeholder value + n_nodes: '1000001' # placeholder value + batch_submit: "placeholder" + mpi_command: "placeholder" diff --git a/configs/CSCS-Eiger-HPECray-zen2-Slingshot/variables.yaml b/configs/CSCS-Eiger-HPECray-zen2-Slingshot/variables.yaml index 4e258c3bb..acee05641 100644 --- a/configs/CSCS-Eiger-HPECray-zen2-Slingshot/variables.yaml +++ b/configs/CSCS-Eiger-HPECray-zen2-Slingshot/variables.yaml @@ -4,9 +4,13 @@ # SPDX-License-Identifier: Apache-2.0 variables: - batch_time: '00:30' - mpi_command: 'srun -N {n_nodes} -n {n_ranks}' - batch_submit: 'sbatch {execute_experiment}' - batch_nodes: '#SBATCH -N {n_nodes}' - batch_ranks: '#SBATCH -n {n_ranks}' - batch_timeout: '#SBATCH -t {batch_time}:00' + timeout: '30' + scheduler: "slurm" + sys_cores_per_node: "128" + # sys_gpus_per_node unset + # sys_mem_per_node unset + max_request: "1000" # n_ranks/n_nodes cannot exceed this + n_ranks: '1000001' # placeholder value + n_nodes: '1000001' # placeholder value + batch_submit: "placeholder" + mpi_command: "placeholder" diff --git a/configs/LLNL-Magma-Penguin-icelake-OmniPath/variables.yaml b/configs/LLNL-Magma-Penguin-icelake-OmniPath/variables.yaml index e4674cde0..46ca2504b 100644 --- a/configs/LLNL-Magma-Penguin-icelake-OmniPath/variables.yaml +++ b/configs/LLNL-Magma-Penguin-icelake-OmniPath/variables.yaml @@ -4,9 +4,11 @@ # SPDX-License-Identifier: Apache-2.0 variables: - batch_time: '02:00' - mpi_command: 'srun -N {n_nodes} -n {n_ranks}' - batch_submit: 'sbatch {execute_experiment}' - batch_nodes: '#SBATCH -N {n_nodes}' - batch_ranks: '#SBATCH -n {n_ranks}' - batch_timeout: '#SBATCH -t {batch_time}:00' + timeout: "120" + scheduler: "slurm" + sys_cores_per_node: "96" + max_request: "1000" # n_ranks/n_nodes cannot exceed this + n_ranks: '1000001' # placeholder value + n_nodes: '1000001' # placeholder value + batch_submit: "placeholder" + mpi_command: "placeholder" diff --git a/configs/LLNL-Pascal-Penguin-broadwell-P100-OmniPath/variables.yaml b/configs/LLNL-Pascal-Penguin-broadwell-P100-OmniPath/variables.yaml index 21097fef2..fa6dccf02 100644 --- a/configs/LLNL-Pascal-Penguin-broadwell-P100-OmniPath/variables.yaml +++ b/configs/LLNL-Pascal-Penguin-broadwell-P100-OmniPath/variables.yaml @@ -7,9 +7,12 @@ variables: gtl_flag: '' # to be overwritten by tests that need GTL cuda_arch: '60' default_cuda_version: '11.8.0' - batch_time: '02:00' - mpi_command: 'srun -N {n_nodes} -n {n_ranks}' - batch_submit: 'sbatch {execute_experiment}' - batch_nodes: '#SBATCH -N {n_nodes}' - batch_ranks: '#SBATCH -n {n_ranks} -G {n_ranks}' - batch_timeout: '#SBATCH -t {batch_time}:00' + timeout: "120" + scheduler: "slurm" + sys_cores_per_node: "36" + sys_gpus_per_node: "2" + max_request: "1000" # n_ranks/n_nodes cannot exceed this + n_ranks: '1000001' # placeholder value + n_nodes: '1000001' # placeholder value + batch_submit: "placeholder" + mpi_command: "placeholder" diff --git a/configs/LLNL-Sierra-IBM-power9-V100-Infiniband/variables.yaml b/configs/LLNL-Sierra-IBM-power9-V100-Infiniband/variables.yaml index b85115bbc..c4c802503 100644 --- a/configs/LLNL-Sierra-IBM-power9-V100-Infiniband/variables.yaml +++ b/configs/LLNL-Sierra-IBM-power9-V100-Infiniband/variables.yaml @@ -5,11 +5,15 @@ variables: gtl_flag: '' # to be overwritten by tests that need GTL - batch_time: '02:00' - 
mpi_command: '/usr/tcetmp/bin/lrun -n {n_ranks} -T {processes_per_node} {gtl_flag}' - batch_submit: 'bsub -q pdebug {execute_experiment}' - batch_nodes: '#BSUB -nnodes {n_nodes}' - batch_ranks: '' - batch_timeout: '#BSUB -W {batch_time}' default_cuda_version: '11.8.0' cuda_arch: '70' + timeout: "120" + scheduler: "lsf" + queue: "pdebug" + sys_cores_per_node: "44" + sys_gpus_per_node: "4" + max_request: "1000" # n_ranks/n_nodes cannot exceed this + n_ranks: '1000001' # placeholder value + n_nodes: '1000001' # placeholder value + batch_submit: "placeholder" + mpi_command: "placeholder" diff --git a/configs/LLNL-Tioga-HPECray-zen3-MI250X-Slingshot/variables.yaml b/configs/LLNL-Tioga-HPECray-zen3-MI250X-Slingshot/variables.yaml index 286f4d66f..4c2c0045c 100644 --- a/configs/LLNL-Tioga-HPECray-zen3-MI250X-Slingshot/variables.yaml +++ b/configs/LLNL-Tioga-HPECray-zen3-MI250X-Slingshot/variables.yaml @@ -6,9 +6,12 @@ variables: gtl_flag: '' # to be overwritten by tests that need GTL rocm_arch: 'gfx90a' - batch_time: '120m' - mpi_command: 'flux run -N {n_nodes} -n {n_ranks}' - batch_submit: 'flux batch {execute_experiment}' - batch_nodes: '# flux: -N {n_nodes}' - batch_ranks: '# flux: -n {n_ranks}' - batch_timeout: '# flux: -t {batch_time}' + timeout: "120" + scheduler: "flux" + sys_cores_per_node: "64" + sys_gpus_per_node: "4" + max_request: "1000" # n_ranks/n_nodes cannot exceed this + n_ranks: '1000001' # placeholder value + n_nodes: '1000001' # placeholder value + batch_submit: "placeholder" + mpi_command: "placeholder" diff --git a/configs/RCCS-Fugaku-Fujitsu-A64FX-TofuD/variables.yaml b/configs/RCCS-Fugaku-Fujitsu-A64FX-TofuD/variables.yaml index fb379dfe5..304db561c 100644 --- a/configs/RCCS-Fugaku-Fujitsu-A64FX-TofuD/variables.yaml +++ b/configs/RCCS-Fugaku-Fujitsu-A64FX-TofuD/variables.yaml @@ -4,15 +4,16 @@ # SPDX-License-Identifier: Apache-2.0 variables: - batch_time: '02:00' - mpi_command: 'mpiexec' - batch_submit: 'pjsub {execute_experiment}' - batch_nodes: '#PJM -L "node={n_nodes}"' - batch_ranks: '#PJM --mpi proc={n_ranks}' - batch_timeout: '#PJM -L "elapse={batch_time}:00" -x PJM_LLIO_GFSCACHE="/vol0002:/vol0003:/vol0004:/vol0005:/vol0006"' - default_comp: 'clang@17.0.2' - #default_comp: 'fj@4.10.0' - #default_comp: 'gcc@13.2.0' - fj_comp_version: '4.10.0' - sys_arch: 'arch=linux-rhel8-a64fx' - + default_fj_version: '4.10.0' + default_llvm_version: '17.0.2' + default_gnu_version: '13.2.0' + timeout: "120" + scheduler: "pjm" + sys_cores_per_node: "48" + sys_mem_per_node: "32" + max_request: "1000" # n_ranks/n_nodes cannot exceed this + n_ranks: '1000001' # placeholder value + n_nodes: '1000001' # placeholder value + batch_submit: "placeholder" + mpi_command: "placeholder" + #sys_arch: 'arch=linux-rhel8-a64fx' \ No newline at end of file diff --git a/configs/nosite-AWS_PCluster_Hpc7a-zen4-EFA/variables.yaml b/configs/nosite-AWS_PCluster_Hpc7a-zen4-EFA/variables.yaml index fd0dbf964..d92d39c6e 100644 --- a/configs/nosite-AWS_PCluster_Hpc7a-zen4-EFA/variables.yaml +++ b/configs/nosite-AWS_PCluster_Hpc7a-zen4-EFA/variables.yaml @@ -4,9 +4,12 @@ # SPDX-License-Identifier: Apache-2.0 variables: - batch_time: '02:00' - mpi_command: 'srun -N {n_nodes} -n {n_ranks} --mpi=pmix --export=ALL,FI_EFA_USE_DEVICE_RDMA=1,FI_PROVIDER="efa",OMPI_MCA_mtl_base_verbose=100' - batch_submit: 'sbatch {execute_experiment}' - batch_nodes: '#SBATCH -N {n_nodes}' - batch_ranks: '#SBATCH -n {n_ranks}' - batch_timeout: '#SBATCH -t {batch_time}:00' + timeout: "120" + scheduler: "slurm" + sys_cores_per_node: "1" + 
# sys_gpus_per_node unset + max_request: "1000" # n_ranks/n_nodes cannot exceed this + n_ranks: '1000001' # placeholder value + n_nodes: '1000001' # placeholder value + batch_submit: "placeholder" + mpi_command: "placeholder" diff --git a/configs/nosite-HPECray-zen3-MI250X-Slingshot/variables.yaml b/configs/nosite-HPECray-zen3-MI250X-Slingshot/variables.yaml index db27b33b9..59954bdbd 100644 --- a/configs/nosite-HPECray-zen3-MI250X-Slingshot/variables.yaml +++ b/configs/nosite-HPECray-zen3-MI250X-Slingshot/variables.yaml @@ -6,9 +6,12 @@ variables: gtl_flag: '' # to be overwritten by tests that need GTL rocm_arch: 'gfx90a' - batch_time: '02:00' - mpi_command: 'srun -N {n_nodes} -n {n_ranks}' - batch_submit: 'sbatch {execute_experiment}' - batch_nodes: '#SBATCH -N {n_nodes}' - batch_ranks: '#SBATCH -n {n_ranks}' - batch_timeout: '#SBATCH -t {batch_time}:00' + timeout: "120" + scheduler: "slurm" + sys_cores_per_node: "1" + # sys_gpus_per_node unset + max_request: "1000" # n_ranks/n_nodes cannot exceed this + n_ranks: '1000001' # placeholder value + n_nodes: '1000001' # placeholder value + batch_submit: "placeholder" + mpi_command: "placeholder" diff --git a/configs/nosite-x86_64/variables.yaml b/configs/nosite-x86_64/variables.yaml index b9c418080..1e0a6cbe9 100644 --- a/configs/nosite-x86_64/variables.yaml +++ b/configs/nosite-x86_64/variables.yaml @@ -4,9 +4,11 @@ # SPDX-License-Identifier: Apache-2.0 variables: - batch_time: '' - mpi_command: 'mpirun -n {n_nodes} -c {n_ranks} --oversubscribe' - batch_submit: '{execute_experiment}' - batch_nodes: '' - batch_ranks: '' - batch_timeout: '' + scheduler: "mpi" + sys_cores_per_node: "1" + # sys_gpus_per_node unset + max_request: "1000" # n_ranks/n_nodes cannot exceed this + n_ranks: '1000001' # placeholder value + n_nodes: '1000001' # placeholder value + batch_submit: "placeholder" + mpi_command: "placeholder" diff --git a/experiments/amg2023/cuda/ramble.yaml b/experiments/amg2023/cuda/ramble.yaml index 0a5d7f76d..45c4d415b 100644 --- a/experiments/amg2023/cuda/ramble.yaml +++ b/experiments/amg2023/cuda/ramble.yaml @@ -15,12 +15,14 @@ ramble: install: '--add --keep-stage' concretize: '-U -f' + modifiers: + - name: allocation + applications: amg2023: workloads: problem1: variables: - n_ranks: '{processes_per_node} * {n_nodes}' p: 2 px: '{p}' py: '{p}' @@ -32,11 +34,10 @@ ramble: gtl: ['gtl', 'nogtl'] gtlflag: ['-M"-gpu"', ''] experiments: - amg2023_cuda_problem1_{gtl}_{n_nodes}_{px}_{py}_{pz}_{nx}_{ny}_{nz}: + amg2023_cuda_problem1_{gtl}_{px}_{py}_{pz}_{nx}_{ny}_{nz}: variables: env_name: amg2023 - processes_per_node: '4' - n_nodes: '2' + n_gpus: '8' zips: gtl_info: - gtl diff --git a/experiments/amg2023/openmp/execute_experiment.tpl b/experiments/amg2023/openmp/execute_experiment.tpl deleted file mode 100755 index 89e73cf49..000000000 --- a/experiments/amg2023/openmp/execute_experiment.tpl +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -# Copyright 2023 Lawrence Livermore National Security, LLC and other -# Benchpark Project Developers. See the top-level COPYRIGHT file for details. 
-#
-# SPDX-License-Identifier: Apache-2.0
-
-{batch_nodes}
-{batch_ranks}
-{batch_timeout}
-
-cd {experiment_run_dir}
-
-{command}
diff --git a/experiments/amg2023/openmp/ramble.yaml b/experiments/amg2023/openmp/ramble.yaml
index 4871a5409..b58284b81 100644
--- a/experiments/amg2023/openmp/ramble.yaml
+++ b/experiments/amg2023/openmp/ramble.yaml
@@ -15,15 +15,14 @@ ramble:
         install: '--add --keep-stage'
         concretize: '-U -f'
 
+  modifiers:
+  - name: allocation
+
   applications:
     amg2023:
       workloads:
         problem1:
-          env_vars:
-            set:
-              OMP_NUM_THREADS: '{omp_num_threads}'
           variables:
-            n_ranks: '{processes_per_node} * {n_nodes}'
             p: 2
             px: '{p}'
             py: '{p}'
@@ -32,10 +31,9 @@ ramble:
             nx: '{n}'
             ny: '{n}'
             nz: '{n}'
-            processes_per_node: ['8', '4']
+            n_ranks_per_node: ['8', '4']
             n_nodes: ['1', '2']
-            threads_per_node_core: ['4', '6', '12']
-            omp_num_threads: '{threads_per_node_core} * {n_nodes}'
+            n_threads_per_proc: ['4', '6', '12']
           experiments:
-            amg2023_omp_problem1_{n_nodes}_{omp_num_threads}_{px}_{py}_{pz}_{nx}_{ny}_{nz}:
+            amg2023_omp_problem1_{n_nodes}_{n_threads_per_proc}_{px}_{py}_{pz}_{nx}_{ny}_{nz}:
               variables:
@@ -43,7 +41,7 @@ ramble:
           matrices:
           - size_threads:
             - n
-            - threads_per_node_core
+            - n_threads_per_proc
   spack:
     concretized: true
     packages:
diff --git a/experiments/amg2023/rocm/execute_experiment.tpl b/experiments/amg2023/rocm/execute_experiment.tpl
deleted file mode 100755
index 89e73cf49..000000000
--- a/experiments/amg2023/rocm/execute_experiment.tpl
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-# Copyright 2023 Lawrence Livermore National Security, LLC and other
-# Benchpark Project Developers. See the top-level COPYRIGHT file for details.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-{batch_nodes}
-{batch_ranks}
-{batch_timeout}
-
-cd {experiment_run_dir}
-
-{command}
diff --git a/experiments/amg2023/rocm/ramble.yaml b/experiments/amg2023/rocm/ramble.yaml
index 6e0d19045..c2ec20ebb 100644
--- a/experiments/amg2023/rocm/ramble.yaml
+++ b/experiments/amg2023/rocm/ramble.yaml
@@ -15,12 +15,14 @@ ramble:
         install: '--add --keep-stage'
         concretize: '-U -f'
 
+  modifiers:
+  - name: allocation
+
   applications:
     amg2023:
       workloads:
         problem1:
           variables:
-            n_ranks: '{processes_per_node} * {n_nodes}'
             p: 2
             px: '{p}'
             py: '{p}'
@@ -30,12 +32,11 @@ ramble:
             ny: '{n}'
             nz: '{n}'
           experiments:
-            '{env_name}_problem1_{n_nodes}_{px}_{py}_{pz}_{nx}_{ny}_{nz}':
+            '{env_name}_problem1_{px}_{py}_{pz}_{nx}_{ny}_{nz}':
              variables:
                gtl: ["gtl", "no-gtl"]
                env_name: 'amg2023-gpu-{gtl}'
-                processes_per_node: ['8', '4']
-                n_nodes: ['1', '2']
+                n_gpus: '8'
           matrices:
           - size_gtl:
             - n
diff --git a/experiments/gromacs/cuda/execute_experiment.tpl b/experiments/gromacs/cuda/execute_experiment.tpl
deleted file mode 100755
index ab02968fe..000000000
--- a/experiments/gromacs/cuda/execute_experiment.tpl
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/bin/bash
-
-## Copyright 2023 Lawrence Livermore National Security, LLC and other
-## Benchpark Project Developers. See the top-level COPYRIGHT file for details.
-## -## SPDX-License-Identifier: Apache-2.0 - -{batch_nodes} -{batch_ranks} -{batch_timeout} - -cd {experiment_run_dir} - -{experiment_setup} - -{command} diff --git a/experiments/gromacs/cuda/ramble.yaml b/experiments/gromacs/cuda/ramble.yaml index 0dc188d0f..5ffa1f5ec 100644 --- a/experiments/gromacs/cuda/ramble.yaml +++ b/experiments/gromacs/cuda/ramble.yaml @@ -10,6 +10,9 @@ ramble: install: '--add --keep-stage' concretize: '-U -f' + modifiers: + - name: allocation + applications: gromacs: workloads: @@ -18,13 +21,11 @@ ramble: set: OMP_PROC_BIND: close OMP_PLACES: cores - OMP_NUM_THREADS: '{omp_num_threads}' variables: experiment_setup: '' - n_ranks: '{processes_per_node} * {n_nodes}' - processes_per_node: '2' + n_ranks_per_node: '2' n_nodes: '4' - omp_num_threads: '10' + n_threads_per_proc: '10' target: 'gpu' experiments: gromacs_water_gmx50_adac_size{size}_dlb{dlb}_pin{pin}_target{target}_maxh{maxh}_nsteps{nsteps}_nstlist{nstlist}_npme{npme}: diff --git a/experiments/gromacs/openmp/execute_experiment.tpl b/experiments/gromacs/openmp/execute_experiment.tpl deleted file mode 100755 index ab02968fe..000000000 --- a/experiments/gromacs/openmp/execute_experiment.tpl +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -## Copyright 2023 Lawrence Livermore National Security, LLC and other -## Benchpark Project Developers. See the top-level COPYRIGHT file for details. -## -## SPDX-License-Identifier: Apache-2.0 - -{batch_nodes} -{batch_ranks} -{batch_timeout} - -cd {experiment_run_dir} - -{experiment_setup} - -{command} diff --git a/experiments/gromacs/openmp/ramble.yaml b/experiments/gromacs/openmp/ramble.yaml index 166bb9620..44d19aea9 100644 --- a/experiments/gromacs/openmp/ramble.yaml +++ b/experiments/gromacs/openmp/ramble.yaml @@ -10,6 +10,9 @@ ramble: install: '--add --keep-stage' concretize: '-U -f' + modifiers: + - name: allocation + applications: gromacs: workloads: @@ -18,13 +21,11 @@ ramble: set: OMP_PROC_BIND: close OMP_PLACES: cores - OMP_NUM_THREADS: '{omp_num_threads}' variables: experiment_setup: '' - n_ranks: '{processes_per_node} * {n_nodes}' - processes_per_node: '4' + n_ranks_per_node: '4' n_nodes: '2' - omp_num_threads: '16' + n_threads_per_proc: '16' target: 'cpu' experiments: gromacs_water_gmx50_adac_size{size}_dlb{dlb}_pin{pin}_target{target}_maxh{maxh}_nsteps{nsteps}_nstlist{nstlist}_npme{npme}: diff --git a/experiments/gromacs/rocm/execute_experiment.tpl b/experiments/gromacs/rocm/execute_experiment.tpl deleted file mode 100755 index ab02968fe..000000000 --- a/experiments/gromacs/rocm/execute_experiment.tpl +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -## Copyright 2023 Lawrence Livermore National Security, LLC and other -## Benchpark Project Developers. See the top-level COPYRIGHT file for details. 
-## -## SPDX-License-Identifier: Apache-2.0 - -{batch_nodes} -{batch_ranks} -{batch_timeout} - -cd {experiment_run_dir} - -{experiment_setup} - -{command} diff --git a/experiments/gromacs/rocm/ramble.yaml b/experiments/gromacs/rocm/ramble.yaml index c1aa8368e..5b2e3afda 100644 --- a/experiments/gromacs/rocm/ramble.yaml +++ b/experiments/gromacs/rocm/ramble.yaml @@ -10,6 +10,9 @@ ramble: install: '--add --keep-stage' concretize: '-U -f' + modifiers: + - name: allocation + applications: gromacs: workloads: @@ -18,13 +21,11 @@ ramble: set: OMP_PROC_BIND: close OMP_PLACES: cores - OMP_NUM_THREADS: '{omp_num_threads}' variables: experiment_setup: '' - n_ranks: '{processes_per_node} * {n_nodes}' - processes_per_node: '8' + n_ranks: '8' n_nodes: '1' - omp_num_threads: '8' + n_threads_per_proc: '8' target: 'gpu' experiments: gromacs_water_gmx50_adac_size{size}_dlb{dlb}_pin{pin}_target{target}_maxh{maxh}_nsteps{nsteps}_nstlist{nstlist}_npme{npme}: diff --git a/experiments/hpcc/mpi-only/execute_experiment.tpl b/experiments/hpcc/mpi-only/execute_experiment.tpl deleted file mode 100755 index 89e73cf49..000000000 --- a/experiments/hpcc/mpi-only/execute_experiment.tpl +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -# Copyright 2023 Lawrence Livermore National Security, LLC and other -# Benchpark Project Developers. See the top-level COPYRIGHT file for details. -# -# SPDX-License-Identifier: Apache-2.0 - -{batch_nodes} -{batch_ranks} -{batch_timeout} - -cd {experiment_run_dir} - -{command} diff --git a/experiments/hpcc/mpi-only/ramble.yaml b/experiments/hpcc/mpi-only/ramble.yaml index fc6ff6ed9..bbe92689e 100644 --- a/experiments/hpcc/mpi-only/ramble.yaml +++ b/experiments/hpcc/mpi-only/ramble.yaml @@ -14,6 +14,9 @@ ramble: install: '--add --keep-stage' concretize: '-U -f' + modifiers: + - name: allocation + applications: hpcc: workloads: diff --git a/experiments/hpcg/openmp/execute_experiment.tpl b/experiments/hpcg/openmp/execute_experiment.tpl deleted file mode 100755 index 89e73cf49..000000000 --- a/experiments/hpcg/openmp/execute_experiment.tpl +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -# Copyright 2023 Lawrence Livermore National Security, LLC and other -# Benchpark Project Developers. See the top-level COPYRIGHT file for details. 
-# -# SPDX-License-Identifier: Apache-2.0 - -{batch_nodes} -{batch_ranks} -{batch_timeout} - -cd {experiment_run_dir} - -{command} diff --git a/experiments/hpcg/openmp/ramble.yaml b/experiments/hpcg/openmp/ramble.yaml index 10a212ea0..df1d77336 100644 --- a/experiments/hpcg/openmp/ramble.yaml +++ b/experiments/hpcg/openmp/ramble.yaml @@ -14,29 +14,28 @@ ramble: install: '--add --keep-stage' concretize: '-U -f' + modifiers: + - name: allocation + applications: hpcg: workloads: standard: - env_vars: - set: - OMP_NUM_THREADS: '{n_threads}' variables: - n_ranks: '1' mx: '104' my: '104' mz: '104' matrix_size: '{mx} {my} {mz}' iterations: '60' - n_threads: ['8', '16'] - processes_per_node: '1' + n_threads_per_proc: ['8', '16'] + n_ranks_per_node: '1' n_nodes: '1' experiments: - hpcg_standard_{mx}_{my}_{mz}_{iterations}_{n_ranks}_{n_threads}: + hpcg_standard_{mx}_{my}_{mz}_{iterations}_{n_threads_per_proc}: variables: env_name: hpcg-omp matrix: - - n_threads + - n_threads_per_proc spack: concretized: true diff --git a/experiments/hpl/openmp/execute_experiment.tpl b/experiments/hpl/openmp/execute_experiment.tpl deleted file mode 100755 index 89e73cf49..000000000 --- a/experiments/hpl/openmp/execute_experiment.tpl +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -# Copyright 2023 Lawrence Livermore National Security, LLC and other -# Benchpark Project Developers. See the top-level COPYRIGHT file for details. -# -# SPDX-License-Identifier: Apache-2.0 - -{batch_nodes} -{batch_ranks} -{batch_timeout} - -cd {experiment_run_dir} - -{command} diff --git a/experiments/hpl/openmp/ramble.yaml b/experiments/hpl/openmp/ramble.yaml index 03083e4d9..ce8a5c2f0 100644 --- a/experiments/hpl/openmp/ramble.yaml +++ b/experiments/hpl/openmp/ramble.yaml @@ -14,15 +14,14 @@ ramble: install: '--add --keep-stage' concretize: '-U -f' + modifiers: + - name: allocation + applications: hpl: workloads: standard: - env_vars: - set: - OMP_NUM_THREADS: '{omp_num_threads}' variables: - n_ranks: '{processes_per_node} * {n_nodes}' N-Grids: 1 Ps: 2 Qs: 4 @@ -30,15 +29,15 @@ ramble: Ns: 10000 N-NBs: 1 NBs: 128 - processes_per_node: '8' + n_ranks_per_node: '8' n_nodes: '1' - omp_num_threads: ['2', '4', '8'] + n_threads_per_proc: ['2', '4', '8'] experiments: - hpl_omp_problem1_{n_nodes}_{n_ranks}_{omp_num_threads}_{Ps}_{Qs}_{Ns}_{NBs}: + hpl_omp_problem1_{n_threads_per_proc}_{Ps}_{Qs}_{Ns}_{NBs}: variables: env_name: hpl-omp matrix: - - omp_num_threads + - n_threads_per_proc spack: concretized: true packages: diff --git a/experiments/lammps/openmp/execute_experiment.tpl b/experiments/lammps/openmp/execute_experiment.tpl deleted file mode 100644 index 89e73cf49..000000000 --- a/experiments/lammps/openmp/execute_experiment.tpl +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -# Copyright 2023 Lawrence Livermore National Security, LLC and other -# Benchpark Project Developers. See the top-level COPYRIGHT file for details. 
-# -# SPDX-License-Identifier: Apache-2.0 - -{batch_nodes} -{batch_ranks} -{batch_timeout} - -cd {experiment_run_dir} - -{command} diff --git a/experiments/lammps/openmp/ramble.yaml b/experiments/lammps/openmp/ramble.yaml index 416d5f462..ef5357ee4 100644 --- a/experiments/lammps/openmp/ramble.yaml +++ b/experiments/lammps/openmp/ramble.yaml @@ -14,28 +14,27 @@ ramble: install: '--add --keep-stage' concretize: '-U -f' + modifiers: + - name: allocation + applications: lammps: workloads: hns-reaxff: - env_vars: - set: - OMP_NUM_THREADS: '{omp_num_threads}' variables: - n_ranks: '{processes_per_node} * {n_nodes}' size_name: ['medium'] size_x: [2] size_y: [2] size_z: [2] scaling_nodes: [1] n_nodes: '{scaling_nodes}' - omp_num_threads: '1' + n_threads_per_proc: '1' lammps_flags: '-v x {size_x} -v y {size_y} -v z {size_z}' experiments: scaling_{n_nodes}nodes_{size_name}: variables: env_name: lammps - processes_per_node: ['36'] + n_ranks_per_node: ['36'] zips: problems: - size_name diff --git a/experiments/md-test/mpi-only/execute_experiment.tpl b/experiments/md-test/mpi-only/execute_experiment.tpl deleted file mode 100755 index 89e73cf49..000000000 --- a/experiments/md-test/mpi-only/execute_experiment.tpl +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -# Copyright 2023 Lawrence Livermore National Security, LLC and other -# Benchpark Project Developers. See the top-level COPYRIGHT file for details. -# -# SPDX-License-Identifier: Apache-2.0 - -{batch_nodes} -{batch_ranks} -{batch_timeout} - -cd {experiment_run_dir} - -{command} diff --git a/experiments/md-test/mpi-only/ramble.yaml b/experiments/md-test/mpi-only/ramble.yaml index ef05eb7f6..2a2c41ad4 100644 --- a/experiments/md-test/mpi-only/ramble.yaml +++ b/experiments/md-test/mpi-only/ramble.yaml @@ -14,6 +14,9 @@ ramble: install: '--add --keep-stage' concretize: '-U -f' + modifiers: + - name: allocation + applications: md-test: workloads: diff --git a/experiments/osu-micro-benchmarks/mpi-only/execute_experiment.tpl b/experiments/osu-micro-benchmarks/mpi-only/execute_experiment.tpl deleted file mode 100644 index 89e73cf49..000000000 --- a/experiments/osu-micro-benchmarks/mpi-only/execute_experiment.tpl +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -# Copyright 2023 Lawrence Livermore National Security, LLC and other -# Benchpark Project Developers. See the top-level COPYRIGHT file for details. 
-# -# SPDX-License-Identifier: Apache-2.0 - -{batch_nodes} -{batch_ranks} -{batch_timeout} - -cd {experiment_run_dir} - -{command} diff --git a/experiments/osu-micro-benchmarks/mpi-only/ramble.yaml b/experiments/osu-micro-benchmarks/mpi-only/ramble.yaml index 8d3dead19..cb371d09f 100644 --- a/experiments/osu-micro-benchmarks/mpi-only/ramble.yaml +++ b/experiments/osu-micro-benchmarks/mpi-only/ramble.yaml @@ -14,19 +14,21 @@ ramble: install: '--add --keep-stage' concretize: '-U -f' + modifiers: + - name: allocation + applications: osu-micro-benchmarks: workloads: osu_latency: variables: - n_ranks: '{processes_per_node} * {n_nodes}' size_name: ['medium'] scaling_nodes: [2] n_nodes: '{scaling_nodes}' experiments: scaling_{n_nodes}nodes_{size_name}: variables: - processes_per_node: ['1'] + n_ranks_per_node: ['36'] spack: concretized: true packages: diff --git a/experiments/raja-perf/cuda/execute_experiment.tpl b/experiments/raja-perf/cuda/execute_experiment.tpl deleted file mode 100644 index 89e73cf49..000000000 --- a/experiments/raja-perf/cuda/execute_experiment.tpl +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -# Copyright 2023 Lawrence Livermore National Security, LLC and other -# Benchpark Project Developers. See the top-level COPYRIGHT file for details. -# -# SPDX-License-Identifier: Apache-2.0 - -{batch_nodes} -{batch_ranks} -{batch_timeout} - -cd {experiment_run_dir} - -{command} diff --git a/experiments/raja-perf/cuda/ramble.yaml b/experiments/raja-perf/cuda/ramble.yaml index 607b10fea..5178f13cb 100644 --- a/experiments/raja-perf/cuda/ramble.yaml +++ b/experiments/raja-perf/cuda/ramble.yaml @@ -13,16 +13,18 @@ ramble: spack_flags: install: '--add --keep-stage' concretize: '-U -f' + + modifiers: + - name: allocation + applications: raja-perf: workloads: suite: experiments: - suite_{n_nodes}_{n_ranks}: + suite_{n_gpus}: variables: - processes_per_node: ['1', '2', '4'] - n_ranks: '{processes_per_node} * {n_nodes}' - n_nodes: '1' + n_gpus: '1' spack: concretized: true packages: diff --git a/experiments/raja-perf/mpi-only/execute_experiment.tpl b/experiments/raja-perf/mpi-only/execute_experiment.tpl deleted file mode 100644 index 89e73cf49..000000000 --- a/experiments/raja-perf/mpi-only/execute_experiment.tpl +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -# Copyright 2023 Lawrence Livermore National Security, LLC and other -# Benchpark Project Developers. See the top-level COPYRIGHT file for details. -# -# SPDX-License-Identifier: Apache-2.0 - -{batch_nodes} -{batch_ranks} -{batch_timeout} - -cd {experiment_run_dir} - -{command} diff --git a/experiments/raja-perf/mpi-only/ramble.yaml b/experiments/raja-perf/mpi-only/ramble.yaml index 73e58f6d4..d48297781 100644 --- a/experiments/raja-perf/mpi-only/ramble.yaml +++ b/experiments/raja-perf/mpi-only/ramble.yaml @@ -14,6 +14,9 @@ ramble: install: '--add --keep-stage' concretize: '-U -f' + modifiers: + - name: allocation + applications: raja-perf: workloads: diff --git a/experiments/raja-perf/openmp/execute_experiment.tpl b/experiments/raja-perf/openmp/execute_experiment.tpl deleted file mode 100644 index 89e73cf49..000000000 --- a/experiments/raja-perf/openmp/execute_experiment.tpl +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -# Copyright 2023 Lawrence Livermore National Security, LLC and other -# Benchpark Project Developers. See the top-level COPYRIGHT file for details. 
-# -# SPDX-License-Identifier: Apache-2.0 - -{batch_nodes} -{batch_ranks} -{batch_timeout} - -cd {experiment_run_dir} - -{command} diff --git a/experiments/raja-perf/openmp/ramble.yaml b/experiments/raja-perf/openmp/ramble.yaml index 21a531a18..8f1a161b1 100644 --- a/experiments/raja-perf/openmp/ramble.yaml +++ b/experiments/raja-perf/openmp/ramble.yaml @@ -14,21 +14,21 @@ ramble: install: '--add --keep-stage' concretize: '-U -f' + modifiers: + - name: allocation + applications: raja-perf: workloads: suite: - env_vars: - set: - OMP_NUM_THREADS: '{n_threads}' variables: n_ranks: '1' experiments: - suite_{n_nodes}_{n_ranks}_{n_threads}: + suite_{n_nodes}_{n_ranks}_{n_threads_per_proc}: variables: env_name: raja-perf n_nodes: '1' - n_threads: ['8', '16'] + n_threads_per_proc: ['8', '16'] spack: concretized: true packages: diff --git a/experiments/raja-perf/rocm/execute_experiment.tpl b/experiments/raja-perf/rocm/execute_experiment.tpl deleted file mode 100644 index 89e73cf49..000000000 --- a/experiments/raja-perf/rocm/execute_experiment.tpl +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -# Copyright 2023 Lawrence Livermore National Security, LLC and other -# Benchpark Project Developers. See the top-level COPYRIGHT file for details. -# -# SPDX-License-Identifier: Apache-2.0 - -{batch_nodes} -{batch_ranks} -{batch_timeout} - -cd {experiment_run_dir} - -{command} diff --git a/experiments/raja-perf/rocm/ramble.yaml b/experiments/raja-perf/rocm/ramble.yaml index ac43c0899..1d98b7eb6 100644 --- a/experiments/raja-perf/rocm/ramble.yaml +++ b/experiments/raja-perf/rocm/ramble.yaml @@ -14,16 +14,17 @@ ramble: install: '--add --keep-stage' concretize: '-U -f' + modifiers: + - name: allocation + applications: raja-perf: workloads: suite: experiments: - suite_{n_nodes}_{n_ranks}_{n_gpus_per_task}: + suite_{n_gpus}: variables: - n_ranks: '1' - n_nodes: '1' - n_gpus_per_task: '1' + n_gpus: '1' spack: concretized: true packages: diff --git a/experiments/saxpy/cuda/execute_experiment.tpl b/experiments/saxpy/cuda/execute_experiment.tpl deleted file mode 100755 index 8815e9c4f..000000000 --- a/experiments/saxpy/cuda/execute_experiment.tpl +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -# Copyright 2023 Lawrence Livermore National Security, LLC and other -# Benchpark Project Developers. See the top-level COPYRIGHT file for details. -# -# SPDX-License-Identifier: Apache-2.0 - -{batch_nodes} -{batch_timeout} - -cd {experiment_run_dir} - -{command} diff --git a/experiments/saxpy/cuda/ramble.yaml b/experiments/saxpy/cuda/ramble.yaml index a819742b0..5412bf8aa 100644 --- a/experiments/saxpy/cuda/ramble.yaml +++ b/experiments/saxpy/cuda/ramble.yaml @@ -15,17 +15,17 @@ ramble: install: '--add --keep-stage' concretize: '-U -f' + modifiers: + - name: allocation + applications: saxpy: workloads: problem: - variables: - n_ranks: '{processes_per_node} * {n_nodes}' experiments: - saxpy_{n}_{n_nodes}_{processes_per_node}_{n_ranks}: + saxpy_{n}: variables: - processes_per_node: '4' - n_nodes: '1' + n_gpus: '1' n: ['128', '256', '512', '1024'] matrix: - n diff --git a/experiments/saxpy/openmp/execute_experiment.tpl b/experiments/saxpy/openmp/execute_experiment.tpl deleted file mode 100755 index 89e73cf49..000000000 --- a/experiments/saxpy/openmp/execute_experiment.tpl +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -# Copyright 2023 Lawrence Livermore National Security, LLC and other -# Benchpark Project Developers. See the top-level COPYRIGHT file for details. 
-# -# SPDX-License-Identifier: Apache-2.0 - -{batch_nodes} -{batch_ranks} -{batch_timeout} - -cd {experiment_run_dir} - -{command} diff --git a/experiments/saxpy/openmp/ramble.yaml b/experiments/saxpy/openmp/ramble.yaml index f4e601edf..20e765211 100644 --- a/experiments/saxpy/openmp/ramble.yaml +++ b/experiments/saxpy/openmp/ramble.yaml @@ -15,26 +15,24 @@ ramble: install: '--add --keep-stage' concretize: '-U -f' + modifiers: + - name: allocation + applications: saxpy: workloads: problem: - env_vars: - set: - OMP_NUM_THREADS: '{n_threads}' - variables: - n_ranks: '8' experiments: - saxpy_{n}_{n_nodes}_{n_ranks}_{n_threads}: + saxpy_{n}_{n_nodes}_{omp_num_threads}: variables: - processes_per_node: ['8', '4'] + n_ranks_per_node: ['8', '4'] n_nodes: ['1', '2'] - n_threads: ['2', '4'] + omp_num_threads: ['2', '4'] n: ['512', '1024'] matrices: - size_threads: - n - - n_threads + - omp_num_threads spack: concretized: true diff --git a/experiments/saxpy/rocm/execute_experiment.tpl b/experiments/saxpy/rocm/execute_experiment.tpl deleted file mode 100755 index 89e73cf49..000000000 --- a/experiments/saxpy/rocm/execute_experiment.tpl +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -# Copyright 2023 Lawrence Livermore National Security, LLC and other -# Benchpark Project Developers. See the top-level COPYRIGHT file for details. -# -# SPDX-License-Identifier: Apache-2.0 - -{batch_nodes} -{batch_ranks} -{batch_timeout} - -cd {experiment_run_dir} - -{command} diff --git a/experiments/saxpy/rocm/ramble.yaml b/experiments/saxpy/rocm/ramble.yaml index 80f443b21..1807dbe9f 100644 --- a/experiments/saxpy/rocm/ramble.yaml +++ b/experiments/saxpy/rocm/ramble.yaml @@ -15,17 +15,17 @@ ramble: install: '--add --keep-stage' concretize: '-U -f' + modifiers: + - name: allocation + applications: saxpy: workloads: problem: - variables: - n_ranks: '{processes_per_node} * {n_nodes}' experiments: - saxpy_{n}_{n_nodes}_{processes_per_node}_{n_ranks}: + saxpy_{n}: variables: - processes_per_node: ['8', '4'] - n_nodes: ['1', '2'] + n_gpus: '1' n: ['128', '256', '512', '1024'] matrix: - n diff --git a/experiments/streamc/openmp/execute_experiment.tpl b/experiments/streamc/openmp/execute_experiment.tpl deleted file mode 100755 index 89e73cf49..000000000 --- a/experiments/streamc/openmp/execute_experiment.tpl +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -# Copyright 2023 Lawrence Livermore National Security, LLC and other -# Benchpark Project Developers. See the top-level COPYRIGHT file for details. 
-# -# SPDX-License-Identifier: Apache-2.0 - -{batch_nodes} -{batch_ranks} -{batch_timeout} - -cd {experiment_run_dir} - -{command} diff --git a/experiments/streamc/openmp/ramble.yaml b/experiments/streamc/openmp/ramble.yaml index da4112483..8d813118c 100644 --- a/experiments/streamc/openmp/ramble.yaml +++ b/experiments/streamc/openmp/ramble.yaml @@ -18,24 +18,24 @@ ramble: n_times: ['20', '35'] array_size: ['80000000', '1280000000'] + modifiers: + - name: allocation + applications: streamc: workloads: streamc: - env_vars: - set: - OMP_NUM_THREADS: '{n_threads}' variables: n_ranks: '1' experiments: - stream_{array_size}_{n_times}_{n_threads}: + stream_{array_size}_{n_times}_{n_threads_per_proc}: variables: env_name: 'stream_{array_size}_{n_times}' - processes_per_node: '1' + n_ranks_per_node: '1' n_nodes: '1' - n_threads: ['8', '16', '32'] + n_threads_per_proc: ['8', '16', '32'] matrix: - - n_threads + - n_threads_per_proc spack: concretized: true diff --git a/experiments/amg2023/cuda/execute_experiment.tpl b/experiments/universal-resources/execute_experiment.tpl similarity index 88% rename from experiments/amg2023/cuda/execute_experiment.tpl rename to experiments/universal-resources/execute_experiment.tpl index 8815e9c4f..fc0acb57a 100755 --- a/experiments/amg2023/cuda/execute_experiment.tpl +++ b/experiments/universal-resources/execute_experiment.tpl @@ -4,8 +4,7 @@ # # SPDX-License-Identifier: Apache-2.0 -{batch_nodes} -{batch_timeout} +{allocation_directives} cd {experiment_run_dir} diff --git a/modifiers/allocation/modifier.py b/modifiers/allocation/modifier.py new file mode 100644 index 000000000..a40092f05 --- /dev/null +++ b/modifiers/allocation/modifier.py @@ -0,0 +1,427 @@ +# Copyright 2023 Lawrence Livermore National Security, LLC and other +# Benchpark Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: Apache-2.0 + +import math +from enum import Enum +from ramble.modkit import * + + +class AllocOpt(Enum): + # Experiment resource requests + N_RANKS = 1 + N_NODES = 2 + N_CORES_PER_RANK = 3 + N_THREADS_PER_PROC = 4 # number of OMP threads per rank + N_RANKS_PER_NODE = 5 + N_GPUS = 6 + N_CORES_PER_NODE = 7 + OMP_NUM_THREADS = 8 + + # Descriptions of resources available on systems + SYS_GPUS_PER_NODE = 100 + SYS_CORES_PER_NODE = 101 + SYS_MEM_PER_NODE = 102 + + # Scheduler identification and other high-level instructions + SCHEDULER = 200 + TIMEOUT = 201 # This is assumed to be in minutes + MAX_REQUEST = 202 + QUEUE = 203 + + @staticmethod + def as_type(enumval, input): + if enumval in [AllocOpt.SCHEDULER, AllocOpt.QUEUE]: + return str(input) + else: + return int(input) + + +class AllocAlias: + # Key options, if set, are used to set value options. Type inference + # occurs before that step, so type inference must be applied to aliases + # too. + match = { + AllocOpt.OMP_NUM_THREADS: AllocOpt.N_THREADS_PER_PROC, + } + + +SENTINEL_UNDEFINED_VALUE_STR = "placeholder" + + +class AttrDict(dict): + """Takes variables defined in AllocOpt, and collects them into a single + object where, for a given attribute v, and an AttrDict instance x, that + variable is accessible as "x.v" in Python. + + This is intended to be the most succinct form of access, and not require + dict access (i.e. `[]`) or string quotation, and also provides the + benefit that if you try to access a variable not defined in AllocOpt, + there will be an attribute error. 
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self["_attributes"] = set() + + def __getattr__(self, *args, **kwargs): + return self.__getitem__(*args, **kwargs) + + def __setattr__(self, *args, **kwargs): + self.__setitem__(*args, **kwargs) + + def __delattr__(self, *args, **kwargs): + self.__delitem__(*args, **kwargs) + + def __setitem__(self, key, value): + super().__setitem__(key, value) + if key != "_attributes": + self["_attributes"].add(key) + + def defined(self): + return list((k, self[k]) for k in self["_attributes"]) + + @staticmethod + def _nullify_placeholders(v): + # If we see a string variable set to "placeholder" we assume the + # user wants us to set it. + # For integers, values exceeding max_request are presumed to be + # placeholders. + max_request_int = v.max_request or 1000 + placeholder_checks = { + int: lambda x: x > max_request_int, + str: lambda x: x == SENTINEL_UNDEFINED_VALUE_STR, + } + for var, val in v.defined(): + if val is None: + continue + + for t, remove_fn in placeholder_checks.items(): + try: + read_as = t(val) + if remove_fn(read_as): + v[var] = None + except ValueError: + pass + + @staticmethod + def from_predefined_variables(expander): + var_defs = AttrDict._defined_allocation_options(expander) + v = AttrDict() + for alloc_opt in AllocOpt: + setattr(v, alloc_opt.name.lower(), var_defs.get(alloc_opt, None)) + + AttrDict._nullify_placeholders(v) + AttrDict._propagate_aliases(v) + return v + + @staticmethod + def _defined_allocation_options(expander): + """For each possible allocation option, check if it was defined as a + variable by the user. + + This includes placeholders (those values are not treated differently + for this step). + """ + defined = {} + for alloc_opt in AllocOpt: + # print(f"<---- Expanding {str(alloc_opt)}") + expansion_vref = f"{{{alloc_opt.name.lower()}}}" + var_def = expander.expand_var(expansion_vref) + # print(f" = {str(var_def)}") + if var_def == expansion_vref: + # If "{x}" expands to literal "{x}", that means it wasn't + # defined + continue + try: + val = AllocOpt.as_type(alloc_opt, var_def) + except ValueError: + continue + + if val is not None: + defined[alloc_opt] = val + + return defined + + @staticmethod + def _propagate_aliases(attr_dict): + # This assumes that placeholder nullification has already taken place + # (if it runs before, it may erroneously think that there is a + # duplicated/conflicting setting when the target is in fact just a + # placeholder value) + for alt_var, target in AllocAlias.match.items(): + src_name = alt_var.name.lower() + dst_name = target.name.lower() + src_val = getattr(attr_dict, src_name, None) + dst_val = getattr(attr_dict, dst_name, None) + + if src_val is not None: + if dst_val is not None and dst_val != src_val: + # Both the variable and its alias were set, and to + # different values. 
Note this modifier can be run
+                    # multiple times, so just looking for whether they
+                    # are set would falsely trigger an error
+                    raise RuntimeError(f"Configs set {src_name} and {dst_name}")
+                setattr(attr_dict, dst_name, src_val)
+
+
+class TimeFormat:
+    @staticmethod
+    def hhmmss_tuple(minutes):
+        hours = int(minutes / 60)
+        minutes = minutes % 60
+        seconds = 0
+        return (hours, minutes, seconds)
+
+    @staticmethod
+    def as_hhmm(minutes):
+        return ":".join(str(x).zfill(2) for x in TimeFormat.hhmmss_tuple(minutes)[:2])
+
+    @staticmethod
+    def as_hhmmss(minutes):
+        return ":".join(str(x).zfill(2) for x in TimeFormat.hhmmss_tuple(minutes))
+
+
+def divide_into(dividend, divisor):
+    """For dividend/divisor, return the integer quotient and the fractional
+    remainder.
+
+    Attempt to detect cases where floating-point rounding error produces a
+    spuriously nonzero remainder, and clamp the remainder to zero.
+    """
+    if divisor > dividend:
+        raise ValueError("Dividend must be at least as large as divisor")
+    for x in [dividend, divisor]:
+        if not isinstance(x, int):
+            raise ValueError("Both values must be integers")
+    multi_part = dividend / float(divisor)
+
+    quotient = math.floor(multi_part)
+    # Python 3.7 has math.remainder
+    remainder = multi_part - quotient
+    rounding_err_threshold = 1 / float(dividend)
+    if remainder < rounding_err_threshold:
+        remainder = 0
+
+    return quotient, remainder
+
+
+class Allocation(BasicModifier):
+
+    name = "allocation"
+
+    tags("infrastructure")
+
+    # Currently there is only one mode. The only behavior supported right
+    # now is to attempt to request "enough" resources for a given
+    # request (e.g. to make sure we request enough nodes, assuming we
+    # know how many CPUs we want).
+    mode("standard", description="Standard execution mode for allocation")
+    default_mode("standard")
+
+    def inherit_from_application(self, app):
+        super().inherit_from_application(app)
+
+        v = AttrDict.from_predefined_variables(app.expander)
+
+        # Calculate unset values (e.g.
determine n_nodes if not set) + self.determine_allocation(v) + + self.determine_scheduler_instructions(v) + + # Definitions + for var, val in v.defined(): + # print(f"<--- Define {str(var)} = {str(val)}") + app.define_variable(var, str(val)) + + if v.n_threads_per_proc: + self.env_var_modification( + "OMP_NUM_THREADS", + method="set", + modification="{n_threads_per_proc}", + mode="standard", + ) + + def determine_allocation(self, v): + if not v.n_ranks: + if v.n_ranks_per_node and v.n_nodes: + v.n_ranks = v.n_nodes * v.n_ranks_per_node + # TODO: elif n_gpus_per_node and n_nodes + elif v.n_gpus: + v.n_ranks = v.n_gpus + + if not v.n_nodes: + if not any((v.n_ranks, v.n_gpus)): + raise ValueError("Must specify one of: n_nodes, n_ranks, n_gpus") + cores_node_request = None + if v.n_ranks: + multi_cores_per_rank = v.n_cores_per_rank or v.n_threads_per_proc or 0 + cores_request_per_rank = max(multi_cores_per_rank, 1) + ranks_per_node = math.floor( + v.sys_cores_per_node / cores_request_per_rank + ) + if ranks_per_node == 0: + raise ValueError( + "Experiment requests more cores per rank than " + "are available on a node" + ) + cores_node_request = math.ceil(v.n_ranks / ranks_per_node) + gpus_node_request = None + if v.n_gpus: + if v.sys_gpus_per_node: + gpus_node_request = math.ceil(v.n_gpus / float(v.sys_gpus_per_node)) + else: + raise ValueError( + "Experiment requests GPUs, but sys_gpus_per_node " + "is not specified for the system" + ) + v.n_nodes = max(cores_node_request or 0, gpus_node_request or 0) + + if not v.n_threads_per_proc: + v.n_threads_per_proc = 1 + + max_request = v.max_request or 1000 + # Final check, make sure the above arithmetic didn't result in an + # unreasonable allocation request. + for var, val in v.defined(): + try: + int(val) + except (ValueError, TypeError): + continue + if val > max_request: + raise ValueError(f"Request exceeds maximum: {var}/{val}/{max_request}") + + def slurm_instructions(self, v): + srun_opts = [] + sbatch_opts = [] # opts just for the sbatch script + + if v.n_ranks: + srun_opts.append(f"-n {v.n_ranks}") + if v.n_gpus: + srun_opts.append(f"--gpus {v.n_gpus}") + if v.n_nodes: + srun_opts.append(f"-N {v.n_nodes}") + + if v.timeout: + sbatch_opts.append(f"--time {v.timeout}") + + sbatch_directives = list(f"#SBATCH {x}" for x in (srun_opts + sbatch_opts)) + + v.mpi_command = f"srun {' '.join(srun_opts)}" + v.batch_submit = "sbatch {execute_experiment}" + v.allocation_directives = "\n".join(sbatch_directives) + + def gpus_as_gpus_per_rank(self, v): + """Some systems don't have a mechanism for directly requesting a + total number of GPUs: they just have an option that specifies how + many GPUs are required for each rank. 
+ """ + # This error message can come up in multiple scenarios, so pre + # define it if it's needed (it might not be true except where the + # error is raised) + err_msg = ( + f"Cannot express GPUs ({v.n_gpus}) as an integer " + f"multiple of ranks ({v.n_ranks})" + ) + + if v.n_gpus >= v.n_ranks: + quotient, remainder = divide_into(v.n_gpus, v.n_ranks) + if remainder == 0: + return quotient + else: + raise ValueError(err_msg) + else: + raise ValueError(err_msg) + + def flux_instructions(self, v): + cmd_opts = [] + batch_opts = [] + + if v.n_ranks: + cmd_opts.append(f"-n {v.n_ranks}") + if v.n_nodes: + cmd_opts.append(f"-N {v.n_nodes}") + if v.n_gpus: + gpus_per_rank = self.gpus_as_gpus_per_rank(v) + cmd_opts.append(f"--gpus-per-task={gpus_per_rank}") + + if v.timeout: + batch_opts.append("-t {v.timeout}m") + + batch_directives = list(f"# flux: {x}" for x in (cmd_opts + batch_opts)) + + v.mpi_command = f"flux run {' '.join(cmd_opts)}" + v.batch_submit = "flux batch {execute_experiment}" + v.allocation_directives = "\n".join(batch_directives) + + def mpi_instructions(self, v): + v.mpi_command = f"mpirun -n {v.n_ranks} --oversubscribe" + v.batch_submit = "{execute_experiment}" + v.allocation_directives = "" + + def lsf_instructions(self, v): + """Note that this generates lrun invocations; lrun is an LLNL-specific + tool. jsrun is the generally-available scheduler for IBM Spectrum + machines (there is not currently a method for generating jsrun + invocations). + """ + cmd_opts = [] + batch_opts = [] + + if v.n_ranks: + cmd_opts.append(f"-n {v.n_ranks}") + if v.n_nodes: + batch_opts.append(f"-nnodes {v.n_nodes}") + if v.n_gpus: + gpus_per_rank = self.gpus_as_gpus_per_rank(v) + batch_opts.append(f"-g {gpus_per_rank}") + if v.n_ranks_per_node: + cmd_opts.append(f"-T {v.n_ranks_per_node}") + # TODO: this might have to be an option on the batch_submit vs. + # a batch directive + if v.queue: + batch_opts.append(f"-q {v.queue}") + if v.timeout: + batch_opts.append(f"-W {TimeFormat.as_hhmm(v.timeout)}") + + batch_directives = list(f"#BSUB {x}" for x in batch_opts) + + v.mpi_command = f"lrun {' '.join(cmd_opts)}" + v.batch_submit = "bsub {execute_experiment}" + v.allocation_directives = "\n".join(batch_directives) + + def pjm_instructions(self, v): + batch_opts = [] + + if v.n_ranks: + batch_opts.append(f"--mpi proc={v.n_ranks}") + if v.n_nodes: + batch_opts.append(f'-L "node={v.n_nodes}"') + if v.timeout: + batch_opts.append(f'-L "elapse={TimeFormat.as_hhmmss(v.timeout)}"') + batch_opts.append( + '-x PJM_LLIO_GFSCACHE="/vol0001:/vol0002:/vol0003:/vol0004:/vol0005:/vol0006"' + ) + + batch_directives = list(f"#PJM {x}" for x in batch_opts) + + v.mpi_command = "mpiexec" + v.batch_submit = "pjsub {execute_experiment}" + v.allocation_directives = "\n".join(batch_directives) + + def determine_scheduler_instructions(self, v): + handler = { + "slurm": self.slurm_instructions, + "flux": self.flux_instructions, + "mpi": self.mpi_instructions, + "lsf": self.lsf_instructions, + "pjm": self.pjm_instructions, + } + if v.scheduler not in handler: + raise ValueError( + f"scheduler ({v.scheduler}) must be one of : " + + " ".join(handler.keys()) + ) + + if not v.timeout: + v.timeout = 120 + + handler[v.scheduler](v)