From c9ee31d4ba27342a40c6ec34f3599933d140f700 Mon Sep 17 00:00:00 2001 From: domke <673751-domke@users.noreply.gitlab.com> Date: Sat, 1 Jun 2024 16:47:14 +0900 Subject: [PATCH] adding pre and post processing options which are unrelated to the individual benchmark the issue is that fujitsu-mpi does not redirect to stdout, and on fugaku a usual workaround via -of|-std flags is currently disabled by the ops team and therefore we can at best redirect to a merged file of stdout+stderr for each rank via -std-proc so, the general ramble approach of `mpi_command exec >> log_file` does not work; pre_cmd and post_cmd don't work either, because they are part of the benchmark or part of a modifier (which isn't the case for an mpi) currently, the only way I see to add this is via these pre_process and post_process variables which might be useful for others in the future, too redirect mpi output on fugaku --- configs/CSC-LUMI-HPECray-zen3-MI250X-Slingshot/variables.yaml | 2 ++ .../CSCS-Daint-HPECray-haswell-P100-Infiniband/variables.yaml | 2 ++ configs/CSCS-Eiger-HPECray-zen2-Slingshot/variables.yaml | 2 ++ configs/LLNL-Magma-Penguin-icelake-OmniPath/variables.yaml | 2 ++ .../variables.yaml | 2 ++ configs/LLNL-Sierra-IBM-power9-V100-Infiniband/variables.yaml | 2 ++ .../LLNL-Tioga-HPECray-zen3-MI250X-Slingshot/variables.yaml | 2 ++ configs/RCCS-Fugaku-Fujitsu-A64FX-TofuD/variables.yaml | 4 +++- configs/nosite-AWS_PCluster_Hpc7a-zen4-EFA/variables.yaml | 2 ++ configs/nosite-HPECray-zen3-MI250X-Slingshot/variables.yaml | 2 ++ configs/nosite-x86_64/variables.yaml | 2 ++ experiments/amg2023/cuda/execute_experiment.tpl | 2 ++ experiments/amg2023/openmp/execute_experiment.tpl | 2 ++ experiments/amg2023/rocm/execute_experiment.tpl | 2 ++ experiments/gromacs/cuda/execute_experiment.tpl | 2 ++ experiments/gromacs/openmp/execute_experiment.tpl | 2 ++ experiments/gromacs/rocm/execute_experiment.tpl | 2 ++ experiments/hpcc/mpi-only/execute_experiment.tpl | 2 ++ 
experiments/hpcg/openmp/execute_experiment.tpl | 2 ++ experiments/hpl/openmp/execute_experiment.tpl | 2 ++ experiments/kripke/cuda/execute_experiment.tpl | 2 ++ experiments/kripke/openmp/execute_experiment.tpl | 2 ++ experiments/kripke/rocm/execute_experiment.tpl | 2 ++ experiments/lammps/openmp/execute_experiment.tpl | 2 ++ experiments/md-test/mpi-only/execute_experiment.tpl | 2 ++ .../osu-micro-benchmarks/mpi-only/execute_experiment.tpl | 2 ++ experiments/qws/openmp/execute_experiment.tpl | 2 ++ experiments/raja-perf/cuda/execute_experiment.tpl | 2 ++ experiments/raja-perf/mpi-only/execute_experiment.tpl | 2 ++ experiments/raja-perf/openmp/execute_experiment.tpl | 2 ++ experiments/raja-perf/rocm/execute_experiment.tpl | 2 ++ experiments/saxpy/cuda/execute_experiment.tpl | 2 ++ experiments/saxpy/openmp/execute_experiment.tpl | 2 ++ experiments/saxpy/rocm/execute_experiment.tpl | 2 ++ experiments/streamc/openmp/execute_experiment.tpl | 2 ++ 35 files changed, 71 insertions(+), 1 deletion(-) diff --git a/configs/CSC-LUMI-HPECray-zen3-MI250X-Slingshot/variables.yaml b/configs/CSC-LUMI-HPECray-zen3-MI250X-Slingshot/variables.yaml index 6c86bdd0e..d440ff873 100644 --- a/configs/CSC-LUMI-HPECray-zen3-MI250X-Slingshot/variables.yaml +++ b/configs/CSC-LUMI-HPECray-zen3-MI250X-Slingshot/variables.yaml @@ -7,7 +7,9 @@ variables: gtl_flag: '' # to be overwritten by tests that need GTL rocm_arch: 'gfx90a' batch_time: '02:00' + pre_process: '' mpi_command: 'srun -N {n_nodes} -n {n_ranks}' + post_process: '' batch_submit: 'sbatch {execute_experiment}' batch_nodes: '#SBATCH -N {n_nodes}' batch_ranks: '#SBATCH -n {n_ranks}' diff --git a/configs/CSCS-Daint-HPECray-haswell-P100-Infiniband/variables.yaml b/configs/CSCS-Daint-HPECray-haswell-P100-Infiniband/variables.yaml index 682ab5315..e3a980f58 100644 --- a/configs/CSCS-Daint-HPECray-haswell-P100-Infiniband/variables.yaml +++ b/configs/CSCS-Daint-HPECray-haswell-P100-Infiniband/variables.yaml @@ -5,7 +5,9 @@ variables: batch_time: 
'02:00' + pre_process: '' mpi_command: 'srun -N {n_nodes} -n {n_ranks}' + post_process: '' batch_submit: 'sbatch {execute_experiment}' batch_nodes: '#SBATCH -N {n_nodes}' batch_ranks: '#SBATCH -n {n_ranks}' diff --git a/configs/CSCS-Eiger-HPECray-zen2-Slingshot/variables.yaml b/configs/CSCS-Eiger-HPECray-zen2-Slingshot/variables.yaml index 4e258c3bb..b31e42d20 100644 --- a/configs/CSCS-Eiger-HPECray-zen2-Slingshot/variables.yaml +++ b/configs/CSCS-Eiger-HPECray-zen2-Slingshot/variables.yaml @@ -5,7 +5,9 @@ variables: batch_time: '00:30' + pre_process: '' mpi_command: 'srun -N {n_nodes} -n {n_ranks}' + post_process: '' batch_submit: 'sbatch {execute_experiment}' batch_nodes: '#SBATCH -N {n_nodes}' batch_ranks: '#SBATCH -n {n_ranks}' diff --git a/configs/LLNL-Magma-Penguin-icelake-OmniPath/variables.yaml b/configs/LLNL-Magma-Penguin-icelake-OmniPath/variables.yaml index e4674cde0..245c6371d 100644 --- a/configs/LLNL-Magma-Penguin-icelake-OmniPath/variables.yaml +++ b/configs/LLNL-Magma-Penguin-icelake-OmniPath/variables.yaml @@ -5,7 +5,9 @@ variables: batch_time: '02:00' + pre_process: '' mpi_command: 'srun -N {n_nodes} -n {n_ranks}' + post_process: '' batch_submit: 'sbatch {execute_experiment}' batch_nodes: '#SBATCH -N {n_nodes}' batch_ranks: '#SBATCH -n {n_ranks}' diff --git a/configs/LLNL-Pascal-Penguin-broadwell-P100-OmniPath/variables.yaml b/configs/LLNL-Pascal-Penguin-broadwell-P100-OmniPath/variables.yaml index 21097fef2..db9d2b0a6 100644 --- a/configs/LLNL-Pascal-Penguin-broadwell-P100-OmniPath/variables.yaml +++ b/configs/LLNL-Pascal-Penguin-broadwell-P100-OmniPath/variables.yaml @@ -8,7 +8,9 @@ variables: cuda_arch: '60' default_cuda_version: '11.8.0' batch_time: '02:00' + pre_process: '' mpi_command: 'srun -N {n_nodes} -n {n_ranks}' + post_process: '' batch_submit: 'sbatch {execute_experiment}' batch_nodes: '#SBATCH -N {n_nodes}' batch_ranks: '#SBATCH -n {n_ranks} -G {n_ranks}' diff --git a/configs/LLNL-Sierra-IBM-power9-V100-Infiniband/variables.yaml 
b/configs/LLNL-Sierra-IBM-power9-V100-Infiniband/variables.yaml index b85115bbc..99b966602 100644 --- a/configs/LLNL-Sierra-IBM-power9-V100-Infiniband/variables.yaml +++ b/configs/LLNL-Sierra-IBM-power9-V100-Infiniband/variables.yaml @@ -6,7 +6,9 @@ variables: gtl_flag: '' # to be overwritten by tests that need GTL batch_time: '02:00' + pre_process: '' mpi_command: '/usr/tcetmp/bin/lrun -n {n_ranks} -T {processes_per_node} {gtl_flag}' + post_process: '' batch_submit: 'bsub -q pdebug {execute_experiment}' batch_nodes: '#BSUB -nnodes {n_nodes}' batch_ranks: '' diff --git a/configs/LLNL-Tioga-HPECray-zen3-MI250X-Slingshot/variables.yaml b/configs/LLNL-Tioga-HPECray-zen3-MI250X-Slingshot/variables.yaml index 286f4d66f..cff6a6708 100644 --- a/configs/LLNL-Tioga-HPECray-zen3-MI250X-Slingshot/variables.yaml +++ b/configs/LLNL-Tioga-HPECray-zen3-MI250X-Slingshot/variables.yaml @@ -7,7 +7,9 @@ variables: gtl_flag: '' # to be overwritten by tests that need GTL rocm_arch: 'gfx90a' batch_time: '120m' + pre_process: '' mpi_command: 'flux run -N {n_nodes} -n {n_ranks}' + post_process: '' batch_submit: 'flux batch {execute_experiment}' batch_nodes: '# flux: -N {n_nodes}' batch_ranks: '# flux: -n {n_ranks}' diff --git a/configs/RCCS-Fugaku-Fujitsu-A64FX-TofuD/variables.yaml b/configs/RCCS-Fugaku-Fujitsu-A64FX-TofuD/variables.yaml index fb379dfe5..ba3b8c696 100644 --- a/configs/RCCS-Fugaku-Fujitsu-A64FX-TofuD/variables.yaml +++ b/configs/RCCS-Fugaku-Fujitsu-A64FX-TofuD/variables.yaml @@ -5,7 +5,9 @@ variables: batch_time: '02:00' - mpi_command: 'mpiexec' + pre_process: '' + mpi_command: 'mpiexec -std-proc fjmpioutdir/bmexe' + post_process: 'for F in $(ls -1v fjmpioutdir/bmexe.*); do cat $F >> {log_file}; done' batch_submit: 'pjsub {execute_experiment}' batch_nodes: '#PJM -L "node={n_nodes}"' batch_ranks: '#PJM --mpi proc={n_ranks}' diff --git a/configs/nosite-AWS_PCluster_Hpc7a-zen4-EFA/variables.yaml b/configs/nosite-AWS_PCluster_Hpc7a-zen4-EFA/variables.yaml index 
fd0dbf964..40fc097c6 100644 --- a/configs/nosite-AWS_PCluster_Hpc7a-zen4-EFA/variables.yaml +++ b/configs/nosite-AWS_PCluster_Hpc7a-zen4-EFA/variables.yaml @@ -5,7 +5,9 @@ variables: batch_time: '02:00' + pre_process: '' mpi_command: 'srun -N {n_nodes} -n {n_ranks} --mpi=pmix --export=ALL,FI_EFA_USE_DEVICE_RDMA=1,FI_PROVIDER="efa",OMPI_MCA_mtl_base_verbose=100' + post_process: '' batch_submit: 'sbatch {execute_experiment}' batch_nodes: '#SBATCH -N {n_nodes}' batch_ranks: '#SBATCH -n {n_ranks}' diff --git a/configs/nosite-HPECray-zen3-MI250X-Slingshot/variables.yaml b/configs/nosite-HPECray-zen3-MI250X-Slingshot/variables.yaml index db27b33b9..3a16a975b 100644 --- a/configs/nosite-HPECray-zen3-MI250X-Slingshot/variables.yaml +++ b/configs/nosite-HPECray-zen3-MI250X-Slingshot/variables.yaml @@ -7,7 +7,9 @@ variables: gtl_flag: '' # to be overwritten by tests that need GTL rocm_arch: 'gfx90a' batch_time: '02:00' + pre_process: '' mpi_command: 'srun -N {n_nodes} -n {n_ranks}' + post_process: '' batch_submit: 'sbatch {execute_experiment}' batch_nodes: '#SBATCH -N {n_nodes}' batch_ranks: '#SBATCH -n {n_ranks}' diff --git a/configs/nosite-x86_64/variables.yaml b/configs/nosite-x86_64/variables.yaml index b9c418080..60faccf85 100644 --- a/configs/nosite-x86_64/variables.yaml +++ b/configs/nosite-x86_64/variables.yaml @@ -5,7 +5,9 @@ variables: batch_time: '' + pre_process: '' mpi_command: 'mpirun -n {n_nodes} -c {n_ranks} --oversubscribe' + post_process: '' batch_submit: '{execute_experiment}' batch_nodes: '' batch_ranks: '' diff --git a/experiments/amg2023/cuda/execute_experiment.tpl b/experiments/amg2023/cuda/execute_experiment.tpl index 8815e9c4f..f4199887a 100755 --- a/experiments/amg2023/cuda/execute_experiment.tpl +++ b/experiments/amg2023/cuda/execute_experiment.tpl @@ -9,4 +9,6 @@ cd {experiment_run_dir} +{pre_process} {command} +{post_process} diff --git a/experiments/amg2023/openmp/execute_experiment.tpl b/experiments/amg2023/openmp/execute_experiment.tpl index 
89e73cf49..8c4d588f9 100755 --- a/experiments/amg2023/openmp/execute_experiment.tpl +++ b/experiments/amg2023/openmp/execute_experiment.tpl @@ -10,4 +10,6 @@ cd {experiment_run_dir} +{pre_process} {command} +{post_process} diff --git a/experiments/amg2023/rocm/execute_experiment.tpl b/experiments/amg2023/rocm/execute_experiment.tpl index 89e73cf49..8c4d588f9 100755 --- a/experiments/amg2023/rocm/execute_experiment.tpl +++ b/experiments/amg2023/rocm/execute_experiment.tpl @@ -10,4 +10,6 @@ cd {experiment_run_dir} +{pre_process} {command} +{post_process} diff --git a/experiments/gromacs/cuda/execute_experiment.tpl b/experiments/gromacs/cuda/execute_experiment.tpl index ab02968fe..efb30f11d 100755 --- a/experiments/gromacs/cuda/execute_experiment.tpl +++ b/experiments/gromacs/cuda/execute_experiment.tpl @@ -13,4 +13,6 @@ cd {experiment_run_dir} {experiment_setup} +{pre_process} {command} +{post_process} diff --git a/experiments/gromacs/openmp/execute_experiment.tpl b/experiments/gromacs/openmp/execute_experiment.tpl index ab02968fe..efb30f11d 100755 --- a/experiments/gromacs/openmp/execute_experiment.tpl +++ b/experiments/gromacs/openmp/execute_experiment.tpl @@ -13,4 +13,6 @@ cd {experiment_run_dir} {experiment_setup} +{pre_process} {command} +{post_process} diff --git a/experiments/gromacs/rocm/execute_experiment.tpl b/experiments/gromacs/rocm/execute_experiment.tpl index ab02968fe..efb30f11d 100755 --- a/experiments/gromacs/rocm/execute_experiment.tpl +++ b/experiments/gromacs/rocm/execute_experiment.tpl @@ -13,4 +13,6 @@ cd {experiment_run_dir} {experiment_setup} +{pre_process} {command} +{post_process} diff --git a/experiments/hpcc/mpi-only/execute_experiment.tpl b/experiments/hpcc/mpi-only/execute_experiment.tpl index 89e73cf49..8c4d588f9 100755 --- a/experiments/hpcc/mpi-only/execute_experiment.tpl +++ b/experiments/hpcc/mpi-only/execute_experiment.tpl @@ -10,4 +10,6 @@ cd {experiment_run_dir} +{pre_process} {command} +{post_process} diff --git 
a/experiments/hpcg/openmp/execute_experiment.tpl b/experiments/hpcg/openmp/execute_experiment.tpl index 89e73cf49..8c4d588f9 100755 --- a/experiments/hpcg/openmp/execute_experiment.tpl +++ b/experiments/hpcg/openmp/execute_experiment.tpl @@ -10,4 +10,6 @@ cd {experiment_run_dir} +{pre_process} {command} +{post_process} diff --git a/experiments/hpl/openmp/execute_experiment.tpl b/experiments/hpl/openmp/execute_experiment.tpl index 89e73cf49..8c4d588f9 100755 --- a/experiments/hpl/openmp/execute_experiment.tpl +++ b/experiments/hpl/openmp/execute_experiment.tpl @@ -10,4 +10,6 @@ cd {experiment_run_dir} +{pre_process} {command} +{post_process} diff --git a/experiments/kripke/cuda/execute_experiment.tpl b/experiments/kripke/cuda/execute_experiment.tpl index 89e73cf49..8c4d588f9 100755 --- a/experiments/kripke/cuda/execute_experiment.tpl +++ b/experiments/kripke/cuda/execute_experiment.tpl @@ -10,4 +10,6 @@ cd {experiment_run_dir} +{pre_process} {command} +{post_process} diff --git a/experiments/kripke/openmp/execute_experiment.tpl b/experiments/kripke/openmp/execute_experiment.tpl index 89e73cf49..8c4d588f9 100755 --- a/experiments/kripke/openmp/execute_experiment.tpl +++ b/experiments/kripke/openmp/execute_experiment.tpl @@ -10,4 +10,6 @@ cd {experiment_run_dir} +{pre_process} {command} +{post_process} diff --git a/experiments/kripke/rocm/execute_experiment.tpl b/experiments/kripke/rocm/execute_experiment.tpl index 89e73cf49..8c4d588f9 100755 --- a/experiments/kripke/rocm/execute_experiment.tpl +++ b/experiments/kripke/rocm/execute_experiment.tpl @@ -10,4 +10,6 @@ cd {experiment_run_dir} +{pre_process} {command} +{post_process} diff --git a/experiments/lammps/openmp/execute_experiment.tpl b/experiments/lammps/openmp/execute_experiment.tpl index 89e73cf49..8c4d588f9 100644 --- a/experiments/lammps/openmp/execute_experiment.tpl +++ b/experiments/lammps/openmp/execute_experiment.tpl @@ -10,4 +10,6 @@ cd {experiment_run_dir} +{pre_process} {command} +{post_process} diff 
--git a/experiments/md-test/mpi-only/execute_experiment.tpl b/experiments/md-test/mpi-only/execute_experiment.tpl index 89e73cf49..8c4d588f9 100755 --- a/experiments/md-test/mpi-only/execute_experiment.tpl +++ b/experiments/md-test/mpi-only/execute_experiment.tpl @@ -10,4 +10,6 @@ cd {experiment_run_dir} +{pre_process} {command} +{post_process} diff --git a/experiments/osu-micro-benchmarks/mpi-only/execute_experiment.tpl b/experiments/osu-micro-benchmarks/mpi-only/execute_experiment.tpl index 89e73cf49..8c4d588f9 100644 --- a/experiments/osu-micro-benchmarks/mpi-only/execute_experiment.tpl +++ b/experiments/osu-micro-benchmarks/mpi-only/execute_experiment.tpl @@ -10,4 +10,6 @@ cd {experiment_run_dir} +{pre_process} {command} +{post_process} diff --git a/experiments/qws/openmp/execute_experiment.tpl b/experiments/qws/openmp/execute_experiment.tpl index 89e73cf49..8c4d588f9 100755 --- a/experiments/qws/openmp/execute_experiment.tpl +++ b/experiments/qws/openmp/execute_experiment.tpl @@ -10,4 +10,6 @@ cd {experiment_run_dir} +{pre_process} {command} +{post_process} diff --git a/experiments/raja-perf/cuda/execute_experiment.tpl b/experiments/raja-perf/cuda/execute_experiment.tpl index 89e73cf49..8c4d588f9 100644 --- a/experiments/raja-perf/cuda/execute_experiment.tpl +++ b/experiments/raja-perf/cuda/execute_experiment.tpl @@ -10,4 +10,6 @@ cd {experiment_run_dir} +{pre_process} {command} +{post_process} diff --git a/experiments/raja-perf/mpi-only/execute_experiment.tpl b/experiments/raja-perf/mpi-only/execute_experiment.tpl index 89e73cf49..8c4d588f9 100644 --- a/experiments/raja-perf/mpi-only/execute_experiment.tpl +++ b/experiments/raja-perf/mpi-only/execute_experiment.tpl @@ -10,4 +10,6 @@ cd {experiment_run_dir} +{pre_process} {command} +{post_process} diff --git a/experiments/raja-perf/openmp/execute_experiment.tpl b/experiments/raja-perf/openmp/execute_experiment.tpl index 89e73cf49..8c4d588f9 100644 --- a/experiments/raja-perf/openmp/execute_experiment.tpl +++ 
b/experiments/raja-perf/openmp/execute_experiment.tpl @@ -10,4 +10,6 @@ cd {experiment_run_dir} +{pre_process} {command} +{post_process} diff --git a/experiments/raja-perf/rocm/execute_experiment.tpl b/experiments/raja-perf/rocm/execute_experiment.tpl index 89e73cf49..8c4d588f9 100644 --- a/experiments/raja-perf/rocm/execute_experiment.tpl +++ b/experiments/raja-perf/rocm/execute_experiment.tpl @@ -10,4 +10,6 @@ cd {experiment_run_dir} +{pre_process} {command} +{post_process} diff --git a/experiments/saxpy/cuda/execute_experiment.tpl b/experiments/saxpy/cuda/execute_experiment.tpl index 8815e9c4f..f4199887a 100755 --- a/experiments/saxpy/cuda/execute_experiment.tpl +++ b/experiments/saxpy/cuda/execute_experiment.tpl @@ -9,4 +9,6 @@ cd {experiment_run_dir} +{pre_process} {command} +{post_process} diff --git a/experiments/saxpy/openmp/execute_experiment.tpl b/experiments/saxpy/openmp/execute_experiment.tpl index 89e73cf49..8c4d588f9 100755 --- a/experiments/saxpy/openmp/execute_experiment.tpl +++ b/experiments/saxpy/openmp/execute_experiment.tpl @@ -10,4 +10,6 @@ cd {experiment_run_dir} +{pre_process} {command} +{post_process} diff --git a/experiments/saxpy/rocm/execute_experiment.tpl b/experiments/saxpy/rocm/execute_experiment.tpl index 89e73cf49..8c4d588f9 100755 --- a/experiments/saxpy/rocm/execute_experiment.tpl +++ b/experiments/saxpy/rocm/execute_experiment.tpl @@ -10,4 +10,6 @@ cd {experiment_run_dir} +{pre_process} {command} +{post_process} diff --git a/experiments/streamc/openmp/execute_experiment.tpl b/experiments/streamc/openmp/execute_experiment.tpl index 89e73cf49..8c4d588f9 100755 --- a/experiments/streamc/openmp/execute_experiment.tpl +++ b/experiments/streamc/openmp/execute_experiment.tpl @@ -10,4 +10,6 @@ cd {experiment_run_dir} +{pre_process} {command} +{post_process}