Build script and Nexus support for CSCS Piz Daint #4068

Open · wants to merge 29 commits into develop

Commits (29)
4a6f036  Modified nexus/lib/physical_system.py to read poscar files with a dif…  (zenandrea, Jun 24, 2021)
7d65bd6  Merge branch 'QMCPACK:develop' into develop  (zenandrea, Apr 5, 2022)
8fc2c09  machine archer2  (zenandrea, Apr 5, 2022)
d4b04f2  config for archer2  (zenandrea, Apr 5, 2022)
1b550d0  corrected error in nu. cpus per node  (zenandrea, Apr 11, 2022)
b5ada6c  config archer2 updates, right mpi libs  (zenandrea, Apr 22, 2022)
7a20892  config for archer2  (zenandrea, Apr 26, 2022)
77ff1ea  Merge branch 'QMCPACK:develop' into develop  (zenandrea, Apr 26, 2022)
dcf26f5  Merge branch 'QMCPACK:develop' into develop  (zenandrea, May 2, 2022)
0b72c9e  more efficient mpi moduli in Archer2  (zenandrea, May 3, 2022)
87eca12  Merge branch 'QMCPACK:develop' into develop  (zenandrea, May 3, 2022)
6843092  build script for archer2, some modificationsWq  (zenandrea, May 4, 2022)
1c5037e  update config/build_archer2.sh  (zenandrea, May 4, 2022)
2335578  Merge branch 'develop' into develop  (ye-luo, May 4, 2022)
7a0fe99  undo change to "physical_system.py"  (zenandrea, May 5, 2022)
3e5f07e  Merge branch 'develop' into develop  (ye-luo, May 5, 2022)
25830a3  changes to machines and test_machines for Archer2  (zenandrea, May 5, 2022)
8306409  Merge branch 'develop' of github.com:zenandrea/qmcpack into develop  (zenandrea, May 5, 2022)
07f7945  fix error test on archer2 machine  (zenandrea, May 7, 2022)
6cf80c0  Merge branch 'develop' into develop  (ye-luo, May 9, 2022)
d4747fe  Merge branch 'QMCPACK:develop' into develop  (zenandrea, May 9, 2022)
dbc9da7  Merge branch 'QMCPACK:develop' into develop  (zenandrea, Jun 16, 2022)
26c3c67  Daint machine  (zenandrea, Jun 20, 2022)
32a0ec7  config file Daint  (zenandrea, Jun 20, 2022)
d9d6c13  Merge branch 'develop' into daint  (prckent, Jun 20, 2022)
002036b  specifications Daint  (zenandrea, Jul 23, 2022)
4cdd6f9  changes in machines.py  (zenandrea, Jul 26, 2022)
0e00bec  Merge branch 'develop' into daint  (ye-luo, Jul 26, 2022)
69e077f  Merge branch 'develop' into daint  (prckent, Jun 23, 2023)
62 changes: 62 additions & 0 deletions config/build_cscs_daint.sh
@@ -0,0 +1,62 @@
#!/bin/bash

# module purge
#echo "Purging current module set"

#BUILD_MODULES=config/load_cscs_daint_modules.sh
#echo "Sourcing file: $BUILD_MODULES to build QMCPACK"
#. $BUILD_MODULES

echo "Loading QMCPACK dependency modules for cscs piz-daint"
echo "https://user.cscs.ch/access/running/piz_daint/"
echo
module swap PrgEnv-cray PrgEnv-intel
module load daint-gpu
module load cudatoolkit
module load EasyBuild-custom/cscs
module load cray-hdf5-parallel
module load CMake
module load cray-python
module load Boost
# install libxml2 for CrayIntel
#eb libxml2-2.9.7-CrayIntel-20.08.eb -r
#module load libxml2/2.9.7-CrayIntel-20.08
module load libxml2
module unload cray-libsci
module unload cray-libsci_acc
# make sure there is a recent gcc compiler in the path
#module load gcc/8.3.0

module list

echo "Either source $BUILD_MODULES or load these same modules to run QMCPACK"

declare -A builds=( \
["cpu"]=" -DQMC_COMPLEX=0 -DQMC_CUDA=0" \
["complex_cpu"]="-DQMC_COMPLEX=1 -DQMC_CUDA=0" \
["legacy_gpu"]=" -DQMC_COMPLEX=0 -DQMC_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES=60 -DENABLE_PHDF5=On -DCUDA_PROPAGATE_HOST_FLAGS=Off -DCUDA_HOST_COMPILER=`which gcc`" \
["complex_legacy_gpu"]="-DQMC_COMPLEX=1 -DQMC_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES=60 -DENABLE_PHDF5=On -DCUDA_PROPAGATE_HOST_FLAGS=Off -DCUDA_HOST_COMPILER=`which gcc`" \
)

mkdir -p bin

for build in "${!builds[@]}"
do
    echo "building: $build with ${builds[$build]}"
    rm -f bin/qmcpack_${build}
    rm -rf build_${build}
    mkdir build_${build}
    cd build_${build}
    cmake \
        -DBUILD_LMYENGINE_INTERFACE=0 \
        -DQMC_MPI=On -DQMC_OMP=On \
        -DCMAKE_SYSTEM_NAME=CrayLinuxEnvironment \
        ${builds[$build]} \
        ..
    make -j 20
    if [ $? -eq 0 ]; then
        build_dir=$(pwd)
        ln -sf ${build_dir}/bin/qmcpack ${build_dir}/../bin/qmcpack_${build}
    fi
    cd ..
done
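
For reference, a quick usage sketch (not part of the diff): the script is intended to be run from the top level of a QMCPACK checkout on a Piz Daint login node, since it creates the build_* directories and the bin/ collection directory next to config/. The checkout path below is a placeholder.

    # hypothetical invocation on a Piz Daint login node; the path is a placeholder
    cd ~/src/qmcpack
    bash config/build_cscs_daint.sh
    # collected symlinks, one per build variant defined in the builds array:
    ls bin/   # qmcpack_cpu  qmcpack_complex_cpu  qmcpack_legacy_gpu  qmcpack_complex_legacy_gpu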
106 changes: 106 additions & 0 deletions nexus/lib/machines.py
@@ -3475,6 +3475,111 @@ def write_job_header(self,job):
#end class Archer2


## Added 16/06/2022 by A Zen
class Daint(Supercomputer):
    # https://www.cscs.ch/computers/piz-daint/

    name = 'daint'
    requires_account = True
    batch_capable = True
    #executable_subfile = True
    prefixed_output = True
    outfile_extension = '.output'
    errfile_extension = '.error'

    def post_process_job(self,job):
        if job.gpus is None:
            job.gpus = 0 # gpus to use per node
        elif job.gpus == 1 and job.processes_per_node is None:
            job.threads = 12 # OpenMP thread(s)
            job.processes_per_node = 1 # MPI rank(s)
            job.hyperthreads = 1
        elif job.gpus > 1:
            self.warn('!!! ATTENTION !!!\n there is only 1 GPU/node in Daint. It is not possible to set gpus={}'.format(job.gpus))
        if job.processes_per_node is None:
            job.threads = 1 # OpenMP thread(s)
            job.processes_per_node = 12 # MPI rank(s)
            job.hyperthreads = 1
        #end if
    #end def post_process_job

    def write_job_header(self,job):
        if job.queue is None:
            job.queue='normal'
        #end if
        ### Slurm batch queues
        # https://user.cscs.ch/access/running/piz_daint/
        base_partition = None
        max_partition = 2400
        if job.queue == 'long': # Maximum 5 long jobs in total (one per user)
            max_time = 7*24
            max_partition = 4
        elif job.queue == 'large': # Large scale work, by arrangement only
            max_time = 12
            max_partition = 4400
        elif job.queue == 'low': # Up to 130% of project's quarterly allocation
            max_time = 6
            max_partition = 2400
        elif job.queue == 'prepost': # High priority pre/post processing
            max_time = 0.5 # 30 min
            max_partition = 1
        else:
            max_time = 24
            max_partition = 2400
        #end if
        job.total_hours = job.days*24 + job.hours + job.minutes/60.0 + job.seconds/3600.0
        if job.total_hours > max_time:
            self.warn('!!! ATTENTION !!!\n the maximum runtime on {0} should not be more than {1}\n you requested: {2}'.format(job.queue,max_time,job.total_hours))
            job.hours = max_time
            job.minutes = 0
            job.seconds = 0
        #end if
        if job.nodes > max_partition:
            self.warn('!!! ATTENTION !!!\n the maximum nodes on {0} should not be more than {1}\n you requested: {2}'.format(job.queue,max_partition,job.nodes))
            job.nodes = max_partition
        #end if

        c='#!/bin/bash\n'
        c+='#SBATCH --job-name '+str(job.name)+'\n'
        c+='#SBATCH --account='+str(job.account)+'\n'
        c+='#SBATCH -N '+str(job.nodes)+'\n'
        c+='#SBATCH --ntasks-per-node={0}\n'.format(job.processes_per_node)
        c+='#SBATCH --cpus-per-task={0}\n'.format(job.threads)
        c+='#SBATCH -t {0}:{1}:{2}\n'.format(str(job.hours+24*job.days).zfill(2),str(job.minutes).zfill(2),str(job.seconds).zfill(2))
        c+='#SBATCH -o {0}\n'.format(job.outfile)
        c+='#SBATCH -e {0}\n'.format(job.errfile)
        c+='#SBATCH --partition={}\n'.format(job.queue)
        c+='#SBATCH --constraint=gpu\n'
        if job.hyperthreads is None or job.hyperthreads==1:
            c+='#SBATCH --hint=nomultithread\n'
            c+='#SBATCH --ntasks-per-core=1\n'
        elif job.hyperthreads==2:
            c+='#SBATCH --hint=multithread\n'
            c+='#SBATCH --ntasks-per-core=2\n'
        #end if job.hyperthreads
        if job.email is not None:
            c+='#SBATCH --mail-user {}\n'.format(job.email)
            c+='#SBATCH --mail-type ALL\n'
            #c+='#SBATCH --mail-type FAIL\n'
        #end if
        c+='\n'
        #c+='module load daint-gpu\n'
        #c+='\n'
        c+='echo JobID : $SLURM_JOBID\n'
        c+='echo Number of nodes requested: $SLURM_JOB_NUM_NODES\n'
        c+='echo List of nodes assigned to the job: $SLURM_NODELIST\n'
        c+='\n'
        c+='export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK\n'
        if job.gpus==1:
            c+='export CRAY_CUDA_MPS=1\n'
        c+='\n'
        c+='ulimit -s unlimited\n'
        c+='\n'
        return c
    #end def write_job_header
#end class Daint


class Tomcat3(Supercomputer):
    name = 'tomcat3'
    requires_account = False
@@ -3610,6 +3715,7 @@ def specialized_bundle_commands(self,job,launcher,serial):
Tomcat3( 8, 1, 64, 192, 1000, 'mpirun', 'sbatch', 'sacct', 'scancel')
SuperMUC_NG( 6336, 1, 48, 96, 1000,'mpiexec', 'sbatch', 'sacct', 'scancel')
Archer2( 5860, 2, 64, 512, 1000, 'srun', 'sbatch', 'squeue', 'scancel')
Daint( 5704, 1, 12, 64, 1000, 'srun', 'sbatch', 'squeue', 'scancel')
Polaris( 560, 1, 32, 512, 8,'mpiexec', 'qsub', 'qstat', 'qdel')
Perlmutter( 3072, 2, 128, 512, 5000, 'srun', 'sbatch', 'squeue', 'scancel')

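For context (not part of this diff), a Nexus input would pick up the new machine through the usual settings/job interface. A minimal sketch follows, assuming the standard nexus keywords (machine, account, nodes, gpus, hours, queue, app); the account name and executable are placeholders.

    # hypothetical Nexus input selecting the new 'daint' machine; account and app are placeholders
    from nexus import settings, job

    settings(
        results = '',
        machine = 'daint',     # matches the Daint(...) entry registered above
        account = 'ABC123',    # CSCS project account (placeholder)
        )

    # 1 MPI rank and 12 OpenMP threads per node with the GPU enabled, as set by
    # Daint.post_process_job when gpus=1 and processes_per_node is left unset
    qmc_job = job(nodes=2, gpus=1, hours=6, queue='normal', app='qmcpack')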
33 changes: 33 additions & 0 deletions nexus/tests/unit/test_machines.py
@@ -1093,6 +1093,12 @@ def job_commands_equal(c1,c2):
('cori' , 'n2_t2' ) : 'srun test.x',
('cori' , 'n2_t2_e' ) : 'srun test.x',
('cori' , 'n2_t2_p2' ) : 'srun test.x',
('daint' , 'n1' ) : 'srun -N 1 -n 12 test.x',
('daint' , 'n1_p1' ) : 'srun -N 1 -n 1 test.x',
('daint' , 'n2' ) : 'srun -N 2 -n 24 test.x',
('daint' , 'n2_t2' ) : 'srun -N 2 -c 2 --cpu-bind=cores -n 12 test.x',
('daint' , 'n2_t2_e' ) : 'srun -N 2 -c 2 --cpu-bind=cores -n 12 test.x',
('daint' , 'n2_t2_p2' ) : 'srun -N 2 -c 2 --cpu-bind=cores -n 4 test.x',
('eclipse' , 'n1' ) : 'srun test.x',
('eclipse' , 'n1_p1' ) : 'srun test.x',
('eclipse' , 'n2' ) : 'srun test.x',
@@ -1544,6 +1550,33 @@ def test_write_job():
export OMP_NUM_THREADS=1
export ENV_VAR=1
srun test.x''',
daint = '''#!/bin/bash
#SBATCH --job-name jobname
#SBATCH --account=ABC123
#SBATCH -N 2
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=12
#SBATCH -t 06:30:00
#SBATCH -o test.out
#SBATCH -e test.err
#SBATCH --partition=normal
#SBATCH --constraint=gpu
#SBATCH --hint=nomultithread
#SBATCH --ntasks-per-core=1

cd $SLURM_SUBMIT_DIR

echo JobID : $SLURM_JOBID
echo Number of nodes requested: $SLURM_JOB_NUM_NODES
echo List of nodes assigned to the job: $SLURM_NODELIST


export ENV_VAR=1
export OMP_NUM_THREADS=1
export CRAY_CUDA_MPS=1
ulimit -s unlimited\n

srun -N 2 -n 64 test.x''',
eclipse = '''#!/bin/bash
#SBATCH -p batch
#SBATCH --job-name jobname
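
As a sanity check (a sketch; it assumes a Python environment with pytest available and the nexus library importable from the test directory), the updated machine tests can be exercised locally with:

    cd nexus/tests/unit
    pytest -v test_machines.py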