diff --git a/config/build_cscs_daint.sh b/config/build_cscs_daint.sh
new file mode 100755
index 0000000000..0b0325f64e
--- /dev/null
+++ b/config/build_cscs_daint.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+# module purge
+#echo "Purging current module set"
+
+#BUILD_MODULES=config/load_cscs_daint_modules.sh
+#echo "Sourcing file: $BUILD_MODULES to build QMCPACK"
+#. $BUILD_MODULES
+
+echo "Loading QMCPACK dependency modules for cscs piz-daint"
+echo "https://user.cscs.ch/access/running/piz_daint/"
+echo
+module swap PrgEnv-cray PrgEnv-intel
+module load daint-gpu
+module load cudatoolkit
+module load EasyBuild-custom/cscs
+module load cray-hdf5-parallel
+module load CMake
+module load cray-python
+module load Boost
+# install libxml2 for CrayIntel
+#eb libxml2-2.9.7-CrayIntel-20.08.eb -r
+#module load libxml2/2.9.7-CrayIntel-20.08
+module load libxml2
+module unload cray-libsci
+module unload cray-libsci_acc
+# make sure there is a recent gcc compiler in the path
+#module load gcc/8.3.0
+
+module list
+
+echo "Either source $BUILD_MODULES or load these same modules to run QMCPACK"
+
+declare -A builds=( \
+["cpu"]=" -DQMC_COMPLEX=0 -DQMC_CUDA=0" \
+["complex_cpu"]="-DQMC_COMPLEX=1 -DQMC_CUDA=0" \
+["legacy_gpu"]=" -DQMC_COMPLEX=0 -DQMC_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES=60 -DENABLE_PHDF5=On -DCUDA_PROPAGATE_HOST_FLAGS=Off -DCUDA_HOST_COMPILER=`which gcc`" \
+["complex_legacy_gpu"]="-DQMC_COMPLEX=1 -DQMC_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES=60 -DENABLE_PHDF5=On -DCUDA_PROPAGATE_HOST_FLAGS=Off -DCUDA_HOST_COMPILER=`which gcc`" \
+)
+
+mkdir -p bin
+
+for build in "${!builds[@]}"
+do
+    echo "building: $build with ${builds[$build]}"
+    rm -f bin/qmcpack_${build}
+    rm -rf build_${build}
+    mkdir build_${build}
+    cd build_${build}
+    cmake \
+        -DBUILD_LMYENGINE_INTERFACE=0 \
+        -DQMC_MPI=On -DQMC_OMP=On \
+        -DCMAKE_SYSTEM_NAME=CrayLinuxEnvironment \
+        ${builds[$build]} \
+        ..
+    make -j 20
+    if [ $? -eq 0 ]; then
+        build_dir=$(pwd)
+        ln -sf ${build_dir}/bin/qmcpack ${build_dir}/../bin/qmcpack_${build}
+    fi
+    cd ..
+done
diff --git a/nexus/lib/machines.py b/nexus/lib/machines.py
index 5b4e2a16ee..8c5e6821a7 100644
--- a/nexus/lib/machines.py
+++ b/nexus/lib/machines.py
@@ -3475,6 +3475,111 @@ def write_job_header(self,job):
 #end class Archer2
 
 
+## Added 16/06/2022 by A Zen
+class Daint(Supercomputer):
+    # https://www.cscs.ch/computers/piz-daint/
+
+    name = 'daint'
+    requires_account   = True
+    batch_capable      = True
+    #executable_subfile = True
+    prefixed_output    = True
+    outfile_extension  = '.output'
+    errfile_extension  = '.error'
+
+    def post_process_job(self,job):
+        if job.gpus is None:
+            job.gpus = 0 # gpus to use per node
+        elif job.gpus == 1 and job.processes_per_node is None:
+            job.threads = 12             # OpenMP thread(s)
+            job.processes_per_node = 1   # MPI rank(s)
+            job.hyperthreads = 1
+        elif job.gpus > 1:
+            self.warn('!!! ATTENTION !!!\n there is only 1 GPU/node in Daint. It is not possible to set gpus={}'.format(job.gpus))
+        if job.processes_per_node is None:
+            job.threads = 1              # OpenMP thread(s)
+            job.processes_per_node = 12  # MPI rank(s)
+            job.hyperthreads = 1
+        #end if
+    #end def post_process_job
+
+    def write_job_header(self,job):
+        if job.queue is None:
+            job.queue='normal'
+        #end if
+        ### Slurm batch queues
+        # https://user.cscs.ch/access/running/piz_daint/
+        base_partition = None
+        max_partition = 2400
+        if job.queue == 'long': # Maximum 5 long jobs in total (one per user)
+            max_time = 7*24
+            max_partition = 4
+        elif job.queue == 'large': # Large scale work, by arrangement only
+            max_time = 12
+            max_partition = 4400
+        elif job.queue == 'low': # Up to 130% of project's quarterly allocation
+            max_time = 6
+            max_partition = 2400
+        elif job.queue == 'prepost': # High priority pre/post processing
+            max_time = 0.5 # 30 min
+            max_partition = 1
+        else:
+            max_time = 24
+            max_partition = 2400
+        #end if
+        job.total_hours = job.days*24 + job.hours + job.minutes/60.0 + job.seconds/3600.0
+        if job.total_hours > max_time:
+            self.warn('!!! ATTENTION !!!\n the maximum runtime on {0} should not be more than {1}\n you requested: {2}'.format(job.queue,max_time,job.total_hours))
+            job.hours   = max_time
+            job.minutes = 0
+            job.seconds = 0
+        #end if
+        if job.nodes > max_partition:
+            self.warn('!!! ATTENTION !!!\n the maximum nodes on {0} should not be more than {1}\n you requested: {2}'.format(job.queue,max_partition,job.nodes))
+            job.nodes = max_partition
+        #end if
+
+        c='#!/bin/bash\n'
+        c+='#SBATCH --job-name '+str(job.name)+'\n'
+        c+='#SBATCH --account='+str(job.account)+'\n'
+        c+='#SBATCH -N '+str(job.nodes)+'\n'
+        c+='#SBATCH --ntasks-per-node={0}\n'.format(job.processes_per_node)
+        c+='#SBATCH --cpus-per-task={0}\n'.format(job.threads)
+        c+='#SBATCH -t {0}:{1}:{2}\n'.format(str(job.hours+24*job.days).zfill(2),str(job.minutes).zfill(2),str(job.seconds).zfill(2))
+        c+='#SBATCH -o {0}\n'.format(job.outfile)
+        c+='#SBATCH -e {0}\n'.format(job.errfile)
+        c+='#SBATCH --partition={}\n'.format(job.queue)
+        c+='#SBATCH --constraint=gpu\n'
+        if job.hyperthreads is None or job.hyperthreads==1:
+            c+='#SBATCH --hint=nomultithread\n'
+            c+='#SBATCH --ntasks-per-core=1\n'
+        elif job.hyperthreads==2:
+            c+='#SBATCH --hint=multithread\n'
+            c+='#SBATCH --ntasks-per-core=2\n'
+        #end if job.hyperthreads
+        if job.email is not None:
+            c+='#SBATCH --mail-user {}\n'.format(job.email)
+            c+='#SBATCH --mail-type ALL\n'
+            #c+='#SBATCH --mail-type FAIL\n'
+        #end if
+        c+='\n'
+        #c+='module load daint-gpu\n'
+        #c+='\n'
+        c+='echo JobID : $SLURM_JOBID\n'
+        c+='echo Number of nodes requested: $SLURM_JOB_NUM_NODES\n'
+        c+='echo List of nodes assigned to the job: $SLURM_NODELIST\n'
+        c+='\n'
+        c+='export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK\n'
+        if job.gpus==1:
+            c+='export CRAY_CUDA_MPS=1\n'
+        c+='\n'
+        c+='ulimit -s unlimited\n'
+        c+='\n'
+        return c
+    #end def write_job_header
+#end class Daint
+
+
 class Tomcat3(Supercomputer):
     name = 'tomcat3'
     requires_account = False
@@ -3610,6 +3715,7 @@ def specialized_bundle_commands(self,job,launcher,serial):
 Tomcat3(        8,   1,    64,   192,  1000, 'mpirun',     'sbatch',   'sacct',   'scancel')
 SuperMUC_NG( 6336,   1,    48,    96,  1000,'mpiexec',     'sbatch',   'sacct',   'scancel')
 Archer2(     5860,   2,    64,   512,  1000,   'srun',     'sbatch',  'squeue',   'scancel')
+Daint(       5704,   1,    12,    64,  1000,   'srun',     'sbatch',  'squeue',   'scancel')
 Polaris(      560,   1,    32,   512,     8,'mpiexec',       'qsub',    'qstat',      'qdel')
 Perlmutter(  3072,   2,   128,   512,  5000,   'srun',     'sbatch',  'squeue',   'scancel')
 
diff --git a/nexus/tests/unit/test_machines.py b/nexus/tests/unit/test_machines.py
index 8d34c6b27e..58863e1518 100644
--- a/nexus/tests/unit/test_machines.py
+++ b/nexus/tests/unit/test_machines.py
@@ -1093,6 +1093,12 @@ def job_commands_equal(c1,c2):
         ('cori'      , 'n2_t2'     ) : 'srun test.x',
         ('cori'      , 'n2_t2_e'   ) : 'srun test.x',
         ('cori'      , 'n2_t2_p2'  ) : 'srun test.x',
+        ('daint'     , 'n1'        ) : 'srun -N 1 -n 12 test.x',
+        ('daint'     , 'n1_p1'     ) : 'srun -N 1 -n 1 test.x',
+        ('daint'     , 'n2'        ) : 'srun -N 2 -n 24 test.x',
+        ('daint'     , 'n2_t2'     ) : 'srun -N 2 -c 2 --cpu-bind=cores -n 12 test.x',
+        ('daint'     , 'n2_t2_e'   ) : 'srun -N 2 -c 2 --cpu-bind=cores -n 12 test.x',
+        ('daint'     , 'n2_t2_p2'  ) : 'srun -N 2 -c 2 --cpu-bind=cores -n 4 test.x',
         ('eclipse'   , 'n1'        ) : 'srun test.x',
         ('eclipse'   , 'n1_p1'     ) : 'srun test.x',
         ('eclipse'   , 'n2'        ) : 'srun test.x',
@@ -1544,6 +1550,33 @@ def test_write_job():
 export OMP_NUM_THREADS=1
 export ENV_VAR=1
 srun test.x''',
+        daint = '''#!/bin/bash
+#SBATCH --job-name jobname
+#SBATCH --account=ABC123
+#SBATCH -N 2
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=12
+#SBATCH -t 06:30:00
+#SBATCH -o test.out
+#SBATCH -e test.err
+#SBATCH --partition=normal
+#SBATCH --constraint=gpu
+#SBATCH --hint=nomultithread
+#SBATCH --ntasks-per-core=1
+
+cd $SLURM_SUBMIT_DIR
+
+echo JobID : $SLURM_JOBID
+echo Number of nodes requested: $SLURM_JOB_NUM_NODES
+echo List of nodes assigned to the job: $SLURM_NODELIST
+
+
+export ENV_VAR=1
+export OMP_NUM_THREADS=1
+export CRAY_CUDA_MPS=1
+ulimit -s unlimited\n
+
+srun -N 2 -n 64 test.x''',
         eclipse = '''#!/bin/bash
 #SBATCH -p batch
 #SBATCH --job-name jobname
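
Usage note (not part of the patch): once the Daint entry above is registered in machines.py, a Nexus workflow can target Piz Daint by machine name. The snippet below is only a minimal sketch under assumed Nexus defaults; the results path, sleep interval, account string, and resource numbers are illustrative placeholders (ABC123 is simply the account used in the unit test), not values prescribed by this diff.

    # Hypothetical Nexus driver fragment (sketch only; placeholder values)
    from nexus import settings, job

    settings(
        results = '',           # placeholder results directory
        sleep   = 3,            # placeholder polling interval
        machine = 'daint',      # selects the Daint class added in machines.py
        account = 'ABC123',     # placeholder project account
        )

    # Two nodes on the 'normal' partition; with gpus=1 and no explicit
    # processes_per_node, post_process_job sets 1 MPI rank x 12 OpenMP threads
    # per node and emits 'export CRAY_CUDA_MPS=1' in the generated job file.
    qmc_job = job(nodes=2, hours=6, gpus=1, queue='normal')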