#!/bin/bash
# Script for running the MITgcm testreport on stan1.awi.de (NEC SX-ACE).
# - the testreport is split into 3 steps:
#   1/ compiling on the head node (tx7.awi.de), with the -norun option
#   2/ running on a compute node (using PBS qsub), with the -runonly option
#   3/ evaluating the results on the head node with the -runonly option
#
# Notes:
# - step 2 leads to many error messages, because the OS on the compute
#   nodes does not have the appropriate shell tools; modifying the
#   runonly option to skip the evaluation step would be nice but is not
#   necessary; you'll just have to live with the error messages
# - step 3 assumes that all experiments have been run successfully, i.e.
#   that the output files are up-to-date.
#   If not, testreport will try to run the sx ace-code on the tx7 frontend,
#   which will fail inevitably and produce more errors; maybe we can
#   have a flag that skips everything but the evaluation step to avoid this
#
# Exit codes used by this script:
#   1  a required directory could not be entered
#   2  'git clone' or 'git pull' failed
#   3  the batch job could not be submitted

# $Header: /home/ubuntu/mnt/e9_copy/MITgcm_contrib/test_scripts/sxace/mitgcmtestreport,v 1.5 2020/03/23 12:17:57 mlosch Exp $
# $Name:  $

# for some reason the module command is not available in a bash script on
# this computer so we have to create it here
#module () { eval `/usr/bin/modulecmd bash $*` ; }
# alternatively we can source this script that contains all relevant
# definitions
source /usr/share/Modules/init/bash
#module use --append /sx8/user2/awisoft/modulefiles
# load latest compilers:
#module load sxf90/460
#module load sxc++/094
module load sxf90
module load sxc++
module load sxmpi
module load sxnetcdf
#
# make sure that we have qsub and qstat
#export PATH=${PATH}:/usr/bin/nqsII
source /etc/profile.d/nec.sh
#
VENDOR=sxf90
RUNIT="runit_"$VENDOR   # only referenced by the disabled do_tst_2+2 call below
HERE=$(pwd)
# TR_NPROC is left literal here; it is passed on to testreport via -command
EXE='mpirun -np TR_NPROC ./mitgcmuv'
NPROCS=2
MPI="-MPI $NPROCS"
MYOUTPUT=$HOME/testreport_${VENDOR}
# alternative location for the batch output (was immediately overridden):
#OUTFILE=$HOME/out_${VENDOR}
OUTFILE=out_${VENDOR}
JOBNAME=test_ace
JOBSCRIPT=job_${VENDOR}
# restrict testreport to selected experiments, e.g.:
#selectexperiment='-t exp2'
selectexperiment=''
# download code into this directory
TDIR=/ace/user/mlosch/tmp_$VENDOR
gcmDIR=MITgcm
git_repo='MITgcm'
git_code='MITgcm'

OPTFILE=../tools/build_options/SUPER-UX_SX-ACE_sxf90_awi
#OPTFILE=/home/ace/mlosch/MITgcm/tools/build_options/SUPER-UX_SX-ACE_sxf90_awi
# The command is kept as a plain string on purpose: it is expanded
# unquoted below (so that it word-splits into arguments) and it is also
# written verbatim into the batch script and the log file.
RUNTESTREPORT="./testreport $MPI -of=${OPTFILE} $selectexperiment -small_f"

#
# create batch script
# (unquoted EOF: all variables are expanded now, except the escaped
#  \${PBS_O_WORKDIR}, which is expanded when the job runs)
#
cat << EOF > "$HERE/$JOBSCRIPT"
#PBS -q ace-r                 # job queue
#PBS -N $JOBNAME              # give the job a name
#PBS -l cpunum_job=$NPROCS    # cpus per node
#PBS -l elapstim_req=2:00:00
#PBS -l cputim_job=2:00:00    # time limit
#PBS -l memsz_job=32gb        # max accumulated memory, we need this much because of many netcdf files
#PBS -j o                     # join i/o
#PBS -S /bin/sh
#PBS -o $OUTFILE              # o Where to write output
#
cd \${PBS_O_WORKDIR}
$RUNTESTREPORT -runonly -command "$EXE" >> $MYOUTPUT 2>&1
EOF

# clean up old testreport output
if [ -e "$MYOUTPUT" ]; then
  rm -rf -- "$MYOUTPUT"
fi
if [ -e "$OUTFILE" ]; then
  rm -r -- "$OUTFILE"
fi

# checkOut determines how much checking out is being done
# checkOut = 3: new clone from GitHub and make a new copy
# checkOut = 2: update (git pull) existing repo and make a new copy
# checkOut = 1: skip update
# checkOut = 0: use existing test code (if available otherwise switch to 2)
checkOut=2
gitcmd=$HOME/git/git
tmpFil=$TDIR/error.out

if [ "$checkOut" -le 1 ] ; then
  if test -e "$TDIR/${gcmDIR}/doc" ; then
    echo "$TDIR/${gcmDIR}/doc" 'exist'
  else
    echo -n "$TDIR/${gcmDIR}" 'missing ; '
    checkOut=2
    echo "will make a new copy ( checkOut=$checkOut )"
  fi
fi

if [ "$checkOut" -ge 2 ] ; then
  #---- cleaning:
  # abort if the working directory is unavailable, otherwise the clone
  # would end up in whatever directory we happen to be in
  cd "$TDIR" || { echo "Error: cannot cd to $TDIR" >&2 ; exit 1 ; }

  #---- Make a new clone or update existing one:
  if test -e "${gcmDIR}/.git/config" ; then
    echo "${gcmDIR}/.git/config exist"
  else
    echo -n "${gcmDIR}/.git/config 'missing, "
    checkOut=3
    echo "will get new clone ( checkOut=$checkOut )"
  fi

  if [ "$checkOut" -eq 3 ] ; then
    echo -n "Removing old clone: $TDIR/${gcmDIR} ..."
    # ${TDIR:?} makes sure we never run 'rm -rf /MITgcm' on an empty TDIR
    test -e "$TDIR/${gcmDIR}" && rm -rf -- "${TDIR:?}/${gcmDIR}"
    echo "  done"
    echo -n "Make a new clone of $git_code from repo: $git_repo ..."
    "${gitcmd}" clone "https://github.com/$git_repo/${git_code}.git" "${gcmDIR}" 2> "$tmpFil"
    retVal=$?
    if test $retVal = 0 ; then
      echo ' --> done!'
      rm -f -- "$tmpFil"
    else
      echo " Error: 'git clone' returned: $retVal"
      cat "$tmpFil"
      rm -f -- "$tmpFil"
      exit 2
    fi
  else
    echo "Updating current clone ( $git_code ) ..."
    # the subshell keeps the directory change local; its exit status is
    # that of 'git pull' (or 1 if the clone directory cannot be entered)
    ( cd "${gcmDIR}" || exit 1 ; "${gitcmd}" checkout master ; "${gitcmd}" pull )
    retVal=$?
    if test $retVal = 0 ; then
      echo ' --> done!'
    else
      echo " Error: 'git pull' returned: $retVal"
      echo " Error: 'git pull' returned: $retVal" \
        | mail -s "Git-error on Stan" Martin.Losch@awi.de
      exit 2
    fi
  fi
else
  cd "$TDIR" || { echo "Error: cannot cd to $TDIR" >&2 ; exit 1 ; }
fi

# testreport, the relative OPTFILE path and the batch job (which changes
# to its submit directory) all rely on this being the working directory
cd "$TDIR/${gcmDIR}/verification" \
  || { echo "Error: cannot cd to $TDIR/${gcmDIR}/verification" >&2 ; exit 1 ; }

# make sure that we do not use the cross compiler for testreport
unset CC
# make sure that we do use the cross compiler for testreport
#export CC=sxcc

# step 1: compile only (RUNTESTREPORT is intentionally unquoted, see above)
# shellcheck disable=SC2086
if ! $RUNTESTREPORT -j 8 -norun > "$MYOUTPUT" 2>&1 ; then
  echo "something wrong with testreport"
  echo "keeping the working directory"
#else
#  echo "check restarts"
#  echo ../tools/do_tst_2+2 -mpi -exe \"$HERE/$RUNIT\" -a NONE
#  ../tools/do_tst_2+2 -mpi -exe $HERE/$RUNIT -a NONE
#  everything OK: delete working directory
#  rm -rf $TDIR
fi

if [ ! -e "$MYOUTPUT" ] ; then
  touch "$MYOUTPUT"
fi

# record the job script in the log file
{
  echo " "
  echo "***********************************************************"
  echo "Submitting this job script:"
  echo "***********************************************************"
  cat "$HERE/$JOBSCRIPT"
  echo "***********************************************************"
  echo "end of job script"
  echo "***********************************************************"
  echo " "
} >> "$MYOUTPUT"

# step 2: now submit the job that actually runs all the experiments in one go
echo "qsub $HERE/$JOBSCRIPT"
if ! qsub "$HERE/$JOBSCRIPT" ; then
  # without a batch run the evaluation step below would try to run the
  # SX-ACE executables on the head node and mail a meaningless report
  echo "Error: qsub failed, skipping the evaluation step" >&2
  exit 3
fi

# keep looking for the job in the job queues and wait until it has disappeared
jobruns=$(qstat -n -u mlosch | grep "$JOBNAME")
while [ "${jobruns}"x != x ] ; do
  sleep 200
  jobruns=$(qstat -n -u mlosch | grep "$JOBNAME")
  echo "waiting for job ${jobruns%% *} ($JOBNAME) to complete"
  currentexp=$(grep Experiment "$MYOUTPUT" | tail -1)
  echo "currently running $currentexp"
done

# step 3: after running the experiments on the compute node run testreport
# for a third time to evaluate results on the head node again
{
  echo " "
  echo "now run testreport for a final time to evaluate results:"
  echo "$RUNTESTREPORT -match 10 -runonly"
} >> "$MYOUTPUT"

#$RUNTESTREPORT -match 10 -runonly >> $MYOUTPUT 2>&1
# shellcheck disable=SC2086
$RUNTESTREPORT -match 10 -runonly \
  -a "jm_c@mitgcm.org" >> "$MYOUTPUT" 2>&1
#  -a "jm_c@mitgcm.org, Martin.Losch@awi.de" >> $MYOUTPUT 2>&1

echo "end of mitgcmtestreport"