| 1 | #!/bin/bash -e | 
| 2 | # new script for running testreport on ollie.awi.de | 
| 3 | # - compile and run on compute nodes | 
| 4 | # - use ssh to call mpack command from the head node ollie0 | 
| 5 | #$Header: /u/gcmpack/MITgcm_contrib/test_scripts/ollie/mitgcmtestreport_cray,v 1.10 2020/06/10 13:58:05 mlosch Exp $ | 
| 6 | #$Name:  $ | 
| 7 |  | 
# Environment setup. Needed when the script is started as a cron job, because
# no login profile has been sourced in that case.
#source /usr/Modules/current/init/bash
source /etc/profile.d/cray_pe.sh
# this seems to be enough to make the module cmd work
source /etc/profile.d/modules.sh
#
module purge
module load PrgEnv-cray
module load netcdf

# Set the netcdf root directory here, because the definitions always
# change with different "default" netcdf modules.
# Assign first and export afterwards: "export VAR=$(cmd)" returns the exit
# status of "export" itself, which would hide a failing nc-config from the
# errexit (-e) option requested in the shebang line.
NETCDF_ROOT=$(nc-config --prefix)
export NETCDF_ROOT

# not sure why I have to set these paths here again
# MPI_ROOT is the directory two levels above the mpicc executable
# (e.g. <root>/bin/mpicc -> <root>).
mpicc_path=$(command -v mpicc)
MPI_ROOT=$(dirname "$(dirname "${mpicc_path}")")
export MPI_ROOT
# $(dirname `echo $LD_LIBRARY_PATH  | awk -F: '{print $1}'`)
export MPI_INC_DIR=${MPI_ROOT}/include

# There is no slurm module anymore and this is the current recommendation to
# have sbatch in your path (rather than running /etc/profile.d/slurm.sh).
export PATH=${PATH}:/global/opt/slurm/default/bin
| 28 |  | 
# ---- Run configuration -----------------------------------------------------
# dNam : short machine name, used in directory names and in the -odir tag
# sfx  : compiler suffix, used in file names and to select the optfile
dNam=ollie
TST_DIR=/work/ollie/mlosch/test_${dNam}
# The date substitution is left unquoted on purpose so that the printed
# line is word-split exactly as before.
echo "start from TST_DIR='${TST_DIR}' at: "$(date)

umask 0022

sfx=cray
RUNIT=runit_${sfx}

# Build-options file, given relative to the verification directory.
OPTFILE="../tools/build_options/linux_ia64_${sfx}_ollie"

# Options handed to testreport.
options="-MPI 6 -odir ${dNam}-c -j 6"
#options="$options -t global_ocean.cs32x15"

# Command line used to launch the model; the literal TR_NPROC token is
# passed through unchanged to testreport / do_tst_2+2.
#EXE='srun --mpi=pmi2 -n TR_NPROC ./mitgcmuv'
#EXE='srun -n TR_NPROC ./mitgcmuv'
EXE="srun -n TR_NPROC --cpu_bind=cores ./mitgcmuv"
| 47 |  | 
# ---- Working directories and output files ----------------------------------
# Create the test directory on first use, then work from inside it.
if [ ! -e "${TST_DIR}" ]; then
  mkdir "${TST_DIR}"
else
  echo "${TST_DIR} exists"
fi
cd "${TST_DIR}"

# HERE collects the batch script, the Slurm log and the testreport log.
HERE="${TST_DIR}/output"
if [ ! -e "${HERE}" ]; then
  mkdir "${HERE}"
else
  echo "${HERE}"
fi

OUTFILE="${HERE}/slurm_${sfx}.out"   # Slurm stdout/stderr of the batch job
MYOUTPUT="${HERE}/out_${sfx}"        # log written by this script and the job

# Remove leftovers of a previous run so that both logs start empty.
if [ -e "${MYOUTPUT}" ]; then
  rm -rf "${MYOUTPUT}"
fi
if [ -e "${OUTFILE}" ]; then
  rm -r "${OUTFILE}"
fi
# ---- Source checkout settings ----------------------------------------------
# Name of the local clone and the GitHub <organisation>/<repository> pair.
gcmDIR=MITgcm_${sfx}
git_repo=MITgcm
git_code=MITgcm

# checkOut determines how much checking out is being done
# checkOut = 3: new clone from GitHub and make a new copy
# checkOut = 2: update (git pull) existing repo and make a new copy
# checkOut = 1: skip update
# checkOut = 0: use existing test code
# For checkOut <= 1 the check further down switches to 2 when the existing
# copy is missing; this script treats the values 0 and 1 alike.

checkOut=2
tdir="${TST_DIR}"
today=$(date +%Y%m%d)
TODAY=$(date +%d)
# File that captures the stderr of 'git clone'.
#tmpFil="/tmp/"`basename $0`".$$"
tmpFil="${TST_DIR}/error.out"
| 84 |  | 
# ---- Get or refresh the MITgcm source tree ---------------------------------
# Reads : checkOut, TST_DIR, gcmDIR, git_repo, git_code, tmpFil
# Writes: checkOut (may be raised to 2 or 3 when the existing copy or clone
#         turns out to be missing)
# Exits : with status 2 when 'git clone' fails

# With checkOut <= 1 an existing copy is reused; fall back to "make a new
# copy" when it is not there.
if [ "${checkOut}" -le 1 ]; then
  if [ -e "${TST_DIR}/${gcmDIR}/doc" ]; then
    echo "${TST_DIR}/${gcmDIR}/doc" 'exist'
  else
    echo -n "${TST_DIR}/${gcmDIR} missing ; "
    checkOut=2
    echo "will make a new copy ( checkOut=${checkOut} )"
  fi
fi

if [ "${checkOut}" -ge 2 ]; then
  #---- cleaning:
  cd "${TST_DIR}"

  #---- Make a new clone or update existing one:
  if [ -e "${gcmDIR}/.git/config" ]; then
    echo "${gcmDIR}/.git/config exist"
  else
    echo -n "${gcmDIR}/.git/config missing ; "
    checkOut=3
    echo "will get new clone ( checkOut=${checkOut} )"
  fi

  if [ "${checkOut}" -eq 3 ]; then
    echo -n "Removing old clone: ${TST_DIR}/${gcmDIR} ..."
    if [ -e "${TST_DIR}/${gcmDIR}" ]; then
      # ':?' aborts instead of expanding to a bare "/" should either
      # variable ever be empty.
      rm -rf -- "${TST_DIR:?}/${gcmDIR:?}"
    fi
    echo "  done"
    echo -n "Make a new clone of ${git_code} from repo: ${git_repo} ..."
    # Run git inside the 'if' condition: the script is started with -e, so a
    # failing stand-alone 'git clone' would terminate the script before the
    # error report below could run.
    if git clone "https://github.com/${git_repo}/${git_code}.git" "${gcmDIR}" 2> "${tmpFil}"; then
      echo ' --> done!'
      rm -f "${tmpFil}"
    else
      retVal=$?
      echo " Error: 'git clone' returned: ${retVal}"
      cat "${tmpFil}"
      rm -f "${tmpFil}"
      exit 2
    fi
  else
    #    echo "clean tst_2+2 + testreport output"
    # Each step runs in a subshell so the working directory of this script
    # is left untouched; 'cd ... || exit' keeps the commands from running in
    # the wrong directory even when errexit is not active.
    ( cd "${gcmDIR}/verification" || exit 1 ; ../tools/do_tst_2+2 -clean )
    ( cd "${gcmDIR}/verification" || exit 1 ; ./testreport -clean )
    echo "Updating current clone ( ${git_code} ) ..."
    # After the pull, restore tracked files that were deleted locally.
    ( cd "${gcmDIR}" || exit 1 ; git checkout master ; git pull ; git ls-files -d | xargs git checkout -- )
    echo ' --> done!'
  fi
else
  cd "${TST_DIR}"
fi
| 134 |  | 
# ---- Assemble the command lines used inside the batch job ------------------
cd "${TST_DIR}/${gcmDIR}/verification"

# cwd deliberately holds the literal text `pwd` (backticks included), not a
# directory name: the text is copied into the command strings below and is
# only evaluated by the shell that later runs the generated batch script.
cwd='`pwd`'
# mpack is called through ssh on the head node ollie0.
SENDCMD="ssh ollie0 ${TST_DIR}/${gcmDIR}/tools/mpack-1.6/mpack"
# The embedded double quotes are part of the stored text; they become real
# quoting once the text is written into the batch script.
printf -v runtestreport './testreport %s -of %s -command "%s" -send "%s" -sd %s' \
  "${options}" "${OPTFILE}" "${EXE}" "${SENDCMD}" "${cwd}"
emailaddress='-a jm_c@mitgcm.org'
printf -v testrestart '../tools/do_tst_2+2 -mpi -exe "%s" -o %s-c -send "%s" -sd %s' \
  "${EXE}" "${dNam}" "${SENDCMD}" "${cwd}"

# Make sure the log file exists before anything is appended to it.
if [ ! -e "${MYOUTPUT}" ]; then
  touch "${MYOUTPUT}"
fi
| 147 |  | 
# Disabled dry run of testreport, kept for reference:
# echo "running testreport like this:"
# echo ${runtestreport} -norun
# eval "${runtestreport} -norun >> $MYOUTPUT 2>&1"

# ---- Create the batch script -----------------------------------------------
# The here-document delimiter is unquoted, so the text below is expanded
# while THIS script runs:
#   - ${JOBNAME}, ${OUTFILE}, ${runtestreport}, ${testrestart},
#     ${emailaddress} and $MYOUTPUT are replaced by their current values;
#   - cwd=`pwd` is replaced by the directory this script is in right now;
#   - \${...} is written out as ${...} and evaluated later by the job.
# '>|' overwrites an existing script even if the noclobber option is set.
JOBNAME="tst${sfx}"
batch_script="${HERE}/${RUNIT}"
echo "creating batch script ${batch_script}"
cat >| "${batch_script}" << EOF
#!/bin/bash
#SBATCH --job-name=${JOBNAME}
#SBATCH  -o ${OUTFILE}
#SBATCH --time=12:00:00
#SBATCH --ntasks=6

# still need this to be able to run a few experiments:
# cfc_example, fizhi, tutorial_deep_convection
ulimit -s unlimited

# binds OpenMP task to given cores
export OMP_PROC_BIND=TRUE

# for debugging
# export FLEXLM_DIAGNOSTICS=2
# export FNP_IP_ENV=1
# export LM_A_CONN_TIMEOUT=99

cd \${SLURM_SUBMIT_DIR}

cwd=`pwd`
echo "running testreport like this in \${cwd}:"
echo "${runtestreport} -devel -match 10"
${runtestreport} -devel -match 10 ${emailaddress} >> $MYOUTPUT 2>&1

echo "running restart test like this in \${cwd}:"
echo "${testrestart}"
${testrestart} ${emailaddress} >> $MYOUTPUT 2>&1

../tools/do_tst_2+2 -clean

echo "running testreport like this in \${cwd}:"
echo "./testreport -clean"
./testreport -clean

# Hack,hack,hack to avoid running dome:
echo "running testreport like this in \${cwd}:"
echo "${runtestreport} -skipdir dome -fast -match 10"
${runtestreport} -skipdir dome -fast -match 10 ${emailaddress} >> $MYOUTPUT 2>&1

echo "running restart test like this in \${cwd}:"
echo "${testrestart}"
${testrestart} ${emailaddress} >> $MYOUTPUT 2>&1

EOF

chmod a+x "${batch_script}"
| 204 |  | 
# ---- Log the generated job script, then submit it --------------------------
# A copy of the batch script is appended to the log, framed by separator
# lines, so the log shows exactly what was submitted.
separator="***********************************************************"
{
  echo " "
  echo "${separator}"
  echo "Submitting this job script:"
  echo "${separator}"
  cat "${HERE}/${RUNIT}"
  echo "${separator}"
  echo "end of job script"
  echo "${separator}"
  echo " "
} >> "${MYOUTPUT}"

echo "sbatch ${HERE}/${RUNIT}"
sbatch "${HERE}/${RUNIT}"

# Disabled: wait for the job to leave the queue and mail the results from
# here. Kept for reference.
# # keep looking for the job in the job queues and wait until it has disappeared
# jobruns=somedummy
# while [ "${jobruns}"x != x ]
# do
#   sleep 200
#   jobruns=`squeue --noheader -u mlosch | grep "$JOBNAME" | awk '{print $1}'`
#   echo "waiting for job ${jobruns%% *} ($JOBNAME) to complete"
#   currentexp=`grep Experiment $MYOUTPUT | tail -1`
#   echo "currently running $currentexp"
# done

# # workaround for mailing the stuff
# echo "mail the stuff"

# MPACKCMD=../tools/mpack-1.6/mpack
# fn=`ls -dtr tr_$dNam* | grep -v tar.gz | tail -1`
# echo "fname ${fn}"
# tar cf - $fn | gzip > "${fn}.tar.gz"
# $MPACKCMD -s MITgcm-test -m 3555000 ${fn}.tar.gz jm_c@mitgcm.org
# sleep 2
# rm -rf "${fn}.tar.gz"

echo "end of mitgcmtestreport"