#!/bin/bash
# new script for running testreport on stan1.awi.de
# - split the testreport into 3 steps:
# 1/ compiling on head node (tx7.awi.de), with -norun option
# 2/ running on compute node (using PBS qsub), with -runonly option
# 3/ evaluating result on head node with -runonly option
#
# Notes:
# - step 2 leads to many error messages, because the OS on the compute
#   nodes does not have the appropriate shell tools; modifying the
#   runonly option to skip the evaluation step would be nice but is not
#   necessary; you'll just have to live with the error messages
# - step 3 assumes that all experiments have been run successfully, i.e.
#   that the output files are up-to-date.
#   If not, testreport will try to run the sx ace-code on the tx7 frontend,
#   which will inevitably fail and produce more errors; maybe we can
#   have a flag that skips everything but the evaluation step to avoid this
# $Header: /u/gcmpack/MITgcm_contrib/test_scripts/sxace/mitgcmtestreport,v 1.4 2018/02/08 15:50:19 mlosch Exp $
# $Name:  $

# Environment setup.
# For some reason the "module" command is not available in a bash script on
# this computer, so it has to be created here. One option is to define it
# by hand:
#module () { eval `/usr/bin/modulecmd bash $*` ; }
# Alternatively, source the script that contains all relevant definitions:
source /usr/share/Modules/init/bash
#module use --append /sx8/user2/awisoft/modulefiles
# Load the latest compilers (pinned versions kept for reference):
#module load sxf90/460
#module load sxc++/094
for mod in sxf90 sxc++ sxmpi sxnetcdf; do
  module load "$mod"
done
#
# make sure that we have qsub and qstat
#export PATH=${PATH}:/usr/bin/nqsII
source /etc/profile.d/nec.sh
#
# ---------------------------------------------------------------------------
# Configuration: names, paths and the common testreport command line.
# ---------------------------------------------------------------------------
VENDOR=sxf90
RUNIT="runit_${VENDOR}"   # only referenced by the disabled restart check
HERE=$(pwd)               # directory the script was started from
# TR_NPROC is passed to testreport literally as part of the -command string.
EXE='mpirun -np TR_NPROC ./mitgcmuv'
NPROCS=2
MPI="-MPI $NPROCS"
MYOUTPUT=$HOME/testreport_${VENDOR}
# Batch-system output file; relative name (alternative: $HOME/out_${VENDOR}).
OUTFILE=out_${VENDOR}
JOBNAME=test_ace
JOBSCRIPT=job_${VENDOR}
# Leave empty to run all experiments; use e.g. '-t exp2' to select one.
selectexperiment=''
# download code into this directory
TDIR=/ace/user/mlosch/tmp_$VENDOR
gcmDIR=MITgcm
git_repo='MITgcm'
git_code='MITgcm'

OPTFILE=../tools/build_options/SUPER-UX_SX-ACE_sxf90_awi
#OPTFILE=/home/ace/mlosch/MITgcm/tools/build_options/SUPER-UX_SX-ACE_sxf90_awi

# Kept as a plain string (not an array) because it is also expanded into the
# generated batch script; it is deliberately word-split where it is executed.
RUNTESTREPORT="./testreport $MPI -of=${OPTFILE} $selectexperiment -small_f"
#
# create batch script
#
# The heredoc delimiter is unquoted on purpose: $JOBNAME, $NPROCS, $OUTFILE,
# $RUNTESTREPORT, $EXE and $MYOUTPUT are expanded now, while the escaped
# \${PBS_O_WORKDIR} is left for the batch job to expand at run time.
cat << EOF > "$HERE/$JOBSCRIPT"
#PBS -q ace-r                            # job queue
#PBS -N $JOBNAME                         # give the job a name
#PBS -l cpunum_job=$NPROCS               # cpus per node
#PBS -l elapstim_req=2:00:00
#PBS -l cputim_job=2:00:00               # time limit
#PBS -l memsz_job=32gb                   # max accumulated memory, we need this much because of many netcdf files
#PBS -j o                                # join i/o
#PBS -S /bin/sh
#PBS -o $OUTFILE                         # o Where to write output
#

cd \${PBS_O_WORKDIR}
$RUNTESTREPORT -runonly -command "$EXE" >> $MYOUTPUT 2>&1

EOF

# clean up old testreport output
# The expansions are quoted so that an empty variable makes the -e test
# false instead of degenerating into "[ -e ]" (which is true) followed by
# an rm without operands.
if [ -e "$MYOUTPUT" ]; then
  rm -rf -- "$MYOUTPUT"
fi
if [ -e "$OUTFILE" ]; then
  rm -r -- "$OUTFILE"
fi

# checkOut determines how much checking out is being done
# checkOut = 3: new clone from GitHub and make a new copy
# checkOut = 2: update (git pull) existing repo and make a new copy
# checkOut = 1: skip update
# checkOut = 0: use existing test code (if available otherwise switch to 2)

checkOut=2

gitcmd=$HOME/git/git
tmpFil=$TDIR/error.out   # receives stderr of 'git clone'
# When no update was requested, make sure there is an existing checkout to
# work with; the presence of the doc directory is used as the indicator.
if [ "$checkOut" -le 1 ] ; then
  if test -e "$TDIR/${gcmDIR}/doc" ; then
    echo "$TDIR/${gcmDIR}/doc" 'exist'
  else
    echo -n "$TDIR/${gcmDIR}" 'missing ; '
    checkOut=2
    echo "will make a new copy ( checkOut=$checkOut )"
  fi
fi

# Get the source tree into $TDIR/${gcmDIR}: fresh clone (checkOut=3),
# update of the existing clone (checkOut=2), or nothing (checkOut<=1).
# Either way the working directory becomes $TDIR. Stop if that fails,
# otherwise git would operate relative to the wrong directory.
cd "$TDIR" || { echo " Error: cannot cd to $TDIR" >&2; exit 1; }

if [ "$checkOut" -ge 2 ] ; then
  #---- Make a new clone or update existing one:
  if test -e "${gcmDIR}/.git/config" ; then
    echo "${gcmDIR}/.git/config exist"
  else
    echo -n "${gcmDIR}/.git/config missing, "
    checkOut=3
    echo "will get new clone ( checkOut=$checkOut )"
  fi
  if [ "$checkOut" -eq 3 ] ; then
    echo -n "Removing old clone: $TDIR/${gcmDIR} ..."
    # :? aborts instead of expanding to "/" if a variable is empty
    rm -rf -- "${TDIR:?}/${gcmDIR:?}"
    echo "  done"
    echo -n "Make a new clone of $git_code from repo: $git_repo ..."
    "${gitcmd}" clone "https://github.com/$git_repo/${git_code}.git" \
        "${gcmDIR}" 2> "$tmpFil"
    retVal=$?
    if test $retVal = 0 ; then
      echo ' --> done!'
      rm -f -- "$tmpFil"
    else
      echo " Error: 'git clone' returned: $retVal"
      cat "$tmpFil"
      rm -f -- "$tmpFil"
      exit 2
    fi
  else
    echo "Updating current clone ( $git_code ) ..."
    # Chain with && so that a failing cd or checkout is reported and
    # prevents the pull, instead of being hidden by the status of the
    # last command.
    ( cd "${gcmDIR}" && "${gitcmd}" checkout master && "${gitcmd}" pull )
    retVal=$?
    if test $retVal = 0 ; then
      echo ' --> done!'
    else
      echo " Error: 'git pull' returned: $retVal"
      echo " Error: 'git pull' returned: $retVal" \
        | mail -s "Git-error on Stan" Martin.Losch@awi.de
      exit 2
    fi
  fi
fi

# Step 1: build all experiments on the head node (testreport -norun).
cd "$TDIR/${gcmDIR}/verification" \
  || { echo "cannot cd to $TDIR/${gcmDIR}/verification" >&2; exit 1; }

# make sure that we do not use the cross compiler for testreport
unset CC
# make sure that we do use the cross compiler for testreport
#export CC=sxcc

# $RUNTESTREPORT is intentionally unquoted: it holds the command plus its
# options and has to be word-split.
# shellcheck disable=SC2086
if ! $RUNTESTREPORT  -j 8 -norun > "$MYOUTPUT" 2>&1 ; then
  echo "something wrong with testreport"
  echo "keeping the working directory"
#else
#  echo "check restarts"
#  echo ../tools/do_tst_2+2 -mpi -exe \"$HERE/$RUNIT\" -a NONE
#  ../tools/do_tst_2+2 -mpi -exe $HERE/$RUNIT -a NONE
# everything OK: delete working directory
#  rm -rf $TDIR
fi

# Record the generated batch script in the testreport log. The appending
# redirection creates $MYOUTPUT if it does not exist yet, so no separate
# existence check is needed.
rule="***********************************************************"
{
  echo " "
  echo "$rule"
  echo "Submitting this job script:"
  echo "$rule"
  cat "$HERE/$JOBSCRIPT"
  echo "$rule"
  echo "end of job script"
  echo "$rule"
  echo " "
} >> "$MYOUTPUT"

# Step 2: submit the job that actually runs all the experiments in one go.
echo "qsub $HERE/$JOBSCRIPT"
if ! qsub "$HERE/$JOBSCRIPT" ; then
  # Without a submitted job the final evaluation would try to run the
  # experiments on the head node, so stop here.
  echo "qsub failed for $HERE/$JOBSCRIPT" >&2
  exit 1
fi

# Keep looking for the job in the job queues and wait until it has
# disappeared. grep's exit status ends the loop as soon as qstat no longer
# lists $JOBNAME, so the progress message is only printed while the job is
# still queued or running.
while jobruns=$(qstat -n -u mlosch | grep -- "$JOBNAME") ; do
  echo "waiting for job ${jobruns%% *} ($JOBNAME) to complete"
  currentexp=$(grep Experiment "$MYOUTPUT" | tail -1)
  echo "currently running $currentexp"
  sleep 200
done

# Step 3: after running the experiments on the compute node, run testreport
# for a third time to evaluate the results on the head node again.
{
  echo " "
  echo "now run testreport for a final time to evaluate results:"
  echo "$RUNTESTREPORT -match 10 -runonly"
} >> "$MYOUTPUT"
# $RUNTESTREPORT is intentionally unquoted so that it is word-split into the
# command and its options; -a passes the address given below to testreport.
#$RUNTESTREPORT -match 10 -runonly >> $MYOUTPUT 2>&1
# shellcheck disable=SC2086
$RUNTESTREPORT -match 10 -runonly \
    -a "jm_c@mitgcm.org" >> "$MYOUTPUT" 2>&1
#   -a "jm_c@mitgcm.org, Martin.Losch@awi.de" >> $MYOUTPUT 2>&1

echo "end of mitgcmtestreport"