#!/bin/bash
#$ -N dyncore_cs96
#$ -q darwin
#$ -pe mpich_mx 24
#
#$ -l num_proc=4,mem_free=6G
#$ -cwd
#$ -j n
#$ -o job.out.$JOB_ID
#$ -e job.err.$JOB_ID
#
# -N job name
# -q queue
# -pe parallel environment and # cpus
# -l num_proc=4 request nodes with 4 processors (doesn't do anything on beagle)
# -l mem_free=6G request nodes with 6GB free memory (currently; see below)
# -cwd go to submit directory
# -j n don't combine stdout and stderr
# -o file for stdout
# -e file for stderr
#
# note: mem_free is not consumable, i.e. the above only checks
#       whether there is enough memory on the node before assigning
#       the job to it, it will not keep other jobs from being assigned
#       to it, or existing jobs from increasing their memory usage.
#
# Lots more SGE information can be found by reading
# the SGE man pages - see
#       man qsub
#       man qconf
#       man sge_pe
# However, these are very dense reading.
#
# Overview of this part of the script:
#   1. pick the executable from the run name found in file "data"
#   2. start the job log file "run_here"
#   3. derive the current segment (start/end month and iteration) from
#      deltaT, nIter0 and nTimeSteps/endTime in file "data"
#   4. load the compiler/MPI modules and launch the executable with mpirun
# The mpirun exit status is left in $out for the post-processing that follows.

runDir='/scratch/jmc/dyncore/run'
endRun=1  # in month
# segmn=12 # length of 1 segment (in month) <- get it from file "data"

#cd $runDir

# Abort the job unless value $2 (parameter name $1, used only for the message)
# is a plain non-negative integer.
require_uint() {
  if [[ ! "$2" =~ ^[0-9]+$ ]]; then
    echo "cannot parse '$1' from file 'data' (got '$2')" >&2
    exit 1
  fi
}

#--- select the executable:
# Take the text after "exp " on the the_run_name line, drop the trailing "',",
# then keep what precedes the first '.' (tstc) and the first '-' (exe).
hdir=$(grep the_run_name data | sed 's/.*exp //' | sed "s/', *$//")
tstc=${hdir%%.*}
exe=${tstc%%-*}
#echo "tstc='$tstc' , exe='$exe'"
case "$exe" in
  3 | 6) executable="mitgcmuv_$exe" ;;
  *) executable=mitgcmuv_1245 ;;
esac
if [[ ! -x "$executable" ]]; then
  echo "no executable: '$executable'"
  # exit non-zero so the scheduler does not record this as a successful job
  exit 1
fi
ls -l "$executable"
#----------------------------------------------
rm -f run_here
touch run_here
echo '*********************************************************'
THEDATE=$(date)
echo "Start job $THEDATE" | tee -a run_here
echo "Job Number $JOB_ID" | tee -a run_here
echo "NHOSTS = $NHOSTS" | tee -a run_here
echo "NSLOTS = $NSLOTS" | tee -a run_here
#echo 'Job Name' $JOB_NAME | tee -a run_here
echo '======= PE_HOSTFILE =======' | tee -a run_here
# Only read the host file when it is really there: with an empty variable a
# bare "cat" would read stdin instead.
if [[ -n "${PE_HOSTFILE:-}" && -r "$PE_HOSTFILE" ]]; then
  tee -a run_here < "$PE_HOSTFILE"
else
  echo "warning: PE_HOSTFILE is unset or not readable" >&2
fi
echo "on $(hostname) run $executable in dir: $(pwd)" | tee -a run_here

#--- segment arithmetic, all values taken from file "data"
# deltaT is expected as "deltaT=<int>.<...>,": everything from the '.' on is
# dropped by the second sed.
deltaT=$(grep -E '^ *deltaT' data | sed 's/ *deltaT=//' | sed 's/\..*,$//')
require_uint deltaT "$deltaT"
if ((10#$deltaT == 0)); then
  echo "deltaT must be > 0 in file 'data'" >&2
  exit 1
fi
# number of iterations in one 30-day month (10# forces base 10 even if the
# value carries leading zeros)
nit1mn=$((86400 * 30 / 10#$deltaT))
if ((nit1mn == 0)); then
  echo "deltaT=$deltaT is longer than one month: cannot split run in months" >&2
  exit 1
fi
# NOTE: $its is kept exactly as written in "data"; the post-processing uses it
# as a sed pattern to rewrite the nIter0 line.
its=$(grep -E '^ *nIter0' data | sed 's/ *nIter0=//' | sed 's/,$//')
require_uint nIter0 "$its"
xx=$(grep -Ec '^ *nTimeSteps' data)
if [[ "$xx" == 1 ]]; then
  nIters=$(grep -E '^ *nTimeSteps' data | sed 's/ *nTimeSteps=//' | sed 's/,$//')
  require_uint nTimeSteps "$nIters"
else
  # no (single) nTimeSteps line: derive the iteration count from endTime
  endTim=$(grep -E '^ *endTime' data | sed 's/ *endTime=//' | sed 's/\.,$//')
  require_uint endTime "$endTim"
  nIters=$((10#$endTim / 10#$deltaT))
fi
segmn=$((10#$nIters / nit1mn)) # segment length, in months
nms=$((10#$its / nit1mn))      # start month
nMs=$(printf '%3.3i' "$nms")   # same, zero-padded to 3 digits
nme=$((nms + segmn))           # end month
nMe=$(printf '%3.3i' "$nme")
ite=$((nme * nit1mn))          # iteration number at end of segment
echo "start at month= $nMs (it= $its ), run until mn= $nMe (it= $ite )" | tee -a run_here
if ((nms >= endRun)); then
  echo "Run already finished : month $nms"
  exit 9
fi
# Record the commands that rename this job's stdout/stderr files after the
# segment they belong to (the file is only written here, not executed).
{
  echo "mv job.out.$JOB_ID TTT.out.$nMe"
  echo "mv job.err.$JOB_ID TTT.err.$nMe"
} >> move_TTT_files

#echo '==========================='
#echo '======= $TMPDIR/machines ===='
#cat $TMPDIR/machines
machinefile="$TMPDIR/machines"

#export TMPDIR=/state/partition1
#export TMP=${TMPDIR}
#export TEMP=${TMPDIR}

source /etc/profile.d/modules.sh
#module add intel
#module add mpich-mx
module add intel/9.1.051
module add mpich-mx/1.2.7..5/intel/9.1.051

# Run MPI program
mpirun \
  -np "${NSLOTS}" -machinefile "$machinefile" \
  --mx-kill 30 --mx-copy-env "./$executable"
out=$?
# Post-processing of one run segment.
# Reads variables set earlier in the script: out (mpirun status), its / ite
# (start / end iteration), nme / nMe (end month, plain / zero-padded), endRun.
# Steps: check that a pickup file was written for iteration $ite, archive the
# model output, advance nIter0 in file "data", and either stop (last segment)
# or resubmit run_job through ssh. Exits with 0 on success, 1 otherwise.
echo "end mpirun with status $out"
iTe=$(printf '%10.10i' "$ite") # iteration number zero-padded to 10 digits
# Success is decided by the presence of a pickup file (either naming form),
# not by the mpirun status, which is only reported above.
if [[ -f "pickup.$iTe.data" || -f "pickup.$iTe.001.001.data" ]]; then
  echo 'found pickup files'
  out=0
else
  echo 'no pickup file written => STOP here'
  out=1
fi

if [[ "$out" == 0 ]]; then
  cp -p run_here "std_outp.$nMe"
  echo ' ' >> "std_outp.$nMe"
  cat STDOUT.0000 >> "std_outp.$nMe"
  # make sure the archive directory exists: otherwise mv fails and the
  # following cp would create a plain file called "temp"
  mkdir -p temp
  mv -f STDOUT.00?? STDERR.00?? temp
  cp -p data temp
  #- prepare new submission :
  # Replace "data" only if sed worked and the new file really starts the next
  # segment at iteration $ite; otherwise a resubmitted job would run the same
  # segment again.
  if sed "s/^ *nIter0=$its/ nIter0=$ite/" data > TTT.tmp &&
    grep -Eq "^ *nIter0=${ite}([^0-9]|\$)" TTT.tmp; then
    mv TTT.tmp data
  else
    echo "could not update nIter0 from '$its' to '$ite' in file 'data'" >&2
    rm -f TTT.tmp
    out=1
  fi
fi

if [[ "$out" == 0 ]]; then
  if [[ "$nme" -ge "$endRun" ]]; then
    echo "Run finished : month $nme done !"
  elif [[ -z "${SGE_CWD_PATH:-}" ]]; then
    echo 'SGE_CWD_PATH is not set: cannot resubmit' >&2
    out=1
  else
    #- resubmit
    # "&&" keeps qsub from running in the wrong directory when cd fails
    echo "ssh beagle cd '${SGE_CWD_PATH}' && qsub run_job"
    if ! ssh beagle "cd '${SGE_CWD_PATH}' && qsub run_job"; then
      echo 'resubmission of run_job failed' >&2
      out=1
    fi
  fi
fi

THEDATE=$(date)
echo '==========================='
echo "End job $THEDATE  with status $out"
echo '*********************************************************'
exit "$out"