#!/bin/bash
#$ -N dyncore_cs96
#$ -q darwin
#$ -pe mpich_mx 24
# #$ -l num_proc=4,mem_free=6G
#$ -cwd
#$ -j n
#$ -o job.out.$JOB_ID
#$ -e job.err.$JOB_ID
#
# -N   job name
# -q   queue
# -pe  parallel environment and number of CPUs
# -l num_proc=4   request nodes with 4 processors (doesn't do anything on beagle)
# -l mem_free=6G  request nodes with 6 GB of memory currently free (see the note below)
# -cwd run the job in the submit directory
# -j n do not combine stdout and stderr
# -o   file for stdout
# -e   file for stderr
#
# note: mem_free is not consumable, i.e. the option above only checks that
# enough memory is free on the node before the job is assigned to it;
# it will not keep other jobs from being assigned to the same node,
# nor keep existing jobs from increasing their memory usage.
#
# Lots more SGE information can be found in the SGE man pages:
#   man qsub
#   man qconf
#   man sge_pe
# However, these are very dense reading.
#

runDir='/scratch/jmc/dyncore/run'
endRun=1   # end of the run (in months)
# segmn=12 # length of 1 segment (in months) <- now taken from file "data"

#cd $runDir

#--- select the executable:
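#    ("the_run_name" is read from the "data" namelist; its leading token,
#     before any '.' or '-', selects mitgcmuv_3 or mitgcmuv_6, with
#     mitgcmuv_1245 as the default)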
hdir=`grep the_run_name data | sed 's/.*exp //' | sed "s/', *$//"`
tstc=`echo $hdir | sed 's/\..*//'`
exe=`echo $tstc | sed 's/-.*$//g'`
#echo "tstc='$tstc' , exe='$exe'"
if test $exe = 3 -o $exe = 6 ; then
  executable=mitgcmuv_$exe
else
  executable=mitgcmuv_1245
fi
if test ! -x $executable ; then
  echo "no executable: '$executable'"
  exit 1
else
  ls -l $executable
fi

#----------------------------------------------
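#--- log basic job info (start time, job ID, hosts, slots) to stdout and
#    to the file "run_here" kept in the run directory: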
rm -f run_here
touch run_here

echo '*********************************************************'
THEDATE=`date`
echo 'Start job '$THEDATE | tee -a run_here
echo 'Job Number' $JOB_ID | tee -a run_here
echo 'NHOSTS = '$NHOSTS | tee -a run_here
echo 'NSLOTS = '$NSLOTS | tee -a run_here
#echo 'Job Name' $JOB_NAME | tee -a run_here
echo '======= PE_HOSTFILE =======' | tee -a run_here
cat $PE_HOSTFILE | tee -a run_here
echo on `hostname` run $executable 'in dir:' `pwd` | tee -a run_here
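
#--- work out the segment length and the start/end iterations from the
#    "data" namelist: nit1mn = time steps per (30-day) month, its = nIter0,
#    nIters = number of steps (from nTimeSteps, or endTime/deltaT),
#    segmn = segment length in months, nms/nme = start/end month,
#    ite = iteration number at the end of this segment.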
deltaT=`egrep '^ *deltaT' data | sed 's/ *deltaT=//' | sed 's/\..*,$//'`
nit1mn=`expr 86400 \* 30 / $deltaT`
its=`egrep '^ *nIter0' data | sed 's/ *nIter0=//' | sed 's/,$//'`
xx=`egrep -c '^ *nTimeSteps' data`
if test $xx = 1 ; then
  nIters=`egrep '^ *nTimeSteps' data | sed 's/ *nTimeSteps=//' | sed 's/,$//'`
else
  endTim=`egrep '^ *endTime' data | sed 's/ *endTime=//' | sed 's/\.,$//'`
  nIters=`expr $endTim / $deltaT`
fi
segmn=`expr $nIters / $nit1mn`
nms=`expr $its / $nit1mn`
nMs=`printf "%3.3i\n" $nms`
nme=`expr $nms + $segmn`
nMe=`printf "%3.3i\n" $nme`
ite=`expr $nme \* $nit1mn`

echo 'start at month=' $nMs '(it=' $its '), run until mn=' $nMe '(it=' $ite ')' | tee -a run_here
if [ $nms -ge $endRun ]; then
  echo 'Run already finished : month' $nms
  exit 9
fi
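
#--- record the commands needed to rename this job's SGE stdout/stderr files
#    (job.out.$JOB_ID, job.err.$JOB_ID) to month-tagged names afterwards: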
touch move_TTT_files
echo mv job.out.$JOB_ID TTT.out.$nMe >> move_TTT_files
echo mv job.err.$JOB_ID TTT.err.$nMe >> move_TTT_files

#echo '==========================='
#echo '======= $TMPDIR/machines ===='
#cat $TMPDIR/machines

machinefile=$TMPDIR/machines

#export TMPDIR=/state/partition1
#export TMP=${TMPDIR}
#export TEMP=${TMPDIR}

source /etc/profile.d/modules.sh
#module add intel
#module add mpich-mx
module add intel/9.1.051
module add mpich-mx/1.2.7..5/intel/9.1.051

# Run MPI program
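# (start the model on the ${NSLOTS} slots listed in $TMPDIR/machines;
#  the --mx-* flags belong to the MPICH-MX mpirun wrapper loaded above)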
mpirun \
  -np ${NSLOTS} -machinefile $machinefile \
  --mx-kill 30 --mx-copy-env ./$executable

out=$?
echo 'end mpirun with status' $out
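
#--- treat the segment as successful only if MITgcm wrote a pickup (restart)
#    file for the final iteration, in either global or per-tile form: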
iTe=`printf "%10.10i\n" $ite`
if [ -f pickup.$iTe.data ]; then
  echo 'found pickup files'
  out=0
elif [ -f pickup.$iTe.001.001.data ]; then
  echo 'found pickup files'
  out=0
else
  echo 'no pickup file written => STOP here'
  out=1
fi
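
#--- on success: archive the standard output, move the per-process logs and a
#    copy of "data" to "temp", advance nIter0 in "data" to the end of this
#    segment, and resubmit run_job (qsub via ssh to beagle) unless the final
#    month (endRun) has been reached: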
if test $out = 0 ; then
  cp -p run_here std_outp.$nMe
  echo ' ' >> std_outp.$nMe
  cat STDOUT.0000 >> std_outp.$nMe
  mv -f STDOUT.00?? STDERR.00?? temp
  cp -p data temp
  #- prepare the next submission:
  sed "s/^ *nIter0=$its/ nIter0=$ite/" data > TTT.tmp
  mv TTT.tmp data
  if [ $nme -ge $endRun ] ; then
    echo 'Run finished : month' $nme 'done !'
  else
    #- resubmit
    echo "ssh beagle cd ${SGE_CWD_PATH}; qsub run_job"
    ssh beagle "cd ${SGE_CWD_PATH}; qsub run_job"
  fi
fi

THEDATE=`date`
echo '==========================='
echo 'End job '$THEDATE ' with status' $out
echo '*********************************************************'
exit $out