/[MITgcm]/MITgcm/tools/example_scripts/ACESgrid/test_submit_aces
ViewVC logotype

Contents of /MITgcm/tools/example_scripts/ACESgrid/test_submit_aces

Parent Directory | Revision Log | View Revision Graph


Revision 1.18 - (show annotations) (download)
Sun Aug 10 22:27:04 2014 UTC (9 years, 10 months ago) by jmc
Branch: MAIN
CVS Tags: checkpoint66a, checkpoint65z, checkpoint65x, checkpoint65y, checkpoint65r, checkpoint65s, checkpoint65p, checkpoint65q, checkpoint65v, checkpoint65w, checkpoint65t, checkpoint65u, checkpoint65j, checkpoint65k, checkpoint65h, checkpoint65i, checkpoint65n, checkpoint65o, checkpoint65l, checkpoint65m, checkpoint65b, checkpoint65c, checkpoint65f, checkpoint65g, checkpoint65d, checkpoint65e
Changes since 1.17: +4 -4 lines
Add an OpenAD test using the PGI compiler and MPI (OpenMPI); for now, run
it only during the weekend.

1 #! /usr/bin/env bash
2
3 # $Header: /u/gcmpack/MITgcm/tools/example_scripts/ACESgrid/test_submit_aces,v 1.17 2014/05/07 16:16:04 jmc Exp $
4 # $Name: $
5
# submit list of jobs or get list of submitted jobs
#
# Globals read   : JOB_LIST  whitespace-separated list of test short names
#                  action    2 = only look the jobs up in the queue ;
#                            any other value = submit the jobs
#                  SUB_DIR   directory holding the test_aces_* batch scripts
#                  TST_DIR   directory holding the MITgcm_<name> source trees
#                  QSTAT, QSUB, USER, LOG_FIL
# Globals written: NB_SUB_JOBS  number of jobs submitted (or found if action=2)
#                  M_<name>     status of each test: 'submitted' or 'skipped'
sub_list_jobs()
{
  local name sfx script job in_queue do_update

  NB_SUB_JOBS=0
  #- JOB_LIST is a plain word list: splitting it on whitespace is intended
  for name in $JOB_LIST ; do

    #- suffix of the batch script that runs this test
    case "$name" in
      mth) sfx="path_$name" ;;
      tuv) sfx="op64_$name" ;;
      mp2) sfx="gnu_$name" ;;
      oad) sfx="pgi_$name" ;;
      iad) sfx='ifc_adm' ;;
      *)   sfx="${name}_mpi" ;;
    esac
    script="$SUB_DIR/test_aces_$sfx"

    #- status is stored with "printf -v" rather than "eval" so that a name
    #  coming from the command line is never executed as shell code
    if [[ ! -f "$script" ]] ; then
      echo "no file: test_aces_$sfx to submit" | tee -a "$LOG_FIL"
      printf -v "M_$name" '%s' 'skipped'
      continue
    fi

    job="tst_$name"
    in_queue=$("$QSTAT" -a | grep -- "$USER" | grep -c -- "$job")

    if [[ "$action" -eq 2 ]] ; then
      #-- to get outp back:
      if [[ "$in_queue" == 0 ]] ; then
        echo "did not find any job: $job" | tee -a "$LOG_FIL"
        printf -v "M_$name" '%s' 'skipped'
      else
        printf '%s' "found a job: $job" | tee -a "$LOG_FIL"
        "$QSTAT" -a | grep -- "$USER" | grep -- "$job" | tee -a "$LOG_FIL"
        printf -v "M_$name" '%s' 'submitted'
        NB_SUB_JOBS=$(( NB_SUB_JOBS + 1 ))
      fi
      continue
    fi

    #-- to submit job
    if [[ "$in_queue" != 0 ]] ; then
      echo "$job" | tee -a "$LOG_FIL"
      "$QSTAT" -a | grep -- "$USER" | grep -- "$job" | tee -a "$LOG_FIL"
      echo 'job already exist => skip this test' | tee -a "$LOG_FIL"
      printf -v "M_$name" '%s' 'skipped'
      continue
    fi

    #-- update code if not done within submitted script
    do_update=$(grep -c '^ *checkOut=0' "$script")
    [[ -e "$TST_DIR/MITgcm_$name" ]] || do_update=0
    if (( do_update >= 1 )) ; then
      echo "" >> "$LOG_FIL"
      echo " update $TST_DIR/MITgcm_$name :" | tee -a "$LOG_FIL"
      #- "&&": never run the cvs update from another directory
      ( cd "$TST_DIR/MITgcm_$name" \
          && cvs -q -d :pserver:cvsanon@mitgcm.org:/u/gcmpack update -P -d
      ) >> "$LOG_FIL" 2>&1
    fi
    printf '%s' " $job : " | tee -a "$LOG_FIL"
    "$QSUB" "$script" | tee -a "$LOG_FIL"
    printf -v "M_$name" '%s' 'submitted'
    NB_SUB_JOBS=$(( NB_SUB_JOBS + 1 ))

  done
  echo " info-sub-list: NB_SUB_JOBS='$NB_SUB_JOBS'" >> "$LOG_FIL"
}
69
# retrieve output when job is finished

# Archive the newest output directory of one test and mail it.
#   $1 : kind of output, "tr" (results) or "rs" (restart)
#   $2 : test short name (used in the messages and in the archive name)
#   $3 : directory in which the output directories are searched
# Globals read: outPfix, today, HAVE_MPACK, MPACK, LOG_FIL
# Returns 0 when a directory dated from this run was archived (it is mailed
# only if HAVE_MPACK=t) ; 1 when there is no such directory or when the
# newest one carries neither the launch date nor the current date.
_get_outp_send()
{
  local kind=$1 job=$2 run_dir=$3
  local msg_none msg_stale msg_sent msg_nomail newest archive

  if [[ "$kind" == rs ]] ; then
    msg_none=" no restart output for $job"
    msg_stale="Restart output do not match, no email sent for $job"
    msg_sent="Email sent for $job restart:"
    msg_nomail=" no email sent for $job restart (no mpack)"
  else
    msg_none=" no output found for $job"
    msg_stale="Output do not match, no email sent for $job"
    msg_sent="Email sent for $job at: "
    msg_nomail=" no email sent for $job (no mpack)"
  fi

  #- "ls -t" is kept on purpose: the newest entry (by time) is wanted
  newest=$(ls -1 -t "$run_dir" | grep -v tr_out \
             | grep "^${kind}_$outPfix" | head -1)
  if [[ -z "$newest" ]] ; then
    echo "$msg_none" | tee -a "$LOG_FIL"
    return 1
  fi

  #- check this is the right output (name holds launch day or current day)
  if [[ "$newest" != *"$today"* && "$newest" != *"$(date +%Y%m%d)"* ]] ; then
    echo "tdir='$newest'" | tee -a "$LOG_FIL"
    echo "$msg_stale" | tee -a "$LOG_FIL"
    return 1
  fi

  archive="/tmp/${kind}_${outPfix}-${job}.tar.gz"
  rm -f -- "$archive"
  #- "&&": never archive from another directory if the cd fails
  ( cd "$run_dir" && tar -czf "$archive" "./$newest" )
  if [[ "x$HAVE_MPACK" == xt ]] ; then
    "$MPACK" -s MITgcm-test -m 3555000 "$archive" jmc@mitgcm.org
    echo "$msg_sent $(date)" | tee -a "$LOG_FIL"
  else
    echo "$msg_nomail" | tee -a "$LOG_FIL"
  fi
  return 0
}

# get_outp_back number_of_jobs
#
# Poll the batch queue every 10 minutes until the given number of submitted
# jobs have left the queue, sending the output of each one as it finishes.
#   $1 : number of jobs to wait for
# Globals read   : JOB_LIST (+ status of jobs: M_<name>=submitted ),
#                  QSTAT, USER, TST_DIR, NB_SUB_JOBS, LOG_FIL
#                  (+ those read by _get_outp_send)
# Globals written: REJECTED (= list of fast-to-fail jobs)
#                  (+ change status of jobs to: M_<name>=done )
# Exits the script with status 1 if jobs are still pending after the
# waiting limit.
get_outp_back()
{
  local nb_left=${1:-0}
  local minutes=0 freq=10 waitLimit=1800   #-- all in minutes
  # "long" queue is 24.h = 24*60 = 1440mn ; 36.h = 2160mn ; 30.h = 1800mn
  local fsec hrs job status_var queue rv in_queue run_dir

  REJECTED=
  echo "Check every $freq mn for $nb_left test(s) to finish" | tee -a "$LOG_FIL"
  echo "- start at : $(date)" | tee -a "$LOG_FIL"
  fsec=$(( freq * 60 ))

  while [[ "$nb_left" != 0 ]] ; do

    sleep "$fsec"
    minutes=$(( minutes + freq ))

    #- JOB_LIST is a plain word list: splitting on whitespace is intended
    for job in $JOB_LIST ; do

      #- read the status through indirect expansion (no eval) ; a name that
      #  cannot be part of a variable name has no status to look at
      status_var="M_$job"
      [[ "$status_var" =~ ^[A-Za-z0-9_]+$ ]] || continue
      [[ "${!status_var}" == 'submitted' ]] || continue

      queue=$("$QSTAT" -a)
      rv=$?
      if [[ "$rv" != 0 ]] ; then
        echo " $QSTAT returned with error code: $rv" | tee -a "$LOG_FIL"
        continue
      fi
      in_queue=$(grep -- "$USER" <<< "$queue" | grep -c -- "tst_$job")
      #- job still queued or running: look again at the next round
      [[ "$in_queue" == 0 ]] || continue

      run_dir="$TST_DIR/MITgcm_$job/verification"
      #- results output: add to rejected list if it fails in less than
      #  "freq" minutes
      if ! _get_outp_send tr "$job" "$run_dir" && (( minutes == freq )) ; then
        REJECTED="$REJECTED $job"
      fi
      #- restart output:
      _get_outp_send rs "$job" "$run_dir"

      #- record successful sending
      printf -v "$status_var" '%s' 'done'
      nb_left=$(( nb_left - 1 ))
      chmod 644 output/tst_"$job".std*
    done

    #- give up only if some jobs are really still pending
    if (( nb_left > 0 && minutes > waitLimit )) ; then
      hrs=$(( minutes / 60 ))
      echo "Time expired after $minutes minutes ($hrs hours)" | tee -a "$LOG_FIL"
      echo "  $nb_left / $NB_SUB_JOBS tests not yet finished" | tee -a "$LOG_FIL"
      exit 1
    fi

  done

  echo "Retrieving $NB_SUB_JOBS tests finish : $(date)" | tee -a "$LOG_FIL"
  echo " info-get-outp: REJECTED='$REJECTED'" >> "$LOG_FIL"

}
186
187 #---+----1----+----2----+----3----+----4----+----5----+----6----+----7-|--+----|
188 #-- Sequential part of the script starts here:
189 #---------------------------------------------
190
# action =1 : submit test jobs ; =2 : get jobs output ; =3 : do both
#        =4 : do both, then run a 2nd round for the fast-failed jobs
# (no recognized option: action=1 and the argument list is left untouched)
case "$1" in
  -subOnly) action=1 ; shift ;;
  -getOnly) action=2 ; shift ;;
  -sub+get) action=3 ; shift ;;
  -double)  action=4 ; shift ;;
  *)        action=1 ;;
esac
#echo "action= $action , Arg= $# "

today=$(date +%Y%m%d)
#- day names from "date +%a" follow the locale: force the C locale so that
#  the comparison with Sat/Sun below works whatever the user settings are
dInWeek=$(LC_ALL=C date +%a)

if [[ $# -eq 0 ]] ; then
  #- default list of tests, with additional ones during the week-end
  TEST_LIST='gnu mp2 adm pgi path mth op64 tuv'
  case "$dInWeek" in
    Sat|Sun) TEST_LIST="$TEST_LIST iad ifc oad" ;;
  esac
else
  #- remaining arguments are the tests to run
  TEST_LIST=$*
fi
212
#-- batch system commands:
QSUB='/usr/bin/qsub'
QSTAT='/usr/bin/qstat'

#-- prefix of log-file names and of output names:
logPfix=tst_submit
outPfix=acesgrid

#-- where this script works (HERE) and where the tests are run (TST_DIR):
HERE="${HOME}/test_${outPfix}"
#TST_DIR="/data/jm_c/test_$outPfix"
TST_DIR="/scratch/jm_c/test_${outPfix}"

#-- alternative settings (not used):
# QSUB=qsub
# QSTAT=qstat
# logPfix='tst_all'
# outPfix='aces'
# HERE="$HOME/test_$outPfix"
# TST_DIR=$HERE

#-- tools and batch scripts (paths without leading "/"):
MPACK='MITgcm_tools/mpack-1.6/mpack'
SUB_DIR='MITgcm_tools/example_scripts/ACESgrid'

#-- temporary file (name ends with the process ID) and log file of the day:
TMP_FIL="${HERE}/output/TTT.$$"
LOG_FIL="${HERE}/output/${logPfix}.$(date +%m%d).log"
231
#-- clean up old log files and start a new one:
#   (stop here if the directory is missing: the "rm" commands below act on
#    file-name patterns and must not run from any other directory)
cd "$HERE/output" || { echo "cannot cd to $HERE/output" >&2 ; exit 1 ; }

rm -f -- "$logPfix".*.log_bak
#- keep one back-up of a log file already written with the same name
if [[ -f "$LOG_FIL" ]] ; then mv -f -- "$LOG_FIL" "${LOG_FIL}_bak" ; fi
printf '%s' '-- Starting: ' | tee -a "$LOG_FIL"
date | tee -a "$LOG_FIL"

#- keep only the 10 most recent log files ("ls -t": sorted by time)
n=$(( $(ls "$logPfix".*.log | wc -l) - 10 ))
if (( n > 0 )) ; then
  echo ' remove old log files:' | tee -a "$LOG_FIL"
  ls -lt "$logPfix".*.log | tail -n "$n" | tee -a "$LOG_FIL"
  ls -t "$logPfix".*.log | tail -n "$n" | xargs rm -f
fi
246
#-- set up the "module" command and record the loaded modules:
. /etc/profile.d/modules.sh
module list >> "$LOG_FIL" 2>&1

#-- now really do something:
#   (SUB_DIR and MPACK are relative paths: nothing can work from elsewhere)
cd "$HERE" || { echo "cannot cd to $HERE" | tee -a "$LOG_FIL" >&2 ; exit 1 ; }

JOB_LIST=$TEST_LIST
sub_list_jobs
#echo " info-main: NB_SUB_JOBS='$NB_SUB_JOBS'" >> $LOG_FIL
256
#-- set the number of jobs to wait for and, unless only submitting,
#   build the mpack tool used to e-mail the results
if [[ "$action" = 1 ]] ; then
  #- submit only: no output to wait for
  NB_JOBS2GET=0
else
  #date_str=`date +%Y%m%d`"_0"

  echo "" | tee -a "$LOG_FIL"
  #- build the mpack utility (from build_mpack in testreport):
  MPACKDIR=$(dirname "$MPACK")
  ( cd "$MPACKDIR" && ./configure && make ) > "TTT.build_mpack.$$" 2>&1
  RETVAL=$?
  if [[ "$RETVAL" != 0 ]] ; then
    #- build failed (its output is kept in TTT.build_mpack.*)
    echo "Error building the mpack tools at: $MPACKDIR" | tee -a "$LOG_FIL"
    if [[ -x "$MPACK" ]] ; then
      HAVE_MPACK=t
      echo " use (old ?) executable:" | tee -a "$LOG_FIL"
      ls -l "$MPACK" | tee -a "$LOG_FIL"
    else
      HAVE_MPACK=f
    fi
  elif [[ -x "$MPACK" ]] ; then
    rm -f "TTT.build_mpack.$$"
    HAVE_MPACK=t
    echo "Building mpack: OK" | tee -a "$LOG_FIL"
  else
    echo " $MPACK not executable" | tee -a "$LOG_FIL"
    HAVE_MPACK=f
  fi
  echo "" >> "$LOG_FIL"

  NB_JOBS2GET=$NB_SUB_JOBS
fi
290
#- wait for the jobs to finish, get their output back and send e-mail
get_outp_back "$NB_JOBS2GET"
#echo " info-main: REJECTED='$REJECTED'" >> $LOG_FIL

#- with action=4, the jobs listed in REJECTED get a second (and last) try
if [[ "$action" = 4 && -n "$REJECTED" ]] ; then

  printf '\n' >> "$LOG_FIL"
  echo "Try 2nd round for fast-failed jobs: '$REJECTED'" | tee -a "$LOG_FIL"
  JOB_LIST=$REJECTED
  sub_list_jobs
  #echo " info-main: NB_SUB_JOBS='$NB_SUB_JOBS'" >> $LOG_FIL

  printf '\n' >> "$LOG_FIL"
  get_outp_back "$NB_SUB_JOBS"
  #echo " info-main: REJECTED='$REJECTED'" >> $LOG_FIL

fi

#------------------------
exit 0

  ViewVC Help
Powered by ViewVC 1.1.22