/[MITgcm]/MITgcm/tools/example_scripts/ACESgrid/aces_test_all
ViewVC logotype

Annotation of /MITgcm/tools/example_scripts/ACESgrid/aces_test_all

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.22 - (hide annotations) (download)
Sun May 8 11:27:20 2011 UTC (12 years, 11 months ago) by jmc
Branch: MAIN
CVS Tags: checkpoint63a, checkpoint63, checkpoint62z, checkpoint62y, checkpoint62x
Changes since 1.21: +200 -145 lines
a hack to re-submit tests which fail too rapidly

1 jmc 1.1 #! /usr/bin/env bash
2    
3 jmc 1.22 # $Header: /u/gcmpack/MITgcm/tools/example_scripts/ACESgrid/aces_test_all,v 1.21 2010/08/03 04:02:13 jmc Exp $
4 jmc 1.1 # $Name: $
5    
# submit list of jobs or get list of submitted jobs
sub_list_jobs()
{
# sub_list_jobs
#  input : JOB_LIST  (space-separated list of test names)
#          also reads: action, SUB_DIR, QSTAT, QSUB, USER, LOG_FIL
#  output: NB_SUB_JOBS (+ status of jobs: M_<name>=submitted or skipped )
#
#  For each name a submission script $SUB_DIR/aces_test_<sfx> is looked up
#  and the queue is searched for a job "tst_<name>" belonging to $USER.
#   action=2    : nothing is submitted; a job found in the queue is
#                 recorded as 'submitted', otherwise as 'skipped'.
#   other values: the script is submitted with $QSUB unless such a job is
#                 already in the queue (then the test is 'skipped').

  local i sfx JOB queued

  NB_SUB_JOBS=0
  for i in $JOB_LIST
  do
    #- the name becomes part of a variable name (M_<name>): refuse anything
    #  which is not made of identifier characters rather than evaluating it
    case $i in
      ''|*[!A-Za-z0-9_]*)
        echo "invalid test name: '$i' => skip this test" | tee -a "$LOG_FIL"
        continue ;;
    esac

    case $i in
      'mth') sfx="ifc_${i}"  ;;
      'tuv') sfx="op64_${i}" ;;
      'mp2') sfx="${i}_mth"  ;;
      'g77') sfx="${i}_adm"  ;;
          *) sfx="${i}_mpi"  ;;
    esac

    if test ! -f "$SUB_DIR/aces_test_$sfx" ; then
      echo 'no file:' "aces_test_$sfx" 'to submit' | tee -a "$LOG_FIL"
      printf -v "M_$i" '%s' 'skipped'
      continue
    fi

    JOB="tst_$i"
    #- query the queue only once, so that the decision taken and the lines
    #  copied to the log file come from the same listing
    queued=$("$QSTAT" -a | grep -F -- "$USER" | grep -F -- "$JOB")

    if test "${action:-0}" -eq 2 ; then
      #-- to get outp back:
      if test -z "$queued" ; then
        echo "did not find any job: $JOB" | tee -a "$LOG_FIL"
        printf -v "M_$i" '%s' 'skipped'
      else
        printf '%s' "found a job: $JOB" | tee -a "$LOG_FIL"
        printf '%s\n' "$queued" | tee -a "$LOG_FIL"
        printf -v "M_$i" '%s' 'submitted'
        NB_SUB_JOBS=$(( NB_SUB_JOBS + 1 ))
      fi
    else
      #-- to submit job
      if test -z "$queued" ; then
        printf '%s' " $JOB : " | tee -a "$LOG_FIL"
        "$QSUB" "$SUB_DIR/aces_test_$sfx" | tee -a "$LOG_FIL"
        printf -v "M_$i" '%s' 'submitted'
        NB_SUB_JOBS=$(( NB_SUB_JOBS + 1 ))
      else
        echo "$JOB" | tee -a "$LOG_FIL"
        printf '%s\n' "$queued" | tee -a "$LOG_FIL"
        echo 'job already exist => skip this test' | tee -a "$LOG_FIL"
        printf -v "M_$i" '%s' 'skipped'
      fi
    fi
  done
  echo " info-sub-list: NB_SUB_JOBS='$NB_SUB_JOBS'" >> "$LOG_FIL"
}
58    
# print the most recently modified entry of directory $1 whose name starts
# with prefix $2 (names containing "tr_out" are ignored); nothing if none
_aces_latest_outdir()
{
  ls -1 -t "$1" | grep -v tr_out | grep "^$2" | head -1
}

# succeed if name $1 contains the date held in $today or the current date
_aces_is_current_output()
{
  case $1 in
    *"$today"*)           return 0 ;;
    *"$(date +%Y%m%d)"*)  return 0 ;;
  esac
  return 1
}

# pack directory $2 (located in $1) into tar file $3 and, if mpack is
# available (HAVE_MPACK=t), mail it; $4 / $5 = log message with / without mpack
_aces_pack_and_send()
{
  local run_dir=$1 tdir=$2 tarball=$3 sent_msg=$4 nompack_msg=$5

  rm -f "$tarball"
  #- only run tar if we really are in the run directory
  ( cd "$run_dir" && tar -czf "$tarball" "./$tdir" )
  if test "x$HAVE_MPACK" = xt ; then
    "$MPACK" -s MITgcm-test -m 3555000 "$tarball" jmc@mitgcm.org
    echo "$sent_msg" "$(date)" | tee -a "$LOG_FIL"
  else
    echo "$nompack_msg" | tee -a "$LOG_FIL"
  fi
}

# retrieve output when job is finished
get_outp_back()
{
# get_outp_back number_of_jobs
#  input : JOB_LIST (+ status of jobs: M_<name>=submitted )
#          also reads: QSTAT, USER, TST_DIR, today, HAVE_MPACK, MPACK,
#                      LOG_FIL, NB_SUB_JOBS
#  output: REJECTED (= list of fast-to-fail jobs)
#          (+ change status of jobs to: M_<name>=done )
#
#  Every "freq" minutes, each job still marked 'submitted' is searched for
#  in the queue listing; once it is no longer listed, the latest tr_aces*
#  and rs_aces* directories of its verification directory are packed and
#  sent. A job with no output (or output from another day) at the very
#  first check is added to REJECTED.
#  The script exits with status 1 if jobs remain after 2160 minutes.
#  The chmod below uses a relative path: the caller is expected to be in
#  the directory which contains "output/".

  local nbJobs=${1:-0}
  local minutes=0 freq=10 fsec hrs
  local i status_var JOB qstat_out RETVAL run_dir tdir

  REJECTED=
  fsec=$(( freq * 60 ))
  echo "Check every $freq mn for $nbJobs test(s) to finish" | tee -a "$LOG_FIL"
  echo "- start at :" "$(date)" | tee -a "$LOG_FIL"
  while test "$nbJobs" != 0 ; do

    sleep "$fsec"
    minutes=$(( minutes + freq ))

    for i in $JOB_LIST ; do

      #- a name which cannot be part of a variable name has no status
      case $i in
        ''|*[!A-Za-z0-9_]*) continue ;;
      esac
      status_var="M_$i"
      test "${!status_var}" = 'submitted' || continue

      JOB="tst_$i"
      qstat_out=$("$QSTAT" -a)
      RETVAL=$?
      if test "x$RETVAL" != x0 ; then
        echo " $QSTAT returned with error code: $RETVAL" | tee -a "$LOG_FIL"
        continue
      fi
      #- job still listed in the queue: check again at the next round
      if printf '%s\n' "$qstat_out" | grep -F -- "$USER" | grep -q -F -- "$JOB"
      then
        continue
      fi

      run_dir="${TST_DIR}/MITgcm_${i}/verification"

      #- results output:
      tdir=$(_aces_latest_outdir "$run_dir" 'tr_aces')
      if test "x$tdir" = x ; then
        echo " no output found for $i" | tee -a "$LOG_FIL"
        #- add to rejected list if it fails in less than "freq" minutes
        if [ "$minutes" -eq "$freq" ] ; then REJECTED="$REJECTED $i" ; fi
      elif _aces_is_current_output "$tdir" ; then
        _aces_pack_and_send "$run_dir" "$tdir" "/tmp/tr_aces-$i.tar.gz" \
            "Email sent for $i at: " " no email sent for $i (no mpack)"
      else
        #- this is not the right output
        echo "tdir='$tdir'" | tee -a "$LOG_FIL"
        echo "Output do not match, no email sent for $i" | tee -a "$LOG_FIL"
        #- add to rejected list if it fails in less than "freq" minutes
        if [ "$minutes" -eq "$freq" ] ; then REJECTED="$REJECTED $i" ; fi
      fi

      #- restart output:
      tdir=$(_aces_latest_outdir "$run_dir" 'rs_aces')
      if test "x$tdir" = x ; then
        echo " no restart output for $i" | tee -a "$LOG_FIL"
      elif _aces_is_current_output "$tdir" ; then
        _aces_pack_and_send "$run_dir" "$tdir" "/tmp/rs_aces-$i.tar.gz" \
            "Email sent for $i restart:" " no email sent for $i restart (no mpack)"
      else
        echo "tdir='$tdir'" | tee -a "$LOG_FIL"
        echo "Restart output do not match, no email sent for $i" | tee -a "$LOG_FIL"
      fi

      #- record that this job has been processed
      printf -v "$status_var" '%s' 'done'
      nbJobs=$(( nbJobs - 1 ))
      chmod 644 output/tst_"$i".std*
    done

    # "long" queue is 24hrs = 24*60min = 1440min ;
    #  the limit applied here is 2160min (= 36hrs)
    if test "$minutes" -gt 2160 ; then
      hrs=$(( minutes / 60 ))
      echo "Time expired after $minutes minutes ($hrs hours)" | tee -a "$LOG_FIL"
      echo ' ' "$nbJobs" '/' "$NB_SUB_JOBS" 'tests not yet finished' | tee -a "$LOG_FIL"
      exit 1
    fi

  done

  echo "Retrieving $NB_SUB_JOBS tests finish :" "$(date)" | tee -a "$LOG_FIL"
  echo " info-get-outp: REJECTED='$REJECTED'" >> "$LOG_FIL"

}
174    
#---+----1----+----2----+----3----+----4----+----5----+----6----+----7-|--+----|
#-- Sequential part of the script starts here:
#---------------------------------------------

# action =1 : submit test jobs ; =2 : get jobs output ; =3 : do both
#        =4 : do both, then re-submit (once) the jobs which were put in
#             the REJECTED list and get their output (this is the default)
case "${1-}" in
  '-subOnly') action=1 ; shift ;;
  '-getOnly') action=2 ; shift ;;
  '-sub+get') action=3 ; shift ;;
  '-double' ) action=4 ; shift ;;
           *) action=4 ;;
esac
#echo "action= $action , Arg= $# "

today=$(date +%Y%m%d)

#- remaining arguments (if any) are the names of the tests to process
if test $# = 0
then
  TEST_LIST='g77 gnu ifc pgi adm mp2 mth tuv'
else
  TEST_LIST="$*"
fi

# QSUB="/usr/local/bin/qsub"
# QSTAT="/usr/local/bin/qstat"
QSUB=qsub
QSTAT=qstat
TST_DIR="/home/jmc/test_ACES"
#- MPACK and SUB_DIR are relative paths: they are used from $TST_DIR
MPACK="MITgcm_tools/mpack-1.6/mpack"
SUB_DIR="MITgcm_tools/example_scripts/ACESgrid"

TMP_FIL="$TST_DIR/output/TTT.$$"
LOG_FIL="$TST_DIR/output/tst_all.$(date +%m%d).log"

#-- clean up old log files and start a new one:
#   (stop here if the directory is missing: the "rm" below use relative names)
cd "$TST_DIR/output" || exit 1

rm -f tst_all.*.log_bak
if test -f "$LOG_FIL" ; then mv -f "$LOG_FIL" "${LOG_FIL}_bak" ; fi
printf '%s' '-- Starting: ' | tee -a "$LOG_FIL"
date | tee -a "$LOG_FIL"

#- keep only the 10 most recent log files
n=$(( $(ls tst_all.*.log | wc -l) - 10 ))
if test "$n" -gt 0 ; then
  echo ' remove old log files:' | tee -a "$LOG_FIL"
  ls -lt tst_all.*.log | tail -n "$n" | tee -a "$LOG_FIL"
  ls -t tst_all.*.log | tail -n "$n" | xargs rm -f
fi

. /etc/profile.d/modules.sh
module list >> "$LOG_FIL" 2>&1

#-- now really do something:
cd "$TST_DIR" || exit 1

JOB_LIST=$TEST_LIST
sub_list_jobs
#echo " info-main: NB_SUB_JOBS='$NB_SUB_JOBS'" >> $LOG_FIL

HAVE_MPACK=f
if test "$action" = 1 ; then
  #- submit only: no job output to wait for
  NB_JOBS2GET=0
else
  #date_str=`date +%Y%m%d`"_0"

  #- build the mpack utility (from build_mpack in testreport):
  MPACKDIR=$(dirname "$MPACK")
  ( cd "$MPACKDIR" && ./configure && make ) > "TTT.build_mpack.$$" 2>&1
  RETVAL=$?
  if test "x$RETVAL" != x0 ; then
    echo
    echo "Error building the mpack tools at: $MPACKDIR" | tee -a "$LOG_FIL"
    #- the build output file is left in place in this case
    if test -x "$MPACK" ; then
      HAVE_MPACK=t
      echo " use (old ?) executable:" | tee -a "$LOG_FIL"
      ls -l "$MPACK" | tee -a "$LOG_FIL"
    else
      HAVE_MPACK=f
    fi
  else
    if test -x "$MPACK" ; then
      rm -f "TTT.build_mpack.$$"
      HAVE_MPACK=t
      echo " building mpack: OK" | tee -a "$LOG_FIL"
    else
      echo " $MPACK not executable" | tee -a "$LOG_FIL"
      HAVE_MPACK=f
    fi
  fi
  echo

  NB_JOBS2GET=$NB_SUB_JOBS
fi

#- when it's done, retrieve output and send e-mail
get_outp_back "$NB_JOBS2GET"
#echo " info-main: REJECTED='$REJECTED'" >> $LOG_FIL

if test "$action" = 4 && test "x$REJECTED" != x ; then

  echo "Try 2nd round for fast-failed jobs: '$REJECTED'" | tee -a "$LOG_FIL"
  JOB_LIST=$REJECTED
  sub_list_jobs
  #echo " info-main: NB_SUB_JOBS='$NB_SUB_JOBS'" >> $LOG_FIL

  get_outp_back "$NB_SUB_JOBS"
  #echo " info-main: REJECTED='$REJECTED'" >> $LOG_FIL

fi

#------------------------
exit 0

  ViewVC Help
Powered by ViewVC 1.1.22