#! /usr/bin/env bash

# $Header: /u/gcmpack/MITgcm/tools/example_scripts/ACESgrid/aces_test_all,v 1.24 2011/08/24 13:21:43 jmc Exp $
# $Name: $

# submit list of jobs or get list of submitted jobs
sub_list_jobs()
{
# sub_list_jobs
#   For each test name in JOB_LIST, either submit the matching batch script
#   (action != 2) or only look for an already queued job (action = 2).
# input : JOB_LIST (space separated list of test names)
#         action, SUB_DIR, TST_DIR, QSUB, QSTAT, USER, LOG_FIL
# output: NB_SUB_JOBS (number of jobs submitted, or found in the queue)
#         (+ status of jobs: M_{job}=submitted or skipped )

  local job sfx script job_name in_queue do_up

  NB_SUB_JOBS=0
  for job in $JOB_LIST
  do
    #- suffix of the submission script that goes with this test name
    case "$job" in
      'mth') sfx="ifc_${job}" ;;
      'tuv') sfx="op64_${job}" ;;
      'mp2') sfx="${job}_mth" ;;
      'g77') sfx="${job}_adm" ;;
          *) sfx="${job}_mpi" ;;
    esac
    script="$SUB_DIR/aces_test_$sfx"

    if [[ ! -f "$script" ]] ; then
      echo "no file: aces_test_$sfx to submit" | tee -a "$LOG_FIL"
      printf -v "M_$job" '%s' 'skipped'
      continue
    fi

    job_name="tst_$job"
    #- query the queue only once, so that the decision taken and the lines
    #  reported in the log come from the same listing.
    #  ($QSTAT and $QSUB are left unquoted on purpose: they may carry options)
    in_queue=$($QSTAT -a | grep -e "$USER" | grep -e "$job_name")

    if [[ "${action:-0}" -eq 2 ]] ; then
      #-- to get outp back:
      if [[ -z "$in_queue" ]] ; then
        echo "did not find any job: $job_name" | tee -a "$LOG_FIL"
        printf -v "M_$job" '%s' 'skipped'
      else
        echo -n "found a job: $job_name" | tee -a "$LOG_FIL"
        printf '%s\n' "$in_queue" | tee -a "$LOG_FIL"
        printf -v "M_$job" '%s' 'submitted'
        NB_SUB_JOBS=$(( NB_SUB_JOBS + 1 ))
      fi
      continue
    fi

    #-- to submit job
    if [[ -n "$in_queue" ]] ; then
      echo "$job_name" | tee -a "$LOG_FIL"
      printf '%s\n' "$in_queue" | tee -a "$LOG_FIL"
      echo 'job already exist => skip this test' | tee -a "$LOG_FIL"
      printf -v "M_$job" '%s' 'skipped'
      continue
    fi

    #-- update code if not done within submitted script
    do_up=$(grep -c '^ *checkOut=0' "$script")
    if [[ ! -e "$TST_DIR/MITgcm_$job" ]] ; then do_up=0 ; fi
    if [[ "$do_up" -ge 1 ]] ; then
      echo "" >> "$LOG_FIL"
      echo " update $TST_DIR/MITgcm_$job :" | tee -a "$LOG_FIL"
      #- "&&": never run the update from the wrong directory
      ( cd "$TST_DIR/MITgcm_$job" &&
        cvs -q -d :pserver:cvsanon@mitgcm.org:/u/gcmpack update -P -d
      ) >> "$LOG_FIL" 2>&1
    fi
    echo -n " $job_name : " | tee -a "$LOG_FIL"
    $QSUB "$script" | tee -a "$LOG_FIL"
    printf -v "M_$job" '%s' 'submitted'
    NB_SUB_JOBS=$(( NB_SUB_JOBS + 1 ))
  done
  echo " info-sub-list: NB_SUB_JOBS='$NB_SUB_JOBS'" >> "$LOG_FIL"
}

# retrieve output when job is finished
# _outp_latest_dir run_dir kind
#   print the name of the most recent entry of "run_dir" that starts with
#   "{kind}_$outPfix" (kind = tr : results ; = rs : restart), if any.
_outp_latest_dir()
{
  ls -1 -t "$1" | grep -v tr_out | grep "^${2}_$outPfix" | head -1
}

# _outp_is_current dir_name
#   succeed if "dir_name" contains the starting date ($today) or the
#   current date, i.e. if it was produced by this round of tests.
_outp_is_current()
{
  [[ -n "$today" && "$1" == *"$today"* ]] && return 0
  [[ "$1" == *"$(date +%Y%m%d)"* ]]
}

# _outp_pack_and_mail run_dir dir_name archive sent_msg no_mpack_msg
#   tar+gzip "run_dir/dir_name" into "archive" and, if mpack is available
#   (HAVE_MPACK=t), e-mail it ; report in LOG_FIL with the given message.
_outp_pack_and_mail()
{
  local run_dir=$1 dir_name=$2 archive=$3
  rm -f "$archive"
  #- "&&": never archive from the wrong directory
  ( cd "$run_dir" && tar -czf "$archive" "./$dir_name" )
  if [[ "x$HAVE_MPACK" == xt ]] ; then
    "$MPACK" -s MITgcm-test -m 3555000 "$archive" jmc@mitgcm.org
    #- date is left unquoted to keep the log format unchanged
    echo "$4" $(date) | tee -a "$LOG_FIL"
  else
    echo "$5" | tee -a "$LOG_FIL"
  fi
}

get_outp_back()
{
# get_outp_back number_of_jobs
#   Poll the queue every "freq" minutes ; once a submitted job has left the
#   queue, pack its results (tr_*) and restart (rs_*) output directories and
#   e-mail them. Exit the script (status 1) if the time limit is exceeded.
# input : JOB_LIST (+ status of jobs: M_{job}=submitted )
#         TST_DIR, QSTAT, USER, LOG_FIL, outPfix, today, HAVE_MPACK, MPACK,
#         NB_SUB_JOBS (only used in messages)
# output: REJECTED (= list of fast-to-fail jobs)
#         (+ change status of jobs to: M_{job}=done )

  local remaining=${1:-0}
  local minutes=0 freq=10 fsec hrs
  local job ref job_name qstat_out qstat_rc in_queue run_dir out_dir failed

  REJECTED=
  fsec=$(( freq * 60 ))
  echo "Check every $freq mn for $remaining test(s) to finish" | tee -a "$LOG_FIL"
  echo "- start at :" $(date) | tee -a "$LOG_FIL"
  while [[ "$remaining" != 0 ]] ; do

    sleep "$fsec"
    minutes=$(( minutes + freq ))

    for job in $JOB_LIST ; do

      ref="M_$job"
      [[ "${!ref}" == 'submitted' ]] || continue

      job_name="tst_$job"
      #- capture the listing (no temporary file needed) and its status
      qstat_out=$($QSTAT -a)
      qstat_rc=$?
      if [[ "$qstat_rc" -ne 0 ]] ; then
        echo " $QSTAT returned with error code: $qstat_rc" | tee -a "$LOG_FIL"
        continue
      fi
      in_queue=$(printf '%s\n' "$qstat_out" | grep -e "$USER" | grep -c -e "$job_name")
      #- job still in the queue: check again next time
      [[ "$in_queue" == 0 ]] || continue

      run_dir="${TST_DIR}/MITgcm_${job}/verification"

      #- results output:
      out_dir=$(_outp_latest_dir "$run_dir" tr)
      failed=t
      if [[ -z "$out_dir" ]] ; then
        echo " no output found for $job" | tee -a "$LOG_FIL"
      elif ! _outp_is_current "$out_dir" ; then
        #- this is not the right output
        echo "tdir='$out_dir'" | tee -a "$LOG_FIL"
        echo "Output do not match, no email sent for $job" | tee -a "$LOG_FIL"
      else
        failed=f
        _outp_pack_and_mail "$run_dir" "$out_dir" "/tmp/tr_$outPfix-$job.tar.gz" \
            "Email sent for $job at: " " no email sent for $job (no mpack)"
      fi
      if [[ "$failed" == t && "$minutes" -eq "$freq" ]] ; then
        #- add to rejected list if it fails in less than "freq" minutes
        REJECTED="$REJECTED $job"
      fi

      #- restart output:
      out_dir=$(_outp_latest_dir "$run_dir" rs)
      if [[ -z "$out_dir" ]] ; then
        echo " no restart output for $job" | tee -a "$LOG_FIL"
      elif ! _outp_is_current "$out_dir" ; then
        #- this is not the right output
        echo "tdir='$out_dir'" | tee -a "$LOG_FIL"
        echo "Restart output do not match, no email sent for $job" | tee -a "$LOG_FIL"
      else
        _outp_pack_and_mail "$run_dir" "$out_dir" "/tmp/rs_$outPfix-$job.tar.gz" \
            "Email sent for $job restart:" " no email sent for $job restart (no mpack)"
      fi

      #- record successful sending
      printf -v "M_$job" '%s' 'done'
      remaining=$(( remaining - 1 ))
      #- relative path: relies on the current directory being $TST_DIR
      chmod 644 output/tst_"$job".std*
    done

    # give up after 2160 min (= 36 hrs) ; "long" queue is 24hrs = 1440min
    # NOTE(review): presumably the extra 12 hrs allow for waiting time in
    # the queue -- to be confirmed.
    if [[ "$minutes" -gt 2160 ]] ; then
      hrs=$(( minutes / 60 ))
      echo "Time expired after $minutes minutes ($hrs hours)" | tee -a "$LOG_FIL"
      echo ' ' "$remaining" '/' "$NB_SUB_JOBS" 'tests not yet finished' | tee -a "$LOG_FIL"
      exit 1
    fi

  done

  echo "Retrieving $NB_SUB_JOBS tests finish :" $(date) | tee -a "$LOG_FIL"
  echo " info-get-outp: REJECTED='$REJECTED'" >> "$LOG_FIL"

}

#---+----1----+----2----+----3----+----4----+----5----+----6----+----7-|--+----|
#-- Sequential part of the script starts here:
#---------------------------------------------

# action =1 : submit test jobs ; =2 : get jobs output ; =3 : do both ;
#        =4 : do both + resubmit once the fast-to-fail jobs (default)
case "$1" in
  '-subOnly') action=1 ; shift ;;
  '-getOnly') action=2 ; shift ;;
  '-sub+get') action=3 ; shift ;;
  '-double' ) action=4 ; shift ;;
           *) action=4 ;;
esac
#echo "action= $action , Arg= $# "

#- starting date, used to recognise the output directories of this round
today=$(date +%Y%m%d)

#- remaining arguments = list of tests to process (default: all of them)
if [[ $# -eq 0 ]]
then
  TEST_LIST='g77 gnu ifc pgi adm mp2 mth tuv'
else
  TEST_LIST="$*"
fi

# QSUB="/usr/local/bin/qsub"
# QSTAT="/usr/local/bin/qstat"
# TST_DIR="/mit/jm_c/test_acesgrid"
# logPfix='tst_submit'
# outPfix='acesgrid'
QSUB=qsub
QSTAT=qstat
TST_DIR="/home/jmc/test_aces"
logPfix='tst_all'
outPfix='aces'

#- MPACK and SUB_DIR are relative to $TST_DIR
MPACK="MITgcm_tools/mpack-1.6/mpack"
SUB_DIR="MITgcm_tools/example_scripts/ACESgrid"
TMP_FIL="$TST_DIR/output/TTT.$$"
LOG_FIL="$TST_DIR/output/$logPfix.$(date +%m%d).log"

#-- clean up old log files and start a new one:
#   (stop here if the directory is missing: the "rm" below must not be
#    run from an other directory)
cd "$TST_DIR/output" || { echo "cannot cd to $TST_DIR/output" >&2 ; exit 1 ; }

rm -f "$logPfix".*.log_bak
if [[ -f "$LOG_FIL" ]] ; then mv -f "$LOG_FIL" "${LOG_FIL}_bak" ; fi
echo -n '-- Starting: ' | tee -a "$LOG_FIL"
date | tee -a "$LOG_FIL"

#- keep only the 10 most recent log files
n=$(( $(ls "$logPfix".*.log | wc -l) - 10 ))
if [[ "$n" -gt 0 ]] ; then
  echo ' remove old log files:' | tee -a "$LOG_FIL"
  ls -lt "$logPfix".*.log | tail -n "$n" | tee -a "$LOG_FIL"
  ls -t "$logPfix".*.log | tail -n "$n" | xargs rm -f
fi

. /etc/profile.d/modules.sh
module list >> "$LOG_FIL" 2>&1

#-- now really do something:
cd "$TST_DIR" || { echo "cannot cd to $TST_DIR" | tee -a "$LOG_FIL" ; exit 1 ; }

JOB_LIST=$TEST_LIST
sub_list_jobs
#echo " info-main: NB_SUB_JOBS='$NB_SUB_JOBS'" >> $LOG_FIL

if [[ "$action" == 1 ]] ; then
  #- submit only: no output to wait for
  NB_JOBS2GET=0
else
  #date_str=`date +%Y%m%d`"_0"

  echo "" | tee -a "$LOG_FIL"
  #- build the mpack utility (from build_mpack in testreport):
  MPACKDIR=$(dirname "$MPACK")
  ( cd "$MPACKDIR" && ./configure && make ) > "TTT.build_mpack.$$" 2>&1
  RETVAL=$?
  if [[ "x$RETVAL" != x0 ]] ; then
    #- build log (TTT.build_mpack.$$) is kept in this case
    echo "Error building the mpack tools at: $MPACKDIR" | tee -a "$LOG_FIL"
    if [[ -x "$MPACK" ]] ; then
      HAVE_MPACK=t
      echo " use (old ?) executable:" | tee -a "$LOG_FIL"
      ls -l "$MPACK" | tee -a "$LOG_FIL"
    else
      HAVE_MPACK=f
    fi
  else
    if [[ -x "$MPACK" ]] ; then
      rm -f "TTT.build_mpack.$$"
      HAVE_MPACK=t
      echo "Building mpack: OK" | tee -a "$LOG_FIL"
    else
      echo " $MPACK not executable" | tee -a "$LOG_FIL"
      HAVE_MPACK=f
    fi
  fi
  echo "" >> "$LOG_FIL"

  NB_JOBS2GET=$NB_SUB_JOBS
fi

#- when it's done, retrieve output and send e-mail
get_outp_back "$NB_JOBS2GET"
#echo " info-main: REJECTED='$REJECTED'" >> $LOG_FIL

if [[ "$action" == 4 && -n "$REJECTED" ]] ; then

  echo "Try 2nd round for fast-failed jobs: '$REJECTED'" | tee -a "$LOG_FIL"
  JOB_LIST=$REJECTED
  sub_list_jobs
  #echo " info-main: NB_SUB_JOBS='$NB_SUB_JOBS'" >> $LOG_FIL

  get_outp_back "$NB_SUB_JOBS"
  #echo " info-main: REJECTED='$REJECTED'" >> $LOG_FIL

fi

#------------------------
exit 0