3 |
# $Header$ |
# $Header$ |
4 |
# $Name$ |
# $Name$ |
5 |
|
|
6 |
# action =2 : submit test jobs ; =1 : get jobs output ; =3 : do both |
# submit list of jobs or get list of submitted jobs |
7 |
|
#-- sub_list_jobs : submit the test jobs of JOB_LIST, or (when $action = 2)
#   only look for jobs of JOB_LIST that are already in the queue.
#
#   input (globals):
#     JOB_LIST : blank-separated list of test names (e.g. "g77 gnu mth")
#     action   : 2 => do not submit, just register jobs found in the queue;
#                any other value => submit jobs that are not yet queued
#     SUB_DIR  : directory holding the submission scripts aces_test_{sfx}
#     QSUB, QSTAT : submit / queue-status commands
#     USER     : owner name used to filter the $QSTAT listing
#     LOG_FIL  : log file (messages are also echoed to stdout)
#   output (globals):
#     NB_SUB_JOBS : number of jobs that have to be waited for
#     M_{name}    : per-test status, 'submitted' or 'skipped'
#   Note: i, sfx, JOB, job_exist are left as globals, as they have always been.
sub_list_jobs()
{
  NB_SUB_JOBS=0
  #- JOB_LIST is a blank-separated list: word-splitting is intended here
  for i in $JOB_LIST
  do
    #- name of the submission script suffix for this test
    case "$i" in
      mth) sfx="ifc_$i"  ;;
      tuv) sfx="op64_$i" ;;
      mp2) sfx="${i}_mth" ;;
      g77) sfx="${i}_adm" ;;
        *) sfx="${i}_mpi" ;;
    esac

    if test ! -f "$SUB_DIR/aces_test_$sfx" ; then
      echo "no file: aces_test_$sfx to submit" | tee -a "$LOG_FIL"
      eval "M_$i=skipped"
      continue
    fi

    JOB="tst_$i"
    #- query the queue once, so that the count and the lines written to
    #  the log always agree ($QSTAT left unquoted: it may carry options)
    job_info=$($QSTAT -a | grep -- "$USER" | grep -- "$JOB")
    #- grep -c never pads its output (unlike "wc -l" on some systems)
    job_exist=$(printf '%s' "$job_info" | grep -c '')

    if test "x$action" = x2 ; then
      #-- to get outp back:
      if test "x_$job_exist" = x_0 ; then
        echo "did not find any job: $JOB" | tee -a "$LOG_FIL"
        eval "M_$i=skipped"
      else
        printf '%s' "found a job: $JOB" | tee -a "$LOG_FIL"
        printf '%s\n' "$job_info" | tee -a "$LOG_FIL"
        eval "M_$i=submitted"
        NB_SUB_JOBS=$(( $NB_SUB_JOBS + 1 ))
      fi
    else
      #-- to submit job
      if test "x_$job_exist" = x_0 ; then
        printf '%s' " $JOB : " | tee -a "$LOG_FIL"
        $QSUB "$SUB_DIR/aces_test_$sfx" | tee -a "$LOG_FIL"
        eval "M_$i=submitted"
        NB_SUB_JOBS=$(( $NB_SUB_JOBS + 1 ))
      else
        echo "$JOB" | tee -a "$LOG_FIL"
        printf '%s\n' "$job_info" | tee -a "$LOG_FIL"
        echo 'job already exist => skip this test' | tee -a "$LOG_FIL"
        eval "M_$i=skipped"
      fi
    fi
  done
  echo " info-sub-list: NB_SUB_JOBS='$NB_SUB_JOBS'" >> "$LOG_FIL"
}
58 |
|
|
59 |
|
# retrieve output when job is finished |
60 |
|
#-- _gob_ship_output kind test_name run_dir   (private helper of get_outp_back)
#   Pack the most recent {kind}_aces* directory of run_dir into
#   /tmp/{kind}_aces-{test_name}.tar.gz and mail it with $MPACK when
#   HAVE_MPACK = t.   kind: "tr" = results output, "rs" = restart output.
#   The directory name must contain $today (if set) or the current date.
#   returns 0 if an archive was made, 1 if no (matching) directory was found.
_gob_ship_output()
{
  _gob_kind=$1 ; _gob_tst=$2 ; _gob_dir=$3
  case "$_gob_kind" in
    rs) _gob_none=" no restart output for $_gob_tst"
        _gob_bad="Restart output do not match, no email sent for $_gob_tst"
        _gob_sent="Email sent for $_gob_tst restart:"
        _gob_nomail=" no email sent for $_gob_tst restart (no mpack)" ;;
     *) _gob_none=" no output found for $_gob_tst"
        _gob_bad="Output do not match, no email sent for $_gob_tst"
        _gob_sent="Email sent for $_gob_tst at: "
        _gob_nomail=" no email sent for $_gob_tst (no mpack)" ;;
  esac

  #- most recent candidate directory (by modification time)
  tdir=$(ls -1 -t "$_gob_dir" | grep -v tr_out | grep "^${_gob_kind}_aces" | head -1)
  if test "x$tdir" = x ; then
    echo "$_gob_none" | tee -a "$LOG_FIL"
    return 1
  fi

  #- check this is the right output: name has to carry the starting day
  #  or the current day (an empty $today never counts as a match)
  chk=0
  if test "x$today" != x ; then
    case "$tdir" in *"$today"*) chk=1 ;; esac
  fi
  if test $chk = 0 ; then
    curday=$(date +%Y%m%d)
    case "$tdir" in *"$curday"*) chk=1 ;; esac
  fi
  if test $chk = 0 ; then
    echo "tdir='$tdir'" | tee -a "$LOG_FIL"
    echo "$_gob_bad" | tee -a "$LOG_FIL"
    return 1
  fi

  _gob_tgz="/tmp/${_gob_kind}_aces-${_gob_tst}.tar.gz"
  rm -f "$_gob_tgz"
  #- only run tar if the cd worked (do not archive from a wrong directory)
  ( cd "$_gob_dir" && tar -czf "$_gob_tgz" "./$tdir" )
  if test "x$HAVE_MPACK" = xt ; then
    $MPACK -s MITgcm-test -m 3555000 "$_gob_tgz" jmc@mitgcm.org
    #- date output deliberately unquoted: keeps the log format unchanged
    echo "$_gob_sent" $(date) | tee -a "$LOG_FIL"
  else
    echo "$_gob_nomail" | tee -a "$LOG_FIL"
  fi
  return 0
}

#-- get_outp_back number_of_jobs [poll_minutes [max_minutes]]
#   Wait for the submitted jobs to leave the queue, then retrieve their
#   output (and restart output) and send it by e-mail.
#
#   arguments:
#     number_of_jobs : how many jobs with status 'submitted' to wait for
#     poll_minutes   : polling period in minutes (default: 10)
#     max_minutes    : give up (exit 1) once more than this many minutes
#                      have elapsed with jobs still pending (default: 2160)
#   input (globals):
#     JOB_LIST (+ status of jobs: M_{name}=submitted ), QSTAT, USER,
#     TST_DIR, today, HAVE_MPACK, MPACK, NB_SUB_JOBS, LOG_FIL
#   output (globals):
#     REJECTED (= list of fast-to-fail jobs: no valid results output
#               at the very first poll)
#     (+ change status of jobs to: M_{name}=done )
#   Note: the current directory is expected to contain output/tst_{name}.std*
get_outp_back()
{
  nbJobs=${1:-0}
  freq=${2:-10}
  max_minutes=${3:-2160}
  REJECTED=
  minutes=0
  fsec=$(( $freq * 60 ))
  echo "Check every $freq mn for $nbJobs test(s) to finish" | tee -a "$LOG_FIL"
  echo "- start at :" $(date) | tee -a "$LOG_FIL"

  while test "$nbJobs" -gt 0 ; do

    sleep "$fsec"
    minutes=$(( $minutes + $freq ))

    #- JOB_LIST is a blank-separated list: word-splitting is intended here
    for i in $JOB_LIST ; do

      eval "comm=\$M_$i"
      test "x$comm" = xsubmitted || continue

      JOB="tst_$i"
      #- $QSTAT left unquoted: it may carry options
      qstat_out=$($QSTAT -a)
      RETVAL=$?
      if test "x$RETVAL" != x0 ; then
        echo " $QSTAT returned with error code: $RETVAL" | tee -a "$LOG_FIL"
        continue
      fi
      ready_to_send=$(printf '%s\n' "$qstat_out" | grep -- "$USER" | grep -c -- "$JOB")
      #- job still in the queue: look again at the next poll
      test "x_$ready_to_send" = x_0 || continue

      run_dir="${TST_DIR}/MITgcm_$i/verification"
      #- results output:
      if _gob_ship_output tr "$i" "$run_dir" ; then
        :
      elif test "$minutes" -eq "$freq" ; then
        #- add to rejected list if it fails in less than "freq" minutes
        REJECTED="$REJECTED $i"
      fi
      #- restart output:
      _gob_ship_output rs "$i" "$run_dir"

      #- record successful sending
      eval "M_$i=done"
      nbJobs=$(( $nbJobs - 1 ))
      chmod 644 output/tst_"$i".std*
    done

    #- default limit is 2160 min = 36 hrs ("long" queue is 24hrs = 1440min);
    #  only a time-out if some jobs are really still pending
    if test "$nbJobs" -gt 0 && test "$minutes" -gt "$max_minutes" ; then
      hrs=$(( $minutes / 60 ))
      echo "Time expired after $minutes minutes ($hrs hours)" | tee -a "$LOG_FIL"
      echo "  $nbJobs / $NB_SUB_JOBS tests not yet finished" | tee -a "$LOG_FIL"
      exit 1
    fi

  done

  echo "Retrieving $NB_SUB_JOBS tests finish :" $(date) | tee -a "$LOG_FIL"
  echo " info-get-outp: REJECTED='$REJECTED'" >> "$LOG_FIL"

}
174 |
|
|
175 |
|
#---+----1----+----2----+----3----+----4----+----5----+----6----+----7-|--+----| |
176 |
|
#-- Sequential part of the script starts here: |
177 |
|
#--------------------------------------------- |
178 |
|
|
179 |
|
# action =1 : submit test jobs ; =2 : get jobs output ; =3 : do both |
180 |
case $1 in |
case $1 in |
181 |
'-subOnly') action=2 ; shift ;; |
'-subOnly') action=1 ; shift ;; |
182 |
'-getOnly') action=1 ; shift ;; |
'-getOnly') action=2 ; shift ;; |
183 |
'-sub+get') action=3 ; shift ;; |
'-sub+get') action=3 ; shift ;; |
184 |
*) action=3 ;; |
'-double' ) action=4 ; shift ;; |
185 |
|
*) action=4 ;; |
186 |
esac |
esac |
187 |
#echo "action= $action , Arg= $# " |
#echo "action= $action , Arg= $# " |
188 |
|
|
190 |
|
|
191 |
if test $# = 0 |
if test $# = 0 |
192 |
then |
then |
193 |
tst_list='g77 gnu ifc pgi adm mp2 mth tuv' |
TEST_LIST='g77 gnu ifc pgi adm mp2 mth tuv' |
194 |
else |
else |
195 |
tst_list=$* |
TEST_LIST=$* |
196 |
fi |
fi |
197 |
|
|
198 |
# QSUB="/usr/local/bin/qsub" |
# QSUB="/usr/local/bin/qsub" |
227 |
#-- now really do something: |
#-- now really do something: |
228 |
cd $TST_DIR |
cd $TST_DIR |
229 |
|
|
230 |
nbtst=0 |
JOB_LIST=$TEST_LIST |
231 |
for i in $tst_list |
sub_list_jobs |
232 |
do |
#echo " info-main: NB_SUB_JOBS='$NB_SUB_JOBS'" >> $LOG_FIL |
|
case $i in |
|
|
'mth') sfx='ifc_'${i} ;; |
|
|
'tuv') sfx='op64_'${i} ;; |
|
|
'mp2') sfx=${i}'_mth' ;; |
|
|
'g77') sfx=${i}'_adm' ;; |
|
|
*) sfx=${i}'_mpi' ;; |
|
|
esac |
|
|
if test -f $SUB_DIR/aces_test_$sfx ; then |
|
|
JOB="tst_"$i |
|
|
job_exist=`$QSTAT -a | grep $USER | grep $JOB | wc -l` |
|
|
if [ $action -ge 2 ] ; then |
|
|
#-- to submit job |
|
|
if test "x_$job_exist" = x_0 ; then |
|
|
echo -n " $JOB : " | tee -a $LOG_FIL |
|
|
$QSUB $SUB_DIR/aces_test_$sfx | tee -a $LOG_FIL |
|
|
eval M_$i='submitted' |
|
|
nbtst=`expr $nbtst + 1` |
|
|
else |
|
|
echo $JOB | tee -a $LOG_FIL |
|
|
$QSTAT -a | grep $USER | grep $JOB | tee -a $LOG_FIL |
|
|
echo 'job already exist => skip this test' | tee -a $LOG_FIL |
|
|
eval M_$i='skipped' |
|
|
fi |
|
|
else |
|
|
#-- to get outp back: |
|
|
if test "x_$job_exist" = x_0 ; then |
|
|
echo "did not find any job: $JOB" | tee -a $LOG_FIL |
|
|
eval M_$i='skipped' |
|
|
else |
|
|
echo -n "found a job: $JOB" | tee -a $LOG_FIL |
|
|
$QSTAT -a | grep $USER | grep $JOB | tee -a $LOG_FIL |
|
|
eval M_$i='submitted' |
|
|
nbtst=`expr $nbtst + 1` |
|
|
fi |
|
|
fi |
|
|
else |
|
|
echo 'no file:' aces_test_$sfx 'to submit' | tee -a $LOG_FIL |
|
|
eval M_$i='skipped' |
|
|
fi |
|
|
done |
|
233 |
|
|
234 |
if test $action = 2 ; then |
if test $action = 1 ; then |
235 |
count=0 |
NB_JOBS2GET=0 |
236 |
else |
else |
237 |
#date_str=`date +%Y%m%d`"_0" |
#date_str=`date +%Y%m%d`"_0" |
238 |
|
|
|
MPACKDIR=`dirname $MPACK` |
|
239 |
#- build the mpack utility (from build_mpack in testreport): |
#- build the mpack utility (from build_mpack in testreport): |
240 |
|
MPACKDIR=`dirname $MPACK` |
241 |
( cd $MPACKDIR && ./configure && make ) > TTT.build_mpack.$$ 2>&1 |
( cd $MPACKDIR && ./configure && make ) > TTT.build_mpack.$$ 2>&1 |
242 |
RETVAL=$? |
RETVAL=$? |
243 |
if test "x$RETVAL" != x0 ; then |
if test "x$RETVAL" != x0 ; then |
262 |
fi |
fi |
263 |
echo |
echo |
264 |
|
|
265 |
count=$nbtst |
NB_JOBS2GET=$NB_SUB_JOBS |
266 |
fi |
fi |
267 |
|
|
268 |
#- when it's done, retrieve output and send e-mail |
#- when it's done, retrieve output and send e-mail |
269 |
minutes=0 ; freq=10 |
get_outp_back $NB_JOBS2GET |
270 |
fsec=`expr $freq \* 60` |
#echo " info-main: REJECTED='$REJECTED'" >> $LOG_FIL |
|
echo "Check every $freq mn for $count test(s) to finish" | tee -a $LOG_FIL |
|
|
echo "- start at :" `date` | tee -a $LOG_FIL |
|
|
while test $count != 0 ; do |
|
|
|
|
|
sleep $fsec |
|
|
minutes=$(( $minutes + $freq )) |
|
|
|
|
|
for i in $tst_list ; do |
|
|
|
|
|
eval comm=\$M_$i |
|
|
if test $comm = 'submitted' ; then |
|
|
JOB="tst_"$i |
|
|
$QSTAT -a > $TMP_FIL |
|
|
RETVAL=$? |
|
|
ready_to_send=`grep $USER $TMP_FIL | grep $JOB | wc -l` |
|
|
rm -f $TMP_FIL |
|
|
if test "x$RETVAL" != x0 ; then |
|
|
echo " $QSTAT returned with error code: $RETVAL" | tee -a $LOG_FIL |
|
|
continue |
|
|
fi |
|
271 |
|
|
272 |
if test "x_$ready_to_send" = x_0 ; then |
if test $action = 4 -a "x$REJECTED" != x ; then |
|
run_dir=${TST_DIR}"/MITgcm_"$i"/verification" |
|
|
#- results output: |
|
|
tdir=`ls -1 -t $run_dir | grep -v tr_out | grep '^tr_aces' | head -1` |
|
|
if test "x$tdir" != x ; then |
|
|
#- check this is the right output |
|
|
chk=`echo $tdir | grep -c $today` |
|
|
if test $chk = '0' ; then |
|
|
curday=`date +%Y%m%d` |
|
|
chk=`echo $tdir | grep -c $curday` |
|
|
fi |
|
|
if test $chk = '0' ; then |
|
|
echo "tdir='$tdir'" | tee -a $LOG_FIL |
|
|
echo "Output do not match, no email sent for $i" | tee -a $LOG_FIL |
|
|
else |
|
|
rm -f "/tmp/tr_aces-"$i".tar.gz" |
|
|
( cd $run_dir ; tar -czf "/tmp/tr_aces-"$i".tar.gz" ./$tdir ) |
|
|
if test "x$HAVE_MPACK" = xt ; then |
|
|
$MPACK -s MITgcm-test -m 3555000 "/tmp/tr_aces-"$i".tar.gz" jmc@mitgcm.org |
|
|
echo "Email sent for $i at: " `date` | tee -a $LOG_FIL |
|
|
else |
|
|
echo " no email sent for $i (no mpack)" | tee -a $LOG_FIL |
|
|
fi |
|
|
fi |
|
|
else |
|
|
echo " no output found for $i" | tee -a $LOG_FIL |
|
|
fi |
|
|
#- restart output: |
|
|
tdir=`ls -1 -t $run_dir | grep -v tr_out | grep '^rs_aces' | head -1` |
|
|
if test "x$tdir" != x ; then |
|
|
#- check this is the right output |
|
|
chk=`echo $tdir | grep -c $today` |
|
|
if test $chk = '0' ; then |
|
|
curday=`date +%Y%m%d` |
|
|
chk=`echo $tdir | grep -c $curday` |
|
|
fi |
|
|
if test $chk = '0' ; then |
|
|
echo "tdir='$tdir'" | tee -a $LOG_FIL |
|
|
echo "Restart output do not match, no email sent for $i" | tee -a $LOG_FIL |
|
|
else |
|
|
rm -f "/tmp/rs_aces-"$i".tar.gz" |
|
|
( cd $run_dir ; tar -czf "/tmp/rs_aces-"$i".tar.gz" ./$tdir ) |
|
|
if test "x$HAVE_MPACK" = xt ; then |
|
|
$MPACK -s MITgcm-test -m 3555000 "/tmp/rs_aces-"$i".tar.gz" jmc@mitgcm.org |
|
|
echo "Email sent for $i restart:" `date` | tee -a $LOG_FIL |
|
|
else |
|
|
echo " no email sent for $i restart (no mpack)" | tee -a $LOG_FIL |
|
|
fi |
|
|
fi |
|
|
else |
|
|
echo " no restart output for $i" | tee -a $LOG_FIL |
|
|
fi |
|
|
#- record successful sending |
|
|
eval M_$i=done |
|
|
count=`expr $count - 1` |
|
|
chmod 644 output/tst_$i.std* |
|
|
fi |
|
|
fi |
|
|
done |
|
273 |
|
|
274 |
# "long" queue is 24hrs = 24*60min = 1440min |
echo "Try 2nd round for fast-failed jobs: '$REJECTED'" | tee -a $LOG_FIL |
275 |
if test $minutes -gt 2160 ; then |
JOB_LIST=$REJECTED |
276 |
hrs=$(( $minutes / 60 )); |
sub_list_jobs |
277 |
echo "Time expired after $minutes minutes ($hrs hours)" | tee -a $LOG_FIL |
#echo " info-main: NB_SUB_JOBS='$NB_SUB_JOBS'" >> $LOG_FIL |
|
echo ' ' $count '/' $nbtst 'tests not yet finished' | tee -a $LOG_FIL |
|
|
exit 1 |
|
|
fi |
|
278 |
|
|
279 |
done |
get_outp_back $NB_SUB_JOBS |
280 |
|
#echo " info-main: REJECTED='$REJECTED'" >> $LOG_FIL |
281 |
|
|
282 |
echo "Retrieving $nbtst tests finish :" `date` | tee -a $LOG_FIL |
fi |
|
exit 0 |
|
283 |
|
|
284 |
|
#------------------------ |
285 |
|
exit 0 |