#!/bin/bash
# $Header: /u/gcmpack/MITgcm/tools/example_scripts/ACESgrid/aces_test_mp2_mth,v 1.2 2010/01/24 18:17:35 jmc Exp $
# $Name: $

# Watchdog for defunct (zombie) "$pNam" processes owned by the current user.
#
# Runs until a file named "stop_check_$sfx" exists in the current directory.
# Each cycle:
#   1. sleep $longChk seconds;
#   2. look in the `ps` output for lines matching the user name, "$pNam"
#      and '<defunct>';
#   3. if any are found, wait $shortChk seconds and re-check each PID;
#   4. for every PID that is still defunct, append a report to "$pLog" and
#      send SIGKILL to its parent process.

sfx='mp2'
#EXE="mpiexec -pernode -comm pmi -np $NCPU ./mitgcmuv"
# Seconds between two scans (an earlier "longChk=600" was immediately
# overridden by this value, so only the effective setting is kept).
longChk=300
# Grace period (seconds) before a detected zombie is re-checked and acted on.
shortChk=60
pNam='mpiexec'
# USER may be unset (e.g. under cron); fall back to the effective user name
# so the grep filter below never receives an empty pattern.
uNam=${USER:-$(id -un)}
HERE=$(pwd)
pLog="kill_$sfx.log"

#######################################
# Print the `ps` lines of defunct $pNam processes matching $uNam.
# Globals:   uNam (read), pNam (read)
# Arguments: none
# Outputs:   matching `ps wauxx` lines on stdout (PID is field 2)
#######################################
list_zombies() {
  ps wauxx | grep -- "$uNam" | grep -- "$pNam" | grep '<defunct>'
}

echo "start $0 from $HERE at: $(date) by user: $uNam"
#uNam='jmc' ; HERE='/home/jmc/test_ACES/output' ; cd $HERE

while [[ ! -f "stop_check_$sfx" ]]; do
  sleep "$longChk"
  # check for defunct proc
  nZ=$(list_zombies | wc -l)
  if (( nZ >= 1 )); then
    echo "found $nZ $pNam zombie processes at: $(date)"
    listZ=$(list_zombies | awk '{print $2}')
    # Give the parent a chance to reap its children before intervening.
    sleep "$shortChk"
    # $listZ is intentionally unquoted: it is a whitespace-separated PID list.
    for p1Z in $listZ; do
      p2Z=$(ps -f -p "$p1Z" | grep -- "$pNam" | grep '<defunct>' \
        | awk '{print $2}')
      if [[ "$p2Z" == "$p1Z" ]]; then
        #-- report to permanent log file
        {
          echo '--------------------'
          date
          uname -a
          ps -f -p "$p1Z"
        } >> "$pLog"
        #--
        # Field 3 of `ps -f` is the parent PID.
        ppZ=$(ps -f -p "$p1Z" | grep -- "$pNam" | awk '{print $3}')
        # The zombie may vanish between the two `ps` calls (empty ppZ), or it
        # may already have been re-parented to init; never run kill without a
        # sane numeric target.
        if [[ "$ppZ" =~ ^[0-9]+$ ]] && (( ppZ > 1 )); then
          echo " try to kill parent proc: $ppZ at: $(date)" | tee -a "$pLog"
          # SIGKILL is used on purpose: the parent is assumed to be hung.
          kill -9 "$ppZ"
          out=$?
          echo " return code: $out" | tee -a "$pLog"
        else
          echo " skip kill: no valid parent pid ('$ppZ') for proc: $p1Z at: $(date)" \
            | tee -a "$pLog"
        fi
        ps wauxx | grep -- "$uNam" | grep -- "$pNam" | tee -a "$pLog"
        echo '--------------------' >> "$pLog"
      else
        echo " proc: $p1Z no more Zombie at: $(date)"
      fi
    done
    nZ=$(list_zombies | wc -l)
    echo " $nZ $pNam zombie process remain at: $(date)"
  fi
done

ls -l "stop_check_$sfx"
exit