#!/bin/bash

# $Header: /u/gcmpack/MITgcm/tools/example_scripts/ACESgrid/aces_check_mp2,v 1.3 2011/11/23 09:38:43 jmc Exp $
# $Name: $

# Watchdog for defunct (zombie) "$pNam" processes owned by one user.
#
# Each pass:
#   1. take a "ps -f -u <user>" snapshot and select the lines that contain
#      both $pNam and "<defunct>";
#   2. if any were found, wait $shortChk seconds and re-check each PID;
#   3. for every PID that is still defunct: append a report to $pLog and
#      send SIGKILL to its parent process;
#   4. sleep $longChk seconds.
# The loop ends as soon as a file named "stop_check_$sfx" exists in the
# current directory.
#
# Note: no "set -e" here; grep legitimately returns non-zero whenever no
# matching process is found.

sfx='mp2'
#EXE="mpiexec -pernode -comm pmi -np $NCPU ./mitgcmuv"
longChk=300   # seconds slept at the end of every pass
shortChk=60   # seconds waited before confirming that a proc is still defunct
pNam='mpiexec'
# USER is not guaranteed to be set in every environment; fall back on id
# so that "ps -u" never receives an empty user name.
uNam=${USER:-$(id -un)}
HERE=$(pwd)
pLog="kill_$sfx.log"

# Print the "ps -f" lines of user $uNam that match $pNam.
# ps output is captured before filtering so that the filtering commands
# themselves can never show up in the listing.
list_user_procs() {
  local snap
  snap=$(ps -f -u "$uNam")
  printf '%s\n' "$snap" | grep -e "$pNam"
}

# Print the PID (one per line) of each defunct $pNam process of user $uNam.
list_zombies() {
  list_user_procs | grep '<defunct>' | awk '{print $2}'
}

# Succeed if process $1 is (still) reported as defunct by ps.
is_zombie() {
  local pid=$1 seen
  seen=$(ps -f -p "$pid" | grep '<defunct>' | awk '{print $2}')
  [[ -n "$seen" && "$seen" == "$pid" ]]
}

# Report zombie process $1 to the permanent log file and try to kill its
# parent process.
report_and_kill_parent() {
  local p1Z=$1 ppZ out

  #-- report to permanent log file
  { date ; uname -a ; } >> "$pLog"
  ps -f -p "$p1Z" | tee -a "$pLog"
  ppZ=$(ps -f -p "$p1Z" | grep -e "$pNam" | awk '{print $3}')

  #--- version-1 : try to kill parent of Zombie proc
  # Only signal a real parent: ppZ is empty if the zombie vanished in the
  # meantime, and PID 1 must never be targeted.
  if [[ "$ppZ" =~ ^[1-9][0-9]*$ && "$ppZ" != 1 ]] ; then
    echo " try to kill parent proc: $ppZ at: $(date)" | tee -a "$pLog"
    kill -9 "$ppZ"
    out=$?
    echo " return code: $out" | tee -a "$pLog"
  else
    echo " skip kill: no valid parent proc ('$ppZ') at: $(date)" | tee -a "$pLog"
  fi

  #--- version-2 : try to kill other pNam child proc from same parent
  # echo " list of $pNam proc at:" `date`
  # ps -f -u $uNam | grep $pNam
  # listP=`ps -f -u $uNam | grep $pNam | awk '{print $2 "p" $3}'`
  # echo " try to kill proc from same parent=$ppZ" | tee -a $pLog
  # ps -f -p $ppZ
  # #echo " listP='$listP'"
  # for xx in $listP
  # do
  #   pc=`echo $xx | sed 's/p/ /' | awk '{print $1}'`
  #   pp=`echo $xx | sed 's/p/ /' | awk '{print $2}'`
  #   #echo " xx='$xx' ; child=$pc ; parent=$pp"
  #   if test "x$pp" = "x$ppZ" -a "x$pc" != "x$p1Z" ; then
  #     ps -f -p $pc | tee -a $pLog
  #     echo " killing proc: $pc" | tee -a $pLog
  #     kill -9 $pc
  #     out=$?
  #     echo " return code: $out" | tee -a $pLog
  #   fi
  # done
  #---

  echo " list of remaining $pNam proc:" | tee -a "$pLog"
  list_user_procs | tee -a "$pLog"
  echo '--------------------' | tee -a "$pLog"
}

echo "start $0 +from dir: $HERE +by user: $uNam"
echo " on: $(hostname) +at: $(date)"
#uNam='jmc' ; HERE='/home/jmc/test_ACES/output' ; cd $HERE

while [[ ! -f "stop_check_$sfx" ]] ; do
  # check for defunct proc (count and list come from the same ps snapshot)
  listZ=()
  while IFS= read -r zPid ; do
    if [[ -n "$zPid" ]] ; then
      listZ+=("$zPid")
    fi
  done < <(list_zombies)
  nZ=${#listZ[@]}

  if [[ "$nZ" -ge 1 ]] ; then
    echo "===> found $nZ $pNam zombie processes at: $(date)"
    # give the candidates some time to be reaped before acting on them
    sleep "$shortChk"
    for p1Z in "${listZ[@]}" ; do
      if is_zombie "$p1Z" ; then
        report_and_kill_parent "$p1Z"
      else
        echo " proc: $p1Z no more Zombie at: $(date)"
      fi
    done
    nZ=$(list_zombies | grep -c '')
    echo " --> $nZ $pNam zombie process remain at: $(date)"
  fi
  sleep "$longChk"
done

ls -l "stop_check_$sfx"
exit