#!/bin/bash
# $Header: /u/gcmpack/MITgcm/tools/example_scripts/ACESgrid/check_aces_mp2,v 1.3 2011/11/23 09:38:43 jmc Exp $
# $Name: $
#
# Watchdog for hung "$pNam" runs.
#
# Until a file named "stop_check_$sfx" exists in the current directory, this
# script scans every $longChk seconds for defunct (zombie) "$pNam" processes
# owned by user $uNam.  When some are found it waits $shortChk seconds and
# re-checks each one; for every process that is still defunct it records the
# situation in "$pLog" and sends SIGKILL to all non-defunct "$pNam" processes
# that share the zombie's parent PID.
#
# Usage: run from the directory where the stop file and the log should live;
#        create "stop_check_$sfx" there to end the loop.
#
# NOTE: $(date) is intentionally left unquoted in the echo commands below:
#       word-splitting collapses the padding that `date` may put in front of
#       a single-digit day, which keeps the log lines in their usual format.

sfx='mp2'
#EXE="mpiexec -pernode -comm pmi -np $NCPU ./mitgcmuv"
longChk=300   # seconds between two scans
shortChk=60   # seconds to wait before confirming that a zombie persists
pNam='mitgcmuv'
# USER may not be exported (e.g. in a minimal environment); ask `id` then.
uNam=${USER:-$(id -un)}
HERE=$(pwd)
pLog="kill_$sfx.log"

# Print the `ps -f` lines of every process of $uNam whose line mentions $pNam.
list_procs() {
  ps -f -u "$uNam" | grep -- "$pNam"
}

echo "start $0 +from dir: $HERE +by user: $uNam"
echo " on: $(hostname) +at:" $(date)
#uNam='jmc' ; HERE='/home/jmc/test_ACES/output' ; cd $HERE

while test ! -f "stop_check_$sfx"; do
  # check for defunct proc
  nZ=$(list_procs | grep -c '<defunct>')
  if [ "$nZ" -ge 1 ]; then
    echo "===> found $nZ $pNam zombie processes at:" $(date)
    listZ=$(list_procs | grep '<defunct>' | awk '{print $2}')
    # Give the zombies a chance to be reaped before acting on them.
    sleep "$shortChk"
    # $listZ is a whitespace-separated PID list: splitting is intended.
    for p1Z in $listZ; do
      p2Z=$(ps -f -p "$p1Z" | grep '<defunct>' | awk '{print $2}')
      if test "x$p2Z" = "x$p1Z"; then
        #-- report to permanent log file
        date >> "$pLog"
        uname -a >> "$pLog"
        # Unquoted on purpose: every PID of the list is passed to ps.
        ps -f -p $listZ | tee -a "$pLog"
        # ps -f -p $p1Z | tee -a $pLog
        ppZ=$(ps -f -p "$p1Z" | grep -- "$pNam" | awk '{print $3}')
        #--- version-1 : try to kill parent of Zombie proc
        # echo " try to kill parent proc: $ppZ at:" `date` | tee -a $pLog
        # kill -9 $ppZ
        # out=$?
        # echo " return code: $out" | tee -a $pLog
        #--- version-2 : try to kill other pNam child proc from same parent
        echo "==> list of $pNam proc at:" $(date)
        list_procs
        # One "<pid>p<ppid>" token per live (non-defunct) $pNam process.
        #listP=`ps -f -u $uNam | grep $pNam | awk '{print $2 "p" $3}'`
        listP=$(list_procs | grep -v '<defunct>' | awk '{print $2 "p" $3}')
        echo "==> pZ=$p1Z : try to kill proc from same parent=$ppZ" | tee -a "$pLog"
        # ppZ is empty if the zombie disappeared since the check above;
        # there is then no parent to match and nothing to kill.
        if [ -n "$ppZ" ]; then
          ps -f -p "$ppZ"
          #echo " listP='$listP'"
          for xx in $listP; do
            pc=${xx%%p*}   # child PID  (text before the 'p')
            pp=${xx#*p}    # parent PID (text after the 'p')
            #echo " xx='$xx' ; child=$pc ; parent=$pp"
            #if test "x$pp" = "x$ppZ" -a "x$pc" != "x$p1Z" ; then
            if test "x$pp" = "x$ppZ"; then
              ps -f -p "$pc" | tee -a "$pLog"
              echo " killing proc: $pc" | tee -a "$pLog"
              kill -9 "$pc"
              out=$?
              echo " return code: $out" | tee -a "$pLog"
            fi
          done
        fi
        #---
        echo "==> list of remaining $pNam proc:" | tee -a "$pLog"
        list_procs | tee -a "$pLog"
        echo '--------------------' | tee -a "$pLog"
      else
        echo " proc: $p1Z no more Zombie at:" $(date)
      fi
    done
    nZ=$(list_procs | grep -c '<defunct>')
    echo " --> $nZ $pNam zombie process remain at:" $(date)
  fi
  sleep "$longChk"
done
ls -l "stop_check_$sfx"
exit