#!/bin/bash
# $Header: /home/ubuntu/mnt/e9_copy/MITgcm/tools/example_scripts/ACESgrid/Attic/check_aces_mp2,v 1.4 2012/03/22 20:05:27 jmc Exp $
# $Name: $
#
# Watchdog for hung MPI runs of the "mitgcmuv" executable.
#
# Every $longChk seconds, look for zombie (<defunct>) processes named $pNam
# that belong to the current user. When one is found, wait $shortChk seconds
# and check it again; if it is still a zombie, kill (SIGKILL) every live
# $pNam process that shares the zombie's parent, and record what was done
# in the permanent log file $pLog.
#
# Usage:  run from the directory of the job; the log file is written there.
# Stop:   create a file named "stop_check_$sfx" in that same directory; the
#         loop ends at the next wake-up.
#
# Note: "set -e" / "pipefail" are deliberately NOT enabled: grep returning 1
# ("no match") is the normal, healthy case for every pipeline below.

sfx='mp2'
#EXE="mpiexec -pernode -comm pmi -np $NCPU ./mitgcmuv"
longChk=300          # seconds between two scans
shortChk=60          # grace period before a zombie is acted upon
pNam='mitgcmuv'      # process name to watch
zTag='<defunct>'     # marker that "ps -f" appends to zombie processes
uNam=${USER:-$(id -un)}
HERE=$(pwd)
pLog="kill_$sfx.log"

echo "start $0 +from dir: $HERE +by user: $uNam"
echo " on: $(hostname) +at:" "$(date)"
#uNam='jmc' ; HERE='/home/jmc/test_ACES/output' ; cd $HERE

# Print the "ps -f" lines of $uNam's processes that mention $pNam.
list_procs() {
  ps -f -u "$uNam" | grep -F -- "$pNam"
}

# Print only the $pNam lines that ps marks as zombie.
list_zombies() {
  list_procs | grep -F -- "$zTag"
}

# Print the number of $pNam zombie processes (0 when there are none).
count_zombies() {
  list_procs | grep -c -F -- "$zTag"
}

while [[ ! -f "stop_check_$sfx" ]]; do
  # check for defunct proc
  nZ=$(count_zombies)
  if (( nZ >= 1 )); then
    echo "===> found $nZ $pNam zombie processes at:" "$(date)"
    listZ=$(list_zombies | awk '{print $2}')
    # comma-separated form of the same PID list, as expected by "ps -p"
    listZcsv=${listZ//$'\n'/,}
    sleep "$shortChk"

    # PIDs are plain integers, so word-splitting $listZ is intended here.
    for p1Z in $listZ; do
      # Re-check: only act if this PID is still a zombie after the grace period.
      p2Z=$(ps -f -p "$p1Z" | grep -F -- "$zTag" | awk '{print $2}')
      if [[ "$p2Z" == "$p1Z" ]]; then
        #-- report to permanent log file
        date >> "$pLog"
        uname -a >> "$pLog"
        ps -f -p "$listZcsv" | tee -a "$pLog"
        # ps -f -p $p1Z | tee -a $pLog
        ppZ=$(ps -f -p "$p1Z" | grep -F -- "$pNam" | awk '{print $3}')

        #--- version-1 : try to kill parent of Zombie proc
        # echo " try to kill parent proc: $ppZ at:" `date` | tee -a $pLog
        # kill -9 $ppZ
        # out=$?
        # echo " return code: $out" | tee -a $pLog

        #--- version-2 : try to kill other pNam child proc from same parent
        echo "==> list of $pNam proc at:" "$(date)"
        list_procs
        # Snapshot of live (non-zombie) procs, encoded as "<pid>p<ppid>".
        #listP=`ps -f -u $uNam | grep $pNam | awk '{print $2 "p" $3}'`
        listP=$(list_procs | grep -v -F -- "$zTag" | awk '{print $2 "p" $3}')
        echo "==> pZ=$p1Z : try to kill proc from same parent=$ppZ" | tee -a "$pLog"

        # The zombie may have been reaped since the re-check; without a
        # parent PID there is nothing to match against, so skip the kill pass.
        if [[ -n "$ppZ" ]]; then
          ps -f -p "$ppZ"
          #echo " listP='$listP'"
          for xx in $listP; do
            pc=${xx%%p*}     # child pid
            pp=${xx#*p}      # its parent pid
            #echo " xx='$xx' ; child=$pc ; parent=$pp"
            #if test "x$pp" = "x$ppZ" -a "x$pc" != "x$p1Z" ; then
            if [[ "$pp" == "$ppZ" ]]; then
              ps -f -p "$pc" | tee -a "$pLog"
              echo " killing proc: $pc" | tee -a "$pLog"
              # SIGKILL on purpose: siblings of a zombie MPI rank are
              # typically stuck and do not react to a polite SIGTERM.
              kill -9 "$pc"
              out=$?
              echo " return code: $out" | tee -a "$pLog"
            fi
          done
        fi
        #---
        echo "==> list of remaining $pNam proc:" | tee -a "$pLog"
        list_procs | tee -a "$pLog"
        echo '--------------------' | tee -a "$pLog"
      else
        echo " proc: $p1Z no more Zombie at:" "$(date)"
      fi
    done

    nZ=$(count_zombies)
    echo " --> $nZ $pNam zombie process remain at:" "$(date)"
  fi
  sleep "$longChk"
done

ls -l "stop_check_$sfx"
exit