1. 程式人生 > >Shell指令碼監控Storm叢集,郵件報警

Shell指令碼監控Storm叢集,郵件報警

這個指令碼只是一個臨時方案,是在全面的監控、報警系統完成之前,為了節省一定的人力而寫的。

之前寫過一個指令碼,用來監控 Storm 的 Nimbus 和 Supervisor 程序:在檢測到程序不存在時會進行重啟。在實際使用中發現,該指令碼雖然可以不斷嘗試重啟,但某些情況下並不能完成自動重啟,人為的介入仍然是必要的。所以對指令碼進行了修改,增加了對重啟情況的檢測:如果檢測到重啟失敗,則會通過郵件通知相關人員。遇到的困難主要在於獲取 ssh 操作的返回值;思路比較簡單,指令碼內容如下:

#!/bin/bash
#
# Watchdog loop for a Storm cluster.
#
# Each cycle:
#   1. Starts the local nimbus and ui processes if `jps -l` does not list them.
#   2. Over ssh, starts the supervisor on every host in $slaves where `jps`
#      does not list it.
#   3. Sleeps 40s, re-checks the hosts that needed a start, runs the mail
#      script for hosts that are still down and drops them from $slaves.
#   4. If nimbus needed a start, re-checks it and runs the mail script when it
#      is still down.
#   5. Sleeps 20s and repeats.
#
# Assumes `jps` and `storm` are on PATH locally and (after sourcing the remote
# profiles) on the slaves, and that ssh to the slaves is non-interactive.

slaves="cdn36 cdn37 cdn39 cdn21 cdn22 cdn23"
stopnode=""

# Marker searched for in the remote output: present only when a supervisor
# PID was found.
alived="alived"

# Remote command: start the supervisor if it is not running.
# - $(...) with \$1 keeps awk's field reference from being expanded by the
#   remote shell (inside backticks/double quotes it would expand to empty).
# - The worker directory is removed before the restart.
# - All three standard streams of the background process are redirected so
#   the ssh session can close as soon as the remote command finishes.
restart_cmd='source /etc/profile; source ~/.bash_profile
sid=$(jps | grep supervisor | awk "{print \$1}")
if [ "$sid" = "" ]; then
    echo "supervisor is dead trying to start supervisor!"
    mkdir -p ~/rzx
    rm -fr /data/tmp/storm/worker
    nohup storm supervisor >supervisor.log 2>&1 </dev/null &
else
    echo " supervisor is alived,${sid}"
fi'

# Remote command: report only, never restart.
check_cmd='source /etc/profile; source ~/.bash_profile
sid=$(jps | grep supervisor | awk "{print \$1}")
if [ "$sid" = "" ]; then
    echo "supervisor is still dead!"
else
    echo " supervisor is alived,${sid}"
fi'

while true; do
    echo "==========  $(date)    ==============="

    # --- nimbus (local) ---
    nid=$(jps -l | grep 'nimbus' | awk '{print $1}')
    if [[ -z "$nid" ]]; then
        echo 'storm nimbus is dead!'
        echo 'trying to start nimbus...'
        nohup storm nimbus >nimbus.log &
        echo 'finish starting!'
    else
        echo "storm nimbus id: $nid"
    fi

    # --- ui (local) ---
    # The ui check must look at the ui PID, not the nimbus PID.
    ui_pid=$(jps -l | grep 'backtype.storm.ui.core' | awk '{print $1}')
    if [[ -z "$ui_pid" ]]; then
        echo 'storm ui process is dead!'
        echo 'trying to start storm ui'
        nohup storm ui >ui.log &
        echo 'finish starting storm ui!'
    else
        echo "storm ui id: $ui_pid"
    fi

    # --- supervisors (remote) ---
    # Collect the hosts whose supervisor had to be started (or that could not
    # be reached: empty ssh output also lacks the marker).
    stopnode=""
    for node in $slaves; do
        tmp=$(ssh "$node" "$restart_cmd")
        if [[ "$tmp" != *"$alived"* ]]; then
            stopnode="${stopnode} ${node}"
            echo "${node}'s supervisor is dead!"
        else
            echo "${node}'s${tmp}"
        fi
    done

    # Give the supervisors time to finish starting.
    sleep 40

    # Re-check the hosts that needed a start.
    if [[ -n "$stopnode" ]]; then
        echo "check dead supervisor!"
        for node in $stopnode; do
            check=$(ssh "$node" "$check_cmd")
            if [[ "$check" != *"$alived"* ]]; then
                echo "${node}'s supervisor is still dead, send the email to admin!"
                # NOTE(review): title/contxt are neither exported nor passed as
                # arguments, so the mail script cannot see them as invoked
                # here - confirm how start.sh obtains subject and body.
                title="Supervisor--is--dead"
                contxt="${node}'s--supervisor--is--dead,please--check--the--server!"
                sh /data/www/mail/bin/start.sh  # mail-sending script

                # Stop monitoring this host. Rebuild the list with an exact
                # name match so a host name that is a prefix of another one
                # cannot corrupt the remaining entries.
                remaining=""
                for s in $slaves; do
                    [[ "$s" == "$node" ]] || remaining="${remaining} ${s}"
                done
                slaves=$remaining
            else
                echo "${node}'s${check}"
            fi
        done
    else
        echo "no dead supervisor!"
    fi

    # Re-check nimbus only if it had to be started at the top of this cycle.
    if [[ -z "$nid" ]]; then
        nid1=$(jps -l | grep 'nimbus' | awk '{print $1}')
        if [[ -z "$nid1" ]]; then
            echo "nimbus is still dead, send the email to admin!"
            # NOTE(review): same as above - these variables are not visible to
            # the mail script as invoked here.
            title1="Nimbus--is--dead"
            contxt1="Nimbus--is--dead,please--check--the--server!"
            sh /data/www/mail/bin/start.sh  # mail-sending script
        else
            echo "Nimbus is restarted!"
        fi
    else
        echo
    fi

    echo "sleeping 20s..."
    sleep 20
done