Shell指令碼監控Storm叢集,郵件報警
阿新 • • 發佈:2019-01-02
這個指令碼只是一個臨時方案:在全面的監控、報警系統完成之前,為了節省一定的人力而寫的。
之前寫過一個指令碼,用來監控Storm的Nimbus和Supervisor程序,在檢測到程序不存在時會進行重啟。在實際使用中發現,該指令碼雖然可以不斷嘗試重啟,但某些情況下並不能完成自動重啟,人為的介入仍然是必要的。所以對指令碼進行了修改:增加檢測重啟次數,如果檢測到重啟失敗,則會通過郵件通知相關人員。遇到的困難主要在於獲取ssh操作的返回值。思路比較簡單,指令碼內容如下:
#!/bin/bash
#
# Watchdog loop for a Storm cluster.
#
# Each pass:
#   1. Checks the local Nimbus and Storm UI processes with `jps` and starts
#      them when they are missing.
#   2. Checks the supervisor on every host in `slaves` over ssh and starts it
#      when it is missing.
#   3. Waits 40 seconds, re-checks the supervisors that were down, sends an
#      alert mail for each one that is still down and stops monitoring it.
#   4. If Nimbus was down at the start of the pass and is still down, sends
#      an alert mail.
#   5. Sleeps 20 seconds and starts over.
#
# `set -e` / `pipefail` are deliberately not enabled: `grep` exiting 1 on
# "no match" is the normal "process is dead" signal in this script.

# Supervisor hosts that are still monitored. A host is removed from this list
# after an alert mail has been sent for it.
slaves=(cdn36 cdn37 cdn39 cdn21 cdn22 cdn23)

# Marker that the remote scripts print when a supervisor process exists.
readonly alived="alived"

# Script used to send the alert mail.
readonly mail_script=/data/www/mail/bin/start.sh

# Remote script: report the supervisor state and start it if it is missing.
# `\$1` is escaped so that the *remote* shell hands a literal `$1` to awk;
# unescaped, the remote shell would expand it to an empty string and awk
# would print the whole jps line instead of the PID.
# The worker directory is removed before the restart.
# stdin/stderr of the background job are detached so the ssh session can
# close as soon as the script finishes.
readonly remote_check_and_restart='
source /etc/profile
source ~/.bash_profile
sid=$(jps | grep supervisor | awk "{print \$1}")
if [ "$sid" = "" ]; then
  echo "supervisor is dead trying to start supervisor!"
  mkdir -p ~/rzx
  rm -fr /data/tmp/storm/worker
  nohup storm supervisor >supervisor.log 2>&1 </dev/null &
else
  echo " supervisor is alived,"${sid}
fi
'

# Remote script: report the supervisor state only (no restart).
readonly remote_check='
source /etc/profile
source ~/.bash_profile
sid=$(jps | grep supervisor | awk "{print \$1}")
if [ "$sid" = "" ]; then
  echo "supervisor is still dead!"
else
  echo " supervisor is alived,"${sid}
fi
'

#######################################
# Send an alert mail through the mail script.
# Globals:   title, contxt (written and exported), mail_script (read)
# Arguments: $1 - mail title, $2 - mail body
#######################################
send_alert() {
  # NOTE(review): the interface of the mail script is not visible from this
  # file. The original assigned title/contxt without exporting or passing
  # them, so the child process could never read them. They are exported
  # here; confirm whether the mail script expects environment variables or
  # positional arguments.
  title=$1
  contxt=$2
  export title contxt
  sh "$mail_script"
}

#######################################
# Remove one host from the monitored list (exact match only, so a host name
# that is a prefix of another one does not damage the other entry).
# Globals:   slaves (read and rewritten)
# Arguments: $1 - host name to remove
#######################################
remove_slave() {
  local target=$1
  local host
  local kept=()
  for host in "${slaves[@]}"; do
    [[ "$host" == "$target" ]] || kept+=("$host")
  done
  slaves=("${kept[@]}")
}

while true; do
  echo "========== $(date) ==============="

  # --- Nimbus ---------------------------------------------------------------
  nid=$(jps -l | grep 'nimbus' | awk '{print $1}')
  if [[ -z "$nid" ]]; then
    echo 'storm nimbus is dead!'
    echo 'trying to start nimbus...'
    nohup storm nimbus >nimbus.log &
    echo 'finish starting!'
  else
    echo "storm nimbus id: $nid"
  fi

  # --- Storm UI -------------------------------------------------------------
  ui_pid=$(jps -l | grep 'backtype.storm.ui.core' | awk '{print $1}')
  # Test the UI pid here (the original tested the Nimbus pid by mistake).
  if [[ -z "$ui_pid" ]]; then
    echo 'storm ui process is dead!'
    echo 'trying to start storm ui'
    nohup storm ui >ui.log &
    echo 'finish starting storm ui!'
  else
    echo "storm ui id: $ui_pid"
  fi

  # --- Supervisors: first pass, restart the missing ones ----------------------
  stopnode=()
  for node in "${slaves[@]}"; do
    tmp=$(ssh "$node" "$remote_check_and_restart")
    # Output without the marker (including empty output when ssh itself
    # fails) is treated as "dead".
    if [[ "$tmp" == *"$alived"* ]]; then
      echo "${node}'s${tmp}"
    else
      stopnode+=("$node")
      echo "${node}'s supervisor is dead!"
    fi
  done

  # Wait for the supervisor restarts to complete.
  sleep 40

  # --- Supervisors: second pass, alert on the ones still missing --------------
  if (( ${#stopnode[@]} > 0 )); then
    echo "check dead supervisor!"
    for node in "${stopnode[@]}"; do
      check=$(ssh "$node" "$remote_check")
      if [[ "$check" == *"$alived"* ]]; then
        echo "${node}'s${check}"
      else
        echo "${node}'s supervisor is still dead, send the email to admin!"
        send_alert "Supervisor--is--dead" \
          "${node}'s--supervisor--is--dead,please--check--the--server!"
        remove_slave "$node"
      fi
    done
  else
    echo "no dead supervisor!"
  fi

  # --- Nimbus: verify the restart attempted at the top of this pass -----------
  if [[ -z "$nid" ]]; then
    nid_after=$(jps -l | grep 'nimbus' | awk '{print $1}')
    if [[ -z "$nid_after" ]]; then
      echo "nimbus is still dead, send the email to admin!"
      send_alert "Nimbus--is--dead" \
        "Nimbus--is--dead,please--check--the--server!"
    else
      echo "Nimbus is restarted!"
    fi
  else
    echo
  fi

  echo "sleeping 20s..."
  sleep 20
done