1. 程式人生 > Shell指令碼監控、拉起Nimbus和Supervisor程序

Shell指令碼監控、拉起Nimbus和Supervisor程序

Nimbus和Supervisor都是快速失敗,無狀態的程序,Nimbus的單點問題一直沒有很好的解決辦法,所以我們可以對相關程序進行監控,在其掛掉時嘗試重啟。

在之前的專案裡,比較常用的方式是通過monit對相關程序進行監控,通過monit監控需要對每臺機器進行配置,可以通過Fabric(http://www.fabfile.org)進行統一配置,這裡不詳細介紹monit監控的方式。

通過Shell指令碼可以實現在Nimbus節點上監控整個叢集,前提是需要Nimbus節點與各Supervisor節點建立SSH無密碼訪問。具體內容如下:

main.sh

#!/bin/bash
#
# main.sh -- watchdog loop meant to run on the Storm Nimbus node.
#
# Every 20 seconds it:
#   1. looks for a Nimbus JVM in `jps -l` and launches `storm nimbus` if none
#      is listed;
#   2. looks for the Storm UI JVM (backtype.storm.ui.core) and launches
#      `storm ui` if none is listed;
#   3. runs storm_manager.sh (located next to this script) with "start",
#      which starts the supervisor on every slave host where it is missing.
#
# nimbus.log and ui.log are written to the current working directory.
# The loop never terminates on its own; stop it by killing the script.

# Directory holding this script, used to locate storm_manager.sh.
dir=$(dirname "$0")

while true
do
    echo "==========  $(date)    ==============="

    # An empty pid list means jps shows no JVM whose name contains "nimbus".
    nid=$(jps -l | grep 'nimbus' | awk '{print $1}')
    if [ -z "$nid" ]; then
        echo 'storm nimbus is dead!'
        echo 'trying to start nimbus...'
        nohup storm nimbus >nimbus.log &
        echo 'finish starting!'
    else
        echo "storm nimbus id: $nid"
    fi

    # The UI check must look at the UI's own pid. Testing the Nimbus pid here
    # would leave a dead UI down for as long as Nimbus is alive.
    uid=$(jps -l | grep 'backtype.storm.ui.core' | awk '{print $1}')
    if [ -z "$uid" ]; then
        echo 'storm ui process is dead!'
        echo 'trying to start storm ui'
        nohup storm ui >ui.log &
        echo 'finish starting storm ui!'
    else
        echo "storm ui id: $uid"
    fi

    # Bring up any supervisor that is not running on the slave hosts.
    sh "$dir/storm_manager.sh" start

    echo "sleeping 20s..."
    sleep 20
done
storm_manager.sh
#!/bin/bash
#
# storm_manager.sh -- run supervisor maintenance commands on every slave host
# over ssh. The first argument selects the action: start, stop, sync or check;
# anything else runs mytest.

# Space-separated host names of the supervisor nodes. The functions in this
# script iterate over this list with ssh/scp.
slaves="cdn36 cdn37 cdn39 cdn21 cdn22 cdn23"

# Storm directory on the remote hosts; start_supervisor removes
# $storm_dir/supervisor before launching a new supervisor.
storm_dir='/data/tmp/storm'

# Report, for each host listed in $slaves, whether a supervisor JVM is running.
#
# Globals:  slaves (read) - space-separated host names
# Outputs:  whatever the remote script prints: either the supervisor pid(s)
#           or "supervisor is dead!", framed by start/finish messages.
#
# The script is fed to ssh on stdin. "$node" is expanded locally before
# sending; everything escaped with a backslash is evaluated on the remote host.
check_supervisors(){
    for node in $slaves; do
        ssh "$node" <<END
        source /etc/profile
        source ~/.bash_profile
        echo "=== check supervisor on $node..."

        sid=\$(jps | grep supervisor | awk '{print \$1}')
        if [ -n "\$sid" ]; then
            echo "supervisor process id: \$sid"
        else
            echo "supervisor is dead!"
        fi

        echo "finishing checking $node's supervisor"
        echo
END
    done
}

# Kill the supervisor JVM(s) on every host listed in $slaves.
#
# Globals:  slaves (read) - space-separated host names
# Outputs:  progress messages printed by the remote script.
#
# The script is fed to ssh on stdin. "$node" is expanded locally; everything
# escaped with a backslash is evaluated on the remote host.
stop_supervisor(){
    for node in $slaves; do
        ssh "$node" <<END
        source /etc/profile
        source ~/.bash_profile
        echo "=== killing supervisor on $node..."
        pids=\$(jps | grep 'supervisor' | awk '{print \$1}')
        # Only call kill when there is something to kill. Piping an empty
        # list into "xargs kill" still runs kill once and prints a usage error.
        if [ -n "\$pids" ]; then
            # Unquoted on purpose: one argument per pid.
            kill \$pids
        else
            echo "no supervisor process found on $node"
        fi
        echo "finishing killing $node's supervisor"
END
    done
}

# Start a supervisor on every host in $slaves that does not already run one.
#
# Globals:  slaves (read)    - space-separated host names
#           storm_dir (read) - Storm directory on the remote hosts; its
#                              "supervisor" subdirectory is removed before a
#                              new supervisor is launched
# Outputs:  progress messages printed by the remote script, then one blank
#           line per host.
#
# The script is fed to ssh on stdin. "$node" and "$storm_dir" are expanded
# locally; everything escaped with a backslash is evaluated on the remote host.
start_supervisor(){
    for node in $slaves; do
        # ${storm_dir:?} aborts locally if storm_dir is empty or unset, so an
        # "rm -fr /supervisor" can never be sent to a remote host.
        ssh "$node" <<END
        source /etc/profile
        source ~/.bash_profile

        sid=\$(jps | grep supervisor | awk '{print \$1}')
        echo "=== starting supervisor on $node..."
        if [ -z "\$sid" ]; then
            echo "supervisor is dead!"
            mkdir -p ~/rzx
            rm -fr "${storm_dir:?}/supervisor"
            cd ~/rzx || { echo "cannot cd to ~/rzx"; exit 1; }
            # Detach all three standard streams. A background job that keeps
            # the session's stdin or stderr open can stop ssh from returning.
            nohup storm supervisor >supervisor.log 2>&1 </dev/null &
            echo "finishing starting $node's supervisor"
        else
            echo "supervisor process id: \$sid"
        fi
END
        echo
    done
}

# Synchronize the configuration file: copy the local storm.yaml into the Storm
# conf directory of every host listed in $slaves.
#
# Globals:  slaves (read) - space-separated host names
# Outputs:  one "finishing sync <host> config!" line per host.
sync_config(){
    for node in $slaves; do
        # NOTE(review): the published listing was damaged here by an e-mail
        # obfuscation artifact ("[email protected]"). A "user@" prefix may have
        # preceded the host name; confirm against the real deployment.
        scp /opt/package/apache-storm-0.9.2-incubating/conf/storm.yaml \
            "$node":/opt/package/apache-storm-0.9.2-incubating/conf/
        echo "finishing sync $node config!"
    done
}

# Run `ls` on every host listed in $slaves (the name suggests a manual test).
mytest(){
    for node in $slaves; do
        ssh "$node" <<END
ls
END
    done
}

# Dispatch on the first command-line argument; anything other than the four
# known actions (including no argument at all) falls through to mytest.
case "$1" in
    stop)  stop_supervisor ;;
    start) start_supervisor ;;
    sync)  sync_config ;;
    check) check_supervisors ;;
    *)     mytest ;;
esac