Extract the rebalance steps that pegasus_add_node_list.sh and
pegasus_rolling_update.sh used to carry inline into a new script,
scripts/pegasus_rebalance_cluster.sh, and give every script its own
start/finish time variables.

Both callers check the exit status of the new script, so a failed rebalance
aborts them just as the removed inline code did.

NOTE(review): this patch was re-flowed from a whitespace-collapsed copy.
Leading indentation and blank context lines are reconstructed (2-space indent
assumed) - verify against the tree, or apply with
"git apply --ignore-whitespace". The blob ids on the "index" lines are the ones
from the original patch and do not describe the edited "+" content.

diff --git a/scripts/pegasus_add_node_list.sh b/scripts/pegasus_add_node_list.sh
index 1f358d848a6b372a5a1d9eeaff940277b3e4af16..a8ebaeef55afdb3384f539ef3e595d2a679c597c 100755
--- a/scripts/pegasus_add_node_list.sh
+++ b/scripts/pegasus_add_node_list.sh
@@ -14,6 +14,12 @@ if [ $# -le 2 ]; then
   exit 1
 fi
 
+echo "UID=$UID"
+echo "PID=$PID"
+echo "Start time: `date`"
+add_node_start_time=$((`date +%s`))
+echo
+
 cluster=$1
 meta_list=$2
 replica_task_id_list=$3
@@ -49,70 +55,15 @@ do
   echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
 done
 
-echo "Set meta.lb.only_move_primary true"
-echo "This remote-command tells the meta-server to ignore copying primaries during rebalancing."
-echo "So the following steps only include move_primary and copy_secondary."
-echo "remote_command -l $pmeta meta.lb.only_move_primary true" | ./run.sh shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.add_node_list.only_move_primary
-set_ok=`grep OK /tmp/$UID.$PID.pegasus.add_node_list.only_move_primary | wc -l`
-if [ $set_ok -ne 1 ]; then
-  echo "ERROR: meta.lb.only_move_primary true"
-  exit 1
-fi
-echo
-
-echo "Set meta level to lively..."
-echo "set_meta_level lively" | ./run.sh shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.add_node_list.set_meta_level
-set_ok=`grep 'control meta level ok' /tmp/$UID.$PID.pegasus.add_node_list.set_meta_level | wc -l`
-if [ $set_ok -ne 1 ]; then
-  echo "ERROR: set meta level to lively failed"
-  exit 1
-fi
-
-echo "Wait cluster to become balanced..."
-echo "Wait for 3 minutes to do load balance..."
-sleep 180
-while true; do
-  op_count=$(echo "cluster_info" | ./run.sh shell --cluster $meta_list | grep balance_operation_count | grep -o 'total=[0-9][0-9]*' | cut -d= -f2)
-  if [ -z "op_count" ]; then
-    break
-  fi
-  if [ $op_count -eq 0 ]; then
-    echo "Cluster may be balanced, try wait 30 seconds..."
-    sleep 30
-    op_count=$(echo "cluster_info" | ./run.sh shell --cluster $meta_list | grep balance_operation_count | grep -o 'total=[0-9][0-9]*' | cut -d= -f2)
-    if [ $op_count -eq 0 ]; then
-      echo "Cluster becomes balanced."
-      break
-    fi
-  else
-    echo "Still $op_count balance operations to do..."
-    sleep 1
-  fi
-done
-echo
-
-
-
-echo "Set meta level to steady..."
-echo "set_meta_level steady" | ./run.sh shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.add_node_list.set_meta_level
-set_ok=`grep 'control meta level ok' /tmp/$UID.$PID.pegasus.add_node_list.set_meta_level | wc -l`
-if [ $set_ok -ne 1 ]; then
-  echo "ERROR: set meta level to steady failed"
-  exit 1
-fi
-
-echo "Set meta.lb.only_move_primary false"
-echo "This remote-command tells the meta-server to rebalance with copying primaries."
-echo "remote_command -l $pmeta meta.lb.only_move_primary false" | ./run.sh shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.add_node_list.only_move_primary
-set_ok=`grep OK /tmp/$UID.$PID.pegasus.add_node_list.only_move_primary | wc -l`
-if [ $set_ok -ne 1 ]; then
-  echo "ERROR: meta.lb.only_move_primary false"
-  exit 1
-fi
-echo
+# Propagate a rebalance failure instead of reporting success below.
+./scripts/pegasus_rebalance_cluster.sh $cluster $meta_list true
+if [ $? -ne 0 ]; then
+  echo "ERROR: rebalance cluster failed"
+  exit 1
+fi
 
 echo "Finish time: `date`"
-all_finish_time=$((`date +%s`))
-echo "add node list done, elasped time is $((all_finish_time - all_start_time)) seconds."
+add_node_finish_time=$((`date +%s`))
+echo "add node list done, elasped time is $((add_node_finish_time - add_node_start_time)) seconds."
 
 rm -f /tmp/$UID.$PID.pegasus.* &>/dev/null
diff --git a/scripts/pegasus_check_arguments.sh b/scripts/pegasus_check_arguments.sh
index de22f48f189da44d3399ebbe380205c8bad4948b..5b3f37bb9763d9d8ccf6f0cf31819aa2be7a481c 100755
--- a/scripts/pegasus_check_arguments.sh
+++ b/scripts/pegasus_check_arguments.sh
@@ -31,12 +31,6 @@ if [ $? -ne 0 ]; then
   exit 1
 fi
 
-echo "UID=$UID"
-echo "PID=$PID"
-echo "Start time: `date`"
-all_start_time=$((`date +%s`))
-echo
-
 id_list_file="/tmp/$UID.$PID.pegasus.$check_type.id_list"
 echo "Generating $id_list_file..."
 minos_show_replica $cluster $id_list_file
diff --git a/scripts/pegasus_offline_node_list.sh b/scripts/pegasus_offline_node_list.sh
index f8883f21496f073768486ebb6d027c283199faf0..093fe167bf106361ede07a3743b9b4931b295b35 100755
--- a/scripts/pegasus_offline_node_list.sh
+++ b/scripts/pegasus_offline_node_list.sh
@@ -14,6 +14,12 @@ if [ $# -le 2 ]; then
   exit 1
 fi
 
+echo "UID=$UID"
+echo "PID=$PID"
+echo "Start time: `date`"
+offline_node_start_time=$((`date +%s`))
+echo
+
 cluster=$1
 meta_list=$2
 replica_task_id_list=$3
@@ -68,8 +74,8 @@ if [ $set_ok -ne 1 ]; then
   exit 1
 fi
 
-all_finish_time=$((`date +%s`))
+offline_node_finish_time=$((`date +%s`))
 echo "Offline replica server task list done."
-echo "Elapsed time is $((all_finish_time - all_start_time)) seconds."
+echo "Elapsed time is $((offline_node_finish_time - offline_node_start_time)) seconds."
 
 rm -f /tmp/$UID.$PID.pegasus.* &>/dev/null
diff --git a/scripts/pegasus_rebalance_cluster.sh b/scripts/pegasus_rebalance_cluster.sh
new file mode 100755
index 0000000000000000000000000000000000000000..99112fe4bc18c158ed848e6385aa03b2526b6588
--- /dev/null
+++ b/scripts/pegasus_rebalance_cluster.sh
@@ -0,0 +1,138 @@
+#!/bin/bash
+#
+# Pegasus cluster rebalance
+#
+# Usage: pegasus_rebalance_cluster.sh CLUSTER_NAME META_LIST [ONLY_MOVE_PRIMARY]
+#
+# Sets the meta level to lively, polls cluster_info until
+# balance_operation_count reports total=0, then sets the level back to steady.
+# When the third argument is "true", meta.lb.only_move_primary is switched on
+# before the rebalance and switched off again afterwards.
+# Exits with status 1 when one of the checked steps fails.
+#
+
+PID=$$
+
+if [ $# -le 1 ]; then
+  echo "USAGE: $0 <cluster-name> <cluster-meta-list> <only-move-primary>(default false)"
+  echo
+  echo "for example:"
+  echo "  $0 onebox 127.0.0.1:34601,127.0.0.1:34602 true"
+  echo
+  exit 1
+fi
+
+cluster=$1
+meta_list=$2
+
+# Quoted so the test stays a well-formed two-argument test for any value of $3.
+if [ -z "$3" ]; then
+  only_move_primary=false
+else
+  only_move_primary=$3
+fi
+
+pwd="$( cd "$( dirname "$0" )" && pwd )"
+shell_dir="$( cd $pwd/.. && pwd )"
+cd $shell_dir
+
+source ./scripts/minos_common.sh
+find_cluster $cluster
+if [ $? -ne 0 ]; then
+  echo "ERROR: cluster \"$cluster\" not found"
+  exit 1
+fi
+
+echo "UID=$UID"
+echo "PID=$PID"
+echo "Start time: `date`"
+rebalance_start_time=$((`date +%s`))
+echo
+
+echo "Generating /tmp/$UID.$PID.pegasus.rebalance.cluster_info..."
+echo cluster_info | ./run.sh shell --cluster $meta_list 2>&1 | sed 's/ *$//' >/tmp/$UID.$PID.pegasus.rebalance.cluster_info
+cname=`grep zookeeper_root /tmp/$UID.$PID.pegasus.rebalance.cluster_info | grep -o '/[^/]*$' | grep -o '[^/]*$'`
+if [ "$cname" != "$cluster" ]; then
+  echo "ERROR: cluster name and meta list not matched"
+  exit 1
+fi
+pmeta=`grep primary_meta_server /tmp/$UID.$PID.pegasus.rebalance.cluster_info | grep -o '[0-9.:]*$'`
+if [ "$pmeta" == "" ]; then
+  echo "ERROR: extract primary_meta_server by shell failed"
+  exit 1
+fi
+
+if [ "$only_move_primary" == "true" ]; then
+  echo "Set meta.lb.only_move_primary true"
+  echo "This remote-command tells the meta-server to ignore copying primaries during rebalancing."
+  echo "So the following steps only include move_primary and copy_secondary."
+  echo "remote_command -l $pmeta meta.lb.only_move_primary true" | ./run.sh shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.rebalance.only_move_primary
+  set_ok=`grep OK /tmp/$UID.$PID.pegasus.rebalance.only_move_primary | wc -l`
+  if [ $set_ok -ne 1 ]; then
+    echo "ERROR: meta.lb.only_move_primary true"
+    exit 1
+  fi
+fi
+echo
+
+echo "Set meta level to lively..."
+echo "set_meta_level lively" | ./run.sh shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.rebalance.set_meta_level
+set_ok=`grep 'control meta level ok' /tmp/$UID.$PID.pegasus.rebalance.set_meta_level | wc -l`
+if [ $set_ok -ne 1 ]; then
+  echo "ERROR: set meta level to lively failed"
+  exit 1
+fi
+
+echo "Wait cluster to become balanced..."
+echo "Wait for 3 minutes to do load balance..."
+sleep 180
+## Number of check times for balanced state, in case that op_count is 0 but
+## the cluster is in fact unbalanced. Each check waits for 30 secs.
+op_count_check_remain_times=1
+while true; do
+  op_count=$(echo "cluster_info" | ./run.sh shell --cluster $meta_list | grep balance_operation_count | grep -o 'total=[0-9][0-9]*' | cut -d= -f2)
+  # Empty op_count: no total=N was found in the cluster_info output, stop polling.
+  if [ -z "$op_count" ]; then
+    break
+  fi
+
+  if [ $op_count -eq 0 ]; then
+    if [ $op_count_check_remain_times -eq 0 ]; then
+      break
+    else
+      echo "Cluster may be balanced, try wait 30 seconds..."
+      ((op_count_check_remain_times--))
+      sleep 30
+    fi
+  else
+    echo "Still $op_count balance operations to do..."
+    sleep 10
+  fi
+done
+echo
+
+echo "Set meta level to steady..."
+echo "set_meta_level steady" | ./run.sh shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.rebalance.set_meta_level
+set_ok=`grep 'control meta level ok' /tmp/$UID.$PID.pegasus.rebalance.set_meta_level | wc -l`
+if [ $set_ok -ne 1 ]; then
+  echo "ERROR: set meta level to steady failed"
+  exit 1
+fi
+
+if [ "$only_move_primary" == "true" ]; then
+  echo "Set meta.lb.only_move_primary false"
+  echo "This remote-command tells the meta-server to rebalance with copying primaries."
+  echo "remote_command -l $pmeta meta.lb.only_move_primary false" | ./run.sh shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.rebalance.only_move_primary
+  set_ok=`grep OK /tmp/$UID.$PID.pegasus.rebalance.only_move_primary | wc -l`
+  if [ $set_ok -ne 1 ]; then
+    echo "ERROR: meta.lb.only_move_primary false"
+    exit 1
+  fi
+  echo
+fi
+
+echo "Finish time: `date`"
+rebalance_finish_time=$((`date +%s`))
+echo "rebalance done, elasped time is $((rebalance_finish_time - rebalance_start_time)) seconds."
+
+rm -f /tmp/$UID.$PID.pegasus.* &>/dev/null
diff --git a/scripts/pegasus_rolling_update.sh b/scripts/pegasus_rolling_update.sh
index 95231196029157e085ab3cf7fe23a20424c28b32..a7ba7f535be31037806225b8d12d28fc4297e1a6 100755
--- a/scripts/pegasus_rolling_update.sh
+++ b/scripts/pegasus_rolling_update.sh
@@ -46,7 +46,7 @@ fi
 echo "UID=$UID"
 echo "PID=$PID"
 echo "Start time: `date`"
-all_start_time=$((`date +%s`))
+rolling_start_time=$((`date +%s`))
 echo
 
 rs_list_file="/tmp/$UID.$PID.pegasus.rolling_update.rs.list"
@@ -279,46 +279,16 @@ if [ "$type" = "all" ]; then
   echo "Rolling update collectors done."
   echo
 
-  echo "Set meta level to lively..."
-  echo "set_meta_level lively" | ./run.sh shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.rolling_update.set_meta_level
-  set_ok=`grep 'control meta level ok' /tmp/$UID.$PID.pegasus.rolling_update.set_meta_level | wc -l`
-  if [ $set_ok -ne 1 ]; then
-    echo "ERROR: set meta level to lively failed"
-    exit 1
-  fi
-  echo
-
-  echo "Wait cluster to become balanced..."
-  echo "Wait for 3 minutes to do load balance..."
-  sleep 180
-  while true
-  do
-    op_count=`echo "cluster_info" | ./run.sh shell --cluster $meta_list | grep balance_operation_count | grep -o 'total=[0-9][0-9]*' | cut -d= -f2`
-    if [ -z "op_count" ]; then
-      break
-    fi
-    if [ $op_count -eq 0 ]; then
-      echo "Cluster becomes balanced."
-      break
-    else
-      echo "Still $op_count balance operations to do..."
-      sleep 10
-    fi
-  done
-  echo
-
-  echo "Set meta level to steady..."
-  echo "set_meta_level steady" | ./run.sh shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.rolling_update.set_meta_level
-  set_ok=`grep 'control meta level ok' /tmp/$UID.$PID.pegasus.rolling_update.set_meta_level | wc -l`
-  if [ $set_ok -ne 1 ]; then
-    echo "ERROR: set meta level to steady failed"
-    exit 1
-  fi
-  echo
+  # Propagate a rebalance failure instead of reporting success below.
+  ./scripts/pegasus_rebalance_cluster.sh $cluster $meta_list
+  if [ $? -ne 0 ]; then
+    echo "ERROR: rebalance cluster failed"
+    exit 1
+  fi
 fi
 
 echo "Finish time: `date`"
-all_finish_time=$((`date +%s`))
-echo "Rolling update $type done, elasped time is $((all_finish_time - all_start_time)) seconds."
+rolling_finish_time=$((`date +%s`))
+echo "Rolling update $type done, elasped time is $((rolling_finish_time - rolling_start_time)) seconds."
 
 rm -f /tmp/$UID.$PID.pegasus.* &>/dev/null