未验证 提交 17f6a554 编写于 作者: W Wu Tao 提交者: GitHub

fix(script): operation return before the cluster balance when using...

fix(script): operation return before the cluster balance when using pegasus_rolling_update.sh (#585)
上级 f3829a1e
......@@ -14,6 +14,12 @@ if [ $# -le 2 ]; then
exit 1
fi
echo "UID=$UID"
echo "PID=$PID"
echo "Start time: `date`"
add_node_start_time=$((`date +%s`))
echo
cluster=$1
meta_list=$2
replica_task_id_list=$3
......@@ -49,70 +55,10 @@ do
echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
done
echo "Set meta.lb.only_move_primary true"
echo "This remote-command tells the meta-server to ignore copying primaries during rebalancing."
echo "So the following steps only include move_primary and copy_secondary."
echo "remote_command -l $pmeta meta.lb.only_move_primary true" | ./run.sh shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.add_node_list.only_move_primary
set_ok=`grep OK /tmp/$UID.$PID.pegasus.add_node_list.only_move_primary | wc -l`
if [ $set_ok -ne 1 ]; then
echo "ERROR: meta.lb.only_move_primary true"
exit 1
fi
echo
echo "Set meta level to lively..."
echo "set_meta_level lively" | ./run.sh shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.add_node_list.set_meta_level
set_ok=`grep 'control meta level ok' /tmp/$UID.$PID.pegasus.add_node_list.set_meta_level | wc -l`
if [ $set_ok -ne 1 ]; then
echo "ERROR: set meta level to lively failed"
exit 1
fi
echo "Wait cluster to become balanced..."
echo "Wait for 3 minutes to do load balance..."
sleep 180
while true; do
op_count=$(echo "cluster_info" | ./run.sh shell --cluster $meta_list | grep balance_operation_count | grep -o 'total=[0-9][0-9]*' | cut -d= -f2)
if [ -z "op_count" ]; then
break
fi
if [ $op_count -eq 0 ]; then
echo "Cluster may be balanced, try wait 30 seconds..."
sleep 30
op_count=$(echo "cluster_info" | ./run.sh shell --cluster $meta_list | grep balance_operation_count | grep -o 'total=[0-9][0-9]*' | cut -d= -f2)
if [ $op_count -eq 0 ]; then
echo "Cluster becomes balanced."
break
fi
else
echo "Still $op_count balance operations to do..."
sleep 1
fi
done
echo
echo "Set meta level to steady..."
echo "set_meta_level steady" | ./run.sh shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.add_node_list.set_meta_level
set_ok=`grep 'control meta level ok' /tmp/$UID.$PID.pegasus.add_node_list.set_meta_level | wc -l`
if [ $set_ok -ne 1 ]; then
echo "ERROR: set meta level to steady failed"
exit 1
fi
echo "Set meta.lb.only_move_primary false"
echo "This remote-command tells the meta-server to rebalance with copying primaries."
echo "remote_command -l $pmeta meta.lb.only_move_primary false" | ./run.sh shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.add_node_list.only_move_primary
set_ok=`grep OK /tmp/$UID.$PID.pegasus.add_node_list.only_move_primary | wc -l`
if [ $set_ok -ne 1 ]; then
echo "ERROR: meta.lb.only_move_primary false"
exit 1
fi
echo
./scripts/pegasus_rebalance_cluster.sh $cluster $meta_list true
echo "Finish time: `date`"
all_finish_time=$((`date +%s`))
echo "add node list done, elasped time is $((all_finish_time - all_start_time)) seconds."
add_node_finish_time=$((`date +%s`))
echo "add node list done, elasped time is $((add_node_finish_time - add_node_start_time)) seconds."
rm -f /tmp/$UID.$PID.pegasus.* &>/dev/null
......@@ -31,12 +31,6 @@ if [ $? -ne 0 ]; then
exit 1
fi
echo "UID=$UID"
echo "PID=$PID"
echo "Start time: `date`"
all_start_time=$((`date +%s`))
echo
id_list_file="/tmp/$UID.$PID.pegasus.$check_type.id_list"
echo "Generating $id_list_file..."
minos_show_replica $cluster $id_list_file
......
......@@ -14,6 +14,12 @@ if [ $# -le 2 ]; then
exit 1
fi
echo "UID=$UID"
echo "PID=$PID"
echo "Start time: `date`"
offline_node_start_time=$((`date +%s`))
echo
cluster=$1
meta_list=$2
replica_task_id_list=$3
......@@ -68,8 +74,8 @@ if [ $set_ok -ne 1 ]; then
exit 1
fi
all_finish_time=$((`date +%s`))
offline_finish_time=$((`date +%s`))
echo "Offline replica server task list done."
echo "Elapsed time is $((all_finish_time - all_start_time)) seconds."
echo "Elapsed time is $((offline_finish_time - offline_node_start_time)) seconds."
rm -f /tmp/$UID.$PID.pegasus.* &>/dev/null
#!/bin/bash
#
# Pegasus cluster rebalance
#
# Switches the meta server to the "lively" level, polls `cluster_info` until
# balance_operation_count reports total=0, then switches the meta level back
# to "steady". When <only-move-primary> is "true", the remote command
# meta.lb.only_move_primary is set to true before and back to false after.
#
# Usage:
#   <this-script> <cluster-name> <cluster-meta-list> [only-move-primary]
#
# Arguments:
#   $1  cluster name; must equal the last path component of zookeeper_root
#       reported by cluster_info
#   $2  comma-separated meta server list, passed to `./run.sh shell --cluster`
#   $3  "true" to restrict balancing via meta.lb.only_move_primary
#       (optional, default "false")
#
# Exit status: 0 on success, 1 on usage error or when any step fails.
# Temp files /tmp/$UID.$PID.pegasus.* are removed only on the success path;
# the failure paths exit without deleting them.

PID=$$

if [ $# -le 1 ]; then
  echo "USAGE: $0 <cluster-name> <cluster-meta-list> <only-move-primary>(default false)"
  echo
  echo "for example:"
  echo " $0 onebox 127.0.0.1:34601,127.0.0.1:34602 true"
  echo
  exit 1
fi

cluster=$1
meta_list=$2
only_move_primary=${3:-false}

# Run from the parent directory of this script so the relative paths
# ./run.sh and ./scripts/... resolve.
pwd="$( cd "$( dirname "$0" )" && pwd )"
shell_dir="$( cd "$pwd/.." && pwd )"
if ! cd "$shell_dir"; then
  echo "ERROR: cannot change directory to \"$shell_dir\""
  exit 1
fi

# find_cluster is presumably defined by minos_common.sh -- confirm there.
source ./scripts/minos_common.sh
if ! find_cluster "$cluster"; then
  echo "ERROR: cluster \"$cluster\" not found"
  exit 1
fi

tmp_prefix="/tmp/$UID.$PID.pegasus.rebalance"

# Feed a single command line ($1) to the Pegasus shell of this cluster.
rebalance_shell() {
  echo "$1" | ./run.sh shell --cluster "$meta_list"
}

# Set the meta level to $1 and exit 1 unless the shell output contains
# exactly one 'control meta level ok' line.
rebalance_set_meta_level() {
  local level=$1
  local out_file="$tmp_prefix.set_meta_level"
  echo "Set meta level to $level..."
  rebalance_shell "set_meta_level $level" &>"$out_file"
  if [ "$(grep -c 'control meta level ok' "$out_file" 2>/dev/null)" != "1" ]; then
    echo "ERROR: set meta level to $level failed"
    exit 1
  fi
}

# Send meta.lb.only_move_primary=$1 to the primary meta server and exit 1
# unless the shell output contains exactly one line matching OK.
rebalance_set_only_move_primary() {
  local value=$1
  local out_file="$tmp_prefix.only_move_primary"
  rebalance_shell "remote_command -l $pmeta meta.lb.only_move_primary $value" &>"$out_file"
  if [ "$(grep -c OK "$out_file" 2>/dev/null)" != "1" ]; then
    echo "ERROR: meta.lb.only_move_primary $value"
    exit 1
  fi
}

# Print the total= value of balance_operation_count from cluster_info;
# prints nothing when the value cannot be extracted.
rebalance_query_op_count() {
  rebalance_shell "cluster_info" \
    | grep balance_operation_count \
    | grep -o 'total=[0-9][0-9]*' \
    | cut -d= -f2
}

echo "UID=$UID"
echo "PID=$PID"
echo "Start time: $(date)"
rebalance_start_time=$(date +%s)
echo

echo "Generating $tmp_prefix.cluster_info..."
rebalance_shell cluster_info 2>&1 | sed 's/ *$//' >"$tmp_prefix.cluster_info"

# Guard against a cluster name that does not belong to the given meta list.
cname=$(grep zookeeper_root "$tmp_prefix.cluster_info" | grep -o '/[^/]*$' | grep -o '[^/]*$')
if [ "$cname" != "$cluster" ]; then
  echo "ERROR: cluster name and meta list not matched"
  exit 1
fi

pmeta=$(grep primary_meta_server "$tmp_prefix.cluster_info" | grep -o '[0-9.:]*$')
if [ -z "$pmeta" ]; then
  echo "ERROR: extract primary_meta_server by shell failed"
  exit 1
fi

if [ "$only_move_primary" == "true" ]; then
  echo "Set meta.lb.only_move_primary true"
  echo "This remote-command tells the meta-server to ignore copying primaries during rebalancing."
  echo "So the following steps only include move_primary and copy_secondary."
  rebalance_set_only_move_primary true
fi
echo

rebalance_set_meta_level lively

echo "Wait cluster to become balanced..."
echo "Wait for 3 minutes to do load balance..."
sleep 180

## Number of check times for balanced state, in case that op_count is 0 but
## the cluster is in fact unbalanced. Each check waits for 30 secs.
op_count_check_times=1
op_count_check_remain_times=$op_count_check_times
while true; do
  op_count=$(rebalance_query_op_count)
  if [ -z "$op_count" ]; then
    # The count could not be read (e.g. cluster_info failed). Stop waiting as
    # before, but say so instead of silently looking like a balanced cluster.
    echo "WARN: failed to get balance_operation_count from cluster_info, stop waiting"
    break
  fi
  if [ "$op_count" -eq 0 ]; then
    if [ "$op_count_check_remain_times" -eq 0 ]; then
      break
    fi
    echo "Cluster may be balanced, try wait 30 seconds..."
    op_count_check_remain_times=$((op_count_check_remain_times - 1))
    sleep 30
  else
    # Balancing resumed: require a fresh confirmation wait before the next
    # zero reading is trusted, otherwise a later transient zero would end the
    # wait without any re-check.
    op_count_check_remain_times=$op_count_check_times
    echo "Still $op_count balance operations to do..."
    sleep 10
  fi
done
echo

rebalance_set_meta_level steady

if [ "$only_move_primary" == "true" ]; then
  echo "Set meta.lb.only_move_primary false"
  echo "This remote-command tells the meta-server to rebalance with copying primaries."
  rebalance_set_only_move_primary false
  echo
fi

echo "Finish time: $(date)"
rebalance_finish_time=$(date +%s)
echo "rebalance done, elasped time is $((rebalance_finish_time - rebalance_start_time)) seconds."

rm -f "/tmp/$UID.$PID.pegasus."* &>/dev/null
......@@ -46,7 +46,7 @@ fi
echo "UID=$UID"
echo "PID=$PID"
echo "Start time: `date`"
all_start_time=$((`date +%s`))
rolling_start_time=$((`date +%s`))
echo
rs_list_file="/tmp/$UID.$PID.pegasus.rolling_update.rs.list"
......@@ -279,46 +279,11 @@ if [ "$type" = "all" ]; then
echo "Rolling update collectors done."
echo
echo "Set meta level to lively..."
echo "set_meta_level lively" | ./run.sh shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.rolling_update.set_meta_level
set_ok=`grep 'control meta level ok' /tmp/$UID.$PID.pegasus.rolling_update.set_meta_level | wc -l`
if [ $set_ok -ne 1 ]; then
echo "ERROR: set meta level to lively failed"
exit 1
fi
echo
echo "Wait cluster to become balanced..."
echo "Wait for 3 minutes to do load balance..."
sleep 180
while true
do
op_count=`echo "cluster_info" | ./run.sh shell --cluster $meta_list | grep balance_operation_count | grep -o 'total=[0-9][0-9]*' | cut -d= -f2`
if [ -z "op_count" ]; then
break
fi
if [ $op_count -eq 0 ]; then
echo "Cluster becomes balanced."
break
else
echo "Still $op_count balance operations to do..."
sleep 10
fi
done
echo
echo "Set meta level to steady..."
echo "set_meta_level steady" | ./run.sh shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.rolling_update.set_meta_level
set_ok=`grep 'control meta level ok' /tmp/$UID.$PID.pegasus.rolling_update.set_meta_level | wc -l`
if [ $set_ok -ne 1 ]; then
echo "ERROR: set meta level to steady failed"
exit 1
fi
echo
./scripts/pegasus_rebalance_cluster.sh $cluster $meta_list
fi
echo "Finish time: `date`"
all_finish_time=$((`date +%s`))
echo "Rolling update $type done, elasped time is $((all_finish_time - all_start_time)) seconds."
rolling_finish_time=$((`date +%s`))
echo "Rolling update $type done, elasped time is $((rolling_finish_time - rolling_start_time)) seconds."
rm -f /tmp/$UID.$PID.pegasus.* &>/dev/null
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册