fix(script): operation returns before the cluster is balanced when using...

fix(script): operation returns before the cluster is balanced when using pegasus_rolling_update.sh (#585)

fix(script): operation returns before the cluster is balanced when using...
fix(script): operation returns before the cluster is balanced when using pegasus_rolling_update.sh (#585)
17f6a554 · Wu Tao · GitHub · f3829a1e · 17f6a554 · 17f6a554
5 changed files
--- a/scripts/pegasus_add_node_list.sh
+++ b/scripts/pegasus_add_node_list.sh
@@ -14,6 +14,12 @@ if [ $# -le 2 ]; then
  exit 1
 fi

+echo "UID=$UID"
+echo "PID=$PID"
+echo "Start time: `date`"
+add_node_start_time=$((`date +%s`))
+echo
+
 cluster=$1
 meta_list=$2
 replica_task_id_list=$3
@@ -49,70 +55,10 @@ do
  echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
 done

-echo "Set meta.lb.only_move_primary true"
-echo "This remote-command tells the meta-server to ignore copying primaries during rebalancing."
-echo "So the following steps only include move_primary and copy_secondary."
-echo "remote_command -l $pmeta meta.lb.only_move_primary true" | ./run.sh shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.add_node_list.only_move_primary
-set_ok=`grep OK /tmp/$UID.$PID.pegasus.add_node_list.only_move_primary | wc -l`
-if [ $set_ok -ne 1 ]; then
-  echo "ERROR: meta.lb.only_move_primary true"
-  exit 1
-fi
-echo
-
-echo "Set meta level to lively..."
-echo "set_meta_level lively" | ./run.sh shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.add_node_list.set_meta_level
-set_ok=`grep 'control meta level ok' /tmp/$UID.$PID.pegasus.add_node_list.set_meta_level | wc -l`
-if [ $set_ok -ne 1 ]; then
-  echo "ERROR: set meta level to lively failed"
-  exit 1
-fi
-
-echo "Wait cluster to become balanced..."
-echo "Wait for 3 minutes to do load balance..."
-sleep 180
-while true; do
-    op_count=$(echo "cluster_info" | ./run.sh shell --cluster $meta_list | grep balance_operation_count | grep -o 'total=[0-9][0-9]*' | cut -d= -f2)
-    if [ -z "op_count" ]; then
-        break
-    fi
-    if [ $op_count -eq 0 ]; then
-        echo "Cluster may be balanced, try wait 30 seconds..."
-        sleep 30
-        op_count=$(echo "cluster_info" | ./run.sh shell --cluster $meta_list | grep balance_operation_count | grep -o 'total=[0-9][0-9]*' | cut -d= -f2)
-        if [ $op_count -eq 0 ]; then
-            echo "Cluster becomes balanced."
-            break
-        fi
-    else
-        echo "Still $op_count balance operations to do..."
-        sleep 1
-    fi
-done
-echo
-
-
-
-echo "Set meta level to steady..."
-echo "set_meta_level steady" | ./run.sh shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.add_node_list.set_meta_level
-set_ok=`grep 'control meta level ok' /tmp/$UID.$PID.pegasus.add_node_list.set_meta_level | wc -l`
-if [ $set_ok -ne 1 ]; then
-  echo "ERROR: set meta level to steady failed"
-  exit 1
-fi
-
-echo "Set meta.lb.only_move_primary false"
-echo "This remote-command tells the meta-server to rebalance with copying primaries."
-echo "remote_command -l $pmeta meta.lb.only_move_primary false" | ./run.sh shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.add_node_list.only_move_primary
-set_ok=`grep OK /tmp/$UID.$PID.pegasus.add_node_list.only_move_primary | wc -l`
-if [ $set_ok -ne 1 ]; then
-  echo "ERROR: meta.lb.only_move_primary false"
-  exit 1
-fi
-echo
+./scripts/pegasus_rebalance_cluster.sh $cluster $meta_list true

 echo "Finish time: `date`"
-all_finish_time=$((`date +%s`))
-echo "add node list done, elasped time is $((all_finish_time - all_start_time)) seconds."
+add_node_finish_time=$((`date +%s`))
+echo "add node list done, elasped time is $((add_node_finish_time - add_node_start_time)) seconds."

 rm -f /tmp/$UID.$PID.pegasus.* &>/dev/null
--- a/scripts/pegasus_check_arguments.sh
+++ b/scripts/pegasus_check_arguments.sh
@@ -31,12 +31,6 @@ if [ $? -ne 0 ]; then
  exit 1
 fi

-echo "UID=$UID"
-echo "PID=$PID"
-echo "Start time: `date`"
-all_start_time=$((`date +%s`))
-echo
-
 id_list_file="/tmp/$UID.$PID.pegasus.$check_type.id_list"
 echo "Generating $id_list_file..."
 minos_show_replica $cluster $id_list_file

--- a/scripts/pegasus_offline_node_list.sh
+++ b/scripts/pegasus_offline_node_list.sh
@@ -14,6 +14,12 @@ if [ $# -le 2 ]; then
  exit 1
 fi

+echo "UID=$UID"
+echo "PID=$PID"
+echo "Start time: `date`"
+offline_node_start_time=$((`date +%s`))
+echo
+
 cluster=$1
 meta_list=$2
 replica_task_id_list=$3
@@ -68,8 +74,8 @@ if [ $set_ok -ne 1 ]; then
  exit 1
 fi

-all_finish_time=$((`date +%s`))
+offline_finish_time=$((`date +%s`))
 echo "Offline replica server task list done."
-echo "Elapsed time is $((all_finish_time - all_start_time)) seconds."
+echo "Elapsed time is $((offline_finish_time - offline_node_start_time)) seconds."

 rm -f /tmp/$UID.$PID.pegasus.* &>/dev/null
--- a/scripts/pegasus_rebalance_cluster.sh
+++ b/scripts/pegasus_rebalance_cluster.sh
+#!/bin/bash
+#
+# Pegasus cluster rebalance 
+#
+
+PID=$$
+
+if [ $# -le 1 ]; then
+  echo "USAGE: $0 <cluster-name> <cluster-meta-list> <only-move-primary>(default false)"
+  echo 
+  echo "for example:"
+  echo "  $0 onebox 127.0.0.1:34601,127.0.0.1:34602 true"
+  echo
+  exit 1
+fi
+
+cluster=$1
+meta_list=$2
+
+if [ -z $3 ]; then
+  only_move_primary=false
+else
+  only_move_primary=$3
+fi
+
+pwd="$( cd "$( dirname "$0"  )" && pwd )"
+shell_dir="$( cd $pwd/.. && pwd )"
+cd $shell_dir
+
+source ./scripts/minos_common.sh
+find_cluster $cluster
+if [ $? -ne 0 ]; then
+  echo "ERROR: cluster \"$cluster\" not found"
+  exit 1
+fi
+
+echo "UID=$UID"
+echo "PID=$PID"
+echo "Start time: `date`"
+rebalance_start_time=$((`date +%s`))
+echo
+
+echo "Generating /tmp/$UID.$PID.pegasus.rebalance.cluster_info..."
+echo cluster_info | ./run.sh shell --cluster $meta_list 2>&1 | sed 's/ *$//' >/tmp/$UID.$PID.pegasus.rebalance.cluster_info
+cname=`grep zookeeper_root /tmp/$UID.$PID.pegasus.rebalance.cluster_info | grep -o '/[^/]*$' | grep -o '[^/]*$'`
+if [ "$cname" != "$cluster" ]; then
+  echo "ERROR: cluster name and meta list not matched"
+  exit 1
+fi
+pmeta=`grep primary_meta_server /tmp/$UID.$PID.pegasus.rebalance.cluster_info | grep -o '[0-9.:]*$'`
+if [ "$pmeta" == "" ]; then
+  echo "ERROR: extract primary_meta_server by shell failed"
+  exit 1
+fi
+
+if [ "$only_move_primary" == "true" ]; then
+  echo "Set meta.lb.only_move_primary true"
+  echo "This remote-command tells the meta-server to ignore copying primaries during rebalancing."
+  echo "So the following steps only include move_primary and copy_secondary."
+  echo "remote_command -l $pmeta meta.lb.only_move_primary true" | ./run.sh shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.rebalance.only_move_primary
+  set_ok=`grep OK /tmp/$UID.$PID.pegasus.rebalance.only_move_primary | wc -l`
+  if [ $set_ok -ne 1 ]; then
+    echo "ERROR: meta.lb.only_move_primary true"
+    exit 1
+  fi
+fi
+echo
+
+echo "Set meta level to lively..."
+echo "set_meta_level lively" | ./run.sh shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.rebalance.set_meta_level
+set_ok=`grep 'control meta level ok' /tmp/$UID.$PID.pegasus.rebalance.set_meta_level | wc -l`
+if [ $set_ok -ne 1 ]; then
+  echo "ERROR: set meta level to lively failed"
+  exit 1
+fi
+
+echo "Wait cluster to become balanced..."
+echo "Wait for 3 minutes to do load balance..."
+sleep 180
+## Number of check times for balanced state, in case that op_count is 0 but
+## the cluster is in fact unbalanced. Each check waits for 30 secs.
+op_count_check_remain_times=1
+while true; do
+    op_count=$(echo "cluster_info" | ./run.sh shell --cluster $meta_list | grep balance_operation_count | grep -o 'total=[0-9][0-9]*' | cut -d= -f2)
+    if [ -z $op_count ]; then
+        break
+    fi
+
+    if [ $op_count -eq 0 ]; then
+        if [ $op_count_check_remain_times -eq 0 ]; then
+          break
+        else
+           echo "Cluster may be balanced, try wait 30 seconds..."
+           ((op_count_check_remain_times--))
+           sleep 30
+        fi
+    else
+        echo "Still $op_count balance operations to do..."
+        sleep 10
+    fi
+done
+echo
+
+echo "Set meta level to steady..."
+echo "set_meta_level steady" | ./run.sh shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.rebalance.set_meta_level
+set_ok=`grep 'control meta level ok' /tmp/$UID.$PID.pegasus.rebalance.set_meta_level | wc -l`
+if [ $set_ok -ne 1 ]; then
+  echo "ERROR: set meta level to steady failed"
+  exit 1
+fi
+
+if [ "$only_move_primary" == "true" ]; then
+  echo "Set meta.lb.only_move_primary false"
+  echo "This remote-command tells the meta-server to rebalance with copying primaries."
+  echo "remote_command -l $pmeta meta.lb.only_move_primary false" | ./run.sh shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.rebalance.only_move_primary
+  set_ok=`grep OK /tmp/$UID.$PID.pegasus.rebalance.only_move_primary | wc -l`
+  if [ $set_ok -ne 1 ]; then
+    echo "ERROR: meta.lb.only_move_primary false"
+    exit 1
+  fi
+  echo
+fi
+
+echo "Finish time: `date`"
+rebalance_finish_time=$((`date +%s`))
+echo "rebalance done, elasped time is $((rebalance_finish_time - rebalance_start_time)) seconds."
+
+rm -f /tmp/$UID.$PID.pegasus.* &>/dev/null
--- a/scripts/pegasus_rolling_update.sh
+++ b/scripts/pegasus_rolling_update.sh
@@ -46,7 +46,7 @@ fi
 echo "UID=$UID"
 echo "PID=$PID"
 echo "Start time: `date`"
-all_start_time=$((`date +%s`))
+rolling_start_time=$((`date +%s`))
 echo

 rs_list_file="/tmp/$UID.$PID.pegasus.rolling_update.rs.list"
@@ -279,46 +279,11 @@ if [ "$type" = "all" ]; then
  echo "Rolling update collectors done."
  echo

-  echo "Set meta level to lively..."
-  echo "set_meta_level lively" | ./run.sh shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.rolling_update.set_meta_level
-  set_ok=`grep 'control meta level ok' /tmp/$UID.$PID.pegasus.rolling_update.set_meta_level | wc -l`
-  if [ $set_ok -ne 1 ]; then
-    echo "ERROR: set meta level to lively failed"
-    exit 1
-  fi
-  echo
-
-  echo "Wait cluster to become balanced..."
-  echo "Wait for 3 minutes to do load balance..."
-  sleep 180
-  while true
-  do
-    op_count=`echo "cluster_info" | ./run.sh shell --cluster $meta_list | grep balance_operation_count | grep -o 'total=[0-9][0-9]*' | cut -d= -f2`
-    if [ -z "op_count" ]; then
-      break
-    fi
-    if [ $op_count -eq 0 ]; then
-      echo "Cluster becomes balanced."
-      break
-    else
-      echo "Still $op_count balance operations to do..."
-      sleep 10
-    fi
-  done
-  echo
-
-  echo "Set meta level to steady..."
-  echo "set_meta_level steady" | ./run.sh shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.rolling_update.set_meta_level
-  set_ok=`grep 'control meta level ok' /tmp/$UID.$PID.pegasus.rolling_update.set_meta_level | wc -l`
-  if [ $set_ok -ne 1 ]; then
-    echo "ERROR: set meta level to steady failed"
-    exit 1
-  fi
-  echo
+  ./scripts/pegasus_rebalance_cluster.sh $cluster $meta_list
 fi

 echo "Finish time: `date`"
-all_finish_time=$((`date +%s`))
-echo "Rolling update $type done, elasped time is $((all_finish_time - all_start_time)) seconds."
+rolling_finish_time=$((`date +%s`))
+echo "Rolling update $type done, elasped time is $((rolling_finish_time - rolling_start_time)) seconds."

 rm -f /tmp/$UID.$PID.pegasus.* &>/dev/null