From 283e55b4b6d55329dbbd325cc52a1657e4ad70ff Mon Sep 17 00:00:00 2001
From: qinzuoyan <qinzuoyan@xiaomi.com>
Date: Tue, 5 Dec 2017 16:06:52 +0800
Subject: [PATCH] fix scripts; add downgrade_node script

Summary: Ref T10174

Test Plan: N/A

Reviewers: sunweijie, cailiuyang, heyuchen, wutao1

Reviewed By: cailiuyang

Subscribers: #pegasus

Maniphest Tasks: T10174

Differential Revision: https://phabricator.d.xiaomi.net/D76405
---
 run.sh                            | 99 +++++++++++++++++++++++++++++++
 scripts/downgrade_node.sh         | 81 +++++++++++++++++++++++++
 scripts/migrate_node.sh           |  2 +-
 scripts/pack_server.sh            |  1 +
 scripts/pegasus_rolling_update.sh | 27 ++++++++-
 scripts/pegasus_stat_available.sh |  6 +-
 6 files changed, 209 insertions(+), 7 deletions(-)
 create mode 100755 scripts/downgrade_node.sh

diff --git a/run.sh b/run.sh
index 7b2d2a6..4bd9a44 100755
--- a/run.sh
+++ b/run.sh
@@ -35,6 +35,7 @@ function usage()
     echo "   bench             run benchmark test"
     echo "   shell             run pegasus shell"
     echo "   migrate_node      migrate primary replicas out of specified node"
+    echo "   downgrade_node    downgrade replicas to inactive on specified node"
     echo
     echo "   test              run unit test"
     echo
@@ -1275,6 +1276,100 @@ function run_migrate_node()
     fi
 }
 
+#####################
+## downgrade_node
+#####################
+function usage_downgrade_node()
+{
+    echo "Options for subcommand 'downgrade_node':"
+    echo "   -h|--help         print the help info"
+    echo "   -c|--cluster <str> cluster meta lists"
+    echo "   -n|--node <str>   the node to downgrade replicas, should be ip:port"
+    echo "   -a|--app <str>    the app to downgrade replicas, if not set, means downgrade all apps"
+    echo "   -t|--type <str>   type: test or run, default is test"
+}
+
+function run_downgrade_node()
+{
+    CLUSTER=""
+    NODE=""
+    APP="*"
+    TYPE="test"
+    while [[ $# > 0 ]]; do
+        key="$1"
+        case $key in
+            -h|--help)
+                usage_downgrade_node
+                exit 0
+                ;;
+            -c|--cluster)
+                CLUSTER="$2"
+                shift
+                ;;
+            -n|--node)
+                NODE="$2"
+                shift
+                ;;
+            -a|--app)
+                APP="$2"
+                shift
+                ;;
+            -t|--type)
+                TYPE="$2"
+                shift
+                ;;
+            *)
+                echo "ERROR: unknown option \"$key\""
+                echo
+                usage_downgrade_node
+                exit -1
+                ;;
+        esac
+        shift
+    done
+
+    if [ "$CLUSTER" == "" ]; then
+        echo "ERROR: no cluster specified"
+        echo
+        usage_downgrade_node
+        exit -1
+    fi
+
+    if [ "$NODE" == "" ]; then
+        echo "ERROR: no node specified"
+        echo
+        usage_downgrade_node
+        exit -1
+    fi
+
+    if [ "$TYPE" != "test" -a "$TYPE" != "run" ]; then
+        echo "ERROR: invalid type $TYPE"
+        echo
+        usage_downgrade_node
+        exit -1
+    fi
+
+    echo "CLUSTER=$CLUSTER"
+    echo "NODE=$NODE"
+    echo "APP=$APP"
+    echo "TYPE=$TYPE"
+    echo
+    cd ${ROOT}
+    echo "------------------------------"
+    ./scripts/downgrade_node.sh $CLUSTER $NODE "$APP" $TYPE
+    echo "------------------------------"
+    echo
+    if [ "$TYPE" == "test" ]; then
+        echo "The above is sample downgrade commands."
+        echo "Run with option '-t run' to do migration actually."
+    else
+        echo "Done."
+        echo "You can run shell command 'nodes -d' to check the result."
+        echo
+        echo "The cluster's auto migration is disabled now, you can run shell command 'set_meta_level lively' to enable it again."
+    fi
+}
+
 ####################################################################
 
 if [ $# -eq 0 ]; then
@@ -1358,6 +1453,10 @@ case $cmd in
         shift
         run_migrate_node $*
         ;;
+    downgrade_node)
+        shift
+        run_downgrade_node $*
+        ;;
     test)
         shift
         run_test $*
diff --git a/scripts/downgrade_node.sh b/scripts/downgrade_node.sh
new file mode 100755
index 0000000..2104e77
--- /dev/null
+++ b/scripts/downgrade_node.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+
+if [ $# -ne 4 ]
+then
+    echo "This tool is for downgrading replicas of specified node."
+    echo "USAGE: $0 <cluster-meta-list> <node> <app-name> <run|test>"
+    echo "       app-name = * means migrate all apps"
+    exit -1
+fi
+
+pwd="$( cd "$( dirname "$0" )" && pwd )"
+shell_dir="$( cd $pwd/.. && pwd )"
+cd $shell_dir
+
+cluster=$1
+node=$2
+app_name=$3
+type=$4
+
+if [ "$type" != "run" -a "$type" != "test" ]
+then
+    echo "ERROR: invalid type: $type"
+    echo "USAGE: $0 <cluster-meta-list> <node> <app-name> <run|test>"
+    exit -1
+fi
+
+echo "set_meta_level steady" | ./run.sh shell --cluster $cluster &>/tmp/pegasus.set_meta_level
+
+echo ls | ./run.sh shell --cluster $cluster &>/tmp/pegasus.ls
+
+while read app_line
+do
+    status=`echo $app_line | awk '{print $2}'`
+    if [ "$status" = "AVAILABLE" ]
+    then
+        gid=`echo $app_line | awk '{print $1}'`
+        app=`echo $app_line | awk '{print $3}'`
+        if [ "$app_name" != "*" -a "$app_name" != "$app" ]
+        then
+            continue
+        fi
+
+        echo "app $app -d" | ./run.sh shell --cluster $cluster &>/tmp/pegasus.app.$app
+
+        while read line
+        do
+            sec=`echo $line | awk '{print $5}' | grep -o '\[.*\]' | grep -o '[0-9.:,]*'`
+            if echo $sec | grep -q "$node"
+            then
+                pid=`echo $line | awk '{print $1}'`
+                pri=`echo $line | awk '{print $4}'`
+                if [ "$pri" = "" ]
+                then
+                    echo "ERROR: can't downgrade ${gid}.${pid} because it is unhealthy"
+                    exit -1
+                fi
+                if [ "$pri" = "$node" ]
+                then
+                    echo "ERROR: can't downgrade ${gid}.${pid} because $node is primary"
+                    exit -1
+                fi
+                if echo $sec | grep -v -q ','
+                then
+                    echo "ERROR: can't downgrade ${gid}.${pid} because it is unhealthy"
+                    exit -1
+                fi
+                echo "propose --gpid ${gid}.${pid} --type DOWNGRADE_TO_INACTIVE -t $pri -n $node"
+            fi
+        done </tmp/pegasus.app.$app >/tmp/pegasus.cmd.$app
+
+        if [ "$type" = "run" ]
+        then
+            cat /tmp/pegasus.cmd.$app | ./run.sh shell --cluster $cluster 2>/dev/null
+            echo
+            echo
+        else
+            cat /tmp/pegasus.cmd.$app
+        fi
+    fi
+done </tmp/pegasus.ls
+
diff --git a/scripts/migrate_node.sh b/scripts/migrate_node.sh
index aaaaaaa..bbbbbbb 100755
--- a/scripts/migrate_node.sh
+++ b/scripts/migrate_node.sh
@@ -4,7 +4,7 @@
 if [ $# -ne 3 ]
 then
     echo "This tool is for migrating primary replicas out of specified node."
-    echo "USAGE: $0 <cluster-meta-list> <node> <test|run>"
+    echo "USAGE: $0 <cluster-meta-list> <node> <run|test>"
     exit -1
 fi
 
diff --git a/scripts/pack_server.sh b/scripts/pack_server.sh
index 03ee98b..cba78d1 100755
--- a/scripts/pack_server.sh
+++ b/scripts/pack_server.sh
@@ -112,6 +112,7 @@
 cp -v ./DSN_ROOT/bin/pegasus_server/pegasus_server ${pack}/bin
 cp -v ./DSN_ROOT/bin/pegasus_rproxy/pegasus_rproxy ${pack}/bin
 cp -v ./DSN_ROOT/lib/libdsn_meta_server.so ${pack}/bin
 cp -v ./DSN_ROOT/lib/libdsn_layer2_stateful_type1.so ${pack}/bin
+cp -v ./rdsn/thirdparty/output/lib/libPoco*.so.48 ${pack}/bin
 cp -v ./rdsn/scripts/linux/learn_stat.py ${pack}/bin
 cp -v ./scripts/sendmail.sh ${pack}/bin
diff --git a/scripts/pegasus_rolling_update.sh b/scripts/pegasus_rolling_update.sh
index 80656cb..7956191 100755
--- a/scripts/pegasus_rolling_update.sh
+++ b/scripts/pegasus_rolling_update.sh
@@ -111,6 +111,7 @@ do
     echo "Migrating primary replicas out of node..."
     ./run.sh migrate_node -c $meta_list -n $node -t run &>/tmp/pegasus.rolling_update.migrate_node
     echo "Wait [$node] to migrate done..."
+    echo "Refer to /tmp/pegasus.rolling_update.migrate_node for details"
     while true
     do
         pri_count=`echo 'nodes -d' | ./run.sh shell --cluster $meta_list | grep $node | awk '{print $4}'`
@@ -118,10 +119,30 @@ do
             echo "Migrate done."
             break
         else
+            echo "Still $pri_count primary replicas left on $node"
             sleep 1
         fi
-    done 
+    done
     echo
+    sleep 3
+
+    echo "Downgrading replicas on node..."
+    ./run.sh downgrade_node -c $meta_list -n $node -t run &>/tmp/pegasus.rolling_update.downgrade_node
+    echo "Wait [$node] to downgrade done..."
+    echo "Refer to /tmp/pegasus.rolling_update.downgrade_node for details"
+    while true
+    do
+        rep_count=`echo 'nodes -d' | ./run.sh shell --cluster $meta_list | grep $node | awk '{print $3}'`
+        if [ $rep_count -eq 0 ]; then
+            echo "Downgrade done."
+            break
+        else
+            echo "Still $rep_count replicas left on $node"
+            sleep 1
+        fi
+    done
+    echo
+    sleep 3
 
     echo "Rolling update by minos..."
     cd $minos_client_dir
@@ -145,7 +166,7 @@ do
         else
             sleep 1
         fi
-    done 
+    done
     echo
 
     echo "Wait cluster to become healthy..."
@@ -159,7 +180,7 @@ do
         else
             sleep 1
         fi
-    done 
+    done
 
     echo "Sleep done."
     echo
diff --git a/scripts/pegasus_stat_available.sh b/scripts/pegasus_stat_available.sh
index 9e32b82..10c153c 100755
--- a/scripts/pegasus_stat_available.sh
+++ b/scripts/pegasus_stat_available.sh
@@ -30,7 +30,7 @@ fi
 
 result_file="pegasus.stat_available.scan_result"
 tmp_file="/tmp/pegasus.stat_available.scan.$UID"
-echo -e "use $detect_table\nscan detect_available_day -o $result_file" | ./run.sh shell -n $cluster &>$tmp_file
+echo -e "use $detect_table\nhash_scan detect_available_day '' '' -s prefix -y \"$filter\" -o $result_file" | ./run.sh shell -n $cluster &>$tmp_file
 scan_ok=`grep 'key-value pairs got' $tmp_file | wc -l`
 if [ $scan_ok -ne 1 ]; then
     echo "ERROR: scan detect table failed, refer error to $tmp_file"
@@ -38,13 +38,13 @@ if [ $scan_ok -ne 1 ]; then
     exit -1
 fi
 
-days=`grep $filter $result_file | wc -l`
+days=`cat $result_file | wc -l`
 if [ $days -eq 0 ]; then
     echo "ERROR: no detect data found for filter \"$filter\", refer to $tmp_file"
     rm -f $result_file
     exit -1
 fi
 
-available=`grep $filter $result_file | grep -o '[0-9]*,[0-9]*,[0-9]*' | awk -F, '{a+=$1;b+=$2}END{printf("%f\n",(double)b/a);}'`
+available=`cat $result_file | grep -o '[0-9]*,[0-9]*,[0-9]*' | awk -F, '{a+=$1;b+=$2}END{printf("%f\n",(double)b/a);}'`
 rm -f $result_file
 echo "$cluster $filter $days $available"
-- 
GitLab