collector: do not send email when email address is not provided; start...

collector: do not send email when email address is not provided; start collector in onebox; improve config to support collector (#70)

collector: do not send email when email address is not provided; start...
collector: do not send email when email address is not provided; start collector in onebox; improve config to support collector (#70)
d31c1431 · QinZuoyan · GitHub · 5e2a47ee · 14d8ba6e · 343e2e72
6 changed file
--- a/rdsn @ 14d8ba6e
+++ b/rdsn @ 14d8ba6e
-Subproject commit 343e2e72e4a7110e39882a93fe066fc4b6feb513
+Subproject commit 14d8ba6e0c444cf4694631b70d3087a1ddda213e
--- a/run.sh
+++ b/run.sh
@@ -437,7 +437,7 @@ function run_start_onebox()
        echo "ERROR: file ${SERVER_PATH}/pegasus_server not exist"
        exit -1
    fi
-    if ps -ef | grep ' /pegasus_server config.ini' | grep -E 'app_list meta|app_list replica'; then
+    if ps -ef | grep ' /pegasus_server config.ini' | grep -E 'app_list meta|app_list replica|app_list collector'; then
        echo "ERROR: some onebox processes are running, start failed"
        exit -1
    fi
@@ -504,6 +504,15 @@ function run_start_onebox()
        ps -ef | grep '/pegasus_server config.ini' | grep "\<$PID\>"
        cd ..
    done
+    mkdir -p collector
+    cd collector
+    ln -s -f ${SERVER_PATH}/pegasus_server pegasus_server
+    sed "s/@META_PORT@/34600/;s/@REPLICA_PORT@/34800/" ${ROOT}/config-server.ini >config.ini
+    echo "cd `pwd` && $PWD/pegasus_server config.ini -app_list collector &>result &"
+    $PWD/pegasus_server config.ini -app_list collector &>result &
+    PID=$!
+    ps -ef | grep '/pegasus_server config.ini' | grep "\<$PID\>"
+    cd ..
 }

 #####################
@@ -533,7 +542,7 @@ function run_stop_onebox()
        esac
        shift
    done
-    ps -ef | grep '/pegasus_server config.ini' | grep -E 'app_list meta|app_list replica' | awk '{print $2}' | xargs kill &>/dev/null
+    ps -ef | grep '/pegasus_server config.ini' | grep -E 'app_list meta|app_list replica|app_list collector' | awk '{print $2}' | xargs kill &>/dev/null
 }

 #####################
@@ -563,7 +572,7 @@ function run_list_onebox()
        esac
        shift
    done
-    ps -ef | grep '/pegasus_server config.ini' | grep -E 'app_list meta|app_list replica' | sort -k11
+    ps -ef | grep '/pegasus_server config.ini' | grep -E 'app_list meta|app_list replica|app_list collector' | sort -k11
 }

 #####################
@@ -604,17 +613,17 @@ function run_clear_onebox()
 function usage_start_onebox_instance()
 {
    echo "Options for subcommand 'start_onebox_instance':"
-    echo "   -h|--help         print the help info"
-    echo "   -m|--meta_id <num>"
-    echo "                     meta server id"
-    echo "   -r|--replica_id <num>"
-    echo "                     replica server id"
+    echo "   -h|--help              print the help info"
+    echo "   -m|--meta_id <num>     start meta server with id"
+    echo "   -r|--replica_id <num>  start replica server with id"
+    echo "   -c|--collector         start collector server"
 }

 function run_start_onebox_instance()
 {
    META_ID=0
    REPLICA_ID=0
+    COLLECTOR_ID=0
    while [[ $# > 0 ]]; do
        key="$1"
        case $key in
@@ -630,6 +639,9 @@ function run_start_onebox_instance()
                REPLICA_ID="$2"
                shift
                ;;
+            -c|--collector)
+                COLLECTOR_ID=1
+                ;;
            *)
                echo "ERROR: unknown option \"$key\""
                echo
@@ -639,12 +651,8 @@ function run_start_onebox_instance()
        esac
        shift
    done
-    if [ $META_ID = "0" -a $REPLICA_ID = "0" ]; then
-        echo "ERROR: no meta_id or replica_id set"
-        exit -1
-    fi
-    if [ $META_ID != "0" -a $REPLICA_ID != "0" ]; then
-        echo "ERROR: meta_id and replica_id can only set one"
+    if [ $META_ID = "0" -a $REPLICA_ID = "0" -a $COLLECTOR_ID = "0" ]; then
+        echo "ERROR: no meta_id or replica_id or collector set"
        exit -1
    fi
    if [ $META_ID != "0" ]; then
@@ -683,6 +691,24 @@ function run_start_onebox_instance()
        cd ..
        echo "INFO: replica@$REPLICA_ID started"
    fi
+    if [ $COLLECTOR_ID != "0" ]; then
+        dir=onebox/collector
+        if [ ! -d $dir ]; then
+            echo "ERROR: collector dir $dir not exist"
+            exit -1
+        fi
+        if ps -ef | grep "/collector/pegasus_server config.ini" | grep "app_list collector" ; then
+            echo "INFO: collector already running"
+            exit -1
+        fi
+        cd $dir
+        echo "cd `pwd` && $PWD/pegasus_server config.ini -app_list collector &>result &"
+        $PWD/pegasus_server config.ini -app_list collector &>result &
+        PID=$!
+        ps -ef | grep '/pegasus_server config.ini' | grep "\<$PID\>"
+        cd ..
+        echo "INFO: collector started"
+    fi
 }

 #####################
@@ -691,17 +717,17 @@ function run_start_onebox_instance()
 function usage_stop_onebox_instance()
 {
    echo "Options for subcommand 'stop_onebox_instance':"
-    echo "   -h|--help         print the help info"
-    echo "   -m|--meta_id <num>"
-    echo "                     meta server id"
-    echo "   -r|--replica_id <num>"
-    echo "                     replica server id"
+    echo "   -h|--help              print the help info"
+    echo "   -m|--meta_id <num>     stop meta server with id"
+    echo "   -r|--replica_id <num>  stop replica server with id"
+    echo "   -c|--collector         stop collector server"
 }

 function run_stop_onebox_instance()
 {
    META_ID=0
    REPLICA_ID=0
+    COLLECTOR_ID=0
    while [[ $# > 0 ]]; do
        key="$1"
        case $key in
@@ -717,6 +743,9 @@ function run_stop_onebox_instance()
                REPLICA_ID="$2"
                shift
                ;;
+            -c|--collector)
+                COLLECTOR_ID=1
+                ;;
            *)
                echo "ERROR: unknown option \"$key\""
                echo
@@ -726,12 +755,8 @@ function run_stop_onebox_instance()
        esac
        shift
    done
-    if [ $META_ID = "0" -a $REPLICA_ID = "0" ]; then
-        echo "ERROR: no meta_id or replica_id set"
-        exit -1
-    fi
-    if [ $META_ID != "0" -a $REPLICA_ID != "0" ]; then
-        echo "ERROR: meta_id and replica_id can only set one"
+    if [ $META_ID = "0" -a $REPLICA_ID = "0" -a $COLLECTOR_ID = "0" ]; then
+        echo "ERROR: no meta_id or replica_id or collector set"
        exit -1
    fi
    if [ $META_ID != "0" ]; then
@@ -760,6 +785,14 @@ function run_stop_onebox_instance()
        ps -ef | grep "/replica$REPLICA_ID/pegasus_server config.ini" | grep "app_list replica" | awk '{print $2}' | xargs kill &>/dev/null
        echo "INFO: replica@$REPLICA_ID stopped"
    fi
+    if [ $COLLECTOR_ID != "0" ]; then
+        if ! ps -ef | grep "/collector/pegasus_server config.ini" | grep "app_list collector" ; then
+            echo "INFO: collector is not running"
+            exit -1
+        fi
+        ps -ef | grep "/collector/pegasus_server config.ini" | grep "app_list collector" | awk '{print $2}' | xargs kill &>/dev/null
+        echo "INFO: collector stopped"
+    fi
 }

 #####################
@@ -768,19 +801,18 @@ function run_stop_onebox_instance()
 function usage_restart_onebox_instance()
 {
    echo "Options for subcommand 'restart_onebox_instance':"
-    echo "   -h|--help         print the help info"
-    echo "   -m|--meta_id <num>"
-    echo "                     meta server id"
-    echo "   -r|--replica_id <num>"
-    echo "                     replica server id"
-    echo "   -s|--sleep <num>"
-    echo "                     sleep time in seconds between stop and start, default is 1"
+    echo "   -h|--help              print the help info"
+    echo "   -m|--meta_id <num>     restart meta server with id"
+    echo "   -r|--replica_id <num>  restart replica server with id"
+    echo "   -c|--collector         restart collector server"
+    echo "   -s|--sleep <num>       sleep time in seconds between stop and start, default is 1"
 }

 function run_restart_onebox_instance()
 {
    META_ID=0
    REPLICA_ID=0
+    COLLECTOR_OPT=""
    SLEEP=1
    while [[ $# > 0 ]]; do
        key="$1"
@@ -797,6 +829,10 @@ function run_restart_onebox_instance()
                REPLICA_ID="$2"
                shift
                ;;
+            -c|--collector)
+                COLLECTOR_OPT="-c"
+                shift
+                ;;
            -s|--sleep)
                SLEEP="$2"
                shift
@@ -810,18 +846,14 @@ function run_restart_onebox_instance()
        esac
        shift
    done
-    if [ $META_ID = "0" -a $REPLICA_ID = "0" ]; then
-        echo "ERROR: no meta_id or replica_id set"
-        exit -1
-    fi
-    if [ $META_ID != "0" -a $REPLICA_ID != "0" ]; then
-        echo "ERROR: meta_id and replica_id can only set one"
+    if [ $META_ID = "0" -a $REPLICA_ID = "0" -a "x$COLLECTOR_OPT" = "x" ]; then
+        echo "ERROR: no meta_id or replica_id or collector set"
        exit -1
    fi
-    run_stop_onebox_instance -m $META_ID -r $REPLICA_ID
+    run_stop_onebox_instance -m $META_ID -r $REPLICA_ID $COLLECTOR_OPT
    echo "sleep $SLEEP"
    sleep $SLEEP
-    run_start_onebox_instance -m $META_ID -r $REPLICA_ID
+    run_start_onebox_instance -m $META_ID -r $REPLICA_ID $COLLECTOR_OPT
 }

 #####################

--- a/src/server/available_detector.cpp
+++ b/src/server/available_detector.cpp
@@ -36,11 +36,11 @@ available_detector::available_detector()
                                                    ".",
                                                    "available detect alert script dir");
    dassert(_alert_script_dir.size() > 0, "");
-    _alert_email_address = dsn_config_get_value_string("pegasus.collector",
-                                                       "available_detect_alert_email_address",
-                                                       "",
-                                                       "available detect alert email address");
-    dassert(_alert_email_address.size() > 0, "");
+    _alert_email_address = dsn_config_get_value_string(
+        "pegasus.collector",
+        "available_detect_alert_email_address",
+        "",
+        "available detect alert email address, empty means not send email");
    _meta_list.clear();
    dsn::replication::replica_helper::load_meta_servers(_meta_list);
    dassert(_meta_list.size() > 0, "");
@@ -59,17 +59,20 @@ available_detector::available_detector()
                                              1000, // unit is millisecond,default is 1s = 1000ms
                                              "available detect timeout");
    // initialize the _client.
-    if (!pegasus_client_factory::initialize(nullptr))
+    if (!pegasus_client_factory::initialize(nullptr)) {
        dassert(false, "Initialize the pegasus client failed");
+    }
    _client = pegasus_client_factory::get_client(_cluster_name.c_str(), _app_name.c_str());
    dassert(_client != nullptr, "Initialize the _client failed");
    _ddl_client.reset(new replication_ddl_client(_meta_list));
    dassert(_ddl_client != nullptr, "Initialize the _ddl_client failed");
-    _send_alert_email_cmd = "cd " + _alert_script_dir + "; bash sendmail.sh alert " +
-                            _alert_email_address + " " + _cluster_name + " " + _app_name + " ";
-    _send_availability_info_email_cmd = "cd " + _alert_script_dir +
-                                        "; bash sendmail.sh availability_info " +
-                                        _alert_email_address + " " + _cluster_name + " ";
+    if (!_alert_email_address.empty()) {
+        _send_alert_email_cmd = "cd " + _alert_script_dir + "; bash sendmail.sh alert " +
+                                _alert_email_address + " " + _cluster_name + " " + _app_name + " ";
+        _send_availability_info_email_cmd = "cd " + _alert_script_dir +
+                                            "; bash sendmail.sh availability_info " +
+                                            _alert_email_address + " " + _cluster_name + " ";
+    }

    _pfc_detect_times_day.init_app_counter("app.pegasus",
                                           "cluster.available.detect.times.day",
@@ -130,7 +133,7 @@ void available_detector::start()
                                nullptr,
                                std::move(std::bind(&available_detector::detect_available, this)),
                                0,
-                                std::chrono::seconds(60));
+                                std::chrono::minutes(1));
    report_availability_info();
 }

@@ -151,7 +154,13 @@ void available_detector::stop()
 void available_detector::detect_available()
 {
    if (!generate_hash_keys()) {
-        dassert(false, "Initialize hash_keys failed and can't detect the available");
+        derror("initialize hash_keys failed, do not detect available, retry after 60 seconds");
+        _detect_timer = ::dsn::tasking::enqueue(
+            LPC_DETECT_AVAILABLE,
+            nullptr,
+            std::move(std::bind(&available_detector::detect_available, this)),
+            0,
+            std::chrono::minutes(1));
        return;
    }
    _detect_tasks.clear();
@@ -259,12 +268,13 @@ bool available_detector::generate_hash_keys()
 void available_detector::on_detect(int32_t idx)
 {
    if (idx == 0) {
-        ddebug("detecting table[%s] with id[%d] on the cluster[%s], "
+        ddebug("detecting table[%s] with app_id[%d] and partition_count[%d] on cluster[%s], "
               "recent_day_detect_times(%" PRId64 "), recent_day_fail_times(%" PRId64 "), "
               "recent_hour_detect_times(%" PRId64 "), recent_hour_fail_times(%" PRId64 ") "
               "recent_minute_detect_times(%" PRId64 "), recent_minute_fail_times(%" PRId64 ")",
               _app_name.c_str(),
               _app_id,
+               _partition_count,
               _cluster_name.c_str(),
               _recent_day_detect_times.load(),
               _recent_day_fail_times.load(),
@@ -284,50 +294,30 @@ void available_detector::on_detect(int32_t idx)
    _recent_day_detect_times.fetch_add(1);
    _recent_hour_detect_times.fetch_add(1);
    _recent_minute_detect_times.fetch_add(1);
+
    // define async_get callback function.
-    auto async_get_callback = [this, idx](
-        int err, std::string &&_value, pegasus_client::internal_info &&info) {
-        std::atomic<int> &cnt = (*_fail_count[idx]);
-        if (err != PERR_OK) {
-            int prev = cnt.fetch_add(1);
-            _recent_day_fail_times.fetch_add(1);
-            _recent_hour_fail_times.fetch_add(1);
-            _recent_minute_fail_times.fetch_add(1);
-            derror("async_get partition[%d] fail, fail_count = %d, hash_key = %s, error = %s",
-                   idx,
-                   prev + 1,
-                   _hash_keys[idx].c_str(),
-                   _client->get_error_string(err));
-            // check and send email.
-            bool send_email = false;
-            if (cnt.load() >= _alert_fail_count) {
-                ::dsn::utils::auto_lock<::dsn::utils::ex_lock_nr> l(_alert_lock);
-                if (cnt.load() >= _alert_fail_count) {
-                    for (auto i = 0; i < _partition_count; i++) {
-                        std::atomic<int> &c = (*_fail_count[i]);
-                        c.store(0);
-                    }
-                    send_email = true;
-                }
-            }
-            if (send_email) {
-                ddebug("start to send alert email, partition_index = %d", idx);
-                int r = system((_send_alert_email_cmd + std::to_string(idx)).c_str());
-                if (r == 0) {
-                    ddebug("send alert email done, partition_index = %d", idx);
-                } else {
-                    derror(
-                        "send alert email fail, partition_index = %d, command_return = %d", idx, r);
-                }
+    auto async_get_callback =
+        [this, idx](int err, std::string &&_value, pegasus_client::internal_info &&info) {
+            std::atomic<int> &cnt = (*_fail_count[idx]);
+            if (err != PERR_OK) {
+                int prev = cnt.fetch_add(1);
+                _recent_day_fail_times.fetch_add(1);
+                _recent_hour_fail_times.fetch_add(1);
+                _recent_minute_fail_times.fetch_add(1);
+                derror("async_get partition[%d] fail, fail_count = %d, hash_key = %s, error = %s",
+                       idx,
+                       prev + 1,
+                       _hash_keys[idx].c_str(),
+                       _client->get_error_string(err));
+                check_and_send_email(&cnt, idx);
+            } else {
+                cnt.store(0);
+                dinfo("async_get partition[%d] ok, hash_key = %s, value = %s",
+                      idx,
+                      _hash_keys[idx].c_str(),
+                      _value.c_str());
            }
-        } else {
-            cnt.store(0);
-            dinfo("async_get partition[%d] ok, hash_key = %s, value = %s",
-                  idx,
-                  _hash_keys[idx].c_str(),
-                  _value.c_str());
-        }
-    };
+        };

    // define async_set callback function.
    auto async_set_callback =
@@ -345,28 +335,7 @@ void available_detector::on_detect(int32_t idx)
                   prev + 1,
                   _hash_keys[idx].c_str(),
                   _client->get_error_string(err));
-            // check and send email.
-            bool send_email = false;
-            if (cnt.load() >= _alert_fail_count) {
-                ::dsn::utils::auto_lock<::dsn::utils::ex_lock_nr> l(_alert_lock);
-                if (cnt.load() >= _alert_fail_count) {
-                    for (auto i = 0; i < _partition_count; i++) {
-                        std::atomic<int> &c = (*_fail_count[i]);
-                        c.store(0);
-                    }
-                    send_email = true;
-                }
-            }
-            if (send_email) {
-                ddebug("start to send alert email, partition_index = %d", idx);
-                int r = system((_send_alert_email_cmd + std::to_string(idx)).c_str());
-                if (r == 0) {
-                    ddebug("send alert email done, partition_index = %d", idx);
-                } else {
-                    derror(
-                        "send alert email fail, partition_index = %d, command_return = %d", idx, r);
-                }
-            }
+            check_and_send_email(&cnt, idx);
        } else {
            dinfo("async_set partition[%d] ok, hash_key = %s", idx, _hash_keys[idx].c_str());
            _client->async_get(
@@ -377,16 +346,49 @@ void available_detector::on_detect(int32_t idx)
    _client->async_set(_hash_keys[idx], "", value, std::move(async_set_callback), _detect_timeout);
 }

+void available_detector::check_and_send_email(std::atomic<int> *cnt, int32_t idx)
+{
+    bool send_email = false;
+    if (cnt->load() >= _alert_fail_count) {
+        ::dsn::utils::auto_lock<::dsn::utils::ex_lock_nr> l(_alert_lock);
+        if (cnt->load() >= _alert_fail_count) {
+            for (auto i = 0; i < _partition_count; i++) {
+                std::atomic<int> &c = (*_fail_count[i]);
+                c.store(0);
+            }
+            send_email = true;
+        }
+    }
+    if (send_email) {
+        ddebug("start to send alert email, partition_index = %d", idx);
+        if (_send_alert_email_cmd.empty()) {
+            ddebug("ignore sending alert email because email address is not set, "
+                   "partition_index = %d",
+                   idx);
+        } else {
+            int r = system((_send_alert_email_cmd + std::to_string(idx)).c_str());
+            if (r == 0) {
+                ddebug("send alert email done, partition_index = %d", idx);
+            } else {
+                derror("send alert email failed, partition_index = %d, "
+                       "command_return = %d",
+                       idx,
+                       r);
+            }
+        }
+    }
+}
+
 void available_detector::on_day_report()
 {
    ddebug("start to report on new day, last_day = %s", _old_day.c_str());
    int64_t detect_times = _recent_day_detect_times.fetch_and(0);
    int64_t fail_times = _recent_day_fail_times.fetch_and(0);
    int64_t succ_times = std::max(0L, detect_times - fail_times);
-    int64_t available = 1000000;
+    int64_t available = 0;
    std::string hash_key("detect_available_day");
    std::string sort_key(_old_day);
-    std::string value("0,0,1000000");
+    std::string value("0,0,0");
    if (detect_times > 0) {
        available = (int64_t)((double)succ_times / detect_times * 1000000);
        std::ostringstream oss;
@@ -399,25 +401,35 @@ void available_detector::on_day_report()
    _pfc_available_day->set(available);

    ddebug("start to send availability email, date = %s", _old_day.c_str());
-    int r = system((_send_availability_info_email_cmd + std::to_string(detect_times) + " " +
-                    std::to_string(fail_times) + " " + _old_day)
-                       .c_str());
-    if (r == 0) {
-        ddebug("send availability email done, date = %s, total_detect_times = %u, total_fail_times "
-               "= %u",
+    if (_send_availability_info_email_cmd.empty()) {
+        ddebug("ignore sending availability email because email address is not set, "
+               "date = %s, total_detect_times = %u, total_fail_times = %u",
               _old_day.c_str(),
               detect_times,
               fail_times);
    } else {
-        derror("send availability email fail, date = %s, total_detect_times = %u, total_fail_times "
-               "= %u, command_return = %d",
-               _old_day.c_str(),
-               detect_times,
-               fail_times,
-               r);
+        int r = system((_send_availability_info_email_cmd + std::to_string(detect_times) + " " +
+                        std::to_string(fail_times) + " " + _old_day)
+                           .c_str());
+        if (r == 0) {
+            ddebug("send availability email done, date = %s, "
+                   "total_detect_times = %u, total_fail_times = %u",
+                   _old_day.c_str(),
+                   detect_times,
+                   fail_times);
+        } else {
+            derror("send availability email fail, date = %s, "
+                   "total_detect_times = %u, total_fail_times = %u, command_return = %d",
+                   _old_day.c_str(),
+                   detect_times,
+                   fail_times,
+                   r);
+        }
    }

-    set_detect_result(hash_key, sort_key, value, 3);
+    // set try_count to 3000 (keep on retrying for 3000 minutes) to avoid losting detect result
+    // if the result table is also unavailable for a long time.
+    set_detect_result(hash_key, sort_key, value, 3000);
 }

 void available_detector::on_hour_report()
@@ -426,10 +438,10 @@ void available_detector::on_hour_report()
    int64_t detect_times = _recent_hour_detect_times.fetch_and(0);
    int64_t fail_times = _recent_hour_fail_times.fetch_and(0);
    int64_t succ_times = std::max(0L, detect_times - fail_times);
-    int64_t available = 1000000;
+    int64_t available = 0;
    std::string hash_key("detect_available_hour");
    std::string sort_key(_old_hour);
-    std::string value("0,0,1000000");
+    std::string value("0,0,0");
    if (detect_times > 0) {
        available = (int64_t)((double)succ_times / detect_times * 1000000);
        std::ostringstream oss;
@@ -441,7 +453,9 @@ void available_detector::on_hour_report()
    _pfc_fail_times_hour->set(fail_times);
    _pfc_available_hour->set(available);

-    set_detect_result(hash_key, sort_key, value, 3);
+    // set try_count to 3000 (keep on retrying for 3000 minutes) to avoid losting detect result
+    // if the result table is also unavailable for a long time.
+    set_detect_result(hash_key, sort_key, value, 3000);
 }

 void available_detector::on_minute_report()
@@ -450,10 +464,10 @@ void available_detector::on_minute_report()
    int64_t detect_times = _recent_minute_detect_times.fetch_and(0);
    int64_t fail_times = _recent_minute_fail_times.fetch_and(0);
    int64_t succ_times = std::max(0L, detect_times - fail_times);
-    int64_t available = 1000000;
+    int64_t available = 0;
    std::string hash_key("detect_available_minute");
    std::string sort_key(_old_minute);
-    std::string value("0,0,1000000");
+    std::string value("0,0,0");
    if (detect_times > 0) {
        available = (int64_t)((double)succ_times / detect_times * 1000000);
        std::ostringstream oss;
@@ -465,7 +479,9 @@ void available_detector::on_minute_report()
    _pfc_fail_times_minute->set(fail_times);
    _pfc_available_minute->set(available);

-    set_detect_result(hash_key, sort_key, value, 3);
+    // set try_count to 3000 (keep on retrying for 3000 minutes) to avoid losting detect result
+    // if the result table is also unavailable for a long time.
+    set_detect_result(hash_key, sort_key, value, 3000);
 }

 void available_detector::set_detect_result(const std::string &hash_key,
@@ -483,7 +499,7 @@ void available_detector::set_detect_result(const std::string &hash_key,
                int new_try_count = try_count - 1;
                if (new_try_count > 0) {
                    derror("set_detect_result fail, hash_key = %s, sort_key = %s, value = %s, "
-                           "error = %s, try_count = %d, try again after 1 minute",
+                           "error = %s, left_try_count = %d, try again after 1 minute",
                           hash_key.c_str(),
                           sort_key.c_str(),
                           value.c_str(),
@@ -499,7 +515,7 @@ void available_detector::set_detect_result(const std::string &hash_key,
                                            std::chrono::minutes(1));
                } else {
                    derror("set_detect_result fail, hash_key = %s, sort_key = %s, value = %s, "
-                           "error = %s, try_count = %d, do not try again",
+                           "error = %s, left_try_count = %d, do not try again",
                           hash_key.c_str(),
                           sort_key.c_str(),
                           value.c_str(),

--- a/src/server/available_detector.h
+++ b/src/server/available_detector.h
@@ -32,6 +32,7 @@ private:
    // generate hash_keys that can access every partition.
    bool generate_hash_keys();
    void on_detect(int32_t idx);
+    void check_and_send_email(std::atomic<int> *cnt, int32_t idx);
    void detect_available();
    void report_availability_info();


--- a/src/server/config-server.ini
+++ b/src/server/config-server.ini
@@ -23,6 +23,15 @@ pools = THREAD_POOL_DEFAULT,THREAD_POOL_REPLICATION_LONG,THREAD_POOL_REPLICATION
 run = true
 count = 1

+[apps.collector]
+name = collector
+type = collector
+arguments =
+ports = 34100
+pools = THREAD_POOL_DEFAULT,THREAD_POOL_REPLICATION
+run = true
+count = 1
+
 [core]
 ;tool = simulator
 ;tool = fastrun
@@ -157,7 +166,7 @@ perf_test_concurrent = true
 perf_test_concurrent_count = 20

 [meta_server]
-server_list = @LOCAL_IP@:34601, @LOCAL_IP@:34602, @LOCAL_IP@:34603
+server_list = @LOCAL_IP@:34601,@LOCAL_IP@:34602,@LOCAL_IP@:34603
 cluster_root = /pegasus/onebox/@LOCAL_IP@
 meta_state_service_type = meta_state_service_zookeeper
 meta_state_service_parameters =
@@ -187,7 +196,7 @@ stateful = true
 package_id = 

 [replication]
-data_dirs_black_list_file = /home/mi/.pegasus_data_dirs_black_list
+data_dirs_black_list_file = ./.pegasus_data_dirs_black_list

 deny_client_on_start = false
 delay_for_fd_timeout_on_start = false
@@ -261,6 +270,20 @@ falcon_host = 127.0.0.1
 falcon_port = 1988
 falcon_path = /v1/push

+[pegasus.collector]
+cluster = onebox
+available_detect_app = @APP_NAME@
+available_detect_alert_script_dir = ./package/bin
+available_detect_alert_email_address =
+available_detect_interval_seconds = 3
+available_detect_alert_fail_count = 30
+available_detect_timeout = 5000
+app_stat_interval_seconds = 10
+
+[uri-resolver.dsn://onebox]
+factory = partition_resolver_simple
+arguments = @LOCAL_IP@:34601,@LOCAL_IP@:34602,@LOCAL_IP@:34603
+
 [components.pegasus_perf_counter_number_percentile_atomic]
 counter_computation_interval_seconds = 10


--- a/src/server/config.ini
+++ b/src/server/config.ini
@@ -20,6 +20,15 @@
  run = true
  count = 1

+[apps.collector]
+  name = collector
+  type = collector
+  arguments =
+  ports = 34101
+  pools = THREAD_POOL_DEFAULT,THREAD_POOL_REPLICATION
+  run = true
+  count = 1
+
 [apps.mimic]
  name = mimic
  type = dsn.app.mimic
@@ -29,15 +38,6 @@
  count = 1
  delay_seconds = 30

-[apps.collector]
-  name = collector
-  type = collector
-  arguments =
-  ports = 34101
-  pools = THREAD_POOL_DEFAULT, THREAD_POOL_REPLICATION
-  run = true
-  count = 1
-
 [core]
  data_dir = %{app.dir}

@@ -80,7 +80,7 @@
  max_file_copy_request_count_per_file = 10

 [network]
-  primary_interface = %{network.interface}
+  primary_interface =
  io_service_worker_count = 4

 [threadpool..default]
@@ -291,12 +291,16 @@
  cluster = %{cluster.name}
  available_detect_app = temp
  available_detect_alert_script_dir = ./package/bin
-  available_detect_alert_email_address = %{email.address}
+  available_detect_alert_email_address =
  available_detect_interval_seconds = 3
  available_detect_alert_fail_count = 30
  available_detect_timeout = 5000
  app_stat_interval_seconds = 10

+[uri-resolver.dsn://%{cluster.name}]
+  factory = partition_resolver_simple
+  arguments = %{meta.server.list}
+
 [components.pegasus_perf_counter_number_percentile_atomic]
  counter_computation_interval_seconds = 10