diff --git a/deps/oblib/src/lib/io/ob_io_common.cpp b/deps/oblib/src/lib/io/ob_io_common.cpp index 3e19cacf80263c4eb7a3a26b7d89ead17dbdb479..2ab5e4e17a5ee94044f11a71ad3b67b26962b7e2 100644 --- a/deps/oblib/src/lib/io/ob_io_common.cpp +++ b/deps/oblib/src/lib/io/ob_io_common.cpp @@ -150,8 +150,8 @@ void ObIOConfig::set_default_value() cpu_high_water_level_ = DEFAULT_CPU_HIGH_WATER_LEVEL; write_failure_detect_interval_ = DEFAULT_WRITE_FAILURE_DETECT_INTERVAL; read_failure_black_list_interval_ = DEFAULT_READ_FAILURE_IN_BLACK_LIST_INTERVAL; - retry_warn_limit_ = DEFAULT_RETRY_WARN_LIMIT; - retry_error_limit_ = DEFAULT_RETRY_ERROR_LIMIT; + data_storage_warning_tolerance_time_ = DEFAULT_WARNING_TOLERANCE_TIME; + data_storage_error_tolerance_time_ = DEFAULT_ERROR_TOLERANCE_TIME; disk_io_thread_count_ = DEFAULT_DISK_IO_THREAD_COUNT; callback_thread_count_ = DEFAULT_IO_CALLBACK_THREAD_COUNT; large_query_io_percent_ = DEFAULT_LARGE_QUERY_IO_PERCENT; @@ -163,7 +163,8 @@ bool ObIOConfig::is_valid() const return sys_io_low_percent_ >= 0 && sys_io_low_percent_ <= 100 && sys_io_high_percent_ > 0 && sys_io_high_percent_ <= 100 && sys_io_low_percent_ <= sys_io_high_percent_ && user_iort_up_percent_ >= 0 && cpu_high_water_level_ > 0 && write_failure_detect_interval_ > 0 && read_failure_black_list_interval_ > 0 && - retry_warn_limit_ > 0 && retry_error_limit_ > retry_warn_limit_ && disk_io_thread_count_ > 0 && + data_storage_warning_tolerance_time_ > 0 && + data_storage_error_tolerance_time_ >= data_storage_warning_tolerance_time_ && disk_io_thread_count_ > 0 && disk_io_thread_count_ <= ObDisk::MAX_DISK_CHANNEL_CNT * 2 && disk_io_thread_count_ % 2 == 0 && callback_thread_count_ > 0 && large_query_io_percent_ >= 0 && large_query_io_percent_ <= 100 && data_storage_io_timeout_ms_ > 0; @@ -177,8 +178,8 @@ void ObIOConfig::reset() cpu_high_water_level_ = 0; write_failure_detect_interval_ = 0; read_failure_black_list_interval_ = 0; - retry_warn_limit_ = 0; - retry_error_limit_ = 0; + 
data_storage_warning_tolerance_time_ = 0; + data_storage_error_tolerance_time_ = 0; disk_io_thread_count_ = 0; callback_thread_count_ = 0; large_query_io_percent_ = 0; diff --git a/deps/oblib/src/lib/io/ob_io_common.h b/deps/oblib/src/lib/io/ob_io_common.h index 24ae1dbaf0f7d7ca42450c7b3150300b421d1b38..c37a0dcafb34d97c0c9a7d34b7ec74390f53889b 100644 --- a/deps/oblib/src/lib/io/ob_io_common.h +++ b/deps/oblib/src/lib/io/ob_io_common.h @@ -98,8 +98,8 @@ public: static const int64_t DEFAULT_CPU_HIGH_WATER_LEVEL = 4800; static const int64_t DEFAULT_WRITE_FAILURE_DETECT_INTERVAL = 60 * 1000 * 1000; // 1 min static const int64_t DEFAULT_READ_FAILURE_IN_BLACK_LIST_INTERVAL = 300 * 1000 * 1000; // 5 min - static const int32_t DEFAULT_RETRY_WARN_LIMIT = 2; - static const int32_t DEFAULT_RETRY_ERROR_LIMIT = 5; + static const int32_t DEFAULT_WARNING_TOLERANCE_TIME = 30L * 1000L * 1000L; // 30s + static const int32_t DEFAULT_ERROR_TOLERANCE_TIME = 300L * 1000L * 1000L; // 300s static const int64_t DEFAULT_DISK_IO_THREAD_COUNT = 8; static const int64_t DEFAULT_IO_CALLBACK_THREAD_COUNT = 8; static const int64_t DEFAULT_LARGE_QUERY_IO_PERCENT = 0; // 0 means unlimited @@ -113,19 +113,22 @@ public: bool is_valid() const; void reset(); TO_STRING_KV(K_(sys_io_low_percent), K_(sys_io_high_percent), K_(user_iort_up_percent), K_(cpu_high_water_level), - K_(write_failure_detect_interval), K_(read_failure_black_list_interval), K_(retry_warn_limit), - K_(retry_error_limit), K_(disk_io_thread_count), K_(callback_thread_count), K_(large_query_io_percent), - K_(data_storage_io_timeout_ms)); + K_(write_failure_detect_interval), K_(read_failure_black_list_interval), K_(data_storage_warning_tolerance_time), + K_(data_storage_error_tolerance_time), K_(disk_io_thread_count), K_(callback_thread_count), + K_(large_query_io_percent), K_(data_storage_io_timeout_ms)); public: + // schedule related int64_t sys_io_low_percent_; int64_t sys_io_high_percent_; int64_t user_iort_up_percent_; int64_t 
cpu_high_water_level_; + // diagnose related int64_t write_failure_detect_interval_; int64_t read_failure_black_list_interval_; - int64_t retry_warn_limit_; - int64_t retry_error_limit_; + int64_t data_storage_warning_tolerance_time_; + int64_t data_storage_error_tolerance_time_; + // resource related int64_t disk_io_thread_count_; int64_t callback_thread_count_; int64_t large_query_io_percent_; diff --git a/deps/oblib/src/lib/io/ob_io_disk.cpp b/deps/oblib/src/lib/io/ob_io_disk.cpp index c10783cd851b75e30b87aac27b48e9adec66f468..25db7902ce2c632f68c730ccf81a6a59ab147d68 100644 --- a/deps/oblib/src/lib/io/ob_io_disk.cpp +++ b/deps/oblib/src/lib/io/ob_io_disk.cpp @@ -41,28 +41,23 @@ void ObDiskDiagnose::reset() MEMSET(write_failure_event_ts_, 0, sizeof(write_failure_event_ts_)); } -void ObDiskDiagnose::record_read_fail(const int64_t retry_cnt) +void ObDiskDiagnose::record_read_fail(const int64_t diagnose_begin_ts) { const ObIOConfig io_config = OB_IO_MANAGER.get_io_config(); - // in oder to reduce the misjudgement, here is the rules: - // watch the continuous read timeout with the exponential growth of timeout - // 1. for more than 3 times, record as dick warning, - // after that, this server is not allowed to be the paxos leader for a period, - // which is indicated by READ_FAILURE_IN_BLACK_LIST_INTERVAL, usually 300s. - // - // 2. 
for more than 6 times, record as disk error - // if the disk is confirmed normal, the administrator can reset the disk error by - // alter system set disk valid server [=] 'ip:port' - // - if (retry_cnt < io_config.retry_warn_limit_) { - // do nothing - } else if (retry_cnt < io_config.retry_error_limit_) { - last_read_failure_warn_ts_ = ObTimeUtility::current_time(); - } else { + const int64_t current_ts = ObTimeUtility::current_time(); + if (current_ts >= diagnose_begin_ts + io_config.data_storage_warning_tolerance_time_) { + // set disk warning and record warn_ts + // until warn_ts + READ_FAILURE_IN_BLACK_LIST_INTERVAL, this server is not allowed to be partition leader + last_read_failure_warn_ts_ = current_ts; + } + if (current_ts >= diagnose_begin_ts + io_config.data_storage_error_tolerance_time_) { + // set disk error and record error_ts + // if the disk is confirmed normal, the administrator can reset disk status by: + // alter system set disk valid server [=] 'ip:port' if (!is_disk_error_) { - disk_error_begin_ts_ = ObTimeUtility::current_time(); + disk_error_begin_ts_ = current_ts; } - disk_error_last_ts_ = ObTimeUtility::current_time(); + disk_error_last_ts_ = current_ts; is_disk_error_ = true; COMMON_LOG(ERROR, "set_disk_error: attention!!!"); } @@ -119,18 +114,6 @@ int64_t ObDiskDiagnose::get_last_io_failure_ts() const return MAX(disk_error_last_ts_, last_read_failure_warn_ts_); } -int64_t ObDiskDiagnose::get_max_retry_cnt() const -{ - const ObIOConfig io_config = OB_IO_MANAGER.get_io_config(); - return io_config.retry_error_limit_; -} - -int64_t ObDiskDiagnose::get_warn_retry_cnt() const -{ - const ObIOConfig io_config = OB_IO_MANAGER.get_io_config(); - return io_config.retry_warn_limit_; -} - /** * ---------------------------------------------- ObDisk --------------------------------------------- */ @@ -648,33 +631,41 @@ void ObIOFaultDetector::handle(void* t) const ObIOInfo& info = task->info_; ObIOHandle handle; uint64_t timeout_ms = 
task->timeout_ms_; - int64_t retry_cnt = 0; - const int64_t MIN_IO_WAIT_TIME_MS = 30000; // 30s - - for (retry_cnt = 0; retry_cnt < disk_diagnose.get_max_retry_cnt(); ++retry_cnt) { + // reserve 1s to avoid race condition for retry_black_list_interval + const int64_t retry_black_list_interval_ms = + OB_IO_MANAGER.get_io_config().read_failure_black_list_interval_ / 1000L - 1000L; + // retry_io_timeout must be less than black_list_interval + const int64_t MIN_IO_RETRY_TIMEOUT_MS = min(10L * 1000L /* 10s */, retry_black_list_interval_ms); + const int64_t MAX_IO_RETRY_TIMEOUT_MS = min(180L * 1000L /* 180s*/, retry_black_list_interval_ms); + const int64_t diagnose_begin_ts = ObTimeUtility::current_time(); + bool is_retry_succ = false; + while (OB_SUCC(ret) && !is_retry_succ && !disk_diagnose.is_disk_error()) { handle.reset(); - // timeout grows exponentially - if (retry_cnt >= disk_diagnose.get_warn_retry_cnt() - 1) { - timeout_ms = max(timeout_ms * 2, MIN_IO_WAIT_TIME_MS); - } else { - timeout_ms = timeout_ms * 2; - } - - if (retry_cnt == disk_diagnose.get_warn_retry_cnt()) { - disk_diagnose.record_read_fail(retry_cnt); + const ObIOConfig io_conf = OB_IO_MANAGER.get_io_config(); + const int64_t current_retry_ts = ObTimeUtility::current_time(); + const int64_t warn_ts = diagnose_begin_ts + io_conf.data_storage_warning_tolerance_time_; + const int64_t error_ts = diagnose_begin_ts + io_conf.data_storage_error_tolerance_time_; + const int64_t left_timeout_ms = + !disk_diagnose.is_disk_warning() ? 
(warn_ts - current_retry_ts) / 1000 : (error_ts - current_retry_ts) / 1000; + // timeout of retry io increase exponentially + timeout_ms = min(left_timeout_ms, min(MAX_IO_RETRY_TIMEOUT_MS, max(timeout_ms * 2, MIN_IO_RETRY_TIMEOUT_MS))); + if (timeout_ms > 0) { + // do retry io + if (disk->get_admin_status() != DISK_USING) { + ret = OB_STATE_NOT_MATCH; + COMMON_LOG(WARN, "check_admin_status failed, disk is deleting", K(ret), "status", disk->get_admin_status()); + break; + } else if (OB_FAIL(OB_IO_MANAGER.read(info, handle, timeout_ms))) { + COMMON_LOG(WARN, "ObIOManager::read failed", K(ret), K(info), K(timeout_ms)); + ret = OB_SUCCESS; + } else { + is_retry_succ = true; + } } - - if (disk->get_admin_status() != DISK_USING) { - ret = OB_STATE_NOT_MATCH; - COMMON_LOG(WARN, "check_admin_status failed, disk is deleting", K(ret), "status", disk->get_admin_status()); - break; - } else if (OB_FAIL(OB_IO_MANAGER.read(info, handle, timeout_ms))) { - COMMON_LOG(WARN, "ObIOManager::read failed", K(ret), K(info), K(timeout_ms)); - } else { - break; // stop retry if success + if (OB_SUCC(ret) && !is_retry_succ) { + disk_diagnose.record_read_fail(diagnose_begin_ts); } } - disk_diagnose.record_read_fail(retry_cnt); op_free(task); task = NULL; diff --git a/deps/oblib/src/lib/io/ob_io_disk.h b/deps/oblib/src/lib/io/ob_io_disk.h index 4dc4923e9463449f13176a5d2e48a229f778b900..db92b457e7f756b476a06a2489a90b0d981864ee 100644 --- a/deps/oblib/src/lib/io/ob_io_disk.h +++ b/deps/oblib/src/lib/io/ob_io_disk.h @@ -56,13 +56,11 @@ class ObDiskDiagnose { public: ObDiskDiagnose(); virtual ~ObDiskDiagnose(); - void record_read_fail(const int64_t retry_cnt); + void record_read_fail(const int64_t diagnose_begin_ts); void record_write_fail(); bool is_disk_warning() const; bool is_disk_error() const; void reset_disk_health(); - int64_t get_max_retry_cnt() const; - int64_t get_warn_retry_cnt() const; int64_t get_disk_error_begin_ts() const { return disk_error_begin_ts_; diff --git 
a/src/observer/ob_server_reload_config.cpp b/src/observer/ob_server_reload_config.cpp index 2c5b73e46e331514245bed1ff183ee1bda32130c..3811a201ae6283a35edc6a3456a2b170c3da4ec0 100644 --- a/src/observer/ob_server_reload_config.cpp +++ b/src/observer/ob_server_reload_config.cpp @@ -111,6 +111,8 @@ int ObServerReloadConfig::operator()() // In the 2.x version, reuse the sys_bkgd_io_timeout configuration item to indicate the data disk io timeout time // After version 3.1, use the data_storage_io_timeout configuration item. io_config.data_storage_io_timeout_ms_ = GCONF._data_storage_io_timeout / 1000L; + io_config.data_storage_warning_tolerance_time_ = GCONF.data_storage_warning_tolerance_time; + io_config.data_storage_error_tolerance_time_ = GCONF.data_storage_error_tolerance_time; if (OB_FAIL(ObIOManager::get_instance().set_io_config(io_config))) { real_ret = ret; LOG_WARN("reload io manager config fail, ", K(ret)); diff --git a/src/share/config/ob_config_helper.cpp b/src/share/config/ob_config_helper.cpp index 7d5acb697a0e40b6fe8031d5ce023e56dc52d2a5..45346f0b42442f58210d7daa649e4f57fae02744 100644 --- a/src/share/config/ob_config_helper.cpp +++ b/src/share/config/ob_config_helper.cpp @@ -302,6 +302,17 @@ bool ObConfigPartitionBalanceStrategyFuncChecker::check(const ObConfigItem& t) c return is_valid; } +bool ObDataStorageErrorToleranceTimeChecker::check(const ObConfigItem& t) const +{ + bool is_valid = false; + int64_t value = ObConfigTimeParser::get(t.str(), is_valid); + if (is_valid) { + const int64_t warning_value = GCONF.data_storage_warning_tolerance_time; + is_valid = value >= warning_value; + } + return is_valid; +} + int64_t ObConfigIntParser::get(const char* str, bool& valid) { char* p_end = NULL; diff --git a/src/share/config/ob_config_helper.h b/src/share/config/ob_config_helper.h index a47f571fe15055bd1f0d2792b28495f1d6001dc9..29840f7a08cb90c8f77a86d1f5bfeb90cbf77c9e 100644 --- a/src/share/config/ob_config_helper.h +++ b/src/share/config/ob_config_helper.h 
@@ -394,6 +394,18 @@ private: DISALLOW_COPY_AND_ASSIGN(ObConfigPartitionBalanceStrategyFuncChecker); }; +class ObDataStorageErrorToleranceTimeChecker : public ObConfigChecker { +public: + ObDataStorageErrorToleranceTimeChecker() + {} + virtual ~ObDataStorageErrorToleranceTimeChecker() + {} + bool check(const ObConfigItem& t) const; + +private: + DISABLE_COPY_ASSIGN(ObDataStorageErrorToleranceTimeChecker); +}; + // config item container class ObConfigStringKey { public: diff --git a/src/share/parameter/ob_parameter_seed.ipp b/src/share/parameter/ob_parameter_seed.ipp index fb8ed46f96df0eec8311b156c93ef0a19fc56ddd..d91076bc93b9330d8db06ec983d7816cb61796b6 100644 --- a/src/share/parameter/ob_parameter_seed.ipp +++ b/src/share/parameter/ob_parameter_seed.ipp @@ -881,6 +881,15 @@ DEF_TIME(_data_storage_io_timeout, OB_CLUSTER_PARAMETER, "120s", "[5s,600s]", "io timeout for data storage, Range [5s,600s]. " "The default value is 120s", ObParameterAttr(Section::OBSERVER, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE)); +DEF_TIME(data_storage_warning_tolerance_time, OB_CLUSTER_PARAMETER, "30s", "[10s,300s]", + "time to tolerate disk read failure, after that, the disk status will be set warning. Range [10s,300s]. The " + "default value is 30s", + ObParameterAttr(Section::OBSERVER, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE)); +DEF_TIME_WITH_CHECKER(data_storage_error_tolerance_time, OB_CLUSTER_PARAMETER, "300s", + common::ObDataStorageErrorToleranceTimeChecker, "[10s,7200s]", + "time to tolerate disk read failure, after that, the disk status will be set error. Range [10s,7200s]. The default " + "value is 300s", + ObParameterAttr(Section::OBSERVER, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE)); DEF_INT(data_disk_usage_limit_percentage, OB_CLUSTER_PARAMETER, "90", "[50,100]", "the safe use percentage of data disk" "Range: [50,100] in integer",