Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Metz
oceanbase
提交
7eb645fb
O
oceanbase
项目概览
Metz
/
oceanbase
与 Fork 源项目一致
Fork自
oceanbase / oceanbase
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
oceanbase
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
7eb645fb
编写于
7月 15, 2021
作者:
O
obdev
提交者:
wangzelin.wzl
7月 15, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add parameters for detecting disk warning and error
上级
20c0cae3
变更
8
显示空白变更内容
内联
并排
Showing
8 changed file
with
95 addition
and
68 deletion
+95
-68
deps/oblib/src/lib/io/ob_io_common.cpp
deps/oblib/src/lib/io/ob_io_common.cpp
+6
-5
deps/oblib/src/lib/io/ob_io_common.h
deps/oblib/src/lib/io/ob_io_common.h
+10
-7
deps/oblib/src/lib/io/ob_io_disk.cpp
deps/oblib/src/lib/io/ob_io_disk.cpp
+44
-53
deps/oblib/src/lib/io/ob_io_disk.h
deps/oblib/src/lib/io/ob_io_disk.h
+1
-3
src/observer/ob_server_reload_config.cpp
src/observer/ob_server_reload_config.cpp
+2
-0
src/share/config/ob_config_helper.cpp
src/share/config/ob_config_helper.cpp
+11
-0
src/share/config/ob_config_helper.h
src/share/config/ob_config_helper.h
+12
-0
src/share/parameter/ob_parameter_seed.ipp
src/share/parameter/ob_parameter_seed.ipp
+9
-0
未找到文件。
deps/oblib/src/lib/io/ob_io_common.cpp
浏览文件 @
7eb645fb
...
...
@@ -150,8 +150,8 @@ void ObIOConfig::set_default_value()
cpu_high_water_level_
=
DEFAULT_CPU_HIGH_WATER_LEVEL
;
write_failure_detect_interval_
=
DEFAULT_WRITE_FAILURE_DETECT_INTERVAL
;
read_failure_black_list_interval_
=
DEFAULT_READ_FAILURE_IN_BLACK_LIST_INTERVAL
;
retry_warn_limit_
=
DEFAULT_RETRY_WARN_LIMIT
;
retry_error_limit_
=
DEFAULT_RETRY_ERROR_LIMIT
;
data_storage_warning_tolerance_time_
=
DEFAULT_WARNING_TOLERANCE_TIME
;
data_storage_error_tolerance_time_
=
DEFAULT_ERROR_TOLERANCE_TIME
;
disk_io_thread_count_
=
DEFAULT_DISK_IO_THREAD_COUNT
;
callback_thread_count_
=
DEFAULT_IO_CALLBACK_THREAD_COUNT
;
large_query_io_percent_
=
DEFAULT_LARGE_QUERY_IO_PERCENT
;
...
...
@@ -163,7 +163,8 @@ bool ObIOConfig::is_valid() const
return
sys_io_low_percent_
>=
0
&&
sys_io_low_percent_
<=
100
&&
sys_io_high_percent_
>
0
&&
sys_io_high_percent_
<=
100
&&
sys_io_low_percent_
<=
sys_io_high_percent_
&&
user_iort_up_percent_
>=
0
&&
cpu_high_water_level_
>
0
&&
write_failure_detect_interval_
>
0
&&
read_failure_black_list_interval_
>
0
&&
retry_warn_limit_
>
0
&&
retry_error_limit_
>
retry_warn_limit_
&&
disk_io_thread_count_
>
0
&&
data_storage_warning_tolerance_time_
>
0
&&
data_storage_error_tolerance_time_
>=
data_storage_warning_tolerance_time_
&&
disk_io_thread_count_
>
0
&&
disk_io_thread_count_
<=
ObDisk
::
MAX_DISK_CHANNEL_CNT
*
2
&&
disk_io_thread_count_
%
2
==
0
&&
callback_thread_count_
>
0
&&
large_query_io_percent_
>=
0
&&
large_query_io_percent_
<=
100
&&
data_storage_io_timeout_ms_
>
0
;
...
...
@@ -177,8 +178,8 @@ void ObIOConfig::reset()
cpu_high_water_level_
=
0
;
write_failure_detect_interval_
=
0
;
read_failure_black_list_interval_
=
0
;
retry_warn_limit
_
=
0
;
retry_error_limit
_
=
0
;
data_storage_warning_tolerance_time
_
=
0
;
data_storage_error_tolerance_time
_
=
0
;
disk_io_thread_count_
=
0
;
callback_thread_count_
=
0
;
large_query_io_percent_
=
0
;
...
...
deps/oblib/src/lib/io/ob_io_common.h
浏览文件 @
7eb645fb
...
...
@@ -98,8 +98,8 @@ public:
static
const
int64_t
DEFAULT_CPU_HIGH_WATER_LEVEL
=
4800
;
static
const
int64_t
DEFAULT_WRITE_FAILURE_DETECT_INTERVAL
=
60
*
1000
*
1000
;
// 1 min
static
const
int64_t
DEFAULT_READ_FAILURE_IN_BLACK_LIST_INTERVAL
=
300
*
1000
*
1000
;
// 5 min
static
const
int32_t
DEFAULT_
RETRY_WARN_LIMIT
=
2
;
static
const
int32_t
DEFAULT_
RETRY_ERROR_LIMIT
=
5
;
// Default tolerance before repeated disk read failures mark the disk as
// "warning": 30 * 1000 * 1000 microseconds = 30s.
// Declared as int64_t (not int32_t) so it matches the int64_t
// data_storage_warning_tolerance_time_ member it initialises and the other
// time-valued defaults, and so larger defaults cannot silently truncate.
static const int64_t DEFAULT_WARNING_TOLERANCE_TIME = 30L * 1000L * 1000L;  // 30s
// Default tolerance before repeated disk read failures mark the disk as
// "error": 300 * 1000 * 1000 microseconds = 300s.
// Declared as int64_t (not int32_t) so it matches the int64_t
// data_storage_error_tolerance_time_ member it initialises; a microsecond
// value for durations above ~2147s would not fit in 32 bits.
static const int64_t DEFAULT_ERROR_TOLERANCE_TIME = 300L * 1000L * 1000L;  // 300s
static
const
int64_t
DEFAULT_DISK_IO_THREAD_COUNT
=
8
;
static
const
int64_t
DEFAULT_IO_CALLBACK_THREAD_COUNT
=
8
;
static
const
int64_t
DEFAULT_LARGE_QUERY_IO_PERCENT
=
0
;
// 0 means unlimited
...
...
@@ -113,19 +113,22 @@ public:
bool
is_valid
()
const
;
void
reset
();
TO_STRING_KV
(
K_
(
sys_io_low_percent
),
K_
(
sys_io_high_percent
),
K_
(
user_iort_up_percent
),
K_
(
cpu_high_water_level
),
K_
(
write_failure_detect_interval
),
K_
(
read_failure_black_list_interval
),
K_
(
retry_warn_limit
),
K_
(
retry_error_limit
),
K_
(
disk_io_thread_count
),
K_
(
callback_thread_count
),
K_
(
large_query_io_perce
nt
),
K_
(
data_storage_io_timeout_ms
));
K_
(
write_failure_detect_interval
),
K_
(
read_failure_black_list_interval
),
K_
(
data_storage_warning_tolerance_time
),
K_
(
data_storage_error_tolerance_time
),
K_
(
disk_io_thread_count
),
K_
(
callback_thread_cou
nt
),
K_
(
large_query_io_percent
),
K_
(
data_storage_io_timeout_ms
));
public:
// schedule related
int64_t
sys_io_low_percent_
;
int64_t
sys_io_high_percent_
;
int64_t
user_iort_up_percent_
;
int64_t
cpu_high_water_level_
;
// diagnose related
int64_t
write_failure_detect_interval_
;
int64_t
read_failure_black_list_interval_
;
int64_t
retry_warn_limit_
;
int64_t
retry_error_limit_
;
int64_t
data_storage_warning_tolerance_time_
;
int64_t
data_storage_error_tolerance_time_
;
// resource related
int64_t
disk_io_thread_count_
;
int64_t
callback_thread_count_
;
int64_t
large_query_io_percent_
;
...
...
deps/oblib/src/lib/io/ob_io_disk.cpp
浏览文件 @
7eb645fb
...
...
@@ -41,28 +41,23 @@ void ObDiskDiagnose::reset()
MEMSET
(
write_failure_event_ts_
,
0
,
sizeof
(
write_failure_event_ts_
));
}
void
ObDiskDiagnose
::
record_read_fail
(
const
int64_t
retry_cnt
)
void
ObDiskDiagnose
::
record_read_fail
(
const
int64_t
diagnose_begin_ts
)
{
const
ObIOConfig
io_config
=
OB_IO_MANAGER
.
get_io_config
();
// in order to reduce misjudgement, here are the rules:
// watch the continuous read timeout with the exponential growth of timeout
// 1. for more than 3 times, record as disk warning,
// after that, this server is not allowed to be the paxos leader for a period,
// which is indicated by READ_FAILURE_IN_BLACK_LIST_INTERVAL, usually 300s.
//
// 2. for more than 6 times, record as disk error
// if the disk is confirmed normal, the administrator can reset the disk error by
const
int64_t
current_ts
=
ObTimeUtility
::
current_time
();
if
(
current_ts
>=
diagnose_begin_ts
+
io_config
.
data_storage_warning_tolerance_time_
)
{
// set disk warning and record warn_ts
// until warn_ts + READ_FAILURE_IN_BLACK_LIST_INTERVAL, this server is not allowed to be partition leader
last_read_failure_warn_ts_
=
current_ts
;
}
if
(
current_ts
>=
diagnose_begin_ts
+
io_config
.
data_storage_error_tolerance_time_
)
{
// set disk error and record error_ts
// if the disk is confirmed normal, the administrator can reset disk status by:
// alter system set disk valid server [=] 'ip:port'
//
if
(
retry_cnt
<
io_config
.
retry_warn_limit_
)
{
// do nothing
}
else
if
(
retry_cnt
<
io_config
.
retry_error_limit_
)
{
last_read_failure_warn_ts_
=
ObTimeUtility
::
current_time
();
}
else
{
if
(
!
is_disk_error_
)
{
disk_error_begin_ts_
=
ObTimeUtility
::
current_time
()
;
disk_error_begin_ts_
=
current_ts
;
}
disk_error_last_ts_
=
ObTimeUtility
::
current_time
()
;
disk_error_last_ts_
=
current_ts
;
is_disk_error_
=
true
;
COMMON_LOG
(
ERROR
,
"set_disk_error: attention!!!"
);
}
...
...
@@ -119,18 +114,6 @@ int64_t ObDiskDiagnose::get_last_io_failure_ts() const
return
MAX
(
disk_error_last_ts_
,
last_read_failure_warn_ts_
);
}
int64_t
ObDiskDiagnose
::
get_max_retry_cnt
()
const
{
const
ObIOConfig
io_config
=
OB_IO_MANAGER
.
get_io_config
();
return
io_config
.
retry_error_limit_
;
}
int64_t
ObDiskDiagnose
::
get_warn_retry_cnt
()
const
{
const
ObIOConfig
io_config
=
OB_IO_MANAGER
.
get_io_config
();
return
io_config
.
retry_warn_limit_
;
}
/**
* ---------------------------------------------- ObDisk ---------------------------------------------
*/
...
...
@@ -648,33 +631,41 @@ void ObIOFaultDetector::handle(void* t)
const
ObIOInfo
&
info
=
task
->
info_
;
ObIOHandle
handle
;
uint64_t
timeout_ms
=
task
->
timeout_ms_
;
int64_t
retry_cnt
=
0
;
const
int64_t
MIN_IO_WAIT_TIME_MS
=
30000
;
// 30s
for
(
retry_cnt
=
0
;
retry_cnt
<
disk_diagnose
.
get_max_retry_cnt
();
++
retry_cnt
)
{
// remain 1s to avoid race condition for retry_black_list_interval
const
int64_t
retry_black_list_interval_ms
=
OB_IO_MANAGER
.
get_io_config
().
read_failure_black_list_interval_
/
1000L
-
1000L
;
// retry io timeout must be less than black_list_interval
const
int64_t
MIN_IO_RETRY_TIMEOUT_MS
=
min
(
10L
*
1000L
/* 10s */
,
retry_black_list_interval_ms
);
const
int64_t
MAX_IO_RETRY_TIMEOUT_MS
=
min
(
180L
*
1000L
/* 180s*/
,
retry_black_list_interval_ms
);
const
int64_t
diagnose_begin_ts
=
ObTimeUtility
::
current_time
();
bool
is_retry_succ
=
false
;
while
(
OB_SUCC
(
ret
)
&&
!
is_retry_succ
&&
!
disk_diagnose
.
is_disk_error
())
{
handle
.
reset
();
// timeout grows exponentially
if
(
retry_cnt
>=
disk_diagnose
.
get_warn_retry_cnt
()
-
1
)
{
timeout_ms
=
max
(
timeout_ms
*
2
,
MIN_IO_WAIT_TIME_MS
);
}
else
{
timeout_ms
=
timeout_ms
*
2
;
}
if
(
retry_cnt
==
disk_diagnose
.
get_warn_retry_cnt
())
{
disk_diagnose
.
record_read_fail
(
retry_cnt
);
}
const
ObIOConfig
io_conf
=
OB_IO_MANAGER
.
get_io_config
();
const
int64_t
current_retry_ts
=
ObTimeUtility
::
current_time
();
const
int64_t
warn_ts
=
diagnose_begin_ts
+
io_conf
.
data_storage_warning_tolerance_time_
;
const
int64_t
error_ts
=
diagnose_begin_ts
+
io_conf
.
data_storage_error_tolerance_time_
;
const
int64_t
left_timeout_ms
=
!
disk_diagnose
.
is_disk_warning
()
?
(
warn_ts
-
current_retry_ts
)
/
1000
:
(
error_ts
-
current_retry_ts
)
/
1000
;
// timeout of retry io increase exponentially
timeout_ms
=
min
(
left_timeout_ms
,
min
(
MAX_IO_RETRY_TIMEOUT_MS
,
max
(
timeout_ms
*
2
,
MIN_IO_RETRY_TIMEOUT_MS
)));
if
(
timeout_ms
>
0
)
{
// do retry io
if
(
disk
->
get_admin_status
()
!=
DISK_USING
)
{
ret
=
OB_STATE_NOT_MATCH
;
COMMON_LOG
(
WARN
,
"check_admin_status failed, disk is deleting"
,
K
(
ret
),
"status"
,
disk
->
get_admin_status
());
break
;
}
else
if
(
OB_FAIL
(
OB_IO_MANAGER
.
read
(
info
,
handle
,
timeout_ms
)))
{
COMMON_LOG
(
WARN
,
"ObIOManager::read failed"
,
K
(
ret
),
K
(
info
),
K
(
timeout_ms
));
ret
=
OB_SUCCESS
;
}
else
{
break
;
// stop retry if success
is_retry_succ
=
true
;
}
}
if
(
OB_SUCC
(
ret
)
&&
!
is_retry_succ
)
{
disk_diagnose
.
record_read_fail
(
diagnose_begin_ts
);
}
}
disk_diagnose
.
record_read_fail
(
retry_cnt
);
op_free
(
task
);
task
=
NULL
;
...
...
deps/oblib/src/lib/io/ob_io_disk.h
浏览文件 @
7eb645fb
...
...
@@ -56,13 +56,11 @@ class ObDiskDiagnose {
public:
ObDiskDiagnose
();
virtual
~
ObDiskDiagnose
();
void
record_read_fail
(
const
int64_t
retry_cnt
);
void
record_read_fail
(
const
int64_t
diagnose_begin_ts
);
void
record_write_fail
();
bool
is_disk_warning
()
const
;
bool
is_disk_error
()
const
;
void
reset_disk_health
();
int64_t
get_max_retry_cnt
()
const
;
int64_t
get_warn_retry_cnt
()
const
;
int64_t
get_disk_error_begin_ts
()
const
{
return
disk_error_begin_ts_
;
...
...
src/observer/ob_server_reload_config.cpp
浏览文件 @
7eb645fb
...
...
@@ -111,6 +111,8 @@ int ObServerReloadConfig::operator()()
// In the 2.x version, reuse the sys_bkgd_io_timeout configuration item to indicate the data disk io timeout time
// After version 3.1, use the data_storage_io_timeout configuration item.
io_config
.
data_storage_io_timeout_ms_
=
GCONF
.
_data_storage_io_timeout
/
1000L
;
io_config
.
data_storage_warning_tolerance_time_
=
GCONF
.
data_storage_warning_tolerance_time
;
io_config
.
data_storage_error_tolerance_time_
=
GCONF
.
data_storage_error_tolerance_time
;
if
(
OB_FAIL
(
ObIOManager
::
get_instance
().
set_io_config
(
io_config
)))
{
real_ret
=
ret
;
LOG_WARN
(
"reload io manager config fail, "
,
K
(
ret
));
...
...
src/share/config/ob_config_helper.cpp
浏览文件 @
7eb645fb
...
...
@@ -302,6 +302,17 @@ bool ObConfigPartitionBalanceStrategyFuncChecker::check(const ObConfigItem& t) c
return
is_valid
;
}
// Validates a candidate value for data_storage_error_tolerance_time.
// The value held by `t` must parse as a time, and must not be smaller than
// the currently configured data_storage_warning_tolerance_time.
// Returns true when both conditions hold, false otherwise.
bool ObDataStorageErrorToleranceTimeChecker::check(const ObConfigItem& t) const
{
  bool parse_ok = false;
  const int64_t error_tolerance = ObConfigTimeParser::get(t.str(), parse_ok);
  bool accepted = false;
  if (parse_ok) {
    // Only consult the warning threshold once the candidate parsed cleanly.
    const int64_t warning_tolerance = GCONF.data_storage_warning_tolerance_time;
    accepted = (error_tolerance >= warning_tolerance);
  }
  return accepted;
}
int64_t
ObConfigIntParser
::
get
(
const
char
*
str
,
bool
&
valid
)
{
char
*
p_end
=
NULL
;
...
...
src/share/config/ob_config_helper.h
浏览文件 @
7eb645fb
...
...
@@ -394,6 +394,18 @@ private:
DISALLOW_COPY_AND_ASSIGN
(
ObConfigPartitionBalanceStrategyFuncChecker
);
};
// Config checker used by the data_storage_error_tolerance_time parameter.
// Its check() accepts a value only when it parses as a time and is not
// smaller than the current data_storage_warning_tolerance_time.
class
ObDataStorageErrorToleranceTimeChecker
:
public
ObConfigChecker
{
public:
ObDataStorageErrorToleranceTimeChecker
()
{}
virtual
~
ObDataStorageErrorToleranceTimeChecker
()
{}
// Returns true when the candidate value carried by `t` is acceptable.
bool
check
(
const
ObConfigItem
&
t
)
const
;
private:
// NOTE(review): presumably disables copy construction/assignment; the macro
// is defined outside this view -- confirm.
DISABLE_COPY_ASSIGN
(
ObDataStorageErrorToleranceTimeChecker
);
};
// config item container
class
ObConfigStringKey
{
public:
...
...
src/share/parameter/ob_parameter_seed.ipp
浏览文件 @
7eb645fb
...
...
@@ -881,6 +881,15 @@ DEF_TIME(_data_storage_io_timeout, OB_CLUSTER_PARAMETER, "120s", "[5s,600s]",
"io timeout for data storage, Range [5s,600s]. "
"The default value is 120s",
ObParameterAttr(Section::OBSERVER, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE));
// Cluster parameter: how long disk read failures are tolerated before the disk
// status is set to warning. Default "30s", accepted range "[10s,300s]".
// ObServerReloadConfig copies it into
// ObIOConfig::data_storage_warning_tolerance_time_ on reload.
DEF_TIME(data_storage_warning_tolerance_time, OB_CLUSTER_PARAMETER, "30s", "[10s,300s]",
"time to tolerate disk read failure, after that, the disk status will be set warning. Range [10s,300s]. The "
"default value is 30s",
ObParameterAttr(Section::OBSERVER, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE));
// Cluster parameter: how long disk read failures are tolerated before the disk
// status is set to error. Default "300s", accepted range "[10s,7200s]".
// ObDataStorageErrorToleranceTimeChecker additionally rejects values smaller
// than the current data_storage_warning_tolerance_time.
// NOTE(review): the warning parameter has no matching checker, so raising it
// above this value is not rejected at set time, while ObIOConfig::is_valid()
// requires error >= warning -- confirm how a reload behaves in that case.
DEF_TIME_WITH_CHECKER(data_storage_error_tolerance_time, OB_CLUSTER_PARAMETER, "300s",
common::ObDataStorageErrorToleranceTimeChecker, "[10s,7200s]",
"time to tolerate disk read failure, after that, the disk status will be set error. Range [10s,7200s]. The default "
"value is 300s",
ObParameterAttr(Section::OBSERVER, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE));
DEF_INT(data_disk_usage_limit_percentage, OB_CLUSTER_PARAMETER, "90", "[50,100]",
"the safe use percentage of data disk"
"Range: [50,100] in integer",
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录