Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
milvus
milvus
提交
518b6310
M
milvus
项目概览
milvus
/
milvus
9 个月 前同步成功
通知
260
Star
22476
Fork
2472
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
milvus
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
未验证
提交
518b6310
编写于
8月 03, 2023
作者:
W
wei liu
提交者:
GitHub
8月 03, 2023
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
refine retry times on replica (#26043)
Signed-off-by:
N
Wei Liu
<
wei.liu@zilliz.com
>
上级
6663e753
变更
8
隐藏空白更改
内联
并排
Showing
8 changed file
with
65 addition
and
27 deletion
+65
-27
internal/proxy/lb_policy.go
internal/proxy/lb_policy.go
+2
-1
internal/proxy/lb_policy_test.go
internal/proxy/lb_policy_test.go
+2
-0
internal/proxy/look_aside_balancer.go
internal/proxy/look_aside_balancer.go
+34
-10
internal/proxy/look_aside_balancer_test.go
internal/proxy/look_aside_balancer_test.go
+2
-2
internal/querycoordv2/observers/leader_observer.go
internal/querycoordv2/observers/leader_observer.go
+4
-11
internal/querycoordv2/observers/leader_observer_test.go
internal/querycoordv2/observers/leader_observer_test.go
+2
-2
pkg/util/paramtable/component_param.go
pkg/util/paramtable/component_param.go
+18
-1
pkg/util/paramtable/component_param_test.go
pkg/util/paramtable/component_param_test.go
+1
-0
未找到文件。
internal/proxy/lb_policy.go
浏览文件 @
518b6310
...
...
@@ -199,6 +199,7 @@ func (lb *LBPolicyImpl) Execute(ctx context.Context, workload CollectionWorkLoad
for
channel
,
nodes
:=
range
dml2leaders
{
channel
:=
channel
nodes
:=
lo
.
Map
(
nodes
,
func
(
node
nodeInfo
,
_
int
)
int64
{
return
node
.
nodeID
})
retryOnReplica
:=
Params
.
ProxyCfg
.
RetryTimesOnReplica
.
GetAsInt
()
wg
.
Go
(
func
()
error
{
err
:=
lb
.
ExecuteWithRetry
(
ctx
,
ChannelWorkload
{
db
:
workload
.
db
,
...
...
@@ -208,7 +209,7 @@ func (lb *LBPolicyImpl) Execute(ctx context.Context, workload CollectionWorkLoad
shardLeaders
:
nodes
,
nq
:
workload
.
nq
,
exec
:
workload
.
exec
,
retryTimes
:
uint
(
len
(
nodes
)),
retryTimes
:
uint
(
len
(
nodes
)
*
retryOnReplica
),
})
return
err
})
...
...
internal/proxy/lb_policy_test.go
浏览文件 @
518b6310
...
...
@@ -366,6 +366,7 @@ func (s *LBPolicySuite) TestExecute() {
collectionID
:
s
.
collectionID
,
nq
:
1
,
exec
:
func
(
ctx
context
.
Context
,
ui
UniqueID
,
qn
types
.
QueryNode
,
s
...
string
)
error
{
// succeed in first execute
if
counter
.
Add
(
1
)
==
1
{
return
nil
}
...
...
@@ -374,6 +375,7 @@ func (s *LBPolicySuite) TestExecute() {
},
})
s
.
Error
(
err
)
s
.
Equal
(
int64
(
11
),
counter
.
Load
())
// test get shard leader failed
s
.
qc
.
ExpectedCalls
=
nil
...
...
internal/proxy/look_aside_balancer.go
浏览文件 @
518b6310
...
...
@@ -50,6 +50,9 @@ type LookAsideBalancer struct {
unreachableQueryNodes
*
typeutil
.
ConcurrentSet
[
int64
]
// query node id -> number of consecutive heartbeat failures
failedHeartBeatCounter
*
typeutil
.
ConcurrentMap
[
int64
,
*
atomic
.
Int64
]
closeCh
chan
struct
{}
closeOnce
sync
.
Once
wg
sync
.
WaitGroup
...
...
@@ -57,12 +60,13 @@ type LookAsideBalancer struct {
func
NewLookAsideBalancer
(
clientMgr
shardClientMgr
)
*
LookAsideBalancer
{
balancer
:=
&
LookAsideBalancer
{
clientMgr
:
clientMgr
,
metricsMap
:
typeutil
.
NewConcurrentMap
[
int64
,
*
internalpb
.
CostAggregation
](),
metricsUpdateTs
:
typeutil
.
NewConcurrentMap
[
int64
,
int64
](),
executingTaskTotalNQ
:
typeutil
.
NewConcurrentMap
[
int64
,
*
atomic
.
Int64
](),
unreachableQueryNodes
:
typeutil
.
NewConcurrentSet
[
int64
](),
closeCh
:
make
(
chan
struct
{}),
clientMgr
:
clientMgr
,
metricsMap
:
typeutil
.
NewConcurrentMap
[
int64
,
*
internalpb
.
CostAggregation
](),
metricsUpdateTs
:
typeutil
.
NewConcurrentMap
[
int64
,
int64
](),
executingTaskTotalNQ
:
typeutil
.
NewConcurrentMap
[
int64
,
*
atomic
.
Int64
](),
unreachableQueryNodes
:
typeutil
.
NewConcurrentSet
[
int64
](),
failedHeartBeatCounter
:
typeutil
.
NewConcurrentMap
[
int64
,
*
atomic
.
Int64
](),
closeCh
:
make
(
chan
struct
{}),
}
return
balancer
...
...
@@ -198,13 +202,28 @@ func (b *LookAsideBalancer) checkQueryNodeHealthLoop(ctx context.Context) {
ctx
,
cancel
:=
context
.
WithTimeout
(
context
.
Background
(),
checkInterval
)
defer
cancel
()
setUnreachable
:=
func
()
bool
{
setUnreachable
:=
func
(
err
error
)
bool
{
failures
,
ok
:=
b
.
failedHeartBeatCounter
.
Get
(
node
)
if
!
ok
{
failures
=
atomic
.
NewInt64
(
0
)
}
failures
.
Inc
()
b
.
failedHeartBeatCounter
.
Insert
(
node
,
failures
)
if
failures
.
Load
()
<
Params
.
ProxyCfg
.
RetryTimesOnHealthCheck
.
GetAsInt64
()
{
log
.
Warn
(
"get component status failed"
,
zap
.
Int64
(
"node"
,
node
),
zap
.
Int64
(
"times"
,
failures
.
Load
()),
zap
.
Error
(
err
))
return
false
}
return
b
.
unreachableQueryNodes
.
Insert
(
node
)
}
qn
,
err
:=
b
.
clientMgr
.
GetClient
(
ctx
,
node
)
if
err
!=
nil
{
if
setUnreachable
()
{
if
setUnreachable
(
err
)
{
log
.
Warn
(
"get client failed, set node unreachable"
,
zap
.
Int64
(
"node"
,
node
),
zap
.
Error
(
err
))
}
return
struct
{}{},
nil
...
...
@@ -212,14 +231,14 @@ func (b *LookAsideBalancer) checkQueryNodeHealthLoop(ctx context.Context) {
resp
,
err
:=
qn
.
GetComponentStates
(
ctx
)
if
err
!=
nil
{
if
setUnreachable
()
{
if
setUnreachable
(
err
)
{
log
.
Warn
(
"get component status failed,set node unreachable"
,
zap
.
Int64
(
"node"
,
node
),
zap
.
Error
(
err
))
}
return
struct
{}{},
nil
}
if
resp
.
GetState
()
.
GetStateCode
()
!=
commonpb
.
StateCode_Healthy
{
if
setUnreachable
()
{
if
setUnreachable
(
merr
.
ErrServiceUnavailable
)
{
log
.
Warn
(
"component status unhealthy,set node unreachable"
,
zap
.
Int64
(
"node"
,
node
),
zap
.
Error
(
err
))
}
return
struct
{}{},
nil
...
...
@@ -228,6 +247,11 @@ func (b *LookAsideBalancer) checkQueryNodeHealthLoop(ctx context.Context) {
// check health successfully, update check health ts
b
.
metricsUpdateTs
.
Insert
(
node
,
time
.
Now
()
.
Local
()
.
UnixMilli
())
if
b
.
unreachableQueryNodes
.
TryRemove
(
node
)
{
// once heartbeat succeed, clear filed counter
failures
,
ok
:=
b
.
failedHeartBeatCounter
.
Get
(
node
)
if
ok
{
failures
.
Store
(
0
)
}
log
.
Info
(
"component recuperated, set node reachable"
,
zap
.
Int64
(
"node"
,
node
),
zap
.
Error
(
err
))
}
...
...
internal/proxy/look_aside_balancer_test.go
浏览文件 @
518b6310
...
...
@@ -302,14 +302,14 @@ func (suite *LookAsideBalancerSuite) TestCheckHealthLoop() {
suite
.
balancer
.
unreachableQueryNodes
.
Insert
(
2
)
suite
.
Eventually
(
func
()
bool
{
return
suite
.
balancer
.
unreachableQueryNodes
.
Contain
(
1
)
},
3
*
time
.
Second
,
100
*
time
.
Millisecond
)
},
5
*
time
.
Second
,
100
*
time
.
Millisecond
)
targetNode
,
err
:=
suite
.
balancer
.
SelectNode
(
context
.
Background
(),
[]
int64
{
1
},
1
)
suite
.
ErrorIs
(
err
,
merr
.
ErrServiceUnavailable
)
suite
.
Equal
(
int64
(
-
1
),
targetNode
)
suite
.
Eventually
(
func
()
bool
{
return
!
suite
.
balancer
.
unreachableQueryNodes
.
Contain
(
2
)
},
3
*
time
.
Second
,
100
*
time
.
Millisecond
)
},
5
*
time
.
Second
,
100
*
time
.
Millisecond
)
}
func
(
suite
*
LookAsideBalancerSuite
)
TestNodeRecover
()
{
...
...
internal/querycoordv2/observers/leader_observer.go
浏览文件 @
518b6310
...
...
@@ -111,7 +111,7 @@ func (o *LeaderObserver) observeCollection(ctx context.Context, collection int64
actions
:=
o
.
findNeedLoadedSegments
(
leaderView
,
dists
)
actions
=
append
(
actions
,
o
.
findNeedRemovedSegments
(
leaderView
,
dists
)
...
)
updateVersionAction
:=
o
.
checkNeedUpdateTargetVersion
(
leaderView
)
updateVersionAction
:=
o
.
checkNeedUpdateTargetVersion
(
ctx
,
leaderView
)
if
updateVersionAction
!=
nil
{
actions
=
append
(
actions
,
updateVersionAction
)
}
...
...
@@ -133,14 +133,15 @@ func (ob *LeaderObserver) CheckTargetVersion(collectionID int64) bool {
return
<-
notifier
}
func
(
o
*
LeaderObserver
)
checkNeedUpdateTargetVersion
(
leaderView
*
meta
.
LeaderView
)
*
querypb
.
SyncAction
{
func
(
o
*
LeaderObserver
)
checkNeedUpdateTargetVersion
(
ctx
context
.
Context
,
leaderView
*
meta
.
LeaderView
)
*
querypb
.
SyncAction
{
log
.
Ctx
(
ctx
)
.
WithRateGroup
(
"qcv2.LeaderObserver"
,
1
,
60
)
targetVersion
:=
o
.
target
.
GetCollectionTargetVersion
(
leaderView
.
CollectionID
,
meta
.
CurrentTarget
)
if
targetVersion
<=
leaderView
.
TargetVersion
{
return
nil
}
log
.
Info
(
"Update readable segment version"
,
log
.
RatedInfo
(
10
,
"Update readable segment version"
,
zap
.
Int64
(
"collectionID"
,
leaderView
.
CollectionID
),
zap
.
String
(
"channelName"
,
leaderView
.
Channel
),
zap
.
Int64
(
"nodeID"
,
leaderView
.
ID
),
...
...
@@ -152,14 +153,6 @@ func (o *LeaderObserver) checkNeedUpdateTargetVersion(leaderView *meta.LeaderVie
growingSegments
:=
o
.
target
.
GetStreamingSegmentsByChannel
(
leaderView
.
CollectionID
,
leaderView
.
Channel
,
meta
.
CurrentTarget
)
droppedSegments
:=
o
.
target
.
GetDroppedSegmentsByChannel
(
leaderView
.
CollectionID
,
leaderView
.
Channel
,
meta
.
CurrentTarget
)
log
.
Info
(
"Update readable segment version"
,
zap
.
Int64
(
"collectionID"
,
leaderView
.
CollectionID
),
zap
.
String
(
"channelName"
,
leaderView
.
Channel
),
zap
.
Int64
(
"nodeID"
,
leaderView
.
ID
),
zap
.
Int64
(
"oldVersion"
,
leaderView
.
TargetVersion
),
zap
.
Int64
(
"newVersion"
,
targetVersion
),
)
return
&
querypb
.
SyncAction
{
Type
:
querypb
.
SyncType_UpdateVersion
,
GrowingInTarget
:
growingSegments
.
Collect
(),
...
...
internal/querycoordv2/observers/leader_observer_test.go
浏览文件 @
518b6310
...
...
@@ -558,11 +558,11 @@ func (suite *LeaderObserverTestSuite) TestSyncTargetVersion() {
view
:=
utils
.
CreateTestLeaderView
(
1
,
collectionID
,
"channel-1"
,
nil
,
nil
)
view
.
TargetVersion
=
TargetVersion
action
:=
observer
.
checkNeedUpdateTargetVersion
(
view
)
action
:=
observer
.
checkNeedUpdateTargetVersion
(
context
.
Background
(),
view
)
suite
.
Nil
(
action
)
view
.
TargetVersion
=
TargetVersion
-
1
action
=
observer
.
checkNeedUpdateTargetVersion
(
view
)
action
=
observer
.
checkNeedUpdateTargetVersion
(
context
.
Background
(),
view
)
suite
.
NotNil
(
action
)
suite
.
Equal
(
querypb
.
SyncType_UpdateVersion
,
action
.
Type
)
suite
.
Len
(
action
.
GrowingInTarget
,
2
)
...
...
pkg/util/paramtable/component_param.go
浏览文件 @
518b6310
...
...
@@ -968,6 +968,8 @@ type proxyConfig struct {
ReplicaSelectionPolicy
ParamItem
`refreshable:"false"`
CheckQueryNodeHealthInterval
ParamItem
`refreshable:"false"`
CostMetricsExpireTime
ParamItem
`refreshable:"true"`
RetryTimesOnReplica
ParamItem
`refreshable:"true"`
RetryTimesOnHealthCheck
ParamItem
`refreshable:"true"`
}
func
(
p
*
proxyConfig
)
init
(
base
*
BaseTable
)
{
...
...
@@ -984,7 +986,7 @@ func (p *proxyConfig) init(base *BaseTable) {
p
.
HealthCheckTimetout
=
ParamItem
{
Key
:
"proxy.healthCheckTimetout"
,
Version
:
"2.3.0"
,
DefaultValue
:
"
5
00"
,
DefaultValue
:
"
10
00"
,
PanicIfEmpty
:
true
,
Doc
:
"ms, the interval that to do component healthy check"
,
Export
:
true
,
...
...
@@ -1212,6 +1214,21 @@ please adjust in embedded Milvus: false`,
}
p
.
CostMetricsExpireTime
.
Init
(
base
.
mgr
)
p
.
RetryTimesOnReplica
=
ParamItem
{
Key
:
"proxy.retryTimesOnReplica"
,
Version
:
"2.3.0"
,
DefaultValue
:
"2"
,
Doc
:
"retry times on each replica"
,
}
p
.
RetryTimesOnReplica
.
Init
(
base
.
mgr
)
p
.
RetryTimesOnHealthCheck
=
ParamItem
{
Key
:
"proxy.retryTimesOnHealthCheck"
,
Version
:
"2.3.0"
,
DefaultValue
:
"3"
,
Doc
:
"set query node unavailable on proxy when heartbeat failures reach this limit"
,
}
p
.
RetryTimesOnHealthCheck
.
Init
(
base
.
mgr
)
}
// /////////////////////////////////////////////////////////////////////////////
...
...
pkg/util/paramtable/component_param_test.go
浏览文件 @
518b6310
...
...
@@ -185,6 +185,7 @@ func TestComponentParam(t *testing.T) {
assert
.
Equal
(
t
,
Params
.
ReplicaSelectionPolicy
.
GetValue
(),
"look_aside"
)
assert
.
Equal
(
t
,
Params
.
CheckQueryNodeHealthInterval
.
GetAsInt
(),
1000
)
assert
.
Equal
(
t
,
Params
.
CostMetricsExpireTime
.
GetAsInt
(),
1000
)
assert
.
Equal
(
t
,
Params
.
RetryTimesOnReplica
.
GetAsInt
(),
2
)
})
// t.Run("test proxyConfig panic", func(t *testing.T) {
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录