Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
milvus
milvus
提交
5872c5af
M
milvus
项目概览
milvus
/
milvus
12 个月 前同步成功
通知
261
Star
22476
Fork
2472
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
milvus
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
5872c5af
编写于
5月 26, 2022
作者:
Y
yah01
提交者:
GitHub
5月 26, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Fix updating shard leaders may lost some modifications (#17218)
Signed-off-by:
N
yah01
<
yang.cen@zilliz.com
>
上级
56778787
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
66 addition
and
44 deletion
+66
-44
internal/querycoord/meta.go
internal/querycoord/meta.go
+5
-0
internal/querycoord/replica.go
internal/querycoord/replica.go
+47
-16
internal/querycoord/task.go
internal/querycoord/task.go
+14
-28
未找到文件。
internal/querycoord/meta.go
浏览文件 @
5872c5af
...
...
@@ -105,6 +105,7 @@ type Meta interface {
getReplicasByCollectionID
(
collectionID
int64
)
([]
*
milvuspb
.
ReplicaInfo
,
error
)
getReplicasByNodeID
(
nodeID
int64
)
([]
*
milvuspb
.
ReplicaInfo
,
error
)
applyReplicaBalancePlan
(
p
*
balancePlan
)
error
updateShardLeader
(
replicaID
UniqueID
,
dmChannel
string
,
leaderID
UniqueID
,
leaderAddr
string
)
error
}
// MetaReplica records the current load information on all querynodes
...
...
@@ -1288,6 +1289,10 @@ func (m *MetaReplica) applyReplicaBalancePlan(p *balancePlan) error {
return
m
.
replicas
.
ApplyBalancePlan
(
p
,
m
.
getKvClient
())
}
func
(
m
*
MetaReplica
)
updateShardLeader
(
replicaID
UniqueID
,
dmChannel
string
,
leaderID
UniqueID
,
leaderAddr
string
)
error
{
return
m
.
replicas
.
UpdateShardLeader
(
replicaID
,
dmChannel
,
leaderID
,
leaderAddr
,
m
.
getKvClient
())
}
//func (m *MetaReplica) printMeta() {
// m.RLock()
// defer m.RUnlock()
...
...
internal/querycoord/replica.go
浏览文件 @
5872c5af
...
...
@@ -179,22 +179,9 @@ func (rep *ReplicaInfos) ApplyBalancePlan(p *balancePlan, kv kv.MetaKv) error {
}
// save to etcd first
if
len
(
replicasChanged
)
>
0
{
data
:=
make
(
map
[
string
]
string
)
for
_
,
info
:=
range
replicasChanged
{
infoBytes
,
err
:=
proto
.
Marshal
(
info
)
if
err
!=
nil
{
return
err
}
key
:=
fmt
.
Sprintf
(
"%s/%d"
,
ReplicaMetaPrefix
,
info
.
ReplicaID
)
data
[
key
]
=
string
(
infoBytes
)
}
err
:=
kv
.
MultiSave
(
data
)
if
err
!=
nil
{
return
err
}
err
:=
saveReplica
(
kv
,
replicasChanged
...
)
if
err
!=
nil
{
return
err
}
// apply change to in-memory meta
...
...
@@ -209,6 +196,33 @@ func (rep *ReplicaInfos) ApplyBalancePlan(p *balancePlan, kv kv.MetaKv) error {
return
nil
}
func
(
rep
*
ReplicaInfos
)
UpdateShardLeader
(
replicaID
UniqueID
,
dmChannel
string
,
leaderID
UniqueID
,
leaderAddr
string
,
meta
kv
.
MetaKv
)
error
{
rep
.
globalGuard
.
Lock
()
defer
rep
.
globalGuard
.
Unlock
()
replica
,
ok
:=
rep
.
get
(
replicaID
)
if
!
ok
{
return
fmt
.
Errorf
(
"replica %v not found"
,
replicaID
)
}
for
_
,
shard
:=
range
replica
.
ShardReplicas
{
if
shard
.
DmChannelName
==
dmChannel
{
shard
.
LeaderID
=
leaderID
shard
.
LeaderAddr
=
leaderAddr
break
}
}
err
:=
saveReplica
(
meta
,
replica
)
if
err
!=
nil
{
return
err
}
rep
.
upsert
(
replica
)
return
nil
}
// removeNodeFromReplica helper function to remove nodeID from replica NodeIds list.
func
removeNodeFromReplica
(
replica
*
milvuspb
.
ReplicaInfo
,
nodeID
int64
)
*
milvuspb
.
ReplicaInfo
{
for
i
:=
0
;
i
<
len
(
replica
.
NodeIds
);
i
++
{
...
...
@@ -220,3 +234,20 @@ func removeNodeFromReplica(replica *milvuspb.ReplicaInfo, nodeID int64) *milvusp
}
return
replica
}
// save the replicas into etcd.
func
saveReplica
(
meta
kv
.
MetaKv
,
replicas
...*
milvuspb
.
ReplicaInfo
)
error
{
data
:=
make
(
map
[
string
]
string
)
for
_
,
info
:=
range
replicas
{
infoBytes
,
err
:=
proto
.
Marshal
(
info
)
if
err
!=
nil
{
return
err
}
key
:=
fmt
.
Sprintf
(
"%s/%d"
,
ReplicaMetaPrefix
,
info
.
ReplicaID
)
data
[
key
]
=
string
(
infoBytes
)
}
return
meta
.
MultiSave
(
data
)
}
internal/querycoord/task.go
浏览文件 @
5872c5af
...
...
@@ -2405,50 +2405,36 @@ func (lbt *loadBalanceTask) globalPostExecute(ctx context.Context) error {
for
_
,
childTask
:=
range
lbt
.
getChildTask
()
{
if
task
,
ok
:=
childTask
.
(
*
watchDmChannelTask
);
ok
{
wg
.
Go
(
func
()
error
{
nodeInfo
,
err
:=
lbt
.
cluster
.
getNodeInfoByID
(
task
.
NodeID
)
leaderID
:=
task
.
NodeID
dmChannel
:=
task
.
Infos
[
0
]
.
ChannelName
nodeInfo
,
err
:=
lbt
.
cluster
.
getNodeInfoByID
(
leaderID
)
if
err
!=
nil
{
log
.
Error
(
"failed to get node info to update shard leader info"
,
zap
.
Int64
(
"triggerTaskID"
,
lbt
.
getTaskID
()),
zap
.
Int64
(
"taskID"
,
task
.
getTaskID
()),
zap
.
Int64
(
"nodeID"
,
task
.
Node
ID
),
zap
.
String
(
"dmChannel"
,
task
.
Infos
[
0
]
.
ChannelName
),
zap
.
Int64
(
"nodeID"
,
leader
ID
),
zap
.
String
(
"dmChannel"
,
dmChannel
),
zap
.
Error
(
err
))
return
err
}
replica
,
err
:=
lbt
.
meta
.
getReplicaByID
(
task
.
ReplicaID
)
err
=
lbt
.
meta
.
updateShardLeader
(
task
.
ReplicaID
,
dmChannel
,
leaderID
,
nodeInfo
.
(
*
queryNode
)
.
address
)
if
err
!=
nil
{
log
.
Error
(
"failed to
get replica to update shard leader info
"
,
log
.
Error
(
"failed to
update shard leader info of replica
"
,
zap
.
Int64
(
"triggerTaskID"
,
lbt
.
getTaskID
()),
zap
.
Int64
(
"taskID"
,
task
.
getTaskID
()),
zap
.
Int64
(
"replicaID"
,
task
.
ReplicaID
),
zap
.
String
(
"dmChannel"
,
task
.
Infos
[
0
]
.
ChannelName
),
zap
.
String
(
"dmChannel"
,
dmChannel
),
zap
.
Error
(
err
))
return
err
}
for
_
,
shard
:=
range
replica
.
ShardReplicas
{
if
shard
.
DmChannelName
==
task
.
Infos
[
0
]
.
ChannelName
{
log
.
Debug
(
"LoadBalance: update shard leader"
,
zap
.
Int64
(
"triggerTaskID"
,
lbt
.
getTaskID
()),
zap
.
Int64
(
"taskID"
,
task
.
getTaskID
()),
zap
.
Int64
(
"oldLeader"
,
shard
.
LeaderID
),
zap
.
Int64
(
"newLeader"
,
task
.
NodeID
))
shard
.
LeaderID
=
task
.
NodeID
shard
.
LeaderAddr
=
nodeInfo
.
(
*
queryNode
)
.
address
break
}
}
err
=
lbt
.
meta
.
setReplicaInfo
(
replica
)
if
err
!=
nil
{
log
.
Error
(
"failed to remove offline nodes from replica info"
,
zap
.
Int64
(
"triggerTaskID"
,
lbt
.
getTaskID
()),
zap
.
Int64
(
"taskID"
,
task
.
getTaskID
()),
zap
.
Int64
(
"replicaID"
,
replica
.
ReplicaID
),
zap
.
Error
(
err
))
return
err
}
log
.
Debug
(
"LoadBalance: update shard leader"
,
zap
.
Int64
(
"triggerTaskID"
,
lbt
.
getTaskID
()),
zap
.
Int64
(
"taskID"
,
task
.
getTaskID
()),
zap
.
String
(
"dmChannel"
,
dmChannel
),
zap
.
Int64
(
"leader"
,
leaderID
))
return
nil
})
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录