Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
milvus
milvus
提交
b27e6b52
M
milvus
项目概览
milvus
/
milvus
11 个月 前同步成功
通知
260
Star
22476
Fork
2472
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
milvus
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
未验证
提交
b27e6b52
编写于
6月 28, 2021
作者:
C
congqixia
提交者:
GitHub
6月 28, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Force cluster refresh for each dn change event (#6161)
Signed-off-by:
N
Congqi Xia
<
congqi.xia@zilliz.com
>
上级
a5f74c4a
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
155 addition
and
54 deletion
+155
-54
internal/datacoord/cluster.go
internal/datacoord/cluster.go
+130
-52
internal/datacoord/server.go
internal/datacoord/server.go
+25
-2
未找到文件。
internal/datacoord/cluster.go
浏览文件 @
b27e6b52
...
...
@@ -17,6 +17,7 @@ import (
"github.com/milvus-io/milvus/internal/log"
"github.com/milvus-io/milvus/internal/proto/commonpb"
"github.com/milvus-io/milvus/internal/proto/datapb"
"github.com/milvus-io/milvus/internal/util/retry"
"go.uber.org/zap"
"golang.org/x/net/context"
)
...
...
@@ -102,68 +103,130 @@ func (c *cluster) startup(dataNodes []*datapb.DataNodeInfo) error {
deltaChange
:=
c
.
dataManager
.
updateCluster
(
dataNodes
)
nodes
,
chanBuffer
:=
c
.
dataManager
.
getDataNodes
(
false
)
var
rets
[]
*
datapb
.
DataNodeInfo
var
err
error
rets
,
chanBuffer
=
c
.
startupPolicy
.
apply
(
nodes
,
deltaChange
,
chanBuffer
)
c
.
dataManager
.
updateDataNodes
(
rets
,
chanBuffer
)
rets
=
c
.
watch
(
rets
)
rets
,
err
=
c
.
watch
(
rets
)
if
err
!=
nil
{
log
.
Warn
(
"Failed to watch all the status change"
,
zap
.
Error
(
err
))
//does not trigger new another refresh, pending evt will do
}
c
.
dataManager
.
updateDataNodes
(
rets
,
chanBuffer
)
return
nil
}
func
(
c
*
cluster
)
watch
(
nodes
[]
*
datapb
.
DataNodeInfo
)
[]
*
datapb
.
DataNodeInfo
{
// refresh rough refresh datanode status after event received
func
(
c
*
cluster
)
refresh
(
dataNodes
[]
*
datapb
.
DataNodeInfo
)
error
{
deltaChange
:=
c
.
dataManager
.
updateCluster
(
dataNodes
)
nodes
,
chanBuffer
:=
c
.
dataManager
.
getDataNodes
(
false
)
var
rets
[]
*
datapb
.
DataNodeInfo
var
err
error
rets
,
chanBuffer
=
c
.
startupPolicy
.
apply
(
nodes
,
deltaChange
,
chanBuffer
)
c
.
dataManager
.
updateDataNodes
(
rets
,
chanBuffer
)
rets
,
err
=
c
.
watch
(
rets
)
if
err
!=
nil
{
log
.
Warn
(
"Failed to watch all the status change"
,
zap
.
Error
(
err
))
//does not trigger new another refresh, pending evt will do
}
c
.
dataManager
.
updateDataNodes
(
rets
,
chanBuffer
)
// even if some watch failed, status should sync into etcd
return
err
}
// parraRun executes every function in works, running at most maxRunner of
// them concurrently, and returns once all of them have finished.
//
// A maxRunner below 1 is treated as 1 so the works are still executed (one at
// a time) instead of blocking forever with no worker to receive them.
func parraRun(works []func(), maxRunner int) {
	if len(works) == 0 {
		return
	}
	if maxRunner < 1 {
		maxRunner = 1
	}
	// never start more workers than there are works to run
	if maxRunner > len(works) {
		maxRunner = len(works)
	}

	ch := make(chan func())
	var wg sync.WaitGroup
	wg.Add(maxRunner)
	for i := 0; i < maxRunner; i++ {
		go func() {
			defer wg.Done()
			// Keep receiving until the channel is closed. A worker that
			// handled a single work and exited would leave the producer
			// blocked as soon as len(works) exceeds maxRunner.
			for work := range ch {
				work()
			}
		}()
	}

	for _, work := range works {
		ch <- work
	}
	// the sender closes the channel to tell the workers nothing more is coming
	close(ch)
	wg.Wait()
}
func
(
c
*
cluster
)
watch
(
nodes
[]
*
datapb
.
DataNodeInfo
)
([]
*
datapb
.
DataNodeInfo
,
error
)
{
works
:=
make
([]
func
(),
0
,
len
(
nodes
))
mut
:=
sync
.
Mutex
{}
errs
:=
make
([]
error
,
0
,
len
(
nodes
))
for
_
,
n
:=
range
nodes
{
logMsg
:=
fmt
.
Sprintf
(
"Begin to watch channels for node %s:"
,
n
.
Address
)
uncompletes
:=
make
([]
vchannel
,
0
,
len
(
n
.
Channels
))
for
_
,
ch
:=
range
n
.
Channels
{
if
ch
.
State
==
datapb
.
ChannelWatchState_Uncomplete
{
if
len
(
uncompletes
)
==
0
{
logMsg
+=
ch
.
Name
}
else
{
logMsg
+=
","
+
ch
.
Name
works
=
append
(
works
,
func
()
{
logMsg
:=
fmt
.
Sprintf
(
"Begin to watch channels for node %s:"
,
n
.
Address
)
uncompletes
:=
make
([]
vchannel
,
0
,
len
(
n
.
Channels
))
for
_
,
ch
:=
range
n
.
Channels
{
if
ch
.
State
==
datapb
.
ChannelWatchState_Uncomplete
{
if
len
(
uncompletes
)
==
0
{
logMsg
+=
ch
.
Name
}
else
{
logMsg
+=
","
+
ch
.
Name
}
uncompletes
=
append
(
uncompletes
,
vchannel
{
CollectionID
:
ch
.
CollectionID
,
DmlChannel
:
ch
.
Name
,
})
}
uncompletes
=
append
(
uncompletes
,
vchannel
{
CollectionID
:
ch
.
CollectionID
,
DmlChannel
:
ch
.
Name
,
})
}
}
if
len
(
uncompletes
)
==
0
{
continue
}
log
.
Debug
(
logMsg
)
if
len
(
uncompletes
)
==
0
{
return
// all set, just return
}
log
.
Debug
(
logMsg
)
vchanInfos
,
err
:=
c
.
posProvider
.
GetVChanPositions
(
uncompletes
,
true
)
if
err
!=
nil
{
log
.
Warn
(
"get vchannel position failed"
,
zap
.
Error
(
err
))
continue
}
cli
,
err
:=
c
.
sessionManager
.
getOrCreateSession
(
n
.
Address
)
if
err
!=
nil
{
log
.
Warn
(
"get session failed"
,
zap
.
String
(
"addr"
,
n
.
Address
),
zap
.
Error
(
err
))
continue
}
req
:=
&
datapb
.
WatchDmChannelsRequest
{
Base
:
&
commonpb
.
MsgBase
{
SourceID
:
Params
.
NodeID
,
},
Vchannels
:
vchanInfos
,
}
resp
,
err
:=
cli
.
WatchDmChannels
(
c
.
ctx
,
req
)
if
err
!=
nil
{
log
.
Warn
(
"watch dm channel failed"
,
zap
.
String
(
"addr"
,
n
.
Address
),
zap
.
Error
(
err
))
continue
}
if
resp
.
ErrorCode
!=
commonpb
.
ErrorCode_Success
{
log
.
Warn
(
"watch channels failed"
,
zap
.
String
(
"address"
,
n
.
Address
),
zap
.
Error
(
err
))
continue
}
for
_
,
ch
:=
range
n
.
Channels
{
if
ch
.
State
==
datapb
.
ChannelWatchState_Uncomplete
{
ch
.
State
=
datapb
.
ChannelWatchState_Complete
vchanInfos
,
err
:=
c
.
posProvider
.
GetVChanPositions
(
uncompletes
,
true
)
if
err
!=
nil
{
log
.
Warn
(
"get vchannel position failed"
,
zap
.
Error
(
err
))
mut
.
Lock
()
errs
=
append
(
errs
,
err
)
mut
.
Unlock
()
return
}
}
cli
,
err
:=
c
.
sessionManager
.
getOrCreateSession
(
n
.
Address
)
// this might take time if address went offline
if
err
!=
nil
{
log
.
Warn
(
"get session failed"
,
zap
.
String
(
"addr"
,
n
.
Address
),
zap
.
Error
(
err
))
mut
.
Lock
()
errs
=
append
(
errs
,
err
)
mut
.
Unlock
()
return
}
req
:=
&
datapb
.
WatchDmChannelsRequest
{
Base
:
&
commonpb
.
MsgBase
{
SourceID
:
Params
.
NodeID
,
},
Vchannels
:
vchanInfos
,
}
resp
,
err
:=
cli
.
WatchDmChannels
(
c
.
ctx
,
req
)
if
err
!=
nil
{
log
.
Warn
(
"watch dm channel failed"
,
zap
.
String
(
"addr"
,
n
.
Address
),
zap
.
Error
(
err
))
mut
.
Lock
()
errs
=
append
(
errs
,
err
)
mut
.
Unlock
()
}
if
resp
.
ErrorCode
!=
commonpb
.
ErrorCode_Success
{
log
.
Warn
(
"watch channels failed"
,
zap
.
String
(
"address"
,
n
.
Address
),
zap
.
Error
(
err
))
mut
.
Lock
()
errs
=
append
(
errs
,
fmt
.
Errorf
(
"watch fail with stat %v, msg:%s"
,
resp
.
ErrorCode
,
resp
.
Reason
))
mut
.
Unlock
()
return
}
for
_
,
ch
:=
range
n
.
Channels
{
if
ch
.
State
==
datapb
.
ChannelWatchState_Uncomplete
{
ch
.
State
=
datapb
.
ChannelWatchState_Complete
}
}
})
}
return
nodes
parraRun
(
works
,
3
)
return
nodes
,
retry
.
ErrorList
(
errs
)
}
func
(
c
*
cluster
)
register
(
n
*
datapb
.
DataNodeInfo
)
{
...
...
@@ -172,11 +235,16 @@ func (c *cluster) register(n *datapb.DataNodeInfo) {
c
.
dataManager
.
register
(
n
)
cNodes
,
chanBuffer
:=
c
.
dataManager
.
getDataNodes
(
true
)
var
rets
[]
*
datapb
.
DataNodeInfo
var
err
error
log
.
Debug
(
"before register policy applied"
,
zap
.
Any
(
"n.Channels"
,
n
.
Channels
),
zap
.
Any
(
"buffer"
,
chanBuffer
))
rets
,
chanBuffer
=
c
.
registerPolicy
.
apply
(
cNodes
,
n
,
chanBuffer
)
log
.
Debug
(
"after register policy applied"
,
zap
.
Any
(
"ret"
,
rets
),
zap
.
Any
(
"buffer"
,
chanBuffer
))
c
.
dataManager
.
updateDataNodes
(
rets
,
chanBuffer
)
rets
=
c
.
watch
(
rets
)
rets
,
err
=
c
.
watch
(
rets
)
if
err
!=
nil
{
log
.
Warn
(
"Failed to watch all the status change"
,
zap
.
Error
(
err
))
//does not trigger new another refresh, pending evt will do
}
c
.
dataManager
.
updateDataNodes
(
rets
,
chanBuffer
)
}
...
...
@@ -192,6 +260,7 @@ func (c *cluster) unregister(n *datapb.DataNodeInfo) {
cNodes
,
chanBuffer
:=
c
.
dataManager
.
getDataNodes
(
true
)
log
.
Debug
(
"before unregister policy applied"
,
zap
.
Any
(
"n.Channels"
,
n
.
Channels
),
zap
.
Any
(
"buffer"
,
chanBuffer
))
var
rets
[]
*
datapb
.
DataNodeInfo
var
err
error
if
len
(
cNodes
)
==
0
{
for
_
,
chStat
:=
range
n
.
Channels
{
chStat
.
State
=
datapb
.
ChannelWatchState_Uncomplete
...
...
@@ -202,7 +271,11 @@ func (c *cluster) unregister(n *datapb.DataNodeInfo) {
}
log
.
Debug
(
"after register policy applied"
,
zap
.
Any
(
"ret"
,
rets
),
zap
.
Any
(
"buffer"
,
chanBuffer
))
c
.
dataManager
.
updateDataNodes
(
rets
,
chanBuffer
)
rets
=
c
.
watch
(
rets
)
rets
,
err
=
c
.
watch
(
rets
)
if
err
!=
nil
{
log
.
Warn
(
"Failed to watch all the status change"
,
zap
.
Error
(
err
))
//does not trigger new another refresh, pending evt will do
}
c
.
dataManager
.
updateDataNodes
(
rets
,
chanBuffer
)
}
...
...
@@ -211,6 +284,7 @@ func (c *cluster) watchIfNeeded(channel string, collectionID UniqueID) {
defer
c
.
mu
.
Unlock
()
cNodes
,
chanBuffer
:=
c
.
dataManager
.
getDataNodes
(
true
)
var
rets
[]
*
datapb
.
DataNodeInfo
var
err
error
if
len
(
cNodes
)
==
0
{
// no nodes to assign, put into buffer
chanBuffer
=
append
(
chanBuffer
,
&
datapb
.
ChannelStatus
{
Name
:
channel
,
...
...
@@ -221,7 +295,11 @@ func (c *cluster) watchIfNeeded(channel string, collectionID UniqueID) {
rets
=
c
.
assignPolicy
.
apply
(
cNodes
,
channel
,
collectionID
)
}
c
.
dataManager
.
updateDataNodes
(
rets
,
chanBuffer
)
rets
=
c
.
watch
(
rets
)
rets
,
err
=
c
.
watch
(
rets
)
if
err
!=
nil
{
log
.
Warn
(
"Failed to watch all the status change"
,
zap
.
Error
(
err
))
//does not trigger new another refresh, pending evt will do
}
c
.
dataManager
.
updateDataNodes
(
rets
,
chanBuffer
)
}
...
...
internal/datacoord/server.go
浏览文件 @
b27e6b52
...
...
@@ -192,6 +192,27 @@ func (s *Server) initServiceDiscovery() error {
return
nil
}
func
(
s
*
Server
)
loadDataNodes
()
[]
*
datapb
.
DataNodeInfo
{
if
s
.
session
==
nil
{
log
.
Warn
(
"load data nodes but session is nil"
)
return
[]
*
datapb
.
DataNodeInfo
{}
}
sessions
,
_
,
err
:=
s
.
session
.
GetSessions
(
typeutil
.
DataNodeRole
)
if
err
!=
nil
{
log
.
Warn
(
"load data nodes faild"
,
zap
.
Error
(
err
))
return
[]
*
datapb
.
DataNodeInfo
{}
}
datanodes
:=
make
([]
*
datapb
.
DataNodeInfo
,
0
,
len
(
sessions
))
for
_
,
session
:=
range
sessions
{
datanodes
=
append
(
datanodes
,
&
datapb
.
DataNodeInfo
{
Address
:
session
.
Address
,
Version
:
session
.
ServerID
,
Channels
:
[]
*
datapb
.
ChannelStatus
{},
})
}
return
datanodes
}
func
(
s
*
Server
)
startSegmentManager
()
{
helper
:=
createNewSegmentHelper
(
s
.
segmentInfoStream
)
s
.
segmentManager
=
newSegmentManager
(
s
.
meta
,
s
.
allocator
,
withAllocHelper
(
helper
))
...
...
@@ -368,12 +389,14 @@ func (s *Server) startWatchService(ctx context.Context) {
log
.
Info
(
"Received datanode register"
,
zap
.
String
(
"address"
,
datanode
.
Address
),
zap
.
Int64
(
"serverID"
,
datanode
.
Version
))
s
.
cluster
.
register
(
datanode
)
//s.cluster.register(datanode)
s
.
cluster
.
refresh
(
s
.
loadDataNodes
())
case
sessionutil
.
SessionDelEvent
:
log
.
Info
(
"Received datanode unregister"
,
zap
.
String
(
"address"
,
datanode
.
Address
),
zap
.
Int64
(
"serverID"
,
datanode
.
Version
))
s
.
cluster
.
unregister
(
datanode
)
//s.cluster.unregister(datanode)
s
.
cluster
.
refresh
(
s
.
loadDataNodes
())
default
:
log
.
Warn
(
"receive unknown service event type"
,
zap
.
Any
(
"type"
,
event
.
EventType
))
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录