Unverified · Commit 4a32b842 authored by zhenshan.cao, committed by GitHub

Improve the check logic of channel remove (#23473)

Signed-off-by: zhenshan.cao <zhenshan.cao@zilliz.com>
Parent 3ad9ff7a
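
In rough terms, this change extends `CheckShouldDropChannel` to take the owning collection ID, so a channel is dropped either when it is explicitly marked for removal in the catalog or when its collection no longer exists according to RootCoord. The Go sketch below is a simplified, self-contained illustration of that decision flow; `channelMeta`, `collectionExists`, and `shouldDropChannel` are placeholder names for this sketch, not the actual DataCoord types.

```go
package main

import "fmt"

// channelMeta stands in for the catalog state that records channels
// already flagged for removal.
type channelMeta struct {
	markedDropped map[string]bool
}

// collectionExists stands in for a DescribeCollection round trip to RootCoord.
func collectionExists(collectionID int64) bool {
	return collectionID >= 0 // placeholder rule for the sketch
}

// shouldDropChannel mirrors the new two-argument check: the collection ID is
// supplied by the caller instead of being inferred from the channel name alone.
func shouldDropChannel(meta *channelMeta, channel string, collectionID int64) bool {
	if meta.markedDropped[channel] {
		return true // explicitly marked for removal
	}
	return !collectionExists(collectionID) // owning collection is gone
}

func main() {
	meta := &channelMeta{markedDropped: map[string]bool{"ch1": true}}
	fmt.Println(shouldDropChannel(meta, "ch1", 0))  // true: marked in the catalog
	fmt.Println(shouldDropChannel(meta, "ch2", -1)) // true: collection no longer exists
	fmt.Println(shouldDropChannel(meta, "ch2", 0))  // false: keep the channel
}
```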
@@ -243,7 +243,7 @@ func (c *ChannelManager) unwatchDroppedChannels() {
nodeChannels := c.store.GetNodesChannels()
for _, nodeChannel := range nodeChannels {
for _, ch := range nodeChannel.Channels {
if !c.h.CheckShouldDropChannel(ch.Name) {
if !c.h.CheckShouldDropChannel(ch.Name, ch.CollectionID) {
continue
}
err := c.remove(nodeChannel.NodeID, ch)
@@ -766,7 +766,7 @@ func (c *ChannelManager) Reassign(originNodeID UniqueID, channelName string) err
reallocates := &NodeChannelInfo{originNodeID, []*channel{ch}}
if c.isMarkedDrop(channelName) {
if c.isMarkedDrop(channelName, ch.CollectionID) {
if err := c.remove(originNodeID, ch); err != nil {
return fmt.Errorf("failed to remove watch info: %v,%s", ch, err.Error())
}
@@ -813,7 +813,7 @@ func (c *ChannelManager) CleanupAndReassign(nodeID UniqueID, channelName string)
reallocates := &NodeChannelInfo{nodeID, []*channel{chToCleanUp}}
if c.isMarkedDrop(channelName) {
if c.isMarkedDrop(channelName, chToCleanUp.CollectionID) {
if err := c.remove(nodeID, chToCleanUp); err != nil {
return fmt.Errorf("failed to remove watch info: %v,%s", chToCleanUp, err.Error())
}
@@ -871,8 +871,8 @@ func (c *ChannelManager) getNodeIDByChannelName(chName string) (bool, UniqueID)
return false, 0
}
func (c *ChannelManager) isMarkedDrop(channelName string) bool {
return c.h.CheckShouldDropChannel(channelName)
func (c *ChannelManager) isMarkedDrop(channelName string, collectionID UniqueID) bool {
return c.h.CheckShouldDropChannel(channelName, collectionID)
}
func getReleaseOp(nodeID UniqueID, ch *channel) ChannelOpSet {
......
@@ -18,6 +18,7 @@ package datacoord
import (
"context"
"time"
"github.com/milvus-io/milvus/pkg/util/tsoutil"
"go.uber.org/zap"
@@ -27,6 +28,7 @@ import (
"github.com/milvus-io/milvus/internal/proto/datapb"
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/util/funcutil"
"github.com/milvus-io/milvus/pkg/util/retry"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)
@@ -36,7 +38,7 @@ type Handler interface {
GetQueryVChanPositions(channel *channel, partitionID UniqueID) *datapb.VchannelInfo
// GetDataVChanPositions gets the recovery information of a channel needed by DataNode
GetDataVChanPositions(channel *channel, partitionID UniqueID) *datapb.VchannelInfo
CheckShouldDropChannel(channel string) bool
CheckShouldDropChannel(channel string, collectionID UniqueID) bool
FinishDropChannel(channel string) error
GetCollection(ctx context.Context, collectionID UniqueID) (*collectionInfo, error)
}
@@ -316,6 +318,26 @@ func trimSegmentInfo(info *datapb.SegmentInfo) *datapb.SegmentInfo {
}
}
// HasCollection returns whether the collection exists, from the user's perspective.
func (h *ServerHandler) HasCollection(ctx context.Context, collectionID UniqueID) (bool, error) {
var hasCollection bool
ctx2, cancel := context.WithTimeout(ctx, time.Minute*30)
defer cancel()
if err := retry.Do(ctx2, func() error {
has, err := h.s.hasCollection(ctx2, collectionID)
if err != nil {
log.RatedInfo(60, "datacoord ServerHandler HasCollection retry failed", zap.Error(err))
return err
}
hasCollection = has
return nil
}, retry.Attempts(100000)); err != nil {
log.Error("datacoord ServerHandler HasCollection finally failed")
panic("datacoord ServerHandler HasCollection finally failed")
}
return hasCollection, nil
}
// GetCollection returns collection info with specified collection id
func (h *ServerHandler) GetCollection(ctx context.Context, collectionID UniqueID) (*collectionInfo, error) {
coll := h.s.meta.GetCollection(collectionID)
@@ -332,9 +354,20 @@ func (h *ServerHandler) GetCollection(ctx context.Context, collectionID UniqueID
}
// CheckShouldDropChannel returns whether the specified channel is marked to be removed
func (h *ServerHandler) CheckShouldDropChannel(channel string) bool {
return h.s.meta.catalog.ShouldDropChannel(h.s.ctx, channel) ||
!h.s.meta.catalog.ChannelExists(h.s.ctx, channel)
func (h *ServerHandler) CheckShouldDropChannel(channel string, collectionID UniqueID) bool {
if h.s.meta.catalog.ShouldDropChannel(h.s.ctx, channel) {
return true
}
// collectionID is parsed from the channel name
has, err := h.HasCollection(h.s.ctx, collectionID)
if err != nil {
log.Info("datacoord ServerHandler CheckShouldDropChannel hasCollection failed", zap.Error(err))
return false
}
log.Info("datacoord ServerHandler CheckShouldDropChannel hasCollection", zap.Bool("shouldDropChannel", !has),
zap.String("channel", channel))
return !has
}
// FinishDropChannel cleans up the remove flag for channels
......
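
For reference, the `HasCollection` helper above wraps the RootCoord probe in the project's retry utility (`retry.Do` with `retry.Attempts(100000)`) under a 30-minute context and panics only if every attempt fails. The snippet below is a minimal, generic stand-in for that pattern using only the standard library; `retryDo` is an illustrative name, not the real `pkg/util/retry` implementation.

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// retryDo re-invokes fn until it succeeds, the context expires,
// or the attempt budget is exhausted.
func retryDo(ctx context.Context, fn func() error, attempts int, backoff time.Duration) error {
	var lastErr error
	for i := 0; i < attempts; i++ {
		lastErr = fn()
		if lastErr == nil {
			return nil
		}
		select {
		case <-ctx.Done():
			return ctx.Err() // give up once the caller's deadline passes
		case <-time.After(backoff):
		}
	}
	return lastErr
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()

	calls := 0
	err := retryDo(ctx, func() error {
		calls++
		if calls < 3 {
			return errors.New("rootcoord not ready") // transient failure
		}
		return nil // probe eventually succeeds
	}, 10, 100*time.Millisecond)

	fmt.Println("calls:", calls, "err:", err)
}
```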
@@ -860,7 +860,7 @@ func (h *mockHandler) GetDataVChanPositions(channel *channel, partitionID Unique
}
}
func (h *mockHandler) CheckShouldDropChannel(channel string) bool {
func (h *mockHandler) CheckShouldDropChannel(channel string, collectionID UniqueID) bool {
return false
}
......
@@ -1016,6 +1016,54 @@ func (s *Server) loadCollectionFromRootCoord(ctx context.Context, collectionID i
return nil
}
// hasCollection communicates with RootCoord and checks whether this collection exists, from the user's perspective.
func (s *Server) hasCollection(ctx context.Context, collectionID int64) (bool, error) {
resp, err := s.rootCoordClient.DescribeCollection(ctx, &milvuspb.DescribeCollectionRequest{
Base: commonpbutil.NewMsgBase(
commonpbutil.WithMsgType(commonpb.MsgType_DescribeCollection),
commonpbutil.WithSourceID(paramtable.GetNodeID()),
),
DbName: "",
CollectionID: collectionID,
})
if err != nil {
return false, err
}
if resp == nil {
return false, errNilResponse
}
if resp.Status.ErrorCode == commonpb.ErrorCode_Success {
return true, nil
}
if resp.Status.ErrorCode == commonpb.ErrorCode_CollectionNotExists {
return false, nil
}
return false, fmt.Errorf("code:%s, reason:%s", resp.Status.GetErrorCode().String(), resp.Status.GetReason())
}
// hasCollectionInternal communicates with RootCoord and checks whether this collection's meta exists in RootCoord.
func (s *Server) hasCollectionInternal(ctx context.Context, collectionID int64) (bool, error) {
resp, err := s.rootCoordClient.DescribeCollectionInternal(ctx, &milvuspb.DescribeCollectionRequest{
Base: commonpbutil.NewMsgBase(
commonpbutil.WithMsgType(commonpb.MsgType_DescribeCollection),
commonpbutil.WithSourceID(paramtable.GetNodeID()),
),
DbName: "",
CollectionID: collectionID,
})
if err != nil {
return false, err
}
if resp == nil {
return false, errNilResponse
}
if resp.Status.ErrorCode != commonpb.ErrorCode_Success {
return false, nil
}
return true, nil
}
func (s *Server) reCollectSegmentStats(ctx context.Context) {
if s.channelManager == nil {
log.Error("null channel manager found, which should NOT happen in non-testing environment")
......
@@ -60,6 +60,7 @@ import (
"github.com/milvus-io/milvus/pkg/util/metautil"
"github.com/milvus-io/milvus/pkg/util/metricsinfo"
"github.com/milvus-io/milvus/pkg/util/paramtable"
"github.com/milvus-io/milvus/pkg/util/tsoutil"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)
@@ -2352,7 +2353,30 @@ func TestGetQueryVChanPositions(t *testing.T) {
}
func TestShouldDropChannel(t *testing.T) {
svr := newTestServer(t, nil)
type myRootCoord struct {
mocks.RootCoord
}
myRoot := &myRootCoord{}
myRoot.EXPECT().Init().Return(nil)
myRoot.EXPECT().Start().Return(nil)
myRoot.EXPECT().AllocTimestamp(mock.Anything, mock.Anything).Return(&rootcoordpb.AllocTimestampResponse{
Status: &commonpb.Status{ErrorCode: commonpb.ErrorCode_Success},
Timestamp: tsoutil.ComposeTSByTime(time.Now(), 0),
Count: 1,
}, nil)
myRoot.EXPECT().AllocID(mock.Anything, mock.Anything).Return(&rootcoordpb.AllocIDResponse{
Status: &commonpb.Status{ErrorCode: commonpb.ErrorCode_Success},
ID: int64(tsoutil.ComposeTSByTime(time.Now(), 0)),
Count: 1,
}, nil)
var crt rootCoordCreatorFunc = func(ctx context.Context, metaRoot string, etcdClient *clientv3.Client) (types.RootCoord, error) {
return myRoot, nil
}
opt := WithRootCoordCreator(crt)
svr := newTestServer(t, nil, opt)
defer closeTestServer(t, svr)
schema := newTestSchema()
svr.meta.AddCollection(&collectionInfo{
@@ -2375,122 +2399,53 @@ func TestShouldDropChannel(t *testing.T) {
},
},
})
/*
s1 := &datapb.SegmentInfo{
ID: 1,
CollectionID: 0,
PartitionID: 0,
InsertChannel: "ch1",
State: commonpb.SegmentState_Dropped,
StartPosition: &msgpb.MsgPosition{
ChannelName: "ch1",
MsgID: []byte{8, 9, 10},
MsgGroup: "",
Timestamp: 0,
},
DmlPosition: &msgpb.MsgPosition{
ChannelName: "ch1",
MsgID: []byte{1, 2, 3},
MsgGroup: "",
Timestamp: 0,
},
}
s2 := &datapb.SegmentInfo{
ID: 2,
CollectionID: 0,
PartitionID: 0,
InsertChannel: "ch1",
State: commonpb.SegmentState_Flushed,
CompactionFrom: []int64{4, 5},
StartPosition: &msgpb.MsgPosition{
ChannelName: "ch1",
MsgID: []byte{8, 9, 10},
MsgGroup: "",
},
DmlPosition: &msgpb.MsgPosition{
ChannelName: "ch1",
MsgID: []byte{1, 2, 3},
MsgGroup: "",
Timestamp: 1,
},
}
s3 := &datapb.SegmentInfo{
ID: 3,
CollectionID: 0,
PartitionID: 1,
InsertChannel: "ch1",
State: commonpb.SegmentState_Growing,
StartPosition: &msgpb.MsgPosition{
ChannelName: "ch1",
MsgID: []byte{8, 9, 10},
MsgGroup: "",
},
DmlPosition: &msgpb.MsgPosition{
ChannelName: "ch1",
MsgID: []byte{11, 12, 13},
MsgGroup: "",
Timestamp: 2,
},
}
s4 := &datapb.SegmentInfo{
ID: 4,
CollectionID: 0,
PartitionID: 1,
InsertChannel: "ch1",
State: commonpb.SegmentState_Growing,
}*/
/*
t.Run("channel without segments", func(t *testing.T) {
r := svr.handler.CheckShouldDropChannel("ch1")
assert.True(t, r)
})
t.Run("channel with all dropped segments", func(t *testing.T) {
err := svr.meta.AddSegment(NewSegmentInfo(s1))
require.NoError(t, err)
r := svr.handler.CheckShouldDropChannel("ch1")
assert.True(t, r)
})
t.Run("channel with all dropped segments and flushed compacted segments", func(t *testing.T) {
err := svr.meta.AddSegment(NewSegmentInfo(s2))
require.Nil(t, err)
r := svr.handler.CheckShouldDropChannel("ch1")
assert.False(t, r)
})
t.Run("channel with other state segments", func(t *testing.T) {
err := svr.meta.DropSegment(2)
require.Nil(t, err)
err = svr.meta.AddSegment(NewSegmentInfo(s3))
require.Nil(t, err)
t.Run("channel name not in kv, collection not exist", func(t *testing.T) {
//myRoot.code = commonpb.ErrorCode_CollectionNotExists
myRoot.EXPECT().DescribeCollection(mock.Anything, mock.Anything).
Return(&milvuspb.DescribeCollectionResponse{
Status: &commonpb.Status{ErrorCode: commonpb.ErrorCode_CollectionNotExists},
CollectionID: -1,
}, nil).Once()
assert.True(t, svr.handler.CheckShouldDropChannel("ch99", -1))
})
r := svr.handler.CheckShouldDropChannel("ch1")
assert.False(t, r)
})
t.Run("channel name not in kv, collection exist", func(t *testing.T) {
myRoot.EXPECT().DescribeCollection(mock.Anything, mock.Anything).
Return(&milvuspb.DescribeCollectionResponse{
Status: &commonpb.Status{ErrorCode: commonpb.ErrorCode_Success},
CollectionID: 0,
}, nil).Once()
assert.False(t, svr.handler.CheckShouldDropChannel("ch99", 0))
})
t.Run("channel with dropped segment and with segment without start position", func(t *testing.T) {
err := svr.meta.DropSegment(3)
require.Nil(t, err)
err = svr.meta.AddSegment(NewSegmentInfo(s4))
require.Nil(t, err)
t.Run("collection name in kv, collection exist", func(t *testing.T) {
myRoot.EXPECT().DescribeCollection(mock.Anything, mock.Anything).
Return(&milvuspb.DescribeCollectionResponse{
Status: &commonpb.Status{ErrorCode: commonpb.ErrorCode_Success},
CollectionID: 0,
}, nil).Once()
assert.False(t, svr.handler.CheckShouldDropChannel("ch1", 0))
})
r := svr.handler.CheckShouldDropChannel("ch1")
assert.True(t, r)
})
*/
t.Run("channel name not in kv", func(t *testing.T) {
assert.True(t, svr.handler.CheckShouldDropChannel("ch99"))
t.Run("collection name in kv, collection not exist", func(t *testing.T) {
myRoot.EXPECT().DescribeCollection(mock.Anything, mock.Anything).
Return(&milvuspb.DescribeCollectionResponse{
Status: &commonpb.Status{ErrorCode: commonpb.ErrorCode_CollectionNotExists},
CollectionID: -1,
}, nil).Once()
assert.True(t, svr.handler.CheckShouldDropChannel("ch1", -1))
})
t.Run("channel in remove flag", func(t *testing.T) {
t.Run("channel in remove flag, collection exist", func(t *testing.T) {
err := svr.meta.catalog.MarkChannelDeleted(context.TODO(), "ch1")
require.NoError(t, err)
assert.True(t, svr.handler.CheckShouldDropChannel("ch1"))
myRoot.EXPECT().DescribeCollection(mock.Anything, mock.Anything).
Return(&milvuspb.DescribeCollectionResponse{
Status: &commonpb.Status{ErrorCode: commonpb.ErrorCode_Success},
CollectionID: 0,
}, nil).Once()
assert.True(t, svr.handler.CheckShouldDropChannel("ch1", 0))
})
}
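
The rewritten test stubs `DescribeCollection` with a fresh expectation per sub-test and marks each one `.Once()`, so every case consumes exactly one canned response. Below is a minimal sketch of the same idea using plain `testify/mock` (the diff itself uses a mockery-generated `EXPECT()` wrapper); `rootCoordMock` and its method are illustrative names for this sketch.

```go
package main

import (
	"fmt"

	"github.com/stretchr/testify/mock"
)

type rootCoordMock struct{ mock.Mock }

// DescribeCollection returns whatever the currently queued expectation provides.
func (m *rootCoordMock) DescribeCollection(id int64) (bool, error) {
	args := m.Called(id)
	return args.Bool(0), args.Error(1)
}

func main() {
	m := &rootCoordMock{}
	// Each expectation is consumed by exactly one call, mirroring the
	// per-subtest .Once() stubs in TestShouldDropChannel.
	m.On("DescribeCollection", mock.Anything).Return(false, nil).Once()
	m.On("DescribeCollection", mock.Anything).Return(true, nil).Once()

	fmt.Println(m.DescribeCollection(1)) // false <nil> (first stub)
	fmt.Println(m.DescribeCollection(1)) // true <nil>  (second stub)
}
```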
@@ -3956,7 +3911,7 @@ func newTestServer(t *testing.T, receiveCh chan any, opts ...Option) *Server {
_, err = etcdCli.Delete(context.Background(), sessKey, clientv3.WithPrefix())
assert.Nil(t, err)
svr := CreateServer(context.TODO(), factory, opts...)
svr := CreateServer(context.TODO(), factory)
svr.SetEtcdClient(etcdCli)
svr.dataNodeCreator = func(ctx context.Context, addr string) (types.DataNode, error) {
return newMockDataNodeClient(0, receiveCh)
@@ -3965,6 +3920,10 @@ func newTestServer(t *testing.T, receiveCh chan any, opts ...Option) *Server {
return newMockRootCoordService(), nil
}
for _, opt := range opts {
opt(svr)
}
err = svr.Init()
assert.Nil(t, err)
if Params.DataCoordCfg.EnableActiveStandby.GetAsBool() {
......
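
The `newTestServer` change above stops forwarding `opts` to `CreateServer` and instead applies them after the default mock RootCoord creator has been installed, so an option such as `WithRootCoordCreator` can override the default (which `TestShouldDropChannel` now relies on). A small, generic sketch of why application order matters with functional options; `server`, `option`, and `withRootCoordCreator` are illustrative names, not the DataCoord types.

```go
package main

import "fmt"

type server struct {
	rootCoordCreator string // stands in for the creator callback
}

type option func(*server)

// withRootCoordCreator mirrors the idea of WithRootCoordCreator:
// it swaps the creator installed on the server.
func withRootCoordCreator(name string) option {
	return func(s *server) { s.rootCoordCreator = name }
}

func newTestServer(opts ...option) *server {
	s := &server{}
	s.rootCoordCreator = "default mock" // default installed first
	for _, opt := range opts {          // options applied afterwards win
		opt(s)
	}
	return s
}

func main() {
	fmt.Println(newTestServer().rootCoordCreator)                               // default mock
	fmt.Println(newTestServer(withRootCoordCreator("custom")).rootCoordCreator) // custom
}
```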
@@ -209,10 +209,10 @@ func TestFlowGraphManager(t *testing.T) {
fm.dropAll()
const channelPrefix = "by-dev-rootcoord-dml-test-fg-mgr-execute-"
var baseParams = &Params.BaseTable
baseParams.Save(Params.DataNodeCfg.MemoryForceSyncEnable.Key, fmt.Sprintf("%t", true))
for _, test := range tests {
var baseParams = &Params.BaseTable
baseParams.Save(Params.DataNodeCfg.MemoryWatermark.Key, fmt.Sprintf("%f", test.watermark))
baseParams.Save(Params.DataNodeCfg.MemoryForceSyncEnable.Key, fmt.Sprintf("%t", true))
for i, memorySize := range test.memorySizes {
vchannel := fmt.Sprintf("%s%d", channelPrefix, i)
vchan := &datapb.VchannelInfo{
......