Unverified commit a1cd03dd, authored by groot, committed by GitHub

Don't expire pending import tasks (#20432)

Signed-off-by: yhmo <yihua.mo@zilliz.com>
Parent: 13dbc090
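To make the intent concrete, below is a minimal, hedged Go sketch of the task lifecycle after this change (simplified types, not the actual Milvus code): a task's expiration clock starts only when it is dispatched to a DataNode, so tasks still waiting in the pending list can no longer time out.

```go
package main

import (
	"fmt"
	"time"
)

// task is a simplified stand-in for datapb.ImportTaskInfo, keeping only the
// fields relevant here; StartTs is the field this commit adds.
type task struct {
	ID      int64
	StartTs int64 // set when the task is sent to a DataNode; zero while pending
}

// dispatch mimics sendOutTasks(): moving a task from the pending list to the
// working list now also records the dispatch time.
func dispatch(t *task) {
	t.StartTs = time.Now().Unix()
}

// expired mirrors the revised taskExpired(): age is measured from StartTs.
// Tasks that were never dispatched are simply not checked anymore, because
// expireOldTasksFromMem() no longer walks the pending list.
func expired(t *task, expirationSec float64) bool {
	return expirationSec <= float64(time.Now().Unix()-t.StartTs)
}

func main() {
	t := &task{ID: 1}
	dispatch(t)                          // pending -> working, StartTs recorded
	fmt.Println(expired(t, 900))         // false right after dispatch
	t.StartTs = time.Now().Unix() - 1000 // simulate a task dispatched 1000s ago
	fmt.Println(expired(t, 900))         // true: exceeded the 900s expiration
}
```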
......@@ -559,6 +559,7 @@ message ImportTaskInfo {
string collection_name = 12; // Collection name for the import task.
string partition_name = 13; // Partition name for the import task.
repeated common.KeyValuePair infos = 14; // extra information about the task, bucket, etc.
int64 start_ts = 15; // Timestamp when the import task is sent to datanode to execute.
}
message ImportTaskResponse {
......
......@@ -252,6 +252,7 @@ func (m *importManager) sendOutTasks(ctx context.Context) error {
defer m.workingLock.Unlock()
log.Debug("import task added as working task", zap.Int64("task ID", it.TaskId))
task.State.StateCode = commonpb.ImportState_ImportStarted
task.StartTs = time.Now().Unix()
// first update the import task into meta store and then put it into working tasks
if err := m.persistTaskInfo(task); err != nil {
log.Error("failed to update import task",
......@@ -284,6 +285,11 @@ func (m *importManager) flipTaskState(ctx context.Context) error {
if task.GetState().GetStateCode() == commonpb.ImportState_ImportPersisted {
log.Info("<ImportPersisted> task found, checking if it is eligible to become <ImportCompleted>",
zap.Int64("task ID", task.GetId()))
// TODO: if the collection or partition has been dropped before the task completes,
// we need to set the task to failed, because checkIndexingDone() cannot know
// whether the collection has been dropped.
resp := m.getTaskState(task.GetId())
ok, err := m.checkIndexingDone(ctx, resp.GetCollectionId(), resp.GetSegmentIds())
if err != nil {
......@@ -858,54 +864,27 @@ func (m *importManager) yieldTaskInfo(tID int64) error {
// expireOldTasks removes expired tasks from memory.
func (m *importManager) expireOldTasksFromMem() {
// Expire old pending tasks, if any.
func() {
m.pendingLock.Lock()
defer m.pendingLock.Unlock()
index := 0
for _, t := range m.pendingTasks {
taskExpiredAndStateUpdated := false
if taskExpired(t) {
taskID := t.GetId()
m.pendingLock.Unlock()
if err := m.setImportTaskStateAndReason(taskID, commonpb.ImportState_ImportFailed,
"the import task has timed out"); err != nil {
log.Error("failed to set import task state",
zap.Int64("task ID", taskID),
zap.Any("target state", commonpb.ImportState_ImportFailed))
} else {
// Set true when task has expired and its state has been successfully updated.
taskExpiredAndStateUpdated = true
}
m.pendingLock.Lock()
log.Info("a pending task has expired", zap.Int64("task ID", t.GetId()))
}
if !taskExpiredAndStateUpdated {
// Only keep tasks that are not expired or failed to have their states updated.
m.pendingTasks[index] = t
index++
}
}
// To prevent memory leak.
for i := index; i < len(m.pendingTasks); i++ {
m.pendingTasks[i] = nil
}
m.pendingTasks = m.pendingTasks[:index]
}()
// Expire old working tasks.
// No need to expire pending tasks: as old working tasks finish or expire, DataNodes become idle again,
// and sendOutTasksLoop() pushes the pending tasks to those DataNodes.
// Expire old working tasks.
func() {
m.workingLock.Lock()
defer m.workingLock.Unlock()
for _, v := range m.workingTasks {
taskExpiredAndStateUpdated := false
if v.GetState().GetStateCode() != commonpb.ImportState_ImportCompleted && taskExpired(v) {
log.Info("a working task has expired", zap.Int64("task ID", v.GetId()))
log.Info("a working task has expired and will be marked as failed",
zap.Int64("task ID", v.GetId()),
zap.Int64("startTs", v.GetStartTs()),
zap.Float64("ImportTaskExpiration", Params.RootCoordCfg.ImportTaskExpiration))
taskID := v.GetId()
m.workingLock.Unlock()
// Remove DataNode from busy node list, so it can serve other tasks again.
m.busyNodesLock.Lock()
delete(m.busyNodes, v.GetDatanodeId())
m.busyNodesLock.Unlock()
if err := m.setImportTaskStateAndReason(taskID, commonpb.ImportState_ImportFailed,
"the import task has timed out"); err != nil {
log.Error("failed to set import task state",
......@@ -943,7 +922,9 @@ func (m *importManager) expireOldTasksFromEtcd() {
}
if taskPastRetention(ti) {
log.Info("an import task has passed retention period and will be removed from Etcd",
zap.Int64("task ID", ti.GetId()))
zap.Int64("task ID", ti.GetId()),
zap.Int64("createTs", ti.GetCreateTs()),
zap.Float64("ImportTaskRetention", Params.RootCoordCfg.ImportTaskRetention))
if err = m.yieldTaskInfo(ti.GetId()); err != nil {
log.Error("failed to remove import task from Etcd",
zap.Int64("task ID", ti.GetId()),
......@@ -1053,7 +1034,7 @@ func BuildImportTaskKey(taskID int64) string {
// taskExpired returns true if the in-mem task is considered expired.
func taskExpired(ti *datapb.ImportTaskInfo) bool {
return Params.RootCoordCfg.ImportTaskExpiration <= float64(time.Now().Unix()-ti.GetCreateTs())
return Params.RootCoordCfg.ImportTaskExpiration <= float64(time.Now().Unix()-ti.GetStartTs())
}
// taskPastRetention returns true if the task is considered expired in Etcd.
......@@ -1087,6 +1068,7 @@ func cloneImportTaskInfo(taskInfo *datapb.ImportTaskInfo) *datapb.ImportTaskInfo
CollectionName: taskInfo.GetCollectionName(),
PartitionName: taskInfo.GetPartitionName(),
Infos: taskInfo.GetInfos(),
StartTs: taskInfo.GetStartTs(),
}
return cloned
}
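Two separate clocks remain after this patch; the following sketch (simplified types and parameter names, not the actual code) contrasts them: in-memory expiration of working tasks is now measured from StartTs, while the Etcd retention check keeps using CreateTs.

```go
// Simplified illustration of the two time checks, assuming Unix-second timestamps.
type taskTimes struct {
	CreateTs int64 // when the task was accepted into the pending list
	StartTs  int64 // when the task was dispatched to a DataNode (added by this commit)
}

// In-memory expiration (expireOldTasksFromMem): only working tasks are checked,
// and their age is counted from the dispatch time.
func expiredInMem(t taskTimes, nowUnix int64, importTaskExpirationSec float64) bool {
	return importTaskExpirationSec <= float64(nowUnix-t.StartTs)
}

// Etcd retention (expireOldTasksFromEtcd): task records are removed once they
// are older than the retention period, still counted from the creation time.
func pastRetention(t taskTimes, nowUnix int64, importTaskRetentionSec float64) bool {
	return importTaskRetentionSec <= float64(nowUnix-t.CreateTs)
}
```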
......@@ -49,10 +49,10 @@ func TestImportManager_NewImportManager(t *testing.T) {
return globalCount, 0, nil
}
Params.RootCoordCfg.ImportTaskSubPath = "test_import_task"
Params.RootCoordCfg.ImportTaskExpiration = 50
Params.RootCoordCfg.ImportTaskRetention = 200
checkPendingTasksInterval = 100
cleanUpLoopInterval = 100
Params.RootCoordCfg.ImportTaskExpiration = 1 // unit: second
Params.RootCoordCfg.ImportTaskRetention = 200 // unit: second
checkPendingTasksInterval = 500 // unit: millisecond
cleanUpLoopInterval = 500 // unit: millisecond
mockKv := memkv.NewMemoryKV()
ti1 := &datapb.ImportTaskInfo{
Id: 100,
......@@ -104,14 +104,27 @@ func TestImportManager_NewImportManager(t *testing.T) {
defer cancel()
mgr := newImportManager(ctx, mockKv, idAlloc, callImportServiceFn, callMarkSegmentsDropped, nil, nil, nil, nil)
assert.NotNil(t, mgr)
// there are 2 tasks read from store, one is pending, the other is persisted.
// the persisted task will be marked as failed since the server restarted
// pending list: 1 task, working list: 0 task
_, err := mgr.loadFromTaskStore(true)
assert.NoError(t, err)
var wgLoop sync.WaitGroup
wgLoop.Add(2)
// the pending task will be sent to working list
// pending list: 0 task, working list: 1 task
mgr.sendOutTasks(ctx)
assert.Equal(t, 1, len(mgr.workingTasks))
// this case waits 3 seconds; the pending task's StartTs is set when it is put into the working list
// ImportTaskExpiration is 1 second, so the task will be marked as expired by expireOldTasksFromMem()
// pending list: 0 task, working list: 0 task
mgr.cleanupLoop(&wgLoop)
assert.Equal(t, 0, len(mgr.workingTasks))
// nothing to send now
mgr.sendOutTasksLoop(&wgLoop)
wgLoop.Wait()
})
......@@ -186,42 +199,10 @@ func TestImportManager_NewImportManager(t *testing.T) {
}()
})
wg.Add(1)
t.Run("pending task expired", func(t *testing.T) {
defer wg.Done()
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
defer cancel()
mgr := newImportManager(ctx, mockKv, idAlloc, callImportServiceFn, callMarkSegmentsDropped, nil, nil, nil, nil)
assert.NotNil(t, mgr)
mgr.pendingTasks = append(mgr.pendingTasks, &datapb.ImportTaskInfo{
Id: 300,
State: &datapb.ImportTaskState{
StateCode: commonpb.ImportState_ImportPending,
},
CreateTs: time.Now().Unix() + 1,
})
mgr.pendingTasks = append(mgr.pendingTasks, &datapb.ImportTaskInfo{
Id: 400,
State: &datapb.ImportTaskState{
StateCode: commonpb.ImportState_ImportPending,
},
CreateTs: time.Now().Unix() - 100,
})
_, err := mgr.loadFromTaskStore(true)
assert.NoError(t, err)
var wgLoop sync.WaitGroup
wgLoop.Add(2)
assert.Equal(t, 2, len(mgr.pendingTasks))
mgr.cleanupLoop(&wgLoop)
assert.Equal(t, 1, len(mgr.pendingTasks))
mgr.sendOutTasksLoop(&wgLoop)
wgLoop.Wait()
})
wg.Add(1)
t.Run("check init", func(t *testing.T) {
defer wg.Done()
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
mgr := newImportManager(ctx, mockKv, idAlloc, callImportServiceFn, callMarkSegmentsDropped, nil, nil, nil, nil)
assert.NotNil(t, mgr)
......@@ -1149,3 +1130,100 @@ func TestImportManager_isRowbased(t *testing.T) {
assert.Nil(t, err)
assert.False(t, rb)
}
func TestImportManager_checkIndexingDone(t *testing.T) {
ctx := context.Background()
mgr := &importManager{
callDescribeIndex: func(ctx context.Context, colID UniqueID) (*indexpb.DescribeIndexResponse, error) {
return nil, errors.New("error")
},
callGetSegmentIndexState: func(ctx context.Context, collID UniqueID, indexName string, segIDs []UniqueID) ([]*indexpb.SegmentIndexState, error) {
return nil, errors.New("error")
},
}
segmentsID := []typeutil.UniqueID{1, 2, 3}
// check index of 3 segments
// callDescribeIndex() failed
done, err := mgr.checkIndexingDone(ctx, 1, segmentsID)
assert.False(t, done)
assert.Error(t, err)
mgr.callDescribeIndex = func(ctx context.Context, colID UniqueID) (*indexpb.DescribeIndexResponse, error) {
return &indexpb.DescribeIndexResponse{
Status: &commonpb.Status{
ErrorCode: commonpb.ErrorCode_UnexpectedError,
},
}, nil
}
// callDescribeIndex() unexpected error
done, err = mgr.checkIndexingDone(ctx, 1, segmentsID)
assert.False(t, done)
assert.Error(t, err)
mgr.callDescribeIndex = func(ctx context.Context, colID UniqueID) (*indexpb.DescribeIndexResponse, error) {
return &indexpb.DescribeIndexResponse{
Status: &commonpb.Status{
ErrorCode: commonpb.ErrorCode_IndexNotExist,
},
}, nil
}
// callDescribeIndex() index not exist
done, err = mgr.checkIndexingDone(ctx, 1, segmentsID)
assert.True(t, done)
assert.Nil(t, err)
mgr.callDescribeIndex = func(ctx context.Context, colID UniqueID) (*indexpb.DescribeIndexResponse, error) {
return &indexpb.DescribeIndexResponse{
Status: &commonpb.Status{
ErrorCode: commonpb.ErrorCode_Success,
},
IndexInfos: []*indexpb.IndexInfo{
{
State: commonpb.IndexState_Finished,
},
},
}, nil
}
// callGetSegmentIndexState() failed
done, err = mgr.checkIndexingDone(ctx, 1, segmentsID)
assert.False(t, done)
assert.Error(t, err)
mgr.callGetSegmentIndexState = func(ctx context.Context, collID UniqueID, indexName string, segIDs []UniqueID) ([]*indexpb.SegmentIndexState, error) {
return []*indexpb.SegmentIndexState{
{
State: commonpb.IndexState_Finished,
},
}, nil
}
// only 1 segment indexed
done, err = mgr.checkIndexingDone(ctx, 1, segmentsID)
assert.False(t, done)
assert.Nil(t, err)
mgr.callGetSegmentIndexState = func(ctx context.Context, collID UniqueID, indexName string, segIDs []UniqueID) ([]*indexpb.SegmentIndexState, error) {
return []*indexpb.SegmentIndexState{
{
State: commonpb.IndexState_Finished,
},
{
State: commonpb.IndexState_Finished,
},
{
State: commonpb.IndexState_Finished,
},
}, nil
}
// all segments indexed
done, err = mgr.checkIndexingDone(ctx, 1, segmentsID)
assert.True(t, done)
assert.Nil(t, err)
}
......@@ -37,6 +37,22 @@ const (
BackupFlag = "backup"
)
type ImportOptions struct {
OnlyValidate bool
TsStartPoint uint64
TsEndPoint uint64
IsBackup bool // whether is triggered by backup tool
}
func DefaultImportOptions() ImportOptions {
options := ImportOptions{
OnlyValidate: false,
TsStartPoint: 0,
TsEndPoint: math.MaxUint64,
}
return options
}
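For context, a small hedged usage sketch of the relocated options type, as seen from within the same package (the call site itself is hypothetical; only the fields shown above are assumed):

```go
// Hypothetical caller-side usage of the relocated options type.
opts := DefaultImportOptions() // full timestamp range, OnlyValidate off
opts.IsBackup = true           // mark this import as triggered by the backup tool
// TsStartPoint/TsEndPoint keep their defaults (0, math.MaxUint64) unless the
// request carries explicit start_ts / end_ts options.
```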
// ValidateOptions checks whether the import options are legal; it returns nil if they are, or an error if not.
// Supported options:
// start_ts: 10-digit physical timestamp, e.g. 1665995420
......
......@@ -20,7 +20,6 @@ import (
"bufio"
"context"
"fmt"
"math"
"go.uber.org/zap"
......@@ -78,22 +77,6 @@ type WorkingSegment struct {
fieldsStats []*datapb.FieldBinlog // stats of persisted binlogs
}
type ImportOptions struct {
OnlyValidate bool
TsStartPoint uint64
TsEndPoint uint64
IsBackup bool // whether is triggered by backup tool
}
func DefaultImportOptions() ImportOptions {
options := ImportOptions{
OnlyValidate: false,
TsStartPoint: 0,
TsEndPoint: math.MaxUint64,
}
return options
}
type ImportWrapper struct {
ctx context.Context // for canceling parse process
cancel context.CancelFunc // for canceling parse process
......