Unverified Commit cd3b3d3d authored by groot, committed by GitHub

Only allow one file for row-based import (#20234)

Signed-off-by: Nyhmo <yihua.mo@zilliz.com>
Parent de45db04
......@@ -1698,7 +1698,7 @@ func TestProxy(t *testing.T) {
defer wg.Done()
req := &milvuspb.ImportRequest{
CollectionName: collectionName,
Files: []string{"f1.json", "f2.json", "f3.json"},
Files: []string{"f1.json"},
}
proxy.stateCode.Store(commonpb.StateCode_Healthy)
resp, err := proxy.Import(context.TODO(), req)
......@@ -1713,7 +1713,7 @@ func TestProxy(t *testing.T) {
defer wg.Done()
req := &milvuspb.ImportRequest{
CollectionName: "bad_collection_name",
Files: []string{"f1", "f2", "f3"},
Files: []string{"f1.json"},
}
proxy.stateCode.Store(commonpb.StateCode_Healthy)
resp, err := proxy.Import(context.TODO(), req)
......@@ -1726,7 +1726,7 @@ func TestProxy(t *testing.T) {
defer wg.Done()
req := &milvuspb.ImportRequest{
CollectionName: "bad_collection_name",
Files: []string{"f1", "f2", "f3"},
Files: []string{"f1.json"},
}
proxy.stateCode.Store(commonpb.StateCode_Healthy)
resp, err := proxy.Import(context.TODO(), req)
......
......@@ -394,6 +394,12 @@ func (m *importManager) isRowbased(files []string) (bool, error) {
}
}
// for row_based, we only allow one file so that each invocation generates only one task
if isRowBased && len(files) > 1 {
log.Error("row-based import, only allow one JSON file each time", zap.Strings("files", files))
return isRowBased, fmt.Errorf("row-based import, only allow one JSON file each time")
}
return isRowBased, nil
}
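Reviewer note: for illustration, here is a minimal standalone sketch of the one-file rule introduced above, assuming (as in this PR) that a `.json` file marks a request as row-based. The helper name and error text are illustrative, not the exact Milvus code, and the mixed-file check is omitted for brevity.

```go
package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

// isRowBased reports whether a request is row-based (any .json file) and
// rejects row-based requests carrying more than one file, so that each
// invocation generates exactly one import task.
func isRowBased(files []string) (bool, error) {
	rowBased := false
	for _, f := range files {
		if strings.ToLower(filepath.Ext(f)) == ".json" {
			rowBased = true
			break
		}
	}
	if rowBased && len(files) > 1 {
		return true, fmt.Errorf("row-based import, only one JSON file is allowed each time, got %d files", len(files))
	}
	return rowBased, nil
}

func main() {
	fmt.Println(isRowBased([]string{"f1.json"}))            // true <nil>
	fmt.Println(isRowBased([]string{"f1.json", "f2.json"})) // true + error
}
```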
......@@ -459,6 +465,7 @@ func (m *importManager) importJob(ctx context.Context, req *milvuspb.ImportReque
for i := 0; i < len(req.Files); i++ {
tID, _, err := m.idAllocator(1)
if err != nil {
log.Error("failed to allocate ID for import task", zap.Error(err))
return err
}
newTask := &datapb.ImportTaskInfo{
......@@ -976,22 +983,29 @@ func (m *importManager) listAllTasks(colName string, limit int64) []*milvuspb.Ge
log.Error("failed to load from task store", zap.Error(err))
return tasks
}
taskCount := int64(0)
// filter tasks by collection name
// TODO: how to handle a duplicated collection name? for example: a new collection has the same name as a dropped collection
for _, task := range importTasks {
if colName != "" && task.GetCollectionName() != colName {
continue
}
taskCount++
if limit > 0 && taskCount > limit {
break
}
currTask := &milvuspb.GetImportStateResponse{}
m.copyTaskInfo(task, currTask)
tasks = append(tasks, currTask)
}
// arrange tasks by id in ascending order; in effect, a task's id reflects its creation time
rearrangeTasks(tasks)
return tasks
// if limit is 0 or larger than the number of tasks, return all tasks
if limit <= 0 || limit >= int64(len(tasks)) {
return tasks
}
// return the newest tasks from the tail
return tasks[len(tasks)-int(limit):]
}
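Reviewer note: a hedged sketch of the new limit semantics in listAllTasks. Tasks are sorted by id ascending (oldest first), and a positive limit keeps only the newest entries from the tail; the types below are simplified, only the slicing rule mirrors the code above.

```go
package main

import (
	"fmt"
	"sort"
)

// lastN sorts task ids ascending and, for a positive limit smaller than
// the task count, returns only the newest ids from the tail.
func lastN(ids []int64, limit int64) []int64 {
	sort.Slice(ids, func(i, j int) bool { return ids[i] < ids[j] })
	if limit <= 0 || limit >= int64(len(ids)) {
		return ids // limit disabled or not smaller than the task count
	}
	return ids[int64(len(ids))-limit:]
}

func main() {
	fmt.Println(lastN([]int64{3, 1, 5, 2, 4}, 3)) // [3 4 5]
	fmt.Println(lastN([]int64{3, 1, 2}, 0))       // [1 2 3]
}
```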
// removeBadImportSegments marks segments of a failed import task as `dropped`.
......
......@@ -19,7 +19,6 @@ package rootcoord
import (
"context"
"errors"
"strconv"
"sync"
"testing"
"time"
......@@ -590,6 +589,10 @@ func TestImportManager_ImportJob(t *testing.T) {
resp = mgr.importJob(context.TODO(), rowReq, colID, 0)
assert.NotEqual(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
// row-based import does not allow multiple files
resp = mgr.importJob(context.TODO(), rowReq, colID, 0)
assert.NotEqual(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
importServiceFunc := func(ctx context.Context, req *datapb.ImportTaskRequest) (*datapb.ImportTaskResponse, error) {
return &datapb.ImportTaskResponse{
Status: &commonpb.Status{
......@@ -600,6 +603,7 @@ func TestImportManager_ImportJob(t *testing.T) {
// row-based case, task count equals file count
// since the importServiceFunc returns error, tasks will be kept in the pending list
rowReq.Files = []string{"f1.json"}
mgr = newImportManager(context.TODO(), mockKv, idAlloc, importServiceFunc, callMarkSegmentsDropped, nil, nil, nil, nil)
resp = mgr.importJob(context.TODO(), rowReq, colID, 0)
assert.Equal(t, len(rowReq.Files), len(mgr.pendingTasks))
......@@ -626,13 +630,13 @@ func TestImportManager_ImportJob(t *testing.T) {
}, nil
}
// row-based case, since the importServiceFunc return success, tasks will be sent to woring list
// row-based case, since the importServiceFunc return success, tasks will be sent to working list
mgr = newImportManager(context.TODO(), mockKv, idAlloc, importServiceFunc, callMarkSegmentsDropped, nil, nil, nil, nil)
resp = mgr.importJob(context.TODO(), rowReq, colID, 0)
assert.Equal(t, 0, len(mgr.pendingTasks))
assert.Equal(t, len(rowReq.Files), len(mgr.workingTasks))
// column-based case, since the importServiceFunc return success, tasks will be sent to woring list
// column-based case, since the importServiceFunc return success, tasks will be sent to working list
mgr = newImportManager(context.TODO(), mockKv, idAlloc, importServiceFunc, callMarkSegmentsDropped, nil, nil, nil, nil)
resp = mgr.importJob(context.TODO(), colReq, colID, 0)
assert.Equal(t, 0, len(mgr.pendingTasks))
......@@ -640,7 +644,7 @@ func TestImportManager_ImportJob(t *testing.T) {
count := 0
importServiceFunc = func(ctx context.Context, req *datapb.ImportTaskRequest) (*datapb.ImportTaskResponse, error) {
if count >= 2 {
if count >= 1 {
return &datapb.ImportTaskResponse{
Status: &commonpb.Status{
ErrorCode: commonpb.ErrorCode_UnexpectedError,
......@@ -655,19 +659,26 @@ func TestImportManager_ImportJob(t *testing.T) {
}, nil
}
// row-based case, since the importServiceFunc return success for 2 tasks
// the 2 tasks are sent to working list, and 1 task left in pending list
// row-based case, since the importServiceFunc returns success for 1 task
// the first task is sent to the working list, and 1 task is left in the pending list
mgr = newImportManager(context.TODO(), mockKv, idAlloc, importServiceFunc, callMarkSegmentsDropped, nil, nil, nil, nil)
resp = mgr.importJob(context.TODO(), rowReq, colID, 0)
assert.Equal(t, len(rowReq.Files)-2, len(mgr.pendingTasks))
assert.Equal(t, 2, len(mgr.workingTasks))
assert.Equal(t, 0, len(mgr.pendingTasks))
assert.Equal(t, 1, len(mgr.workingTasks))
resp = mgr.importJob(context.TODO(), rowReq, colID, 0)
assert.Equal(t, 1, len(mgr.pendingTasks))
assert.Equal(t, 1, len(mgr.workingTasks))
// files count exceed MaxPendingCount, return error
// the pending list already has one task
// once task count exceeds MaxPendingCount, return error
for i := 0; i <= MaxPendingCount; i++ {
rowReq.Files = append(rowReq.Files, strconv.Itoa(i)+".json")
resp = mgr.importJob(context.TODO(), rowReq, colID, 0)
if i < MaxPendingCount-1 {
assert.Equal(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
} else {
assert.NotEqual(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
}
}
resp = mgr.importJob(context.TODO(), rowReq, colID, 0)
assert.NotEqual(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
}
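Reviewer note: the loop above fills the pending list one task per call until it hits MaxPendingCount. Below is a minimal sketch of that capacity check, with an illustrative constant value and error text (the real constant and message live in the import manager).

```go
package main

import "fmt"

const MaxPendingCount = 32 // illustrative value, not the real constant

// enqueue appends a task to the pending list unless the list is full.
func enqueue(pending []string, task string) ([]string, error) {
	if len(pending) >= MaxPendingCount {
		return pending, fmt.Errorf("import task queue is full (%d pending tasks)", len(pending))
	}
	return append(pending, task), nil
}

func main() {
	var pending []string
	var err error
	for i := 0; i <= MaxPendingCount; i++ {
		pending, err = enqueue(pending, fmt.Sprintf("%d.json", i))
	}
	fmt.Println(len(pending), err) // 32, plus a queue-full error for the last task
}
```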
func TestImportManager_AllDataNodesBusy(t *testing.T) {
......@@ -687,7 +698,7 @@ func TestImportManager_AllDataNodesBusy(t *testing.T) {
rowReq := &milvuspb.ImportRequest{
CollectionName: "c1",
PartitionName: "p1",
Files: []string{"f1.json", "f2.json", "f3.json"},
Files: []string{"f1.json"},
}
colReq := &milvuspb.ImportRequest{
CollectionName: "c1",
......@@ -703,7 +714,7 @@ func TestImportManager_AllDataNodesBusy(t *testing.T) {
dnList := []int64{1, 2, 3}
count := 0
fn := func(ctx context.Context, req *datapb.ImportTaskRequest) (*datapb.ImportTaskResponse, error) {
importServiceFunc := func(ctx context.Context, req *datapb.ImportTaskRequest) (*datapb.ImportTaskResponse, error) {
if count < len(dnList) {
count++
return &datapb.ImportTaskResponse{
......@@ -725,32 +736,44 @@ func TestImportManager_AllDataNodesBusy(t *testing.T) {
ErrorCode: commonpb.ErrorCode_Success,
}, nil
}
mgr := newImportManager(context.TODO(), mockKv, idAlloc, fn, callMarkSegmentsDropped, nil, nil, nil, nil)
mgr.importJob(context.TODO(), rowReq, colID, 0)
assert.Equal(t, 0, len(mgr.pendingTasks))
assert.Equal(t, len(rowReq.Files), len(mgr.workingTasks))
mgr = newImportManager(context.TODO(), mockKv, idAlloc, fn, callMarkSegmentsDropped, nil, nil, nil, nil)
mgr.importJob(context.TODO(), rowReq, colID, 0)
// each data node owns one task
mgr := newImportManager(context.TODO(), mockKv, idAlloc, importServiceFunc, callMarkSegmentsDropped, nil, nil, nil, nil)
for i := 0; i < len(dnList); i++ {
resp := mgr.importJob(context.TODO(), rowReq, colID, 0)
assert.Equal(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
assert.Equal(t, 0, len(mgr.pendingTasks))
assert.Equal(t, i+1, len(mgr.workingTasks))
}
// all data nodes are busy, so a new task waits in the pending list
mgr = newImportManager(context.TODO(), mockKv, idAlloc, importServiceFunc, callMarkSegmentsDropped, nil, nil, nil, nil)
resp := mgr.importJob(context.TODO(), rowReq, colID, 0)
assert.Equal(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
assert.Equal(t, len(rowReq.Files), len(mgr.pendingTasks))
assert.Equal(t, 0, len(mgr.workingTasks))
// Reset count.
// now all data nodes are free again, a new task is executed immediately
count = 0
mgr = newImportManager(context.TODO(), mockKv, idAlloc, fn, callMarkSegmentsDropped, nil, nil, nil, nil)
mgr.importJob(context.TODO(), colReq, colID, 0)
mgr = newImportManager(context.TODO(), mockKv, idAlloc, importServiceFunc, callMarkSegmentsDropped, nil, nil, nil, nil)
resp = mgr.importJob(context.TODO(), colReq, colID, 0)
assert.Equal(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
assert.Equal(t, 0, len(mgr.pendingTasks))
assert.Equal(t, 1, len(mgr.workingTasks))
mgr.importJob(context.TODO(), colReq, colID, 0)
resp = mgr.importJob(context.TODO(), colReq, colID, 0)
assert.Equal(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
assert.Equal(t, 0, len(mgr.pendingTasks))
assert.Equal(t, 2, len(mgr.workingTasks))
mgr.importJob(context.TODO(), colReq, colID, 0)
resp = mgr.importJob(context.TODO(), colReq, colID, 0)
assert.Equal(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
assert.Equal(t, 0, len(mgr.pendingTasks))
assert.Equal(t, 3, len(mgr.workingTasks))
mgr.importJob(context.TODO(), colReq, colID, 0)
// all data nodes are busy now, new task is pending
resp = mgr.importJob(context.TODO(), colReq, colID, 0)
assert.Equal(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
assert.Equal(t, 1, len(mgr.pendingTasks))
assert.Equal(t, 3, len(mgr.workingTasks))
}
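Reviewer note: a hedged sketch of the dispatch behavior this test exercises, with simplified stand-in types. A task enters the working list when a free data node accepts it; otherwise it waits in the pending list.

```go
package main

import "fmt"

// manager is a simplified stand-in for importManager's task lists.
type manager struct {
	freeNodes int
	pending   []string
	working   []string
}

// importJob dispatches a task to a free data node if one is available,
// otherwise it parks the task in the pending list.
func (m *manager) importJob(task string) {
	if m.freeNodes > 0 {
		m.freeNodes--
		m.working = append(m.working, task)
		return
	}
	m.pending = append(m.pending, task)
}

func main() {
	m := &manager{freeNodes: 3}
	for _, f := range []string{"f1.json", "f2.json", "f3.json", "f4.json"} {
		m.importJob(f)
	}
	fmt.Println(len(m.working), len(m.pending)) // 3 1
}
```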
......@@ -769,7 +792,7 @@ func TestImportManager_TaskState(t *testing.T) {
colID := int64(100)
mockKv := &kv.MockMetaKV{}
mockKv.InMemKv = sync.Map{}
fn := func(ctx context.Context, req *datapb.ImportTaskRequest) (*datapb.ImportTaskResponse, error) {
importServiceFunc := func(ctx context.Context, req *datapb.ImportTaskRequest) (*datapb.ImportTaskResponse, error) {
return &datapb.ImportTaskResponse{
Status: &commonpb.Status{
ErrorCode: commonpb.ErrorCode_Success,
......@@ -780,7 +803,7 @@ func TestImportManager_TaskState(t *testing.T) {
rowReq := &milvuspb.ImportRequest{
CollectionName: "c1",
PartitionName: "p1",
Files: []string{"f1.json", "f2.json", "f3.json"},
Files: []string{"f1.json"},
}
callMarkSegmentsDropped := func(ctx context.Context, segIDs []typeutil.UniqueID) (*commonpb.Status, error) {
......@@ -789,7 +812,12 @@ func TestImportManager_TaskState(t *testing.T) {
}, nil
}
mgr := newImportManager(context.TODO(), mockKv, idAlloc, fn, callMarkSegmentsDropped, nil, nil, nil, nil)
// add 3 tasks with IDs 10000, 10001, 10002 to make sure updateTaskInfo() works correctly
mgr := newImportManager(context.TODO(), mockKv, idAlloc, importServiceFunc, callMarkSegmentsDropped, nil, nil, nil, nil)
mgr.importJob(context.TODO(), rowReq, colID, 0)
rowReq.Files = []string{"f2.json"}
mgr.importJob(context.TODO(), rowReq, colID, 0)
rowReq.Files = []string{"f3.json"}
mgr.importJob(context.TODO(), rowReq, colID, 0)
info := &rootcoordpb.ImportResult{
......@@ -865,7 +893,7 @@ func TestImportManager_AllocFail(t *testing.T) {
colID := int64(100)
mockKv := &kv.MockMetaKV{}
mockKv.InMemKv = sync.Map{}
fn := func(ctx context.Context, req *datapb.ImportTaskRequest) (*datapb.ImportTaskResponse, error) {
importServiceFunc := func(ctx context.Context, req *datapb.ImportTaskRequest) (*datapb.ImportTaskResponse, error) {
return &datapb.ImportTaskResponse{
Status: &commonpb.Status{
ErrorCode: commonpb.ErrorCode_Success,
......@@ -876,7 +904,7 @@ func TestImportManager_AllocFail(t *testing.T) {
rowReq := &milvuspb.ImportRequest{
CollectionName: "c1",
PartitionName: "p1",
Files: []string{"f1.json", "f2.json", "f3.json"},
Files: []string{"f1.json"},
}
callMarkSegmentsDropped := func(ctx context.Context, segIDs []typeutil.UniqueID) (*commonpb.Status, error) {
......@@ -884,8 +912,10 @@ func TestImportManager_AllocFail(t *testing.T) {
ErrorCode: commonpb.ErrorCode_Success,
}, nil
}
mgr := newImportManager(context.TODO(), mockKv, idAlloc, fn, callMarkSegmentsDropped, nil, nil, nil, nil)
mgr.importJob(context.TODO(), rowReq, colID, 0)
mgr := newImportManager(context.TODO(), mockKv, idAlloc, importServiceFunc, callMarkSegmentsDropped, nil, nil, nil, nil)
resp := mgr.importJob(context.TODO(), rowReq, colID, 0)
assert.NotEqual(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
assert.Equal(t, 0, len(mgr.pendingTasks))
}
func TestImportManager_ListAllTasks(t *testing.T) {
......@@ -916,7 +946,7 @@ func TestImportManager_ListAllTasks(t *testing.T) {
rowReq := &milvuspb.ImportRequest{
CollectionName: "c1",
PartitionName: "p1",
Files: []string{"f1.json", "f2.json", "f3.json"},
Files: []string{"f1.json"},
}
callMarkSegmentsDropped := func(ctx context.Context, segIDs []typeutil.UniqueID) (*commonpb.Status, error) {
return &commonpb.Status{
......@@ -924,10 +954,25 @@ func TestImportManager_ListAllTasks(t *testing.T) {
}, nil
}
mgr := newImportManager(context.TODO(), mockKv, idAlloc, fn, callMarkSegmentsDropped, nil, nil, nil, nil)
mgr.importJob(context.TODO(), rowReq, colID, 0)
repeat := 10
for i := 0; i < repeat; i++ {
mgr.importJob(context.TODO(), rowReq, colID, 0)
}
tasks := mgr.listAllTasks("", 100)
assert.Equal(t, len(rowReq.Files), len(tasks))
// list all tasks
tasks := mgr.listAllTasks("", int64(repeat))
assert.Equal(t, repeat, len(tasks))
for i := 0; i < repeat; i++ {
assert.Equal(t, int64(i+1), tasks[i].Id)
}
// list a few tasks
limit := 3
tasks = mgr.listAllTasks("", int64(limit))
assert.Equal(t, limit, len(tasks))
for i := 0; i < limit; i++ {
assert.Equal(t, int64(i+repeat-limit+1), tasks[i].Id)
}
resp := mgr.getTaskState(1)
assert.Equal(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
......@@ -943,13 +988,10 @@ func TestImportManager_ListAllTasks(t *testing.T) {
}, nil
}
// there are 10 tasks in the working list and 1 task in the pending list, 11 tasks in total
mgr.importJob(context.TODO(), rowReq, colID, 0)
tasks = mgr.listAllTasks("", 100)
assert.Equal(t, len(rowReq.Files)*2, len(tasks))
tasks = mgr.listAllTasks("", 1)
assert.Equal(t, 1, len(tasks))
tasks = mgr.listAllTasks("bad-collection-name", 1)
assert.Equal(t, 0, len(tasks))
assert.Equal(t, repeat+1, len(tasks))
// the id of tasks must be 1,2,3,4,5,6 (sequence not guaranteed)
ids := make(map[int64]struct{})
......@@ -960,6 +1002,14 @@ func TestImportManager_ListAllTasks(t *testing.T) {
delete(ids, tasks[i].Id)
}
assert.Equal(t, 0, len(ids))
// list a few tasks
tasks = mgr.listAllTasks("", 1)
assert.Equal(t, 1, len(tasks))
// invalid collection name, returns empty
tasks = mgr.listAllTasks("bad-collection-name", 1)
assert.Equal(t, 0, len(tasks))
}
func TestImportManager_setCollectionPartitionName(t *testing.T) {
......@@ -1009,11 +1059,16 @@ func TestImportManager_rearrangeTasks(t *testing.T) {
func TestImportManager_isRowbased(t *testing.T) {
mgr := &importManager{}
files := []string{"1.json", "2.json"}
files := []string{"1.json"}
rb, err := mgr.isRowbased(files)
assert.Nil(t, err)
assert.True(t, rb)
files = []string{"1.json", "2.json"}
rb, err = mgr.isRowbased(files)
assert.NotNil(t, err)
assert.True(t, rb)
files = []string{"1.json", "2.npy"}
rb, err = mgr.isRowbased(files)
assert.NotNil(t, err)
......
......@@ -419,13 +419,14 @@ func tryFlushBlocks(ctx context.Context,
// if segment size is larger than predefined blockSize, flush to create a new binlog file
// initialize a new FieldData list for next round batch read
if size > int(blockSize) && rowCount > 0 {
printFieldsDataInfo(blockData, "import util: prepare to flush block larger than maxBlockSize", nil)
printFieldsDataInfo(blockData, "import util: prepare to flush block larger than blockSize", nil)
err := callFlushFunc(blockData, i)
if err != nil {
log.Error("Import util: failed to flush block data", zap.Int("shardID", i))
return err
}
log.Info("Import util: block size exceed limit and flush", zap.Int("rowCount", rowCount), zap.Int("size", size), zap.Int("shardID", i))
log.Info("Import util: block size exceed limit and flush", zap.Int("rowCount", rowCount),
zap.Int("size", size), zap.Int("shardID", i), zap.Int64("blockSize", blockSize))
blocksData[i] = initSegmentData(collectionSchema)
if blocksData[i] == nil {
......
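Reviewer note: a minimal sketch of the flush-on-threshold pattern in tryFlushBlocks, with hypothetical types. Once a shard's accumulated byte size exceeds blockSize and it holds at least one row, the buffer is flushed to a new binlog file and reset, mirroring the check above.

```go
package main

import "fmt"

// shardBuffer is a hypothetical stand-in for one shard's FieldData batch.
type shardBuffer struct {
	rows int
	size int // accumulated bytes
}

// maybeFlush flushes the buffer when it grows past blockSize, then
// resets it for the next batch read.
func maybeFlush(buf *shardBuffer, blockSize int, flush func(rows, size int) error) error {
	if buf.size > blockSize && buf.rows > 0 {
		if err := flush(buf.rows, buf.size); err != nil {
			return err
		}
		*buf = shardBuffer{} // start a fresh batch
	}
	return nil
}

func main() {
	buf := &shardBuffer{rows: 1000, size: 18 << 20}
	_ = maybeFlush(buf, 16<<20, func(rows, size int) error {
		fmt.Printf("flush %d rows (%d bytes) to a new binlog file\n", rows, size)
		return nil
	})
}
```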
......@@ -264,11 +264,10 @@ func (p *ImportWrapper) fileValidation(filePaths []string) (bool, error) {
fileNames[name] = struct{}{}
// check file size, single file size cannot exceed MaxFileSize
// TODO add context
size, err := p.chunkManager.Size(context.TODO(), filePath)
size, err := p.chunkManager.Size(p.ctx, filePath)
if err != nil {
log.Error("import wrapper: failed to get file size", zap.String("filePath", filePath), zap.Error(err))
return rowBased, fmt.Errorf("import wrapper: failed to get file size of '%s'", filePath)
return rowBased, fmt.Errorf("import wrapper: failed to get file size of '%s', make sure the input path is related path, error:%w", filePath, err)
}
// empty file
......@@ -312,7 +311,6 @@ func (p *ImportWrapper) Import(filePaths []string, options ImportOptions) error
// data restore function to import milvus native binlog files (for backup/restore tools)
// the backup/restore tool provides two paths for a partition: the first is the binlog path, the second is the deltalog path
if p.isBinlogImport(filePaths) {
// TODO: handle the timestamp end point passed from client side, currently use math.MaxUint64
return p.doBinlogImport(filePaths, options.TsStartPoint, options.TsEndPoint)
}
......@@ -454,23 +452,51 @@ func (p *ImportWrapper) reportPersisted() error {
// isBinlogImport judges whether this is a binlog import operation
// For internal usage by the restore tool: https://github.com/zilliztech/milvus-backup
// This tool exports data from a milvus service, and calls the bulkload interface to import native data into another milvus service.
// This tool provides two paths: one is data log path of a partition,the other is delta log path of this partition.
// This tool provides two paths: one is the insert log path of a partition, the other is the delta log path of that partition.
// This method checks the filePaths: if the paths exist and are not files, we say it is native import.
func (p *ImportWrapper) isBinlogImport(filePaths []string) bool {
// must contains the insert log path, and the delta log path is optional
if len(filePaths) != 1 && len(filePaths) != 2 {
log.Info("import wrapper: paths count is not 1 or 2, not binlog import", zap.Int("len", len(filePaths)))
// must contain the insert log path; the delta log path is optional and may be an empty string
if len(filePaths) != 2 {
log.Info("import wrapper: paths count is not 2, not binlog import", zap.Int("len", len(filePaths)))
return false
}
for i := 0; i < len(filePaths); i++ {
filePath := filePaths[i]
_, fileType := GetFileNameAndExt(filePath)
checkFunc := func(filePath string) bool {
// contains a file extension, so it is not a path
_, fileType := GetFileNameAndExt(filePath)
if len(fileType) != 0 {
log.Info("import wrapper: not a path, not binlog import", zap.String("filePath", filePath), zap.String("fileType", fileType))
return false
}
// check path existence
exist, err := p.chunkManager.Exist(p.ctx, filePath)
if err != nil {
log.Error("import wrapper: failed to check the path existence, not binlog import", zap.String("filePath", filePath), zap.Error(err))
return false
}
if !exist {
log.Info("import wrapper: the input path doesn't exist, not binlog import", zap.String("filePath", filePath))
return false
}
return true
}
// the first path is insert log path
filePath := filePaths[0]
if len(filePath) == 0 {
log.Info("import wrapper: the first path is empty string, not binlog import")
return false
}
if !checkFunc(filePath) {
return false
}
// the second path is delta log path
filePath = filePaths[1]
if len(filePath) > 0 && !checkFunc(filePath) {
return false
}
log.Info("import wrapper: do binlog import")
......@@ -501,12 +527,9 @@ func (p *ImportWrapper) doBinlogImport(filePaths []string, tsStartPoint uint64,
func (p *ImportWrapper) parseRowBasedJSON(filePath string, onlyValidate bool) error {
tr := timerecord.NewTimeRecorder("json row-based parser: " + filePath)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// for minio storage, chunkManager will download the file into local memory
// for local storage, chunkManager opens the file directly
file, err := p.chunkManager.Reader(ctx, filePath)
file, err := p.chunkManager.Reader(p.ctx, filePath)
if err != nil {
return err
}
......@@ -553,13 +576,11 @@ func (p *ImportWrapper) parseColumnBasedNumpy(filePath string, onlyValidate bool
combineFunc func(fields map[storage.FieldID]storage.FieldData) error) error {
tr := timerecord.NewTimeRecorder("numpy parser: " + filePath)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
fileName, _ := GetFileNameAndExt(filePath)
// for minio storage, chunkManager will download the file into local memory
// for local storage, chunkManager opens the file directly
file, err := p.chunkManager.Reader(ctx, filePath)
file, err := p.chunkManager.Reader(p.ctx, filePath)
if err != nil {
return err
}
......@@ -721,7 +742,7 @@ func (p *ImportWrapper) splitFieldsData(fieldsData map[storage.FieldID]storage.F
return fmt.Errorf("import wrapper: field '%s' row count %d is not equal to other fields row count: %d", name, count, rowCount)
}
}
log.Info("import wrapper: try to split a block with row count", zap.Int("rowCount", rowCount), zap.Any("rowCountOfEachField", rowCounter))
log.Info("import wrapper: try to split a block with row count", zap.Int("rowCount", rowCount))
primaryData, ok := fieldsData[primaryKey.GetFieldID()]
if !ok {
......
......@@ -37,7 +37,6 @@ import (
"github.com/milvus-io/milvus/internal/proto/datapb"
"github.com/milvus-io/milvus/internal/proto/rootcoordpb"
"github.com/milvus-io/milvus/internal/storage"
"github.com/milvus-io/milvus/internal/util/dependency"
"github.com/milvus-io/milvus/internal/util/timerecord"
)
......@@ -173,7 +172,9 @@ func createMockCallbackFunctions(t *testing.T, rowCounter *rowCounterTest) (Assi
}
func Test_NewImportWrapper(t *testing.T) {
f := dependency.NewDefaultFactory(true)
// NewDefaultFactory() uses "/tmp/milvus" as the default root path and cannot specify a root path
// NewChunkManagerFactory() can specify the root path
f := storage.NewChunkManagerFactory("local", storage.RootPath(TempFilesPath))
ctx := context.Background()
cm, err := f.NewPersistentStorageChunkManager(ctx)
assert.NoError(t, err)
......@@ -226,7 +227,9 @@ func Test_ImportWrapperRowBased(t *testing.T) {
assert.Nil(t, err)
defer os.RemoveAll(TempFilesPath)
f := dependency.NewDefaultFactory(true)
// NewDefaultFactory() uses "/tmp/milvus" as the default root path and cannot specify a root path
// NewChunkManagerFactory() can specify the root path
f := storage.NewChunkManagerFactory("local", storage.RootPath(TempFilesPath))
ctx := context.Background()
cm, err := f.NewPersistentStorageChunkManager(ctx)
assert.NoError(t, err)
......@@ -388,7 +391,9 @@ func Test_ImportWrapperColumnBased_numpy(t *testing.T) {
assert.Nil(t, err)
defer os.RemoveAll(TempFilesPath)
f := dependency.NewDefaultFactory(true)
// NewDefaultFactory() uses "/tmp/milvus" as the default root path and cannot specify a root path
// NewChunkManagerFactory() can specify the root path
f := storage.NewChunkManagerFactory("local", storage.RootPath(TempFilesPath))
ctx := context.Background()
cm, err := f.NewPersistentStorageChunkManager(ctx)
assert.NoError(t, err)
......@@ -485,7 +490,9 @@ func Test_ImportWrapperRowBased_perf(t *testing.T) {
assert.Nil(t, err)
defer os.RemoveAll(TempFilesPath)
f := dependency.NewDefaultFactory(true)
// NewDefaultFactory() uses "/tmp/milvus" as the default root path and cannot specify a root path
// NewChunkManagerFactory() can specify the root path
f := storage.NewChunkManagerFactory("local", storage.RootPath(TempFilesPath))
ctx := context.Background()
cm, err := f.NewPersistentStorageChunkManager(ctx)
assert.NoError(t, err)
......@@ -764,7 +771,9 @@ func Test_ImportWrapperReportFailRowBased(t *testing.T) {
assert.Nil(t, err)
defer os.RemoveAll(TempFilesPath)
f := dependency.NewDefaultFactory(true)
// NewDefaultFactory() uses "/tmp/milvus" as the default root path and cannot specify a root path
// NewChunkManagerFactory() can specify the root path
f := storage.NewChunkManagerFactory("local", storage.RootPath(TempFilesPath))
ctx := context.Background()
cm, err := f.NewPersistentStorageChunkManager(ctx)
assert.NoError(t, err)
......@@ -824,7 +833,9 @@ func Test_ImportWrapperReportFailColumnBased_numpy(t *testing.T) {
assert.Nil(t, err)
defer os.RemoveAll(TempFilesPath)
f := dependency.NewDefaultFactory(true)
// NewDefaultFactory() uses "/tmp/milvus" as the default root path and cannot specify a root path
// NewChunkManagerFactory() can specify the root path
f := storage.NewChunkManagerFactory("local", storage.RootPath(TempFilesPath))
ctx := context.Background()
cm, err := f.NewPersistentStorageChunkManager(ctx)
assert.NoError(t, err)
......@@ -867,11 +878,16 @@ func Test_ImportWrapperReportFailColumnBased_numpy(t *testing.T) {
}
func Test_ImportWrapperIsBinlogImport(t *testing.T) {
ctx := context.Background()
err := os.MkdirAll(TempFilesPath, os.ModePerm)
assert.Nil(t, err)
defer os.RemoveAll(TempFilesPath)
cm := &MockChunkManager{
size: 1,
}
// NewDefaultFactory() uses "/tmp/milvus" as the default root path and cannot specify a root path
// NewChunkManagerFactory() can specify the root path
f := storage.NewChunkManagerFactory("local", storage.RootPath(TempFilesPath))
ctx := context.Background()
cm, err := f.NewPersistentStorageChunkManager(ctx)
assert.NoError(t, err)
idAllocator := newIDAllocator(ctx, t, nil)
schema := perfSchema(128)
......@@ -902,13 +918,43 @@ func Test_ImportWrapperIsBinlogImport(t *testing.T) {
b = wrapper.isBinlogImport(paths)
assert.False(t, b)
// success
// path doesn't exist
paths = []string{
"/tmp",
"/tmp",
"path1",
"path2",
}
b = wrapper.isBinlogImport(paths)
assert.False(t, b)
// insert log path is created, but delta log path doesn't exist
err = os.MkdirAll(TempFilesPath+paths[0], os.ModePerm)
assert.NoError(t, err)
b = wrapper.isBinlogImport(paths)
assert.False(t, b)
// both paths are created, success
err = os.MkdirAll(TempFilesPath+paths[1], os.ModePerm)
assert.NoError(t, err)
b = wrapper.isBinlogImport(paths)
assert.True(t, b)
// the delta log path is empty, success
paths = []string{
"path1",
"",
}
b = wrapper.isBinlogImport(paths)
assert.True(t, b)
// the insert log path is an empty string, fail
paths = []string{
"",
"",
}
b = wrapper.isBinlogImport(paths)
assert.False(t, b)
}
func Test_ImportWrapperDoBinlogImport(t *testing.T) {
......@@ -1022,7 +1068,7 @@ func Test_ImportWrapperSplitFieldsData(t *testing.T) {
err := wrapper.splitFieldsData(nil, 0)
assert.NotNil(t, err)
// split 100 rows to 4 blocks
// split 100 rows to 4 blocks, success
rowCount := 100
input := initSegmentData(schema)
for j := 0; j < rowCount; j++ {
......@@ -1039,6 +1085,12 @@ func Test_ImportWrapperSplitFieldsData(t *testing.T) {
assert.Equal(t, 4, rowCounter.callTime)
assert.Equal(t, rowCount, rowCounter.rowCount)
// alloc id failed
wrapper.rowIDAllocator = newIDAllocator(ctx, t, errors.New("error"))
err = wrapper.splitFieldsData(input, 512)
assert.NotNil(t, err)
wrapper.rowIDAllocator = newIDAllocator(ctx, t, nil)
// row count of fields are unequal
schema.Fields[0].AutoID = false
input = initSegmentData(schema)
......@@ -1053,4 +1105,29 @@ func Test_ImportWrapperSplitFieldsData(t *testing.T) {
}
err = wrapper.splitFieldsData(input, 512)
assert.NotNil(t, err)
// primary key not found
wrapper.collectionSchema.Fields[0].IsPrimaryKey = false
err = wrapper.splitFieldsData(input, 512)
assert.NotNil(t, err)
wrapper.collectionSchema.Fields[0].IsPrimaryKey = true
// primary key is varchar, success
wrapper.collectionSchema.Fields[0].DataType = schemapb.DataType_VarChar
input = initSegmentData(schema)
for j := 0; j < rowCount; j++ {
pkField := input[101].(*storage.StringFieldData)
pkField.Data = append(pkField.Data, strconv.FormatInt(int64(j), 10))
flagField := input[102].(*storage.BoolFieldData)
flagField.Data = append(flagField.Data, true)
}
rowCounter.callTime = 0
rowCounter.rowCount = 0
importResult.AutoIds = []int64{}
err = wrapper.splitFieldsData(input, 1024)
assert.Nil(t, err)
assert.Equal(t, 0, len(importResult.AutoIds))
assert.Equal(t, 2, rowCounter.callTime)
assert.Equal(t, rowCount, rowCounter.rowCount)
}