diff --git a/internal/core/bench/bench_search.cpp b/internal/core/bench/bench_search.cpp index 671f55e71a5c19270f7ad44653947dea11bb2ffd..fdf2d52aab1ea2aba6a3fd2ac793703cbdb22b4b 100644 --- a/internal/core/bench/bench_search.cpp +++ b/internal/core/bench/bench_search.cpp @@ -20,8 +20,7 @@ using namespace milvus; using namespace milvus::query; using namespace milvus::segcore; -static int dim = 128; -static int64_t N = 1024 * 1024 * 1; +static int dim = 768; const auto schema = []() { auto schema = std::make_shared(); @@ -29,10 +28,6 @@ const auto schema = []() { return schema; }(); -const auto dataset_ = [] { - auto dataset_ = DataGen(schema, N); - return dataset_; -}(); const auto plan = [] { std::string dsl = R"({ @@ -43,7 +38,7 @@ const auto plan = [] { "fakevec": { "metric_type": "L2", "params": { - "nprobe": 4 + "nprobe": 10 }, "query": "$0", "topk": 5 @@ -57,7 +52,7 @@ const auto plan = [] { return plan; }(); auto ph_group = [] { - auto num_queries = 5; + auto num_queries = 10; auto ph_group_raw = CreatePlaceholderGroup(num_queries, dim, 1024); auto ph_group = ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString()); return ph_group; @@ -66,6 +61,12 @@ auto ph_group = [] { static void Search_SmallIndex(benchmark::State& state) { // schema->AddDebugField("age", DataType::FLOAT); + + static int64_t N = 1024 * 32; + const auto dataset_ = [] { + auto dataset_ = DataGen(schema, N); + return dataset_; + }(); auto is_small_index = state.range(0); auto chunk_size = state.range(1) * 1024; @@ -88,11 +89,16 @@ Search_SmallIndex(benchmark::State& state) { } } -BENCHMARK(Search_SmallIndex)->MinTime(5)->ArgsProduct({{true, false}, {8, 16, 32, 64, 128}}); +BENCHMARK(Search_SmallIndex)->MinTime(5)->ArgsProduct({{true, false}, {8, 16, 32}}); static void Search_Sealed(benchmark::State& state) { auto segment = CreateSealedSegment(schema); + static int64_t N = 1024 * 1024; + const auto dataset_ = [] { + auto dataset_ = DataGen(schema, N); + return dataset_; + }(); SealedLoader(dataset_, *segment); auto choice = state.range(0); if (choice == 0) { @@ -103,6 +109,7 @@ Search_Sealed(benchmark::State& state) { auto indexing = GenIndexing(N, dim, vec); LoadIndexInfo info; info.index = indexing; + info.field_id = (*schema)[FieldName("fakevec")].get_id().get(); info.index_params["index_type"] = "IVF"; info.index_params["index_mode"] = "CPU"; info.index_params["metric_type"] = MetricTypeToName(MetricType::METRIC_L2); diff --git a/internal/core/unittest/test_query.cpp b/internal/core/unittest/test_query.cpp index 01a9a30eefb929620d45be5e9755d70953888245..e6f5f326f5c2ddf893aa536e48ef48575d5cd61f 100644 --- a/internal/core/unittest/test_query.cpp +++ b/internal/core/unittest/test_query.cpp @@ -184,42 +184,43 @@ TEST(Query, ExecWithPredicateLoader) { [ [ [ - "980486->3.149221", - "318367->3.661235", - "302798->4.553688", - "321424->4.757450", - "565529->5.083780" + "982->0.000000", + "25315->4.741588", + "57893->4.758279", + "551029->5.078479", + "455002->5.134716" ], [ - "233390->7.931535", - "238958->8.109344", - "230645->8.439169", - "901939->8.658772", - "380328->8.731251" + "528353->8.740297", + "659305->8.802286", + "935763->9.422906", + "794649->9.436665", + "192031->9.832053" ], [ - "897246->3.749835", - "750683->3.897577", - "857598->4.230977", - "299009->4.379639", - "440010->4.454046" + "980439->3.342777", + "433044->3.424016", + "797884->3.663446", + "697705->3.944479", + "186546->4.404788" ], [ - "840855->4.782170", - "709627->5.063170", - "72322->5.166143", - "107142->5.180207", - "948403->5.247065" + "642941->3.753775", + "967504->3.885163", + "764517->4.364819", + "332938->4.418214", + "232724->4.574215" ], [ - "810401->3.926393", - "46575->4.054171", - "201740->4.274491", - "669040->4.399628", - "231500->4.831223" + "351788->4.453843", + "410227->4.699380", + "501497->4.805948", + "715061->5.166959", + "414882->5.179897" ] ] ])"); + std::cout << json.dump(2); ASSERT_EQ(json.dump(2), ref.dump(2)); } @@ -326,42 +327,43 @@ TEST(Query, ExecWithPredicate) { [ [ [ - "980486->3.149221", - "318367->3.661235", - "302798->4.553688", - "321424->4.757450", - "565529->5.083780" + "982->0.000000", + "25315->4.741588", + "57893->4.758279", + "551029->5.078479", + "455002->5.134716" ], [ - "233390->7.931535", - "238958->8.109344", - "230645->8.439169", - "901939->8.658772", - "380328->8.731251" + "528353->8.740297", + "659305->8.802286", + "935763->9.422906", + "794649->9.436665", + "192031->9.832053" ], [ - "897246->3.749835", - "750683->3.897577", - "857598->4.230977", - "299009->4.379639", - "440010->4.454046" + "980439->3.342777", + "433044->3.424016", + "797884->3.663446", + "697705->3.944479", + "186546->4.404788" ], [ - "840855->4.782170", - "709627->5.063170", - "72322->5.166143", - "107142->5.180207", - "948403->5.247065" + "642941->3.753775", + "967504->3.885163", + "764517->4.364819", + "332938->4.418214", + "232724->4.574215" ], [ - "810401->3.926393", - "46575->4.054171", - "201740->4.274491", - "669040->4.399628", - "231500->4.831223" + "351788->4.453843", + "410227->4.699380", + "501497->4.805948", + "715061->5.166959", + "414882->5.179897" ] ] ])"); + std::cout << json.dump(2); ASSERT_EQ(json.dump(2), ref.dump(2)); } @@ -551,43 +553,44 @@ TEST(Query, ExecWithoutPredicate) { [ [ [ - "980486->3.149221", - "318367->3.661235", - "302798->4.553688", - "321424->4.757450", - "565529->5.083780" + "982->0.000000", + "25315->4.741588", + "57893->4.758279", + "694663->4.980466", + "551029->5.078479" ], [ - "233390->7.931535", - "238958->8.109344", - "230645->8.439169", - "901939->8.658772", - "380328->8.731251" + "559507->7.956653", + "871836->8.694542", + "528353->8.740297", + "659305->8.802286", + "516137->8.935913" ], [ - "749862->3.398494", - "701321->3.632437", - "897246->3.749835", - "750683->3.897577", - "105995->4.073595" + "980439->3.342777", + "433044->3.424016", + "527556->3.487235", + "797884->3.663446", + "814805->3.782786" ], [ - "138274->3.454446", - "124548->3.783290", - "840855->4.782170", - "936719->5.026924", - "709627->5.063170" + "642941->3.753775", + "967504->3.885163", + "177960->4.339530", + "764517->4.364819", + "841079->4.403300" ], [ - "810401->3.926393", - "46575->4.054171", - "201740->4.274491", - "669040->4.399628", - "231500->4.831223" + "688614->4.259011", + "351788->4.453843", + "452698->4.473838", + "410227->4.699380", + "501497->4.805948" ] ] ] )"); + std::cout << json.dump(2); ASSERT_EQ(json.dump(2), ref.dump(2)); } diff --git a/internal/core/unittest/test_sealed.cpp b/internal/core/unittest/test_sealed.cpp index 2a5864efc510a7d30cc84d352b7e63b51d41aeba..bf2d4ade72f401ac111961c2ca133439c2722697 100644 --- a/internal/core/unittest/test_sealed.cpp +++ b/internal/core/unittest/test_sealed.cpp @@ -306,42 +306,41 @@ TEST(Sealed, LoadFieldData) { [ [ [ - "980486->3.149221", - "579754->3.634295", - "318367->3.661235", - "265835->4.333358", - "302798->4.553688" + "982->0.000000", + "25315->4.741588", + "551029->5.078479", + "455002->5.134716", + "504754->5.329021" ], [ - "233390->7.931535", - "238958->8.109344", - "230645->8.439169", - "901939->8.658772", - "380328->8.731251" + "287136->8.409121", + "528353->8.740297", + "935763->9.422906", + "794649->9.436665", + "192031->9.832053" ], [ - "897246->3.749835", - "750683->3.897577", - "857598->4.230977", - "299009->4.379639", - "440010->4.454046" + "59251->2.542610", + "433044->3.424016", + "797884->3.663446", + "430441->3.692723", + "697705->3.944479" ], [ - "37641->3.783446", - "22628->4.719435", - "840855->4.782170", - "709627->5.063170", - "635836->5.156095" + "611544->3.463480", + "642941->3.753775", + "967504->3.885163", + "232724->4.574215", + "507245->5.040902" ], [ - "810401->3.926393", - "46575->4.054171", - "201740->4.274491", - "669040->4.399628", - "231500->4.831223" + "351788->4.453843", + "410227->4.699380", + "501497->4.805948", + "715061->5.166959", + "414882->5.179897" ] ] -] - )"); +])"); ASSERT_EQ(std_json.dump(-2), json.dump(-2)); } diff --git a/internal/core/unittest/test_utils/DataGen.h b/internal/core/unittest/test_utils/DataGen.h index 9d54e16c2139d8d482ee226711f461c1371aad73..34e3c73d9592dc751234408149017b2282a69efa 100644 --- a/internal/core/unittest/test_utils/DataGen.h +++ b/internal/core/unittest/test_utils/DataGen.h @@ -56,15 +56,15 @@ struct GeneratedData { friend GeneratedData DataGen(SchemaPtr schema, int64_t N, uint64_t seed); void - generate_rows(int N, SchemaPtr schema); + generate_rows(int64_t N, SchemaPtr schema); }; inline void -GeneratedData::generate_rows(int N, SchemaPtr schema) { +GeneratedData::generate_rows(int64_t N, SchemaPtr schema) { std::vector offset_infos(schema->size() + 1, 0); auto sizeof_infos = schema->get_sizeof_infos(); std::partial_sum(sizeof_infos.begin(), sizeof_infos.end(), offset_infos.begin() + 1); - auto len_per_row = offset_infos.back(); + int64_t len_per_row = offset_infos.back(); assert(len_per_row == schema->get_total_sizeof()); std::vector result(len_per_row * N); @@ -103,13 +103,17 @@ DataGen(SchemaPtr schema, int64_t N, uint64_t seed = 42) { switch (field.get_data_type()) { case engine::DataType::VECTOR_FLOAT: { auto dim = field.get_dim(); - vector final; + vector final(dim * N); bool is_ip = starts_with(field.get_name().get(), "normalized"); +#pragma omp parallel for for (int n = 0; n < N; ++n) { vector data(dim); float sum = 0; + + std::default_random_engine er2(seed + n); + std::normal_distribution<> distr2(0, 1); for (auto& x : data) { - x = distr(er) + offset; + x = distr2(er2) + offset; sum += x * x; } if (is_ip) { @@ -119,7 +123,7 @@ DataGen(SchemaPtr schema, int64_t N, uint64_t seed = 42) { } } - final.insert(final.end(), data.begin(), data.end()); + std::copy(data.begin(), data.end(), final.begin() + dim * n); } insert_cols(final); break; @@ -319,9 +323,9 @@ SealedLoader(const GeneratedData& dataset, SegmentSealed& seg) { inline knowhere::VecIndexPtr GenIndexing(int64_t N, int64_t dim, const float* vec) { + // {knowhere::IndexParams::nprobe, 10}, auto conf = knowhere::Config{{knowhere::meta::DIM, dim}, - {knowhere::IndexParams::nlist, 100}, - {knowhere::IndexParams::nprobe, 10}, + {knowhere::IndexParams::nlist, 1024}, {knowhere::Metric::TYPE, milvus::knowhere::Metric::L2}, {knowhere::meta::DEVICEID, 0}}; auto database = knowhere::GenDataset(N, dim, vec); diff --git a/internal/util/typeutil/schema.go b/internal/util/typeutil/schema.go index 86df9809d8b5e497f3091dfab537f17bf9ca34d0..0acb0be58d577467866e6953aed42b8cbaec1b65 100644 --- a/internal/util/typeutil/schema.go +++ b/internal/util/typeutil/schema.go @@ -61,16 +61,17 @@ func EstimateSizePerRecord(schema *schemapb.CollectionSchema) (int, error) { } type SchemaHelper struct { - schema *schemapb.CollectionSchema - nameOffset map[string]int - idOffset map[int64]int + schema *schemapb.CollectionSchema + nameOffset map[string]int + idOffset map[int64]int + primaryKeyOffset int } func CreateSchemaHelper(schema *schemapb.CollectionSchema) (*SchemaHelper, error) { if schema == nil { return nil, errors.New("schema is nil") } - schemaHelper := SchemaHelper{schema: schema, nameOffset: make(map[string]int), idOffset: make(map[int64]int)} + schemaHelper := SchemaHelper{schema: schema, nameOffset: make(map[string]int), idOffset: make(map[int64]int), primaryKeyOffset: -1} for offset, field := range schema.Fields { if _, ok := schemaHelper.nameOffset[field.Name]; ok { return nil, errors.New("duplicated fieldName: " + field.Name) @@ -80,10 +81,23 @@ func CreateSchemaHelper(schema *schemapb.CollectionSchema) (*SchemaHelper, error } schemaHelper.nameOffset[field.Name] = offset schemaHelper.idOffset[field.FieldID] = offset + if field.IsPrimaryKey { + if schemaHelper.primaryKeyOffset != -1 { + return nil, errors.New("primary key is not unique") + } + schemaHelper.primaryKeyOffset = offset + } } return &schemaHelper, nil } +func (helper *SchemaHelper) GetPrimaryKeyField() (*schemapb.FieldSchema, error) { + if helper.primaryKeyOffset == -1 { + return nil, fmt.Errorf("no primary in schema") + } + return helper.schema.Fields[helper.primaryKeyOffset], nil +} + func (helper *SchemaHelper) GetFieldFromName(fieldName string) (*schemapb.FieldSchema, error) { offset, ok := helper.nameOffset[fieldName] if !ok {