未验证 提交 97b14266 编写于 作者: F FluorineDog 提交者: GitHub

fix benchmark, add support for primary key in schemaHelper (#6281)

* fix benchmark
Signed-off-by: Nfluorinedog <fluorinedog@gmail.com>

* fix tests
Signed-off-by: Nfluorinedog <fluorinedog@gmail.com>
上级 836a45ec
......@@ -20,8 +20,7 @@ using namespace milvus;
using namespace milvus::query;
using namespace milvus::segcore;
static int dim = 128;
static int64_t N = 1024 * 1024 * 1;
static int dim = 768;
const auto schema = []() {
auto schema = std::make_shared<Schema>();
......@@ -29,10 +28,6 @@ const auto schema = []() {
return schema;
}();
const auto dataset_ = [] {
auto dataset_ = DataGen(schema, N);
return dataset_;
}();
const auto plan = [] {
std::string dsl = R"({
......@@ -43,7 +38,7 @@ const auto plan = [] {
"fakevec": {
"metric_type": "L2",
"params": {
"nprobe": 4
"nprobe": 10
},
"query": "$0",
"topk": 5
......@@ -57,7 +52,7 @@ const auto plan = [] {
return plan;
}();
auto ph_group = [] {
auto num_queries = 5;
auto num_queries = 10;
auto ph_group_raw = CreatePlaceholderGroup(num_queries, dim, 1024);
auto ph_group = ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());
return ph_group;
......@@ -66,6 +61,12 @@ auto ph_group = [] {
static void
Search_SmallIndex(benchmark::State& state) {
// schema->AddDebugField("age", DataType::FLOAT);
static int64_t N = 1024 * 32;
const auto dataset_ = [] {
auto dataset_ = DataGen(schema, N);
return dataset_;
}();
auto is_small_index = state.range(0);
auto chunk_size = state.range(1) * 1024;
......@@ -88,11 +89,16 @@ Search_SmallIndex(benchmark::State& state) {
}
}
BENCHMARK(Search_SmallIndex)->MinTime(5)->ArgsProduct({{true, false}, {8, 16, 32, 64, 128}});
BENCHMARK(Search_SmallIndex)->MinTime(5)->ArgsProduct({{true, false}, {8, 16, 32}});
static void
Search_Sealed(benchmark::State& state) {
auto segment = CreateSealedSegment(schema);
static int64_t N = 1024 * 1024;
const auto dataset_ = [] {
auto dataset_ = DataGen(schema, N);
return dataset_;
}();
SealedLoader(dataset_, *segment);
auto choice = state.range(0);
if (choice == 0) {
......@@ -103,6 +109,7 @@ Search_Sealed(benchmark::State& state) {
auto indexing = GenIndexing(N, dim, vec);
LoadIndexInfo info;
info.index = indexing;
info.field_id = (*schema)[FieldName("fakevec")].get_id().get();
info.index_params["index_type"] = "IVF";
info.index_params["index_mode"] = "CPU";
info.index_params["metric_type"] = MetricTypeToName(MetricType::METRIC_L2);
......
......@@ -184,42 +184,43 @@ TEST(Query, ExecWithPredicateLoader) {
[
[
[
"980486->3.149221",
"318367->3.661235",
"302798->4.553688",
"321424->4.757450",
"565529->5.083780"
"982->0.000000",
"25315->4.741588",
"57893->4.758279",
"551029->5.078479",
"455002->5.134716"
],
[
"233390->7.931535",
"238958->8.109344",
"230645->8.439169",
"901939->8.658772",
"380328->8.731251"
"528353->8.740297",
"659305->8.802286",
"935763->9.422906",
"794649->9.436665",
"192031->9.832053"
],
[
"897246->3.749835",
"750683->3.897577",
"857598->4.230977",
"299009->4.379639",
"440010->4.454046"
"980439->3.342777",
"433044->3.424016",
"797884->3.663446",
"697705->3.944479",
"186546->4.404788"
],
[
"840855->4.782170",
"709627->5.063170",
"72322->5.166143",
"107142->5.180207",
"948403->5.247065"
"642941->3.753775",
"967504->3.885163",
"764517->4.364819",
"332938->4.418214",
"232724->4.574215"
],
[
"810401->3.926393",
"46575->4.054171",
"201740->4.274491",
"669040->4.399628",
"231500->4.831223"
"351788->4.453843",
"410227->4.699380",
"501497->4.805948",
"715061->5.166959",
"414882->5.179897"
]
]
])");
std::cout << json.dump(2);
ASSERT_EQ(json.dump(2), ref.dump(2));
}
......@@ -326,42 +327,43 @@ TEST(Query, ExecWithPredicate) {
[
[
[
"980486->3.149221",
"318367->3.661235",
"302798->4.553688",
"321424->4.757450",
"565529->5.083780"
"982->0.000000",
"25315->4.741588",
"57893->4.758279",
"551029->5.078479",
"455002->5.134716"
],
[
"233390->7.931535",
"238958->8.109344",
"230645->8.439169",
"901939->8.658772",
"380328->8.731251"
"528353->8.740297",
"659305->8.802286",
"935763->9.422906",
"794649->9.436665",
"192031->9.832053"
],
[
"897246->3.749835",
"750683->3.897577",
"857598->4.230977",
"299009->4.379639",
"440010->4.454046"
"980439->3.342777",
"433044->3.424016",
"797884->3.663446",
"697705->3.944479",
"186546->4.404788"
],
[
"840855->4.782170",
"709627->5.063170",
"72322->5.166143",
"107142->5.180207",
"948403->5.247065"
"642941->3.753775",
"967504->3.885163",
"764517->4.364819",
"332938->4.418214",
"232724->4.574215"
],
[
"810401->3.926393",
"46575->4.054171",
"201740->4.274491",
"669040->4.399628",
"231500->4.831223"
"351788->4.453843",
"410227->4.699380",
"501497->4.805948",
"715061->5.166959",
"414882->5.179897"
]
]
])");
std::cout << json.dump(2);
ASSERT_EQ(json.dump(2), ref.dump(2));
}
......@@ -551,43 +553,44 @@ TEST(Query, ExecWithoutPredicate) {
[
[
[
"980486->3.149221",
"318367->3.661235",
"302798->4.553688",
"321424->4.757450",
"565529->5.083780"
"982->0.000000",
"25315->4.741588",
"57893->4.758279",
"694663->4.980466",
"551029->5.078479"
],
[
"233390->7.931535",
"238958->8.109344",
"230645->8.439169",
"901939->8.658772",
"380328->8.731251"
"559507->7.956653",
"871836->8.694542",
"528353->8.740297",
"659305->8.802286",
"516137->8.935913"
],
[
"749862->3.398494",
"701321->3.632437",
"897246->3.749835",
"750683->3.897577",
"105995->4.073595"
"980439->3.342777",
"433044->3.424016",
"527556->3.487235",
"797884->3.663446",
"814805->3.782786"
],
[
"138274->3.454446",
"124548->3.783290",
"840855->4.782170",
"936719->5.026924",
"709627->5.063170"
"642941->3.753775",
"967504->3.885163",
"177960->4.339530",
"764517->4.364819",
"841079->4.403300"
],
[
"810401->3.926393",
"46575->4.054171",
"201740->4.274491",
"669040->4.399628",
"231500->4.831223"
"688614->4.259011",
"351788->4.453843",
"452698->4.473838",
"410227->4.699380",
"501497->4.805948"
]
]
]
)");
std::cout << json.dump(2);
ASSERT_EQ(json.dump(2), ref.dump(2));
}
......
......@@ -306,42 +306,41 @@ TEST(Sealed, LoadFieldData) {
[
[
[
"980486->3.149221",
"579754->3.634295",
"318367->3.661235",
"265835->4.333358",
"302798->4.553688"
"982->0.000000",
"25315->4.741588",
"551029->5.078479",
"455002->5.134716",
"504754->5.329021"
],
[
"233390->7.931535",
"238958->8.109344",
"230645->8.439169",
"901939->8.658772",
"380328->8.731251"
"287136->8.409121",
"528353->8.740297",
"935763->9.422906",
"794649->9.436665",
"192031->9.832053"
],
[
"897246->3.749835",
"750683->3.897577",
"857598->4.230977",
"299009->4.379639",
"440010->4.454046"
"59251->2.542610",
"433044->3.424016",
"797884->3.663446",
"430441->3.692723",
"697705->3.944479"
],
[
"37641->3.783446",
"22628->4.719435",
"840855->4.782170",
"709627->5.063170",
"635836->5.156095"
"611544->3.463480",
"642941->3.753775",
"967504->3.885163",
"232724->4.574215",
"507245->5.040902"
],
[
"810401->3.926393",
"46575->4.054171",
"201740->4.274491",
"669040->4.399628",
"231500->4.831223"
"351788->4.453843",
"410227->4.699380",
"501497->4.805948",
"715061->5.166959",
"414882->5.179897"
]
]
]
)");
])");
ASSERT_EQ(std_json.dump(-2), json.dump(-2));
}
......@@ -56,15 +56,15 @@ struct GeneratedData {
friend GeneratedData
DataGen(SchemaPtr schema, int64_t N, uint64_t seed);
void
generate_rows(int N, SchemaPtr schema);
generate_rows(int64_t N, SchemaPtr schema);
};
inline void
GeneratedData::generate_rows(int N, SchemaPtr schema) {
GeneratedData::generate_rows(int64_t N, SchemaPtr schema) {
std::vector<int> offset_infos(schema->size() + 1, 0);
auto sizeof_infos = schema->get_sizeof_infos();
std::partial_sum(sizeof_infos.begin(), sizeof_infos.end(), offset_infos.begin() + 1);
auto len_per_row = offset_infos.back();
int64_t len_per_row = offset_infos.back();
assert(len_per_row == schema->get_total_sizeof());
std::vector<char> result(len_per_row * N);
......@@ -103,13 +103,17 @@ DataGen(SchemaPtr schema, int64_t N, uint64_t seed = 42) {
switch (field.get_data_type()) {
case engine::DataType::VECTOR_FLOAT: {
auto dim = field.get_dim();
vector<float> final;
vector<float> final(dim * N);
bool is_ip = starts_with(field.get_name().get(), "normalized");
#pragma omp parallel for
for (int n = 0; n < N; ++n) {
vector<float> data(dim);
float sum = 0;
std::default_random_engine er2(seed + n);
std::normal_distribution<> distr2(0, 1);
for (auto& x : data) {
x = distr(er) + offset;
x = distr2(er2) + offset;
sum += x * x;
}
if (is_ip) {
......@@ -119,7 +123,7 @@ DataGen(SchemaPtr schema, int64_t N, uint64_t seed = 42) {
}
}
final.insert(final.end(), data.begin(), data.end());
std::copy(data.begin(), data.end(), final.begin() + dim * n);
}
insert_cols(final);
break;
......@@ -319,9 +323,9 @@ SealedLoader(const GeneratedData& dataset, SegmentSealed& seg) {
inline knowhere::VecIndexPtr
GenIndexing(int64_t N, int64_t dim, const float* vec) {
// {knowhere::IndexParams::nprobe, 10},
auto conf = knowhere::Config{{knowhere::meta::DIM, dim},
{knowhere::IndexParams::nlist, 100},
{knowhere::IndexParams::nprobe, 10},
{knowhere::IndexParams::nlist, 1024},
{knowhere::Metric::TYPE, milvus::knowhere::Metric::L2},
{knowhere::meta::DEVICEID, 0}};
auto database = knowhere::GenDataset(N, dim, vec);
......
......@@ -61,16 +61,17 @@ func EstimateSizePerRecord(schema *schemapb.CollectionSchema) (int, error) {
}
type SchemaHelper struct {
schema *schemapb.CollectionSchema
nameOffset map[string]int
idOffset map[int64]int
schema *schemapb.CollectionSchema
nameOffset map[string]int
idOffset map[int64]int
primaryKeyOffset int
}
func CreateSchemaHelper(schema *schemapb.CollectionSchema) (*SchemaHelper, error) {
if schema == nil {
return nil, errors.New("schema is nil")
}
schemaHelper := SchemaHelper{schema: schema, nameOffset: make(map[string]int), idOffset: make(map[int64]int)}
schemaHelper := SchemaHelper{schema: schema, nameOffset: make(map[string]int), idOffset: make(map[int64]int), primaryKeyOffset: -1}
for offset, field := range schema.Fields {
if _, ok := schemaHelper.nameOffset[field.Name]; ok {
return nil, errors.New("duplicated fieldName: " + field.Name)
......@@ -80,10 +81,23 @@ func CreateSchemaHelper(schema *schemapb.CollectionSchema) (*SchemaHelper, error
}
schemaHelper.nameOffset[field.Name] = offset
schemaHelper.idOffset[field.FieldID] = offset
if field.IsPrimaryKey {
if schemaHelper.primaryKeyOffset != -1 {
return nil, errors.New("primary key is not unique")
}
schemaHelper.primaryKeyOffset = offset
}
}
return &schemaHelper, nil
}
// GetPrimaryKeyField returns the schema field flagged as the primary key.
//
// primaryKeyOffset is initialised to -1 by CreateSchemaHelper and is only
// overwritten when a field with IsPrimaryKey set is encountered, so -1 means
// the schema declares no primary key; in that case an error is returned.
func (helper *SchemaHelper) GetPrimaryKeyField() (*schemapb.FieldSchema, error) {
	if helper.primaryKeyOffset == -1 {
		// Constant message with no format verbs: use errors.New, matching the
		// other error sites in this file.
		return nil, errors.New("no primary in schema")
	}
	return helper.schema.Fields[helper.primaryKeyOffset], nil
}
func (helper *SchemaHelper) GetFieldFromName(fieldName string) (*schemapb.FieldSchema, error) {
offset, ok := helper.nameOffset[fieldName]
if !ok {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册