// Copyright (C) 2019-2020 Zilliz. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software distributed under the License // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express // or implied. See the License for the specific language governing permissions and limitations under the License // // Created by mike on 12/28/20. // #include "test_utils/DataGen.h" #include #include #include #include #include #include "segcore/SegmentSealedImpl.h" using namespace milvus; using namespace milvus::segcore; using namespace milvus::query; TEST(Sealed, without_predicate) { using namespace milvus::query; using namespace milvus::segcore; auto schema = std::make_shared(); auto dim = 16; auto topK = 5; auto metric_type = MetricType::METRIC_L2; auto fake_id = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, dim, metric_type); schema->AddDebugField("age", DataType::FLOAT); std::string dsl = R"({ "bool": { "must": [ { "vector": { "fakevec": { "metric_type": "L2", "params": { "nprobe": 10 }, "query": "$0", "topk": 5 } } } ] } })"; int64_t N = 1000 * 1000; auto dataset = DataGen(schema, N); auto vec_col = dataset.get_col(0); for (int64_t i = 0; i < 1000 * dim; ++i) { vec_col.push_back(0); } auto query_ptr = vec_col.data() + 4200 * dim; auto segment = CreateGrowingSegment(schema); segment->PreInsert(N); segment->Insert(0, N, dataset.row_ids_.data(), dataset.timestamps_.data(), dataset.raw_); auto plan = CreatePlan(*schema, dsl); auto num_queries = 5; auto ph_group_raw = CreatePlaceholderGroupFromBlob(num_queries, 16, query_ptr); auto ph_group = ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString()); SearchResult sr; Timestamp time = 1000000; std::vector ph_group_arr = {ph_group.get()}; sr = segment->Search(plan.get(), *ph_group, time); auto pre_result = SearchResultToJson(sr); auto indexing = std::make_shared(); auto conf = knowhere::Config{{knowhere::meta::DIM, dim}, {knowhere::meta::TOPK, topK}, {knowhere::IndexParams::nlist, 100}, {knowhere::IndexParams::nprobe, 10}, {knowhere::Metric::TYPE, milvus::knowhere::Metric::L2}, {knowhere::meta::DEVICEID, 0}}; auto database = knowhere::GenDataset(N, dim, vec_col.data() + 1000 * dim); indexing->Train(database, conf); indexing->AddWithoutIds(database, conf); EXPECT_EQ(indexing->Count(), N); EXPECT_EQ(indexing->Dim(), dim); auto query_dataset = knowhere::GenDataset(num_queries, dim, query_ptr); auto result = indexing->Query(query_dataset, conf, nullptr); auto ids = result->Get(milvus::knowhere::meta::IDS); // for comparison auto dis = result->Get(milvus::knowhere::meta::DISTANCE); // for comparison std::vector vec_ids(ids, ids + topK * num_queries); std::vector vec_dis(dis, dis + topK * num_queries); sr.internal_seg_offsets_ = vec_ids; sr.result_distances_ = vec_dis; auto ref_result = SearchResultToJson(sr); LoadIndexInfo load_info; load_info.field_id = fake_id.get(); load_info.index = indexing; load_info.index_params["metric_type"] = "L2"; auto sealed_segment = SealedCreator(schema, dataset, load_info); sr = sealed_segment->Search(plan.get(), *ph_group, time); auto post_result = SearchResultToJson(sr); std::cout << ref_result.dump(1); std::cout << post_result.dump(1); ASSERT_EQ(ref_result.dump(2), post_result.dump(2)); } TEST(Sealed, with_predicate) { using namespace milvus::query; using namespace milvus::segcore; auto schema = std::make_shared(); auto dim = 16; auto topK = 5; auto metric_type = MetricType::METRIC_L2; auto fake_id = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, dim, metric_type); schema->AddDebugField("counter", DataType::INT64); std::string dsl = R"({ "bool": { "must": [ { "range": { "counter": { "GE": 420000, "LT": 420005 } } }, { "vector": { "fakevec": { "metric_type": "L2", "params": { "nprobe": 10 }, "query": "$0", "topk": 5 } } } ] } })"; int64_t N = 1000 * 1000; auto dataset = DataGen(schema, N); auto vec_col = dataset.get_col(0); auto query_ptr = vec_col.data() + 420000 * dim; auto segment = CreateGrowingSegment(schema); segment->PreInsert(N); segment->Insert(0, N, dataset.row_ids_.data(), dataset.timestamps_.data(), dataset.raw_); auto plan = CreatePlan(*schema, dsl); auto num_queries = 5; auto ph_group_raw = CreatePlaceholderGroupFromBlob(num_queries, 16, query_ptr); auto ph_group = ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString()); SearchResult sr; Timestamp time = 10000000; std::vector ph_group_arr = {ph_group.get()}; sr = segment->Search(plan.get(), *ph_group, time); auto pre_sr = sr; auto indexing = std::make_shared(); auto conf = knowhere::Config{{knowhere::meta::DIM, dim}, {knowhere::meta::TOPK, topK}, {knowhere::IndexParams::nlist, 100}, {knowhere::IndexParams::nprobe, 10}, {knowhere::Metric::TYPE, milvus::knowhere::Metric::L2}, {knowhere::meta::DEVICEID, 0}}; auto database = knowhere::GenDataset(N, dim, vec_col.data()); indexing->Train(database, conf); indexing->AddWithoutIds(database, conf); EXPECT_EQ(indexing->Count(), N); EXPECT_EQ(indexing->Dim(), dim); auto query_dataset = knowhere::GenDataset(num_queries, dim, query_ptr); auto result = indexing->Query(query_dataset, conf, nullptr); LoadIndexInfo load_info; load_info.field_id = fake_id.get(); load_info.index = indexing; load_info.index_params["metric_type"] = "L2"; auto sealed_segment = SealedCreator(schema, dataset, load_info); sr = sealed_segment->Search(plan.get(), *ph_group, time); auto post_sr = sr; for (int i = 0; i < num_queries; ++i) { auto offset = i * topK; ASSERT_EQ(post_sr.internal_seg_offsets_[offset], 420000 + i); ASSERT_EQ(post_sr.result_distances_[offset], 0.0); } } TEST(Sealed, LoadFieldData) { auto dim = 16; auto topK = 5; int64_t N = 1000 * 1000; auto metric_type = MetricType::METRIC_L2; auto schema = std::make_shared(); auto fakevec_id = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, dim, metric_type); auto counter_id = schema->AddDebugField("counter", DataType::INT64); auto double_id = schema->AddDebugField("double", DataType::DOUBLE); auto nothing_id = schema->AddDebugField("nothing", DataType::INT32); auto dataset = DataGen(schema, N); auto fakevec = dataset.get_col(0); auto indexing = GenIndexing(N, dim, fakevec.data()); auto segment = CreateSealedSegment(schema); std::string dsl = R"({ "bool": { "must": [ { "range": { "double": { "GE": -1, "LT": 1 } } }, { "vector": { "fakevec": { "metric_type": "L2", "params": { "nprobe": 10 }, "query": "$0", "topk": 5 } } } ] } })"; Timestamp time = 1000000; auto plan = CreatePlan(*schema, dsl); auto num_queries = 5; auto ph_group_raw = CreatePlaceholderGroup(num_queries, 16, 1024); auto ph_group = ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString()); ASSERT_ANY_THROW(segment->Search(plan.get(), *ph_group, time)); SealedLoader(dataset, *segment); segment->DropFieldData(nothing_id); segment->Search(plan.get(), *ph_group, time); segment->DropFieldData(fakevec_id); ASSERT_ANY_THROW(segment->Search(plan.get(), *ph_group, time)); LoadIndexInfo vec_info; vec_info.field_id = fakevec_id.get(); vec_info.index = indexing; vec_info.index_params["metric_type"] = milvus::knowhere::Metric::L2; segment->LoadIndex(vec_info); ASSERT_EQ(segment->num_chunk(), 1); auto chunk_span1 = segment->chunk_data(FieldOffset(1), 0); auto chunk_span2 = segment->chunk_data(FieldOffset(2), 0); auto ref1 = dataset.get_col(1); auto ref2 = dataset.get_col(2); for (int i = 0; i < N; ++i) { ASSERT_EQ(chunk_span1[i], ref1[i]); ASSERT_EQ(chunk_span2[i], ref2[i]); } auto sr = segment->Search(plan.get(), *ph_group, time); auto json = SearchResultToJson(sr); std::cout << json.dump(1); segment->DropIndex(fakevec_id); ASSERT_ANY_THROW(segment->Search(plan.get(), *ph_group, time)); segment->LoadIndex(vec_info); auto sr2 = segment->Search(plan.get(), *ph_group, time); auto json2 = SearchResultToJson(sr); ASSERT_EQ(json.dump(-2), json2.dump(-2)); segment->DropFieldData(double_id); ASSERT_ANY_THROW(segment->Search(plan.get(), *ph_group, time)); auto std_json = Json::parse(R"( [ [ [ "982->0.000000", "25315->4.741588", "551029->5.078479", "455002->5.134716", "504754->5.329021" ], [ "287136->8.409121", "528353->8.740297", "935763->9.422906", "794649->9.436665", "192031->9.832053" ], [ "59251->2.542610", "433044->3.424016", "797884->3.663446", "430441->3.692723", "697705->3.944479" ], [ "611544->3.463480", "642941->3.753775", "967504->3.885163", "232724->4.574215", "507245->5.040902" ], [ "351788->4.453843", "410227->4.699380", "501497->4.805948", "715061->5.166959", "414882->5.179897" ] ] ])"); ASSERT_EQ(std_json.dump(-2), json.dump(-2)); }