test_sealed.cpp 11.2 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License

//
// Created by mike on 12/28/20.
//
#include "test_utils/DataGen.h"
#include <gtest/gtest.h>
#include <knowhere/index/vector_index/VecIndex.h>
#include <knowhere/index/vector_index/adapter/VectorAdapter.h>
#include <knowhere/index/vector_index/VecIndexFactory.h>
#include <knowhere/index/vector_index/IndexIVF.h>
C
cai.zhang 已提交
21
#include "segcore/SegmentSealedImpl.h"
22 23 24

using namespace milvus;
using namespace milvus::segcore;
F
FluorineDog 已提交
25
using namespace milvus::query;
26 27 28 29 30 31 32 33

// Checks that a pure vector search on a sealed segment carrying a prebuilt IVF
// index yields the same JSON result as querying that index directly.
TEST(Sealed, without_predicate) {
    using namespace milvus::query;
    using namespace milvus::segcore;
    auto schema = std::make_shared<Schema>();
    auto dim = 16;
    auto topK = 5;
    auto metric_type = MetricType::METRIC_L2;
    auto fake_id = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, dim, metric_type);
    schema->AddDebugField("age", DataType::FLOAT);
    std::string dsl = R"({
        "bool": {
            "must": [
            {
                "vector": {
                    "fakevec": {
                        "metric_type": "L2",
                        "params": {
                            "nprobe": 10
                        },
                        "query": "$0",
                        "topk": 5
                    }
                }
            }
            ]
        }
    })";

    int64_t N = 1000 * 1000;
    // The IVF index below is trained on the vector column shifted by this many rows.
    const int64_t index_row_shift = 1000;

    auto dataset = DataGen(schema, N);
    auto vec_col = dataset.get_col<float>(0);
    // Zero-pad the tail so the shifted N-row window handed to the index stays in bounds.
    vec_col.resize(vec_col.size() + index_row_shift * dim, 0.0f);
    // Take the pointer only after resizing: growing the buffer may reallocate it.
    auto query_ptr = vec_col.data() + 4200 * dim;

    auto segment = CreateGrowingSegment(schema);
    segment->PreInsert(N);
    segment->Insert(0, N, dataset.row_ids_.data(), dataset.timestamps_.data(), dataset.raw_);

    auto plan = CreatePlan(*schema, dsl);
    auto num_queries = 5;
    auto ph_group_raw = CreatePlaceholderGroupFromBlob(num_queries, dim, query_ptr);
    auto ph_group = ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());

    Timestamp time = 1000000;

    // The growing-segment result is kept: its offsets and distances are overwritten
    // below with the raw index answer to form the reference result.
    SearchResult sr = segment->Search(plan.get(), *ph_group, time);

    auto indexing = std::make_shared<knowhere::IVF>();

    auto conf = knowhere::Config{{knowhere::meta::DIM, dim},
                                 {knowhere::meta::TOPK, topK},
                                 {knowhere::IndexParams::nlist, 100},
                                 {knowhere::IndexParams::nprobe, 10},
                                 {knowhere::Metric::TYPE, milvus::knowhere::Metric::L2},
                                 {knowhere::meta::DEVICEID, 0}};

    auto database = knowhere::GenDataset(N, dim, vec_col.data() + index_row_shift * dim);
    indexing->Train(database, conf);
    indexing->AddWithoutIds(database, conf);

    EXPECT_EQ(indexing->Count(), N);
    EXPECT_EQ(indexing->Dim(), dim);

    auto query_dataset = knowhere::GenDataset(num_queries, dim, query_ptr);

    auto result = indexing->Query(query_dataset, conf, nullptr);

    auto ids = result->Get<int64_t*>(milvus::knowhere::meta::IDS);     // for comparison
    auto dis = result->Get<float*>(milvus::knowhere::meta::DISTANCE);  // for comparison
    std::vector<int64_t> vec_ids(ids, ids + topK * num_queries);
    std::vector<float> vec_dis(dis, dis + topK * num_queries);

    // Reference: the raw index answer, rendered through the same JSON conversion.
    sr.internal_seg_offsets_ = vec_ids;
    sr.result_distances_ = vec_dis;
    auto ref_result = SearchResultToJson(sr);

    LoadIndexInfo load_info;
    load_info.field_id = fake_id.get();
    load_info.index = indexing;
    load_info.index_params["metric_type"] = "L2";

    // NOTE(review): SealedCreator is assumed to return a sealed segment loaded with
    // `dataset` and the index described by `load_info` - confirm against the helper.
    auto sealed_segment = SealedCreator(schema, dataset, load_info);
    sr = sealed_segment->Search(plan.get(), *ph_group, time);

    auto post_result = SearchResultToJson(sr);
    std::cout << ref_result.dump(1);
    std::cout << post_result.dump(1);
    ASSERT_EQ(ref_result.dump(2), post_result.dump(2));
}

// Checks a vector search combined with a range predicate on a sealed segment:
// each query vector is one of the rows admitted by the predicate, so the best
// hit of query i must be that row itself at distance zero.
TEST(Sealed, with_predicate) {
    using namespace milvus::query;
    using namespace milvus::segcore;
    auto schema = std::make_shared<Schema>();
    auto dim = 16;
    auto topK = 5;
    auto metric_type = MetricType::METRIC_L2;
    auto fake_id = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, dim, metric_type);
    schema->AddDebugField("counter", DataType::INT64);
    std::string dsl = R"({
        "bool": {
            "must": [
            {
                "range": {
                    "counter": {
                        "GE": 420000,
                        "LT": 420005
                    }
                }
            },
            {
                "vector": {
                    "fakevec": {
                        "metric_type": "L2",
                        "params": {
                            "nprobe": 10
                        },
                        "query": "$0",
                        "topk": 5
                    }
                }
            }
            ]
        }
    })";

    int64_t N = 1000 * 1000;
    // Must match the "GE" bound of the range predicate in the DSL above.
    const int first_match = 420000;

    auto dataset = DataGen(schema, N);
    auto vec_col = dataset.get_col<float>(0);
    // The query vectors are the stored vectors of rows first_match .. first_match + 4.
    auto query_ptr = vec_col.data() + first_match * dim;

    auto segment = CreateGrowingSegment(schema);
    segment->PreInsert(N);
    segment->Insert(0, N, dataset.row_ids_.data(), dataset.timestamps_.data(), dataset.raw_);

    auto plan = CreatePlan(*schema, dsl);
    auto num_queries = 5;
    auto ph_group_raw = CreatePlaceholderGroupFromBlob(num_queries, dim, query_ptr);
    auto ph_group = ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());

    Timestamp time = 10000000;

    // Result not inspected: the call only has to complete on the growing segment.
    segment->Search(plan.get(), *ph_group, time);

    auto indexing = std::make_shared<knowhere::IVF>();

    auto conf = knowhere::Config{{knowhere::meta::DIM, dim},
                                 {knowhere::meta::TOPK, topK},
                                 {knowhere::IndexParams::nlist, 100},
                                 {knowhere::IndexParams::nprobe, 10},
                                 {knowhere::Metric::TYPE, milvus::knowhere::Metric::L2},
                                 {knowhere::meta::DEVICEID, 0}};

    auto database = knowhere::GenDataset(N, dim, vec_col.data());
    indexing->Train(database, conf);
    indexing->AddWithoutIds(database, conf);

    EXPECT_EQ(indexing->Count(), N);
    EXPECT_EQ(indexing->Dim(), dim);

    // Result not inspected: the call only has to complete on the raw index.
    auto query_dataset = knowhere::GenDataset(num_queries, dim, query_ptr);
    indexing->Query(query_dataset, conf, nullptr);

    LoadIndexInfo load_info;
    load_info.field_id = fake_id.get();
    load_info.index = indexing;
    load_info.index_params["metric_type"] = "L2";

    // NOTE(review): SealedCreator is assumed to return a sealed segment loaded with
    // `dataset` and the index described by `load_info` - confirm against the helper.
    auto sealed_segment = SealedCreator(schema, dataset, load_info);
    auto sealed_sr = sealed_segment->Search(plan.get(), *ph_group, time);

    for (int i = 0; i < num_queries; ++i) {
        // First (best) hit of query i within the flattened num_queries x topK result.
        auto offset = i * topK;
        ASSERT_EQ(sealed_sr.internal_seg_offsets_[offset], first_match + i);
        ASSERT_EQ(sealed_sr.result_distances_[offset], 0.0);
    }
}

// Walks a sealed segment through loading and dropping field data and its vector
// index, checking at each step whether Search succeeds or throws, that the loaded
// scalar chunks match the generated columns, and that results are reproducible.
TEST(Sealed, LoadFieldData) {
    auto dim = 16;
    int64_t N = 1000 * 1000;
    auto metric_type = MetricType::METRIC_L2;
    auto schema = std::make_shared<Schema>();
    auto fakevec_id = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, dim, metric_type);
    schema->AddDebugField("counter", DataType::INT64);
    auto double_id = schema->AddDebugField("double", DataType::DOUBLE);
    auto nothing_id = schema->AddDebugField("nothing", DataType::INT32);

    auto dataset = DataGen(schema, N);

    auto fakevec = dataset.get_col<float>(0);

    auto indexing = GenIndexing(N, dim, fakevec.data());

    auto segment = CreateSealedSegment(schema);
    std::string dsl = R"({
        "bool": {
            "must": [
            {
                "range": {
                    "double": {
                        "GE": -1,
                        "LT": 1
                    }
                }
            },
            {
                "vector": {
                    "fakevec": {
                        "metric_type": "L2",
                        "params": {
                            "nprobe": 10
                        },
                        "query": "$0",
                        "topk": 5
                    }
                }
            }
            ]
        }
    })";

    Timestamp time = 1000000;
    auto plan = CreatePlan(*schema, dsl);
    auto num_queries = 5;
    auto ph_group_raw = CreatePlaceholderGroup(num_queries, dim, 1024);
    auto ph_group = ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());

    // Nothing is loaded yet, so searching must fail.
    ASSERT_ANY_THROW(segment->Search(plan.get(), *ph_group, time));

    // With the data loaded, dropping a field the plan never references keeps Search working.
    SealedLoader(dataset, *segment);
    segment->DropFieldData(nothing_id);
    segment->Search(plan.get(), *ph_group, time);

    // Dropping the vector field's data breaks Search until an index is loaded for it.
    segment->DropFieldData(fakevec_id);
    ASSERT_ANY_THROW(segment->Search(plan.get(), *ph_group, time));

    LoadIndexInfo vec_info;
    vec_info.field_id = fakevec_id.get();
    vec_info.index = indexing;
    vec_info.index_params["metric_type"] = milvus::knowhere::Metric::L2;
    segment->LoadIndex(vec_info);

    // The loaded scalar chunks must match the generated "counter" and "double" columns.
    ASSERT_EQ(segment->num_chunk(), 1);
    auto chunk_span1 = segment->chunk_data<int64_t>(FieldOffset(1), 0);
    auto chunk_span2 = segment->chunk_data<double>(FieldOffset(2), 0);
    auto ref1 = dataset.get_col<int64_t>(1);
    auto ref2 = dataset.get_col<double>(2);
    for (int64_t i = 0; i < N; ++i) {
        ASSERT_EQ(chunk_span1[i], ref1[i]);
        ASSERT_EQ(chunk_span2[i], ref2[i]);
    }

    auto sr = segment->Search(plan.get(), *ph_group, time);
    auto json = SearchResultToJson(sr);
    std::cout << json.dump(1);

    // Without the index the vector field is unavailable again.
    segment->DropIndex(fakevec_id);
    ASSERT_ANY_THROW(segment->Search(plan.get(), *ph_group, time));

    // Reloading the same index must reproduce the earlier result. The comparison has
    // to use the fresh result (sr2); converting `sr` again would compare it to itself.
    segment->LoadIndex(vec_info);
    auto sr2 = segment->Search(plan.get(), *ph_group, time);
    auto json2 = SearchResultToJson(sr2);
    ASSERT_EQ(json.dump(-2), json2.dump(-2));

    // The range predicate reads "double", so dropping that field breaks Search.
    segment->DropFieldData(double_id);
    ASSERT_ANY_THROW(segment->Search(plan.get(), *ph_group, time));

    // Golden result recorded for the generated dataset and placeholder group.
    auto std_json = Json::parse(R"(
[
 [
  [
   "982->0.000000",
   "25315->4.741588",
   "551029->5.078479",
   "455002->5.134716",
   "504754->5.329021"
  ],
  [
   "287136->8.409121",
   "528353->8.740297",
   "935763->9.422906",
   "794649->9.436665",
   "192031->9.832053"
  ],
  [
   "59251->2.542610",
   "433044->3.424016",
   "797884->3.663446",
   "430441->3.692723",
   "697705->3.944479"
  ],
  [
   "611544->3.463480",
   "642941->3.753775",
   "967504->3.885163",
   "232724->4.574215",
   "507245->5.040902"
  ],
  [
   "351788->4.453843",
   "410227->4.699380",
   "501497->4.805948",
   "715061->5.166959",
   "414882->5.179897"
  ]
 ]
])");
    ASSERT_EQ(std_json.dump(-2), json.dump(-2));
}