VecIndexCreator.cpp 14.6 KB
Newer Older
D
dragondriver 已提交
1 2 3 4 5 6 7 8 9 10 11
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License

D
dragondriver 已提交
12
#include <exception>
13
#include <map>
14
#include <google/protobuf/text_format.h>
D
dragondriver 已提交
15

16
#include "exceptions/EasyAssert.h"
17
#include "pb/index_cgo_msg.pb.h"
18
#include "indexbuilder/VecIndexCreator.h"
19 20 21 22
#include "indexbuilder/utils.h"
#include "knowhere/common/Timer.h"
#include "knowhere/common/Utils.h"
#include "knowhere/index/vector_index/ConfAdapterMgr.h"
D
dragondriver 已提交
23 24 25
#include "knowhere/index/vector_index/VecIndexFactory.h"
#include "knowhere/index/vector_index/helpers/IndexParameter.h"

26
namespace milvus::indexbuilder {
D
dragondriver 已提交
27

28
// Constructs an index creator from two serialized parameter strings.
//
// serialized_type_params / serialized_index_params: NUL-terminated strings that
// are stored as-is and then decoded by parse() into config_.
//
// Fails via AssertInfo when either pointer is null, when the configured index
// type does not support the configured metric type, or when the index factory
// returns a null index.
VecIndexCreator::VecIndexCreator(const char* serialized_type_params, const char* serialized_index_params) {
    // Building a std::string from a null pointer is undefined behaviour, so reject it explicitly.
    AssertInfo(serialized_type_params != nullptr, "[VecIndexCreator]Serialized type params is null");
    AssertInfo(serialized_index_params != nullptr, "[VecIndexCreator]Serialized index params is null");

    type_params_ = std::string(serialized_type_params);
    index_params_ = std::string(serialized_index_params);

    parse();

    auto index_mode = get_index_mode();
    auto index_type = get_index_type();
    auto metric_type = get_metric_type();
    AssertInfo(!is_unsupported(index_type, metric_type), index_type + " doesn't support metric: " + metric_type);

    // Reuse the index type resolved above instead of looking it up a second time.
    index_ = knowhere::VecIndexFactory::GetInstance().CreateVecIndex(index_type, index_mode);
    AssertInfo(index_ != nullptr, "[VecIndexCreator]Index is null after create index");
}

43 44
template <typename ParamsT>
// ugly here, ParamsT will just be MapParams later
D
dragondriver 已提交
45
void
46
VecIndexCreator::parse_impl(const std::string& serialized_params_str, knowhere::Config& conf) {
47
    bool deserialized_success;
48

49 50
    ParamsT params;
    deserialized_success = google::protobuf::TextFormat::ParseFromString(serialized_params_str, &params);
51
    AssertInfo(deserialized_success, "[VecIndexCreator]Deserialize params failed");
52

53 54 55 56 57
    for (auto i = 0; i < params.params_size(); ++i) {
        const auto& param = params.params(i);
        const auto& key = param.key();
        const auto& value = param.value();
        conf[key] = value;
D
dragondriver 已提交
58 59
    }

S
sunby 已提交
60 61 62 63 64 65
    auto stoi_closure = [](const std::string& s) -> auto {
        return std::stoi(s);
    };
    auto stof_closure = [](const std::string& s) -> auto {
        return std::stof(s);
    };
66

B
bigsheeper 已提交
67
    /***************************** meta *******************************/
68 69
    check_parameter<int>(conf, milvus::knowhere::meta::DIM, stoi_closure, std::nullopt);
    check_parameter<int>(conf, milvus::knowhere::meta::TOPK, stoi_closure, std::nullopt);
D
dragondriver 已提交
70

B
bigsheeper 已提交
71
    /***************************** IVF Params *******************************/
72 73 74 75
    check_parameter<int>(conf, milvus::knowhere::IndexParams::nprobe, stoi_closure, std::nullopt);
    check_parameter<int>(conf, milvus::knowhere::IndexParams::nlist, stoi_closure, std::nullopt);
    check_parameter<int>(conf, milvus::knowhere::IndexParams::m, stoi_closure, std::nullopt);
    check_parameter<int>(conf, milvus::knowhere::IndexParams::nbits, stoi_closure, std::nullopt);
D
dragondriver 已提交
76

B
bigsheeper 已提交
77
    /************************** NSG Parameter **************************/
78 79 80 81
    check_parameter<int>(conf, milvus::knowhere::IndexParams::knng, stoi_closure, std::nullopt);
    check_parameter<int>(conf, milvus::knowhere::IndexParams::search_length, stoi_closure, std::nullopt);
    check_parameter<int>(conf, milvus::knowhere::IndexParams::out_degree, stoi_closure, std::nullopt);
    check_parameter<int>(conf, milvus::knowhere::IndexParams::candidate, stoi_closure, std::nullopt);
D
dragondriver 已提交
82

B
bigsheeper 已提交
83
    /************************** HNSW Params *****************************/
84 85 86
    check_parameter<int>(conf, milvus::knowhere::IndexParams::efConstruction, stoi_closure, std::nullopt);
    check_parameter<int>(conf, milvus::knowhere::IndexParams::M, stoi_closure, std::nullopt);
    check_parameter<int>(conf, milvus::knowhere::IndexParams::ef, stoi_closure, std::nullopt);
D
dragondriver 已提交
87

B
bigsheeper 已提交
88
    /************************** Annoy Params *****************************/
89 90
    check_parameter<int>(conf, milvus::knowhere::IndexParams::n_trees, stoi_closure, std::nullopt);
    check_parameter<int>(conf, milvus::knowhere::IndexParams::search_k, stoi_closure, std::nullopt);
D
dragondriver 已提交
91

B
bigsheeper 已提交
92
    /************************** PQ Params *****************************/
93
    check_parameter<int>(conf, milvus::knowhere::IndexParams::PQM, stoi_closure, std::nullopt);
D
dragondriver 已提交
94

B
bigsheeper 已提交
95
    /************************** NGT Params *****************************/
96
    check_parameter<int>(conf, milvus::knowhere::IndexParams::edge_size, stoi_closure, std::nullopt);
97

B
bigsheeper 已提交
98
    /************************** NGT Search Params *****************************/
S
sunby 已提交
99
    check_parameter<float>(conf, milvus::knowhere::IndexParams::epsilon, stof_closure, std::nullopt);
100
    check_parameter<int>(conf, milvus::knowhere::IndexParams::max_search_edges, stoi_closure, std::nullopt);
101

B
bigsheeper 已提交
102
    /************************** NGT_PANNG Params *****************************/
103 104
    check_parameter<int>(conf, milvus::knowhere::IndexParams::forcedly_pruned_edge_size, stoi_closure, std::nullopt);
    check_parameter<int>(conf, milvus::knowhere::IndexParams::selectively_pruned_edge_size, stoi_closure, std::nullopt);
B
bigsheeper 已提交
105 106

    /************************** NGT_ONNG Params *****************************/
107 108
    check_parameter<int>(conf, milvus::knowhere::IndexParams::outgoing_edge_size, stoi_closure, std::nullopt);
    check_parameter<int>(conf, milvus::knowhere::IndexParams::incoming_edge_size, stoi_closure, std::nullopt);
B
bigsheeper 已提交
109 110

    /************************** Serialize Params *******************************/
111 112 113 114
    check_parameter<int>(conf, milvus::knowhere::INDEX_FILE_SLICE_SIZE_IN_MEGABYTE, stoi_closure, std::optional{4});
}

void
115
VecIndexCreator::parse() {
116 117 118 119 120 121 122
    namespace indexcgo = milvus::proto::indexcgo;

    parse_impl<indexcgo::TypeParams>(type_params_, type_config_);
    parse_impl<indexcgo::IndexParams>(index_params_, index_config_);

    config_.update(type_config_);  // just like dict().update in Python, amazing
    config_.update(index_config_);
B
bigsheeper 已提交
123
}
D
dragondriver 已提交
124

B
bigsheeper 已提交
125 126
template <typename T>
void
127 128 129 130
VecIndexCreator::check_parameter(knowhere::Config& conf,
                                 const std::string& key,
                                 std::function<T(std::string)> fn,
                                 std::optional<T> default_v) {
131
    if (!conf.contains(key)) {
B
bigsheeper 已提交
132
        if (default_v.has_value()) {
133
            conf[key] = default_v.value();
B
bigsheeper 已提交
134
        }
D
dragondriver 已提交
135
    } else {
136 137
        auto value = conf[key];
        conf[key] = fn(value);
D
dragondriver 已提交
138 139 140 141 142
    }
}

template <typename T>
std::optional<T>
143
VecIndexCreator::get_config_by_name(std::string name) {
D
dragondriver 已提交
144 145 146 147
    if (config_.contains(name)) {
        return {config_[name].get<T>()};
    }
    return std::nullopt;
D
dragondriver 已提交
148 149 150
}

int64_t
151
VecIndexCreator::dim() {
D
dragondriver 已提交
152
    auto dimension = get_config_by_name<int64_t>(milvus::knowhere::meta::DIM);
153
    AssertInfo(dimension.has_value(), "[VecIndexCreator]Dimension doesn't have value");
D
dragondriver 已提交
154
    return (dimension.value());
D
dragondriver 已提交
155 156 157
}

void
158
VecIndexCreator::BuildWithoutIds(const knowhere::DatasetPtr& dataset) {
B
bigsheeper 已提交
159
    auto index_type = get_index_type();
B
bigsheeper 已提交
160 161
    auto index_mode = get_index_mode();
    config_[knowhere::meta::ROWS] = dataset->Get<int64_t>(knowhere::meta::ROWS);
162 163 164 165 166
    if (index_type == knowhere::IndexEnum::INDEX_FAISS_IVFPQ) {
        if (!config_.contains(knowhere::IndexParams::nbits)) {
            config_[knowhere::IndexParams::nbits] = 8;
        }
    }
B
bigsheeper 已提交
167
    auto conf_adapter = knowhere::AdapterMgr::GetInstance().GetAdapter(index_type);
168
    std::cout << "Konwhere BuildWithoutIds config_ is " << config_ << std::endl;
B
bigsheeper 已提交
169 170
    AssertInfo(conf_adapter->CheckTrain(config_, index_mode), "something wrong in index parameters!");

D
dragondriver 已提交
171 172 173
    if (is_in_need_id_list(index_type)) {
        PanicInfo(std::string(index_type) + " doesn't support build without ids yet!");
    }
174
    knowhere::TimeRecorder rc("BuildWithoutIds", 1);
D
dragondriver 已提交
175 176 177 178 179
    // if (is_in_need_build_all_list(index_type)) {
    //     index_->BuildAll(dataset, config_);
    // } else {
    //     index_->Train(dataset, config_);
    //     index_->AddWithoutIds(dataset, config_);
Z
zhenshan.cao 已提交
180
    // }
D
dragondriver 已提交
181
    index_->BuildAll(dataset, config_);
182
    rc.RecordSection("TrainAndAdd");
D
dragondriver 已提交
183 184 185

    if (is_in_nm_list(index_type)) {
        StoreRawData(dataset);
186
        rc.RecordSection("StoreRawData");
D
dragondriver 已提交
187
    }
188
    rc.ElapseFromBegin("Done");
D
dragondriver 已提交
189 190 191
}

void
192
VecIndexCreator::BuildWithIds(const knowhere::DatasetPtr& dataset) {
Y
yukun 已提交
193
    AssertInfo(dataset->data().find(milvus::knowhere::meta::IDS) != dataset->data().end(),
194
               "[VecIndexCreator]Can't find ids field in dataset");
B
bigsheeper 已提交
195 196 197
    auto index_type = get_index_type();
    auto index_mode = get_index_mode();
    config_[knowhere::meta::ROWS] = dataset->Get<int64_t>(knowhere::meta::ROWS);
198 199 200 201 202
    if (index_type == knowhere::IndexEnum::INDEX_FAISS_IVFPQ) {
        if (!config_.contains(knowhere::IndexParams::nbits)) {
            config_[knowhere::IndexParams::nbits] = 8;
        }
    }
B
bigsheeper 已提交
203 204
    auto conf_adapter = knowhere::AdapterMgr::GetInstance().GetAdapter(index_type);
    AssertInfo(conf_adapter->CheckTrain(config_, index_mode), "something wrong in index parameters!");
D
dragondriver 已提交
205 206 207
    //    index_->Train(dataset, config_);
    //    index_->Add(dataset, config_);
    index_->BuildAll(dataset, config_);
B
bigsheeper 已提交
208

D
dragondriver 已提交
209 210 211 212 213 214
    if (is_in_nm_list(get_index_type())) {
        StoreRawData(dataset);
    }
}

void
215
VecIndexCreator::StoreRawData(const knowhere::DatasetPtr& dataset) {
D
dragondriver 已提交
216
    auto index_type = get_index_type();
B
bigsheeper 已提交
217 218 219 220 221 222 223 224 225 226 227 228 229
    if (is_in_nm_list(index_type)) {
        auto tensor = dataset->Get<const void*>(milvus::knowhere::meta::TENSOR);
        auto row_num = dataset->Get<int64_t>(milvus::knowhere::meta::ROWS);
        auto dim = dataset->Get<int64_t>(milvus::knowhere::meta::DIM);
        int64_t data_size;
        if (is_in_bin_list(index_type)) {
            data_size = dim / 8 * row_num;
        } else {
            data_size = dim * row_num * sizeof(float);
        }
        raw_data_.resize(data_size);
        memcpy(raw_data_.data(), tensor, data_size);
    }
D
dragondriver 已提交
230 231
}

232 233 234
// Serializes the index into a BinarySet.
//
// For index types in the "nm" list, a copy of raw_data_ is appended under
// RAW_DATA and the set is then passed to knowhere::Disassemble() with the
// configured slice size (in megabytes, converted to bytes).
milvus::knowhere::BinarySet
VecIndexCreator::Serialize() {
    auto binary_set = index_->Serialize(config_);

    if (!is_in_nm_list(get_index_type())) {
        return binary_set;
    }

    // Hand the binary set its own copy of the raw vectors.
    const auto raw_size = raw_data_.size();
    std::shared_ptr<uint8_t[]> raw_copy(new uint8_t[raw_size], std::default_delete<uint8_t[]>());
    memcpy(raw_copy.get(), raw_data_.data(), raw_size);
    binary_set.Append(RAW_DATA, raw_copy, raw_size);

    // https://github.com/milvus-io/milvus/issues/6421
    // Disassemble will only divide the raw vectors, other keys were already divided
    auto slice_size = get_index_file_slice_size();
    knowhere::Disassemble(slice_size * 1024 * 1024, binary_set);

    return binary_set;
}

249
void
250
VecIndexCreator::Load(const milvus::knowhere::BinarySet& binary_set) {
251 252 253 254 255 256 257 258 259 260 261 262 263
    auto& map_ = binary_set.binary_map_;
    for (auto it = map_.begin(); it != map_.end(); ++it) {
        if (it->first == RAW_DATA) {
            raw_data_.clear();
            auto data_size = it->second->size;
            raw_data_.resize(data_size);
            memcpy(raw_data_.data(), it->second->data.get(), data_size);
            break;
        }
    }
    index_->Load(binary_set);
}

B
bigsheeper 已提交
264
std::string
265
VecIndexCreator::get_index_type() {
B
bigsheeper 已提交
266 267 268 269 270 271 272
    // return index_->index_type();
    // knowhere bug here
    // the index_type of all ivf-based index will change to ivf flat after loaded
    auto type = get_config_by_name<std::string>("index_type");
    return type.has_value() ? type.value() : knowhere::IndexEnum::INDEX_FAISS_IVFPQ;
}

B
bigsheeper 已提交
273
std::string
274
VecIndexCreator::get_metric_type() {
B
bigsheeper 已提交
275 276 277 278 279 280 281 282 283 284 285 286 287 288
    auto type = get_config_by_name<std::string>(knowhere::Metric::TYPE);
    if (type.has_value()) {
        return type.value();
    } else {
        auto index_type = get_index_type();
        if (is_in_bin_list(index_type)) {
            return knowhere::Metric::JACCARD;
        } else {
            return knowhere::Metric::L2;
        }
    }
}

// Maps the "index_mode" entry of config_ ("CPU" or "GPU") to a
// knowhere::IndexMode. Returns MODE_CPU when the entry is absent.
//
// An unrecognized mode string yields a value-initialized IndexMode, the same
// result the previous operator[] lookup produced, but without inserting the
// unknown string into the shared static table.
knowhere::IndexMode
VecIndexCreator::get_index_mode() {
    static const std::map<std::string, knowhere::IndexMode> mode_map = {
        {"CPU", knowhere::IndexMode::MODE_CPU},
        {"GPU", knowhere::IndexMode::MODE_GPU},
    };

    auto mode = get_config_by_name<std::string>("index_mode");
    if (!mode.has_value()) {
        return knowhere::IndexMode::MODE_CPU;
    }

    // find() keeps the table read-only; operator[] would grow it on every unknown name.
    const auto found = mode_map.find(mode.value());
    // NOTE(review): the value-initialized fallback is presumably MODE_CPU — confirm against the enum.
    return found != mode_map.end() ? found->second : knowhere::IndexMode{};
}

298
int64_t
299
VecIndexCreator::get_index_file_slice_size() {
300 301 302 303 304 305
    if (config_.contains(knowhere::INDEX_FILE_SLICE_SIZE_IN_MEGABYTE)) {
        return config_[knowhere::INDEX_FILE_SLICE_SIZE_IN_MEGABYTE].get<int64_t>();
    }
    return 4;  // by default
}

306 307
// Runs a query against the index using the creator's own config_ as the
// search configuration and returns the collected result.
std::unique_ptr<VecIndexCreator::QueryResult>
VecIndexCreator::Query(const knowhere::DatasetPtr& dataset) {
    // Return the prvalue directly: wrapping it in std::move() is redundant and blocks copy elision.
    return QueryImpl(dataset, config_);
}

311 312
// Runs a query using search parameters supplied by the caller.
//
// serialized_search_params: NUL-terminated text-format MapParams message; it
// is decoded into a fresh config that is used for this query only.
//
// Fails via AssertInfo when the pointer is null or the message cannot be parsed.
std::unique_ptr<VecIndexCreator::QueryResult>
VecIndexCreator::QueryWithParam(const knowhere::DatasetPtr& dataset, const char* serialized_search_params) {
    // Building a std::string from a null pointer is undefined behaviour, so reject it explicitly.
    AssertInfo(serialized_search_params != nullptr, "[VecIndexCreator]Serialized search params is null");

    namespace indexcgo = milvus::proto::indexcgo;
    milvus::knowhere::Config search_conf;
    parse_impl<indexcgo::MapParams>(std::string(serialized_search_params), search_conf);

    // Return the prvalue directly: wrapping it in std::move() is redundant and blocks copy elision.
    return QueryImpl(dataset, search_conf);
}

320 321
// Executes a query with the given search configuration and copies the ids and
// distances out of the knowhere result into a QueryResult.
//
// dataset: query vectors; its ROWS entry is taken as the number of queries.
// conf:    search configuration passed to the index.
//
// For index types in the "nm" list the raw data is attached to the index once
// (guarded by raw_data_loaded_) before the first query.
std::unique_ptr<VecIndexCreator::QueryResult>
VecIndexCreator::QueryImpl(const knowhere::DatasetPtr& dataset, const knowhere::Config& conf) {
    auto load_raw_data_closure = [&]() { LoadRawData(); };  // hide this pointer
    auto index_type = get_index_type();
    if (is_in_nm_list(index_type)) {
        std::call_once(raw_data_loaded_, load_raw_data_closure);
    }

    auto res = index_->Query(dataset, conf, nullptr);
    auto ids = res->Get<int64_t*>(milvus::knowhere::meta::IDS);
    auto distances = res->Get<float*>(milvus::knowhere::meta::DISTANCE);
    auto nq = dataset->Get<int64_t>(milvus::knowhere::meta::ROWS);

    // The search ran with `conf`, so the result buffers are sized by conf's
    // top-k. Read it from `conf` when present; fall back to config_ (the
    // previous behaviour) only when the search config does not carry it.
    // Using config_ unconditionally copies the wrong number of elements
    // whenever QueryWithParam supplies a different top-k.
    auto k = conf.contains(milvus::knowhere::meta::TOPK) ? conf.at(milvus::knowhere::meta::TOPK).get<int64_t>()
                                                         : config_[milvus::knowhere::meta::TOPK].get<int64_t>();

    auto query_res = std::make_unique<VecIndexCreator::QueryResult>();
    query_res->nq = nq;
    query_res->topk = k;
    query_res->ids.resize(nq * k);
    query_res->distances.resize(nq * k);
    memcpy(query_res->ids.data(), ids, sizeof(int64_t) * nq * k);
    memcpy(query_res->distances.data(), distances, sizeof(float) * nq * k);

    // A local is moved implicitly on return; an explicit std::move() only inhibits NRVO.
    return query_res;
}

S
sunby 已提交
345
void
346
VecIndexCreator::LoadRawData() {
S
sunby 已提交
347 348 349 350 351 352 353 354 355 356 357 358
    auto index_type = get_index_type();
    if (is_in_nm_list(index_type)) {
        auto bs = index_->Serialize(config_);
        auto bptr = std::make_shared<milvus::knowhere::Binary>();
        auto deleter = [&](uint8_t*) {};  // avoid repeated deconstruction
        bptr->data = std::shared_ptr<uint8_t[]>(static_cast<uint8_t*>(raw_data_.data()), deleter);
        bptr->size = raw_data_.size();
        bs.Append(RAW_DATA, bptr);
        index_->Load(bs);
    }
}

359
}  // namespace milvus::indexbuilder