Commit 8f09b1f2 authored by xiaojun.lin

Solve part of the NSG search accuracy problem (#548)

Parent: 02247edc
......@@ -27,6 +27,7 @@ Please mark all change in change log and use the ticket from JIRA.
- \#527 - faiss benchmark not compatible with faiss 1.6.0
- \#530 - BuildIndex stop when do build index and search simultaneously
- \#533 - NSG build failed with MetricType Inner Product
- \#548 - NSG search accuracy is too low
## Feature
- \#12 - Pure CPU version for Milvus
......
......@@ -126,4 +126,38 @@ GPUIDMAP::search_impl(int64_t n, const float* data, int64_t k, float* distances,
index_->search(n, (float*)data, k, distances, labels);
}
void
GPUIDMAP::GenGraph(float* data, const int64_t& k, Graph& graph, const Config& config) {
int64_t K = k + 1;
auto ntotal = Count();
size_t dim = config->d;
auto batch_size = 1000;
auto tail_batch_size = ntotal % batch_size;
auto batch_search_count = ntotal / batch_size;
auto total_search_count = tail_batch_size == 0 ? batch_search_count : batch_search_count + 1;
std::vector<float> res_dis(K * batch_size);
graph.resize(ntotal);
Graph res_vec(total_search_count);
for (int i = 0; i < total_search_count; ++i) {
auto b_size = (i == (total_search_count - 1)) && tail_batch_size != 0 ? tail_batch_size : batch_size;
auto& res = res_vec[i];
res.resize(K * b_size);
auto xq = data + batch_size * dim * i;
search_impl(b_size, (float*)xq, K, res_dis.data(), res.data(), config);
for (int j = 0; j < b_size; ++j) {
auto& node = graph[batch_size * i + j];
node.resize(k);
auto start_pos = j * K + 1;
for (int m = 0, cursor = start_pos; m < k && cursor < start_pos + k; ++m, ++cursor) {
node[m] = res[cursor];
}
}
}
}
} // namespace knowhere
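`GPUIDMAP::GenGraph` is the heart of the accuracy fix: it searches the base vectors against themselves in batches of 1000, asks the flat index for `K = k + 1` neighbours, and copies ranks 1..k into the graph so that the rank-0 hit (the query point itself, which an exact flat search always returns first) never pollutes the kNN graph. The standalone sketch below reproduces that batching and self-skip arithmetic with a brute-force L2 searcher standing in for `search_impl`; it does not use the knowhere API, and the helper names are made up for illustration.

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <random>
#include <utility>
#include <vector>

using Graph = std::vector<std::vector<int64_t>>;

// Brute-force exact search: for each query row, return the K nearest base rows.
static void
BruteForceSearch(const std::vector<float>& base, size_t dim, const float* xq, int64_t nq, int64_t K,
                 std::vector<int64_t>& labels) {
    const int64_t nb = static_cast<int64_t>(base.size() / dim);
    labels.resize(nq * K);
    for (int64_t q = 0; q < nq; ++q) {
        std::vector<std::pair<float, int64_t>> dists(nb);
        for (int64_t b = 0; b < nb; ++b) {
            float d = 0;
            for (size_t j = 0; j < dim; ++j) {
                float diff = xq[q * dim + j] - base[b * dim + j];
                d += diff * diff;
            }
            dists[b] = {d, b};
        }
        std::partial_sort(dists.begin(), dists.begin() + K, dists.end());
        for (int64_t m = 0; m < K; ++m) labels[q * K + m] = dists[m].second;
    }
}

int
main() {
    const size_t dim = 8;
    const int64_t ntotal = 2500, k = 10, K = k + 1, batch_size = 1000;
    std::mt19937 gen(42);
    std::uniform_real_distribution<float> dis(0.f, 1.f);
    std::vector<float> data(ntotal * dim);
    for (auto& v : data) v = dis(gen);

    Graph graph(ntotal);
    const int64_t batches = (ntotal + batch_size - 1) / batch_size;  // includes the tail batch
    for (int64_t i = 0; i < batches; ++i) {
        const int64_t b_size = std::min(batch_size, ntotal - i * batch_size);
        std::vector<int64_t> labels;
        BruteForceSearch(data, dim, data.data() + i * batch_size * dim, b_size, K, labels);
        for (int64_t j = 0; j < b_size; ++j) {
            auto& node = graph[i * batch_size + j];
            node.assign(labels.begin() + j * K + 1,       // +1 skips the query point itself,
                        labels.begin() + j * K + 1 + k);  // which exact search returns at rank 0
        }
    }
    // no node should list itself as a neighbour
    for (int64_t i = 0; i < ntotal; ++i)
        assert(std::find(graph[i].begin(), graph[i].end(), i) == graph[i].end());
    return 0;
}
```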
......@@ -47,6 +47,9 @@ class GPUIDMAP : public IDMAP, public GPUIndex {
VectorIndexPtr
CopyGpuToGpu(const int64_t& device_id, const Config& config) override;
void
GenGraph(float* data, const int64_t& k, Graph& graph, const Config& config);
protected:
void
search_impl(int64_t n, const float* data, int64_t k, float* distances, int64_t* labels, const Config& cfg) override;
......
......@@ -121,6 +121,26 @@ IDMAP::Add(const DatasetPtr& dataset, const Config& config) {
index_->add_with_ids(rows, (float*)p_data, p_ids);
}
void
IDMAP::AddWithoutId(const DatasetPtr& dataset, const Config& config) {
if (!index_) {
KNOWHERE_THROW_MSG("index not initialize");
}
std::lock_guard<std::mutex> lk(mutex_);
GETTENSOR(dataset)
// TODO: magic here.
auto array = dataset->array()[0];
std::vector<int64_t> new_ids(rows);
for (int i = 0; i < rows; ++i) {
new_ids[i] = i;
}
index_->add_with_ids(rows, (float*)p_data, new_ids.data());
}
int64_t
IDMAP::Count() {
return index_->ntotal;
......
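`AddWithoutId` differs from `Add` only in that it ignores any ids carried by the dataset and inserts the vectors with the sequential ids 0..rows-1, so the flat index keeps its internal row numbers as the visible labels — which is what `GenGraph` relies on when it writes raw search labels into the kNN graph. A minimal sketch of that id generation, using `std::iota` instead of the hand-rolled loop (names are illustrative only, not knowhere API):

```cpp
#include <cstdint>
#include <numeric>
#include <vector>

std::vector<int64_t>
SequentialIds(int64_t rows) {
    std::vector<int64_t> ids(rows);
    std::iota(ids.begin(), ids.end(), 0);  // 0, 1, 2, ..., rows - 1
    return ids;
}
```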
......@@ -34,20 +34,31 @@ class IDMAP : public VectorIndex, public FaissBaseIndex {
BinarySet
Serialize() override;
void
Load(const BinarySet& index_binary) override;
void
Train(const Config& config);
DatasetPtr
Search(const DatasetPtr& dataset, const Config& config) override;
int64_t
Count() override;
VectorIndexPtr
Clone() override;
int64_t
Dimension() override;
void
Add(const DatasetPtr& dataset, const Config& config) override;
void
AddWithoutId(const DatasetPtr& dataset, const Config& config);
VectorIndexPtr
CopyCpuToGpu(const int64_t& device_id, const Config& config);
void
......@@ -55,12 +66,15 @@ class IDMAP : public VectorIndex, public FaissBaseIndex {
virtual float*
GetRawVectors();
virtual int64_t*
GetRawIds();
protected:
virtual void
search_impl(int64_t n, const float* data, int64_t k, float* distances, int64_t* labels, const Config& cfg);
protected:
std::mutex mutex_;
};
......
......@@ -189,35 +189,34 @@ IVF::Dimension() {
}
void
IVF::GenGraph(const int64_t& k, Graph& graph, const DatasetPtr& dataset, const Config& config) {
GETTENSOR(dataset)
IVF::GenGraph(float* data, const int64_t& k, Graph& graph, const Config& config) {
int64_t K = k + 1;
auto ntotal = Count();
auto batch_size = 100;
size_t dim = config->d;
auto batch_size = 1000;
auto tail_batch_size = ntotal % batch_size;
auto batch_search_count = ntotal / batch_size;
auto total_search_count = tail_batch_size == 0 ? batch_search_count : batch_search_count + 1;
std::vector<float> res_dis(k * batch_size);
std::vector<float> res_dis(K * batch_size);
graph.resize(ntotal);
Graph res_vec(total_search_count);
for (int i = 0; i < total_search_count; ++i) {
auto b_size = i == total_search_count - 1 && tail_batch_size != 0 ? tail_batch_size : batch_size;
auto b_size = (i == (total_search_count - 1)) && tail_batch_size != 0 ? tail_batch_size : batch_size;
auto& res = res_vec[i];
res.resize(k * b_size);
res.resize(K * b_size);
auto xq = p_data + batch_size * dim * i;
search_impl(b_size, (float*)xq, k, res_dis.data(), res.data(), config);
auto xq = data + batch_size * dim * i;
search_impl(b_size, (float*)xq, K, res_dis.data(), res.data(), config);
int tmp = 0;
for (int j = 0; j < b_size; ++j) {
auto& node = graph[batch_size * i + j];
node.resize(k);
for (int m = 0; m < k && tmp < k * b_size; ++m, ++tmp) {
// TODO(linxj): avoid memcopy here.
node[m] = res[tmp];
auto start_pos = j * K + 1;
for (int m = 0, cursor = start_pos; m < k && cursor < start_pos + k; ++m, ++cursor) {
node[m] = res[cursor];
}
}
}
......
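The CPU `IVF::GenGraph` gets the same treatment as the GPU path above: the batch size grows from 100 to 1000, the search asks for `K = k + 1` results, and each node copies the slice starting at rank 1 so the self match is dropped. A tiny sketch of the per-row slice arithmetic (the helper name is made up):

```cpp
#include <cstdint>
#include <vector>

// Search returns K = k + 1 labels per query in a flat buffer; the graph keeps
// the k entries after the rank-0 self match.
std::vector<int64_t>
NeighborsOfRow(const std::vector<int64_t>& flat_labels, int64_t j, int64_t k) {
    const int64_t K = k + 1;          // k neighbours plus the query itself
    const int64_t start = j * K + 1;  // skip the self match at rank 0
    return std::vector<int64_t>(flat_labels.begin() + start, flat_labels.begin() + start + k);
}
```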
......@@ -57,7 +57,7 @@ class IVF : public VectorIndex, public FaissBaseIndex {
Search(const DatasetPtr& dataset, const Config& config) override;
void
GenGraph(const int64_t& k, Graph& graph, const DatasetPtr& dataset, const Config& config);
GenGraph(float* data, const int64_t& k, Graph& graph, const Config& config);
BinarySet
Serialize() override;
......
......@@ -21,6 +21,8 @@
#include "knowhere/common/Timer.h"
#ifdef MILVUS_GPU_VERSION
#include "knowhere/index/vector_index/IndexGPUIVF.h"
#include "knowhere/index/vector_index/IndexGPUIDMAP.h"
#include "knowhere/index/vector_index/helpers/Cloner.h"
#endif
#include "knowhere/index/vector_index/IndexIVF.h"
......@@ -110,6 +112,7 @@ NSG::Search(const DatasetPtr& dataset, const Config& config) {
IndexModelPtr
NSG::Train(const DatasetPtr& dataset, const Config& config) {
config->Dump();
auto build_cfg = std::dynamic_pointer_cast<NSGCfg>(config);
if (build_cfg != nullptr) {
build_cfg->CheckValid(); // throw exception
......@@ -117,23 +120,26 @@ NSG::Train(const DatasetPtr& dataset, const Config& config) {
// TODO(linxj): dev IndexFactory, support more IndexType
#ifdef MILVUS_GPU_VERSION
auto preprocess_index = std::make_shared<GPUIVF>(build_cfg->gpu_id);
// auto preprocess_index = std::make_shared<GPUIVF>(build_cfg->gpu_id);
#else
auto preprocess_index = std::make_shared<IVF>();
#endif
auto model = preprocess_index->Train(dataset, config);
preprocess_index->set_index_model(model);
preprocess_index->AddWithoutIds(dataset, config);
auto preprocess_index = std::make_shared<IDMAP>();
preprocess_index->Train(config);
preprocess_index->AddWithoutId(dataset, config);
float* raw_data = preprocess_index->GetRawVectors();
auto xx = cloner::CopyCpuToGpu(preprocess_index, 0, config);
auto ss = std::dynamic_pointer_cast<GPUIDMAP>(xx);
Graph knng;
preprocess_index->GenGraph(build_cfg->knng, knng, dataset, config);
ss->GenGraph(raw_data, build_cfg->knng, knng, config);
GETTENSOR(dataset)
algo::BuildParams b_params;
b_params.candidate_pool_size = build_cfg->candidate_pool_size;
b_params.out_degree = build_cfg->out_degree;
b_params.search_length = build_cfg->search_length;
GETTENSOR(dataset)
auto array = dataset->array()[0];
auto p_ids = array->data()->GetValues<int64_t>(1, 0);
......
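`NSG::Train` now builds the kNN graph with exact flat search instead of the previous `GPUIVF` approximate search: train a CPU `IDMAP`, add the vectors with sequential ids, clone the index to GPU as a `GPUIDMAP`, and run the batched `GenGraph` over the raw vectors. Below is a condensed sketch of that path, adapted from the commented-out test added further down in this commit; it assumes a GPU build and the knowhere header paths used elsewhere in this diff, and the wrapper function `BuildExactKnng` is illustrative only.

```cpp
#include <cstdint>
#include <memory>

#include "knowhere/index/vector_index/IndexGPUIDMAP.h"
#include "knowhere/index/vector_index/IndexIDMAP.h"
#include "knowhere/index/vector_index/helpers/Cloner.h"

knowhere::Graph
BuildExactKnng(const knowhere::DatasetPtr& dataset, const knowhere::Config& config, int64_t knng_k) {
    auto preprocess_index = std::make_shared<knowhere::IDMAP>();
    preprocess_index->Train(config);                  // flat index: Train only records dim/metric
    preprocess_index->AddWithoutId(dataset, config);  // ids are forced to 0..n-1
    float* raw_data = preprocess_index->GetRawVectors();

    // clone the flat index to GPU and run the batched exact k-NN search there
    auto gpu_index = knowhere::cloner::CopyCpuToGpu(preprocess_index, /*device_id=*/0, config);
    auto gpu_idmap = std::dynamic_pointer_cast<knowhere::GPUIDMAP>(gpu_index);

    knowhere::Graph knng;
    gpu_idmap->GenGraph(raw_data, knng_k, knng, config);
    return knng;
}
```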
......@@ -18,7 +18,6 @@
#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <stack>
#include <utility>
......@@ -29,12 +28,13 @@
#include "knowhere/index/vector_index/nsg/NSG.h"
#include "knowhere/index/vector_index/nsg/NSGHelper.h"
// TODO: enable macro
//#include <gperftools/profiler.h>
namespace knowhere {
namespace algo {
unsigned int seed = 100;
NsgIndex::NsgIndex(const size_t& dimension, const size_t& n, METRICTYPE metric)
: dimension(dimension), ntotal(n), metric_type(metric) {
switch (metric) {
......@@ -55,8 +55,6 @@ NsgIndex::~NsgIndex() {
void
NsgIndex::Build_with_ids(size_t nb, const float* data, const int64_t* ids, const BuildParams& parameters) {
TimeRecorder rc("NSG");
ntotal = nb;
ori_data_ = new float[ntotal * dimension];
ids_ = new int64_t[ntotal];
......@@ -67,25 +65,17 @@ NsgIndex::Build_with_ids(size_t nb, const float* data, const int64_t* ids, const
out_degree = parameters.out_degree;
candidate_pool_size = parameters.candidate_pool_size;
TimeRecorder rc("NSG", 1);
InitNavigationPoint();
rc.RecordSection("init");
Link();
rc.RecordSection("Link");
//>> Debug code
/////
// int count = 0;
// for (int i = 0; i < ntotal; ++i) {
// count += nsg[i].size();
//}
/////
CheckConnectivity();
rc.RecordSection("Connect");
//>> Debug code
///
int total_degree = 0;
for (size_t i = 0; i < ntotal; ++i) {
total_degree += nsg[i].size();
......@@ -93,9 +83,17 @@ NsgIndex::Build_with_ids(size_t nb, const float* data, const int64_t* ids, const
KNOWHERE_LOG_DEBUG << "Graph physical size: " << total_degree * sizeof(node_t) / 1024 / 1024 << "m";
KNOWHERE_LOG_DEBUG << "Average degree: " << total_degree / ntotal;
/////
is_trained = true;
// Debug code
// for (size_t i = 0; i < ntotal; i++) {
// auto& x = nsg[i];
// for (size_t j = 0; j < x.size(); j++) {
// std::cout << "id: " << x[j] << std::endl;
// }
// std::cout << std::endl;
// }
}
void
......@@ -114,28 +112,22 @@ NsgIndex::InitNavigationPoint() {
}
// select navigation point
std::vector<Neighbor> resset, fullset;
unsigned int seed = 100;
std::vector<Neighbor> resset;
navigation_point = rand_r(&seed) % ntotal; // random initialize navigating point
//>> Debug code
/////
// navigation_point = drand48();
/////
GetNeighbors(center, resset, knng);
navigation_point = resset[0].id;
//>> Debug code
/////
// Debug code
// std::cout << "ep: " << navigation_point << std::endl;
/////
//>> Debug code
/////
// for (int k = 0; k < resset.size(); ++k) {
// std::cout << "id: " << resset[k].id << ", dis: " << resset[k].distance << std::endl;
// }
// std::cout << std::endl;
//
// std::cout << "ep: " << navigation_point << std::endl;
//
// float r1 = distance_->Compare(center, ori_data_ + navigation_point * dimension, dimension);
// assert(r1 == resset[0].distance);
/////
}
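The navigation point is no longer a random row: `InitNavigationPoint` now queries the kNN graph with the dataset centroid and takes the closest point as the entry point. The standalone sketch below shows the same idea by brute force; the helper name is made up, and the real code reuses `GetNeighbors` on `knng` for the nearest-to-centroid lookup.

```cpp
#include <cstddef>
#include <limits>
#include <vector>

size_t
CentroidNearestPoint(const std::vector<float>& data, size_t dim) {
    const size_t n = data.size() / dim;
    std::vector<float> center(dim, 0.f);
    for (size_t i = 0; i < n; ++i)
        for (size_t j = 0; j < dim; ++j) center[j] += data[i * dim + j] / n;

    size_t best = 0;
    float best_dist = std::numeric_limits<float>::max();
    for (size_t i = 0; i < n; ++i) {
        float d = 0.f;
        for (size_t j = 0; j < dim; ++j) {
            float diff = data[i * dim + j] - center[j];
            d += diff * diff;
        }
        if (d < best_dist) {
            best_dist = d;
            best = i;
        }
    }
    return best;  // plays the role of navigation_point
}
```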
// Specify Link
......@@ -149,7 +141,9 @@ NsgIndex::GetNeighbors(const float* query, std::vector<Neighbor>& resset, std::v
// TODO: throw exception here.
}
std::vector<node_t> init_ids;
resset.resize(search_length);
std::vector<node_t> init_ids(buffer_size);
// std::vector<node_t> init_ids;
{
/*
......@@ -158,25 +152,26 @@ NsgIndex::GetNeighbors(const float* query, std::vector<Neighbor>& resset, std::v
size_t count = 0;
// Get all neighbors
for (size_t i = 0; i < graph[navigation_point].size(); ++i) {
init_ids.push_back(graph[navigation_point][i]);
for (size_t i = 0; i < init_ids.size() && i < graph[navigation_point].size(); ++i) {
// for (size_t i = 0; i < graph[navigation_point].size(); ++i) {
// init_ids.push_back(graph[navigation_point][i]);
init_ids[i] = graph[navigation_point][i];
has_calculated_dist[init_ids[i]] = true;
++count;
}
unsigned int seed = 100;
while (count < buffer_size) {
node_t id = rand_r(&seed) % ntotal;
if (has_calculated_dist[id])
continue; // duplicate id
init_ids.push_back(id);
// init_ids.push_back(id);
init_ids[count] = id;
++count;
has_calculated_dist[id] = true;
}
}
{
resset.resize(init_ids.size());
// resset.resize(init_ids.size());
// init resset and sort by distance
for (size_t i = 0; i < init_ids.size(); ++i) {
......@@ -190,7 +185,7 @@ NsgIndex::GetNeighbors(const float* query, std::vector<Neighbor>& resset, std::v
float dist = distance_->Compare(ori_data_ + dimension * id, query, dimension);
resset[i] = Neighbor(id, dist, false);
///////////// difference from other GetNeighbors ///////////////
//// difference from other GetNeighbors
fullset.push_back(resset[i]);
///////////////////////////////////////
}
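All three `GetNeighbors` overloads switch from `push_back` into an unsized `init_ids` to a fixed pool of exactly `buffer_size` candidates, pre-sized together with `resset`, and the random padding now uses the file-scope seed so builds are reproducible. A self-contained sketch of that initialisation pattern (the helper and parameter names are illustrative, not knowhere API; it assumes `buffer_size <= ntotal`, which the real code checks earlier):

```cpp
#include <cstddef>
#include <cstdlib>
#include <vector>

#include <boost/dynamic_bitset.hpp>

std::vector<size_t>
InitCandidatePool(const std::vector<size_t>& nav_neighbors, size_t buffer_size, size_t ntotal) {
    static unsigned int seed = 100;  // deterministic padding, mirroring the file-scope seed
    std::vector<size_t> init_ids(buffer_size);
    boost::dynamic_bitset<> visited(ntotal, 0);

    size_t count = 0;
    // first fill from the navigation point's neighbours, never past the pool size
    for (size_t i = 0; i < init_ids.size() && i < nav_neighbors.size(); ++i) {
        init_ids[count++] = nav_neighbors[i];
        visited[nav_neighbors[i]] = true;
    }
    // pad the remainder with unvisited random ids
    while (count < buffer_size) {
        size_t id = rand_r(&seed) % ntotal;
        if (visited[id])
            continue;  // skip duplicates
        init_ids[count++] = id;
        visited[id] = true;
    }
    return init_ids;
}
```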
......@@ -247,8 +242,10 @@ NsgIndex::GetNeighbors(const float* query, std::vector<Neighbor>& resset, std::v
// TODO: throw exception here.
}
std::vector<node_t> init_ids;
boost::dynamic_bitset<> has_calculated_dist{ntotal, 0}; // TODO: ?
// std::vector<node_t> init_ids;
std::vector<node_t> init_ids(buffer_size);
resset.resize(buffer_size);
boost::dynamic_bitset<> has_calculated_dist{ntotal, 0};
{
/*
......@@ -257,24 +254,26 @@ NsgIndex::GetNeighbors(const float* query, std::vector<Neighbor>& resset, std::v
size_t count = 0;
// Get all neighbors
for (size_t i = 0; i < graph[navigation_point].size(); ++i) {
init_ids.push_back(graph[navigation_point][i]);
for (size_t i = 0; i < init_ids.size() && i < graph[navigation_point].size(); ++i) {
// for (size_t i = 0; i < graph[navigation_point].size(); ++i) {
// init_ids.push_back(graph[navigation_point][i]);
init_ids[i] = graph[navigation_point][i];
has_calculated_dist[init_ids[i]] = true;
++count;
}
unsigned int seed = 100;
while (count < buffer_size) {
node_t id = rand_r(&seed) % ntotal;
if (has_calculated_dist[id])
continue; // duplicate id
init_ids.push_back(id);
// init_ids.push_back(id);
init_ids[count] = id;
++count;
has_calculated_dist[id] = true;
}
}
{
resset.resize(init_ids.size());
// resset.resize(init_ids.size());
// init resset and sort by distance
for (size_t i = 0; i < init_ids.size(); ++i) {
......@@ -333,13 +332,15 @@ NsgIndex::GetNeighbors(const float* query, std::vector<Neighbor>& resset, std::v
void
NsgIndex::GetNeighbors(const float* query, std::vector<Neighbor>& resset, Graph& graph, SearchParams* params) {
size_t& buffer_size = params ? params->search_length : search_length;
size_t buffer_size = params ? params->search_length : search_length;
if (buffer_size > ntotal) {
// TODO: throw exception here.
}
std::vector<node_t> init_ids;
// std::vector<node_t> init_ids;
std::vector<node_t> init_ids(buffer_size);
resset.resize(buffer_size);
boost::dynamic_bitset<> has_calculated_dist{ntotal, 0};
{
......@@ -349,33 +350,33 @@ NsgIndex::GetNeighbors(const float* query, std::vector<Neighbor>& resset, Graph&
size_t count = 0;
// Get all neighbors
for (size_t i = 0; i < graph[navigation_point].size(); ++i) {
init_ids.push_back(graph[navigation_point][i]);
for (size_t i = 0; i < init_ids.size() && i < graph[navigation_point].size(); ++i) {
// for (size_t i = 0; i < graph[navigation_point].size(); ++i) {
// init_ids.push_back(graph[navigation_point][i]);
init_ids[i] = graph[navigation_point][i];
has_calculated_dist[init_ids[i]] = true;
++count;
}
unsigned int seed = 100;
while (count < buffer_size) {
node_t id = rand_r(&seed) % ntotal;
if (has_calculated_dist[id])
continue; // duplicate id
init_ids.push_back(id);
// init_ids.push_back(id);
init_ids[count] = id;
++count;
has_calculated_dist[id] = true;
}
}
{
resset.resize(init_ids.size());
// resset.resize(init_ids.size());
// init resset and sort by distance
for (size_t i = 0; i < init_ids.size(); ++i) {
node_t id = init_ids[i];
// assert(id < ntotal);
if (id >= static_cast<node_t>(ntotal)) {
KNOWHERE_THROW_MSG("Build Index Error, id > ntotal");
continue;
}
float dist = distance_->Compare(ori_data_ + id * dimension, query, dimension);
......@@ -383,13 +384,6 @@ NsgIndex::GetNeighbors(const float* query, std::vector<Neighbor>& resset, Graph&
}
std::sort(resset.begin(), resset.end()); // sort by distance
//>> Debug code
/////
// for (int j = 0; j < buffer_size; ++j) {
// std::cout << "resset_id: " << resset[j].id << ", resset_dist: " << resset[j].distance << std::endl;
//}
/////
// search nearest neighbor
size_t cursor = 0;
while (cursor < buffer_size) {
......@@ -410,7 +404,8 @@ NsgIndex::GetNeighbors(const float* query, std::vector<Neighbor>& resset, Graph&
if (dist >= resset[buffer_size - 1].distance)
continue;
///////////// difference from other GetNeighbors ///////////////
//// difference from other GetNeighbors
Neighbor nn(id, dist, false);
///////////////////////////////////////
......@@ -440,59 +435,50 @@ NsgIndex::GetNeighbors(const float* query, std::vector<Neighbor>& resset, Graph&
void
NsgIndex::Link() {
auto cut_graph_dist = new float[ntotal * out_degree];
float* cut_graph_dist = new float[ntotal * out_degree];
nsg.resize(ntotal);
#pragma omp parallel
{
std::vector<Neighbor> fullset;
std::vector<Neighbor> temp;
boost::dynamic_bitset<> flags{ntotal, 0}; // TODO: ?
boost::dynamic_bitset<> flags{ntotal, 0};
#pragma omp for schedule(dynamic, 100)
for (size_t n = 0; n < ntotal; ++n) {
fullset.clear();
temp.clear();
flags.reset();
GetNeighbors(ori_data_ + dimension * n, temp, fullset, flags);
//>> Debug code
/////
// float r1 = distance_->Compare(ori_data_ + n * dimension, ori_data_ + temp[0].id * dimension, dimension);
// assert(r1 == temp[0].distance);
/////
SyncPrune(n, fullset, flags, cut_graph_dist);
}
// Debug code
// std::cout << "ep: " << 0 << std::endl;
// for (int k = 0; k < fullset.size(); ++k) {
// std::cout << "id: " << fullset[k].id << ", dis: " << fullset[k].distance << std::endl;
// }
}
//>> Debug code
/////
// auto bak_nsg = nsg;
/////
// Debug code
// for (size_t i = 0; i < ntotal; i++)
// {
// auto& x = nsg[i];
// for (size_t j=0; j < x.size(); j++)
// {
// std::cout << "id: " << x[j] << std::endl;
// }
// std::cout << std::endl;
// }
knng.clear();
knng.shrink_to_fit();
std::vector<std::mutex> mutex_vec(ntotal);
#pragma omp for schedule(dynamic, 100)
for (unsigned n = 0; n < ntotal; ++n) {
InterInsert(n, mutex_vec, cut_graph_dist);
}
delete[] cut_graph_dist;
//>> Debug code
/////
// int count = 0;
// for (int i = 0; i < ntotal; ++i) {
// if (bak_nsg[i].size() != nsg[i].size()) {
// //count += nsg[i].size() - bak_nsg[i].size();
// count += nsg[i].size();
// }
//}
/////
for (size_t i = 0; i < ntotal; ++i) {
nsg[i].shrink_to_fit();
}
delete[] cut_graph_dist;
}
void
......@@ -654,9 +640,9 @@ NsgIndex::DFS(size_t root, boost::dynamic_bitset<>& has_linked, int64_t& linked_
std::stack<size_t> s;
s.push(root);
if (!has_linked[root]) {
linked_count++; // not link
has_linked[root] = true; // link start...
linked_count++; // not link
}
has_linked[root] = true; // link start...
while (!s.empty()) {
size_t next = ntotal + 1;
......@@ -709,7 +695,6 @@ NsgIndex::FindUnconnectedNode(boost::dynamic_bitset<>& has_linked, int64_t& root
}
}
if (found == 0) {
unsigned int seed = 100;
while (true) { // random a linked-node and add unlinked-node as its neighbor
size_t rid = rand_r(&seed) % ntotal;
if (has_linked[rid]) {
......@@ -726,7 +711,10 @@ NsgIndex::Search(const float* query, const unsigned& nq, const unsigned& dim, co
int64_t* ids, SearchParams& params) {
std::vector<std::vector<Neighbor>> resset(nq);
params.search_length = k;
if (k >= 45) {
params.search_length = k;
}
TimeRecorder rc("NsgIndex::search", 1);
// TODO(linxj): when to use openmp
if (nq <= 4) {
......@@ -734,7 +722,7 @@ NsgIndex::Search(const float* query, const unsigned& nq, const unsigned& dim, co
} else {
#pragma omp parallel for
for (unsigned int i = 0; i < nq; ++i) {
auto single_query = query + i * dim;
const float* single_query = query + i * dim;
GetNeighbors(single_query, resset[i], nsg, &params);
}
}
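`Search` no longer shrinks the candidate pool to `k` unconditionally: the pool is set to `k` only when `k >= 45`, otherwise the configured `search_length` (at least 50 after the conf adapter change below) is kept, which is the other half of the recall fix. The rule in isolation, with the threshold hard-coded as in this commit and an illustrative helper name:

```cpp
#include <cstddef>

size_t
EffectiveSearchLength(size_t configured_search_length, size_t k) {
    return (k >= 45) ? k : configured_search_length;
}
```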
......@@ -759,13 +747,6 @@ NsgIndex::Search(const float* query, const unsigned& nq, const unsigned& dim, co
}
rc.RecordSection("merge");
//>> Debug: test single insert
// int x_0 = resset[0].size();
// for (int l = 0; l < resset[0].size(); ++l) {
// resset[0].pop_back();
//}
// resset.clear();
// ProfilerStart("xx.prof");
// std::vector<Neighbor> resset;
// GetNeighbors(query, resset, nsg, &params);
......@@ -781,30 +762,5 @@ NsgIndex::SetKnnGraph(Graph& g) {
knng = std::move(g);
}
// void NsgIndex::GetKnnGraphFromFile() {
// //std::string filename = "sift.1M.50NN.graph";
// std::string filename = "sift.50NN.graph";
//
// std::ifstream in(filename, std::ios::binary);
// unsigned k;
// in.read((char *) &k, sizeof(unsigned));
// in.seekg(0, std::ios::end);
// std::ios::pos_type ss = in.tellg();
// size_t fsize = (size_t) ss;
// size_t num = (unsigned) (fsize / (k + 1) / 4);
// in.seekg(0, std::ios::beg);
//
// knng.resize(num);
// knng.reserve(num);
// unsigned kk = (k + 3) / 4 * 4;
// for (size_t i = 0; i < num; i++) {
// in.seekg(4, std::ios::cur);
// knng[i].resize(k);
// knng[i].reserve(kk);
// in.read((char *) knng[i].data(), k * sizeof(unsigned));
// }
// in.close();
//}
} // namespace algo
} // namespace knowhere
......@@ -52,7 +52,7 @@ class NsgIndex {
Distance* distance_;
float* ori_data_;
int64_t* ids_; // TODO: support different type
int64_t* ids_;
Graph nsg; // final graph
Graph knng; // reset after build
......@@ -134,9 +134,6 @@ class NsgIndex {
void
FindUnconnectedNode(boost::dynamic_bitset<>& flags, int64_t& root);
// private:
// void GetKnnGraphFromFile();
};
} // namespace algo
......
......@@ -23,6 +23,8 @@
#include "knowhere/index/vector_index/IndexNSG.h"
#ifdef MILVUS_GPU_VERSION
#include "knowhere/index/vector_index/helpers/FaissGpuResourceMgr.h"
#include "knowhere/index/vector_index/IndexGPUIDMAP.h"
#include "knowhere/index/vector_index/helpers/Cloner.h"
#endif
#include "knowhere/common/Timer.h"
......@@ -113,3 +115,173 @@ TEST_F(NSGInterfaceTest, comparetest) {
}
tc.RecordSection("IP");
}
//#include <src/index/knowhere/knowhere/index/vector_index/nsg/OriNSG.h>
//TEST(test, ori_nsg) {
// // float* p_data = nullptr;
// size_t rows, dim;
// char* filename = "/mnt/112d53a6-5592-4360-a33b-7fd789456fce/workspace/Data/sift/sift_base.fvecs";
// // loads_data(filename, p_data, rows, dim);
// float* p_data = fvecs_read(filename, &dim, &rows);
//
// std::string knng_filename =
// "/mnt/112d53a6-5592-4360-a33b-7fd789456fce/workspace/Cellar/anns/efanna_graph/tests/sift.1M.50NN.graph";
// std::vector<std::vector<int64_t>> knng;
// Load_nns_graph(knng, knng_filename.c_str());
//
// // float* search_data = nullptr;
// size_t nq, search_dim;
// char* searchfile = "/mnt/112d53a6-5592-4360-a33b-7fd789456fce/workspace/Data/sift/sift_query.fvecs";
// // loads_data(searchfile, search_data, nq, search_dim);
// float* search_data = fvecs_read(searchfile, &search_dim, &nq);
// assert(search_dim == dim);
//
// size_t k, nq2;
// char* gtfile = "/mnt/112d53a6-5592-4360-a33b-7fd789456fce/workspace/Data/sift/sift_groundtruth.ivecs";
// int* gt_int = ivecs_read(gtfile, &k, &nq2);
// int64_t* gt = new int64_t[k * nq2];
// for (int i = 0; i < k * nq2; i++) {
// gt[i] = gt_int[i];
// }
// delete[] gt_int;
//
// std::vector<int64_t> store_ids(rows);
// for (int i = 0; i < rows; ++i) {
// store_ids[i] = i;
// }
//
// int64_t* I = new int64_t[nq * k];
// float* D = new float[nq * k];
//#if 0
// efanna2e::Parameters params;
// params.Set<int64_t>("L", 50);
// params.Set<int64_t>("R", 55);
// params.Set<int64_t>("C", 300);
// auto orinsg = std::make_shared<efanna2e::IndexNSG>(dim, rows, efanna2e::Metric::L2, nullptr);
// orinsg->Load_nn_graph(knng);
// orinsg->Build(rows, (float*)p_data, params);
//
// efanna2e::Parameters paras;
// paras.Set<unsigned>("L_search", 45);
// paras.Set<unsigned>("P_search",100);
// k = 10;
// std::vector<std::vector<int64_t> > res;
// for (unsigned i = 0; i < nq; i++) {
// std::vector<int64_t> tmp(k);
// orinsg->Search(search_data + i * dim, p_data, k, paras, tmp.data());
// res.push_back(tmp);
// }
// }
//#else
// knowhere::algo::BuildParams params;
// params.search_length = 50;
// params.out_degree = 55;
// params.candidate_pool_size = 300;
// auto nsg = std::make_shared<knowhere::algo::NsgIndex>(dim, rows);
//#if 1
// knowhere::FaissGpuResourceMgr::GetInstance().InitDevice(DEVICEID, 1024 * 1024 * 200, 1024 * 1024 * 600, 2);
// auto dataset = generate_dataset(int64_t(rows), int64_t(dim), p_data, store_ids.data());
// auto config = std::make_shared<knowhere::IVFCfg>();
// config->d = dim;
// config->gpu_id = 0;
// config->metric_type = knowhere::METRICTYPE::L2;
// auto preprocess_index = std::make_shared<knowhere::IDMAP>();
// preprocess_index->Train(config);
// preprocess_index->AddWithoutId(dataset, config);
// auto xx = knowhere::cloner::CopyCpuToGpu(preprocess_index, 0, config);
// auto ss = std::dynamic_pointer_cast<knowhere::GPUIDMAP>(xx);
//
// std::vector<std::vector<int64_t>> kng;
// ss->GenGraph(p_data, 50, kng, config);
// nsg->SetKnnGraph(kng);
// knowhere::FaissGpuResourceMgr::GetInstance().Free();
//#else
// nsg->SetKnnGraph(knng);
//#endif
// nsg->Build_with_ids(rows, (float*)p_data, store_ids.data(), params);
// knowhere::algo::SearchParams s_params;
// s_params.search_length = 45;
// nsg->Search(search_data, nq, dim, k, D, I, s_params);
//#endif
//
// int n_1 = 0, n_10 = 0, n_100 = 0;
// for (int i = 0; i < nq; i++) {
// int gt_nn = gt[i * k];
// for (int j = 0; j < k; j++) {
// if (I[i * k + j] == gt_nn) {
// if (j < 1)
// n_1++;
// if (j < 10)
// n_10++;
// if (j < 100)
// n_100++;
// }
// }
// }
// printf("R@1 = %.4f\n", n_1 / float(nq));
// printf("R@10 = %.4f\n", n_10 / float(nq));
// printf("R@100 = %.4f\n", n_100 / float(nq));
//}
//
//TEST(testxx, test_idmap){
// int k = 50;
// std::string knng_filename =
// "/mnt/112d53a6-5592-4360-a33b-7fd789456fce/workspace/Cellar/anns/efanna_graph/tests/sift.50NN.graph";
// std::vector<std::vector<int64_t>> gt_knng;
// Load_nns_graph(gt_knng, knng_filename.c_str());
//
// size_t rows, dim;
// char* filename = "/mnt/112d53a6-5592-4360-a33b-7fd789456fce/workspace/Cellar/anns/efanna_graph/tests/siftsmall/siftsmall_base.fvecs";
// float* p_data = fvecs_read(filename, &dim, &rows);
//
// std::vector<int64_t> store_ids(rows);
// for (int i = 0; i < rows; ++i) {
// store_ids[i] = i;
// }
//
// knowhere::FaissGpuResourceMgr::GetInstance().InitDevice(DEVICEID, 1024 * 1024 * 200, 1024 * 1024 * 600, 2);
// auto dataset = generate_dataset(int64_t(rows), int64_t(dim), p_data, store_ids.data());
// auto config = std::make_shared<knowhere::IVFCfg>();
// config->d = dim;
// config->gpu_id = 0;
// config->metric_type = knowhere::METRICTYPE::L2;
// auto preprocess_index = std::make_shared<knowhere::IDMAP>();
// preprocess_index->Train(config);
// preprocess_index->AddWithoutId(dataset, config);
// auto xx = knowhere::cloner::CopyCpuToGpu(preprocess_index, 0, config);
// auto ss = std::dynamic_pointer_cast<knowhere::GPUIDMAP>(xx);
// std::vector<std::vector<int64_t>> idmap_knng;
// ss->GenGraph(p_data, k, idmap_knng,config);
// knowhere::FaissGpuResourceMgr::GetInstance().Free();
//
// int n_1 = 0, n_10 = 0, n_100 = 0;
// for (int i = 0; i < rows; i++) {
// int gt_nn = gt_knng[i][0];
// int l_n_1 = 0;
// int l_n_10 = 0;
// int l_n_100 = 0;
// for (int j = 0; j < k; j++) {
// if (idmap_knng[i][j] == gt_nn) {
// if (j < 1){
// n_1++;
// l_n_1++;
// }
// if (j < 10){
// n_10++;
// l_n_10++;
// }
// if (j < 100){
// n_100++;
// l_n_100++;
// }
//
// }
// if ((j == k-1) && (l_n_100 == 0)){
// std::cout << "error id: " << i << std::endl;
// }
// }
// }
// printf("R@1 = %.4f\n", n_1 / float(rows));
// printf("R@10 = %.4f\n", n_10 / float(rows));
// printf("R@100 = %.4f\n", n_100 / float(rows));
//}
......@@ -178,3 +178,72 @@ PrintResult(const knowhere::DatasetPtr& result, const int& nq, const int& k) {
std::cout << "id\n" << ss_id.str() << std::endl;
std::cout << "dist\n" << ss_dist.str() << std::endl;
}
void
Load_nns_graph(std::vector<std::vector<int64_t>>& final_graph, const char* filename) {
std::vector<std::vector<unsigned>> knng;
std::ifstream in(filename, std::ios::binary);
unsigned k;
in.read((char*)&k, sizeof(unsigned));
in.seekg(0, std::ios::end);
std::ios::pos_type ss = in.tellg();
size_t fsize = (size_t)ss;
size_t num = (size_t)(fsize / (k + 1) / 4);
in.seekg(0, std::ios::beg);
knng.resize(num);
knng.reserve(num);
int64_t kk = (k + 3) / 4 * 4;
for (size_t i = 0; i < num; i++) {
in.seekg(4, std::ios::cur);
knng[i].resize(k);
knng[i].reserve(kk);
in.read((char*)knng[i].data(), k * sizeof(unsigned));
}
in.close();
final_graph.resize(knng.size());
for (int i = 0; i < knng.size(); ++i) {
final_graph[i].resize(knng[i].size());
for (int j = 0; j < knng[i].size(); ++j) {
final_graph[i][j] = knng[i][j];
}
}
}
float*
fvecs_read(const char* fname, size_t* d_out, size_t* n_out) {
FILE* f = fopen(fname, "r");
if (!f) {
fprintf(stderr, "could not open %s\n", fname);
perror("");
abort();
}
int d;
fread(&d, 1, sizeof(int), f);
assert((d > 0 && d < 1000000) || !"unreasonable dimension");
fseek(f, 0, SEEK_SET);
struct stat st;
fstat(fileno(f), &st);
size_t sz = st.st_size;
assert(sz % ((d + 1) * 4) == 0 || !"weird file size");
size_t n = sz / ((d + 1) * 4);
*d_out = d;
*n_out = n;
float* x = new float[n * (d + 1)];
size_t nr = fread(x, sizeof(float), n * (d + 1), f);
assert(nr == n * (d + 1) || !"could not read whole file");
// shift array to remove row headers
for (size_t i = 0; i < n; i++) memmove(x + i * d, x + 1 + i * (d + 1), d * sizeof(*x));
fclose(f);
return x;
}
int* // not very clean, but works as long as sizeof(int) == sizeof(float)
ivecs_read(const char* fname, size_t* d_out, size_t* n_out) {
return (int*)fvecs_read(fname, d_out, n_out);
}
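`fvecs_read` expects the standard .fvecs layout — every row is a 4-byte int dimension header followed by `dim` 4-byte floats — derives the row count from the file size, and strips the headers in place; `ivecs_read` reuses it because ints and floats are both 4 bytes. A small companion writer in the same layout can be handy for exercising the reader in tests without the SIFT files referenced in the commented-out code; the name and signature below are made up for illustration.

```cpp
#include <cstdint>
#include <cstdio>

bool
fvecs_write(const char* fname, const float* x, size_t d, size_t n) {
    FILE* f = fopen(fname, "wb");
    if (!f) return false;
    const int32_t dim = static_cast<int32_t>(d);
    for (size_t i = 0; i < n; ++i) {
        fwrite(&dim, sizeof(int32_t), 1, f);     // row header: dimension
        fwrite(x + i * d, sizeof(float), d, f);  // row payload: d floats
    }
    fclose(f);
    return true;
}
```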
......@@ -93,3 +93,12 @@ struct FileIOReader {
size_t
operator()(void* ptr, size_t size);
};
void
Load_nns_graph(std::vector<std::vector<int64_t>>& final_graph_, const char* filename);
float*
fvecs_read(const char* fname, size_t* d_out, size_t* n_out);
int*
ivecs_read(const char* fname, size_t* d_out, size_t* n_out);
\ No newline at end of file
......@@ -201,10 +201,11 @@ NSGConfAdapter::Match(const TempMetaConf& metaconf) {
auto scale_factor = round(metaconf.dim / 128.0);
scale_factor = scale_factor >= 4 ? 4 : scale_factor;
conf->nprobe = int64_t(conf->nlist * 0.01);
conf->knng = 40 + 10 * scale_factor; // the size of knng
conf->search_length = 40 + 5 * scale_factor;
// conf->knng = 40 + 10 * scale_factor; // the size of knng
conf->knng = 50;
conf->search_length = 50 + 5 * scale_factor;
conf->out_degree = 50 + 5 * scale_factor;
conf->candidate_pool_size = 200 + 100 * scale_factor;
conf->candidate_pool_size = 300;
MatchBase(conf);
return conf;
}
......
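The NSG defaults change with this fix: `knng` is pinned to 50 and `candidate_pool_size` to 300, while `search_length` and `out_degree` still scale with the vector dimension through `scale_factor = min(round(dim / 128), 4)`. The rule in isolation (struct and function names are illustrative only):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

struct NsgBuildParams {
    int64_t knng, search_length, out_degree, candidate_pool_size;
};

NsgBuildParams
MatchNsgParams(int64_t dim) {
    auto scale_factor = static_cast<int64_t>(std::round(dim / 128.0));
    scale_factor = std::min<int64_t>(scale_factor, 4);
    return {50, 50 + 5 * scale_factor, 50 + 5 * scale_factor, 300};
}

int
main() {
    auto p = MatchNsgParams(512);  // 512-d vectors -> scale_factor = 4
    std::cout << "knng=" << p.knng << " search_length=" << p.search_length
              << " out_degree=" << p.out_degree << " pool=" << p.candidate_pool_size << std::endl;
    return 0;
}
```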