未验证 提交 680c0c9e 编写于 作者: W Wei Shengyu 提交者: GitHub

Merge pull request #761 from FredHuang16/develop_reg

add vector search
# Compiler used to build the shared library. Override it on the command line
# (e.g. `make CXX=g++-9 index.so`) when g++-5 is not installed.
CXX=/usr/bin/g++-5

.PHONY: all

# Default target: build the shared library that interface.py loads.
all : index.so

# Rebuilt whenever the C interface or one of the listed headers changes.
index.so : src/config.h src/graph.h src/data.h interface.cc
	$(CXX) -shared -fPIC interface.cc -o index.so -std=c++11 -Ofast -march=native -g -flto -funroll-loops -DOMP -fopenmp
# 向量检索
## 简介
一些垂域识别任务(如车辆、商品等)需要识别的类别数较大,往往采用基于检索的方式,通过查询向量与底库向量进行快速的最近邻搜索,获得匹配的预测类别。向量检索模块提供基础的近似最近邻搜索算法,基于百度自研的Möbius算法,一种基于图的近似最近邻搜索算法,用于最大内积搜索 (MIPS)。 该模块提供python接口,支持numpy和 tensor类型向量,支持L2和Inner Product距离计算。
Möbius 算法细节详见论文 [Möbius Transformation for Fast Inner Product Search on Graph](http://research.baidu.com/Public/uploads/5e189d36b5cf6.PDF),代码见 [Code](https://github.com/sunbelbd/mobius)。
## 安装
若index.so不可用,在项目目录下运行以下命令生成新的index.so文件
make index.so
编译环境: g++ 5.4.0 , 9.3.0. 其他版本也可能工作。 请确保您的 C++ 编译器支持 C++11 标准。
## 快速使用
import numpy as np
from interface import Graph_Index
# Generate random sample data: 100000 gallery vectors, one query, one doc id per vector
index_vectors = np.random.rand(100000,128).astype(np.float32)
query_vector = np.random.rand(128).astype(np.float32)
index_docs = ["ID_"+str(i) for i in range(100000)]
# Initialise the index structure
indexer = Graph_Index(dist_type="IP") # supports "IP" and "L2"
indexer.build(gallery_vectors=index_vectors, gallery_docs=index_docs, pq_size=100, index_path='test')
# Query
scores, docs = indexer.search(query=query_vector, return_k=10, search_budget=100)
print(scores)
print(docs)
# Save and load
indexer.dump(index_path="test")
indexer.load(index_path="test")
from .interface import Graph_Index
#MIT License
#
#Copyright (c) 2021 Mobius Authors
#
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:
#The above copyright notice and this permission notice shall be included in all
#copies or substantial portions of the Software.
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
#SOFTWARE.
#from https://github.com/sunbelbd/mobius/blob/e2d166547d61d791da8f06747a63b9cd38f02c71/main.cc
#include<stdio.h>
#include<string.h>
#include <iostream>
#include <fstream>
#include <queue>
#include <chrono>
#include <unordered_set>
#include <unordered_map>
#include<stdlib.h>
#include<memory>
#include<vector>
#include<functional>
#include"src/data.h"
#include"src/graph.h"
// Pair of type-erased handles describing one loaded index. The load_* functions
// in this file fill both members; the search/save/release functions cast them
// back to GraphWrapper* and Data*.
struct IndexContext{
void* graph; // GraphWrapper* stored by load_*_index_prefix
void* data; // Data* stored by load_*_index_prefix
};
// File-scope settings. None of the functions visible in this file read them.
int topk = 0;
int display_topk = 1;
int build_idx_offset = 0;
int query_idx_offset = 0;
// Inserts every buffered (index, point) pair into `graph` through
// add_vertex_lock() and then empties the buffer. The loop carries an OpenMP
// `parallel for` pragma, so iterations may run concurrently when OpenMP is
// enabled at compile time.
void flush_add_buffer(
        std::vector<std::pair<idx_t,std::vector<std::pair<int,value_t>>>>& add_buffer,
        GraphWrapper* graph){
    #pragma omp parallel for
    for(int pos = 0;pos < add_buffer.size();++pos){
        auto& entry = add_buffer[pos];
        graph->add_vertex_lock(entry.first,entry.second);
    }
    add_buffer.clear();
}
extern "C"{
// for mobius IP index
void build_mobius_index(float* dense_mat,int row,int dim, int pq_size, double mobius_pow , const char* prefix){
std::unique_ptr<Data> data;
std::unique_ptr<Data> data_original;
std::unique_ptr<GraphWrapper> graph;
int topk = 0;
int display_topk = 1;
int build_idx_offset = 0;
int query_idx_offset = 0;
++row;
data = std::unique_ptr<Data>(new Data(row,dim));
graph = std::unique_ptr<GraphWrapper>(new FixedDegreeGraph<3>(data.get()));
graph->set_construct_pq_size(pq_size);
std::vector<std::pair<idx_t,std::vector<std::pair<int,value_t>>>> add_buffer;
((FixedDegreeGraph<3>*)graph.get())->get_data()->mobius_pow = mobius_pow;
data_original = std::unique_ptr<Data>(new Data(row,dim));
std::vector<std::pair<int,value_t>> dummy_mobius_point;
for(int i = 0;i < dim;++i)
dummy_mobius_point.push_back(std::make_pair(i,0));
//idx += build_idx_offset;
for(int i = 0;i < row - 1;++i){
std::vector<std::pair<int,value_t>> point;
point.reserve(dim);
for(int j = 0;j < dim;++j)
point.push_back(std::make_pair(j,dense_mat[i * dim + j]));
data_original->add(i,point);
data->add_mobius(i,point);
if(i < 1000){
graph->add_vertex(i,point);
}else{
add_buffer.push_back(std::make_pair(i,point));
}
if(add_buffer.size() >= 1000000)
flush_add_buffer(add_buffer,graph.get());
}
flush_add_buffer(add_buffer,graph.get());
graph->add_vertex(row - 1,dummy_mobius_point);
data.swap(data_original);
std::string str = std::string(prefix);
data->dump(str + ".data");
graph->dump(str + ".graph");
}
void load_mobius_index_prefix(int row,int dim,IndexContext* index_context,const char* prefix){
std::string str = std::string(prefix);
++row;
Data* data = new Data(row,dim);
GraphWrapper* graph = new FixedDegreeGraph<1>(data);
//idx += build_idx_offset;
data->load(str + ".data");
graph->load(str + ".graph");
((FixedDegreeGraph<1>*)graph)->search_start_point = row - 1;
((FixedDegreeGraph<1>*)graph)->ignore_startpoint = true;
index_context->graph = graph;
index_context->data = data;
}
// Writes the index held by `index_context` to "<prefix>.data" (vectors) and
// "<prefix>.graph" (graph).
void save_mobius_index_prefix(IndexContext* index_context,const char* prefix){
    const std::string base_path(prefix);
    Data* vectors = static_cast<Data*>(index_context->data);
    GraphWrapper* graph = static_cast<GraphWrapper*>(index_context->graph);
    vectors->dump(base_path + ".data");
    graph->dump(base_path + ".graph");
}
void search_mobius_index(float* dense_vec,int dim,int search_budget,int return_k, IndexContext* index_context,idx_t* ret_id,double* ret_score){
int topk = 0;
int display_topk = 1;
int build_idx_offset = 0;
int query_idx_offset = 0;
Data* data = reinterpret_cast<Data*>(index_context->data);
GraphWrapper* graph = reinterpret_cast<GraphWrapper*>(index_context->graph);
//auto flag = (data==NULL);
//std::cout<<flag<<std::endl;
std::vector<std::pair<int,value_t>> point;
point.reserve(dim);
for(int j = 0;j < dim;++j)
point.push_back(std::make_pair(j,dense_vec[j]));
std::vector<idx_t> topN;
std::vector<double> score;
graph->search_top_k_with_score(point,search_budget,topN,score);
for(int i = 0;i < topN.size() && i < return_k;++i){
ret_id[i] = topN[i];
ret_score[i] = score[i];
}
}
// For L2 index
// Builds the graph index used for L2 search and writes it to disk.
//
// dense_mat : row-major matrix with `row` vectors of `dim` floats each.
// row       : number of gallery vectors in dense_mat.
// dim       : number of components per vector.
// pq_size   : forwarded to GraphWrapper::set_construct_pq_size().
// prefix    : output path prefix. "<prefix>.data" receives the vectors,
//             "<prefix>.graph" the graph.
void build_l2_index(float* dense_mat,int row,int dim, int pq_size, const char* prefix){
    std::unique_ptr<Data> data(new Data(row,dim));
    std::unique_ptr<GraphWrapper> graph(new FixedDegreeGraph<3>(data.get()));
    graph->set_construct_pq_size(pq_size);

    std::vector<std::pair<idx_t,std::vector<std::pair<int,value_t>>>> add_buffer;
    for(int i = 0;i < row;++i){
        // Compute the offset in size_t: `i * dim` overflows int once the
        // matrix holds more than INT_MAX floats.
        const size_t row_start = static_cast<size_t>(i) * dim;
        std::vector<std::pair<int,value_t>> point;
        point.reserve(dim);
        for(int j = 0;j < dim;++j)
            point.push_back(std::make_pair(j,dense_mat[row_start + j]));
        data->add(i,point);
        // The first 1000 vertices are inserted one by one; later ones are
        // buffered and inserted in batches by flush_add_buffer().
        if(i < 1000){
            graph->add_vertex(i,point);
        }else{
            add_buffer.push_back(std::make_pair(i,point));
        }
        if(add_buffer.size() >= 1000000)
            flush_add_buffer(add_buffer,graph.get());
    }
    flush_add_buffer(add_buffer,graph.get());

    const std::string base_path(prefix);
    data->dump(base_path + ".data");
    graph->dump(base_path + ".graph");
}
void load_l2_index_prefix(int row,int dim,IndexContext* index_context,const char* prefix){
std::string str = std::string(prefix);
Data* data = new Data(row,dim);
GraphWrapper* graph = new FixedDegreeGraph<3>(data);
//idx += build_idx_offset;
data->load(str + ".data");
graph->load(str + ".graph");
index_context->graph = graph;
index_context->data = data;
}
// Writes the index held by `index_context` to "<prefix>.data" (vectors) and
// "<prefix>.graph" (graph).
void save_l2_index_prefix(IndexContext* index_context,const char* prefix){
    const std::string base_path(prefix);
    Data* vectors = static_cast<Data*>(index_context->data);
    GraphWrapper* graph = static_cast<GraphWrapper*>(index_context->graph);
    vectors->dump(base_path + ".data");
    graph->dump(base_path + ".graph");
}
void search_l2_index(float* dense_vec,int dim,int search_budget,int return_k, IndexContext* index_context,idx_t* ret_id,double* ret_score){
int topk = 0;
int display_topk = 1;
int build_idx_offset = 0;
int query_idx_offset = 0;
Data* data = reinterpret_cast<Data*>(index_context->data);
GraphWrapper* graph = reinterpret_cast<GraphWrapper*>(index_context->graph);
std::vector<std::pair<int,value_t>> point;
point.reserve(dim);
for(int j = 0;j < dim;++j)
point.push_back(std::make_pair(j,dense_vec[j]));
std::vector<idx_t> topN;
std::vector<double> score;
graph->search_top_k_with_score(point,search_budget,topN,score);
for(int i = 0;i < topN.size() && i < return_k;++i){
// printf("%d: (%zu, %f)\n",i,topN[i],score[i]);
ret_id[i] = topN[i];
ret_score[i] = score[i];
}
}
// Frees the graph and data objects referenced by `index_context` and resets
// both handles to null, so calling this again on the same context is a no-op.
// A null `index_context` is ignored.
void release_context(IndexContext* index_context){
    if(index_context == nullptr)
        return;
    // Destroy the graph first: it was constructed with a pointer to the Data
    // object (see the load_*_index_prefix functions).
    delete static_cast<GraphWrapper*>(index_context->graph);
    delete static_cast<Data*>(index_context->data);
    index_context->graph = nullptr;
    index_context->data = nullptr;
}
} // extern "C"
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ctypes
import paddle
import numpy.ctypeslib as ctl
import numpy as np
import os
import json
from ctypes import *
from numpy.ctypeslib import ndpointer
# Prefer the index.so located next to this module so that the package can be
# imported from any working directory. Fall back to the original "./index.so"
# lookup (relative to the current directory) when no such file exists.
_lib_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "index.so")
if not os.path.exists(_lib_path):
    _lib_path = "./index.so"
lib = ctypes.cdll.LoadLibrary(_lib_path)
class IndexContext(Structure):
    """ctypes mirror of the C `IndexContext` struct: two opaque pointers."""

    _fields_ = [
        ("graph", c_void_p),
        ("data", c_void_p),
    ]
# ctypes signatures for the functions exported by index.so. Every exported
# function is declared as returning nothing.

# Argument types shared by several signatures: aligned, C-contiguous arrays.
_F32_ARRAY = ctl.ndpointer(np.float32, flags='aligned, c_contiguous')
_U64_ARRAY = ctl.ndpointer(np.uint64, flags='aligned, c_contiguous')
_F64_ARRAY = ctl.ndpointer(np.float64, flags='aligned, c_contiguous')
_CONTEXT_PTR = POINTER(IndexContext)


def _bind(name, argtypes):
    """Look up `name` in index.so and declare it as a void function taking `argtypes`."""
    func = getattr(lib, name)
    func.restype = None
    func.argtypes = argtypes
    return func


# (query, dim, search_budget, return_k, context, ret_id, ret_score)
_SEARCH_ARGS = [_F32_ARRAY, ctypes.c_int, ctypes.c_int, ctypes.c_int,
                _CONTEXT_PTR, _U64_ARRAY, _F64_ARRAY]
# (row, dim, context, prefix)
_LOAD_ARGS = [ctypes.c_int, ctypes.c_int, _CONTEXT_PTR, ctypes.c_char_p]
# (context, prefix)
_SAVE_ARGS = [_CONTEXT_PTR, ctypes.c_char_p]

# for mobius IP index
build_mobius_index = _bind(
    "build_mobius_index",
    [_F32_ARRAY, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_double, ctypes.c_char_p])
search_mobius_index = _bind("search_mobius_index", list(_SEARCH_ARGS))
load_mobius_index_prefix = _bind("load_mobius_index_prefix", list(_LOAD_ARGS))
save_mobius_index_prefix = _bind("save_mobius_index_prefix", list(_SAVE_ARGS))

# for L2 index
build_l2_index = _bind(
    "build_l2_index",
    [_F32_ARRAY, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_char_p])
search_l2_index = _bind("search_l2_index", list(_SEARCH_ARGS))
load_l2_index_prefix = _bind("load_l2_index_prefix", list(_LOAD_ARGS))
save_l2_index_prefix = _bind("save_l2_index_prefix", list(_SAVE_ARGS))

release_context = _bind("release_context", [_CONTEXT_PTR])
class Graph_Index(object):
    """
    graph index

    Python front end for the native graph index in index.so. An instance
    owns at most one native index (the pointers in ``self.index_context``)
    and a dictionary that maps result ids to user supplied documents.
    """

    def __init__(self, dist_type="IP"):
        """
        Args:
            dist_type: "IP" (inner product) or "L2".
        """
        # Validate first so that a bad value never leaves a half-built object.
        assert dist_type in ["IP", "L2"], "Only support IP and L2 distance ..."
        self.dim = 0
        self.total_num = 0
        self.dist_type = dist_type
        self.mobius_pow = 2.0
        self.index_context = IndexContext(0,0)
        self.gallery_doc_dict = {}
        self.with_attr = False

    @staticmethod
    def _index_prefix(index_path):
        """Return the C string "<index_path>/index" passed to the native code."""
        return create_string_buffer((index_path+"/index").encode('utf-8'))

    def _has_index(self):
        """True when both native handles are non-null."""
        return bool(self.index_context.graph) and bool(self.index_context.data)

    def _release_index(self):
        """Free the native index currently held, if any, and reset the handles."""
        if self.index_context.graph or self.index_context.data:
            release_context(ctypes.byref(self.index_context))
        # Use a fresh context so no stale pointer can be released twice.
        self.index_context = IndexContext(0,0)

    def build(self, gallery_vectors, gallery_docs=[], pq_size=100, index_path='graph_index/'):
        """
        build index

        Args:
            gallery_vectors: 2-D array (or paddle tensor) with one vector per row.
            gallery_docs: optional list with one document per vector; when
                given, search() returns documents instead of ids. The default
                list is never mutated.
            pq_size: forwarded to the native build function.
            index_path: directory that receives the index files and info.json.
        """
        if paddle.is_tensor(gallery_vectors):
            gallery_vectors = gallery_vectors.numpy()
        # The native code reads a C-contiguous float32 buffer.
        gallery_vectors = np.ascontiguousarray(gallery_vectors, dtype=np.float32)
        assert gallery_vectors.ndim == 2, "Input vector must be 2D ..."
        self.total_num = gallery_vectors.shape[0]
        self.dim = gallery_vectors.shape[1]
        assert (len(gallery_docs) == self.total_num if len(gallery_docs)>0 else True)
        print("training index -> num: {}, dim: {}, dist_type: {}".format(self.total_num, self.dim, self.dist_type))
        if not os.path.exists(index_path):
            os.makedirs(index_path)

        # Drop any index loaded earlier; otherwise its native memory leaks
        # when the handles are overwritten below.
        self._release_index()
        if self.dist_type == "IP":
            build_mobius_index(gallery_vectors,self.total_num,self.dim, pq_size, self.mobius_pow, self._index_prefix(index_path))
            load_mobius_index_prefix(self.total_num, self.dim, ctypes.byref(self.index_context), self._index_prefix(index_path))
        else:
            build_l2_index(gallery_vectors,self.total_num,self.dim, pq_size, self._index_prefix(index_path))
            load_l2_index_prefix(self.total_num, self.dim, ctypes.byref(self.index_context), self._index_prefix(index_path))

        # Reset the flag on every build so that rebuilding without documents
        # does not keep a stale True from a previous build.
        self.with_attr = len(gallery_docs) > 0
        self.gallery_doc_dict = {}
        if self.with_attr:
            for i in range(self.total_num):
                self.gallery_doc_dict[str(i)] = gallery_docs[i]
        self.gallery_doc_dict["total_num"] = self.total_num
        self.gallery_doc_dict["dim"] = self.dim
        self.gallery_doc_dict["dist_type"] = self.dist_type
        self.gallery_doc_dict["with_attr"] = self.with_attr
        with open(index_path + "/info.json", "w") as f:
            json.dump(self.gallery_doc_dict, f)
        print("finished creating index ...")

    def search(self, query, return_k=10, search_budget=100):
        """
        search

        Args:
            query: array (or paddle tensor) holding at least ``self.dim``
                values; the first ``self.dim`` values are used as the query.
            return_k: number of result slots.
            search_budget: forwarded to the native search function.

        Returns:
            (scores, docs) when the index was built with documents, otherwise
            (scores, ids). ``scores`` is a float64 numpy array of length
            ``return_k``; the second element is a list of length ``return_k``.
            Slots the native search did not fill keep score 0.0 and id 0.

        Raises:
            RuntimeError: if no index has been built or loaded.
        """
        if not self._has_index():
            raise RuntimeError("index is empty, call build() or load() first")
        if paddle.is_tensor(query):
            query = query.numpy()
        # The native code reads self.dim float32 values from a contiguous buffer.
        query = np.ascontiguousarray(query, dtype=np.float32)
        assert query.size >= self.dim, "Query must contain at least dim values ..."

        ret_id = np.zeros(return_k, dtype=np.uint64)
        ret_score = np.zeros(return_k, dtype=np.float64)
        if self.dist_type == "IP":
            search_mobius_index(query,self.dim,search_budget,return_k,ctypes.byref(self.index_context),ret_id,ret_score)
        else:
            search_l2_index(query,self.dim,search_budget,return_k,ctypes.byref(self.index_context),ret_id,ret_score)

        ret_id = ret_id.tolist()
        if self.with_attr:
            ret_doc = [self.gallery_doc_dict[str(doc_id)] for doc_id in ret_id]
            return ret_score, ret_doc
        return ret_score, ret_id

    def dump(self, index_path):
        """
        Write the native index and info.json into the directory ``index_path``.

        Raises:
            RuntimeError: if no index has been built or loaded.
        """
        if not self._has_index():
            raise RuntimeError("index is empty, call build() or load() first")
        if not os.path.exists(index_path):
            os.makedirs(index_path)
        if self.dist_type == "IP":
            save_mobius_index_prefix(ctypes.byref(self.index_context), self._index_prefix(index_path))
        else:
            save_l2_index_prefix(ctypes.byref(self.index_context), self._index_prefix(index_path))
        with open(index_path + "/info.json", "w") as f:
            json.dump(self.gallery_doc_dict, f)

    def load(self, index_path):
        """
        Load info.json and the native index from the directory ``index_path``,
        replacing whatever index this object currently holds.
        """
        # Read the metadata first: if the file is missing, the object keeps
        # its current state.
        with open(index_path + "/info.json", "r") as f:
            self.gallery_doc_dict = json.load(f)
        self.total_num = self.gallery_doc_dict["total_num"]
        self.dim = self.gallery_doc_dict["dim"]
        self.dist_type = self.gallery_doc_dict["dist_type"]
        self.with_attr = self.gallery_doc_dict["with_attr"]

        # Free the previous native index before its handles are overwritten.
        self._release_index()
        if self.dist_type == "IP":
            load_mobius_index_prefix(self.total_num,self.dim,ctypes.byref(self.index_context), self._index_prefix(index_path))
        else:
            load_l2_index_prefix(self.total_num,self.dim,ctypes.byref(self.index_context), self._index_prefix(index_path))
# MIT License
#
#Copyright (c) 2021 Mobius Authors
#
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:
#The above copyright notice and this permission notice shall be included in all
#copies or substantial portions of the Software.
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
#SOFTWARE.
#from https://github.com/sunbelbd/mobius/blob/e2d166547d61d791da8f06747a63b9cd38f02c71/config.h
#pragma once

// Scalar type of one vector component.
using value_t = float;
// Scalar type of a computed distance.
//using dist_t = double;
using dist_t = float;
// Type used to index vectors.
using idx_t = size_t;
using UINT = int;

#define ACC_BATCH_SIZE 4096

#define FIXED_DEGREE 31
#define FIXED_DEGREE_SHIFT 5

//for construction
#define SEARCH_DEGREE 15
#define CONSTRUCT_SEARCH_BUDGET 150
# MIT License
#
#Copyright (c) 2021 Mobius Authors
#
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:
#The above copyright notice and this permission notice shall be included in all
#copies or substantial portions of the Software.
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
#SOFTWARE.
#from https://github.com/sunbelbd/mobius/blob/e2d166547d61d791da8f06747a63b9cd38f02c71/data.h
#pragma once
#include<memory>
#include<vector>
#include<math.h>
#include"config.h"
#define ZERO_EPS 1e-10
#define _SCALE_WORLD_DENSE_DATA
#ifdef _SCALE_WORLD_DENSE_DATA
//dense data
class Data{
private:
std::unique_ptr<value_t[]> data;
size_t num;
size_t curr_num = 0;
int dim;
public:
value_t mobius_pow = 2;
value_t max_ip_norm = 1;
value_t max_ip_norm2 = 1;
Data(size_t num, int dim) : num(num),dim(dim){
data = std::unique_ptr<value_t[]>(new value_t[num * dim]);
memset(data.get(),0,sizeof(value_t) * num * dim);
}
value_t* get(idx_t idx) const{
return data.get() + idx * dim;
}
template<class T>
dist_t ipwrap_l2_query_distance(idx_t a,T& v) const{
auto pa = get(a);
dist_t ret = 0;
dist_t normu = 0;
for(int i = 0;i < dim;++i){
auto diff = (*(pa + i) / max_ip_norm) - v[i];
ret += diff * diff;
normu += (*(pa + i)) * (*(pa + i));
}
ret += 1 - normu / max_ip_norm2;
return ret;
}
template<class T>
dist_t ipwrap_l2_build_distance(idx_t a,T& v) const{
auto pa = get(a);
dist_t ret = 0;
dist_t normu = 0;
dist_t normv = 0;
for(int i = 0;i < dim;++i){
auto diff = *(pa + i) - v[i];
ret += diff * diff;
normu += (*(pa + i)) * (*(pa + i));
normv += v[i] * v[i];
}
dist_t wrap_termu = sqrt(1 - normu / max_ip_norm2);
dist_t wrap_termv = sqrt(1 - normv / max_ip_norm2);
dist_t diff_wrap = wrap_termu - wrap_termv;
ret = ret / max_ip_norm2 + diff_wrap * diff_wrap;
return ret;
}
template<class T>
dist_t l2_distance(idx_t a,T& v) const{
auto pa = get(a);
dist_t ret = 0;
for(int i = 0;i < dim;++i){
auto diff = *(pa + i) - v[i];
ret += diff * diff;
}
return ret;
}
template<class T>
dist_t negative_inner_prod_distance(idx_t a,T& v) const{
auto pa = get(a);
dist_t ret = 0;
for(int i = 0;i < dim;++i){
ret -= (*(pa + i)) * v[i];
}
return ret;
}
template<class T>
dist_t negative_cosine_distance(idx_t a,T& v) const{
auto pa = get(a);
dist_t ret = 0;
value_t lena = 0,lenv = 0;
for(int i = 0;i < dim;++i){
ret += (*(pa + i)) * v[i];
lena += (*(pa + i)) * (*(pa + i));
lenv += v[i] * v[i];
}
int sign = ret < 0 ? 1 : -1;
// return sign * (ret * ret / lena);// / lenv);
return sign * (ret * ret / lena / lenv);
}
template<class T>
dist_t mobius_l2_distance(idx_t a,T& v) const{
auto pa = get(a);
dist_t ret = 0;
value_t lena = 0,lenv = 0;
for(int i = 0;i < dim;++i){
lena += (*(pa + i)) * (*(pa + i));
lenv += v[i] * v[i];
}
value_t modifier_a = pow(lena,0.5 * mobius_pow);
value_t modifier_v = pow(lenv,0.5 * mobius_pow);
if(fabs(modifier_a) < ZERO_EPS)
modifier_a = 1;
if(fabs(modifier_v) < ZERO_EPS)
modifier_v = 1;
for(int i = 0;i < dim;++i){
value_t tmp = (*(pa + i)) / modifier_a - v[i] / modifier_v;
ret += tmp * tmp;
}
return ret;
}
template<class T>
dist_t real_nn(T& v) const{
dist_t minn = 1e100;
for(size_t i = 0;i < curr_num;++i){
auto res = l2_distance(i,v);
if(res < minn){
minn = res;
}
}
return minn;
}
std::vector<value_t> organize_point_mobius(const std::vector<std::pair<int,value_t>>& v){
std::vector<value_t> ret(dim,0);
value_t lena = 0;
for(const auto& p : v){
// ret[p.first] = p.second;
lena += p.second * p.second;
}
value_t modifier_a = pow(lena,0.5 * mobius_pow);
if(fabs(modifier_a) < ZERO_EPS)
modifier_a = 1;
for(const auto& p : v){
ret[p.first] = p.second / modifier_a;
}
return std::move(ret);
}
std::vector<value_t> organize_point(const std::vector<std::pair<int,value_t>>& v){
std::vector<value_t> ret(dim,0);
for(const auto& p : v){
if(p.first >= dim)
printf("error %d %d\n",p.first,dim);
ret[p.first] = p.second;
}
return std::move(ret);
}
value_t vec_sum2(const std::vector<std::pair<int,value_t>>& v){
value_t ret = 0;
for(const auto& p : v){
if(p.first >= dim)
printf("error %d %d\n",p.first,dim);
ret += p.second * p.second;
}
return std::move(ret);
}
void add(idx_t idx, std::vector<std::pair<int,value_t>>& value){
//printf("adding %zu\n",idx);
//for(auto p : value)
// printf("%zu %d %f\n",idx,p.first,p.second);
curr_num = std::max(curr_num,idx);
auto p = get(idx);
for(const auto& v : value)
*(p + v.first) = v.second;
}
void add_mobius(idx_t idx, std::vector<std::pair<int,value_t>>& value){
//printf("adding %zu\n",idx);
//for(auto p : value)
// printf("%zu %d %f\n",idx,p.first,p.second);
curr_num = std::max(curr_num,idx);
auto p = get(idx);
value_t lena = 0;
for(const auto& v : value){
*(p + v.first) = v.second;
lena += v.second * v.second;
}
value_t modifier_a = pow(lena,0.5 * mobius_pow);
if(fabs(modifier_a) < ZERO_EPS)
modifier_a = 1;
for(const auto& v : value){
*(p + v.first) = v.second / modifier_a;
}
}
inline size_t max_vertices(){
return num;
}
inline size_t curr_vertices(){
return curr_num;
}
void print(){
for(int i = 0;i < num && i < 10;++i)
printf("%f ",*(data.get() + i));
printf("\n");
}
int get_dim(){
return dim;
}
void dump(std::string path = "bfsg.data"){
FILE* fp = fopen(path.c_str(),"wb");
fwrite(data.get(),sizeof(value_t) * num * dim,1,fp);
fclose(fp);
}
void load(std::string path = "bfsg.data"){
curr_num = num;
FILE* fp = fopen(path.c_str(),"rb");
auto cnt = fread(data.get(),sizeof(value_t) * num * dim,1,fp);
fclose(fp);
}
};
// Specialisation for two stored vectors, addressed by index `a` and `b`.
// Declared inline because it is a full specialisation defined in a header:
// without it, including this header in a second translation unit causes a
// multiple-definition link error.
template<>
inline dist_t Data::ipwrap_l2_build_distance(idx_t a,idx_t& b) const{
    const value_t* pa = get(a);
    const value_t* pb = get(b);
    dist_t ret = 0;
    dist_t normu = 0;
    dist_t normv = 0;
    for(int i = 0;i < dim;++i){
        auto diff = pa[i] - pb[i];
        ret += diff * diff;
        normu += pa[i] * pa[i];
        normv += pb[i] * pb[i];
    }
    dist_t wrap_termu = sqrt(1 - normu / max_ip_norm2);
    dist_t wrap_termv = sqrt(1 - normv / max_ip_norm2);
    dist_t diff_wrap = wrap_termu - wrap_termv;
    ret = ret / max_ip_norm2 + diff_wrap * diff_wrap;
    return ret;
}
// Specialisation for two stored vectors, addressed by index `a` and `b`.
// Declared inline because it is a full specialisation defined in a header:
// without it, including this header in a second translation unit causes a
// multiple-definition link error.
template<>
inline dist_t Data::ipwrap_l2_query_distance(idx_t a,idx_t& b) const{
    const value_t* pa = get(a);
    const value_t* pb = get(b);
    dist_t ret = 0;
    dist_t normu = 0;
    for(int i = 0;i < dim;++i){
        auto diff = (pa[i] / max_ip_norm) - pb[i];
        ret += diff * diff;
        normu += pa[i] * pa[i];
    }
    ret += 1 - normu / max_ip_norm2;
    return ret;
}
// Squared Euclidean distance between stored vectors `a` and `b`.
// Declared inline because it is a full specialisation defined in a header:
// without it, including this header in a second translation unit causes a
// multiple-definition link error.
template<>
inline dist_t Data::l2_distance(idx_t a,idx_t& b) const{
    const value_t* pa = get(a);
    const value_t* pb = get(b);
    dist_t ret = 0;
    for(int i = 0;i < dim;++i){
        auto diff = pa[i] - pb[i];
        ret += diff * diff;
    }
    return ret;
}
// Negated inner product of stored vectors `a` and `b`.
// Declared inline because it is a full specialisation defined in a header:
// without it, including this header in a second translation unit causes a
// multiple-definition link error.
template<>
inline dist_t Data::negative_inner_prod_distance(idx_t a,idx_t& b) const{
    const value_t* pa = get(a);
    const value_t* pb = get(b);
    dist_t ret = 0;
    for(int i = 0;i < dim;++i){
        ret -= pa[i] * pb[i];
    }
    return ret;
}
// Signed squared cosine of stored vectors `a` and `b`: negative for a
// non-negative inner product, positive otherwise.
// NOTE(review): divides by zero when either vector is all zeros.
// Declared inline because it is a full specialisation defined in a header:
// without it, including this header in a second translation unit causes a
// multiple-definition link error.
template<>
inline dist_t Data::negative_cosine_distance(idx_t a,idx_t& b) const{
    const value_t* pa = get(a);
    const value_t* pb = get(b);
    dist_t ret = 0;
    value_t lena = 0,lenv = 0;
    for(int i = 0;i < dim;++i){
        ret += pa[i] * pb[i];
        lena += pa[i] * pa[i];
        lenv += pb[i] * pb[i];
    }
    int sign = ret < 0 ? 1 : -1;
    return sign * (ret * ret / lena / lenv);
}
// Squared Euclidean distance between stored vectors `a` and `b` after each is
// divided by its squared norm raised to 0.5 * mobius_pow. A divisor below
// ZERO_EPS is replaced by 1.
// Declared inline because it is a full specialisation defined in a header:
// without it, including this header in a second translation unit causes a
// multiple-definition link error.
template<>
inline dist_t Data::mobius_l2_distance(idx_t a,idx_t& b) const{
    const value_t* pa = get(a);
    const value_t* pb = get(b);
    dist_t ret = 0;
    value_t lena = 0,lenv = 0;
    for(int i = 0;i < dim;++i){
        lena += pa[i] * pa[i];
        lenv += pb[i] * pb[i];
    }
    value_t modifier_a = pow(lena,0.5 * mobius_pow);
    value_t modifier_v = pow(lenv,0.5 * mobius_pow);
    if(fabs(modifier_a) < ZERO_EPS)
        modifier_a = 1;
    if(fabs(modifier_v) < ZERO_EPS)
        modifier_v = 1;
    for(int i = 0;i < dim;++i){
        value_t tmp = pa[i] / modifier_a - pb[i] / modifier_v;
        ret += tmp * tmp;
    }
    return ret;
}
#else
//sparse data
// Placeholder for a sparse-vector variant of Data. It is compiled only when
// _SCALE_WORLD_DENSE_DATA is not defined and has no members yet.
class Data{
public:
//TODO
};
#endif
此差异已折叠。
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from interface import Graph_Index
# Generate random sample data: 100000 gallery vectors, one query, one doc id per vector
index_vectors = np.random.rand(100000,128).astype(np.float32)
query_vector = np.random.rand(128).astype(np.float32)
index_docs = ["ID_"+str(i) for i in range(100000)]
# Initialise the index structure
indexer = Graph_Index(dist_type="IP") # supports "IP" and "L2"
indexer.build(gallery_vectors=index_vectors, gallery_docs=index_docs, pq_size=100, index_path='test')
# Query
scores, docs = indexer.search(query=query_vector, return_k=10, search_budget=100)
print(scores)
print(docs)
# Save and load
indexer.dump(index_path="test")
indexer.load(index_path="test")
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册