diff --git a/tests/milvus_groundtruth/README.md b/tests/milvus_groundtruth/README.md index c17f5c1b636cb5998d8357b553dcdab7701af5ec..6140fdbf5422903f2d6e84832e2bc58ea80f37c8 100644 --- a/tests/milvus_groundtruth/README.md +++ b/tests/milvus_groundtruth/README.md @@ -5,28 +5,30 @@ ### Parameter Description: -| parameter | description | default setting | -| ------------------ | --------------------------------------------- | -------------------- | -| GET_VEC | whether to save feature vectors | False | -| PROCESS_NUM | number of processes | 5 | -| IP | whether metric_type is IP | True | -| L2 | whether metric_type is L2 | False | -| CSV | whether the query vector file format is csv | False | -| UINT8 | whether the query vector data format is uint8 | False | -| BASE_FOLDER_NAME | path to the source vector dataset | '/data/milvus/base' | -| NQ_FOLDER_NAME | path to the query vector dataset | '/data/milvus/query' | -| GT_ALL_FOLDER_NAME | intermediate filename | 'ground_truth_all' | -| GT_FOLDER_NAME | path saved the ground truth results | 'ground_truth' | -| LOC_FILE_NAME | file saved the gorund truth's location info | 'location.txt' | -| FLOC_FILE_NAME | file saved the gorund truth's filenames info | 'file_location.txt' | -| VEC_FILE_NAME | file saved the gorund truth's feature vectors | 'vectors.npy' | +| parameter | description | default setting | +| ------------------ | --------------------------------------------- | ----------------------- | +| PROCESS_NUM | number of processes | 12 | +| GET_VEC | whether to save feature vectors | False | +| CSV | whether the query vector file format is csv | False | +| UINT8 | whether the query vector data format is uint8 | False | +| BASE_FOLDER_NAME | path to the source vector dataset | '/data/milvus/base' | +| NQ_FOLDER_NAME | path to the query vector dataset | '/data/milvus/query' | +| GT_ALL_FOLDER_NAME | intermediate filename | 'ground_truth_all' | +| GT_FOLDER_NAME | path saved the ground truth results | 'ground_truth' | +| LOC_FILE_NAME | file saved the gorund truth's location info | 'ground_truth.txt' | +| FLOC_FILE_NAME | file saved the gorund truth's filenames info | 'file_ground_truth.txt' | +| VEC_FILE_NAME | file saved the gorund truth's feature vectors | 'vectors.npy' | ### Usage: ```bash -$ python3 milvus_ground_truth.py [-q ] -k -l +$ python3 milvus_ground_truth.py [-q ] -k -m -l # -q or --nq points the number of vectors taken from the query vector set. This parameter is optional, Without it will take all the data in the query set. + # -k or --topk points calculate the top k similar vectors. -# -l means generate the ground truth results, it will save in GT_FOLDER_NAME.In this path, LOC_FILE_NAME saved the gorund truth's location info, such as "8002005210",the first ‘8’ is meaningless, the 2-4th position means the position of the result file in the folder, the 5-10th position means the position of the result vector in the result file. The result filename and vector location saved in FLOC_FILE_NAME, such as "binary_128d_00000.npy 81759", and the result vector is saved in VEC_FILE_NAME. + +# -m or --metric points the method vector distances are compared in Milvus,such as IP/L2/Tan. + +# -l means generate the ground truth results, it will save in GT_FOLDER_NAME.In this path, LOC_FILE_NAME saved the gorund truth's results info, such as "8002005210",the first ‘8’ is meaningless, the 2-4th position means the position of the result file in the folder, the 5-10th position means the position of the result vector in the result file. The result filename and vector location saved in FLOC_FILE_NAME, such as "binary_128d_00000.npy 81759", and the result vector is saved in VEC_FILE_NAME. ``` \ No newline at end of file diff --git a/tests/milvus_groundtruth/milvus_ground_truth.py b/tests/milvus_groundtruth/milvus_ground_truth.py index c04e0eb1d42272d0968ae7936528b254d241dfe3..f82060162e4f6a1212bb283743959bc49680349e 100644 --- a/tests/milvus_groundtruth/milvus_ground_truth.py +++ b/tests/milvus_groundtruth/milvus_ground_truth.py @@ -2,14 +2,13 @@ import getopt import os import sys import time -from multiprocessing import Process +from concurrent.futures import ThreadPoolExecutor +from concurrent.futures import ProcessPoolExecutor import numpy as np +import math +PROCESS_NUM = 12 GET_VEC = False -PROCESS_NUM = 5 - -IP = True -L2 = False CSV = False UINT8 = False @@ -18,31 +17,24 @@ NQ_FOLDER_NAME = '/data/milvus/query' GT_ALL_FOLDER_NAME = 'ground_truth_all' GT_FOLDER_NAME = 'ground_truth' -LOC_FILE_NAME = 'location.txt' -FLOC_FILE_NAME = 'file_location.txt' +LOC_FILE_NAME = 'ground_truth.txt' +FLOC_FILE_NAME = 'file_ground_truth.txt' VEC_FILE_NAME = 'vectors.npy' # get vectors of the files -def load_nq_vec(nq): - vectors = [] - length = 0 +def load_query_vec(nq, vectors=[], length=0): filenames = os.listdir(NQ_FOLDER_NAME) filenames.sort() - if nq == 0: - for filename in filenames: - vectors += load_vec_list(NQ_FOLDER_NAME + '/' + filename) - return vectors for filename in filenames: vec_list = load_vec_list(NQ_FOLDER_NAME + '/' + filename) length += len(vec_list) - if length > nq: + if nq!=0 and length>nq : num = nq % len(vec_list) - # vec_list = load_vec_list(NQ_FOLDER_NAME + '/' + filename, num) - vec_list = vec_list[0:num] + vectors += vec_list[0:num] + break vectors += vec_list - if len(vectors) == nq: - return vectors + return vectors # load vectors from filr_name and num means nq's number @@ -55,12 +47,20 @@ def load_vec_list(file_name): data = np.load(file_name) if UINT8: data = (data + 0.5) / 255 - vec_list = [] - for i in range(len(data)): - vec_list.append(data[i].tolist()) + vec_list = data.tolist() return vec_list +def hex_to_bin(fp): + vec=[] + length = len(fp) * 4 + bstr = str(bin(int(fp,16))) + bstr = (length-(len(bstr)-2)) * '0' + bstr[2:] + for f in bstr: + vec.append(int(f)) + return vec + + def calEuclideanDistance(vec1, vec2): vec1 = np.array(vec1) vec2 = np.array(vec2) @@ -75,6 +75,18 @@ def calInnerDistance(vec1, vec2): return dist +def calTanimoto(vec1, vec2): + vec1 = hex_to_bin(vec1) + vec2 = hex_to_bin(vec2) + # print(vec1,vec2) + nc = float(np.inner(vec1, vec2)) + n1 = float(np.sum(vec1)) + n2 = float(np.sum(vec2)) + dist = nc/(n1+n2-nc) + print(nc,n1,n2) + return dist + + def get_ground_truth_l2(topk, idx, vct_nq): filenames = os.listdir(BASE_FOLDER_NAME) filenames.sort() @@ -85,7 +97,7 @@ def get_ground_truth_l2(topk, idx, vct_nq): for j in range(len(vec_list)): dist = calEuclideanDistance(vct_nq, vec_list[j]) num_j = "%01d%03d%06d" % (8, k, j) - if j < topk and k == 0: + if k==0 and j min_value: m = no_dist.pop(min_key) no_dist[num_j] = dist - k = k + 1 + k += 1 no_dist = sorted(no_dist.items(), key=lambda x: x[1], reverse=True) + print(no_dist) + save_gt_file(no_dist, idx) + + +def get_ground_truth_tanimoto(topk, idx, vec_nq): + filenames = os.listdir(BASE_FOLDER_NAME) # get the whole file names + filenames.sort() + no_dist = {} + k = 0 + for filename in filenames: + vec_list = load_vec_list(BASE_FOLDER_NAME + '/' + filename) + print(BASE_FOLDER_NAME + '/' + filename, len(vec_list)) + for j in range(len(vec_list)): + dist = calTanimoto(vec_nq, vec_list[j]) + num_j = "%03d%06d" % (k, j) + if k==0 and j min_value: + m = no_dist.pop(min_key) + no_dist[num_j] = dist + k += 1 + no_dist = sorted(no_dist.items(), key=lambda x: x[1], reverse=True) + print(no_dist) save_gt_file(no_dist, idx) def save_gt_file(no_dist, idx): - s = "%05d" % idx - idx_fname = GT_ALL_FOLDER_NAME + '/' + s + '_idx.txt' - dis_fname = GT_ALL_FOLDER_NAME + '/' + s + '_dis.txt' - with open(idx_fname, 'w') as f: + filename = "%05d" % idx + 'results.txt' + with open(GT_ALL_FOLDER_NAME+'/'+filename, 'w') as f: for re in no_dist: - f.write(str(re[0]) + '\n') - f.write('\n') - with open(dis_fname, 'w') as f: - for re in no_dist: - f.write(str(re[1]) + '\n') - f.write('\n') - + f.write(str(re[0]) + ' ' + str(re[1]) + '\n') def get_loc_txt(file): filenames = os.listdir(GT_ALL_FOLDER_NAME) filenames.sort() write_file = open(GT_FOLDER_NAME + '/' + file, 'w+') for f in filenames: - if f.endswith('_idx.txt'): - f = GT_ALL_FOLDER_NAME + '/' + f - for line in open(f, 'r'): - write_file.write(line) + for line in open(GT_ALL_FOLDER_NAME+'/'+f, 'r'): + write_file.write(line) + write_file.write('\n') def get_file_loc_txt(gt_file, fnames_file): - gt_file = GT_FOLDER_NAME + '/' + gt_file - fnames_file = GT_FOLDER_NAME + '/' + fnames_file filenames = os.listdir(BASE_FOLDER_NAME) filenames.sort() - with open(gt_file, 'r') as gt_f: - with open(fnames_file, 'w') as fnames_f: + with open(GT_FOLDER_NAME+'/'+gt_file, 'r') as gt_f: + with open(GT_FOLDER_NAME+'/'+fnames_file, 'w') as fnames_f: for line in gt_f: if line != '\n': - line = line.strip() + line = line.split()[0] loca = int(line[1:4]) offset = int(line[4:10]) fnames_f.write(filenames[loca] + ' ' + str(offset + 1) + '\n') else: - fnames_f.write('\n') + fnames_f.write(line) def load_gt_file_out(): - file_name = GT_FOLDER_NAME + '/' +FLOC_FILE_NAME + file_name = GT_FOLDER_NAME + '/' + FLOC_FILE_NAME base_filename = [] num = [] with open(file_name, 'r') as f: @@ -177,70 +205,36 @@ def load_gt_file_out(): return base_filename, num -def ground_truth_process(nq, topk): - try: - os.mkdir(GT_ALL_FOLDER_NAME) - except: - print('there already exits folder named ' + GT_ALL_FOLDER_NAME + ', please delete it first.') - else: - vectors = load_nq_vec(nq) - print("query list:", len(vectors)) - processes = [] - process_num = PROCESS_NUM - nq = len(vectors) - loops = nq // process_num - rest = nq % process_num - if rest != 0: - loops += 1 - time_start = time.time() - for loop in range(loops): - time1_start = time.time() - base = loop * process_num - if rest != 0 and loop == loops - 1: - process_num = rest - print('base:', loop) - for i in range(process_num): - print('nq_index:', base + i) - if L2: - if base + i == 0: - print("get ground truth by L2.") - process = Process(target=get_ground_truth_l2, - args=(topk, base + i, vectors[base + i])) - elif IP: - if base + i == 0: - print("get ground truth by IP.") - process = Process(target=get_ground_truth_ip, - args=(topk, base + i, vectors[base + i])) - processes.append(process) - process.start() - for p in processes: - p.join() - time1_end = time.time() - print("base", loop, "time_cost = ", round(time1_end - time1_start, 4)) - if not os.path.exists(GT_FOLDER_NAME): - os.mkdir(GT_FOLDER_NAME) - get_loc_txt(LOC_FILE_NAME) - get_file_loc_txt(LOC_FILE_NAME, FLOC_FILE_NAME) - if GET_VEC: - vec = [] - file, num = load_gt_file_out() - for i in range(len(file)): - n = int(num[i]) - 1 - vectors = load_vec_list(BASE_FOLDER_NAME + '/' + file[i]) - vec.append(vectors[n]) - print("saved len of vec:", len(vec)) - np.save(GT_FOLDER_NAME + '/' + VEC_FILE_NAME, vec) - time_end = time.time() - time_cost = time_end - time_start - print("total_time = ", round(time_cost, 4), "\nGet the ground truth successfully!") +def ground_truth_process(metric,nq_list, topk, num): + thread_num = len(nq_list) + with ProcessPoolExecutor(thread_num) as executor: + for i in range(thread_num): + # print("Process:",num+i) + if metric == 'L2': + executor.submit(get_ground_truth_l2, topk, num+i, nq_list[i]) + elif metric == 'IP': + executor.submit(get_ground_truth_ip, topk, num+i, nq_list[i]) + elif metric == 'Tan': + executor.submit(get_ground_truth_tanimoto, topk, num+i, nq_list[i]) + get_loc_txt(LOC_FILE_NAME) + get_file_loc_txt(LOC_FILE_NAME, FLOC_FILE_NAME) + if GET_VEC: + vec = [] + file, num = load_gt_file_out() + for i in range(len(file)): + n = int(num[i]) - 1 + vectors = load_vec_list(BASE_FOLDER_NAME + '/' + file[i]) + vec.append(vectors[n]) + print("saved len of vec:", len(vec)) + np.save(GT_FOLDER_NAME + '/' + VEC_FILE_NAME, vec) def main(): try: opts, args = getopt.getopt( sys.argv[1:], - "hlq:k:", - ["help", "nq=", "topk="], + "hlq:k:m:", + ["help", "nq=", "topk=", "metric="], ) except getopt.GetoptError: print("Usage: test.py [-q ] -k -s") @@ -254,9 +248,33 @@ def main(): nq = int(opt_value) elif opt_name in ("-k", "--topk"): topk = int(opt_value) - elif opt_name == "-l": - ground_truth_process(nq, topk) # test.py [-q ] -k -l - sys.exit() + elif opt_name in ("-m", "--metric"): + metric = opt_value + elif opt_name == "-l": # test.py [-q ] -k -m -l + try: + os.mkdir(GT_ALL_FOLDER_NAME) + except: + print('there already exits folder named ' + GT_ALL_FOLDER_NAME + ', please delete it first.') + sys.exit() + if not os.path.exists(GT_FOLDER_NAME): + os.mkdir(GT_FOLDER_NAME) + + print("metric type is",metric) + time_start = time.time() + query_vectors = load_query_vec(nq) + nq = len(query_vectors) + print("query list:", len(query_vectors)) + num = math.ceil(nq/PROCESS_NUM) + for i in range(num): + print("start with round:",i+1) + if i==num-1: + ground_truth_process(metric, query_vectors[i*PROCESS_NUM:nq], topk, i*PROCESS_NUM) + else: + ground_truth_process(metric, query_vectors[i*PROCESS_NUM:i*PROCESS_NUM+PROCESS_NUM], topk, i*PROCESS_NUM) + + time_end = time.time() + time_cost = time_end - time_start + print("total_time = ", round(time_cost, 4), "\nGet the ground truth successfully!") if __name__ == '__main__':