Commit cfbe86df

Merge branch 'develop' into jinhai

Authored on Mar 27, 2019 by xj.lin
Parents: dadf9734, 77a0880a

Showing 9 changed files with 56 additions and 135 deletions (+56 -135).
pyengine/engine/controller/scheduler.py             +26  -8
pyengine/engine/controller/tests/test_scheduler.py   +7  -7
pyengine/engine/controller/vector_engine.py          +1  -1
pyengine/engine/ingestion/build_index.py             +7  -7
pyengine/engine/ingestion/serialize.py               +4  -0
pyengine/engine/ingestion/tests/test_build.py        +2  -4
pyengine/engine/retrieval/search_index.py            +6  -3
pyengine/engine/retrieval/tests/basic_test.py        +0  -103
pyengine/engine/retrieval/tests/test_search.py       +3  -2
pyengine/engine/controller/scheduler.py (+26 -8)

```diff
 from engine.retrieval import search_index
 from engine.ingestion import build_index
 from engine.ingestion import serialize
+import numpy as np
 
 
 class Singleton(type):
     _instances = {}
 
     def __call__(cls, *args, **kwargs):
         if cls not in cls._instances:
             cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
@@ -12,7 +14,7 @@ class Singleton(type):
 
 
 class Scheduler(metaclass=Singleton):
-    def Search(self, index_file_key, vectors, k):
+    def search(self, index_file_key, vectors, k):
         # assert index_file_key
         # assert vectors
         assert k != 0
@@ -20,7 +22,6 @@ class Scheduler(metaclass=Singleton):
         query_vectors = serialize.to_array(vectors)
         return self.__scheduler(index_file_key, query_vectors, k)
 
     def __scheduler(self, index_data_key, vectors, k):
         result_list = []
@@ -36,18 +37,35 @@ class Scheduler(metaclass=Singleton):
         if 'index' in index_data_key:
             index_data_list = index_data_key['index']
             for key in index_data_list:
-                index = GetIndexData(key)
+                index = get_index_data(key)
                 searcher = search_index.FaissSearch(index)
                 result_list.append(searcher.search_by_vectors(vectors, k))
 
         if len(result_list) == 1:
             return result_list[0].vectors
 
-        total_result = []
-        return result_list;
+        # TODO(linxj): add topk
+        # result = search_index.top_k(result_list, k)
+        # d_list = np.array([])
+        # v_list = np.array([])
+        # for result in result_list:
+        #     rd = result.distance
+        #     rv = result.vectors
+        #
+        #     td_list = np.array([])
+        #     tv_list = np.array([])
+        #     for d, v in zip(rd, rv):
+        #         td_list = np.append(td_list, d)
+        #         tv_list = np.append(tv_list, v)
+        #     d_list = np.add(d_list, td_list)
+        #     v_list = np.add(v_list, td_list)
+        #
+        # print(d_list)
+        # print(v_list)
+        # result_map = [d_list, v_list]
+        # top_k_result = search_index.top_k(result_map, k)
+        # return top_k_result
+        return result_list
 
 
-def GetIndexData(key):
+def get_index_data(key):
    return serialize.read_index(key)
```
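The TODO left in `__scheduler` is the cross-index merge: each `FaissSearch` returns its own top k, and the partial results still need to be combined into one global top k. A minimal sketch of that merge, assuming each result object exposes the `distance` and `vectors` attributes the commented-out code refers to (`merge_top_k` is a hypothetical helper, not part of this commit):

```python
import heapq

def merge_top_k(result_list, k):
    # Pool (distance, id) pairs from every partial result, then keep the
    # k globally smallest distances.
    candidates = []
    for result in result_list:
        candidates.extend(zip(result.distance, result.vectors))
    return heapq.nsmallest(k, candidates, key=lambda pair: pair[0])
```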
pyengine/engine/controller/tests/test_scheduler.py (+7 -7)

```diff
@@ -9,11 +9,10 @@ class TestScheduler(unittest.TestCase):
     def test_schedule(self):
         d = 64
         nb = 10000
-        nq = 100
+        nq = 2
         nt = 5000
         xt, xb, xq = get_dataset(d, nb, nt, nq)
 
-        file_name = "/tmp/faiss/tempfile_1"
+        file_name = "/tmp/tempfile_1"
         index = faiss.IndexFlatL2(d)
         print(index.is_trained)
@@ -61,5 +60,6 @@ def get_dataset(d, nb, nt, nq):
     x = x.astype('float32')
     return x[:nt], x[nt:-nq], x[-nq:]
 
 
 if __name__ == "__main__":
     unittest.main()
```
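For reference, `get_dataset` splits a single matrix into train/database/query blocks by slicing: the first `nt` rows train, the last `nq` rows query, and everything in between is the database. A toy illustration of that convention (sizes here are made up, not the test's):

```python
import numpy as np

nt, nq = 3, 2                                # training and query sizes
x = np.random.random((10, 4)).astype('float32')
xt, xb, xq = x[:nt], x[nt:-nq], x[-nq:]      # same slicing as get_dataset
assert len(xt) == nt and len(xq) == nq and len(xb) == 10 - nt - nq
```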
pyengine/engine/controller/vector_engine.py (+1 -1)

```diff
@@ -156,7 +156,7 @@ class VectorEngine(object):
         scheduler_instance = Scheduler()
         vectors = []
         vectors.append(vector)
-        result = scheduler_instance.Search(index_map, vectors, limit)
+        result = scheduler_instance.search(index_map, vectors, limit)
         vector_id = [0]
```
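Since `Scheduler` is built on the `Singleton` metaclass from scheduler.py, every call site like this one receives the same instance. A quick sketch of that behavior (the import path assumes this commit's layout):

```python
from engine.controller.scheduler import Scheduler

s1 = Scheduler()
s2 = Scheduler()
assert s1 is s2  # the metaclass caches one instance per class
```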
pyengine/engine/ingestion/build_index.py (+7 -7)

```diff
@@ -3,7 +3,7 @@ from enum import Enum, unique
 @unique
-class INDEX_DEVICES(Enum):
+class INDEXDEVICES(Enum):
     CPU = 0
     GPU = 1
     MULTI_GPU = 2
@@ -15,7 +15,7 @@ def FactoryIndex(index_name="DefaultIndex"):
 class Index():
-    def build(self, d, vectors, vector_ids, DEVICE=INDEX_DEVICES.CPU):
+    def build(self, d, vectors, vector_ids, DEVICE=INDEXDEVICES.CPU):
         pass
 
     @staticmethod
@@ -35,7 +35,7 @@ class DefaultIndex(Index):
         # maybe need to specif parameters
         pass
 
-    def build(self, d, vectors, vector_ids, DEVICE=INDEX_DEVICES.CPU):
+    def build(self, d, vectors, vector_ids, DEVICE=INDEXDEVICES.CPU):
         index = faiss.IndexFlatL2(d)  # trained
         index.add(vectors)
         return index
@@ -47,7 +47,7 @@ class LowMemoryIndex(Index):
         self.__bytes_per_vector = 8
         self.__bits_per_sub_vector = 8
 
-    def build(d, vectors, vector_ids, DEVICE=INDEX_DEVICES.CPU):
+    def build(d, vectors, vector_ids, DEVICE=INDEXDEVICES.CPU):
         # quantizer = faiss.IndexFlatL2(d)
         # index = faiss.IndexIVFPQ(quantizer, d, self.nlist,
         #                          self.__bytes_per_vector, self.__bits_per_sub_vector)
```
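The commented lines in `LowMemoryIndex.build` point at faiss's IVFPQ index, which trades accuracy for memory via product quantization. A sketch of what that build could look like, assuming the standard faiss API (the function and default values here are illustrative, not part of this commit):

```python
import faiss

def build_ivfpq(d, vectors, nlist=100, bytes_per_vector=8, bits_per_sub_vector=8):
    quantizer = faiss.IndexFlatL2(d)  # coarse quantizer over cluster centroids
    index = faiss.IndexIVFPQ(quantizer, d, nlist,
                             bytes_per_vector, bits_per_sub_vector)
    index.train(vectors)              # IVFPQ needs training, unlike IndexFlatL2
    index.add(vectors)
    return index
```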
pyengine/engine/ingestion/serialize.py (+4 -0)

```diff
 import faiss
 import numpy as np
 
+
 def write_index(index, file_name):
     faiss.write_index(index, file_name)
 
+
 def read_index(file_name):
     return faiss.read_index(file_name)
 
+
 def to_array(vec):
     return np.asarray(vec).astype('float32')
 
+
 def to_str_array(vec):
     return np.asarray(vec).astype('str')
```
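A round-trip sketch of the `write_index`/`read_index` helpers above; the temp path is illustrative only:

```python
import faiss
import numpy as np
from engine.ingestion import serialize

d = 64
index = faiss.IndexFlatL2(d)
index.add(np.random.random((100, d)).astype('float32'))

serialize.write_index(index, "/tmp/example_index")    # hypothetical path
restored = serialize.read_index("/tmp/example_index")
assert restored.ntotal == index.ntotal
```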
pyengine/engine/ingestion/tests/test_build.py (+2 -4)

```diff
@@ -65,7 +65,6 @@ class TestBuildIndex(unittest.TestCase):
         assert np.all(Dnew == Dref) and np.all(Inew == Iref)
 
 
 def get_dataset(d, nb, nt, nq):
     """A dataset that is not completely random but still challenging to
     index
@@ -83,6 +82,5 @@ def get_dataset(d, nb, nt, nq):
     return x[:nt], x[nt:-nq], x[-nq:]
 
 
 if __name__ == "__main__":
     unittest.main()
```
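The `Dnew == Dref` assertion in TestBuildIndex checks that an index rebuilt from disk returns exactly the same distances and neighbor ids as the original. A self-contained sketch of that pattern (the path is illustrative):

```python
import faiss
import numpy as np

d, k = 8, 3
xb = np.random.random((50, d)).astype('float32')
xq = np.random.random((5, d)).astype('float32')

index = faiss.IndexFlatL2(d)
index.add(xb)
Dref, Iref = index.search(xq, k)          # reference results

faiss.write_index(index, "/tmp/roundtrip_index")
restored = faiss.read_index("/tmp/roundtrip_index")
Dnew, Inew = restored.search(xq, k)       # results after the round trip
assert np.all(Dnew == Dref) and np.all(Inew == Iref)
```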
pyengine/engine/retrieval/search_index.py (+6 -3)

```diff
 import faiss
+import numpy as np
 
 
 class SearchResult():
@@ -32,7 +33,9 @@ class FaissSearch():
         D, I = self.__index.search(vector_list, k)
         return SearchResult(D, I)
 
 
-import heapq
+# import heapq
 def top_k(input, k):
-    #sorted = heapq.nsmallest(k, input, key=input.key)
     pass
+    # sorted = heapq.nsmallest(k, input, key=np.sum(input.get()))
+    # return sorted
```
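`FaissSearch.search_by_vectors` wraps faiss's raw output: `D` is an `(nq, k)` array of distances and `I` the matching array of neighbor ids. A minimal demonstration of those shapes:

```python
import faiss
import numpy as np

d, nb, nq, k = 8, 100, 3, 4
xb = np.random.random((nb, d)).astype('float32')
xq = np.random.random((nq, d)).astype('float32')

index = faiss.IndexFlatL2(d)
index.add(xb)
D, I = index.search(xq, k)  # what SearchResult(D, I) receives
assert D.shape == (nq, k) and I.shape == (nq, k)
```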
pyengine/engine/retrieval/tests/basic_test.py (deleted, 100644 → 0)

```diff
-# import numpy as np
-# d = 64                  # dimension
-# nb = 100000             # database size
-# nq = 10000              # nb of queries
-# np.random.seed(1234)    # make reproducible
-# xb = np.random.random((nb, d)).astype('float32')
-# xb[:, 0] += np.arange(nb) / 1000.
-# xq = np.random.random((nq, d)).astype('float32')
-# xq[:, 0] += np.arange(nq) / 1000.
-#
-# import faiss            # make faiss available
-#
-# res = faiss.StandardGpuResources()  # use a single GPU
-#
-# ## Using a flat index
-#
-# index_flat = faiss.IndexFlatL2(d)  # build a flat (CPU) index
-#
-# # make it a flat GPU index
-# gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat)
-#
-# gpu_index_flat.add(xb)  # add vectors to the index
-# print(gpu_index_flat.ntotal)
-#
-# k = 4                   # we want to see 4 nearest neighbors
-# D, I = gpu_index_flat.search(xq, k)  # actual search
-# print(I[:5])            # neighbors of the 5 first queries
-# print(I[-5:])           # neighbors of the 5 last queries
-#
-#
-# ## Using an IVF index
-#
-# nlist = 100
-# quantizer = faiss.IndexFlatL2(d)  # the other index
-# index_ivf = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
-# # here we specify METRIC_L2, by default it performs inner-product search
-#
-# # make it an IVF GPU index
-# gpu_index_ivf = faiss.index_cpu_to_gpu(res, 0, index_ivf)
-#
-# assert not gpu_index_ivf.is_trained
-# gpu_index_ivf.train(xb)  # add vectors to the index
-# assert gpu_index_ivf.is_trained
-#
-# gpu_index_ivf.add(xb)   # add vectors to the index
-# print(gpu_index_ivf.ntotal)
-#
-# k = 4                   # we want to see 4 nearest neighbors
-# D, I = gpu_index_ivf.search(xq, k)  # actual search
-# print(I[:5])            # neighbors of the 5 first queries
-# print(I[-5:])
-
-import numpy as np
-import pytest
-
-
-@pytest.mark.skip(reason="Not for pytest")
-def basic_test():
-    d = 64        # dimension
-    nb = 100000   # database size
-    nq = 10000    # nb of queries
-    np.random.seed(1234)  # make reproducible
-    xb = np.random.random((nb, d)).astype('float32')
-    xb[:, 0] += np.arange(nb) / 1000.
-    xc = np.random.random((nb, d)).astype('float32')
-    xc[:, 0] += np.arange(nb) / 1000.
-    xq = np.random.random((nq, d)).astype('float32')
-    xq[:, 0] += np.arange(nq) / 1000.
-
-    import faiss  # make faiss available
-    index = faiss.IndexFlatL2(d)  # build the index
-    print(index.is_trained)
-    index.add(xb)  # add vectors to the index
-    print(index.ntotal)
-
-    # faiss.write_index(index, "/tmp/faiss/tempfile_1")
-    writer = faiss.VectorIOWriter()
-    faiss.write_index(index, writer)
-    ar_data = faiss.vector_to_array(writer.data)
-    import pickle
-    pickle.dump(ar_data, open("/tmp/faiss/ser_1", "wb"))
-    # index_3 = pickle.load("/tmp/faiss/ser_1")
-
-    # index_2 = faiss.IndexFlatL2(d)  # build the index
-    # print(index_2.is_trained)
-    # index_2.add(xc)  # add vectors to the index
-    # print(index_2.ntotal)
-    # faiss.write_index(index, "/tmp/faiss/tempfile_2")
-    #
-    # index_3 = faiss.read_index
-
-    # k = 4  # we want to see 4 nearest neighbors
-    # D, I = index.search(xb[:5], k)  # sanity check
-    # print(I)
-    # print(D)
-    # D, I = index.search(xq, k)  # actual search
-    # print(I[:5])  # neighbors of the 5 first queries
-    # print(I[-5:])  # neighbors of the 5 last queries
-
-
-if __name__ == '__main__':
-    basic_test()
```
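The live part of the deleted test exercised in-memory serialization via `VectorIOWriter`. The matching read side uses `VectorIOReader`; a sketch, assuming a faiss build that exposes these SWIG helpers:

```python
import faiss
import numpy as np

d = 16
index = faiss.IndexFlatL2(d)
index.add(np.random.random((10, d)).astype('float32'))

writer = faiss.VectorIOWriter()
faiss.write_index(index, writer)
data = faiss.vector_to_array(writer.data)      # index bytes as a numpy array

reader = faiss.VectorIOReader()
faiss.copy_array_to_vector(data, reader.data)  # feed the bytes back in
restored = faiss.read_index(reader)
assert restored.ntotal == index.ntotal
```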
pyengine/engine/retrieval/tests/test_search.py (+3 -2)

```diff
@@ -3,6 +3,7 @@ from ..search_index import *
 import unittest
 import numpy as np
 
 
 class TestSearchSingleThread(unittest.TestCase):
     def test_search_by_vectors(self):
         d = 64
```