PaddlePaddle / Serving

Commit 1eca6c99, authored Sep 05, 2021 by TeslaZhao
Parent: 50730465

    add cube cache

Showing 10 changed files with 446 additions and 90 deletions (+446 -90)

core/configure/proto/server_configure.proto            +8   -5
core/general-server/op/general_dist_kv_infer_op.cpp    +100 -35
core/predictor/framework/CMakeLists.txt                +1   -1
core/predictor/framework/cache.cpp                     +112 -0
core/predictor/framework/cache.h                       +55  -0
core/predictor/framework/infer.cpp                     +18  -0
core/predictor/framework/infer.h                       +75  -24
core/predictor/framework/resource.cpp                  +11  -11
core/predictor/framework/resource.h                    +58  -13
paddle_inference/paddle/include/paddle_engine.h        +8   -1

core/configure/proto/server_configure.proto

@@ -61,11 +61,14 @@ message ResourceConf {
   repeated string model_toolkit_file = 2;
   repeated string general_model_path = 3;
   repeated string general_model_file = 4;
-  optional string cube_config_path = 5;
-  optional string cube_config_file = 6;
-  optional int32 cube_quant_bits = 7;  // set 0 if no quant.
-  optional string auth_product_name = 8;
-  optional string auth_container_id = 9;
+  optional string cube_config_path = 10;
+  optional string cube_config_file = 11;
+  optional int32 cube_quant_bits = 12;
+  optional string cube_cache_path = 13;
+  optional string auth_product_name = 20;
+  optional string auth_container_id = 21;
 };

 // DAG node dependency info
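
Note: the cube/auth fields move from numbers 5-9 to 10-13 and 20-21, and cube_cache_path is the one genuinely new field. Renumbering is harmless here because serving configs are parsed from protobuf text format, which matches fields by name rather than number. A minimal sketch of reading the new field through the generated proto2 accessors; the header path and the configure namespace are assumptions inferred from how resource.cpp uses ResourceConf, so treat them as illustrative:

#include <iostream>
#include "core/configure/server_configure.pb.h"  // assumed generated header path

void log_cube_cache_conf(
    const baidu::paddle_serving::configure::ResourceConf& conf) {
  // proto2 optional fields expose a has_*() presence check plus a getter
  if (conf.has_cube_cache_path()) {
    std::cout << "cube cache dir: " << conf.cube_cache_path() << "\n";
  } else {
    std::cout << "no local cube cache; every key goes to remote cube\n";
  }
}
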
core/general-server/op/general_dist_kv_infer_op.cpp

@@ -20,6 +20,7 @@
 #include <unordered_map>
 #include <utility>
 #include "core/cube/cube-api/include/cube_api.h"
+#include "core/predictor/framework/cache.h"
 #include "core/predictor/framework/infer.h"
 #include "core/predictor/framework/memory.h"
 #include "core/predictor/framework/resource.h"

@@ -36,10 +37,11 @@
 using baidu::paddle_serving::predictor::general_model::Request;
 using baidu::paddle_serving::predictor::InferManager;
 using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
+using baidu::paddle_serving::predictor::CubeCache;

+// DistKV Infer Op: seek cube and then call paddle inference
+// op seq: general_reader -> dist_kv_infer -> general_response
 int GeneralDistKVInferOp::inference() {
   VLOG(2) << "Going to run inference";
   const std::vector<std::string> pre_node_names = pre_names();
   if (pre_node_names.size() != 1) {

@@ -60,8 +62,8 @@
   GeneralBlob *output_blob = mutable_data<GeneralBlob>();
   if (!output_blob) {
     LOG(ERROR) << "(logid=" << log_id << ") output_blob is nullptr,error";
     return -1;
   }
   output_blob->SetLogId(log_id);

@@ -76,18 +78,24 @@
   TensorVector *out = &output_blob->tensor_vector;
   std::vector<uint64_t> keys;
+  std::vector<uint64_t> unique_keys;
   std::unordered_map<uint64_t, rec::mcube::CubeValue *> key_map;
   std::vector<rec::mcube::CubeValue> values;
-  int sparse_count = 0;  // sparse inputs counts, sparse would seek cube
-  int dense_count = 0;   // dense inputs counts, dense would directly call paddle infer
+  // sparse inputs counts, sparse would seek cube
+  int sparse_count = 0;
+  // dense inputs counts, dense would directly call paddle infer
+  int dense_count = 0;
   std::vector<std::pair<int64_t *, size_t>> dataptr_size_pairs;
   size_t key_len = 0;
   for (size_t i = 0; i < in->size(); ++i) {
     if (in->at(i).dtype != paddle::PaddleDType::INT64) {
+      // dense input type is not int64
       ++dense_count;
       continue;
     }
+    // sparse input type is int64
     ++sparse_count;
     size_t elem_num = 1;
     for (size_t s = 0; s < in->at(i).shape.size(); ++s) {
       elem_num *= in->at(i).shape[s];

@@ -107,33 +115,70 @@
     key_idx += dataptr_size_pairs[i].second;
   }
+
+  // filter duplicate keys
+  int unique_keys_count = 0;
+  for (size_t i = 0; i < keys.size(); ++i) {
+    if (key_map.find(keys[i]) == key_map.end()) {
+      key_map[keys[i]] = nullptr;
+      unique_keys[unique_keys_count++] = keys[i];
+    }
+  }
+  unique_keys.resize(unique_keys_count);
+  VLOG(1) << "(logid=" << log_id
+          << ") cube number of keys to look up: " << key_len
+          << " uniq keys: " << unique_keys_count;
+
+  // filter cached keys
+  size_t hit_counts = 0;
+  CubeCache *p_cube_cache =
+      InferManager::instance().get_cube_cache(engine_name().c_str());
+  if (p_cube_cache != nullptr) {
+    for (size_t i = 0; i < unique_keys_count; ++i) {
+      rec::mcube::CubeValue *hit_val = p_cube_cache->get_data(unique_keys[i]);
+      if (hit_val) {
+        LOG(WARNING) << "Hit one cache. key:" << unique_keys[i];
+        key_map[unique_keys[i]] = hit_val;
+        unique_keys[i] = 0;
+        ++hit_counts;
+      }
+    }
+  } else {
+    LOG(WARNING) << "get cube cache fail. model: " << engine_name();
+  }
+
+  // clear unique keys which hit caches
+  if (hit_counts > 0) {
+    for (auto it = unique_keys.begin(); it < unique_keys.end();) {
+      if (*it == 0) {
+        it = unique_keys.erase(it);
+        --unique_keys_count;
+      } else {
+        ++it;
+      }
+    }
+  }
+  LOG(WARNING) << "Hit " << hit_counts
+               << " keys in cube cache, unique_keys:" << unique_keys.size();
+
   // seek sparse params
   rec::mcube::CubeAPI *cube = rec::mcube::CubeAPI::instance();
   std::vector<std::string> table_names = cube->get_table_names();
   if (table_names.size() == 0) {
     LOG(ERROR) << "cube init error or cube config not given.";
     return -1;
   }
   int64_t seek_start = timeline.TimeStampUS();
-  int ret = cube->seek(table_names[0], keys, &values);
+  int ret = cube->seek(table_names[0], unique_keys, &values);
   int64_t seek_end = timeline.TimeStampUS();
   VLOG(2) << "(logid=" << log_id << ") cube seek status: " << ret
           << " seek_time: " << seek_end - seek_start;
+  for (size_t i = 0; i < unique_keys.size(); ++i) {
+    key_map[unique_keys[i]] = &values[i];
+  }
   if (values.size() != keys.size() || values[0].buff.size() == 0) {
     LOG(ERROR) << "cube value return null";
   }
-  //size_t EMBEDDING_SIZE = values[0].buff.size() / sizeof(float);
+  // size_t EMBEDDING_SIZE = values[0].buff.size() / sizeof(float);
   size_t EMBEDDING_SIZE = (values[0].buff.size() - 10) / sizeof(float);
   TensorVector sparse_out;
   sparse_out.resize(sparse_count);

@@ -145,16 +190,22 @@
   std::unordered_map<int, int> in_out_map;
   baidu::paddle_serving::predictor::Resource &resource =
       baidu::paddle_serving::predictor::Resource::instance();
   std::shared_ptr<PaddleGeneralModelConfig> model_config =
       resource.get_general_model_config().front();
   int cube_key_found = 0;
   int cube_key_miss = 0;
   for (size_t i = 0; i < in->size(); ++i) {
+    VLOG(2) << "i: " << i << ", sparse_idx: " << sparse_idx
+            << ", dense_idx: " << dense_idx;
+    VLOG(2) << "i: " << i << ", dtype: " << in->at(i).dtype;
     if (in->at(i).dtype != paddle::PaddleDType::INT64) {
       dense_out[dense_idx] = in->at(i);
       ++dense_idx;
       continue;
     }
+    VLOG(2) << "in->size: " << in->size() << ", ";
+    VLOG(2) << "lod.size: " << in->at(i).lod.size();
     sparse_out[sparse_idx].lod.resize(in->at(i).lod.size());
     for (size_t x = 0; x < sparse_out[sparse_idx].lod.size(); ++x) {
       sparse_out[sparse_idx].lod[x].resize(in->at(i).lod[x].size());

@@ -163,26 +214,39 @@
                 sparse_out[sparse_idx].lod[x].begin());
     }
     sparse_out[sparse_idx].dtype = paddle::PaddleDType::FLOAT32;
     sparse_out[sparse_idx].shape.push_back(sparse_out[sparse_idx].lod[0].back());
     sparse_out[sparse_idx].shape.push_back(EMBEDDING_SIZE);
     sparse_out[sparse_idx].name = model_config->_feed_name[i];
     sparse_out[sparse_idx].data.Resize(sparse_out[sparse_idx].lod[0].back() *
                                        EMBEDDING_SIZE * sizeof(float));
     float *dst_ptr = static_cast<float *>(sparse_out[sparse_idx].data.data());
+    if (!dst_ptr) {
+      VLOG(2) << "dst_ptr is null. sparse_idx:" << sparse_idx;
+      continue;
+    }
     for (int x = 0; x < sparse_out[sparse_idx].lod[0].back(); ++x) {
       float *data_ptr = dst_ptr + x * EMBEDDING_SIZE;
       uint64_t cur_key = keys[cube_val_idx];
+      VLOG(2) << "(logid=" << log_id << ") x: " << x
+              << ", sparse_idx: " << sparse_idx << " cur_key: " << cur_key
+              << ", cube_val_idx:" << cube_val_idx;
       rec::mcube::CubeValue *cur_val = key_map[cur_key];
       if (cur_val->buff.size() == 0) {
         memset(data_ptr, (float)0.0, sizeof(float) * EMBEDDING_SIZE);
         VLOG(3) << "(logid=" << log_id
                 << ") cube key not found: " << keys[cube_val_idx];
         ++cube_key_miss;
         ++cube_val_idx;
         continue;
       }
       VLOG(2) << "(logid=" << log_id << ") key: " << keys[cube_val_idx]
               << " , cube value len:" << cur_val->buff.size();
       memcpy(data_ptr, cur_val->buff.data(), cur_val->buff.size());
-      //VLOG(3) << keys[cube_val_idx] << ":" << data_ptr[0] << ", " << data_ptr[1] << ", " << data_ptr[2] << ", " << data_ptr[3] << ", " << data_ptr[4] << ", " << data_ptr[5] << ", " << data_ptr[6] << ", " << data_ptr[7] << ", " << data_ptr[8];
+      // VLOG(3) << keys[cube_val_idx] << ":" << data_ptr[0] << ", " <<
+      // data_ptr[1] << ", " << data_ptr[2] << ", " << data_ptr[3] << ", " <<
+      // data_ptr[4] << ", " << data_ptr[5] << ", " << data_ptr[6] << ", " <<
+      // data_ptr[7] << ", " << data_ptr[8];
       ++cube_key_found;
       ++cube_val_idx;
     }

@@ -191,10 +255,11 @@
   bool cube_fail = (cube_key_found == 0);
   if (cube_fail) {
     LOG(WARNING) << "(logid=" << log_id << ") cube seek fail";
-    //CopyBlobInfo(input_blob, output_blob);
-    //return 0;
+    // CopyBlobInfo(input_blob, output_blob);
+    // return 0;
   }
   VLOG(2) << "(logid=" << log_id << ") cube key found: " << cube_key_found
           << " , cube key miss: " << cube_key_miss;
   VLOG(2) << "(logid=" << log_id << ") sparse tensor load success.";
   timeline.Pause();
   VLOG(2) << "dist kv, cube and datacopy time: " << timeline.ElapsedUS();

@@ -209,21 +274,21 @@
   // call paddle inference here
   if (InferManager::instance().infer(
           engine_name().c_str(), &infer_in, out, batch_size)) {
     LOG(ERROR) << "(logid=" << log_id
                << ") Failed do infer in fluid model: " << engine_name();
     return -1;
   }
   int64_t end = timeline.TimeStampUS();
   if (cube_fail) {
     float *out_ptr = static_cast<float *>(out->at(0).data.data());
     out_ptr[0] = 0.0;
   }
   timeline.Pause();
   VLOG(2) << "dist kv, pure paddle infer time: " << timeline.ElapsedUS();
   CopyBlobInfo(input_blob, output_blob);
   AddBlobInfo(output_blob, start);
   AddBlobInfo(output_blob, end);
   return 0;
 }

 DEFINE_OP(GeneralDistKVInferOp);
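
Note: the new cache path changes the op's lookup flow to: deduplicate the raw sparse ids, satisfy what it can from the in-process CubeCache, and send only the remaining unique keys to the remote cube. A self-contained sketch of that flow, with std::string standing in for rec::mcube::CubeValue:

#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  std::vector<uint64_t> keys = {7, 3, 7, 9, 3};  // raw sparse ids, with repeats
  std::unordered_map<uint64_t, const std::string*> key_map;
  std::vector<uint64_t> unique_keys;

  // 1. keep the first occurrence of every key
  for (uint64_t k : keys)
    if (key_map.emplace(k, nullptr).second) unique_keys.push_back(k);

  // 2. resolve what we can from a local cache
  std::unordered_map<uint64_t, std::string> cache = {{7, "emb7"}};
  std::vector<uint64_t> miss;
  for (uint64_t k : unique_keys) {
    auto it = cache.find(k);
    if (it != cache.end()) key_map[k] = &it->second;  // cache hit
    else miss.push_back(k);                // only these go to cube->seek()
  }
  std::cout << "unique: " << unique_keys.size()
            << ", remote lookups: " << miss.size() << "\n";
}
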
core/predictor/framework/CMakeLists.txt

-FILE(GLOB framework_srcs ${CMAKE_CURRENT_LIST_DIR}/*.cpp)
+FILE(GLOB framework_srcs ${CMAKE_CURRENT_LIST_DIR}/*.cpp ${CMAKE_CURRENT_LIST_DIR}/../../cube/cube-builder/src/seqfile_reader.cpp)
 LIST(APPEND pdserving_srcs ${framework_srcs})
 LIST(APPEND pclient_srcs ${framework_srcs})
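
Note: adding seqfile_reader.cpp to the glob pulls the cube-builder's SequenceFile reader into the predictor framework targets; the new cache.cpp below needs its SequenceFileRecordReader to parse the part-* cache shards.
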
core/predictor/framework/cache.cpp (new file, mode 100644)

// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "core/predictor/framework/cache.h"
#include <dirent.h>
#include <sys/stat.h>
#include <cstring>
#include <fstream>
#include <string>
#include "core/cube/cube-builder/include/cube-builder/seqfile_reader.h"

namespace baidu {
namespace paddle_serving {
namespace predictor {

int CubeCache::clear() {
  // release owned values first; clearing the map inside the loop, as the
  // original draft did, would invalidate the iterator
  for (auto it = _map_cache.begin(); it != _map_cache.end(); ++it) {
    if (it->second) {
      delete (it->second);
      it->second = nullptr;
    }
  }
  _map_cache.clear();
  return 0;
}

rec::mcube::CubeValue* CubeCache::get_data(uint64_t key) {
  auto it = _map_cache.find(key);
  if (it != _map_cache.end()) {
    return it->second;
  }
  return nullptr;
}

int CubeCache::reload_data(const std::string& cache_path) {
  DIR* dp = nullptr;
  struct dirent* dirp = nullptr;
  struct stat st;

  // clear cache data
  clear();

  // loading data from cache files
  if (stat(cache_path.c_str(), &st) < 0 || !S_ISDIR(st.st_mode)) {
    LOG(ERROR) << "invalid cache path " << cache_path;
    return -1;
  }
  if ((dp = opendir(cache_path.c_str())) == nullptr) {
    LOG(ERROR) << "opendir " << cache_path << " fail.";
    return -1;
  }
  while ((dirp = readdir(dp)) != nullptr) {
    // filtering by file type.
    if (dirp->d_type != DT_REG) {
      continue;
    }
    // filter upper-level directories and hidden files
    if ((!strncmp(dirp->d_name, ".", 1)) || (!strncmp(dirp->d_name, "..", 2))) {
      continue;
    }
    // match files whose name contains 'part-'
    if (std::string(dirp->d_name).find("part-") != std::string::npos) {
      // join with cache_path so loading does not depend on the working dir
      std::string file_path = cache_path + "/" + dirp->d_name;
      SequenceFileRecordReader reader(file_path.c_str());
      if (reader.open() != 0) {
        LOG(ERROR) << "open file failed! " << dirp->d_name;
        continue;
      }
      if (reader.read_header() != 0) {
        LOG(ERROR) << "read header error! " << dirp->d_name;
        reader.close();
        continue;
      }
      Record record(reader.get_header());
      while (reader.next(&record) == 0) {
        // the record key stores a raw 8-byte integer
        uint64_t key =
            *reinterpret_cast<uint64_t*>(const_cast<char*>(record.key.data()));
        auto it_find = _map_cache.find(key);
        if (it_find != _map_cache.end()) {
          // loaded a duplicate key
          LOG(WARNING) << "Load duplicate key:" << key
                       << " from file:" << dirp->d_name;
          continue;
        }
        rec::mcube::CubeValue* new_value = new rec::mcube::CubeValue();
        new_value->error = 0;
        new_value->buff = record.value;
        _map_cache.insert(std::make_pair(key, new_value));
      }
      LOG(WARNING) << "Load cube cache file " << dirp->d_name << " done.";
    }
  }
  closedir(dp);
  LOG(WARNING) << "Load all cube cache files done";
  return 0;
}

}  // namespace predictor
}  // namespace paddle_serving
}  // namespace baidu
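
Note: the loader assumes each SequenceFile record key holds a raw 8-byte integer in host byte order. A self-contained illustration of that decode, with std::string standing in for record.key; memcpy avoids the alignment hazard of casting string storage directly:

#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>

int main() {
  uint64_t original = 0x0123456789abcdefULL;
  // serialize the key the way the cache shards store it
  std::string raw(reinterpret_cast<char*>(&original), sizeof(original));
  // same decode as reload_data(), expressed with memcpy
  uint64_t key = 0;
  std::memcpy(&key, raw.data(), sizeof(key));
  std::cout << std::hex << key << "\n";  // prints 123456789abcdef
}
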
core/predictor/framework/cache.h (new file, mode 100644)

// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
// (Apache-2.0 license header, same as cache.cpp above)

#pragma once

#include <sys/types.h>
#include <numeric>
#include <string>
#include <unordered_map>
#include "core/cube/cube-api/include/cube_api.h"

namespace baidu {
namespace paddle_serving {
namespace predictor {

// Large models that use sparse parameters may use a cube cache.
// When the cube cache exists, the model is required to be consistent with
// the version of the cube cache, so when the model is updated, the model
// and the cube cache must be reloaded at the same time.
// All cached data is loaded at once and never updated in place, which
// allows lock-free switching between two cube caches.
class CubeCache {
 public:
  CubeCache() {}

  ~CubeCache() { clear(); }

  // clear cache data.
  int clear();

  // get cache data by key
  rec::mcube::CubeValue* get_data(uint64_t key);

  // reload all cache files from cache_path
  int reload_data(const std::string& cache_path);

 private:
  // lock-free switching; key type is uint64_t, value type is CubeValue*
  std::unordered_map<uint64_t, rec::mcube::CubeValue*> _map_cache;
};

}  // namespace predictor
}  // namespace paddle_serving
}  // namespace baidu
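
A minimal usage sketch, assuming model_dir/cube_cache holds SequenceFile part-* shards (the directory name the reload path in infer.h appends); the model path and key below are hypothetical:

#include "core/predictor/framework/cache.h"

using baidu::paddle_serving::predictor::CubeCache;

int lookup_example() {
  CubeCache cache;
  // load every part-* shard under the model's cube_cache directory
  if (cache.reload_data("./uci_housing_model/cube_cache") != 0) {  // hypothetical path
    return -1;
  }
  rec::mcube::CubeValue* val = cache.get_data(9527);  // hypothetical key
  if (val != nullptr) {
    // val->buff holds the serialized embedding bytes
    return static_cast<int>(val->buff.size());
  }
  return 0;  // miss: fall back to a remote cube seek
}
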
core/predictor/framework/infer.cpp

@@ -363,6 +363,15 @@ T* VersionedInferEngine::get_core(uint64_t version) {
   return NULL;
 }

+CubeCache* VersionedInferEngine::get_cube_cache() {
+  InferEngine* engine = default_engine();
+  if (!engine) {
+    LOG(WARNING) << "fail to get default engine";
+    return nullptr;
+  }
+  return engine->get_cube_cache();
+}
+
 int VersionedInferEngine::proc_initialize_impl(
     const configure::EngineDesc& conf, bool) {
   return -1;

@@ -502,6 +511,15 @@ T* InferManager::get_core(const char* model_name) {
   return NULL;
 }

+CubeCache* InferManager::get_cube_cache(const char* model_name) {
+  auto it = _map.find(model_name);
+  if (it == _map.end()) {
+    LOG(WARNING) << "Cannot find engine in map, model name:" << model_name;
+    return nullptr;
+  }
+  return it->second->get_cube_cache();
+}
+
 // Versioned inference interface
 int InferManager::infer(const char* model_name, const void* in,
core/predictor/framework/infer.h

@@ -25,6 +25,7 @@
 #include <vector>
 #include "core/predictor/common/inner_common.h"
 #include "core/predictor/framework/bsf.h"
+#include "core/predictor/framework/cache.h"
 #include "core/predictor/framework/factory.h"
 #include "core/predictor/framework/infer_data.h"
 #include "core/predictor/framework/memory.h"

@@ -35,6 +36,7 @@
 using configure::ModelToolkitConf;

+// Auto mutex lock
 class AutoLock {
  public:
   explicit AutoLock(pthread_mutex_t& mutex) : _mut(mutex) {

@@ -46,6 +48,7 @@
   pthread_mutex_t& _mut;
 };

+// Global singleton mutex lock
 class GlobalCreateMutex {
  public:
   pthread_mutex_t& mutex() { return _mut; }

@@ -60,6 +63,7 @@
   pthread_mutex_t _mut;
 };

+// InferEngine
 class InferEngine {
  public:
   virtual ~InferEngine() {}

@@ -90,11 +94,13 @@
                      void* out,
                      uint32_t batch_size = -1) = 0;
   virtual int task_infer_impl(const void* in, void* out) = 0;  // NOLINT
+  virtual CubeCache* get_cube_cache() = 0;

  protected:
   uint32_t _model_index;
   // end: framework inner call
 };

 typedef im::bsf::Task<paddle::PaddleTensor, paddle::PaddleTensor> TaskT;

 class ReloadableInferEngine : public InferEngine {
  public:

@@ -169,12 +175,12 @@
   uint64_t _version;
 };

-// Lock free switching two models
+// Lock free switching two models and cube caches
 template <typename EngineCore>
 struct ModelData {
   ModelData() : current_idx(1) {
-    cores[0] = NULL;
-    cores[1] = NULL;
+    cores[0] = nullptr;
+    cores[1] = nullptr;
   }
   ~ModelData() {

@@ -182,9 +188,12 @@
     delete cores[1];
   }

-  void* get() { return cores[current_idx]->get(); }
+  void* get_core() { return cores[current_idx]->get(); }
+  CubeCache* get_cache() { return &caches[current_idx]; }

   EngineCore* cores[2];
+  CubeCache caches[2];
   uint32_t current_idx;
 };

@@ -196,7 +205,7 @@
   int proc_initialize(const configure::EngineDesc& conf, bool version) {
     THREAD_KEY_CREATE(&_skey, NULL);
     THREAD_MUTEX_INIT(&_mutex, NULL);
-    gpu_index = 0;
+    _gpu_index = 0;
     return ReloadableInferEngine::proc_initialize(conf, version);
   }

@@ -209,7 +218,7 @@
     if (_reload_vec.empty()) {
       return 0;
     }
-    gpu_index = 0;
+    _gpu_index = 0;
     for (uint32_t ti = 0; ti < _reload_vec.size(); ++ti) {
       if (load_data(_reload_vec[ti], conf) != 0) {
         LOG(ERROR) << "Failed reload engine model: " << ti;

@@ -224,26 +233,41 @@
   virtual int load_data(ModelData<EngineCore>* md,
                         const configure::EngineDesc& conf) {
     uint32_t next_idx = (md->current_idx + 1) % 2;
+
+    // reload engine core
     if (md->cores[next_idx]) {
       delete md->cores[next_idx];
     }
     md->cores[next_idx] = new (std::nothrow) EngineCore;

     // params.dump();
     size_t gpu_ids_num = conf.gpu_ids_size();
     im::bsf::AutoMutex lock(_mutex);
     int gpu_id = -1;
     if (gpu_ids_num > 0) {
-      gpu_id = conf.gpu_ids(gpu_index % gpu_ids_num);
+      gpu_id = conf.gpu_ids(_gpu_index % gpu_ids_num);
     }
     if (!md->cores[next_idx] ||
         md->cores[next_idx]->create(conf, gpu_id) != 0) {
       LOG(ERROR) << "Failed create model, path: " << conf.model_dir();
       return -1;
     }
-    gpu_index++;
+    _gpu_index++;
+    LOG(WARNING) << "Reload EngineCore[" << next_idx << "] finish.";
+
+    // reload cube cache
+    std::string model_path = conf.model_dir();
+    if (access(model_path.c_str(), F_OK) == 0) {
+      std::string cube_cache_path = model_path + "cube_cache";
+      int reload_cache_ret = md->caches[next_idx].reload_data(cube_cache_path);
+      LOG(WARNING) << "Reload cube cache[" << next_idx << "] finish.";
+    } else {
+      LOG(ERROR) << "model_path " << model_path
+                 << " does not exist. Ignore cube cache!";
+    }
+
     md->current_idx = next_idx;
+    LOG(WARNING) << "Reload model and cube cache done. switching to "
+                 << "current_idx[" << next_idx << "]";
     return 0;
   }

@@ -309,11 +333,25 @@
     return md->cores[md->current_idx];
   }

+  CubeCache* get_cube_cache() {
+    ModelData<EngineCore>* md =
+        (ModelData<EngineCore>*)THREAD_GETSPECIFIC(_skey);
+    if (!md) {
+      LOG(ERROR) << "Failed get thread specific data";
+      return NULL;
+    }
+    return md->get_cache();
+  }
+
  protected:
   THREAD_KEY_T _skey;
   THREAD_MUTEX_T _mutex;
+  // vector of all model engines
   std::vector<ModelData<EngineCore>*> _reload_vec;
-  int gpu_index = 0;
+  // gpu card id
+  int _gpu_index = 0;
 };

 // Multiple EngineCore instances share the same model data

@@ -347,41 +385,42 @@ class CloneDBReloadableInferEngine
     im::bsf::AutoMutex lock(DBReloadableInferEngine<EngineCore>::_mutex);
     int gpu_id = -1;
     if (gpu_ids_num > 0) {
-      gpu_id = conf.gpu_ids(DBReloadableInferEngine<EngineCore>::gpu_index %
+      gpu_id = conf.gpu_ids(DBReloadableInferEngine<EngineCore>::_gpu_index %
                             gpu_ids_num);
     } else {
       gpu_ids_num = 1;
     }

-    // gpu_index will be set to be 0, when load() or proc_initial() is called.
-    // gpu_index < gpu_ids_num, means there are predictors still not created
+    // _gpu_index will be set to be 0, when load() or proc_initial() is called.
+    // _gpu_index < gpu_ids_num, means there are predictors still not created
     // on some GPU card.
     // so we need to create the predictor.
-    // gpu_index >= gpu_ids_num, means each GPU card has already created one.
+    // _gpu_index >= gpu_ids_num, means each GPU card has already created one.
     // so we need to clone the predictor.
-    if (DBReloadableInferEngine<EngineCore>::gpu_index < gpu_ids_num) {
+    if (DBReloadableInferEngine<EngineCore>::_gpu_index < gpu_ids_num) {
       if (!md->cores[next_idx] ||
           md->cores[next_idx]->create(conf, gpu_id) != 0) {
         LOG(ERROR) << "Failed create model, path: " << conf.model_dir();
         return -1;
       }
-      DBReloadableInferEngine<EngineCore>::gpu_index++;
+      DBReloadableInferEngine<EngineCore>::_gpu_index++;
       md->current_idx = next_idx;
       if (_cloneTemplate.size() <
-          DBReloadableInferEngine<EngineCore>::gpu_index) {
+          DBReloadableInferEngine<EngineCore>::_gpu_index) {
         _cloneTemplate.push_back(md);
       } else {
-        _cloneTemplate[DBReloadableInferEngine<EngineCore>::gpu_index - 1] = md;
+        _cloneTemplate[DBReloadableInferEngine<EngineCore>::_gpu_index - 1] =
+            md;
       }
     } else {
-      int template_index = DBReloadableInferEngine<EngineCore>::gpu_index %
+      int template_index = DBReloadableInferEngine<EngineCore>::_gpu_index %
                            _cloneTemplate.size();
       if (!md->cores[next_idx] ||
-          md->cores[next_idx]->clone(_cloneTemplate[template_index]->get()) != 0) {
+          md->cores[next_idx]->clone(
+              _cloneTemplate[template_index]->get_core()) != 0) {
         LOG(ERROR) << "Failed clone model from core";
         return -1;
       }
-      DBReloadableInferEngine<EngineCore>::gpu_index++;
+      DBReloadableInferEngine<EngineCore>::_gpu_index++;
       md->current_idx = next_idx;
       LOG(WARNING) << "core clone model succ, cur_idx[" << md->current_idx
                    << "].";

@@ -532,6 +571,10 @@ class FluidInferEngine : public CloneDBReloadableInferEngine<EngineCore> {
   int task_infer_impl(const void* in, void* out) {  // NOLINT
     return infer_impl(in, out);
   }
+
+  CubeCache* get_cube_cache() {
+    return DBReloadableInferEngine<EngineCore>::get_cube_cache();
+  }
 };

 typedef FactoryPool<InferEngine> StaticInferFactory;

@@ -565,6 +608,8 @@ class VersionedInferEngine : public InferEngine {
   template <typename T>
   T* get_core();

+  CubeCache* get_cube_cache();
+
   // versioned inference interface
   int infer(const void* in, void* out, uint32_t batch_size, uint64_t version);

@@ -616,9 +661,13 @@ class InferManager {
             void* out,
             uint32_t batch_size = -1);

+  // get engine core
   template <typename T>
   T* get_core(const char* model_name);

+  // get cube cache
+  CubeCache* get_cube_cache(const char* model_name);
+
   // Versioned inference interface
   int infer(const char* model_name, const void* in,

@@ -626,9 +675,11 @@
             uint32_t batch_size, uint64_t version);

+  // Versioned get engine core
   template <typename T>
   T* get_core(const char* model_name, uint64_t version);

+  // query model version
   int query_version(const std::string& model, uint64_t& version);

  private:
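
Note: the reload scheme in ModelData is classic double buffering: the reload thread fills the inactive slot, then flips current_idx so readers pick up the new model and its matching cube cache together. A reduced sketch of the idea; the names here are illustrative stand-ins, not the framework's API:

#include <cstdint>
#include <string>

// stand-ins for EngineCore / CubeCache
struct Core  { int create(const std::string& dir) { return 0; } };
struct Cache { int reload_data(const std::string& dir) { return 0; } };

struct DoubleBuffered {
  Core*    cores[2] = {nullptr, nullptr};
  Cache    caches[2];
  uint32_t current_idx = 1;
  // (destructor omitted for brevity)

  // writer side: prepare the idle slot, then publish it
  int reload(const std::string& model_dir) {
    uint32_t next = (current_idx + 1) % 2;
    delete cores[next];
    cores[next] = new Core();
    if (cores[next]->create(model_dir) != 0) return -1;
    caches[next].reload_data(model_dir + "cube_cache");
    current_idx = next;  // single index write: readers see old or new pair
    return 0;
  }

  // reader side: always take core and cache from the same slot, so the
  // embedding cache version matches the model version
  Core*  core()  { return cores[current_idx]; }
  Cache* cache() { return &caches[current_idx]; }
};
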
core/predictor/framework/resource.cpp

@@ -165,18 +165,18 @@ int Resource::initialize(const std::string& path, const std::string& file) {
     rec::mcube::CubeAPI* cube = rec::mcube::CubeAPI::instance();
     std::string cube_config_fullpath = "./" + resource_conf.cube_config_path() +
                                        "/" + resource_conf.cube_config_file();
-    this->cube_config_fullpath = cube_config_fullpath;
-    this->cube_quant_bits = resource_conf.has_cube_quant_bits()
-                                ? resource_conf.cube_quant_bits()
-                                : 0;
-    if (this->cube_quant_bits != 0 && this->cube_quant_bits != 8) {
+    this->_cube_config_fullpath = cube_config_fullpath;
+    this->_cube_quant_bits = resource_conf.has_cube_quant_bits()
+                                 ? resource_conf.cube_quant_bits()
+                                 : 0;
+    if (this->_cube_quant_bits != 0 && this->_cube_quant_bits != 8) {
       LOG(ERROR) << "Cube quant bits illegal! should be 0 or 8.";
       return -1;
     }
-    if (this->cube_quant_bits == 0) {
+    if (this->_cube_quant_bits == 0) {
       LOG(INFO) << "cube quant mode OFF";
     } else {
-      LOG(INFO) << "cube quant mode ON, quant bits: " << this->cube_quant_bits;
+      LOG(INFO) << "cube quant mode ON, quant bits: " << this->_cube_quant_bits;
     }

@@ -187,10 +187,10 @@ int Resource::initialize(const std::string& path, const std::string& file) {
 // model config
 int Resource::general_model_initialize(const std::string& path,
                                        const std::string& file) {
-  if (this->cube_config_fullpath.size() != 0) {
-    LOG(INFO) << "init cube by config file : " << this->cube_config_fullpath;
+  if (this->_cube_config_fullpath.size() != 0) {
+    LOG(INFO) << "init cube by config file : " << this->_cube_config_fullpath;
     rec::mcube::CubeAPI* cube = rec::mcube::CubeAPI::instance();
-    int ret = cube->init(this->cube_config_fullpath.c_str());
+    int ret = cube->init(this->_cube_config_fullpath.c_str());
     if (ret != 0) {
       LOG(ERROR) << "cube init error";
       return -1;

@@ -315,7 +315,7 @@ int Resource::thread_clear() {
   }
   return 0;
 }
-size_t Resource::get_cube_quant_bits() { return this->cube_quant_bits; }
+size_t Resource::get_cube_quant_bits() { return this->_cube_quant_bits; }

 int Resource::reload() {
   if (FLAGS_enable_model_toolkit && InferManager::instance().reload() != 0) {
core/predictor/framework/resource.h

@@ -16,8 +16,10 @@
 #include <map>
 #include <memory>
 #include <string>
+#include <unordered_map>
+#include <utility>
 #include <vector>
 #include "core/cube/cube-api/include/cube_api.h"
 #include "core/predictor/common/inner_common.h"
 #include "core/predictor/framework/infer.h"

@@ -27,6 +29,8 @@
 namespace paddle_serving {
 namespace predictor {

+// Paddle general model configuration, read the model configuration
+// information from the general_model_config.proto file
 class PaddleGeneralModelConfig {
  public:
   PaddleGeneralModelConfig() {}

@@ -34,23 +38,47 @@
   ~PaddleGeneralModelConfig() {}

  public:
+  // feed/fetch name and alias_name
   std::vector<std::string> _feed_name;
   std::vector<std::string> _feed_alias_name;
-  std::vector<int> _feed_type;      // 0 int64, 1 float
-  std::vector<bool> _is_lod_feed;   // true lod tensor
-  std::vector<bool> _is_lod_fetch;  // whether a fetch var is lod_tensor
-  std::vector<int> _capacity;       // capacity for each tensor
-  /*
-    feed_shape_ for feeded variable
-    feed_shape_[i][j] represents the jth dim for ith input Tensor
-    if is_lod_feed_[i] == False, feed_shape_[i][0] = -1
-  */
-  std::vector<std::vector<int>> _feed_shape;
   std::vector<std::string> _fetch_name;
   std::vector<std::string> _fetch_alias_name;
+
+  // Be consistent with model saving interface var type conversion
+  // (python/paddle_serving_client/io/__init__)
+  // int64 => 0;
+  // float32 => 1;
+  // int32 => 2;
+  // float64 => 3;
+  // int16 => 4;
+  // float16 => 5;
+  // bfloat16 => 6;
+  // uint8 => 7;
+  // int8 => 8;
+  // bool => 9;
+  // complex64 => 10,
+  // complex128 => 11;
+  std::vector<int> _feed_type;
+
+  // whether a feed or fetch var is lod_tensor.
+  std::vector<bool> _is_lod_feed;
+  std::vector<bool> _is_lod_fetch;
+
+  // capacity for each tensor
+  std::vector<int> _capacity;
+
+  // _feed_shape and _fetch_shape are used to represent the dimensional
+  // information of tensors.
+  // for example, feed_shape_[i][j] represents the j(th) dim of the i(th)
+  // input tensor.
+  // if is_lod_feed_[i] == False, feed_shape_[i][0] = -1
+  std::vector<std::vector<int>> _feed_shape;
+  std::vector<std::vector<int>> _fetch_shape;
+
+  // fetch name -> index of the fetch_name vector.
+  std::map<std::string, int> _fetch_name_to_index;
+  // fetch alias name -> index of the fetch_alias_name vector.
+  std::map<std::string, int> _fetch_alias_name_to_index;
 };

@@ -73,33 +101,50 @@ class Resource {
     return ins;
   }

+  // initialize resource
   int initialize(const std::string& path, const std::string& file);
+
+  // load all model configurations from prototxt
   int general_model_initialize(const std::string& path,
                               const std::string& file);
+
+  // initialize thread local data
   int thread_initialize();
+
+  // clear thread local data
   int thread_clear();
+
+  // reload resources
   int reload();
+
+  // finalize
   int finalize();
+
+  // get all model configs
   std::vector<std::shared_ptr<PaddleGeneralModelConfig>>
   get_general_model_config();
+
+  // print all configurations of all models
   void print_general_model_config(
       const std::shared_ptr<PaddleGeneralModelConfig>& config);
+
+  // get cube quantization bit size
   size_t get_cube_quant_bits();

 private:
   int thread_finalize() { return 0; }

 private:
+  // configuration information of all models, loaded from prototxt files
   std::vector<std::shared_ptr<PaddleGeneralModelConfig>> _configs;
-  std::string cube_config_fullpath;
-  int cube_quant_bits;  // 0 if no empty
+
+  // full path of the cube configuration file.
+  std::string _cube_config_fullpath;
+
+  // cube quantization bit size, supports 0/8. set 0 if no quant.
+  size_t _cube_quant_bits;

+  // bthread local key
   THREAD_KEY_T _tls_bspec_key;
 };
paddle_inference/paddle/include/paddle_engine.h

@@ -94,7 +94,9 @@ const std::string getFileBySuffix(
   return fileName;
 }

-// Engine Base
+// EngineCore is the base class of inference engines; it can be derived for
+// the Paddle Inference engine, or for the inference engines of other
+// machine learning platforms
 class EngineCore {
  public:
   virtual ~EngineCore() {}

@@ -141,6 +143,11 @@ class EngineCore {
   virtual void* get() { return _predictor.get(); }

  protected:
+  // _predictor is a prediction instance of Paddle Inference.
+  // when inferring on the CPU, _predictor is bound to a model.
+  // when inferring on the GPU, _predictor is bound to a model and a GPU card.
+  // Therefore, when using GPU multi-card inference, you need to create
+  // multiple EngineCore instances.
   std::shared_ptr<Predictor> _predictor;
 };
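
Note: since each GPU-bound predictor pairs with exactly one card, multi-card serving means one EngineCore (one predictor) per card. A schematic of that pattern using the public paddle_infer API; the memory budget and helper name are illustrative:

#include <memory>
#include <string>
#include <vector>
#include "paddle_inference_api.h"

std::vector<std::shared_ptr<paddle_infer::Predictor>> create_per_card(
    const std::string& model_dir, const std::vector<int>& gpu_ids) {
  std::vector<std::shared_ptr<paddle_infer::Predictor>> predictors;
  for (int dev_id : gpu_ids) {
    paddle_infer::Config config;
    config.SetModel(model_dir);
    config.EnableUseGpu(100 /* initial MB of GPU memory */, dev_id);
    // one predictor (hence one EngineCore) bound to this card
    predictors.push_back(paddle_infer::CreatePredictor(config));
  }
  return predictors;
}
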