Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
ac897177
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
ac897177
编写于
7月 13, 2020
作者:
Q
Qi Li
提交者:
GitHub
7月 13, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[NPU] enhance cache offline model, test=develop (#3805) (#3931)
* [NPU] enhance cache offline model, test=develop
上级
813f17ba
变更
22
显示空白变更内容
内联
并排
Showing
22 changed file
with
841 addition
and
380 deletion
+841
-380
.gitignore
.gitignore
+3
-0
cmake/cross_compiling/android.cmake
cmake/cross_compiling/android.cmake
+5
-1
lite/api/cxx_api_impl.cc
lite/api/cxx_api_impl.cc
+4
-0
lite/api/light_api_impl.cc
lite/api/light_api_impl.cc
+5
-0
lite/api/paddle_api.h
lite/api/paddle_api.h
+9
-0
lite/backends/npu/device.cc
lite/backends/npu/device.cc
+103
-33
lite/backends/npu/device.h
lite/backends/npu/device.h
+10
-5
lite/core/context.cc
lite/core/context.cc
+4
-0
lite/core/context.h
lite/core/context.h
+10
-0
lite/core/mir/subgraph/subgraph_detector.cc
lite/core/mir/subgraph/subgraph_detector.cc
+41
-66
lite/core/mir/subgraph/subgraph_pass_test.cc
lite/core/mir/subgraph/subgraph_pass_test.cc
+2
-0
lite/kernels/npu/bridges/engine.cc
lite/kernels/npu/bridges/engine.cc
+75
-38
lite/kernels/npu/bridges/engine.h
lite/kernels/npu/bridges/engine.h
+12
-21
lite/kernels/npu/bridges/graph.h
lite/kernels/npu/bridges/graph.h
+1
-1
lite/kernels/npu/bridges/matmul_op.cc
lite/kernels/npu/bridges/matmul_op.cc
+4
-4
lite/kernels/npu/bridges/utility.h
lite/kernels/npu/bridges/utility.h
+16
-15
lite/kernels/npu/subgraph_compute.cc
lite/kernels/npu/subgraph_compute.cc
+309
-167
lite/kernels/npu/subgraph_compute.h
lite/kernels/npu/subgraph_compute.h
+49
-24
lite/utils/env.h
lite/utils/env.h
+2
-0
lite/utils/io.h
lite/utils/io.h
+35
-0
lite/utils/md5.h
lite/utils/md5.h
+104
-0
lite/utils/string.h
lite/utils/string.h
+38
-5
未找到文件。
.gitignore
浏览文件 @
ac897177
...
...
@@ -117,3 +117,6 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models
metal/MobileNetDemo/MobileNetDemo/Resources
build*
# hiai libs
ai_ddk_lib*
cmake/cross_compiling/android.cmake
浏览文件 @
ac897177
...
...
@@ -35,8 +35,12 @@ endif()
if
(
NOT DEFINED ANDROID_API_LEVEL
)
set
(
ANDROID_API_LEVEL
"23"
)
if
(
ARM_TARGET_ARCH_ABI STREQUAL
"armv7"
)
if
(
LITE_WITH_NPU AND NOT LITE_ON_TINY_PUBLISH
)
set
(
ANDROID_API_LEVEL
"24"
)
# HIAI DDK depends on android-24
else
()
set
(
ANDROID_API_LEVEL
"22"
)
endif
()
endif
()
endif
()
# then check input arm abi
...
...
lite/api/cxx_api_impl.cc
浏览文件 @
ac897177
...
...
@@ -70,6 +70,10 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
raw_predictor_
.
Build
(
config
,
places
,
passes
);
mode_
=
config
.
power_mode
();
threads_
=
config
.
threads
();
#ifdef LITE_WITH_NPU
Context
<
TargetType
::
kNPU
>::
SetSubgraphModelCacheDir
(
config
.
subgraph_model_cache_dir
());
#endif
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__)
int
num_threads
=
config
.
x86_math_library_num_threads
();
...
...
lite/api/light_api_impl.cc
浏览文件 @
ac897177
...
...
@@ -36,6 +36,11 @@ void LightPredictorImpl::Init(const lite_api::MobileConfig& config) {
}
mode_
=
config
.
power_mode
();
threads_
=
config
.
threads
();
#ifdef LITE_WITH_NPU
Context
<
TargetType
::
kNPU
>::
SetSubgraphModelCacheDir
(
config
.
subgraph_model_cache_dir
());
#endif
}
std
::
unique_ptr
<
lite_api
::
Tensor
>
LightPredictorImpl
::
GetInput
(
int
i
)
{
...
...
lite/api/paddle_api.h
浏览文件 @
ac897177
...
...
@@ -118,6 +118,8 @@ class LITE_API ConfigBase {
std
::
string
model_dir_
;
int
threads_
{
1
};
PowerMode
mode_
{
LITE_POWER_NO_BIND
};
// to save subgraph model for npu/xpu/...
std
::
string
subgraph_model_cache_dir_
{
""
};
public:
explicit
ConfigBase
(
PowerMode
mode
=
LITE_POWER_NO_BIND
,
int
threads
=
1
);
...
...
@@ -130,6 +132,13 @@ class LITE_API ConfigBase {
// set Thread
void
set_threads
(
int
threads
);
int
threads
()
const
{
return
threads_
;
}
// Set the subgraph model cache directory: stores the given path in
// subgraph_model_cache_dir_.
void set_subgraph_model_cache_dir(std::string subgraph_model_cache_dir) {
  subgraph_model_cache_dir_.assign(subgraph_model_cache_dir);
}
// Returns a const reference to the stored subgraph model cache directory
// (subgraph_model_cache_dir_).
const
std
::
string
&
subgraph_model_cache_dir
()
const
{
return
subgraph_model_cache_dir_
;
}
};
/// CxxConfig is the config for the Full feature predictor.
...
...
lite/backends/npu/device.cc
浏览文件 @
ac897177
...
...
@@ -19,52 +19,122 @@ namespace paddle {
namespace
lite
{
namespace
npu
{
std
::
shared_ptr
<
hiai
::
AiModelMngerClient
>
Device
::
Build
(
const
std
::
string
model_name
,
// NOLINT
std
::
vector
<
ge
::
Operator
>&
input_nodes
,
// NOLINT
std
::
vector
<
ge
::
Operator
>&
output_nodes
// NOLINT
)
{
VLOG
(
3
)
<<
"[NPU] Build model"
;
// Build the HiAI IR graph to the HiAI om model
ge
::
Graph
ir_graph
(
"graph"
);
ir_graph
.
SetInputs
(
input_nodes
).
SetOutputs
(
output_nodes
);
ge
::
Model
om_model
(
"model"
,
"model"
);
om_model
.
SetGraph
(
ir_graph
);
domi
::
HiaiIrBuild
ir_build
;
domi
::
ModelBufferData
om_model_buf
;
if
(
!
ir_build
.
CreateModelBuff
(
om_model
,
om_model_buf
))
{
LOG
(
WARNING
)
<<
"[NPU] CreateModelBuff failed!"
;
return
nullptr
;
}
if
(
!
ir_build
.
BuildIRModel
(
om_model
,
om_model_buf
))
{
LOG
(
WARNING
)
<<
"[NPU] BuildIRModel failed!"
;
ir_build
.
ReleaseModelBuff
(
om_model_buf
);
return
nullptr
;
}
std
::
shared_ptr
<
hiai
::
AiModelMngerClient
>
Device
::
Load
(
const
std
::
string
&
model_name
,
std
::
vector
<
char
>*
model_buffer
,
bool
*
model_comp
)
{
// Create a HiAI model manager client to load the HiAI om model
std
::
shared_ptr
<
hiai
::
AiModelMngerClient
>
model_client
(
new
hiai
::
AiModelMngerClient
());
auto
model_client
=
std
::
make_shared
<
hiai
::
AiModelMngerClient
>
();
if
(
model_client
->
Init
(
nullptr
)
!=
hiai
::
AI_SUCCESS
)
{
LOG
(
WARNING
)
<<
"[NPU] AiModelMngerClient init failed)!"
;
ir_build
.
ReleaseModelBuff
(
om_model_buf
);
LOG
(
WARNING
)
<<
"[NPU] Init hiai model client failed!"
;
return
nullptr
;
}
// Check HiAI DDK version
const
char
*
ddk_version
=
model_client
->
GetVersion
();
if
(
ddk_version
)
{
LOG
(
INFO
)
<<
"[NPU] HiAI DDK version: "
<<
ddk_version
;
}
else
{
LOG
(
WARNING
)
<<
"[NPU] Unable to get HiAI DDK version!"
;
}
// Check model compatibility
auto
model_desc
=
std
::
make_shared
<
hiai
::
AiModelDescription
>
(
model_name
,
freq_level
(),
framework_type
(),
model_type
(),
device_type
());
model_desc
->
SetModelBuffer
(
om_model_buf
.
data
,
om_model_buf
.
length
);
std
::
vector
<
std
::
shared_ptr
<
hiai
::
AiModelDescription
>>
model_descs
;
model_descs
.
push_back
(
model_desc
);
model_desc
->
SetModelBuffer
(
reinterpret_cast
<
const
void
*>
(
model_buffer
->
data
()),
model_buffer
->
size
());
if
(
!*
model_comp
&&
model_client
->
CheckModelCompatibility
(
*
model_desc
,
*
model_comp
)
!=
hiai
::
AI_SUCCESS
)
{
*
model_comp
=
false
;
VLOG
(
3
)
<<
"[NPU] model is NOT compatiblitiable, setting model_comp to "
<<
*
model_comp
;
}
else
{
*
model_comp
=
true
;
VLOG
(
3
)
<<
"[NPU] model is compatiblitiable, setting model_comp to "
<<
*
model_comp
;
}
// Rebuild and write the data of the compatible model to the model buffer
if
(
!*
model_comp
)
{
std
::
shared_ptr
<
hiai
::
AiModelBuilder
>
model_builder
=
std
::
make_shared
<
hiai
::
AiModelBuilder
>
(
model_client
);
hiai
::
MemBuffer
*
org_model_buffer
=
model_builder
->
InputMemBufferCreate
(
reinterpret_cast
<
void
*>
(
model_buffer
->
data
()),
model_buffer
->
size
());
if
(
org_model_buffer
)
{
std
::
vector
<
hiai
::
MemBuffer
*>
org_model_buffers
;
org_model_buffers
.
push_back
(
org_model_buffer
);
hiai
::
MemBuffer
*
new_model_buffer
=
model_builder
->
OutputMemBufferCreate
(
framework_type
(),
org_model_buffers
);
// VLOG(3) << "[NPU] new model buffer memeory size is " <<
// new_model_buffer->GetMemBufferSize();
if
(
new_model_buffer
)
{
uint32_t
new_model_size
=
0
;
if
(
model_builder
->
BuildModel
(
org_model_buffers
,
new_model_buffer
,
new_model_size
)
==
hiai
::
AI_SUCCESS
)
{
// need to change to new_model_size as GetMemBufferSize is not
// correct.
model_buffer
->
resize
(
new_model_size
);
memcpy
(
reinterpret_cast
<
void
*>
(
model_buffer
->
data
()),
new_model_buffer
->
GetMemBufferData
(),
new_model_size
);
// Reset the model buffer
model_desc
->
SetModelBuffer
(
reinterpret_cast
<
const
void
*>
(
model_buffer
->
data
()),
model_buffer
->
size
());
VLOG
(
3
)
<<
"[NPU] Rebuild the compatible model done."
;
}
else
{
LOG
(
WARNING
)
<<
"[NPU] Rebuild the compatible model failed!"
;
}
model_builder
->
MemBufferDestroy
(
new_model_buffer
);
}
else
{
LOG
(
WARNING
)
<<
"[NPU] OutputMemBufferCreate failed!"
;
}
model_builder
->
MemBufferDestroy
(
org_model_buffer
);
}
else
{
LOG
(
WARNING
)
<<
"[NPU] InputMemBufferCreate failed!"
;
}
}
// Load the compatible model
std
::
vector
<
std
::
shared_ptr
<
hiai
::
AiModelDescription
>>
model_descs
{
model_desc
};
if
(
model_client
->
Load
(
model_descs
)
!=
hiai
::
AI_SUCCESS
)
{
LOG
(
WARNING
)
<<
"[NPU] AiModelMngerClient load model failed!"
;
ir_build
.
ReleaseModelBuff
(
om_model_buf
);
return
nullptr
;
}
ir_build
.
ReleaseModelBuff
(
om_model_buf
);
VLOG
(
3
)
<<
"[NPU] Build done"
;
VLOG
(
3
)
<<
"[NPU] Load model done."
;
return
model_client
;
}
// Builds a HiAI om model from the given IR input/output nodes and copies the
// serialized model bytes into `model_buffer`.
//
// input_nodes / output_nodes: the IR operators set as the inputs and outputs
//   of the graph that is built.
// model_buffer: output; resized to the om model length and filled with the
//   model data. Must not be null.
//
// Returns true on success. Returns false (after logging a warning) when
// `model_buffer` is null or when CreateModelBuff/BuildIRModel fails.
bool Device::Build(std::vector<ge::Operator>& input_nodes,   // NOLINT
                   std::vector<ge::Operator>& output_nodes,  // NOLINT
                   std::vector<char>* model_buffer) {
  // Reject a null output buffer up front instead of dereferencing it below.
  if (model_buffer == nullptr) {
    LOG(WARNING) << "[NPU] model_buffer is null!";
    return false;
  }
  // Convert the HiAI IR graph to the HiAI om model
  ge::Graph ir_graph("graph");
  ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes);
  ge::Model om_model("model", "model");
  om_model.SetGraph(ir_graph);
  // Build the HiAI om model, serialize and output it to the om buffer
  domi::HiaiIrBuild ir_build;
  domi::ModelBufferData om_buffer;
  if (!ir_build.CreateModelBuff(om_model, om_buffer)) {
    LOG(WARNING) << "[NPU] CreateModelBuff failed!";
    return false;
  }
  if (!ir_build.BuildIRModel(om_model, om_buffer)) {
    LOG(WARNING) << "[NPU] BuildIRModel failed!";
    // The om buffer was created above, so it has to be released on this path.
    ir_build.ReleaseModelBuff(om_buffer);
    return false;
  }
  model_buffer->resize(om_buffer.length);
  // An empty vector may report a null data() pointer, and memcpy must not be
  // given a null pointer even for a zero-byte copy.
  if (om_buffer.length > 0) {
    memcpy(reinterpret_cast<void*>(model_buffer->data()),
           reinterpret_cast<void*>(om_buffer.data),
           om_buffer.length);
  }
  ir_build.ReleaseModelBuff(om_buffer);
  VLOG(3) << "[NPU] Build model done.";
  return true;
}
}
// namespace npu
}
// namespace lite
}
// namespace paddle
lite/backends/npu/device.h
浏览文件 @
ac897177
...
...
@@ -38,13 +38,18 @@ class Device {
int
model_type
()
{
return
model_type_
;
}
int
device_type
()
{
return
device_type_
;
}
// Load the HiAI om model from buffer, rebuild the model if it's incompatible
// with the current device, then create a HiAI model manager client(from HiAI
// Server) to run inference
std
::
shared_ptr
<
hiai
::
AiModelMngerClient
>
Load
(
const
std
::
string
&
model_name
,
std
::
vector
<
char
>*
model_buffer
,
bool
*
model_comp
);
// Build the HiAI IR graph to om model, return HiAI model manager client to
// load om model and run inference.
std
::
shared_ptr
<
hiai
::
AiModelMngerClient
>
Build
(
const
std
::
string
model_name
,
// NOLINT
std
::
vector
<
ge
::
Operator
>&
input_nodes
,
// NOLINT
std
::
vector
<
ge
::
Operator
>&
output_nodes
// NOLINT
);
// NOLINT
bool
Build
(
std
::
vector
<
ge
::
Operator
>&
input_nodes
,
// NOLINT
std
::
vector
<
ge
::
Operator
>&
output_nodes
,
// NOLINT
std
::
vector
<
char
>*
model_buffer
);
private:
int
freq_level_
{
3
};
...
...
lite/core/context.cc
浏览文件 @
ac897177
...
...
@@ -17,6 +17,10 @@
namespace
paddle
{
namespace
lite
{
#ifdef LITE_WITH_NPU
std
::
string
Context
<
TargetType
::
kNPU
>::
subgraph_model_cache_dir_
{
""
};
// NOLINT
#endif
#ifdef LITE_WITH_XPU
thread_local
xdnn
::
Context
*
Context
<
TargetType
::
kXPU
>::
_tls_raw_ctx
{
nullptr
};
int
Context
<
TargetType
::
kXPU
>::
_workspace_l3_size_per_thread
{
0
};
...
...
lite/core/context.h
浏览文件 @
ac897177
...
...
@@ -85,6 +85,16 @@ class Context<TargetType::kNPU> {
NPUContext
&
operator
=
(
const
NPUContext
&
ctx
)
{}
std
::
string
name
()
const
{
return
"NPUContext"
;
}
// Stores the given directory in the static member subgraph_model_cache_dir_.
static void SetSubgraphModelCacheDir(std::string subgraph_model_cache_dir) {
  subgraph_model_cache_dir_.assign(subgraph_model_cache_dir);
}
// Returns a copy of the static subgraph model cache directory
// (subgraph_model_cache_dir_).
static
std
::
string
SubgraphModelCacheDir
()
{
return
subgraph_model_cache_dir_
;
}
private:
static
std
::
string
subgraph_model_cache_dir_
;
};
#endif
...
...
lite/core/mir/subgraph/subgraph_detector.cc
浏览文件 @
ac897177
...
...
@@ -426,73 +426,51 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph,
subgraph_op_desc
.
SetAttr
<
int32_t
>
(
"sub_block"
,
sub_block_idx
);
// Extract input and output nodes from the target subgraph
std
::
unordered_set
<
Node
*>
i
nput
_var_nodes
;
std
::
unordered_set
<
Node
*>
i
data
_var_nodes
;
std
::
unordered_set
<
Node
*>
weight_var_nodes
;
std
::
unordered_set
<
Node
*>
o
utput
_var_nodes
;
std
::
unordered_set
<
Node
*>
o
data
_var_nodes
;
std
::
unordered_set
<
Node
*>
local_var_nodes
;
std
::
unordered_set
<
Node
*>
unused_var_nodes
;
ExtractInputsOutputs
(
subgraph_nodes
,
&
i
nput
_var_nodes
,
&
i
data
_var_nodes
,
&
weight_var_nodes
,
&
o
utput
_var_nodes
,
&
o
data
_var_nodes
,
&
local_var_nodes
,
&
unused_var_nodes
);
// A simplified model without the original weight/local/unused nodes on the
// subgraph ops will be saved only if 'SUBGRAPH_DISABLE_ONLINE_MODE' is set to
// true and Predictor->Run(...), Predictor->Save(...) is called.
std
::
unordered_set
<
Node
*>
input_var_nodes
(
idata_var_nodes
.
begin
(),
idata_var_nodes
.
end
());
std
::
unordered_set
<
Node
*>
output_var_nodes
(
odata_var_nodes
.
begin
(),
odata_var_nodes
.
end
());
if
(
!
GetBoolFromEnv
(
SUBGRAPH_DISABLE_ONLINE_MODE
))
{
input_var_nodes
.
insert
(
weight_var_nodes
.
begin
(),
weight_var_nodes
.
end
());
output_var_nodes
.
insert
(
local_var_nodes
.
begin
(),
local_var_nodes
.
end
());
output_var_nodes
.
insert
(
unused_var_nodes
.
begin
(),
unused_var_nodes
.
end
());
}
// Set input and output name mapping which stores the real inputs and
// outputs
std
::
vector
<
std
::
string
>
i
nput
_var_names
;
std
::
vector
<
std
::
string
>
o
utput
_var_names
;
for
(
auto
&
var_node
:
i
nput
_var_nodes
)
{
i
nput
_var_names
.
push_back
(
var_node
->
AsArg
().
name
);
std
::
vector
<
std
::
string
>
i
data
_var_names
;
std
::
vector
<
std
::
string
>
o
data
_var_names
;
for
(
auto
&
var_node
:
i
data
_var_nodes
)
{
i
data
_var_names
.
push_back
(
var_node
->
AsArg
().
name
);
}
for
(
auto
&
var_node
:
o
utput
_var_nodes
)
{
o
utput
_var_names
.
push_back
(
var_node
->
AsArg
().
name
);
for
(
auto
&
var_node
:
o
data
_var_nodes
)
{
o
data
_var_names
.
push_back
(
var_node
->
AsArg
().
name
);
}
subgraph_op_desc
.
SetAttr
<
std
::
vector
<
std
::
string
>>
(
"input_data_names"
,
i
nput
_var_names
);
i
data
_var_names
);
subgraph_op_desc
.
SetAttr
<
std
::
vector
<
std
::
string
>>
(
"output_data_names"
,
output_var_names
);
// Set input/output scale values of input/output var nodes for
// type_precision_cast_pass.
std
::
vector
<
float
>
input_data_scales
;
std
::
vector
<
float
>
output_data_scales
;
for
(
auto
&
var_node
:
input_var_nodes
)
{
auto
any_op_node
=
var_node
->
outlinks
.
front
();
CHECK
(
any_op_node
->
IsStmt
());
auto
&
any_inst
=
any_op_node
->
AsStmt
();
if
(
any_inst
.
op_info
()
->
HasAttr
(
"input_scale"
))
{
input_data_scales
.
push_back
(
any_inst
.
op_info
()
->
GetAttr
<
float
>
(
"input_scale"
));
}
}
for
(
auto
&
var_node
:
output_var_nodes
)
{
auto
any_op_node
=
var_node
->
inlinks
.
front
();
CHECK
(
any_op_node
->
IsStmt
());
auto
&
any_inst
=
any_op_node
->
AsStmt
();
if
(
any_inst
.
op_info
()
->
HasAttr
(
"output_scale"
))
{
output_data_scales
.
push_back
(
any_inst
.
op_info
()
->
GetAttr
<
float
>
(
"output_scale"
));
}
}
if
(
input_data_scales
.
size
()
>
0
)
{
subgraph_op_desc
.
SetAttr
<
std
::
vector
<
float
>>
(
"input_data_scales"
,
input_data_scales
);
}
if
(
output_data_scales
.
size
()
>
0
)
{
subgraph_op_desc
.
SetAttr
<
std
::
vector
<
float
>>
(
"output_data_scales"
,
output_data_scales
);
}
odata_var_names
);
// Set all of the inputs and outputs to the target subgraph op
// To prevent vars are removed in RuntimeProgram::UpdateVarsOfProgram()
for
(
auto
&
var_node
:
weight_var_nodes
)
{
std
::
vector
<
std
::
string
>
input_var_names
;
std
::
vector
<
std
::
string
>
output_var_names
;
for
(
auto
&
var_node
:
input_var_nodes
)
{
input_var_names
.
push_back
(
var_node
->
AsArg
().
name
);
}
for
(
auto
&
var_node
:
local_var_nodes
)
{
output_var_names
.
push_back
(
var_node
->
AsArg
().
name
);
}
for
(
auto
&
var_node
:
unused_var_nodes
)
{
for
(
auto
&
var_node
:
output_var_nodes
)
{
output_var_names
.
push_back
(
var_node
->
AsArg
().
name
);
}
subgraph_op_desc
.
SetInput
(
"Inputs"
,
input_var_names
);
...
...
@@ -509,26 +487,13 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph,
for
(
auto
&
var_node
:
input_var_nodes
)
{
IR_NODE_LINK_TO
(
var_node
,
subgraph_op_node
);
}
for
(
auto
&
var_node
:
weight_var_nodes
)
{
IR_NODE_LINK_TO
(
var_node
,
subgraph_op_node
);
}
for
(
auto
&
var_node
:
output_var_nodes
)
{
IR_OP_VAR_LINK
(
subgraph_op_node
,
var_node
);
}
for
(
auto
&
var_node
:
local_var_nodes
)
{
IR_OP_VAR_LINK
(
subgraph_op_node
,
var_node
);
}
for
(
auto
&
var_node
:
unused_var_nodes
)
{
IR_OP_VAR_LINK
(
subgraph_op_node
,
var_node
);
}
// Remove subgraph nodes and unused var nodes
auto
nodes2rm
=
GetNodes2RM
(
subgraph_nodes
,
{
input_var_nodes
,
weight_var_nodes
,
output_var_nodes
,
local_var_nodes
,
unused_var_nodes
});
auto
nodes2rm
=
GetNodes2RM
(
subgraph_nodes
,
{
input_var_nodes
,
output_var_nodes
});
GraphSafeRemoveNodes
(
graph
,
nodes2rm
);
}
...
...
@@ -603,7 +568,17 @@ std::unordered_set<const Node *> GetNodes2RM(
std
::
unordered_set
<
const
Node
*>
nodes2rm
(
op_nodes
.
begin
(),
op_nodes
.
end
());
for
(
auto
&
op_node
:
op_nodes
)
{
for
(
auto
&
var_node
:
op_node
->
inlinks
)
{
if
(
!
nodes2rm
.
count
(
var_node
))
{
bool
skip
=
false
;
// skip the var node which is used by any other ops that doesn't belong to
// the subgraph ops.
for
(
auto
&
out_op_node
:
var_node
->
outlinks
)
{
if
(
std
::
find
(
op_nodes
.
begin
(),
op_nodes
.
end
(),
out_op_node
)
!=
op_nodes
.
end
())
{
skip
=
true
;
break
;
}
}
if
(
!
skip
&&
!
nodes2rm
.
count
(
var_node
))
{
nodes2rm
.
insert
(
var_node
);
}
}
...
...
lite/core/mir/subgraph/subgraph_pass_test.cc
浏览文件 @
ac897177
...
...
@@ -25,6 +25,7 @@ DEFINE_string(optimized_model_dir, "", "path of optimized naive buffer model");
DEFINE_string
(
input_tensor_shape
,
"1,3,224,224"
,
"shape of input tensors"
);
DEFINE_string
(
input_tensor_type
,
"float32"
,
"data type of input tensors"
);
DEFINE_string
(
output_tensor_type
,
"float32"
,
"data type of output tensors"
);
DEFINE_string
(
subgraph_model_cache_dir
,
""
,
"dir of subgraph model cache"
);
namespace
paddle
{
namespace
lite
{
...
...
@@ -132,6 +133,7 @@ std::shared_ptr<lite_api::PaddlePredictor> TestModel(
mobile_config
.
set_model_from_file
(
optimized_model_dir
+
".nb"
);
mobile_config
.
set_power_mode
(
lite_api
::
PowerMode
::
LITE_POWER_HIGH
);
mobile_config
.
set_threads
(
1
);
mobile_config
.
set_subgraph_model_cache_dir
(
FLAGS_subgraph_model_cache_dir
);
predictor
=
lite_api
::
CreatePaddlePredictor
(
mobile_config
);
FillInputTensors
(
predictor
,
input_tensor_shape
,
input_tensor_type
,
1
);
// Run optimized model
...
...
lite/kernels/npu/bridges/engine.cc
浏览文件 @
ac897177
...
...
@@ -15,6 +15,7 @@
#include "lite/kernels/npu/bridges/engine.h"
#include <sys/time.h>
#include <time.h>
#include <algorithm>
#include <utility>
#include "lite/kernels/npu/bridges/registry.h"
...
...
@@ -22,11 +23,50 @@ namespace paddle {
namespace
lite
{
namespace
subgraph
{
int
Engine
::
BuildDeviceProgram
()
{
return
FAILED
;
}
// Constructs an engine for one block of the program.
//
// Keeps the context, block index, block desc and scope pointers as given, and
// stores its own copies of the input/output tensor names in sorted order.
Engine::Engine(KernelContext* ctx,
               int block_idx,
               cpp::BlockDesc* block_desc,
               const std::vector<std::string>& input_names,
               const std::vector<std::string>& output_names,
               lite::Scope* scope)
    : ctx_(ctx),
      block_idx_(block_idx),
      block_desc_(block_desc),
      input_names_(input_names),
      output_names_(output_names),
      scope_(scope) {
  // The subgraph pass may produce a different (but still valid) division on
  // each call, so the names are sorted here to obtain the info of the input
  // and output tensors from the device program in one fixed order.
  std::stable_sort(output_names_.begin(), output_names_.end());
  std::stable_sort(input_names_.begin(), input_names_.end());
}
int
Engine
::
LaunchDeviceProgram
()
{
return
0
;
}
// Runs the engine once.
//
// On the first call the device-program workspace is prepared. Whenever
// InputShapeChanged() reports a change, the device program is (re)built.
// The results of the prepare and build steps are not inspected here; the
// value returned is the result of LaunchDeviceProgram().
bool Engine::Run() {
  if (is_first_epoch_) {
    PrepareWorkspaceForDeviceProgram();
    is_first_epoch_ = false;
  }
  const bool shape_changed = InputShapeChanged();
  if (shape_changed) {
    BuildDeviceProgram();
  }
  const bool launched = LaunchDeviceProgram();
  return launched;
}
int
Engine
::
BuildOriginProgram
()
{
// Prepares the workspace used by the origin program.
//
// Sizes origin_idims_ to the number of inputs and looks up every input and
// output tensor by name in scope_, caching the pointers in origin_itensors_
// and origin_otensors_. A missing tensor fails the CHECK, which reports the
// offending name. Always returns true when it returns.
bool Engine::PrepareWorkspaceForOriginProgram() {
  const size_t input_count = input_names_.size();
  origin_idims_.resize(input_count);
  origin_itensors_.resize(input_count);
  for (size_t i = 0; i < input_count; ++i) {
    origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]);
    CHECK(origin_itensors_[i]) << "no tensor found for input "
                               << input_names_[i];
  }
  const size_t output_count = output_names_.size();
  origin_otensors_.resize(output_count);
  for (size_t i = 0; i < output_count; ++i) {
    origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]);
    CHECK(origin_otensors_[i]) << "no tensor found for output "
                               << output_names_[i];
  }
  return true;
}
bool
Engine
::
BuildOriginProgram
()
{
// TODO(hong19860320) The block_desc need to be divided into subgraphs during
// the execution time. But only see them as a subgraph now.
origin_program_
.
clear
();
...
...
@@ -34,11 +74,14 @@ int Engine::BuildOriginProgram() {
auto
op_desc
=
block_desc_
->
GetOp
<
cpp
::
OpDesc
>
(
op_idx
);
CHECK
(
op_desc
);
std
::
string
op_type
=
op_desc
->
Type
();
// Create op and pick up the best kernel
auto
op
=
LiteOpRegistry
::
Global
().
Create
(
op_desc
->
Type
());
CHECK
(
op
)
<<
"no Op found for "
<<
op_type
;
op
->
Attach
(
*
op_desc
,
scope_
);
std
::
unique_ptr
<
KernelBase
>
picked_kernel
;
if
(
op_desc
->
HasAttr
(
kKernelTypeAttr
))
{
// Create op and pick up kernel according to the kKernelTypeAttr attribute
// Create op and pick up the best kernel according to the
// kKernelTypeAttr attribute
auto
kernel_type
=
op_desc
->
GetAttr
<
std
::
string
>
(
kKernelTypeAttr
);
std
::
string
alias
;
Place
place
;
...
...
@@ -48,12 +91,14 @@ int Engine::BuildOriginProgram() {
auto
kernels
=
op
->
CreateKernels
({
place
});
CHECK_GT
(
kernels
.
size
(),
0u
)
<<
"No kernels found for "
<<
op_type
;
auto
it
=
std
::
find_if
(
kernels
.
begin
(),
kernels
.
end
(),
[
&
](
std
::
unique_ptr
<
KernelBase
>
&
it
)
{
kernels
.
begin
(),
kernels
.
end
(),
[
&
](
std
::
unique_ptr
<
KernelBase
>
&
it
)
{
return
it
->
alias
()
==
alias
;
});
CHECK
(
it
!=
kernels
.
end
());
picked_kernel
=
std
::
move
(
*
it
);
}
else
{
// TODO(hong19860320) add kernel picking according to the type of input
// and output tensors
VLOG
(
3
)
<<
"The attr '"
<<
kKernelTypeAttr
<<
"' not found, pick the first kernel for "
<<
op_type
;
std
::
vector
<
std
::
unique_ptr
<
KernelBase
>>
kernels
;
...
...
@@ -74,49 +119,41 @@ int Engine::BuildOriginProgram() {
}
origin_program_
.
emplace_back
(
std
::
move
(
op
),
std
::
move
(
picked_kernel
));
}
return
0
;
CHECK
(
!
origin_program_
.
empty
())
<<
"no instructions"
;
return
true
;
}
int
Engine
::
LaunchOriginProgram
()
{
for
(
auto
&
inst
:
origin_program_
)
{
bool
Engine
::
LaunchOriginProgram
()
{
if
(
origin_program_
.
empty
())
{
BuildOriginProgram
();
}
if
(
!
origin_program_
.
empty
())
{
for
(
auto
&
inst
:
origin_program_
)
{
auto
op_type
=
inst
.
op
()
->
op_info
()
->
Type
();
if
(
op_type
==
"feed"
||
op_type
==
"fetch"
)
continue
;
inst
.
Run
();
}
return
0
;
return
true
;
}
return
false
;
}
int
Engine
::
Build
()
{
// In order to attach all of the ops of the block desc, we need to build the
// original program firstly.
BuildOriginProgram
();
// Run InferShape() of all of ops, and convert Paddle ops to NPU/XPU IR graph
build_device_program_status_
=
BuildDeviceProgram
();
return
build_device_program_status_
;
// Base-class implementation: delegates to PrepareWorkspaceForOriginProgram()
// and returns its result.
bool
Engine
::
PrepareWorkspaceForDeviceProgram
()
{
return
PrepareWorkspaceForOriginProgram
();
}
// Base-class implementation: delegates to BuildOriginProgram() and returns
// its result.
bool
Engine
::
BuildDeviceProgram
()
{
return
BuildOriginProgram
();
}
// Base-class implementation: delegates to LaunchOriginProgram() and returns
// its result.
bool
Engine
::
LaunchDeviceProgram
()
{
return
LaunchOriginProgram
();
}
bool
Engine
::
InputShapeChanged
()
{
bool
changed
=
false
;
for
(
size_t
i
=
0
;
i
<
origin_itensors_
.
size
();
i
++
)
{
if
(
origin_itensors_
[
i
]
->
dims
()
!=
origin_idims_
[
i
])
{
return
true
;
}
}
return
false
;
}
int
Engine
::
Launch
()
{
// Rebuild device program when the shapes of input tensors have been changed.
if
(
CHECK_SUCCESS
(
build_device_program_status_
)
&&
CHECK_REBUILD_WHEN_SHAPE_CHANGED
(
build_device_program_status_
)
&&
InputShapeChanged
())
{
Build
();
}
if
(
CHECK_FAILED
(
build_device_program_status_
))
{
LaunchOriginProgram
();
}
else
{
LaunchDeviceProgram
();
auto
origin_idim
=
origin_itensors_
[
i
]
->
dims
().
Vectorize
();
changed
|=
origin_idim
!=
origin_idims_
[
i
];
origin_idims_
[
i
]
=
origin_idim
;
}
return
0
;
return
changed
;
}
}
// namespace subgraph
...
...
lite/kernels/npu/bridges/engine.h
浏览文件 @
ac897177
...
...
@@ -33,42 +33,33 @@ class Engine {
cpp
::
BlockDesc
*
block_desc
,
const
std
::
vector
<
std
::
string
>
&
input_names
,
const
std
::
vector
<
std
::
string
>
&
output_names
,
lite
::
Scope
*
scope
)
:
ctx_
(
ctx
),
block_idx_
(
block_idx
),
block_desc_
(
block_desc
),
input_names_
(
input_names
),
output_names_
(
output_names
),
scope_
(
scope
)
{}
lite
::
Scope
*
scope
);
virtual
~
Engine
()
=
default
;
virtual
int
Build
();
virtual
int
Launch
();
virtual
bool
Run
();
private:
Engine
(
const
Engine
&
)
=
delete
;
protected:
virtual
int
BuildDeviceProgram
();
virtual
int
LaunchDeviceProgram
();
virtual
bool
PrepareWorkspaceForOriginProgram
();
virtual
bool
BuildOriginProgram
();
virtual
bool
LaunchOriginProgram
();
virtual
int
BuildOriginProgram
();
virtual
int
LaunchOriginProgram
();
virtual
bool
PrepareWorkspaceForDeviceProgram
();
virtual
bool
BuildDeviceProgram
();
virtual
bool
LaunchDeviceProgram
();
virtual
bool
InputShapeChanged
();
KernelContext
*
ctx_
{
nullptr
};
int
block_idx_
;
cpp
::
BlockDesc
*
block_desc_
;
int
block_idx_
{
-
1
}
;
cpp
::
BlockDesc
*
block_desc_
{
nullptr
}
;
std
::
vector
<
std
::
string
>
input_names_
;
std
::
vector
<
std
::
string
>
output_names_
;
Scope
*
scope_
{
nullptr
};
// SUCCESS: device program build successed. FAILED: device program build
// failed. REBUILD_WHEN_SHAPE_CHANGED: device program build successed but need
// to rebuild when input shape changed.
int
build_device_program_status_
{
0
};
std
::
vector
<
DDim
>
origin_idims_
;
std
::
vector
<
DDim
>
origin_odims_
;
bool
is_first_epoch_
{
true
};
std
::
vector
<
std
::
vector
<
int64_t
>>
origin_idims_
;
std
::
vector
<
Tensor
*>
origin_itensors_
;
std
::
vector
<
Tensor
*>
origin_otensors_
;
std
::
vector
<
Instruction
>
origin_program_
;
...
...
lite/kernels/npu/bridges/graph.h
浏览文件 @
ac897177
...
...
@@ -19,7 +19,7 @@
#include <unordered_map>
#include <utility>
#include <vector>
#include "graph/
op
/all_ops.h"
#include "graph/
compatible
/all_ops.h"
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"
...
...
lite/kernels/npu/bridges/matmul_op.cc
浏览文件 @
ac897177
...
...
@@ -94,10 +94,10 @@ int MatMulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
}
else
{
matmul_node
=
graph
->
Add
<
ge
::
op
::
BatchMatMul
>
(
out_name
);
auto
matmul_op
=
matmul_node
->
data
<
ge
::
op
::
BatchMatMul
>
();
matmul_op
->
set_input_x
(
*
x_node
->
data
());
matmul_op
->
set_input_
y
(
*
y_node
->
data
());
matmul_op
->
set_attr_adj_x
(
transpose_x
);
matmul_op
->
set_attr_adj_
y
(
transpose_y
);
matmul_op
->
set_input_x
1
(
*
x_node
->
data
());
matmul_op
->
set_input_
x2
(
*
y_node
->
data
());
matmul_op
->
set_attr_adj_x
1
(
transpose_x
);
matmul_op
->
set_attr_adj_
x2
(
transpose_y
);
}
if
(
fabs
(
alpha
-
1.
f
)
>
1e-6
f
)
{
...
...
lite/kernels/npu/bridges/utility.h
浏览文件 @
ac897177
...
...
@@ -20,11 +20,11 @@
#include <unordered_map>
#include <vector>
#include "graph/buffer.h"
#include "graph/compatible/operator_reg.h"
#include "graph/graph.h"
#include "graph/model.h"
#include "graph/op/all_ops.h"
#include "graph/operator.h"
#include "graph/operator_reg.h"
#include "lite/core/op_lite.h"
#include "lite/utils/macros.h"
...
...
@@ -97,25 +97,26 @@ REG_OP(Pad)
/*
* Multiplies slices of two tensors in batches.
* <Input>
*
x
: The input tensor
*
y
: The input tensor
*
x1
: The input tensor
*
x2
: The input tensor
* <Output>
*
z
: The output tensor
*
y
: The output tensor
* <Attr>
* adj_x : adj_x is true, the input tensor x is transposed, otherwise
* it will not be transposed. Default is false (The current version only
* supports false).
* adj_y : adj_y is true, the input tensor y is transposed, otherwise
* it will not be transposed. Default is false.
* adj_x1 : adj_x1 is true, the input tensor x1 is transposed,
* otherwise it will not be transposed.
* Default is false (The current version only supports false).
* adj_x2 : adj_x2 is true, the input tensor x2 is transposed,
* otherwise it will not be transposed.
* Default is false.
* <Added in HiAI version>
* 100.320.010.010
*/
REG_OP
(
BatchMatMul
)
.
INPUT
(
x
,
TensorType
({
DT_FLOAT
}))
.
INPUT
(
y
,
TensorType
({
DT_FLOAT
}))
.
OUTPUT
(
z
,
TensorType
({
DT_FLOAT
}))
.
ATTR
(
adj_x
,
AttrValue
::
BOOL
{
false
})
.
ATTR
(
adj_
y
,
AttrValue
::
BOOL
{
false
})
.
INPUT
(
x
1
,
TensorType
({
DT_FLOAT
}))
.
INPUT
(
x2
,
TensorType
({
DT_FLOAT
}))
.
OUTPUT
(
y
,
TensorType
({
DT_FLOAT
}))
.
ATTR
(
adj_x
1
,
AttrValue
::
BOOL
{
false
})
.
ATTR
(
adj_
x2
,
AttrValue
::
BOOL
{
false
})
.
OP_END
()
}
// namespace ge
...
...
lite/kernels/npu/subgraph_compute.cc
浏览文件 @
ac897177
...
...
@@ -15,6 +15,8 @@
#include "lite/kernels/npu/subgraph_compute.h"
#include <sys/time.h>
#include <time.h>
#include <algorithm>
#include <functional>
#include <utility>
#include "hiai_ir_build.h" // NOLINT
#include "lite/backends/npu/device.h"
...
...
@@ -22,192 +24,276 @@
#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/paddle_use_bridges.h"
#include "lite/kernels/npu/bridges/utility.h"
#include "lite/utils/io.h"
#include "lite/utils/md5.h"
namespace
paddle
{
namespace
lite
{
namespace
kernels
{
namespace
npu
{
int
SubgraphEngine
::
BuildDeviceProgram
()
{
// Generate the model name by using md5 hashes based on:
// 1. the sorted variable input names
// 2. the shapes of the origin input tensors
// 3. the sorted variable output names
//
// The names and dims are streamed in the order given (this function does not
// sort them itself) and the MD5 of the resulting string is returned.
// `input_names` and `origin_idims` must have the same size (CHECK_EQ).
//
// NOTE(review): the pieces are concatenated without separators, so different
// name/shape combinations can produce the same key string. The format is
// left unchanged here because the result presumably names the cached model
// files - confirm before altering it.
std::string DeviceProgram::GenerateModelName(
    const std::vector<std::string>& input_names,
    const std::vector<std::string>& output_names,
    const std::vector<std::vector<int64_t>>& origin_idims) {
  std::ostringstream os;
  CHECK_EQ(input_names.size(), origin_idims.size());
  for (size_t i = 0; i < input_names.size(); i++) {
    os << input_names[i];
    for (const auto dim : origin_idims[i]) {
      os << dim;
    }
  }
  // Iterate by const reference to avoid copying every output name.
  for (const auto& output_name : output_names) {
    os << output_name;
  }
  return MD5(os.str());
}
// Restore a device program from the cache directory: load the generated model
// and deserialize the precisions and dimensions of the origin output tensors
// of the subgraph op from the cached files.
//
// Files read, both under `model_cache_dir`:
//   <model_name_>.om  - the model data handed to Device::Global().Load()
//   <model_name_>.cfg - one "precision:dim,dim,..." entry per output,
//                       entries separated by ';'
//
// Returns false when either file cannot be read or the model fails to load;
// returns true once origin_otypes_ and origin_odims_ have been filled.
bool DeviceProgram::LoadFromCacheFile(
    const std::vector<std::string>& input_names,
    const std::vector<std::string>& output_names,
    const std::vector<std::vector<int64_t>>& origin_idims,
    const std::string& model_cache_dir) {
  // The model name is generated lazily, only when it is still empty.
  if (model_name_.empty()) {
    model_name_ = GenerateModelName(input_names, output_names, origin_idims);
  }
  // Both cache files share the same "<dir>/<model name>" prefix.
  std::string cache_prefix = model_cache_dir + "/" + model_name_;

  // Read the cached model file and obtain a HiAI model manager client for
  // inference from it.
  std::string om_path = cache_prefix + ".om";
  VLOG(3) << "[NPU] Load model from " << om_path;
  std::vector<char> om_data;
  if (!ReadFile(om_path, &om_data)) {
    LOG(WARNING) << "[NPU] read from " << om_path << " failed!";
    return false;
  }
  bool is_compatible = false;
  model_client_ = lite::npu::Device::Global().Load(
      model_name_, &om_data, &is_compatible);
  if (!model_client_) {
    LOG(WARNING) << "[NPU] Load model failed!";
    return false;
  }
  // When the cached model was reported as incompatible with the current
  // device, write the (rebuilt) model data back over the cached file. A
  // failed write is only logged.
  if (!is_compatible) {
    VLOG(3) << "[NPU] Export the compatible model to " << om_path;
    if (!WriteFile(om_path, om_data)) {
      LOG(WARNING) << "[NPU] Open " << om_path << " for writting failed!";
    }
  }

  // Read the cached configuration file that holds the precisions and shapes
  // of the origin output tensors.
  std::string cfg_path = cache_prefix + ".cfg";
  VLOG(3) << "[NPU] Load configuration from " << cfg_path;
  std::vector<char> cfg_data;
  if (!ReadFile(cfg_path, &cfg_data)) {
    LOG(WARNING) << "[NPU] read from " << cfg_path << " failed!";
    return false;
  }
  std::string cfg_text(cfg_data.begin(), cfg_data.end());

  // One entry per output tensor is expected.
  auto per_output = Split<std::string>(cfg_text, ";");
  CHECK_EQ(per_output.size(), output_names.size());
  const size_t output_count = output_names.size();
  origin_otypes_.resize(output_count);
  origin_odims_.resize(output_count);
  for (size_t i = 0; i < output_count; ++i) {
    // Each entry is "<precision>:<comma separated dims>".
    auto fields = Split<std::string>(per_output[i], ":");
    CHECK_EQ(fields.size(), 2);
    origin_otypes_[i] = static_cast<PrecisionType>(std::stoi(fields[0]));
    origin_odims_[i] = Split<int64_t>(fields[1], ",");
  }
  return true;
}
bool
DeviceProgram
::
BuildGraphAndCacheToFile
(
const
std
::
vector
<
Instruction
>&
origin_program
,
const
std
::
vector
<
std
::
string
>&
input_names
,
const
std
::
vector
<
std
::
string
>&
output_names
,
const
std
::
vector
<
std
::
vector
<
int64_t
>>&
origin_idims
,
const
std
::
vector
<
Tensor
*>&
origin_otensors
,
const
std
::
string
&
model_cache_dir
)
{
// Generate the model name if not initialized
if
(
model_name_
.
empty
())
{
model_name_
=
GenerateModelName
(
input_names
,
output_names
,
origin_idims
);
}
// Convert all of the ops and their input vars and weights to HiAI IR nodes,
// then add them into the HiAI IR graph
int
status
=
0
;
// Convert all of ops and their input vars and weights and added into the NPU
// HiAI IR graph
CHECK
(
!
origin_program
.
empty
())
<<
"no instructions"
;
subgraph
::
npu
::
Graph
graph
;
const
auto
&
bridges
=
subgraph
::
Registry
::
Instance
();
for
(
auto
&
inst
:
origin_program
_
)
{
for
(
auto
&
inst
:
origin_program
)
{
auto
op
=
const_cast
<
OpLite
*>
(
inst
.
op
());
CHECK
(
op
);
op
->
CheckShape
();
op
->
InferShape
();
std
::
string
op_type
=
op
->
op_info
()
->
Type
();
if
(
!
bridges
.
Exists
(
op_type
,
TARGET
(
kNPU
)))
{
return
subgraph
::
FAILED
;
return
false
;
}
auto
kernel
=
inst
.
kernel
();
status
|=
bridges
.
Select
(
op_type
,
TARGET
(
kNPU
))(
reinterpret_cast
<
void
*>
(
&
graph
),
op
,
const_cast
<
KernelBase
*>
(
kernel
));
if
(
subgraph
::
CHECK_FAILED
(
status
))
{
return
subgraph
::
FAILED
;
return
false
;
}
}
// Collect the valid input and output nodes in the HiAI IR graph and update
// the input and output names
device_inames_
.
clear
();
device_onames_
.
clear
();
// Collect the input and output nodes of the HiAI IR graph
std
::
vector
<
ge
::
Operator
>
device_inodes
;
for
(
size_t
i
=
0
;
i
<
input_names
.
size
();
i
++
)
{
CHECK
(
graph
.
Has
(
input_names
[
i
])
&&
graph
.
Get
(
input_names
[
i
])
->
is_data
());
device_inodes
.
push_back
(
*
graph
.
Get
(
input_names
[
i
])
->
data
());
}
std
::
vector
<
ge
::
Operator
>
device_onodes
;
for
(
auto
&
input_name
:
input_names_
)
{
if
(
graph
.
Has
(
input_name
))
{
if
(
graph
.
Get
(
input_name
)
->
is_data
())
{
device_inodes
.
push_back
(
*
graph
.
Get
(
input_name
)
->
data
());
device_inames_
.
push_back
(
input_name
);
}
else
{
LOG
(
WARNING
)
<<
"[NPU] Input node "
<<
input_name
<<
" is ignored because it is not a data node."
;
}
}
else
{
LOG
(
WARNING
)
<<
"[NPU] Input node "
<<
input_name
<<
" is ignored because it does not exist."
;
}
}
for
(
auto
&
output_name
:
output_names_
)
{
if
(
graph
.
Has
(
output_name
))
{
device_onodes
.
push_back
(
*
graph
.
Get
(
output_name
)
->
data
());
device_onames_
.
push_back
(
output_name
);
}
else
{
LOG
(
WARNING
)
<<
"[NPU] Output node "
<<
output_name
<<
" is ignored because it does not exist."
;
}
}
CHECK
(
!
device_inames_
.
empty
())
<<
"[NPU] No input nodes found for building NPU model"
;
CHECK
(
!
device_onames_
.
empty
())
<<
"[NPU] No output nodes found for building NPU model"
;
// Build the HiAI IR graph to HiAI om model as the device program
if
(
device_program_map_
.
count
(
inputs_shape_
)
>
0
)
{
return
status
;
for
(
size_t
i
=
0
;
i
<
output_names
.
size
();
i
++
)
{
CHECK
(
graph
.
Has
(
output_names
[
i
]));
device_onodes
.
push_back
(
*
graph
.
Get
(
output_names
[
i
])
->
data
());
}
auto
device_client
=
lite
::
npu
::
Device
::
Global
().
Build
(
model_name_
,
device_inodes
,
device_onodes
);
if
(
device_client
==
nullptr
)
{
// Build the HiAI IR graph to the HiAI om model
std
::
vector
<
char
>
model_buffer
;
if
(
!
lite
::
npu
::
Device
::
Global
().
Build
(
device_inodes
,
device_onodes
,
&
model_buffer
))
{
LOG
(
WARNING
)
<<
"[NPU] Build model failed!"
;
return
subgraph
::
FAILED
;
return
false
;
}
// Load the HiAI om model and create a HiAI model manager client(from HiAI
// Service) to run inference.
bool
model_comp
=
true
;
model_client_
=
lite
::
npu
::
Device
::
Global
().
Load
(
model_name_
,
&
model_buffer
,
&
model_comp
);
if
(
!
model_client_
)
{
LOG
(
WARNING
)
<<
"[NPU] Load model failed!"
;
return
false
;
}
// Update the precision and dimensions of the origin output tensors
CHECK_EQ
(
origin_otensors
.
size
(),
output_names
.
size
());
origin_otypes_
.
resize
(
output_names
.
size
());
origin_odims_
.
resize
(
output_names
.
size
());
for
(
size_t
i
=
0
;
i
<
output_names
.
size
();
i
++
)
{
origin_otypes_
[
i
]
=
graph
.
Get
(
output_names
[
i
])
->
precision
();
origin_odims_
[
i
]
=
origin_otensors
[
i
]
->
dims
().
Vectorize
();
}
if
(
!
model_cache_dir
.
empty
())
{
// Save the generated model to file, used for the model caching or the
// offline model generation
auto
model_path
=
model_cache_dir
+
"/"
+
model_name_
+
".om"
;
VLOG
(
3
)
<<
"[NPU] Save model to "
<<
model_path
;
if
(
!
WriteFile
(
model_path
,
model_buffer
))
{
LOG
(
WARNING
)
<<
"[NPU] Open "
<<
model_path
<<
" for writting failed!"
;
}
// Serialize the precisions and shapes of the origin output tensors into the
// configuration file
std
::
ostringstream
os
;
for
(
int
i
=
0
;
i
<
output_names
.
size
();
i
++
)
{
os
<<
static_cast
<
int32_t
>
(
origin_otypes_
[
i
])
<<
":"
;
for
(
auto
dim
:
origin_odims_
[
i
])
{
os
<<
dim
<<
","
;
}
auto
device_program
=
std
::
make_shared
<
device_program_t
>
(
device_client
);
device_program_map_
[
inputs_shape_
]
=
device_program
;
os
<<
";"
;
}
auto
str
=
os
.
str
();
std
::
vector
<
char
>
config_buffer
(
str
.
begin
(),
str
.
end
());
auto
config_path
=
model_cache_dir
+
"/"
+
model_name_
+
".cfg"
;
VLOG
(
3
)
<<
"[NPU] Save configuration to "
<<
config_path
;
if
(
!
WriteFile
(
config_path
,
config_buffer
))
{
LOG
(
WARNING
)
<<
"[NPU] Open "
<<
config_path
<<
" for writting failed!"
;
}
}
return
true
;
}
// Query and check the dimensions of valid input and output tensors
std
::
vector
<
hiai
::
TensorDimension
>
device_idims
,
device_odims
;
if
(
device_program
->
client
->
GetModelIOTensorDim
(
model_name_
,
device_idims
,
device_odims
)
!=
hiai
::
AI_SUCCESS
)
{
bool
DeviceProgram
::
ShareBufferWithOriginTensors
(
const
std
::
vector
<
std
::
string
>&
input_names
,
const
std
::
vector
<
std
::
string
>&
output_names
,
std
::
vector
<
Tensor
*>*
origin_itensors
,
std
::
vector
<
Tensor
*>*
origin_otensors
,
std
::
vector
<
std
::
shared_ptr
<
hiai
::
AiTensor
>>*
device_itensors
,
std
::
vector
<
std
::
shared_ptr
<
hiai
::
AiTensor
>>*
device_otensors
)
{
CHECK
(
!
model_name_
.
empty
()
&&
model_client_
);
// Query the dimensions of the device input and output tensors if not
// initialized
if
(
device_idims_
.
empty
()
||
device_odims_
.
empty
())
{
if
(
model_client_
->
GetModelIOTensorDim
(
model_name_
,
device_idims_
,
device_odims_
)
!=
hiai
::
AI_SUCCESS
)
{
LOG
(
WARNING
)
<<
"[NPU] Get the dimensions of input and output tensors failed!"
;
return
subgraph
::
FAILED
;
return
false
;
}
device_program
->
device_idims
=
device_idims
;
device_program
->
device_odims
=
device_odims
;
CHECK_EQ
(
device_idims
.
size
(),
device_inames_
.
size
());
CHECK_EQ
(
device_odims
.
size
(),
device_onames_
.
size
());
origin_idims_
.
resize
(
device_inames_
.
size
());
origin_itensors_
.
resize
(
device_inames_
.
size
());
device_itensors_
.
resize
(
device_inames_
.
size
());
origin_odims_
.
resize
(
device_onames_
.
size
());
origin_otensors_
.
resize
(
device_onames_
.
size
());
device_otensors_
.
resize
(
device_onames_
.
size
());
for
(
int
i
=
0
;
i
<
device_inames_
.
size
();
i
++
)
{
auto
node
=
graph
.
Get
(
device_inames_
[
i
]);
auto
precision
=
node
->
precision
();
auto
layout
=
node
->
layout
();
origin_itensors_
[
i
]
=
scope_
->
FindMutableTensor
(
device_inames_
[
i
]);
CHECK
(
origin_itensors_
[
i
]);
origin_idims_
[
i
]
=
origin_itensors_
[
i
]
->
dims
();
VLOG
(
3
)
<<
"[NPU] Inputs["
<<
i
<<
"] name: "
<<
device_inames_
[
i
]
<<
" precision: "
<<
PrecisionToStr
(
precision
)
<<
" layout: "
<<
DataLayoutToStr
(
layout
)
<<
" dims: {"
<<
device_idims
[
i
].
GetNumber
()
<<
","
<<
device_idims
[
i
].
GetChannel
()
<<
","
<<
device_idims
[
i
].
GetHeight
()
<<
","
<<
device_idims
[
i
].
GetWidth
()
<<
"}"
;
// Prepare the device input tensors
CHECK_EQ
(
origin_idims_
[
i
].
production
(),
device_idims
[
i
].
GetNumber
()
*
device_idims
[
i
].
GetChannel
()
*
device_idims
[
i
].
GetHeight
()
*
device_idims
[
i
].
GetWidth
());
device_itensors_
[
i
].
reset
(
new
hiai
::
AiTensor
);
device_itensors_
[
i
]
->
Init
(
&
(
device_idims
[
i
]));
}
device_program
->
origin_idims
=
origin_idims_
;
// Check the dimensions of the device tensors and the origin tensors
CHECK_EQ
(
device_itensors
->
size
(),
input_names
.
size
());
CHECK_EQ
(
device_otensors
->
size
(),
output_names
.
size
());
CHECK_EQ
(
origin_otypes_
.
size
(),
output_names
.
size
());
CHECK_EQ
(
origin_odims_
.
size
(),
output_names
.
size
());
CHECK_EQ
(
device_idims_
.
size
(),
input_names
.
size
());
CHECK_EQ
(
device_odims_
.
size
(),
output_names
.
size
());
for
(
int
i
=
0
;
i
<
input_names
.
size
();
i
++
)
{
VLOG
(
3
)
<<
"[NPU] Inputs["
<<
i
<<
"] name: "
<<
input_names
[
i
]
<<
" origin dims:"
<<
(
*
origin_itensors
)[
i
]
->
dims
().
repr
()
<<
" device dims: {"
<<
device_idims_
[
i
].
GetNumber
()
<<
","
<<
device_idims_
[
i
].
GetChannel
()
<<
","
<<
device_idims_
[
i
].
GetHeight
()
<<
","
<<
device_idims_
[
i
].
GetWidth
()
<<
"}"
;
CHECK_EQ
((
*
origin_itensors
)[
i
]
->
dims
().
production
(),
device_idims_
[
i
].
GetNumber
()
*
device_idims_
[
i
].
GetChannel
()
*
device_idims_
[
i
].
GetHeight
()
*
device_idims_
[
i
].
GetWidth
());
VLOG
(
3
)
<<
"[NPU] Init the input tensors for the device program and share "
"their buffers with the origin input tensors"
;
// reinit device tensor will free shared buffer, so copy data to a tmp
// tensor
Tensor
tmp
;
tmp
.
CopyDataFrom
(
*
(
*
origin_itensors
)[
i
]);
(
*
device_itensors
)[
i
]
->
Init
(
&
(
device_idims_
[
i
]));
for
(
int
i
=
0
;
i
<
device_onames_
.
size
();
i
++
)
{
auto
node
=
graph
.
Get
(
device_onames_
[
i
]);
auto
precision
=
node
->
precision
();
auto
layout
=
node
->
layout
();
origin_otensors_
[
i
]
=
scope_
->
FindMutableTensor
(
device_onames_
[
i
]);
CHECK
(
origin_otensors_
[
i
]);
origin_odims_
[
i
]
=
origin_otensors_
[
i
]
->
dims
();
VLOG
(
3
)
<<
"[NPU] Outputs["
<<
i
<<
"] name: "
<<
device_onames_
[
i
]
<<
" precision: "
<<
PrecisionToStr
(
precision
)
<<
" layout: "
<<
DataLayoutToStr
(
layout
)
<<
" dims: {"
<<
device_odims
[
i
].
GetNumber
()
<<
","
<<
device_odims
[
i
].
GetChannel
()
<<
","
<<
device_odims
[
i
].
GetHeight
()
<<
","
<<
device_odims
[
i
].
GetWidth
()
<<
"}"
;
// Prepare the device output tensors
switch
(
precision
)
{
case
PRECISION
(
kFloat
):
origin_otensors_
[
i
]
->
mutable_data
<
float
>
();
break
;
case
PRECISION
(
kBool
):
origin_otensors_
[
i
]
->
mutable_data
<
bool
>
();
break
;
case
PRECISION
(
kInt8
):
origin_otensors_
[
i
]
->
mutable_data
<
int8_t
>
();
break
;
case
PRECISION
(
kInt16
):
origin_otensors_
[
i
]
->
mutable_data
<
int16_t
>
();
break
;
case
PRECISION
(
kInt32
):
origin_otensors_
[
i
]
->
mutable_data
<
int32_t
>
();
break
;
case
PRECISION
(
kInt64
):
origin_otensors_
[
i
]
->
mutable_data
<
int64_t
>
();
break
;
default:
LOG
(
FATAL
)
<<
"[NPU] "
<<
device_onames_
[
i
]
<<
" can't mutable data with precision type "
<<
PrecisionToStr
(
precision
);
break
;
}
device_program
->
origin_odims
=
origin_odims_
;
std
::
memcpy
(
(
*
device_itensors
)[
i
]
->
GetBuffer
(),
tmp
.
raw_data
(),
tmp
.
memory_size
());
CHECK_EQ
(
origin_odims_
[
i
].
production
(),
device_odims
[
i
].
GetNumber
()
*
device_odims
[
i
].
GetChannel
()
*
device_odims
[
i
].
GetHeight
()
*
device_odims
[
i
].
GetWidth
());
device_otensors_
[
i
].
reset
(
new
hiai
::
AiTensor
);
device_otensors_
[
i
]
->
Init
(
&
(
device_odims
[
i
]));
// Share data buf between device_itensor and origin_itensor
std
::
shared_ptr
<
Buffer
>
buffer
=
std
::
make_shared
<
Buffer
>
((
*
device_itensors
)[
i
]
->
GetBuffer
(),
lite_api
::
TargetType
::
kHost
,
(
*
device_itensors
)[
i
]
->
GetSize
());
(
*
origin_itensors
)[
i
]
->
ResetBuffer
(
buffer
,
(
*
device_itensors
)[
i
]
->
GetSize
());
}
return
status
;
}
int
SubgraphEngine
::
LaunchDeviceProgram
()
{
// Copy the data of origin input tensors to the buffer of input HiAI tensors
// init device_itensors_, device_otensors_, origin_otensors_
auto
device_program
=
device_program_map_
[
inputs_shape_
];
for
(
size_t
i
=
0
;
i
<
device_itensors_
.
size
();
i
++
)
{
device_itensors_
[
i
]
->
Init
(
&
(
device_program
->
device_idims
[
i
]));
std
::
memcpy
(
device_itensors_
[
i
]
->
GetBuffer
(),
origin_itensors_
[
i
]
->
raw_data
(),
origin_itensors_
[
i
]
->
memory_size
());
}
for
(
size_t
i
=
0
;
i
<
device_otensors_
.
size
();
i
++
)
{
device_otensors_
[
i
]
->
Init
(
&
(
device_program
->
device_odims
[
i
]));
}
for
(
size_t
i
=
0
;
i
<
origin_otensors_
.
size
();
i
++
)
{
origin_otensors_
[
i
]
->
Resize
(
device_program
->
origin_odims
[
i
]);
for
(
int
i
=
0
;
i
<
output_names
.
size
();
i
++
)
{
(
*
origin_otensors
)[
i
]
->
set_precision
(
origin_otypes_
[
i
]);
(
*
origin_otensors
)[
i
]
->
Resize
(
origin_odims_
[
i
]);
VLOG
(
3
)
<<
"[NPU] Outputs["
<<
i
<<
"] name: "
<<
output_names
[
i
]
<<
" origin dims:"
<<
(
*
origin_otensors
)[
i
]
->
dims
().
repr
()
<<
" device dims: {"
<<
device_odims_
[
i
].
GetNumber
()
<<
","
<<
device_odims_
[
i
].
GetChannel
()
<<
","
<<
device_odims_
[
i
].
GetHeight
()
<<
","
<<
device_odims_
[
i
].
GetWidth
()
<<
"}"
;
CHECK_EQ
((
*
origin_otensors
)[
i
]
->
dims
().
production
(),
device_odims_
[
i
].
GetNumber
()
*
device_odims_
[
i
].
GetChannel
()
*
device_odims_
[
i
].
GetHeight
()
*
device_odims_
[
i
].
GetWidth
());
(
*
device_otensors
)[
i
]
->
Init
(
&
(
device_odims_
[
i
]));
VLOG
(
3
)
<<
"[NPU] Init the output tensors for the device program and share "
"their buffers with the origin output tensors"
;
// Share data buf between device_otensor and origin_otensor
std
::
shared_ptr
<
Buffer
>
buffer
=
std
::
make_shared
<
Buffer
>
((
*
device_otensors
)[
i
]
->
GetBuffer
(),
lite_api
::
TargetType
::
kHost
,
(
*
device_otensors
)[
i
]
->
GetSize
());
(
*
origin_otensors
)[
i
]
->
ResetBuffer
(
buffer
,
(
*
device_otensors
)[
i
]
->
GetSize
());
}
return
true
;
}
bool
DeviceProgram
::
ZeroCopyRun
(
std
::
vector
<
std
::
shared_ptr
<
hiai
::
AiTensor
>>*
device_itensors
,
std
::
vector
<
std
::
shared_ptr
<
hiai
::
AiTensor
>>*
device_otensors
)
{
CHECK
(
!
model_name_
.
empty
()
&&
model_client_
);
// Run the HiAI model by name
std
::
string
key
=
"model_name"
;
// Note: it seems the key must be "model_name"
hiai
::
AiContext
model_context
;
...
...
@@ -219,30 +305,87 @@ int SubgraphEngine::LaunchDeviceProgram() {
};
int
istamp
;
auto
start_time
=
GetCurrentUS
();
CHECK_EQ
(
device_program
->
client
->
Process
(
model_context
,
device_itensors_
,
device_otensors_
,
1000
,
istamp
),
CHECK_EQ
(
model_client_
->
Process
(
model_context
,
*
device_itensors
,
*
device_otensors
,
1000
,
istamp
),
hiai
::
AI_SUCCESS
);
VLOG
(
3
)
<<
"[NPU] Process cost "
<<
GetCurrentUS
()
-
start_time
<<
" us"
;
return
true
;
}
// Copy the data of output HiAI tensor to the buffer of origin output tensors
for
(
size_t
i
=
0
;
i
<
device_otensors_
.
size
();
i
++
)
{
std
::
memcpy
(
const_cast
<
void
*>
(
origin_otensors_
[
i
]
->
raw_data
()),
device_otensors_
[
i
]
->
GetBuffer
(),
device_otensors_
[
i
]
->
GetSize
());
// Prepares the origin tensors via PrepareWorkspaceForOriginProgram() and then
// allocates one empty HiAI tensor per input name and per output name.
// Returns true once the workspace has been set up.
bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() {
  // Obtain the origin input tensors, and create the origin output
  // tensors(Don't try to access them before launch the device program or the
  // origin program)
  PrepareWorkspaceForOriginProgram();
  // Create the device input and output tensors, but don't initialize them
  // with the dimensions
  device_itensors_.resize(input_names_.size());
  for (auto& device_itensor : device_itensors_) {
    device_itensor.reset(new hiai::AiTensor);
    CHECK(device_itensor);
  }
  device_otensors_.resize(output_names_.size());
  for (auto& device_otensor : device_otensors_) {
    device_otensor.reset(new hiai::AiTensor);
    CHECK(device_otensor);
  }
  // A leftover `return 0;` used to precede this statement, which made the
  // function report failure and left the intended result unreachable.
  return true;
}
bool
SubgraphEngine
::
InputShapeChanged
()
{
std
::
vector
<
std
::
vector
<
int64_t
>>
new_shape
;
for
(
auto
origin_itensor
:
origin_itensors_
)
{
new_shape
.
push_back
(
origin_itensor
->
dims
().
Vectorize
());
bool
SubgraphEngine
::
BuildDeviceProgram
()
{
// Check if the cache device program exists
if
(
!
device_programs_
.
count
(
origin_idims_
))
{
auto
device_program
=
std
::
make_shared
<
DeviceProgram
>
();
// Obtain the model cache dir from the NPU Context of the subgraph op
auto
model_cache_dir
=
ctx_
->
As
<
NPUContext
>
().
SubgraphModelCacheDir
();
VLOG
(
3
)
<<
"[NPU] Getting subgraph model_cache_dir is: "
<<
model_cache_dir
;
// Check and load if the cached model and configuration file exists
if
(
model_cache_dir
.
empty
()
||
!
device_program
->
LoadFromCacheFile
(
input_names_
,
output_names_
,
origin_idims_
,
model_cache_dir
))
{
// Build the model online, including converting the paddle ops to the HiAI
// IR nodes, building the HiAI IR graph to the om model, then load it as a
// new HiAI model manager client for inference.
if
(
origin_program_
.
empty
())
{
BuildOriginProgram
();
}
inputs_shape_
=
new_shape
;
if
(
device_program_map_
.
count
(
inputs_shape_
)
>
0
)
{
CHECK
(
!
origin_program_
.
empty
())
<<
"no instructions"
;
if
(
!
device_program
->
BuildGraphAndCacheToFile
(
origin_program_
,
input_names_
,
output_names_
,
origin_idims_
,
origin_otensors_
,
model_cache_dir
))
{
return
false
;
}
return
true
;
}
if
(
device_program
->
model_client_
==
nullptr
)
{
return
false
;
}
device_programs_
[
origin_idims_
]
=
device_program
;
}
auto
device_program
=
device_programs_
[
origin_idims_
];
CHECK
(
device_program
&&
device_program
->
model_client_
);
return
device_program
->
ShareBufferWithOriginTensors
(
input_names_
,
output_names_
,
&
origin_itensors_
,
&
origin_otensors_
,
&
device_itensors_
,
&
device_otensors_
);
}
// Runs the device program cached for the current input shapes
// (origin_idims_) and returns the result of its ZeroCopyRun().
bool SubgraphEngine::LaunchDeviceProgram() {
  // Roll back to launch the origin program if the device program can't be
  // found or the model client isn't initialized. A single find() replaces the
  // repeated count()/operator[] lookups, and a null entry is treated like a
  // missing one instead of being dereferenced.
  const auto found = device_programs_.find(origin_idims_);
  if (found == device_programs_.end() || !found->second ||
      !found->second->model_client_) {
    return LaunchOriginProgram();
  }
  return found->second->ZeroCopyRun(&device_itensors_, &device_otensors_);
}
void
SubgraphCompute
::
PrepareForRun
()
{
...
...
@@ -254,12 +397,11 @@ void SubgraphCompute::PrepareForRun() {
param
.
output_data_names
,
param
.
scope
));
CHECK
(
engine_
);
engine_
->
Build
();
}
void
SubgraphCompute
::
Run
()
{
CHECK
(
engine_
);
engine_
->
Launch
();
engine_
->
Run
();
}
}
// namespace npu
...
...
lite/kernels/npu/subgraph_compute.h
浏览文件 @
ac897177
...
...
@@ -28,40 +28,65 @@ namespace lite {
namespace
kernels
{
namespace
npu
{
// Wraps one HiAI model together with the metadata needed to run it: the model
// name, the model manager client, the precisions/shapes of the origin output
// tensors and the dimensions of the device input/output tensors.
// SubgraphEngine keeps one instance per set of input shapes.
class DeviceProgram {
 public:
  DeviceProgram() {}
  ~DeviceProgram() {}
  // Returns the MD5-based model name derived from the input names, the shapes
  // of the origin input tensors and the output names.
  std::string GenerateModelName(
      const std::vector<std::string>& input_names,
      const std::vector<std::string>& output_names,
      const std::vector<std::vector<int64_t>>& origin_idims);
  // Loads the cached ".om" model and ".cfg" configuration from
  // model_cache_dir; returns false if either cannot be loaded.
  bool LoadFromCacheFile(const std::vector<std::string>& input_names,
                         const std::vector<std::string>& output_names,
                         const std::vector<std::vector<int64_t>>& origin_idims,
                         const std::string& model_cache_dir);
  // Converts origin_program to a HiAI IR graph, builds and loads the model,
  // and saves the model and its configuration when model_cache_dir is not
  // empty.
  bool BuildGraphAndCacheToFile(
      const std::vector<Instruction>& origin_program,
      const std::vector<std::string>& input_names,
      const std::vector<std::string>& output_names,
      const std::vector<std::vector<int64_t>>& origin_idims,
      const std::vector<Tensor*>& origin_otensors,
      const std::string& model_cache_dir);
  // Initializes the device tensors with the device dimensions and makes the
  // origin tensors share the device tensors' buffers.
  bool ShareBufferWithOriginTensors(
      const std::vector<std::string>& input_names,
      const std::vector<std::string>& output_names,
      std::vector<Tensor*>* origin_itensors,
      std::vector<Tensor*>* origin_otensors,
      std::vector<std::shared_ptr<hiai::AiTensor>>* device_itensors,
      std::vector<std::shared_ptr<hiai::AiTensor>>* device_otensors);
  // Runs the loaded model on the given device tensors through model_client_.
  bool ZeroCopyRun(
      std::vector<std::shared_ptr<hiai::AiTensor>>* device_itensors,
      std::vector<std::shared_ptr<hiai::AiTensor>>* device_otensors);

 public:
  // Name of the model; also the base name of the cache files. Empty until it
  // is generated.
  std::string model_name_{""};
  // HiAI model manager client; nullptr until a model has been loaded.
  std::shared_ptr<hiai::AiModelMngerClient> model_client_{nullptr};
  // Shapes of the origin output tensors, one entry per output name.
  std::vector<std::vector<int64_t>> origin_odims_;
  // Precisions of the origin output tensors, one entry per output name.
  std::vector<PrecisionType> origin_otypes_;
  // Dimensions of the device input/output tensors as reported by the model
  // client; queried lazily, empty until then.
  std::vector<hiai::TensorDimension> device_idims_{};
  std::vector<hiai::TensorDimension> device_odims_{};
};
class
SubgraphEngine
:
public
subgraph
::
Engine
{
public:
SubgraphEngine
(
KernelContext
*
ctx
,
SubgraphEngine
(
KernelContext
*
ctx
,
int
block_idx
,
cpp
::
BlockDesc
*
block_desc
,
const
std
::
vector
<
std
::
string
>
&
input_names
,
const
std
::
vector
<
std
::
string
>
&
output_names
,
Scope
*
scope
)
cpp
::
BlockDesc
*
block_desc
,
const
std
::
vector
<
std
::
string
>
&
input_names
,
const
std
::
vector
<
std
::
string
>
&
output_names
,
Scope
*
scope
)
:
subgraph
::
Engine
(
ctx
,
block_idx
,
block_desc
,
input_names
,
output_names
,
scope
)
{}
struct
device_program_t
{
explicit
device_program_t
(
std
::
shared_ptr
<
hiai
::
AiModelMngerClient
>
_client
)
:
client
(
_client
)
{}
std
::
shared_ptr
<
hiai
::
AiModelMngerClient
>
client
{
nullptr
};
std
::
vector
<
DDim
>
origin_idims
{};
std
::
vector
<
DDim
>
origin_odims
{};
std
::
vector
<
hiai
::
TensorDimension
>
device_idims
{};
std
::
vector
<
hiai
::
TensorDimension
>
device_odims
{};
};
protected:
int
Build
DeviceProgram
()
override
;
int
Launch
DeviceProgram
()
override
;
bool
InputShapeChanged
()
override
;
bool
PrepareWorkspaceFor
DeviceProgram
()
override
;
bool
Build
DeviceProgram
()
override
;
bool
LaunchDeviceProgram
()
override
;
std
::
string
model_name_
{
"model.om"
};
std
::
vector
<
std
::
vector
<
int64_t
>>
inputs_shape_
{};
std
::
map
<
std
::
vector
<
std
::
vector
<
int64_t
>>
,
std
::
shared_ptr
<
device_program_t
>>
device_program_map_
{};
std
::
vector
<
std
::
string
>
device_inames_
{};
std
::
vector
<
std
::
string
>
device_onames_
{};
std
::
vector
<
std
::
shared_ptr
<
hiai
::
AiTensor
>>
device_itensors_
{};
std
::
vector
<
std
::
shared_ptr
<
hiai
::
AiTensor
>>
device_otensors_
{};
std
::
map
<
std
::
vector
<
std
::
vector
<
int64_t
>>
,
std
::
shared_ptr
<
DeviceProgram
>>
device_programs_
;
};
class
SubgraphCompute
:
public
KernelLite
<
TARGET
(
kNPU
),
PRECISION
(
kAny
)
>
{
...
...
lite/utils/env.h
浏览文件 @
ac897177
...
...
@@ -22,6 +22,8 @@
#define SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE \
"SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE"
#define SUBGRAPH_DISABLE_ONLINE_MODE "SUBGRAPH_DISABLE_ONLINE_MODE"
namespace
paddle
{
namespace
lite
{
...
...
lite/utils/io.h
浏览文件 @
ac897177
...
...
@@ -119,5 +119,40 @@ static std::vector<std::string> ListDir(const std::string& path,
return
paths
;
}
// Reads the whole binary file `filename` into `*contents`.
//
// Returns true when the complete file was read; an empty file succeeds and
// yields empty contents. Returns false when the file cannot be opened (in
// which case `*contents` is left untouched), or when its size cannot be
// determined or fewer bytes than expected could be read (in which case
// `*contents` is cleared).
static bool ReadFile(const std::string& filename,
                     std::vector<char>* contents) {
  FILE* fp = fopen(filename.c_str(), "rb");
  if (!fp) return false;
  contents->clear();
  bool ok = false;
  if (fseek(fp, 0, SEEK_END) == 0) {
    const auto file_size = ftell(fp);  // negative on failure
    if (file_size >= 0 && fseek(fp, 0, SEEK_SET) == 0) {
      contents->resize(static_cast<size_t>(file_size));
      size_t offset = 0;
      while (offset < contents->size()) {
        const size_t already_read =
            fread(contents->data() + offset, 1, contents->size() - offset, fp);
        // fread() returns 0 on error or premature EOF; stop instead of
        // spinning forever.
        if (already_read == 0) break;
        offset += already_read;
      }
      ok = (offset == contents->size());
    }
  }
  fclose(fp);
  if (!ok) contents->clear();
  return ok;
}
// Writes `contents` to the binary file `filename`, replacing any existing
// file.
//
// Returns true only when every byte was written and the file was closed
// successfully; empty contents produce an empty file. Returns false when the
// file cannot be opened, a write makes no progress, or closing (which flushes
// buffered data) fails.
static bool WriteFile(const std::string& filename,
                      const std::vector<char>& contents) {
  FILE* fp = fopen(filename.c_str(), "wb");
  if (!fp) return false;
  size_t offset = 0;
  while (offset < contents.size()) {
    const size_t already_written =
        fwrite(contents.data() + offset, 1, contents.size() - offset, fp);
    // fwrite() returns 0 on error; stop instead of spinning forever.
    if (already_written == 0) break;
    offset += already_written;
  }
  const bool closed = (fclose(fp) == 0);
  return closed && offset == contents.size();
}
}
// namespace lite
}
// namespace paddle
lite/utils/md5.h
0 → 100644
浏览文件 @
ac897177
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
#include <string>
namespace
paddle
{
namespace
lite
{
// Computes the MD5 digest (RFC 1321) of `message` and returns it as a
// 32-character lowercase hexadecimal string.
//
// The function is `inline` because it is defined in a header. Message words
// and the digest are assembled byte by byte, so the result does not depend on
// host byte order or on aliasing the buffer as uint32_t, and the full 64-bit
// message length is appended as the algorithm requires.
inline std::string MD5(std::string message) {
  static const uint32_t kShiftAmounts[64] = {
      7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22,
      5, 9,  14, 20, 5, 9,  14, 20, 5, 9,  14, 20, 5, 9,  14, 20,
      4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23,
      6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21};
  static const uint32_t kPartsOfSines[64] = {
      0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a,
      0xa8304613, 0xfd469501, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
      0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, 0xf61e2562, 0xc040b340,
      0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
      0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8,
      0x676f02d9, 0x8d2a4c8a, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
      0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, 0x289b7ec6, 0xeaa127fa,
      0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
      0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92,
      0xffeff47d, 0x85845dd1, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
      0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391};

  uint32_t state[4] = {0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476};

  // Pad in place: a single 0x80 byte, zeros up to 56 bytes modulo 64, then
  // the original length in bits as a 64-bit little-endian integer.
  const uint64_t bit_length = static_cast<uint64_t>(message.size()) * 8;
  message.push_back(static_cast<char>(0x80));
  const size_t remainder = message.size() % 64;
  message.append(remainder <= 56 ? 56 - remainder : 120 - remainder, '\0');
  for (int i = 0; i < 8; i++) {
    message.push_back(static_cast<char>((bit_length >> (8 * i)) & 0xff));
  }

  const auto left_rotate = [](uint32_t x, uint32_t c) -> uint32_t {
    return (x << c) | (x >> (32 - c));
  };

  // Process each 512-bit (64 bytes) chunk
  const unsigned char* bytes =
      reinterpret_cast<const unsigned char*>(message.data());
  for (size_t offset = 0; offset < message.size(); offset += 64) {
    // Decode the chunk into sixteen little-endian 32-bit words.
    uint32_t W[16];
    for (size_t j = 0; j < 16; j++) {
      const unsigned char* p = bytes + offset + 4 * j;
      W[j] = static_cast<uint32_t>(p[0]) | (static_cast<uint32_t>(p[1]) << 8) |
             (static_cast<uint32_t>(p[2]) << 16) |
             (static_cast<uint32_t>(p[3]) << 24);
    }
    uint32_t A = state[0];
    uint32_t B = state[1];
    uint32_t C = state[2];
    uint32_t D = state[3];
    for (uint32_t i = 0; i < 64; i++) {
      uint32_t F, g;
      if (i < 16) {
        F = (B & C) | ((~B) & D);
        g = i;
      } else if (i < 32) {
        F = (D & B) | ((~D) & C);
        g = (5 * i + 1) % 16;
      } else if (i < 48) {
        F = B ^ C ^ D;
        g = (3 * i + 5) % 16;
      } else {
        F = C ^ (B | (~D));
        g = (7 * i) % 16;
      }
      const uint32_t T = D;
      D = C;
      C = B;
      B = B + left_rotate(A + F + kPartsOfSines[i] + W[g], kShiftAmounts[i]);
      A = T;
    }
    state[0] += A;
    state[1] += B;
    state[2] += C;
    state[3] += D;
  }

  // Convert digest to string: each state word is emitted least significant
  // byte first, two lowercase hex digits per byte.
  static const char kHexDigits[] = "0123456789abcdef";
  std::string res;
  res.reserve(16 << 1);
  for (size_t word = 0; word < 4; word++) {
    for (size_t k = 0; k < 4; k++) {
      const uint32_t byte = (state[word] >> (8 * k)) & 0xff;
      res.push_back(kHexDigits[byte >> 4]);
      res.push_back(kHexDigits[byte & 0x0f]);
    }
  }
  return res;
}
}
// namespace lite
}
// namespace paddle
lite/utils/string.h
浏览文件 @
ac897177
...
...
@@ -60,6 +60,38 @@ static std::string to_string(const T& v) {
return
ss
.
str
();
}
// Overload for plain int: renders the value as decimal text with snprintf.
static std::string to_string(int index) {
  // 15 bytes comfortably hold the sign, the digits of a 32-bit int and the
  // terminating NUL.
  char digits[15];
  snprintf(digits, sizeof(digits), "%d", index);
  std::string text(digits);
  return text;
}
// Converts the text `v` to a value of type T. The primary template returns
// `v` itself converted to T (T defaults to std::string); the specializations
// below handle the numeric types. Split<T>() uses it to convert each token.
template <typename T = std::string>
static T parse_string(const std::string& v) {
  return v;
}

// Parses `v` as a 32-bit signed integer via std::stoi.
template <>
int32_t parse_string<int32_t>(const std::string& v) {
  return std::stoi(v);
}

// Parses `v` as a 64-bit signed integer via std::stoll.
template <>
int64_t parse_string<int64_t>(const std::string& v) {
  return std::stoll(v);
}

// Parses `v` as a float via std::stof.
template <>
float parse_string<float>(const std::string& v) {
  return std::stof(v);
}

// Parses `v` as a double via std::stod.
template <>
double parse_string<double>(const std::string& v) {
  return std::stod(v);
}
template
<
typename
T
>
std
::
string
Join
(
const
std
::
vector
<
T
>&
vec
,
const
std
::
string
&
delim
)
{
if
(
vec
.
empty
())
return
""
;
...
...
@@ -84,19 +116,20 @@ static std::string Repr(const std::vector<std::string>& v) {
return
"{"
+
Join
(
tmp
,
","
)
+
"}"
;
}
static
std
::
vector
<
std
::
string
>
Split
(
const
std
::
string
&
original
,
template
<
class
T
=
std
::
string
>
static
std
::
vector
<
T
>
Split
(
const
std
::
string
&
original
,
const
std
::
string
&
separator
)
{
std
::
vector
<
std
::
string
>
results
;
std
::
vector
<
T
>
results
;
std
::
string
::
size_type
pos1
,
pos2
;
pos2
=
original
.
find
(
separator
);
pos1
=
0
;
while
(
std
::
string
::
npos
!=
pos2
)
{
results
.
push_back
(
original
.
substr
(
pos1
,
pos2
-
pos1
));
results
.
push_back
(
parse_string
<
T
>
(
original
.
substr
(
pos1
,
pos2
-
pos1
)
));
pos1
=
pos2
+
separator
.
size
();
pos2
=
original
.
find
(
separator
,
pos1
);
}
if
(
pos1
!=
original
.
length
())
{
results
.
push_back
(
original
.
substr
(
pos1
));
results
.
push_back
(
parse_string
<
T
>
(
original
.
substr
(
pos1
)
));
}
return
results
;
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录